author	Sage Weil <sage@inktank.com>	2013-08-15 14:11:45 -0400
committer	Sage Weil <sage@inktank.com>	2013-08-15 14:11:45 -0400
commit	ee3e542fec6e69bc9fb668698889a37d93950ddf (patch)
tree	e74ee766a4764769ef1d3d45d266b4dea64101d3 /fs
parent	fe2a801b50c0bb8039d627e5ae1fec249d10ff39 (diff)
parent	f1d6e17f540af37bb1891480143669ba7636c4cf (diff)
Merge remote-tracking branch 'linus/master' into testing
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/Kconfig | 13
-rw-r--r--  fs/9p/Makefile | 4
-rw-r--r--  fs/9p/vfs_addr.c | 5
-rw-r--r--  fs/9p/vfs_dir.c | 72
-rw-r--r--  fs/9p/vfs_inode.c | 2
-rw-r--r--  fs/9p/xattr.c | 4
-rw-r--r--  fs/9p/xattr.h | 2
-rw-r--r--  fs/9p/xattr_security.c | 80
-rw-r--r--  fs/9p/xattr_trusted.c | 80
-rw-r--r--  fs/adfs/dir.c | 48
-rw-r--r--  fs/affs/dir.c | 69
-rw-r--r--  fs/affs/namei.c | 26
-rw-r--r--  fs/afs/dir.c | 99
-rw-r--r--  fs/afs/file.c | 10
-rw-r--r--  fs/afs/flock.c | 7
-rw-r--r--  fs/aio.c | 4
-rw-r--r--  fs/autofs4/expire.c | 8
-rw-r--r--  fs/autofs4/root.c | 6
-rw-r--r--  fs/bad_inode.c | 4
-rw-r--r--  fs/befs/linuxvfs.c | 40
-rw-r--r--  fs/bfs/dir.c | 35
-rw-r--r--  fs/binfmt_aout.c | 2
-rw-r--r--  fs/binfmt_elf.c | 2
-rw-r--r--  fs/block_dev.c | 33
-rw-r--r--  fs/btrfs/backref.c | 120
-rw-r--r--  fs/btrfs/backref.h | 2
-rw-r--r--  fs/btrfs/ctree.c | 121
-rw-r--r--  fs/btrfs/ctree.h | 105
-rw-r--r--  fs/btrfs/delayed-inode.c | 23
-rw-r--r--  fs/btrfs/delayed-inode.h | 3
-rw-r--r--  fs/btrfs/dev-replace.c | 6
-rw-r--r--  fs/btrfs/disk-io.c | 486
-rw-r--r--  fs/btrfs/disk-io.h | 32
-rw-r--r--  fs/btrfs/export.c | 5
-rw-r--r--  fs/btrfs/extent-tree.c | 340
-rw-r--r--  fs/btrfs/extent_io.c | 52
-rw-r--r--  fs/btrfs/extent_io.h | 1
-rw-r--r--  fs/btrfs/file-item.c | 144
-rw-r--r--  fs/btrfs/file.c | 227
-rw-r--r--  fs/btrfs/free-space-cache.c | 103
-rw-r--r--  fs/btrfs/free-space-cache.h | 2
-rw-r--r--  fs/btrfs/inode.c | 601
-rw-r--r--  fs/btrfs/ioctl.c | 82
-rw-r--r--  fs/btrfs/lzo.c | 4
-rw-r--r--  fs/btrfs/ordered-data.c | 128
-rw-r--r--  fs/btrfs/ordered-data.h | 27
-rw-r--r--  fs/btrfs/qgroup.c | 283
-rw-r--r--  fs/btrfs/relocation.c | 102
-rw-r--r--  fs/btrfs/root-tree.c | 201
-rw-r--r--  fs/btrfs/scrub.c | 92
-rw-r--r--  fs/btrfs/send.c | 235
-rw-r--r--  fs/btrfs/super.c | 25
-rw-r--r--  fs/btrfs/transaction.c | 330
-rw-r--r--  fs/btrfs/transaction.h | 52
-rw-r--r--  fs/btrfs/tree-log.c | 46
-rw-r--r--  fs/btrfs/ulist.c | 15
-rw-r--r--  fs/btrfs/version.h | 4
-rw-r--r--  fs/btrfs/volumes.c | 351
-rw-r--r--  fs/btrfs/volumes.h | 7
-rw-r--r--  fs/buffer.c | 55
-rw-r--r--  fs/cachefiles/interface.c | 13
-rw-r--r--  fs/cachefiles/namei.c | 10
-rw-r--r--  fs/cachefiles/rdwr.c | 30
-rw-r--r--  fs/cachefiles/xattr.c | 6
-rw-r--r--  fs/ceph/addr.c | 15
-rw-r--r--  fs/ceph/dir.c | 99
-rw-r--r--  fs/ceph/file.c | 11
-rw-r--r--  fs/ceph/inode.c | 4
-rw-r--r--  fs/ceph/locks.c | 2
-rw-r--r--  fs/ceph/mds_client.c | 10
-rw-r--r--  fs/cifs/Kconfig | 1
-rw-r--r--  fs/cifs/cifs_debug.c | 52
-rw-r--r--  fs/cifs/cifs_unicode.h | 8
-rw-r--r--  fs/cifs/cifsencrypt.c | 189
-rw-r--r--  fs/cifs/cifsfs.c | 26
-rw-r--r--  fs/cifs/cifsfs.h | 4
-rw-r--r--  fs/cifs/cifsglob.h | 69
-rw-r--r--  fs/cifs/cifspdu.h | 17
-rw-r--r--  fs/cifs/cifsproto.h | 9
-rw-r--r--  fs/cifs/cifssmb.c | 425
-rw-r--r--  fs/cifs/connect.c | 172
-rw-r--r--  fs/cifs/dir.c | 23
-rw-r--r--  fs/cifs/file.c | 75
-rw-r--r--  fs/cifs/inode.c | 5
-rw-r--r--  fs/cifs/link.c | 84
-rw-r--r--  fs/cifs/misc.c | 3
-rw-r--r--  fs/cifs/readdir.c | 215
-rw-r--r--  fs/cifs/sess.c | 101
-rw-r--r--  fs/cifs/smb1ops.c | 53
-rw-r--r--  fs/cifs/smb2file.c | 24
-rw-r--r--  fs/cifs/smb2glob.h | 2
-rw-r--r--  fs/cifs/smb2inode.c | 57
-rw-r--r--  fs/cifs/smb2misc.c | 4
-rw-r--r--  fs/cifs/smb2ops.c | 102
-rw-r--r--  fs/cifs/smb2pdu.c | 502
-rw-r--r--  fs/cifs/smb2pdu.h | 114
-rw-r--r--  fs/cifs/smb2proto.h | 20
-rw-r--r--  fs/cifs/smb2transport.c | 246
-rw-r--r--  fs/cifs/smbfsctl.h | 27
-rw-r--r--  fs/cifs/transport.c | 6
-rw-r--r--  fs/coda/dir.c | 76
-rw-r--r--  fs/compat.c | 43
-rw-r--r--  fs/compat_ioctl.c | 3
-rw-r--r--  fs/configfs/dir.c | 137
-rw-r--r--  fs/configfs/file.c | 2
-rw-r--r--  fs/coredump.c | 121
-rw-r--r--  fs/cramfs/inode.c | 21
-rw-r--r--  fs/dcache.c | 77
-rw-r--r--  fs/debugfs/file.c | 43
-rw-r--r--  fs/debugfs/inode.c | 69
-rw-r--r--  fs/dlm/config.c | 5
-rw-r--r--  fs/dlm/lock.c | 8
-rw-r--r--  fs/dlm/lockspace.c | 9
-rw-r--r--  fs/dlm/lowcomms.c | 177
-rw-r--r--  fs/dlm/user.c | 1
-rw-r--r--  fs/ecryptfs/crypto.c | 342
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 2
-rw-r--r--  fs/ecryptfs/file.c | 51
-rw-r--r--  fs/ecryptfs/inode.c | 4
-rw-r--r--  fs/ecryptfs/main.c | 7
-rw-r--r--  fs/ecryptfs/messaging.c | 3
-rw-r--r--  fs/efivarfs/inode.c | 14
-rw-r--r--  fs/efivarfs/super.c | 9
-rw-r--r--  fs/efs/dir.c | 75
-rw-r--r--  fs/eventpoll.c | 16
-rw-r--r--  fs/exec.c | 17
-rw-r--r--  fs/exofs/dir.c | 38
-rw-r--r--  fs/exofs/inode.c | 6
-rw-r--r--  fs/exportfs/expfs.c | 14
-rw-r--r--  fs/ext2/dir.c | 27
-rw-r--r--  fs/ext2/namei.c | 24
-rw-r--r--  fs/ext3/dir.c | 157
-rw-r--r--  fs/ext3/fsync.c | 8
-rw-r--r--  fs/ext3/inode.c | 10
-rw-r--r--  fs/ext3/namei.c | 54
-rw-r--r--  fs/ext3/super.c | 13
-rw-r--r--  fs/ext4/balloc.c | 18
-rw-r--r--  fs/ext4/dir.c | 158
-rw-r--r--  fs/ext4/ext4.h | 189
-rw-r--r--  fs/ext4/ext4_jbd2.c | 58
-rw-r--r--  fs/ext4/ext4_jbd2.h | 29
-rw-r--r--  fs/ext4/extents.c | 214
-rw-r--r--  fs/ext4/extents_status.c | 144
-rw-r--r--  fs/ext4/extents_status.h | 5
-rw-r--r--  fs/ext4/file.c | 38
-rw-r--r--  fs/ext4/fsync.c | 52
-rw-r--r--  fs/ext4/ialloc.c | 13
-rw-r--r--  fs/ext4/indirect.c | 40
-rw-r--r--  fs/ext4/inline.c | 168
-rw-r--r--  fs/ext4/inode.c | 1791
-rw-r--r--  fs/ext4/ioctl.c | 6
-rw-r--r--  fs/ext4/mballoc.c | 32
-rw-r--r--  fs/ext4/move_extent.c | 3
-rw-r--r--  fs/ext4/namei.c | 54
-rw-r--r--  fs/ext4/page-io.c | 336
-rw-r--r--  fs/ext4/resize.c | 24
-rw-r--r--  fs/ext4/super.c | 189
-rw-r--r--  fs/f2fs/Kconfig | 12
-rw-r--r--  fs/f2fs/acl.c | 2
-rw-r--r--  fs/f2fs/checkpoint.c | 99
-rw-r--r--  fs/f2fs/data.c | 71
-rw-r--r--  fs/f2fs/debug.c | 4
-rw-r--r--  fs/f2fs/dir.c | 153
-rw-r--r--  fs/f2fs/f2fs.h | 66
-rw-r--r--  fs/f2fs/file.c | 58
-rw-r--r--  fs/f2fs/gc.c | 42
-rw-r--r--  fs/f2fs/inode.c | 13
-rw-r--r--  fs/f2fs/namei.c | 17
-rw-r--r--  fs/f2fs/node.c | 37
-rw-r--r--  fs/f2fs/node.h | 68
-rw-r--r--  fs/f2fs/recovery.c | 150
-rw-r--r--  fs/f2fs/segment.c | 101
-rw-r--r--  fs/f2fs/super.c | 253
-rw-r--r--  fs/f2fs/xattr.c | 68
-rw-r--r--  fs/f2fs/xattr.h | 24
-rw-r--r--  fs/fat/dir.c | 104
-rw-r--r--  fs/fat/fat.h | 1
-rw-r--r--  fs/fat/file.c | 8
-rw-r--r--  fs/fat/inode.c | 12
-rw-r--r--  fs/fat/misc.c | 5
-rw-r--r--  fs/fat/namei_msdos.c | 6
-rw-r--r--  fs/fat/namei_vfat.c | 12
-rw-r--r--  fs/fcntl.c | 4
-rw-r--r--  fs/file_table.c | 33
-rw-r--r--  fs/freevxfs/vxfs_lookup.c | 55
-rw-r--r--  fs/fs-writeback.c | 19
-rw-r--r--  fs/fscache/cache.c | 34
-rw-r--r--  fs/fscache/cookie.c | 93
-rw-r--r--  fs/fscache/fsdef.c | 1
-rw-r--r--  fs/fscache/internal.h | 11
-rw-r--r--  fs/fscache/main.c | 11
-rw-r--r--  fs/fscache/netfs.c | 1
-rw-r--r--  fs/fscache/object-list.c | 103
-rw-r--r--  fs/fscache/object.c | 1106
-rw-r--r--  fs/fscache/operation.c | 37
-rw-r--r--  fs/fscache/page.c | 65
-rw-r--r--  fs/fuse/dir.c | 88
-rw-r--r--  fs/fuse/file.c | 3
-rw-r--r--  fs/fuse/inode.c | 2
-rw-r--r--  fs/gfs2/Kconfig | 5
-rw-r--r--  fs/gfs2/aops.c | 17
-rw-r--r--  fs/gfs2/bmap.c | 4
-rw-r--r--  fs/gfs2/dentry.c | 3
-rw-r--r--  fs/gfs2/dir.c | 82
-rw-r--r--  fs/gfs2/dir.h | 7
-rw-r--r--  fs/gfs2/export.c | 10
-rw-r--r--  fs/gfs2/file.c | 94
-rw-r--r--  fs/gfs2/glops.c | 8
-rw-r--r--  fs/gfs2/inode.c | 150
-rw-r--r--  fs/gfs2/inode.h | 1
-rw-r--r--  fs/gfs2/log.c | 78
-rw-r--r--  fs/gfs2/log.h | 2
-rw-r--r--  fs/gfs2/lops.c | 22
-rw-r--r--  fs/gfs2/lops.h | 1
-rw-r--r--  fs/gfs2/meta_io.c | 4
-rw-r--r--  fs/gfs2/ops_fstype.c | 8
-rw-r--r--  fs/gfs2/quota.c | 7
-rw-r--r--  fs/gfs2/rgrp.c | 14
-rw-r--r--  fs/gfs2/trans.c | 9
-rw-r--r--  fs/hfs/dir.c | 49
-rw-r--r--  fs/hfs/hfs_fs.h | 7
-rw-r--r--  fs/hfs/string.c | 6
-rw-r--r--  fs/hfsplus/dir.c | 50
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 7
-rw-r--r--  fs/hfsplus/unicode.c | 7
-rw-r--r--  fs/hostfs/hostfs_kern.c | 13
-rw-r--r--  fs/hpfs/buffer.c | 33
-rw-r--r--  fs/hpfs/dentry.c | 7
-rw-r--r--  fs/hpfs/dir.c | 56
-rw-r--r--  fs/hpfs/file.c | 40
-rw-r--r--  fs/hpfs/hpfs_fn.h | 7
-rw-r--r--  fs/hpfs/map.c | 22
-rw-r--r--  fs/hpfs/super.c | 17
-rw-r--r--  fs/hppfs/hppfs.c | 44
-rw-r--r--  fs/hugetlbfs/inode.c | 10
-rw-r--r--  fs/inode.c | 4
-rw-r--r--  fs/internal.h | 6
-rw-r--r--  fs/isofs/dir.c | 42
-rw-r--r--  fs/isofs/inode.c | 48
-rw-r--r--  fs/isofs/namei.c | 3
-rw-r--r--  fs/jbd/transaction.c | 19
-rw-r--r--  fs/jbd2/Kconfig | 6
-rw-r--r--  fs/jbd2/checkpoint.c | 22
-rw-r--r--  fs/jbd2/commit.c | 184
-rw-r--r--  fs/jbd2/journal.c | 166
-rw-r--r--  fs/jbd2/recovery.c | 11
-rw-r--r--  fs/jbd2/revoke.c | 49
-rw-r--r--  fs/jbd2/transaction.c | 526
-rw-r--r--  fs/jffs2/dir.c | 52
-rw-r--r--  fs/jfs/jfs_dmap.c | 70
-rw-r--r--  fs/jfs/jfs_dtree.c | 100
-rw-r--r--  fs/jfs/jfs_dtree.h | 2
-rw-r--r--  fs/jfs/jfs_extent.c | 2
-rw-r--r--  fs/jfs/jfs_imap.c | 69
-rw-r--r--  fs/jfs/jfs_metapage.c | 10
-rw-r--r--  fs/jfs/jfs_superblock.h | 1
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 2
-rw-r--r--  fs/jfs/jfs_xtree.c | 62
-rw-r--r--  fs/jfs/namei.c | 11
-rw-r--r--  fs/jfs/resize.c | 2
-rw-r--r--  fs/jfs/super.c | 22
-rw-r--r--  fs/jfs/xattr.c | 8
-rw-r--r--  fs/libfs.c | 83
-rw-r--r--  fs/lockd/clntlock.c | 13
-rw-r--r--  fs/lockd/clntproc.c | 5
-rw-r--r--  fs/lockd/svc.c | 2
-rw-r--r--  fs/lockd/svclock.c | 18
-rw-r--r--  fs/lockd/svcsubs.c | 12
-rw-r--r--  fs/locks.c | 328
-rw-r--r--  fs/logfs/dir.c | 49
-rw-r--r--  fs/logfs/file.c | 3
-rw-r--r--  fs/logfs/segment.c | 3
-rw-r--r--  fs/minix/dir.c | 42
-rw-r--r--  fs/minix/namei.c | 13
-rw-r--r--  fs/namei.c | 123
-rw-r--r--  fs/ncpfs/dir.c | 123
-rw-r--r--  fs/ncpfs/inode.c | 16
-rw-r--r--  fs/ncpfs/mmap.c | 2
-rw-r--r--  fs/nfs/Kconfig | 14
-rw-r--r--  fs/nfs/Makefile | 6
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 3
-rw-r--r--  fs/nfs/callback.c | 6
-rw-r--r--  fs/nfs/callback.h | 3
-rw-r--r--  fs/nfs/callback_proc.c | 3
-rw-r--r--  fs/nfs/callback_xdr.c | 52
-rw-r--r--  fs/nfs/client.c | 4
-rw-r--r--  fs/nfs/delegation.c | 10
-rw-r--r--  fs/nfs/dir.c | 143
-rw-r--r--  fs/nfs/dns_resolve.c | 32
-rw-r--r--  fs/nfs/file.c | 38
-rw-r--r--  fs/nfs/getroot.c | 2
-rw-r--r--  fs/nfs/idmap.c | 56
-rw-r--r--  fs/nfs/inode.c | 149
-rw-r--r--  fs/nfs/internal.h | 3
-rw-r--r--  fs/nfs/mount_clnt.c | 14
-rw-r--r--  fs/nfs/namespace.c | 2
-rw-r--r--  fs/nfs/nfs3proc.c | 9
-rw-r--r--  fs/nfs/nfs4_fs.h | 8
-rw-r--r--  fs/nfs/nfs4client.c | 15
-rw-r--r--  fs/nfs/nfs4file.c | 1
-rw-r--r--  fs/nfs/nfs4filelayout.c | 3
-rw-r--r--  fs/nfs/nfs4filelayout.h | 3
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 8
-rw-r--r--  fs/nfs/nfs4proc.c | 699
-rw-r--r--  fs/nfs/nfs4session.c | 40
-rw-r--r--  fs/nfs/nfs4session.h | 7
-rw-r--r--  fs/nfs/nfs4state.c | 46
-rw-r--r--  fs/nfs/nfs4super.c | 14
-rw-r--r--  fs/nfs/nfs4xdr.c | 189
-rw-r--r--  fs/nfs/objlayout/objlayout.c | 4
-rw-r--r--  fs/nfs/pnfs.c | 42
-rw-r--r--  fs/nfs/pnfs.h | 6
-rw-r--r--  fs/nfs/proc.c | 13
-rw-r--r--  fs/nfs/super.c | 203
-rw-r--r--  fs/nfs/unlink.c | 2
-rw-r--r--  fs/nfs/write.c | 31
-rw-r--r--  fs/nfsd/Kconfig | 16
-rw-r--r--  fs/nfsd/nfs4proc.c | 48
-rw-r--r--  fs/nfsd/nfs4recover.c | 20
-rw-r--r--  fs/nfsd/nfs4state.c | 235
-rw-r--r--  fs/nfsd/nfs4xdr.c | 174
-rw-r--r--  fs/nfsd/nfsd.h | 27
-rw-r--r--  fs/nfsd/nfssvc.c | 13
-rw-r--r--  fs/nfsd/state.h | 1
-rw-r--r--  fs/nfsd/vfs.c | 42
-rw-r--r--  fs/nfsd/vfs.h | 7
-rw-r--r--  fs/nfsd/xdr4.h | 4
-rw-r--r--  fs/nilfs2/alloc.c | 63
-rw-r--r--  fs/nilfs2/alloc.h | 2
-rw-r--r--  fs/nilfs2/dir.c | 48
-rw-r--r--  fs/nilfs2/ifile.c | 22
-rw-r--r--  fs/nilfs2/ifile.h | 2
-rw-r--r--  fs/nilfs2/inode.c | 8
-rw-r--r--  fs/nilfs2/segment.c | 4
-rw-r--r--  fs/nilfs2/super.c | 33
-rw-r--r--  fs/nilfs2/the_nilfs.c | 4
-rw-r--r--  fs/nilfs2/the_nilfs.h | 4
-rw-r--r--  fs/notify/dnotify/dnotify.c | 25
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 92
-rw-r--r--  fs/notify/inotify/inotify_user.c | 13
-rw-r--r--  fs/notify/mark.c | 50
-rw-r--r--  fs/ntfs/aops.c | 2
-rw-r--r--  fs/ntfs/dir.c | 84
-rw-r--r--  fs/ocfs2/alloc.c | 8
-rw-r--r--  fs/ocfs2/aops.c | 7
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 19
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 2
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 29
-rw-r--r--  fs/ocfs2/dir.c | 153
-rw-r--r--  fs/ocfs2/dir.h | 5
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 7
-rw-r--r--  fs/ocfs2/file.c | 22
-rw-r--r--  fs/ocfs2/journal.c | 14
-rw-r--r--  fs/ocfs2/journal.h | 1
-rw-r--r--  fs/ocfs2/move_extents.c | 2
-rw-r--r--  fs/ocfs2/namei.c | 70
-rw-r--r--  fs/ocfs2/ocfs2.h | 1
-rw-r--r--  fs/ocfs2/refcounttree.c | 58
-rw-r--r--  fs/ocfs2/refcounttree.h | 6
-rw-r--r--  fs/ocfs2/suballoc.c | 37
-rw-r--r--  fs/ocfs2/super.c | 6
-rw-r--r--  fs/ocfs2/xattr.c | 18
-rw-r--r--  fs/omfs/dir.c | 94
-rw-r--r--  fs/open.c | 67
-rw-r--r--  fs/openpromfs/inode.c | 95
-rw-r--r--  fs/proc/base.c | 462
-rw-r--r--  fs/proc/fd.c | 114
-rw-r--r--  fs/proc/generic.c | 100
-rw-r--r--  fs/proc/internal.h | 10
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/namespaces.c | 87
-rw-r--r--  fs/proc/proc_net.c | 9
-rw-r--r--  fs/proc/proc_sysctl.c | 78
-rw-r--r--  fs/proc/root.c | 19
-rw-r--r--  fs/proc/task_mmu.c | 168
-rw-r--r--  fs/proc/uptime.c | 3
-rw-r--r--  fs/proc/vmcore.c | 694
-rw-r--r--  fs/pstore/ftrace.c | 2
-rw-r--r--  fs/pstore/inode.c | 11
-rw-r--r--  fs/pstore/platform.c | 21
-rw-r--r--  fs/pstore/ram.c | 5
-rw-r--r--  fs/pstore/ram_core.c | 54
-rw-r--r--  fs/qnx4/dir.c | 66
-rw-r--r--  fs/qnx6/dir.c | 31
-rw-r--r--  fs/quota/dquot.c | 6
-rw-r--r--  fs/read_write.c | 65
-rw-r--r--  fs/readdir.c | 56
-rw-r--r--  fs/reiserfs/dir.c | 36
-rw-r--r--  fs/reiserfs/inode.c | 12
-rw-r--r--  fs/reiserfs/procfs.c | 99
-rw-r--r--  fs/reiserfs/reiserfs.h | 2
-rw-r--r--  fs/reiserfs/super.c | 3
-rw-r--r--  fs/reiserfs/xattr.c | 33
-rw-r--r--  fs/romfs/super.c | 21
-rw-r--r--  fs/select.c | 66
-rw-r--r--  fs/seq_file.c | 54
-rw-r--r--  fs/splice.c | 38
-rw-r--r--  fs/squashfs/dir.c | 40
-rw-r--r--  fs/super.c | 25
-rw-r--r--  fs/sysfs/dir.c | 68
-rw-r--r--  fs/sysfs/file.c | 10
-rw-r--r--  fs/sysfs/group.c | 70
-rw-r--r--  fs/sysfs/inode.c | 2
-rw-r--r--  fs/sysv/dir.c | 37
-rw-r--r--  fs/sysv/namei.c | 3
-rw-r--r--  fs/timerfd.c | 131
-rw-r--r--  fs/ubifs/dir.c | 57
-rw-r--r--  fs/ubifs/file.c | 5
-rw-r--r--  fs/ubifs/super.c | 2
-rw-r--r--  fs/udf/dir.c | 63
-rw-r--r--  fs/udf/namei.c | 24
-rw-r--r--  fs/ufs/dir.c | 28
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/xfs_alloc.c | 24
-rw-r--r--  fs/xfs/xfs_aops.c | 14
-rw-r--r--  fs/xfs/xfs_attr_leaf.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.c | 199
-rw-r--r--  fs/xfs/xfs_bmap.h | 1
-rw-r--r--  fs/xfs/xfs_bmap_btree.h | 2
-rw-r--r--  fs/xfs/xfs_buf_item.c | 87
-rw-r--r--  fs/xfs/xfs_buf_item.h | 4
-rw-r--r--  fs/xfs/xfs_dfrag.c | 8
-rw-r--r--  fs/xfs/xfs_dinode.h | 6
-rw-r--r--  fs/xfs/xfs_dir2.c | 13
-rw-r--r--  fs/xfs/xfs_dir2_block.c | 37
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 21
-rw-r--r--  fs/xfs/xfs_dir2_priv.h | 11
-rw-r--r--  fs/xfs/xfs_dir2_sf.c | 31
-rw-r--r--  fs/xfs/xfs_dquot.c | 31
-rw-r--r--  fs/xfs/xfs_dquot.h | 11
-rw-r--r--  fs/xfs/xfs_file.c | 18
-rw-r--r--  fs/xfs/xfs_fsops.c | 2
-rw-r--r--  fs/xfs/xfs_ialloc.c | 74
-rw-r--r--  fs/xfs/xfs_ialloc.h | 8
-rw-r--r--  fs/xfs/xfs_icache.c | 4
-rw-r--r--  fs/xfs/xfs_icache.h | 1
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 195
-rw-r--r--  fs/xfs/xfs_icreate_item.h | 52
-rw-r--r--  fs/xfs/xfs_inode.c | 105
-rw-r--r--  fs/xfs/xfs_inode.h | 1
-rw-r--r--  fs/xfs/xfs_ioctl.c | 16
-rw-r--r--  fs/xfs/xfs_iomap.c | 13
-rw-r--r--  fs/xfs/xfs_iops.c | 27
-rw-r--r--  fs/xfs/xfs_itable.c | 33
-rw-r--r--  fs/xfs/xfs_log.c | 22
-rw-r--r--  fs/xfs/xfs_log.h | 5
-rw-r--r--  fs/xfs/xfs_log_cil.c | 75
-rw-r--r--  fs/xfs/xfs_log_recover.c | 127
-rw-r--r--  fs/xfs/xfs_mount.c | 92
-rw-r--r--  fs/xfs/xfs_mount.h | 4
-rw-r--r--  fs/xfs/xfs_qm.c | 394
-rw-r--r--  fs/xfs/xfs_qm.h | 97
-rw-r--r--  fs/xfs/xfs_qm_bhv.c | 10
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 75
-rw-r--r--  fs/xfs/xfs_quota.h | 104
-rw-r--r--  fs/xfs/xfs_quotaops.c | 6
-rw-r--r--  fs/xfs/xfs_sb.h | 6
-rw-r--r--  fs/xfs/xfs_super.c | 39
-rw-r--r--  fs/xfs/xfs_symlink.c | 61
-rw-r--r--  fs/xfs/xfs_symlink.h | 2
-rw-r--r--  fs/xfs/xfs_sysctl.c | 26
-rw-r--r--  fs/xfs/xfs_trace.h | 20
-rw-r--r--  fs/xfs/xfs_trans.c | 118
-rw-r--r--  fs/xfs/xfs_trans.h | 16
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 34
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 122
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 11
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 28
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 3
470 files changed, 16631 insertions, 11991 deletions
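
Most of the churn above comes from the tree-wide "[readdir]" conversion merged for 3.11: the ->readdir() file operation is replaced by ->iterate(), which takes a struct dir_context instead of an opaque cookie plus a filldir_t callback. The filesystem advances ctx->pos itself and hands each entry to dir_emit(), which returns false once the caller's buffer is full. Below is a minimal sketch of the new shape, for a hypothetical filesystem whose directory entries already sit in an in-memory table; the myfs_* names are illustrative, not code from this merge.

#include <linux/fs.h>
#include <linux/string.h>

struct myfs_entry {
	const char *name;	/* NULL name terminates the table */
	u64 ino;
	unsigned char type;	/* DT_REG, DT_DIR, ... */
};

static int myfs_iterate(struct file *file, struct dir_context *ctx)
{
	struct myfs_entry *tbl = file_inode(file)->i_private;

	/* "." and ".." first; dir_emit_dots() advances ctx->pos past 2 */
	if (!dir_emit_dots(file, ctx))
		return 0;

	for (; tbl[ctx->pos - 2].name; ctx->pos++) {
		struct myfs_entry *e = &tbl[ctx->pos - 2];

		/* dir_emit() returns false when the destination buffer is
		 * full; just stop, since ctx->pos already names the next
		 * entry to resume from */
		if (!dir_emit(ctx, e->name, strlen(e->name), e->ino, e->type))
			return 0;
	}
	return 0;
}

const struct file_operations myfs_dir_operations = {
	.read = generic_read_dir,
	.llseek = generic_file_llseek,
	.iterate = myfs_iterate,
};

The per-filesystem conversions below (9p, adfs, affs, afs, and many more in the diffstat) all follow this pattern; resuming at ctx->pos replaces the old hand-maintained filp->f_pos bookkeeping.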
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 55abfd62654a..6489e1fc1afd 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -31,3 +31,16 @@ config 9P_FS_POSIX_ACL
 	  If you don't know what Access Control Lists are, say N

 endif
+
+
+config 9P_FS_SECURITY
+	bool "9P Security Labels"
+	depends on 9P_FS
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux. This option
+	  enables an extended attribute handler for file security
+	  labels in the 9P filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index ab8c12780634..ff7be98f84f2 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -11,7 +11,9 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	v9fs.o \
 	fid.o \
 	xattr.o \
-	xattr_user.o
+	xattr_user.o \
+	xattr_trusted.o

 9p-$(CONFIG_9P_FSCACHE) += cache.o
 9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
+9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 055562c580b4..9ff073f4090a 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -148,13 +148,14 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
  * @offset: offset in the page
  */

-static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+static void v9fs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	/*
 	 * If called with zero offset, we should release
 	 * the private state assocated with the page
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		v9fs_fscache_invalidate_page(page);
 }

diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index be1e34adc3c6..4d0c2e0be7e5 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -101,16 +101,15 @@ static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
 }

 /**
- * v9fs_dir_readdir - read a directory
- * @filp: opened file structure
- * @dirent: directory structure ???
- * @filldir: function to populate directory structure ???
+ * v9fs_dir_readdir - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
  *
  */

-static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
 {
-	int over;
+	bool over;
 	struct p9_wstat st;
 	int err = 0;
 	struct p9_fid *fid;
@@ -118,19 +117,19 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int reclen = 0;
 	struct p9_rdir *rdir;

-	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
-	fid = filp->private_data;
+	p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+	fid = file->private_data;

 	buflen = fid->clnt->msize - P9_IOHDRSZ;

-	rdir = v9fs_alloc_rdir_buf(filp, buflen);
+	rdir = v9fs_alloc_rdir_buf(file, buflen);
 	if (!rdir)
 		return -ENOMEM;

 	while (1) {
 		if (rdir->tail == rdir->head) {
-			err = v9fs_file_readn(filp, rdir->buf, NULL,
-					      buflen, filp->f_pos);
+			err = v9fs_file_readn(file, rdir->buf, NULL,
+					      buflen, ctx->pos);
 			if (err <= 0)
 				return err;

@@ -148,51 +147,45 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			}
 			reclen = st.size+2;

-			over = filldir(dirent, st.name, strlen(st.name),
-				filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
-
+			over = !dir_emit(ctx, st.name, strlen(st.name),
+					 v9fs_qid2ino(&st.qid), dt_type(&st));
 			p9stat_free(&st);
-
 			if (over)
 				return 0;

 			rdir->head += reclen;
-			filp->f_pos += reclen;
+			ctx->pos += reclen;
 		}
 	}
 }

 /**
- * v9fs_dir_readdir_dotl - read a directory
- * @filp: opened file structure
- * @dirent: buffer to fill dirent structures
- * @filldir: function to populate dirent structures
+ * v9fs_dir_readdir_dotl - iterate through a directory
+ * @file: opened file structure
+ * @ctx: actor we feed the entries to
  *
  */
-static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
-				 filldir_t filldir)
+static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
 {
-	int over;
 	int err = 0;
 	struct p9_fid *fid;
 	int buflen;
 	struct p9_rdir *rdir;
 	struct p9_dirent curdirent;
-	u64 oldoffset = 0;

-	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
-	fid = filp->private_data;
+	p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+	fid = file->private_data;

 	buflen = fid->clnt->msize - P9_READDIRHDRSZ;

-	rdir = v9fs_alloc_rdir_buf(filp, buflen);
+	rdir = v9fs_alloc_rdir_buf(file, buflen);
 	if (!rdir)
 		return -ENOMEM;

 	while (1) {
 		if (rdir->tail == rdir->head) {
 			err = p9_client_readdir(fid, rdir->buf, buflen,
-						filp->f_pos);
+						ctx->pos);
 			if (err <= 0)
 				return err;

@@ -210,22 +203,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 				return -EIO;
 			}

-			/* d_off in dirent structure tracks the offset into
-			 * the next dirent in the dir. However, filldir()
-			 * expects offset into the current dirent. Hence
-			 * while calling filldir send the offset from the
-			 * previous dirent structure.
-			 */
-			over = filldir(dirent, curdirent.d_name,
-				       strlen(curdirent.d_name),
-				       oldoffset, v9fs_qid2ino(&curdirent.qid),
-				       curdirent.d_type);
-			oldoffset = curdirent.d_off;
-
-			if (over)
+			if (!dir_emit(ctx, curdirent.d_name,
+				      strlen(curdirent.d_name),
+				      v9fs_qid2ino(&curdirent.qid),
+				      curdirent.d_type))
 				return 0;

-			filp->f_pos = curdirent.d_off;
+			ctx->pos = curdirent.d_off;
 			rdir->head += err;
 		}
 	}
@@ -254,7 +238,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 const struct file_operations v9fs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = v9fs_dir_readdir,
+	.iterate = v9fs_dir_readdir,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 };
@@ -262,7 +246,7 @@ const struct file_operations v9fs_dir_operations = {
 const struct file_operations v9fs_dir_operations_dotl = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = v9fs_dir_readdir_dotl,
+	.iterate = v9fs_dir_readdir_dotl,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 	.fsync = v9fs_file_fsync_dotl,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index d86edc8d3fd0..25b018efb8ab 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1054,13 +1054,11 @@ static int
 v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		 struct kstat *stat)
 {
-	int err;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
 	struct p9_wstat *st;

 	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
-	err = -EPERM;
 	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
 		generic_fillattr(dentry->d_inode, stat);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index c45e016b190f..3c28cdfb8c47 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -167,9 +167,13 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)

 const struct xattr_handler *v9fs_xattr_handlers[] = {
 	&v9fs_xattr_user_handler,
+	&v9fs_xattr_trusted_handler,
 #ifdef CONFIG_9P_FS_POSIX_ACL
 	&v9fs_xattr_acl_access_handler,
 	&v9fs_xattr_acl_default_handler,
 #endif
+#ifdef CONFIG_9P_FS_SECURITY
+	&v9fs_xattr_security_handler,
+#endif
 	NULL
 };
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index eec348a3df71..d3e2ea3840be 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -20,6 +20,8 @@

 extern const struct xattr_handler *v9fs_xattr_handlers[];
 extern struct xattr_handler v9fs_xattr_user_handler;
+extern struct xattr_handler v9fs_xattr_trusted_handler;
+extern struct xattr_handler v9fs_xattr_security_handler;
 extern const struct xattr_handler v9fs_xattr_acl_access_handler;
 extern const struct xattr_handler v9fs_xattr_acl_default_handler;

diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
new file mode 100644
index 000000000000..cb247a142a6e
--- /dev/null
+++ b/fs/9p/xattr_security.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
+				   void *buffer, size_t size, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
+	memcpy(full_name+prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+	kfree(full_name);
+	return retval;
+}
+
+static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
+				   const void *value, size_t size, int flags, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
+	memcpy(full_name + prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+	kfree(full_name);
+	return retval;
+}
+
+struct xattr_handler v9fs_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.get = v9fs_xattr_security_get,
+	.set = v9fs_xattr_security_set,
+};
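
The VFS dispatches xattr calls by prefix: generic code matches "security." against the handler's .prefix and passes only the remainder as name, which is why the get/set helpers above rebuild the full "security.<name>" string before sending it down to the server. From user space the handler is reached through the ordinary xattr syscalls. A hedged example follows; it assumes a v9fs mount at /mnt/9p whose server stores the labels, and both the path and the label value are made up for illustration.

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/9p/file";	/* assumed mount point */
	const char *val = "system_u:object_r:etc_t:s0";
	char buf[256];
	ssize_t n;

	/* store a label; routed through v9fs_xattr_security_set() */
	if (setxattr(path, "security.selinux", val, strlen(val), 0) != 0)
		perror("setxattr");

	/* read it back; routed through v9fs_xattr_security_get() */
	n = getxattr(path, "security.selinux", buf, sizeof(buf));
	if (n >= 0)
		printf("security.selinux = %.*s\n", (int)n, buf);
	else
		perror("getxattr");
	return 0;
}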
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
new file mode 100644
index 000000000000..e30d33b8a3fb
--- /dev/null
+++ b/fs/9p/xattr_trusted.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "xattr.h"
+
+static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
+				  void *buffer, size_t size, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
+	memcpy(full_name+prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_get(dentry, full_name, buffer, size);
+	kfree(full_name);
+	return retval;
+}
+
+static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
+				  const void *value, size_t size, int flags, int type)
+{
+	int retval;
+	char *full_name;
+	size_t name_len;
+	size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
+	if (!full_name)
+		return -ENOMEM;
+	memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
+	memcpy(full_name + prefix_len, name, name_len);
+	full_name[prefix_len + name_len] = '\0';
+
+	retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
+	kfree(full_name);
+	return retval;
+}
+
+struct xattr_handler v9fs_xattr_trusted_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,
+	.get = v9fs_xattr_trusted_get,
+	.set = v9fs_xattr_trusted_set,
+};
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 9cf874ce8336..0d138c0de293 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -17,47 +17,43 @@
 static DEFINE_RWLOCK(adfs_dir_lock);

 static int
-adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+adfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
 	struct object_info obj;
 	struct adfs_dir dir;
 	int ret = 0;

-	if (filp->f_pos >> 32)
-		goto out;
+	if (ctx->pos >> 32)
+		return 0;

 	ret = ops->read(sb, inode->i_ino, inode->i_size, &dir);
 	if (ret)
-		goto out;
+		return ret;

-	switch ((unsigned long)filp->f_pos) {
-	case 0:
-		if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(file, ctx))
 			goto free_out;
-		filp->f_pos += 1;
-
-	case 1:
-		if (filldir(dirent, "..", 2, 1, dir.parent_id, DT_DIR) < 0)
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR))
 			goto free_out;
-		filp->f_pos += 1;
-
-	default:
-		break;
+		ctx->pos = 2;
 	}

 	read_lock(&adfs_dir_lock);

-	ret = ops->setpos(&dir, filp->f_pos - 2);
+	ret = ops->setpos(&dir, ctx->pos - 2);
 	if (ret)
 		goto unlock_out;
 	while (ops->getnext(&dir, &obj) == 0) {
-		if (filldir(dirent, obj.name, obj.name_len,
-			    filp->f_pos, obj.file_id, DT_UNKNOWN) < 0)
-			goto unlock_out;
-		filp->f_pos += 1;
+		if (!dir_emit(ctx, obj.name, obj.name_len,
+			      obj.file_id, DT_UNKNOWN))
+			break;
+		ctx->pos++;
 	}

 unlock_out:
@@ -65,8 +61,6 @@ unlock_out:

 free_out:
 	ops->free(&dir);
-
-out:
 	return ret;
 }

@@ -192,13 +186,12 @@ out:
 const struct file_operations adfs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = adfs_readdir,
+	.iterate = adfs_readdir,
 	.fsync = generic_file_fsync,
 };

 static int
-adfs_hash(const struct dentry *parent, const struct inode *inode,
-	  struct qstr *qstr)
+adfs_hash(const struct dentry *parent, struct qstr *qstr)
 {
 	const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
 	const unsigned char *name;
@@ -234,8 +227,7 @@ adfs_hash(const struct dentry *parent, const struct inode *inode,
  * requirements of the underlying filesystem.
  */
 static int
-adfs_compare(const struct dentry *parent, const struct inode *pinode,
-	const struct dentry *dentry, const struct inode *inode,
+adfs_compare(const struct dentry *parent, const struct dentry *dentry,
 	unsigned int len, const char *str, const struct qstr *name)
 {
 	int i;
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index fd11a6d608ee..f1eba8c3644e 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -15,12 +15,12 @@

 #include "affs.h"

-static int affs_readdir(struct file *, void *, filldir_t);
+static int affs_readdir(struct file *, struct dir_context *);

 const struct file_operations affs_dir_operations = {
 	.read = generic_read_dir,
 	.llseek = generic_file_llseek,
-	.readdir = affs_readdir,
+	.iterate = affs_readdir,
 	.fsync = affs_file_fsync,
 };

@@ -40,52 +40,35 @@ const struct inode_operations affs_dir_inode_operations = {
 };

 static int
-affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+affs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	struct buffer_head *dir_bh;
-	struct buffer_head *fh_bh;
+	struct buffer_head *dir_bh = NULL;
+	struct buffer_head *fh_bh = NULL;
 	unsigned char *name;
 	int namelen;
 	u32 i;
 	int hash_pos;
 	int chain_pos;
-	u32 f_pos;
 	u32 ino;
-	int stored;
-	int res;

-	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)filp->f_pos);
+	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos);

-	stored = 0;
-	res = -EIO;
-	dir_bh = NULL;
-	fh_bh = NULL;
-	f_pos = filp->f_pos;
-
-	if (f_pos == 0) {
-		filp->private_data = (void *)0;
-		if (filldir(dirent, ".", 1, f_pos, inode->i_ino, DT_DIR) < 0)
+	if (ctx->pos < 2) {
+		file->private_data = (void *)0;
+		if (!dir_emit_dots(file, ctx))
 			return 0;
-		filp->f_pos = f_pos = 1;
-		stored++;
-	}
-	if (f_pos == 1) {
-		if (filldir(dirent, "..", 2, f_pos, parent_ino(filp->f_path.dentry), DT_DIR) < 0)
-			return stored;
-		filp->f_pos = f_pos = 2;
-		stored++;
 	}

 	affs_lock_dir(inode);
-	chain_pos = (f_pos - 2) & 0xffff;
-	hash_pos = (f_pos - 2) >> 16;
+	chain_pos = (ctx->pos - 2) & 0xffff;
+	hash_pos = (ctx->pos - 2) >> 16;
 	if (chain_pos == 0xffff) {
 		affs_warning(sb, "readdir", "More than 65535 entries in chain");
 		chain_pos = 0;
 		hash_pos++;
-		filp->f_pos = ((hash_pos << 16) | chain_pos) + 2;
+		ctx->pos = ((hash_pos << 16) | chain_pos) + 2;
 	}
 	dir_bh = affs_bread(sb, inode->i_ino);
 	if (!dir_bh)
@@ -94,8 +77,8 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	/* If the directory hasn't changed since the last call to readdir(),
 	 * we can jump directly to where we left off.
 	 */
-	ino = (u32)(long)filp->private_data;
-	if (ino && filp->f_version == inode->i_version) {
+	ino = (u32)(long)file->private_data;
+	if (ino && file->f_version == inode->i_version) {
 		pr_debug("AFFS: readdir() left off=%d\n", ino);
 		goto inside;
 	}
@@ -105,7 +88,7 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		fh_bh = affs_bread(sb, ino);
 		if (!fh_bh) {
 			affs_error(sb, "readdir","Cannot read block %d", i);
-			goto readdir_out;
+			return -EIO;
 		}
 		ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 		affs_brelse(fh_bh);
@@ -119,38 +102,34 @@ affs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		ino = be32_to_cpu(AFFS_HEAD(dir_bh)->table[hash_pos]);
 		if (!ino)
 			continue;
-		f_pos = (hash_pos << 16) + 2;
+		ctx->pos = (hash_pos << 16) + 2;
 inside:
 		do {
 			fh_bh = affs_bread(sb, ino);
 			if (!fh_bh) {
 				affs_error(sb, "readdir","Cannot read block %d", ino);
-				goto readdir_done;
+				break;
 			}

 			namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
 			name = AFFS_TAIL(sb, fh_bh)->name + 1;
 			pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
-				 namelen, name, ino, hash_pos, f_pos);
-			if (filldir(dirent, name, namelen, f_pos, ino, DT_UNKNOWN) < 0)
+				 namelen, name, ino, hash_pos, (u32)ctx->pos);
+			if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
 				goto readdir_done;
-			stored++;
-			f_pos++;
+			ctx->pos++;
 			ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 			affs_brelse(fh_bh);
 			fh_bh = NULL;
 		} while (ino);
 	}
readdir_done:
-	filp->f_pos = f_pos;
-	filp->f_version = inode->i_version;
-	filp->private_data = (void *)(long)ino;
-	res = stored;
+	file->f_version = inode->i_version;
+	file->private_data = (void *)(long)ino;

readdir_out:
 	affs_brelse(dir_bh);
 	affs_brelse(fh_bh);
 	affs_unlock_dir(inode);
-	pr_debug("AFFS: readdir()=%d\n", stored);
-	return res;
+	return 0;
 }
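
One affs-specific wrinkle survives the conversion: affs_readdir() caches the block number where iteration stopped in file->private_data and trusts it only while file->f_version still matches inode->i_version, so an unchanged directory can resume without rescanning the hash chains. A sketch of that pattern in the new API, with hypothetical myfs_* cursor helpers standing in for the AFFS block walk:

#include <linux/fs.h>

/* hypothetical helpers, declared only so the sketch is self-contained */
static u32 myfs_cursor_for_pos(struct inode *inode, loff_t pos);
static u32 myfs_emit_from(struct inode *inode, struct dir_context *ctx,
			  u32 cursor);

static int myfs_iterate(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	u32 cursor = (u32)(unsigned long)file->private_data;

	if (!dir_emit_dots(file, ctx))
		return 0;

	/* trust the saved cursor only while the directory is unchanged;
	 * otherwise recompute it from ctx->pos */
	if (!cursor || file->f_version != inode->i_version)
		cursor = myfs_cursor_for_pos(inode, ctx->pos);

	/* emit entries from the cursor onward, advancing ctx->pos */
	cursor = myfs_emit_from(inode, ctx, cursor);

	file->f_version = inode->i_version;
	file->private_data = (void *)(unsigned long)cursor;
	return 0;
}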
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index ff65884a7839..c36cbb4537a2 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,18 +13,12 @@
 typedef int (*toupper_t)(int);

 static int affs_toupper(int ch);
-static int affs_hash_dentry(const struct dentry *,
-		const struct inode *, struct qstr *);
-static int affs_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int affs_hash_dentry(const struct dentry *, struct qstr *);
+static int affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 static int affs_intl_toupper(int ch);
-static int affs_intl_hash_dentry(const struct dentry *,
-		const struct inode *, struct qstr *);
-static int affs_intl_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int affs_intl_hash_dentry(const struct dentry *, struct qstr *);
+static int affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);

 const struct dentry_operations affs_dentry_operations = {
@@ -86,14 +80,12 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
 }

 static int
-affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	return __affs_hash_dentry(qstr, affs_toupper);
 }
 static int
-affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	return __affs_hash_dentry(qstr, affs_intl_toupper);
 }
@@ -131,15 +123,13 @@ static inline int __affs_compare_dentry(unsigned int len,
 }

 static int
-affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return __affs_compare_dentry(len, str, name, affs_toupper);
 }
 static int
-affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return __affs_compare_dentry(len, str, name, affs_intl_toupper);
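
adfs and affs also pick up the companion VFS change in this merge: the d_hash() and d_compare() dentry operations lose their inode arguments in 3.11, since none of the in-tree implementations actually needed them. The post-merge prototypes, shown with an illustrative case-folding implementation (the example_* names are hypothetical; the helpers are the stock name-hash macros from <linux/dcache.h>):

#include <linux/dcache.h>
#include <linux/ctype.h>

static int example_hash(const struct dentry *parent, struct qstr *qstr)
{
	unsigned long hash = init_name_hash();
	unsigned int i;

	/* fold case while hashing so "FOO" and "foo" collide */
	for (i = 0; i < qstr->len; i++)
		hash = partial_name_hash(tolower(qstr->name[i]), hash);
	qstr->hash = end_name_hash(hash);
	return 0;
}

static int example_compare(const struct dentry *parent,
			   const struct dentry *dentry,
			   unsigned int len, const char *str,
			   const struct qstr *name)
{
	unsigned int i;

	if (len != name->len)
		return 1;	/* non-zero means "no match" */
	for (i = 0; i < len; i++)
		if (tolower(str[i]) != tolower(name->name[i]))
			return 1;
	return 0;
}

const struct dentry_operations example_dentry_operations = {
	.d_hash = example_hash,
	.d_compare = example_compare,
};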
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 7a465ed04444..34494fbead0a 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -22,7 +22,7 @@
 static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 				 unsigned int flags);
 static int afs_dir_open(struct inode *inode, struct file *file);
-static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
+static int afs_readdir(struct file *file, struct dir_context *ctx);
 static int afs_d_revalidate(struct dentry *dentry, unsigned int flags);
 static int afs_d_delete(const struct dentry *dentry);
 static void afs_d_release(struct dentry *dentry);
@@ -43,7 +43,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 const struct file_operations afs_dir_file_operations = {
 	.open = afs_dir_open,
 	.release = afs_release,
-	.readdir = afs_readdir,
+	.iterate = afs_readdir,
 	.lock = afs_lock,
 	.llseek = generic_file_llseek,
 };
@@ -119,9 +119,9 @@ struct afs_dir_page {
 };

 struct afs_lookup_cookie {
+	struct dir_context ctx;
 	struct afs_fid fid;
-	const char *name;
-	size_t nlen;
+	struct qstr name;
 	int found;
 };

@@ -228,20 +228,18 @@ static int afs_dir_open(struct inode *inode, struct file *file)
 /*
  * deal with one block in an AFS directory
  */
-static int afs_dir_iterate_block(unsigned *fpos,
+static int afs_dir_iterate_block(struct dir_context *ctx,
 				 union afs_dir_block *block,
-				 unsigned blkoff,
-				 void *cookie,
-				 filldir_t filldir)
+				 unsigned blkoff)
 {
 	union afs_dirent *dire;
 	unsigned offset, next, curr;
 	size_t nlen;
-	int tmp, ret;
+	int tmp;

-	_enter("%u,%x,%p,,",*fpos,blkoff,block);
+	_enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block);

-	curr = (*fpos - blkoff) / sizeof(union afs_dirent);
+	curr = (ctx->pos - blkoff) / sizeof(union afs_dirent);

 	/* walk through the block, an entry at a time */
 	for (offset = AFS_DIRENT_PER_BLOCK - block->pagehdr.nentries;
@@ -256,7 +254,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
 			_debug("ENT[%Zu.%u]: unused",
 			       blkoff / sizeof(union afs_dir_block), offset);
 			if (offset >= curr)
-				*fpos = blkoff +
+				ctx->pos = blkoff +
 					next * sizeof(union afs_dirent);
 			continue;
 		}
@@ -302,19 +300,15 @@ static int afs_dir_iterate_block(unsigned *fpos,
 			continue;

 		/* found the next entry */
-		ret = filldir(cookie,
-			      dire->u.name,
-			      nlen,
-			      blkoff + offset * sizeof(union afs_dirent),
+		if (!dir_emit(ctx, dire->u.name, nlen,
 			      ntohl(dire->u.vnode),
-			      filldir == afs_lookup_filldir ?
-			      ntohl(dire->u.unique) : DT_UNKNOWN);
-		if (ret < 0) {
+			      ctx->actor == afs_lookup_filldir ?
+			      ntohl(dire->u.unique) : DT_UNKNOWN)) {
 			_leave(" = 0 [full]");
 			return 0;
 		}

-		*fpos = blkoff + next * sizeof(union afs_dirent);
+		ctx->pos = blkoff + next * sizeof(union afs_dirent);
 	}

 	_leave(" = 1 [more]");
@@ -324,8 +318,8 @@ static int afs_dir_iterate_block(unsigned *fpos,
 /*
  * iterate through the data blob that lists the contents of an AFS directory
  */
-static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
-			   filldir_t filldir, struct key *key)
+static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
+			   struct key *key)
 {
 	union afs_dir_block *dblock;
 	struct afs_dir_page *dbuf;
@@ -333,7 +327,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 	unsigned blkoff, limit;
 	int ret;

-	_enter("{%lu},%u,,", dir->i_ino, *fpos);
+	_enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);

 	if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
 		_leave(" = -ESTALE");
@@ -341,13 +335,13 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 	}

 	/* round the file position up to the next entry boundary */
-	*fpos += sizeof(union afs_dirent) - 1;
-	*fpos &= ~(sizeof(union afs_dirent) - 1);
+	ctx->pos += sizeof(union afs_dirent) - 1;
+	ctx->pos &= ~(sizeof(union afs_dirent) - 1);

 	/* walk through the blocks in sequence */
 	ret = 0;
-	while (*fpos < dir->i_size) {
-		blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
+	while (ctx->pos < dir->i_size) {
+		blkoff = ctx->pos & ~(sizeof(union afs_dir_block) - 1);

 		/* fetch the appropriate page from the directory */
 		page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
@@ -364,8 +358,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 		do {
 			dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) /
 					       sizeof(union afs_dir_block)];
-			ret = afs_dir_iterate_block(fpos, dblock, blkoff,
-						    cookie, filldir);
+			ret = afs_dir_iterate_block(ctx, dblock, blkoff);
 			if (ret != 1) {
 				afs_dir_put_page(page);
 				goto out;
@@ -373,7 +366,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,

 			blkoff += sizeof(union afs_dir_block);

-		} while (*fpos < dir->i_size && blkoff < limit);
+		} while (ctx->pos < dir->i_size && blkoff < limit);

 		afs_dir_put_page(page);
 		ret = 0;
@@ -387,23 +380,10 @@ out:
 /*
  * read an AFS directory
  */
-static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
+static int afs_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned fpos;
-	int ret;
-
-	_enter("{%Ld,{%lu}}",
-	       file->f_pos, file_inode(file)->i_ino);
-
-	ASSERT(file->private_data != NULL);
-
-	fpos = file->f_pos;
-	ret = afs_dir_iterate(file_inode(file), &fpos,
-			      cookie, filldir, file->private_data);
-	file->f_pos = fpos;
-
-	_leave(" = %d", ret);
-	return ret;
+	return afs_dir_iterate(file_inode(file),
+			       ctx, file->private_data);
 }

 /*
@@ -416,15 +396,16 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
 {
 	struct afs_lookup_cookie *cookie = _cookie;

-	_enter("{%s,%Zu},%s,%u,,%llu,%u",
-	       cookie->name, cookie->nlen, name, nlen,
+	_enter("{%s,%u},%s,%u,,%llu,%u",
+	       cookie->name.name, cookie->name.len, name, nlen,
 	       (unsigned long long) ino, dtype);

 	/* insanity checks first */
 	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
 	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);

-	if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
+	if (cookie->name.len != nlen ||
+	    memcmp(cookie->name.name, name, nlen) != 0) {
 		_leave(" = 0 [no]");
 		return 0;
 	}
@@ -444,24 +425,18 @@ static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
 static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
 			 struct afs_fid *fid, struct key *key)
 {
-	struct afs_lookup_cookie cookie;
-	struct afs_super_info *as;
-	unsigned fpos;
+	struct afs_super_info *as = dir->i_sb->s_fs_info;
+	struct afs_lookup_cookie cookie = {
+		.ctx.actor = afs_lookup_filldir,
+		.name = dentry->d_name,
+		.fid.vid = as->volume->vid
+	};
 	int ret;

 	_enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);

-	as = dir->i_sb->s_fs_info;
-
 	/* search the directory */
-	cookie.name = dentry->d_name.name;
-	cookie.nlen = dentry->d_name.len;
-	cookie.fid.vid = as->volume->vid;
-	cookie.found = 0;
-
-	fpos = 0;
-	ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
-			      key);
+	ret = afs_dir_iterate(dir, &cookie.ctx, key);
 	if (ret < 0) {
 		_leave(" = %d [iter]", ret);
 		return ret;
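
The afs lookup path shows the idiom the new API encourages for in-kernel directory scans: embed a struct dir_context as the first member of a private cookie, point .actor at a matching callback, and feed the cookie's ctx through the same iterator that readdir uses. Because the context is the first member, the actor can cast its argument back to the enclosing cookie. A hedged sketch follows; the myfs_* names are hypothetical and the actor signature is the 3.11-era filldir_t.

#include <linux/fs.h>
#include <linux/string.h>

/* hypothetical: walks a directory, calling ctx->actor for each entry */
static int myfs_iterate_dir(struct inode *dir, struct dir_context *ctx);

struct myfs_lookup_cookie {
	struct dir_context ctx;	/* must stay first for the cast below */
	struct qstr name;
	u64 ino;
	bool found;
};

static int myfs_lookup_actor(void *_cookie, const char *name, int nlen,
			     loff_t pos, u64 ino, unsigned dtype)
{
	struct myfs_lookup_cookie *cookie = _cookie;

	if (cookie->name.len != nlen ||
	    memcmp(cookie->name.name, name, nlen) != 0)
		return 0;	/* not it; keep iterating */

	cookie->ino = ino;
	cookie->found = true;
	return -1;		/* non-zero stops the iteration */
}

static int myfs_do_lookup(struct inode *dir, struct dentry *dentry, u64 *ino)
{
	struct myfs_lookup_cookie cookie = {
		.ctx.actor = myfs_lookup_actor,
		.name = dentry->d_name,
	};
	int ret = myfs_iterate_dir(dir, &cookie.ctx);

	if (ret < 0)
		return ret;
	if (!cookie.found)
		return -ENOENT;
	*ino = cookie.ino;
	return 0;
}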
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8f6e9234d565..66d50fe2ee45 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,8 @@
 #include "internal.h"

 static int afs_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned long offset);
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length);
 static int afs_releasepage(struct page *page, gfp_t gfp_flags);
 static int afs_launder_page(struct page *page);

@@ -310,16 +311,17 @@ static int afs_launder_page(struct page *page)
  * - release a page and clean up its private data if offset is 0 (indicating
  *   the entire page)
  */
-static void afs_invalidatepage(struct page *page, unsigned long offset)
+static void afs_invalidatepage(struct page *page, unsigned int offset,
+			       unsigned int length)
 {
 	struct afs_writeback *wb = (struct afs_writeback *) page_private(page);

-	_enter("{%lu},%lu", page->index, offset);
+	_enter("{%lu},%u,%u", page->index, offset, length);

 	BUG_ON(!PageLocked(page));

 	/* we clean up only if the entire page is being invalidated */
-	if (offset == 0) {
+	if (offset == 0 && length == PAGE_CACHE_SIZE) {
 #ifdef CONFIG_AFS_FSCACHE
 		if (PageFsCache(page)) {
 			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
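
The afs hunks above track another 3.11 interface change: ->invalidatepage() now receives an offset and a length, so the VFS can invalidate a sub-range of a page (groundwork for punch-hole and the ext4 rework elsewhere in this merge). Filesystems that only tear down per-page private state on a full invalidation now have to test both values. A minimal sketch of the new signature, with hypothetical myfs_* names and the 3.11-era PAGE_CACHE_SIZE constant:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static void myfs_invalidatepage(struct page *page, unsigned int offset,
				unsigned int length)
{
	BUG_ON(!PageLocked(page));

	/* only drop private state when the whole page is going away;
	 * a partial invalidation leaves it attached */
	if (offset == 0 && length == PAGE_CACHE_SIZE) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
	}
}

static const struct address_space_operations myfs_aops = {
	.invalidatepage = myfs_invalidatepage,
};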
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 2497bf306c70..a8cf2cff836c 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -252,7 +252,8 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
  */
 static int afs_do_setlk(struct file *file, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct inode *inode = file_inode(file);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_lock_type_t type;
 	struct key *key = file->private_data;
 	int ret;
@@ -273,7 +274,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 
 	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 
 	/* make sure we've got a callback on this file and that our view of the
 	 * data version is up to date */
@@ -420,7 +421,7 @@ given_lock:
 	afs_vnode_fetch_status(vnode, NULL, key);
 
 error:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	_leave(" = %d", ret);
 	return ret;
 
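
This hunk reflects the retirement of the global lock_flocks()/unlock_flocks() pair: per-file lock state is now serialized by the owning inode's i_lock. The critical-section shape, sketched for a hypothetical caller:

	static void myfs_walk_flock_state(struct file *file)
	{
		struct inode *inode = file_inode(file);

		spin_lock(&inode->i_lock);
		/* inspect or update this inode's file_lock state */
		spin_unlock(&inode->i_lock);
	}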
diff --git a/fs/aio.c b/fs/aio.c
index 2bbcacf74d0c..9b5ca1137419 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -39,6 +39,8 @@
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
 
+#include "internal.h"
+
 #define AIO_RING_MAGIC			0xa10a10a1
 #define AIO_RING_COMPAT_FEATURES	1
 #define AIO_RING_INCOMPAT_FEATURES	0
@@ -623,7 +625,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
 	/*
 	 * Add a completion event to the ring buffer. Must be done holding
-	 * ctx->ctx_lock to prevent other code from messing with the tail
+	 * ctx->completion_lock to prevent other code from messing with the tail
 	 * pointer since we might be called from irq context.
 	 */
 	spin_lock_irqsave(&ctx->completion_lock, flags);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 13ddec92341c..3d9d3f5d5dda 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -109,7 +109,7 @@ cont:
 
 	spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
 	/* Already gone or negative dentry (under construction) - try next */
-	if (q->d_count == 0 || !simple_positive(q)) {
+	if (!d_count(q) || !simple_positive(q)) {
 		spin_unlock(&q->d_lock);
 		next = q->d_u.d_child.next;
 		goto cont;
@@ -267,7 +267,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 		else
 			ino_count++;
 
-		if (p->d_count > ino_count) {
+		if (d_count(p) > ino_count) {
 			top_ino->last_used = jiffies;
 			dput(p);
 			return 1;
@@ -409,7 +409,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		if (!exp_leaves) {
 			/* Path walk currently on this dentry? */
 			ino_count = atomic_read(&ino->count) + 1;
-			if (dentry->d_count > ino_count)
+			if (d_count(dentry) > ino_count)
 				goto next;
 
 			if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
@@ -423,7 +423,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		} else {
 			/* Path walk currently on this dentry? */
 			ino_count = atomic_read(&ino->count) + 1;
-			if (dentry->d_count > ino_count)
+			if (d_count(dentry) > ino_count)
 				goto next;
 
 			expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
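
These hunks are mechanical substitutions for the new d_count() accessor, which replaces direct reads of dentry->d_count ahead of changes to how the refcount is stored. The busyness test they all encode, as a standalone sketch (helper name is illustrative):

	/* "known_refs" is however many references the caller itself holds */
	static bool dentry_busier_than(struct dentry *dentry, unsigned int known_refs)
	{
		return d_count(dentry) > known_refs;
	}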
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 085da86e07c2..92ef341ba0cf 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -41,7 +41,7 @@ const struct file_operations autofs4_root_operations = {
 	.open		= dcache_dir_open,
 	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 	.unlocked_ioctl	= autofs4_root_ioctl,
 #ifdef CONFIG_COMPAT
@@ -53,7 +53,7 @@ const struct file_operations autofs4_dir_operations = {
 	.open		= autofs4_dir_open,
 	.release	= dcache_dir_close,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.llseek		= dcache_dir_lseek,
 };
 
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 		spin_lock(&active->d_lock);
 
 		/* Already gone? */
-		if (active->d_count == 0)
+		if (!d_count(active))
 			goto next;
 
 		qstr = &active->d_name;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 922ad460bff9..7c93953030fb 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -45,7 +45,7 @@ static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	return -EIO;
 }
 
-static int bad_file_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int bad_file_readdir(struct file *file, struct dir_context *ctx)
 {
 	return -EIO;
 }
@@ -152,7 +152,7 @@ static const struct file_operations bad_file_ops =
 	.write		= bad_file_write,
 	.aio_read	= bad_file_aio_read,
 	.aio_write	= bad_file_aio_write,
-	.readdir	= bad_file_readdir,
+	.iterate	= bad_file_readdir,
 	.poll		= bad_file_poll,
 	.unlocked_ioctl	= bad_file_unlocked_ioctl,
 	.compat_ioctl	= bad_file_compat_ioctl,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index f95dddced968..e9c75e20db32 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -31,7 +31,7 @@ MODULE_LICENSE("GPL");
 /* The units the vfs expects inode->i_blocks to be in */
 #define VFS_BLOCK_SIZE 512
 
-static int befs_readdir(struct file *, void *, filldir_t);
+static int befs_readdir(struct file *, struct dir_context *);
 static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 static int befs_readpage(struct file *file, struct page *page);
 static sector_t befs_bmap(struct address_space *mapping, sector_t block);
@@ -66,7 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
 
 static const struct file_operations befs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= befs_readdir,
+	.iterate	= befs_readdir,
 	.llseek		= generic_file_llseek,
 };
 
@@ -211,9 +211,9 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 }
 
 static int
-befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+befs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	befs_data_stream *ds = &BEFS_I(inode)->i_data.ds;
 	befs_off_t value;
@@ -221,15 +221,14 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	size_t keysize;
 	unsigned char d_type;
 	char keybuf[BEFS_NAME_LEN + 1];
-	char *nlsname;
-	int nlsnamelen;
-	const char *dirname = filp->f_path.dentry->d_name.name;
+	const char *dirname = file->f_path.dentry->d_name.name;
 
 	befs_debug(sb, "---> befs_readdir() "
-		   "name %s, inode %ld, filp->f_pos %Ld",
-		   dirname, inode->i_ino, filp->f_pos);
+		   "name %s, inode %ld, ctx->pos %Ld",
+		   dirname, inode->i_ino, ctx->pos);
 
-	result = befs_btree_read(sb, ds, filp->f_pos, BEFS_NAME_LEN + 1,
+more:
+	result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
 				 keybuf, &keysize, &value);
 
 	if (result == BEFS_ERR) {
@@ -251,24 +250,29 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	/* Convert to NLS */
 	if (BEFS_SB(sb)->nls) {
+		char *nlsname;
+		int nlsnamelen;
 		result =
 		    befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
 		if (result < 0) {
 			befs_debug(sb, "<--- befs_readdir() ERROR");
 			return result;
 		}
-		result = filldir(dirent, nlsname, nlsnamelen, filp->f_pos,
-				 (ino_t) value, d_type);
+		if (!dir_emit(ctx, nlsname, nlsnamelen,
+			      (ino_t) value, d_type)) {
+			kfree(nlsname);
+			return 0;
+		}
 		kfree(nlsname);
-
 	} else {
-		result = filldir(dirent, keybuf, keysize, filp->f_pos,
-				 (ino_t) value, d_type);
+		if (!dir_emit(ctx, keybuf, keysize,
+			      (ino_t) value, d_type))
+			return 0;
 	}
-	if (!result)
-		filp->f_pos++;
+	ctx->pos++;
+	goto more;
 
-	befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos);
+	befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos);
 
 	return 0;
 }
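
The befs conversion shows the full iterate() contract in one place: read the entry at ctx->pos, hand it to dir_emit(), stop and return 0 when the destination buffer is full (dir_emit() returns false), and only advance ctx->pos once an entry has been emitted. Condensed into a sketch for an imaginary table-backed directory (the MYFS_* names and myfs_* helpers are stand-ins, not real API):

	static int myfs_readdir(struct file *file, struct dir_context *ctx)
	{
		struct inode *inode = file_inode(file);

		while (ctx->pos < myfs_nr_entries(inode)) {
			struct myfs_dirent *de = myfs_entry(inode, ctx->pos);

			if (de->ino &&
			    !dir_emit(ctx, de->name,
				      strnlen(de->name, MYFS_NAME_LEN),
				      de->ino, DT_UNKNOWN))
				return 0;	/* buffer full: resume at ctx->pos later */
			ctx->pos++;
		}
		return 0;
	}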
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 3f422f6bb5ca..a399e6d9dc74 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -26,58 +26,51 @@ static struct buffer_head *bfs_find_entry(struct inode *dir,
 			const unsigned char *name, int namelen,
 			struct bfs_dirent **res_dir);
 
-static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
+static int bfs_readdir(struct file *f, struct dir_context *ctx)
 {
 	struct inode *dir = file_inode(f);
 	struct buffer_head *bh;
 	struct bfs_dirent *de;
-	struct bfs_sb_info *info = BFS_SB(dir->i_sb);
 	unsigned int offset;
 	int block;
 
-	mutex_lock(&info->bfs_lock);
-
-	if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
+	if (ctx->pos & (BFS_DIRENT_SIZE - 1)) {
 		printf("Bad f_pos=%08lx for %s:%08lx\n",
-					(unsigned long)f->f_pos,
+					(unsigned long)ctx->pos,
 					dir->i_sb->s_id, dir->i_ino);
-		mutex_unlock(&info->bfs_lock);
-		return -EBADF;
+		return -EINVAL;
 	}
 
-	while (f->f_pos < dir->i_size) {
-		offset = f->f_pos & (BFS_BSIZE - 1);
-		block = BFS_I(dir)->i_sblock + (f->f_pos >> BFS_BSIZE_BITS);
+	while (ctx->pos < dir->i_size) {
+		offset = ctx->pos & (BFS_BSIZE - 1);
+		block = BFS_I(dir)->i_sblock + (ctx->pos >> BFS_BSIZE_BITS);
 		bh = sb_bread(dir->i_sb, block);
 		if (!bh) {
-			f->f_pos += BFS_BSIZE - offset;
+			ctx->pos += BFS_BSIZE - offset;
 			continue;
 		}
 		do {
 			de = (struct bfs_dirent *)(bh->b_data + offset);
 			if (de->ino) {
 				int size = strnlen(de->name, BFS_NAMELEN);
-				if (filldir(dirent, de->name, size, f->f_pos,
+				if (!dir_emit(ctx, de->name, size,
 						le16_to_cpu(de->ino),
-						DT_UNKNOWN) < 0) {
+						DT_UNKNOWN)) {
 					brelse(bh);
-					mutex_unlock(&info->bfs_lock);
 					return 0;
 				}
 			}
 			offset += BFS_DIRENT_SIZE;
-			f->f_pos += BFS_DIRENT_SIZE;
-		} while ((offset < BFS_BSIZE) && (f->f_pos < dir->i_size));
+			ctx->pos += BFS_DIRENT_SIZE;
+		} while ((offset < BFS_BSIZE) && (ctx->pos < dir->i_size));
 		brelse(bh);
 	}
-
-	mutex_unlock(&info->bfs_lock);
 	return 0;
 }
 
 const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= bfs_readdir,
+	.iterate	= bfs_readdir,
 	.fsync		= generic_file_fsync,
 	.llseek		= generic_file_llseek,
 };
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index bce87694f7b0..89dec7f789a4 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -255,8 +255,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
 		(current->mm->start_data = N_DATADDR(ex));
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
-	current->mm->free_area_cache = current->mm->mmap_base;
-	current->mm->cached_hole_size = 0;
 
 	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
 	if (retval < 0) {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index f8a0b0efda44..100edcc5e312 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -738,8 +738,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
-	current->mm->free_area_cache = current->mm->mmap_base;
-	current->mm->cached_hole_size = 0;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
 				 executable_stack);
 	if (retval < 0) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2091db8cdd78..c7bda5cd3da7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -58,17 +58,24 @@ static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
 	struct backing_dev_info *old = inode->i_data.backing_dev_info;
+	bool wakeup_bdi = false;
 
 	if (unlikely(dst == old))		/* deadlock avoidance */
 		return;
 	bdi_lock_two(&old->wb, &dst->wb);
 	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
-	if (inode->i_state & I_DIRTY)
+	if (inode->i_state & I_DIRTY) {
+		if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
+			wakeup_bdi = true;
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+	}
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&old->wb.list_lock);
 	spin_unlock(&dst->wb.list_lock);
+
+	if (wakeup_bdi)
+		bdi_wakeup_thread_delayed(dst);
 }
 
 /* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -325,31 +332,10 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
 static loff_t block_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *bd_inode = file->f_mapping->host;
-	loff_t size;
 	loff_t retval;
 
 	mutex_lock(&bd_inode->i_mutex);
-	size = i_size_read(bd_inode);
-
-	retval = -EINVAL;
-	switch (whence) {
-	case SEEK_END:
-		offset += size;
-		break;
-	case SEEK_CUR:
-		offset += file->f_pos;
-	case SEEK_SET:
-		break;
-	default:
-		goto out;
-	}
-	if (offset >= 0 && offset <= size) {
-		if (offset != file->f_pos) {
-			file->f_pos = offset;
-		}
-		retval = offset;
-	}
-out:
+	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
 	mutex_unlock(&bd_inode->i_mutex);
 	return retval;
 }
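
The rewrite above delegates the whence handling to the fixed_size_llseek() helper, which implements SEEK_SET/SEEK_CUR/SEEK_END against a caller-supplied size with the same bounds semantics the open-coded switch had. Any fixed-size file can take the same shape under its own serialization; a sketch:

	static loff_t myfs_llseek(struct file *file, loff_t offset, int whence)
	{
		struct inode *inode = file->f_mapping->host;
		loff_t ret;

		mutex_lock(&inode->i_mutex);	/* matches the locking above */
		ret = fixed_size_llseek(file, offset, whence, i_size_read(inode));
		mutex_unlock(&inode->i_mutex);
		return ret;
	}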
@@ -1583,6 +1569,7 @@ static const struct address_space_operations def_blk_aops = {
 	.writepages	= generic_writepages,
 	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
+	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
 const struct file_operations def_blk_fops = {
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 290e347b6db3..8bc5e8ccb091 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -36,16 +36,23 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
 			      u64 extent_item_pos,
 			      struct extent_inode_elem **eie)
 {
-	u64 data_offset;
-	u64 data_len;
+	u64 offset = 0;
 	struct extent_inode_elem *e;
 
-	data_offset = btrfs_file_extent_offset(eb, fi);
-	data_len = btrfs_file_extent_num_bytes(eb, fi);
+	if (!btrfs_file_extent_compression(eb, fi) &&
+	    !btrfs_file_extent_encryption(eb, fi) &&
+	    !btrfs_file_extent_other_encoding(eb, fi)) {
+		u64 data_offset;
+		u64 data_len;
 
-	if (extent_item_pos < data_offset ||
-	    extent_item_pos >= data_offset + data_len)
-		return 1;
+		data_offset = btrfs_file_extent_offset(eb, fi);
+		data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+		if (extent_item_pos < data_offset ||
+		    extent_item_pos >= data_offset + data_len)
+			return 1;
+		offset = extent_item_pos - data_offset;
+	}
 
 	e = kmalloc(sizeof(*e), GFP_NOFS);
 	if (!e)
@@ -53,7 +60,7 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
 
 	e->next = *eie;
 	e->inum = key->objectid;
-	e->offset = key->offset + (extent_item_pos - data_offset);
+	e->offset = key->offset + offset;
 	*eie = e;
 
 	return 0;
@@ -189,7 +196,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 	struct extent_buffer *eb;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
-	struct extent_inode_elem *eie = NULL;
+	struct extent_inode_elem *eie = NULL, *old = NULL;
 	u64 disk_byte;
 
 	if (level != 0) {
@@ -223,6 +230,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 
 		if (disk_byte == wanted_disk_byte) {
 			eie = NULL;
+			old = NULL;
 			if (extent_item_pos) {
 				ret = check_extent_in_eb(&key, eb, fi,
 						*extent_item_pos,
@@ -230,18 +238,20 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 				if (ret < 0)
 					break;
 			}
-			if (!ret) {
-				ret = ulist_add(parents, eb->start,
-						(uintptr_t)eie, GFP_NOFS);
-				if (ret < 0)
-					break;
-				if (!extent_item_pos) {
-					ret = btrfs_next_old_leaf(root, path,
-							time_seq);
-					continue;
-				}
+			if (ret > 0)
+				goto next;
+			ret = ulist_add_merge(parents, eb->start,
+					      (uintptr_t)eie,
+					      (u64 *)&old, GFP_NOFS);
+			if (ret < 0)
+				break;
+			if (!ret && extent_item_pos) {
+				while (old->next)
+					old = old->next;
+				old->next = eie;
 			}
 		}
+next:
 		ret = btrfs_next_old_item(root, path, time_seq);
 	}
 
@@ -255,13 +265,11 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
  * to a logical address
  */
 static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
-				  int search_commit_root,
-				  u64 time_seq,
-				  struct __prelim_ref *ref,
-				  struct ulist *parents,
-				  const u64 *extent_item_pos)
+				  struct btrfs_path *path, u64 time_seq,
+				  struct __prelim_ref *ref,
+				  struct ulist *parents,
+				  const u64 *extent_item_pos)
 {
-	struct btrfs_path *path;
 	struct btrfs_root *root;
 	struct btrfs_key root_key;
 	struct extent_buffer *eb;
@@ -269,11 +277,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	int root_level;
 	int level = ref->level;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	path->search_commit_root = !!search_commit_root;
-
 	root_key.objectid = ref->root_id;
 	root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root_key.offset = (u64)-1;
@@ -314,7 +317,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 					time_seq, ref->wanted_disk_byte,
 					extent_item_pos);
 out:
-	btrfs_free_path(path);
+	path->lowest_level = 0;
+	btrfs_release_path(path);
 	return ret;
 }
 
@@ -322,7 +326,7 @@ out:
  * resolve all indirect backrefs from the list
 */
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
-				   int search_commit_root, u64 time_seq,
+				   struct btrfs_path *path, u64 time_seq,
 				   struct list_head *head,
 				   const u64 *extent_item_pos)
 {
@@ -349,9 +353,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			continue;
 		if (ref->count == 0)
 			continue;
-		err = __resolve_indirect_ref(fs_info, search_commit_root,
-					     time_seq, ref, parents,
-					     extent_item_pos);
+		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+					     parents, extent_item_pos);
 		if (err == -ENOMEM)
 			goto out;
 		if (err)
@@ -604,6 +607,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 	int slot;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
+	struct btrfs_key found_key;
 	unsigned long ptr;
 	unsigned long end;
 	struct btrfs_extent_item *ei;
@@ -621,17 +625,21 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 	flags = btrfs_extent_flags(leaf, ei);
+	btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 	ptr = (unsigned long)(ei + 1);
 	end = (unsigned long)ei + item_size;
 
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+	    flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		struct btrfs_tree_block_info *info;
 
 		info = (struct btrfs_tree_block_info *)ptr;
 		*info_level = btrfs_tree_block_level(leaf, info);
 		ptr += sizeof(struct btrfs_tree_block_info);
 		BUG_ON(ptr > end);
+	} else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+		*info_level = found_key.offset;
 	} else {
 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
 	}
@@ -795,7 +803,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_head *head;
 	int info_level = 0;
 	int ret;
-	int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
 	struct list_head prefs_delayed;
 	struct list_head prefs;
 	struct __prelim_ref *ref;
@@ -804,13 +811,17 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	INIT_LIST_HEAD(&prefs_delayed);
 
 	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.offset = (u64)-1;
+	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->search_commit_root = !!search_commit_root;
+	if (!trans)
+		path->search_commit_root = 1;
 
 	/*
 	 * grab both a lock on the path and a lock on the delayed ref head.
@@ -825,7 +836,7 @@ again:
 		goto out;
 	BUG_ON(ret == 0);
 
-	if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+	if (trans) {
 		/*
 		 * look if there are updates for this ref queued and lock the
 		 * head
@@ -869,7 +880,8 @@ again:
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid == bytenr &&
-		    key.type == BTRFS_EXTENT_ITEM_KEY) {
+		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
+		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = __add_inline_refs(fs_info, path, bytenr,
 						&info_level, &prefs);
 			if (ret)
@@ -890,8 +902,8 @@ again:
 
 	__merge_refs(&prefs, 1);
 
-	ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
-				      &prefs, extent_item_pos);
+	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
				      extent_item_pos);
 	if (ret)
 		goto out;
 
@@ -1283,12 +1295,16 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 {
 	int ret;
 	u64 flags;
+	u64 size = 0;
 	u32 item_size;
 	struct extent_buffer *eb;
 	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
 
-	key.type = BTRFS_EXTENT_ITEM_KEY;
+	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.objectid = logical;
 	key.offset = (u64)-1;
 
@@ -1301,9 +1317,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 		return ret;
 
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
-	if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
+	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+		size = fs_info->extent_root->leafsize;
+	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+		size = found_key->offset;
+
+	if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
+	     found_key->type != BTRFS_METADATA_ITEM_KEY) ||
 	    found_key->objectid > logical ||
-	    found_key->objectid + found_key->offset <= logical) {
+	    found_key->objectid + size <= logical) {
 		pr_debug("logical %llu is not within any extent\n",
 			 (unsigned long long)logical);
 		return -ENOENT;
@@ -1459,7 +1481,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
 	int ret;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	struct ulist *refs = NULL;
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
@@ -1471,9 +1493,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	pr_debug("resolving all inodes for extent %llu\n",
 		 extent_item_objectid);
 
-	if (search_commit_root) {
-		trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
-	} else {
+	if (!search_commit_root) {
 		trans = btrfs_join_transaction(fs_info->extent_root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
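
A recurring motif in the backref changes is support for the SKINNY_METADATA incompat format, where tree blocks are described by BTRFS_METADATA_ITEM_KEY items (with the block level in the key offset) instead of full extent items. Lookups therefore pick the key type from the filesystem's feature bits, roughly as below (the helper name is illustrative; the logic is lifted straight from the hunks above):

	static void init_extent_search_key(struct btrfs_fs_info *fs_info,
					   u64 bytenr, struct btrfs_key *key)
	{
		key->objectid = bytenr;
		key->offset = (u64)-1;
		/* skinny filesystems index tree blocks by metadata items */
		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
			key->type = BTRFS_METADATA_ITEM_KEY;
		else
			key->type = BTRFS_EXTENT_ITEM_KEY;
	}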
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0f446d7ca2c0..8f2e76702932 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,8 +23,6 @@
 #include "ulist.h"
 #include "extent_io.h"
 
-#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)
-
 struct inode_fs_paths {
 	struct btrfs_path *btrfs_path;
 	struct btrfs_root *fs_root;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 02fae7f7e42c..ed504607d8ec 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1089,7 +1089,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
-		tree_mod_log_free_eb(root->fs_info, buf);
+		if (last_ref)
+			tree_mod_log_free_eb(root->fs_info, buf);
 		btrfs_free_tree_block(trans, root, buf, parent_start,
 				      last_ref);
 	}
@@ -1161,8 +1162,8 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
 * time_seq).
 */
 static void
-__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
-		      struct tree_mod_elem *first_tm)
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+		      u64 time_seq, struct tree_mod_elem *first_tm)
 {
 	u32 n;
 	struct rb_node *next;
@@ -1172,6 +1173,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 	unsigned long p_size = sizeof(struct btrfs_key_ptr);
 
 	n = btrfs_header_nritems(eb);
+	tree_mod_log_read_lock(fs_info);
 	while (tm && tm->seq >= time_seq) {
 		/*
 		 * all the operations are recorded with the operator used for
@@ -1226,6 +1228,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 		if (tm->index != first_tm->index)
 			break;
 	}
+	tree_mod_log_read_unlock(fs_info);
 	btrfs_set_header_nritems(eb, n);
 }
 
@@ -1268,13 +1271,12 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 		BUG_ON(!eb_rewin);
 	}
 
-	extent_buffer_get(eb_rewin);
 	btrfs_tree_read_unlock(eb);
 	free_extent_buffer(eb);
 
 	extent_buffer_get(eb_rewin);
 	btrfs_tree_read_lock(eb_rewin);
-	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
+	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
 	WARN_ON(btrfs_header_nritems(eb_rewin) >
 		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
 
@@ -1350,7 +1352,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 		btrfs_set_header_generation(eb, old_generation);
 	}
 	if (tm)
-		__tree_mod_log_rewind(eb, time_seq, tm);
+		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
 	else
 		WARN_ON(btrfs_header_level(eb) != 0);
 	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
@@ -2178,12 +2180,8 @@ static void reada_for_search(struct btrfs_root *root,
 	}
 }
 
-/*
- * returns -EAGAIN if it had to drop the path, or zero if everything was in
- * cache
- */
-static noinline int reada_for_balance(struct btrfs_root *root,
-				      struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_root *root,
+				       struct btrfs_path *path, int level)
 {
 	int slot;
 	int nritems;
@@ -2192,12 +2190,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 	u64 gen;
 	u64 block1 = 0;
 	u64 block2 = 0;
-	int ret = 0;
 	int blocksize;
 
 	parent = path->nodes[level + 1];
 	if (!parent)
-		return 0;
+		return;
 
 	nritems = btrfs_header_nritems(parent);
 	slot = path->slots[level + 1];
@@ -2224,28 +2221,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
 			block2 = 0;
 		free_extent_buffer(eb);
 	}
-	if (block1 || block2) {
-		ret = -EAGAIN;
-
-		/* release the whole path */
-		btrfs_release_path(path);
 
-		/* read the blocks */
-		if (block1)
-			readahead_tree_block(root, block1, blocksize, 0);
-		if (block2)
-			readahead_tree_block(root, block2, blocksize, 0);
-
-		if (block1) {
-			eb = read_tree_block(root, block1, blocksize, 0);
-			free_extent_buffer(eb);
-		}
-		if (block2) {
-			eb = read_tree_block(root, block2, blocksize, 0);
-			free_extent_buffer(eb);
-		}
-	}
-	return ret;
+	if (block1)
+		readahead_tree_block(root, block1, blocksize, 0);
+	if (block2)
+		readahead_tree_block(root, block2, blocksize, 0);
 }
 
 
@@ -2359,35 +2339,28 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
 	if (tmp) {
 		/* first we do an atomic uptodate check */
-		if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
-			if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-				/*
-				 * we found an up to date block without
-				 * sleeping, return
-				 * right away
-				 */
-				*eb_ret = tmp;
-				return 0;
-			}
-			/* the pages were up to date, but we failed
-			 * the generation number check.  Do a full
-			 * read for the generation number that is correct.
-			 * We must do this without dropping locks so
-			 * we can trust our generation number
-			 */
-			free_extent_buffer(tmp);
-			btrfs_set_path_blocking(p);
+		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+			*eb_ret = tmp;
+			return 0;
+		}
 
-			/* now we're allowed to do a blocking uptodate check */
-			tmp = read_tree_block(root, blocknr, blocksize, gen);
-			if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
-				*eb_ret = tmp;
-				return 0;
-			}
-			free_extent_buffer(tmp);
-			btrfs_release_path(p);
-			return -EIO;
+		/* the pages were up to date, but we failed
+		 * the generation number check.  Do a full
+		 * read for the generation number that is correct.
+		 * We must do this without dropping locks so
+		 * we can trust our generation number
+		 */
+		btrfs_set_path_blocking(p);
+
+		/* now we're allowed to do a blocking uptodate check */
+		ret = btrfs_read_buffer(tmp, gen);
+		if (!ret) {
+			*eb_ret = tmp;
+			return 0;
 		}
+		free_extent_buffer(tmp);
+		btrfs_release_path(p);
+		return -EIO;
 	}
 
 	/*
@@ -2448,11 +2421,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 			goto again;
 		}
 
-		sret = reada_for_balance(root, p, level);
-		if (sret)
-			goto again;
-
 		btrfs_set_path_blocking(p);
+		reada_for_balance(root, p, level);
 		sret = split_node(trans, root, p, level);
 		btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -2472,11 +2442,8 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 			goto again;
 		}
 
-		sret = reada_for_balance(root, p, level);
-		if (sret)
-			goto again;
-
 		btrfs_set_path_blocking(p);
+		reada_for_balance(root, p, level);
 		sret = balance_level(trans, root, p, level);
 		btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -3143,7 +3110,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 */
 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
-			   struct btrfs_path *path, int level, int log_removal)
+			   struct btrfs_path *path, int level)
 {
 	u64 lower_gen;
 	struct extent_buffer *lower;
@@ -3194,7 +3161,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(c);
 
 	old = root->node;
-	tree_mod_log_set_root_pointer(root, c, log_removal);
+	tree_mod_log_set_root_pointer(root, c, 0);
 	rcu_assign_pointer(root->node, c);
 
 	/* the super has an extra ref to root->node */
@@ -3278,14 +3245,14 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 		/*
 		 * trying to split the root, lets make a new one
 		 *
-		 * tree mod log: We pass 0 as log_removal parameter to
+		 * tree mod log: We don't log_removal old root in
 		 * insert_new_root, because that root buffer will be kept as a
 		 * normal node. We are going to log removal of half of the
 		 * elements below with tree_mod_log_eb_copy. We're holding a
 		 * tree lock on the buffer, which is why we cannot race with
 		 * other tree_mod_log users.
 		 */
-		ret = insert_new_root(trans, root, path, level + 1, 0);
+		ret = insert_new_root(trans, root, path, level + 1);
 		if (ret)
 			return ret;
 	} else {
@@ -3986,7 +3953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 		return -EOVERFLOW;
 
 	/* first try to make some room by pushing left and right */
-	if (data_size) {
+	if (data_size && path->nodes[1]) {
 		wret = push_leaf_right(trans, root, path, data_size,
 				       data_size, 0, 0);
 		if (wret < 0)
@@ -4005,7 +3972,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	}
 
 	if (!path->nodes[1]) {
-		ret = insert_new_root(trans, root, path, 1, 1);
+		ret = insert_new_root(trans, root, path, 1);
 		if (ret)
 			return ret;
 	}
@@ -4430,7 +4397,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 }
 
 /*
- * make the item pointed to by the path bigger, data_size is the new size.
+ * make the item pointed to by the path bigger, data_size is the added size.
 */
 void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
 		       u32 data_size)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d6dd49b51ba8..e795bf135e80 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -961,8 +961,8 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
-#define BTRFS_BLOCK_GROUP_RAID5		(1 << 7)
-#define BTRFS_BLOCK_GROUP_RAID6		(1 << 8)
+#define BTRFS_BLOCK_GROUP_RAID5		(1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6		(1ULL << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
 
 enum btrfs_raid_types {
@@ -1102,6 +1102,18 @@ struct btrfs_space_info {
 				   account */
 
 	/*
+	 * bytes_pinned is kept in line with what is actually pinned, as in
+	 * we've called update_block_group and dropped the bytes_used counter
+	 * and increased the bytes_pinned counter. However this means that
+	 * bytes_pinned does not reflect the bytes that will be pinned once the
+	 * delayed refs are flushed, so this counter is inc'ed everytime we call
+	 * btrfs_free_extent so it is a realtime count of what will be freed
+	 * once the transaction is committed. It will be zero'ed everytime the
+	 * transaction commits.
+	 */
+	struct percpu_counter total_bytes_pinned;
+
+	/*
 	 * we bump reservation progress every time we decrement
 	 * bytes_reserved. This way people waiting for reservations
 	 * know something good has happened and they can check
@@ -1437,25 +1449,22 @@ struct btrfs_fs_info {
 	atomic_t open_ioctl_trans;
 
 	/*
-	 * this is used by the balancing code to wait for all the pending
-	 * ordered extents
+	 * this is used to protect the following list -- ordered_roots.
 	 */
-	spinlock_t ordered_extent_lock;
+	spinlock_t ordered_root_lock;
 
 	/*
-	 * all of the data=ordered extents pending writeback
+	 * all fs/file tree roots in which there are data=ordered extents
+	 * pending writeback are added into this list.
+	 *
 	 * these can span multiple transactions and basically include
 	 * every dirty data page that isn't from nodatacow
 	 */
-	struct list_head ordered_extents;
+	struct list_head ordered_roots;
 
-	spinlock_t delalloc_lock;
-	/*
-	 * all of the inodes that have delalloc bytes.  It is possible for
-	 * this list to be empty even when there is still dirty data=ordered
-	 * extents waiting to finish IO.
-	 */
-	struct list_head delalloc_inodes;
+	spinlock_t delalloc_root_lock;
+	/* all fs/file tree roots that have delalloc inodes. */
+	struct list_head delalloc_roots;
 
 	/*
 	 * there is a pool of worker threads for checksumming during writes
@@ -1498,8 +1507,6 @@ struct btrfs_fs_info {
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
-	int enospc_unlink;
-	int trans_no_join;
 
 	u64 total_pinned;
 
@@ -1594,6 +1601,12 @@ struct btrfs_fs_info {
 	struct rb_root qgroup_tree;
 	spinlock_t qgroup_lock;
 
+	/*
+	 * used to avoid frequently calling ulist_alloc()/ulist_free()
+	 * when doing qgroup accounting, it must be protected by qgroup_lock.
+	 */
+	struct ulist *qgroup_ulist;
+
 	/* protect user change for quota operations */
 	struct mutex qgroup_ioctl_lock;
 
@@ -1607,6 +1620,8 @@ struct btrfs_fs_info {
 	struct mutex qgroup_rescan_lock; /* protects the progress item */
 	struct btrfs_key qgroup_rescan_progress;
 	struct btrfs_workers qgroup_rescan_workers;
+	struct completion qgroup_rescan_completion;
+	struct btrfs_work qgroup_rescan_work;
 
 	/* filesystem state */
 	unsigned long fs_state;
@@ -1739,6 +1754,31 @@ struct btrfs_root {
 	int force_cow;
 
 	spinlock_t root_item_lock;
+	atomic_t refs;
+
+	spinlock_t delalloc_lock;
+	/*
+	 * all of the inodes that have delalloc bytes.  It is possible for
+	 * this list to be empty even when there is still dirty data=ordered
+	 * extents waiting to finish IO.
+	 */
+	struct list_head delalloc_inodes;
+	struct list_head delalloc_root;
+	u64 nr_delalloc_inodes;
+	/*
+	 * this is used by the balancing code to wait for all the pending
+	 * ordered extents
+	 */
+	spinlock_t ordered_extent_lock;
+
+	/*
+	 * all of the data=ordered extents pending writeback
+	 * these can span multiple transactions and basically include
+	 * every dirty data page that isn't from nodatacow
+	 */
+	struct list_head ordered_extents;
+	struct list_head ordered_root;
+	u64 nr_ordered_extents;
 };
 
 struct btrfs_ioctl_defrag_range_args {
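
The new fields move ordered-extent and delalloc tracking from one global pair of lists in btrfs_fs_info to per-root lists, with fs_info keeping only a list of roots that currently have work pending. A filesystem-wide walk then becomes a two-level iteration, sketched here with the field names added above (the loop body is illustrative):

	static void visit_all_ordered_roots(struct btrfs_fs_info *fs_info)
	{
		struct btrfs_root *root;

		spin_lock(&fs_info->ordered_root_lock);
		list_for_each_entry(root, &fs_info->ordered_roots, ordered_root) {
			/* per-root work, guarded by root->ordered_extent_lock */
		}
		spin_unlock(&fs_info->ordered_root_lock);
	}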
@@ -3028,6 +3068,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
 		num_items;
 }
 
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
@@ -3039,6 +3081,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 				    u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct btrfs_root *root,
+				 struct extent_buffer *eb);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset, u64 bytenr);
@@ -3155,6 +3199,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor);
 void btrfs_block_rsv_release(struct btrfs_root *root,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
@@ -3311,6 +3358,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 	smp_mb();
 	return fs_info->closing;
 }
+
+/*
+ * If we remount the fs to be R/O or umount the fs, the cleaner needn't do
+ * anything except sleeping. This function is used to check the status of
+ * the fs.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+{
+	return (root->fs_info->sb->s_flags & MS_RDONLY ||
+		btrfs_fs_closing(root->fs_info));
+}
+
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
 	kfree(fs_info->balance_ctl);
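
btrfs_need_cleaner_sleep() packages the "read-only or unmounting" test that background work keeps re-checking. A rough sketch of how a cleaner-style kthread might poll it (the loop body and timing are illustrative, not the actual cleaner):

	static int cleaner_loop_sketch(void *arg)
	{
		struct btrfs_root *root = arg;

		while (!kthread_should_stop()) {
			if (!btrfs_need_cleaner_sleep(root)) {
				/* ... run deferred cleanup work here ... */
			}
			schedule_timeout_interruptible(HZ);
		}
		return 0;
	}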
@@ -3357,9 +3416,9 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
 				   struct btrfs_root_item *item);
 void btrfs_read_root_item(struct extent_buffer *eb, int slot,
 			  struct btrfs_root_item *item);
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
-			 btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+		    struct btrfs_path *path, struct btrfs_root_item *root_item,
+		    struct btrfs_key *root_key);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 void btrfs_set_root_node(struct btrfs_root_item *item,
 			 struct extent_buffer *node);
@@ -3493,6 +3552,10 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
 					   size_t pg_offset, u64 start, u64 len,
 					   int create);
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+			      struct inode *inode, u64 offset, u64 *len,
+			      u64 *orig_start, u64 *orig_block_len,
+			      u64 *ram_bytes);
 
 /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
 #if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -3530,6 +3593,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+				    int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -3814,6 +3879,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 			struct btrfs_fs_info *fs_info);
 int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f26f38ccd194..375510913fe7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -535,20 +535,6 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
535 return next; 535 return next;
536} 536}
537 537
538static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
539 u64 root_id)
540{
541 struct btrfs_key root_key;
542
543 if (root->objectid == root_id)
544 return root;
545
546 root_key.objectid = root_id;
547 root_key.type = BTRFS_ROOT_ITEM_KEY;
548 root_key.offset = (u64)-1;
549 return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
550}
551
552static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, 538static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
553 struct btrfs_root *root, 539 struct btrfs_root *root,
554 struct btrfs_delayed_item *item) 540 struct btrfs_delayed_item *item)
@@ -1681,8 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
1681 * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree 1667 * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
1682 * 1668 *
1683 */ 1669 */
1684int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, 1670int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
1685 filldir_t filldir,
1686 struct list_head *ins_list) 1671 struct list_head *ins_list)
1687{ 1672{
1688 struct btrfs_dir_item *di; 1673 struct btrfs_dir_item *di;
@@ -1704,13 +1689,13 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
1704 list_for_each_entry_safe(curr, next, ins_list, readdir_list) { 1689 list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1705 list_del(&curr->readdir_list); 1690 list_del(&curr->readdir_list);
1706 1691
1707 if (curr->key.offset < filp->f_pos) { 1692 if (curr->key.offset < ctx->pos) {
1708 if (atomic_dec_and_test(&curr->refs)) 1693 if (atomic_dec_and_test(&curr->refs))
1709 kfree(curr); 1694 kfree(curr);
1710 continue; 1695 continue;
1711 } 1696 }
1712 1697
1713 filp->f_pos = curr->key.offset; 1698 ctx->pos = curr->key.offset;
1714 1699
1715 di = (struct btrfs_dir_item *)curr->data; 1700 di = (struct btrfs_dir_item *)curr->data;
1716 name = (char *)(di + 1); 1701 name = (char *)(di + 1);
@@ -1719,7 +1704,7 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
1719 d_type = btrfs_filetype_table[di->type]; 1704 d_type = btrfs_filetype_table[di->type];
1720 btrfs_disk_key_to_cpu(&location, &di->location); 1705 btrfs_disk_key_to_cpu(&location, &di->location);
1721 1706
1722 over = filldir(dirent, name, name_len, curr->key.offset, 1707 over = !dir_emit(ctx, name, name_len,
1723 location.objectid, d_type); 1708 location.objectid, d_type);
1724 1709
1725 if (atomic_dec_and_test(&curr->refs)) 1710 if (atomic_dec_and_test(&curr->refs))
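
The hunks above convert btrfs's delayed readdir path from the old filldir callback to the dir_context/dir_emit API the VFS introduced in 3.11: the position now lives in ctx->pos rather than file->f_pos, and dir_emit() returns false once the caller's buffer is full. A minimal sketch of an ->iterate() handler under the new API (the "examplefs" names are hypothetical, not part of this patch):

    #include <linux/fs.h>

    /* Sketch only: emit one fixed entry, tracking position in ctx->pos. */
    static int examplefs_iterate(struct file *file, struct dir_context *ctx)
    {
            if (!dir_emit_dots(file, ctx))  /* emits "." and "..", pos -> 2 */
                    return 0;

            if (ctx->pos == 2) {
                    if (!dir_emit(ctx, "hello", 5, 1001 /* ino */, DT_REG))
                            return 0;       /* buffer full, resume here later */
                    ctx->pos++;
            }
            return 0;
    }
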
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 1d5c5f7abe3e..a4b38f934d14 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -139,8 +139,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
139 struct list_head *del_list); 139 struct list_head *del_list);
140int btrfs_should_delete_dir_index(struct list_head *del_list, 140int btrfs_should_delete_dir_index(struct list_head *del_list,
141 u64 index); 141 u64 index);
142int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, 142int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
143 filldir_t filldir,
144 struct list_head *ins_list); 143 struct list_head *ins_list);
145 144
146/* for init */ 145/* for init */
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 65241f32d3f8..4253ad580e39 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -400,7 +400,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
400 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 400 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
401 btrfs_dev_replace_unlock(dev_replace); 401 btrfs_dev_replace_unlock(dev_replace);
402 402
403 btrfs_wait_ordered_extents(root, 0); 403 btrfs_wait_all_ordered_extents(root->fs_info, 0);
404 404
405 /* force writing the updated state information to disk */ 405 /* force writing the updated state information to disk */
406 trans = btrfs_start_transaction(root, 0); 406 trans = btrfs_start_transaction(root, 0);
@@ -470,12 +470,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
470 * flush all outstanding I/O and inode extent mappings before the 470 * flush all outstanding I/O and inode extent mappings before the
471 * copy operation is declared as being finished 471 * copy operation is declared as being finished
472 */ 472 */
473 ret = btrfs_start_delalloc_inodes(root, 0); 473 ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
474 if (ret) { 474 if (ret) {
475 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 475 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
476 return ret; 476 return ret;
477 } 477 }
478 btrfs_wait_ordered_extents(root, 0); 478 btrfs_wait_all_ordered_extents(root->fs_info, 0);
479 479
480 trans = btrfs_start_transaction(root, 0); 480 trans = btrfs_start_transaction(root, 0);
481 if (IS_ERR(trans)) { 481 if (IS_ERR(trans)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b8b60b660c8f..6b092a1c4e37 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1013,7 +1013,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
1013 return try_release_extent_buffer(page); 1013 return try_release_extent_buffer(page);
1014} 1014}
1015 1015
1016static void btree_invalidatepage(struct page *page, unsigned long offset) 1016static void btree_invalidatepage(struct page *page, unsigned int offset,
1017 unsigned int length)
1017{ 1018{
1018 struct extent_io_tree *tree; 1019 struct extent_io_tree *tree;
1019 tree = &BTRFS_I(page->mapping->host)->io_tree; 1020 tree = &BTRFS_I(page->mapping->host)->io_tree;
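
This follows the 3.11 change that gave ->invalidatepage a length argument, so a sub-page range, not just the tail from offset onward, can be invalidated. A sketch of a handler under the new prototype (hypothetical "examplefs", assuming the PAGE_CACHE_SIZE macro of that era):

    #include <linux/pagemap.h>

    static void examplefs_invalidatepage(struct page *page, unsigned int offset,
                                         unsigned int length)
    {
            /* Whole page going away: drop any private per-page state. */
            if (offset == 0 && length == PAGE_CACHE_SIZE)
                    ClearPagePrivate(page);
    }
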
@@ -1191,6 +1192,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1191 root->objectid = objectid; 1192 root->objectid = objectid;
1192 root->last_trans = 0; 1193 root->last_trans = 0;
1193 root->highest_objectid = 0; 1194 root->highest_objectid = 0;
1195 root->nr_delalloc_inodes = 0;
1196 root->nr_ordered_extents = 0;
1194 root->name = NULL; 1197 root->name = NULL;
1195 root->inode_tree = RB_ROOT; 1198 root->inode_tree = RB_ROOT;
1196 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); 1199 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1199,10 +1202,16 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1199 1202
1200 INIT_LIST_HEAD(&root->dirty_list); 1203 INIT_LIST_HEAD(&root->dirty_list);
1201 INIT_LIST_HEAD(&root->root_list); 1204 INIT_LIST_HEAD(&root->root_list);
1205 INIT_LIST_HEAD(&root->delalloc_inodes);
1206 INIT_LIST_HEAD(&root->delalloc_root);
1207 INIT_LIST_HEAD(&root->ordered_extents);
1208 INIT_LIST_HEAD(&root->ordered_root);
1202 INIT_LIST_HEAD(&root->logged_list[0]); 1209 INIT_LIST_HEAD(&root->logged_list[0]);
1203 INIT_LIST_HEAD(&root->logged_list[1]); 1210 INIT_LIST_HEAD(&root->logged_list[1]);
1204 spin_lock_init(&root->orphan_lock); 1211 spin_lock_init(&root->orphan_lock);
1205 spin_lock_init(&root->inode_lock); 1212 spin_lock_init(&root->inode_lock);
1213 spin_lock_init(&root->delalloc_lock);
1214 spin_lock_init(&root->ordered_extent_lock);
1206 spin_lock_init(&root->accounting_lock); 1215 spin_lock_init(&root->accounting_lock);
1207 spin_lock_init(&root->log_extents_lock[0]); 1216 spin_lock_init(&root->log_extents_lock[0]);
1208 spin_lock_init(&root->log_extents_lock[1]); 1217 spin_lock_init(&root->log_extents_lock[1]);
@@ -1216,6 +1225,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1216 atomic_set(&root->log_writers, 0); 1225 atomic_set(&root->log_writers, 0);
1217 atomic_set(&root->log_batch, 0); 1226 atomic_set(&root->log_batch, 0);
1218 atomic_set(&root->orphan_inodes, 0); 1227 atomic_set(&root->orphan_inodes, 0);
1228 atomic_set(&root->refs, 1);
1219 root->log_transid = 0; 1229 root->log_transid = 0;
1220 root->last_log_commit = 0; 1230 root->last_log_commit = 0;
1221 extent_io_tree_init(&root->dirty_log_pages, 1231 extent_io_tree_init(&root->dirty_log_pages,
@@ -1234,39 +1244,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1234 spin_lock_init(&root->root_item_lock); 1244 spin_lock_init(&root->root_item_lock);
1235} 1245}
1236 1246
1237static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
1238 struct btrfs_fs_info *fs_info,
1239 u64 objectid,
1240 struct btrfs_root *root)
1241{
1242 int ret;
1243 u32 blocksize;
1244 u64 generation;
1245
1246 __setup_root(tree_root->nodesize, tree_root->leafsize,
1247 tree_root->sectorsize, tree_root->stripesize,
1248 root, fs_info, objectid);
1249 ret = btrfs_find_last_root(tree_root, objectid,
1250 &root->root_item, &root->root_key);
1251 if (ret > 0)
1252 return -ENOENT;
1253 else if (ret < 0)
1254 return ret;
1255
1256 generation = btrfs_root_generation(&root->root_item);
1257 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1258 root->commit_root = NULL;
1259 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1260 blocksize, generation);
1261 if (!root->node || !btrfs_buffer_uptodate(root->node, generation, 0)) {
1262 free_extent_buffer(root->node);
1263 root->node = NULL;
1264 return -EIO;
1265 }
1266 root->commit_root = btrfs_root_node(root);
1267 return 0;
1268}
1269
1270static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info) 1247static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1271{ 1248{
1272 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS); 1249 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
@@ -1451,70 +1428,73 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1451 return 0; 1428 return 0;
1452} 1429}
1453 1430
1454struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 1431struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
1455 struct btrfs_key *location) 1432 struct btrfs_key *key)
1456{ 1433{
1457 struct btrfs_root *root; 1434 struct btrfs_root *root;
1458 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1435 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1459 struct btrfs_path *path; 1436 struct btrfs_path *path;
1460 struct extent_buffer *l;
1461 u64 generation; 1437 u64 generation;
1462 u32 blocksize; 1438 u32 blocksize;
1463 int ret = 0; 1439 int ret;
1464 int slot;
1465 1440
1466 root = btrfs_alloc_root(fs_info); 1441 path = btrfs_alloc_path();
1467 if (!root) 1442 if (!path)
1468 return ERR_PTR(-ENOMEM); 1443 return ERR_PTR(-ENOMEM);
1469 if (location->offset == (u64)-1) { 1444
1470 ret = find_and_setup_root(tree_root, fs_info, 1445 root = btrfs_alloc_root(fs_info);
1471 location->objectid, root); 1446 if (!root) {
1472 if (ret) { 1447 ret = -ENOMEM;
1473 kfree(root); 1448 goto alloc_fail;
1474 return ERR_PTR(ret);
1475 }
1476 goto out;
1477 } 1449 }
1478 1450
1479 __setup_root(tree_root->nodesize, tree_root->leafsize, 1451 __setup_root(tree_root->nodesize, tree_root->leafsize,
1480 tree_root->sectorsize, tree_root->stripesize, 1452 tree_root->sectorsize, tree_root->stripesize,
1481 root, fs_info, location->objectid); 1453 root, fs_info, key->objectid);
1482 1454
1483 path = btrfs_alloc_path(); 1455 ret = btrfs_find_root(tree_root, key, path,
1484 if (!path) { 1456 &root->root_item, &root->root_key);
1485 kfree(root);
1486 return ERR_PTR(-ENOMEM);
1487 }
1488 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1489 if (ret == 0) {
1490 l = path->nodes[0];
1491 slot = path->slots[0];
1492 btrfs_read_root_item(l, slot, &root->root_item);
1493 memcpy(&root->root_key, location, sizeof(*location));
1494 }
1495 btrfs_free_path(path);
1496 if (ret) { 1457 if (ret) {
1497 kfree(root);
1498 if (ret > 0) 1458 if (ret > 0)
1499 ret = -ENOENT; 1459 ret = -ENOENT;
1500 return ERR_PTR(ret); 1460 goto find_fail;
1501 } 1461 }
1502 1462
1503 generation = btrfs_root_generation(&root->root_item); 1463 generation = btrfs_root_generation(&root->root_item);
1504 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1464 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1505 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1465 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1506 blocksize, generation); 1466 blocksize, generation);
1507 if (!root->node || !extent_buffer_uptodate(root->node)) { 1467 if (!root->node) {
1508 ret = (!root->node) ? -ENOMEM : -EIO; 1468 ret = -ENOMEM;
1509 1469 goto find_fail;
1510 free_extent_buffer(root->node); 1470 } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1511 kfree(root); 1471 ret = -EIO;
1512 return ERR_PTR(ret); 1472 goto read_fail;
1513 } 1473 }
1514
1515 root->commit_root = btrfs_root_node(root); 1474 root->commit_root = btrfs_root_node(root);
1516out: 1475out:
1517 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { 1476 btrfs_free_path(path);
1477 return root;
1478
1479read_fail:
1480 free_extent_buffer(root->node);
1481find_fail:
1482 kfree(root);
1483alloc_fail:
1484 root = ERR_PTR(ret);
1485 goto out;
1486}
1487
1488struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1489 struct btrfs_key *location)
1490{
1491 struct btrfs_root *root;
1492
1493 root = btrfs_read_tree_root(tree_root, location);
1494 if (IS_ERR(root))
1495 return root;
1496
1497 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
1518 root->ref_cows = 1; 1498 root->ref_cows = 1;
1519 btrfs_check_and_init_root_item(&root->root_item); 1499 btrfs_check_and_init_root_item(&root->root_item);
1520 } 1500 }
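
The rewrite above replaces scattered kfree()/return pairs with staged unwind labels (read_fail/find_fail/alloc_fail), so each failure path releases exactly what was acquired before it. A generic sketch of the idiom, with all names hypothetical:

    #include <linux/slab.h>
    #include <linux/err.h>

    struct thing {
            void *buf;
    };

    static int thing_read(struct thing *t);    /* hypothetical helper */

    static struct thing *thing_open(void)
    {
            struct thing *t;
            int ret;

            t = kzalloc(sizeof(*t), GFP_NOFS);
            if (!t)
                    return ERR_PTR(-ENOMEM);

            t->buf = kmalloc(4096, GFP_NOFS);
            if (!t->buf) {
                    ret = -ENOMEM;
                    goto free_thing;
            }

            ret = thing_read(t);
            if (ret)
                    goto free_buf;

            return t;

    free_buf:                       /* later labels undo earlier steps */
            kfree(t->buf);
    free_thing:
            kfree(t);
            return ERR_PTR(ret);
    }
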
@@ -1522,6 +1502,66 @@ out:
1522 return root; 1502 return root;
1523} 1503}
1524 1504
1505int btrfs_init_fs_root(struct btrfs_root *root)
1506{
1507 int ret;
1508
1509 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1510 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1511 GFP_NOFS);
1512 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1513 ret = -ENOMEM;
1514 goto fail;
1515 }
1516
1517 btrfs_init_free_ino_ctl(root);
1518 mutex_init(&root->fs_commit_mutex);
1519 spin_lock_init(&root->cache_lock);
1520 init_waitqueue_head(&root->cache_wait);
1521
1522 ret = get_anon_bdev(&root->anon_dev);
1523 if (ret)
1524 goto fail;
1525 return 0;
1526fail:
1527 kfree(root->free_ino_ctl);
1528 kfree(root->free_ino_pinned);
1529 return ret;
1530}
1531
1532struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1533 u64 root_id)
1534{
1535 struct btrfs_root *root;
1536
1537 spin_lock(&fs_info->fs_roots_radix_lock);
1538 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1539 (unsigned long)root_id);
1540 spin_unlock(&fs_info->fs_roots_radix_lock);
1541 return root;
1542}
1543
1544int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1545 struct btrfs_root *root)
1546{
1547 int ret;
1548
1549 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1550 if (ret)
1551 return ret;
1552
1553 spin_lock(&fs_info->fs_roots_radix_lock);
1554 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1555 (unsigned long)root->root_key.objectid,
1556 root);
1557 if (ret == 0)
1558 root->in_radix = 1;
1559 spin_unlock(&fs_info->fs_roots_radix_lock);
1560 radix_tree_preload_end();
1561
1562 return ret;
1563}
1564
1525struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 1565struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1526 struct btrfs_key *location) 1566 struct btrfs_key *location)
1527{ 1567{
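
btrfs_insert_fs_root() above shows the standard radix-tree insertion dance: preload outside the spinlock (the allocation may sleep), insert under the lock, then end the preload. A condensed sketch with hypothetical names:

    #include <linux/radix-tree.h>
    #include <linux/spinlock.h>

    static RADIX_TREE(example_tree, GFP_NOFS);
    static DEFINE_SPINLOCK(example_lock);

    static int example_insert(unsigned long id, void *item)
    {
            int ret;

            /* May sleep, so it runs before the spinlock is taken. */
            ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
            if (ret)
                    return ret;

            spin_lock(&example_lock);
            /* Cannot fail with -ENOMEM now, only with -EEXIST. */
            ret = radix_tree_insert(&example_tree, id, item);
            spin_unlock(&example_lock);
            radix_tree_preload_end();

            return ret;
    }
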
@@ -1542,58 +1582,30 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1542 return fs_info->quota_root ? fs_info->quota_root : 1582 return fs_info->quota_root ? fs_info->quota_root :
1543 ERR_PTR(-ENOENT); 1583 ERR_PTR(-ENOENT);
1544again: 1584again:
1545 spin_lock(&fs_info->fs_roots_radix_lock); 1585 root = btrfs_lookup_fs_root(fs_info, location->objectid);
1546 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1547 (unsigned long)location->objectid);
1548 spin_unlock(&fs_info->fs_roots_radix_lock);
1549 if (root) 1586 if (root)
1550 return root; 1587 return root;
1551 1588
1552 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1589 root = btrfs_read_fs_root(fs_info->tree_root, location);
1553 if (IS_ERR(root)) 1590 if (IS_ERR(root))
1554 return root; 1591 return root;
1555 1592
1556 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); 1593 if (btrfs_root_refs(&root->root_item) == 0) {
1557 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), 1594 ret = -ENOENT;
1558 GFP_NOFS);
1559 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1560 ret = -ENOMEM;
1561 goto fail; 1595 goto fail;
1562 } 1596 }
1563 1597
1564 btrfs_init_free_ino_ctl(root); 1598 ret = btrfs_init_fs_root(root);
1565 mutex_init(&root->fs_commit_mutex);
1566 spin_lock_init(&root->cache_lock);
1567 init_waitqueue_head(&root->cache_wait);
1568
1569 ret = get_anon_bdev(&root->anon_dev);
1570 if (ret) 1599 if (ret)
1571 goto fail; 1600 goto fail;
1572 1601
1573 if (btrfs_root_refs(&root->root_item) == 0) {
1574 ret = -ENOENT;
1575 goto fail;
1576 }
1577
1578 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); 1602 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1579 if (ret < 0) 1603 if (ret < 0)
1580 goto fail; 1604 goto fail;
1581 if (ret == 0) 1605 if (ret == 0)
1582 root->orphan_item_inserted = 1; 1606 root->orphan_item_inserted = 1;
1583 1607
1584 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1608 ret = btrfs_insert_fs_root(fs_info, root);
1585 if (ret)
1586 goto fail;
1587
1588 spin_lock(&fs_info->fs_roots_radix_lock);
1589 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1590 (unsigned long)root->root_key.objectid,
1591 root);
1592 if (ret == 0)
1593 root->in_radix = 1;
1594
1595 spin_unlock(&fs_info->fs_roots_radix_lock);
1596 radix_tree_preload_end();
1597 if (ret) { 1609 if (ret) {
1598 if (ret == -EEXIST) { 1610 if (ret == -EEXIST) {
1599 free_fs_root(root); 1611 free_fs_root(root);
@@ -1601,10 +1613,6 @@ again:
1601 } 1613 }
1602 goto fail; 1614 goto fail;
1603 } 1615 }
1604
1605 ret = btrfs_find_dead_roots(fs_info->tree_root,
1606 root->root_key.objectid);
1607 WARN_ON(ret);
1608 return root; 1616 return root;
1609fail: 1617fail:
1610 free_fs_root(root); 1618 free_fs_root(root);
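
The rewritten lookup also shows the optimistic lookup-or-create race handling: a task that loses the insertion race sees -EEXIST, frees its copy, and retries via "goto again" to pick up the winner's root. A sketch of the flow (example_lookup/example_create/example_free are hypothetical stand-ins for btrfs_lookup_fs_root, btrfs_read_fs_root and free_fs_root; example_insert is the sketch above):

    static struct thing *example_get(unsigned long id)
    {
            struct thing *obj;
            int ret;

    again:
            obj = example_lookup(id);
            if (obj)
                    return obj;

            obj = example_create(id);
            if (IS_ERR(obj))
                    return obj;

            ret = example_insert(id, obj);
            if (ret == -EEXIST) {
                    /* Lost the race: drop our copy, find the winner's. */
                    example_free(obj);
                    goto again;
            }
            if (ret)
                    return ERR_PTR(ret);
            return obj;
    }
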
@@ -1676,21 +1684,37 @@ static void end_workqueue_fn(struct btrfs_work *work)
1676static int cleaner_kthread(void *arg) 1684static int cleaner_kthread(void *arg)
1677{ 1685{
1678 struct btrfs_root *root = arg; 1686 struct btrfs_root *root = arg;
1687 int again;
1679 1688
1680 do { 1689 do {
1681 int again = 0; 1690 again = 0;
1682 1691
1683 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1692 /* Make the cleaner go to sleep early. */
1684 down_read_trylock(&root->fs_info->sb->s_umount)) { 1693 if (btrfs_need_cleaner_sleep(root))
1685 if (mutex_trylock(&root->fs_info->cleaner_mutex)) { 1694 goto sleep;
1686 btrfs_run_delayed_iputs(root); 1695
1687 again = btrfs_clean_one_deleted_snapshot(root); 1696 if (!mutex_trylock(&root->fs_info->cleaner_mutex))
1688 mutex_unlock(&root->fs_info->cleaner_mutex); 1697 goto sleep;
1689 } 1698
1690 btrfs_run_defrag_inodes(root->fs_info); 1699 /*
 1691 up_read(&root->fs_info->sb->s_umount); 1700 * The fs status may have changed during the above check
 1701 * and trylock, so recheck it before doing any work.
1702 */
1703 if (btrfs_need_cleaner_sleep(root)) {
1704 mutex_unlock(&root->fs_info->cleaner_mutex);
1705 goto sleep;
1692 } 1706 }
1693 1707
1708 btrfs_run_delayed_iputs(root);
1709 again = btrfs_clean_one_deleted_snapshot(root);
1710 mutex_unlock(&root->fs_info->cleaner_mutex);
1711
1712 /*
1713 * The defragger has dealt with the R/O remount and umount,
 1714 * so we needn't do anything special here.
1715 */
1716 btrfs_run_defrag_inodes(root->fs_info);
1717sleep:
1694 if (!try_to_freeze() && !again) { 1718 if (!try_to_freeze() && !again) {
1695 set_current_state(TASK_INTERRUPTIBLE); 1719 set_current_state(TASK_INTERRUPTIBLE);
1696 if (!kthread_should_stop()) 1720 if (!kthread_should_stop())
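
The restructured cleaner keeps the canonical kthread idle loop at the bottom: set TASK_INTERRUPTIBLE before the final kthread_should_stop() check, so a concurrent kthread_stop() wakeup cannot fall between the check and the schedule(). A sketch of just that pattern (hypothetical thread, not the btrfs one):

    #include <linux/kthread.h>
    #include <linux/freezer.h>

    static int example_kthread(void *arg)
    {
            int again;

            do {
                    again = 0;
                    /* ... one round of work; set 'again' if more remains ... */

                    if (!try_to_freeze() && !again) {
                            set_current_state(TASK_INTERRUPTIBLE);
                            if (!kthread_should_stop())
                                    schedule();
                            __set_current_state(TASK_RUNNING);
                    }
            } while (!kthread_should_stop());
            return 0;
    }
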
@@ -1724,7 +1748,7 @@ static int transaction_kthread(void *arg)
1724 } 1748 }
1725 1749
1726 now = get_seconds(); 1750 now = get_seconds();
1727 if (!cur->blocked && 1751 if (cur->state < TRANS_STATE_BLOCKED &&
1728 (now < cur->start_time || now - cur->start_time < 30)) { 1752 (now < cur->start_time || now - cur->start_time < 30)) {
1729 spin_unlock(&root->fs_info->trans_lock); 1753 spin_unlock(&root->fs_info->trans_lock);
1730 delay = HZ * 5; 1754 delay = HZ * 5;
@@ -2034,11 +2058,11 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2034 list_del(&gang[0]->root_list); 2058 list_del(&gang[0]->root_list);
2035 2059
2036 if (gang[0]->in_radix) { 2060 if (gang[0]->in_radix) {
2037 btrfs_free_fs_root(fs_info, gang[0]); 2061 btrfs_drop_and_free_fs_root(fs_info, gang[0]);
2038 } else { 2062 } else {
2039 free_extent_buffer(gang[0]->node); 2063 free_extent_buffer(gang[0]->node);
2040 free_extent_buffer(gang[0]->commit_root); 2064 free_extent_buffer(gang[0]->commit_root);
2041 kfree(gang[0]); 2065 btrfs_put_fs_root(gang[0]);
2042 } 2066 }
2043 } 2067 }
2044 2068
@@ -2049,7 +2073,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2049 if (!ret) 2073 if (!ret)
2050 break; 2074 break;
2051 for (i = 0; i < ret; i++) 2075 for (i = 0; i < ret; i++)
2052 btrfs_free_fs_root(fs_info, gang[i]); 2076 btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2053 } 2077 }
2054} 2078}
2055 2079
@@ -2081,14 +2105,8 @@ int open_ctree(struct super_block *sb,
2081 int backup_index = 0; 2105 int backup_index = 0;
2082 2106
2083 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); 2107 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
2084 extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
2085 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
2086 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 2108 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
2087 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info); 2109 if (!tree_root || !chunk_root) {
2088 quota_root = fs_info->quota_root = btrfs_alloc_root(fs_info);
2089
2090 if (!tree_root || !extent_root || !csum_root ||
2091 !chunk_root || !dev_root || !quota_root) {
2092 err = -ENOMEM; 2110 err = -ENOMEM;
2093 goto fail; 2111 goto fail;
2094 } 2112 }
@@ -2131,9 +2149,9 @@ int open_ctree(struct super_block *sb,
2131 INIT_LIST_HEAD(&fs_info->trans_list); 2149 INIT_LIST_HEAD(&fs_info->trans_list);
2132 INIT_LIST_HEAD(&fs_info->dead_roots); 2150 INIT_LIST_HEAD(&fs_info->dead_roots);
2133 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2151 INIT_LIST_HEAD(&fs_info->delayed_iputs);
2134 INIT_LIST_HEAD(&fs_info->delalloc_inodes); 2152 INIT_LIST_HEAD(&fs_info->delalloc_roots);
2135 INIT_LIST_HEAD(&fs_info->caching_block_groups); 2153 INIT_LIST_HEAD(&fs_info->caching_block_groups);
2136 spin_lock_init(&fs_info->delalloc_lock); 2154 spin_lock_init(&fs_info->delalloc_root_lock);
2137 spin_lock_init(&fs_info->trans_lock); 2155 spin_lock_init(&fs_info->trans_lock);
2138 spin_lock_init(&fs_info->fs_roots_radix_lock); 2156 spin_lock_init(&fs_info->fs_roots_radix_lock);
2139 spin_lock_init(&fs_info->delayed_iput_lock); 2157 spin_lock_init(&fs_info->delayed_iput_lock);
@@ -2169,7 +2187,6 @@ int open_ctree(struct super_block *sb,
2169 fs_info->max_inline = 8192 * 1024; 2187 fs_info->max_inline = 8192 * 1024;
2170 fs_info->metadata_ratio = 0; 2188 fs_info->metadata_ratio = 0;
2171 fs_info->defrag_inodes = RB_ROOT; 2189 fs_info->defrag_inodes = RB_ROOT;
2172 fs_info->trans_no_join = 0;
2173 fs_info->free_chunk_space = 0; 2190 fs_info->free_chunk_space = 0;
2174 fs_info->tree_mod_log = RB_ROOT; 2191 fs_info->tree_mod_log = RB_ROOT;
2175 2192
@@ -2180,8 +2197,8 @@ int open_ctree(struct super_block *sb,
2180 fs_info->thread_pool_size = min_t(unsigned long, 2197 fs_info->thread_pool_size = min_t(unsigned long,
2181 num_online_cpus() + 2, 8); 2198 num_online_cpus() + 2, 8);
2182 2199
2183 INIT_LIST_HEAD(&fs_info->ordered_extents); 2200 INIT_LIST_HEAD(&fs_info->ordered_roots);
2184 spin_lock_init(&fs_info->ordered_extent_lock); 2201 spin_lock_init(&fs_info->ordered_root_lock);
2185 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2202 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
2186 GFP_NOFS); 2203 GFP_NOFS);
2187 if (!fs_info->delayed_root) { 2204 if (!fs_info->delayed_root) {
@@ -2274,6 +2291,7 @@ int open_ctree(struct super_block *sb,
2274 fs_info->qgroup_seq = 1; 2291 fs_info->qgroup_seq = 1;
2275 fs_info->quota_enabled = 0; 2292 fs_info->quota_enabled = 0;
2276 fs_info->pending_quota_state = 0; 2293 fs_info->pending_quota_state = 0;
2294 fs_info->qgroup_ulist = NULL;
2277 mutex_init(&fs_info->qgroup_rescan_lock); 2295 mutex_init(&fs_info->qgroup_rescan_lock);
2278 2296
2279 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 2297 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -2638,33 +2656,44 @@ retry_root_backup:
2638 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2656 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
2639 tree_root->commit_root = btrfs_root_node(tree_root); 2657 tree_root->commit_root = btrfs_root_node(tree_root);
2640 2658
2641 ret = find_and_setup_root(tree_root, fs_info, 2659 location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2642 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2660 location.type = BTRFS_ROOT_ITEM_KEY;
2643 if (ret) 2661 location.offset = 0;
2662
2663 extent_root = btrfs_read_tree_root(tree_root, &location);
2664 if (IS_ERR(extent_root)) {
2665 ret = PTR_ERR(extent_root);
2644 goto recovery_tree_root; 2666 goto recovery_tree_root;
2667 }
2645 extent_root->track_dirty = 1; 2668 extent_root->track_dirty = 1;
2669 fs_info->extent_root = extent_root;
2646 2670
2647 ret = find_and_setup_root(tree_root, fs_info, 2671 location.objectid = BTRFS_DEV_TREE_OBJECTID;
2648 BTRFS_DEV_TREE_OBJECTID, dev_root); 2672 dev_root = btrfs_read_tree_root(tree_root, &location);
2649 if (ret) 2673 if (IS_ERR(dev_root)) {
2674 ret = PTR_ERR(dev_root);
2650 goto recovery_tree_root; 2675 goto recovery_tree_root;
2676 }
2651 dev_root->track_dirty = 1; 2677 dev_root->track_dirty = 1;
2678 fs_info->dev_root = dev_root;
2679 btrfs_init_devices_late(fs_info);
2652 2680
2653 ret = find_and_setup_root(tree_root, fs_info, 2681 location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2654 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2682 csum_root = btrfs_read_tree_root(tree_root, &location);
2655 if (ret) 2683 if (IS_ERR(csum_root)) {
2684 ret = PTR_ERR(csum_root);
2656 goto recovery_tree_root; 2685 goto recovery_tree_root;
2686 }
2657 csum_root->track_dirty = 1; 2687 csum_root->track_dirty = 1;
2688 fs_info->csum_root = csum_root;
2658 2689
2659 ret = find_and_setup_root(tree_root, fs_info, 2690 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2660 BTRFS_QUOTA_TREE_OBJECTID, quota_root); 2691 quota_root = btrfs_read_tree_root(tree_root, &location);
2661 if (ret) { 2692 if (!IS_ERR(quota_root)) {
2662 kfree(quota_root);
2663 quota_root = fs_info->quota_root = NULL;
2664 } else {
2665 quota_root->track_dirty = 1; 2693 quota_root->track_dirty = 1;
2666 fs_info->quota_enabled = 1; 2694 fs_info->quota_enabled = 1;
2667 fs_info->pending_quota_state = 1; 2695 fs_info->pending_quota_state = 1;
2696 fs_info->quota_root = quota_root;
2668 } 2697 }
2669 2698
2670 fs_info->generation = generation; 2699 fs_info->generation = generation;
@@ -2817,11 +2846,9 @@ retry_root_backup:
2817 2846
2818 location.objectid = BTRFS_FS_TREE_OBJECTID; 2847 location.objectid = BTRFS_FS_TREE_OBJECTID;
2819 location.type = BTRFS_ROOT_ITEM_KEY; 2848 location.type = BTRFS_ROOT_ITEM_KEY;
2820 location.offset = (u64)-1; 2849 location.offset = 0;
2821 2850
2822 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 2851 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
2823 if (!fs_info->fs_root)
2824 goto fail_qgroup;
2825 if (IS_ERR(fs_info->fs_root)) { 2852 if (IS_ERR(fs_info->fs_root)) {
2826 err = PTR_ERR(fs_info->fs_root); 2853 err = PTR_ERR(fs_info->fs_root);
2827 goto fail_qgroup; 2854 goto fail_qgroup;
@@ -2853,6 +2880,8 @@ retry_root_backup:
2853 return ret; 2880 return ret;
2854 } 2881 }
2855 2882
2883 btrfs_qgroup_rescan_resume(fs_info);
2884
2856 return 0; 2885 return 0;
2857 2886
2858fail_qgroup: 2887fail_qgroup:
@@ -3258,7 +3287,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
3258 BTRFS_BLOCK_GROUP_RAID10)) { 3287 BTRFS_BLOCK_GROUP_RAID10)) {
3259 num_tolerated_disk_barrier_failures = 1; 3288 num_tolerated_disk_barrier_failures = 1;
3260 } else if (flags & 3289 } else if (flags &
3261 BTRFS_BLOCK_GROUP_RAID5) { 3290 BTRFS_BLOCK_GROUP_RAID6) {
3262 num_tolerated_disk_barrier_failures = 2; 3291 num_tolerated_disk_barrier_failures = 2;
3263 } 3292 }
3264 } 3293 }
@@ -3366,7 +3395,9 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
3366 return ret; 3395 return ret;
3367} 3396}
3368 3397
3369void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) 3398/* Drop a fs root from the radix tree and free it. */
3399void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3400 struct btrfs_root *root)
3370{ 3401{
3371 spin_lock(&fs_info->fs_roots_radix_lock); 3402 spin_lock(&fs_info->fs_roots_radix_lock);
3372 radix_tree_delete(&fs_info->fs_roots_radix, 3403 radix_tree_delete(&fs_info->fs_roots_radix,
@@ -3397,7 +3428,12 @@ static void free_fs_root(struct btrfs_root *root)
3397 kfree(root->free_ino_ctl); 3428 kfree(root->free_ino_ctl);
3398 kfree(root->free_ino_pinned); 3429 kfree(root->free_ino_pinned);
3399 kfree(root->name); 3430 kfree(root->name);
3400 kfree(root); 3431 btrfs_put_fs_root(root);
3432}
3433
3434void btrfs_free_fs_root(struct btrfs_root *root)
3435{
3436 free_fs_root(root);
3401} 3437}
3402 3438
3403int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) 3439int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -3653,7 +3689,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3653 INIT_LIST_HEAD(&splice); 3689 INIT_LIST_HEAD(&splice);
3654 3690
3655 mutex_lock(&root->fs_info->ordered_operations_mutex); 3691 mutex_lock(&root->fs_info->ordered_operations_mutex);
3656 spin_lock(&root->fs_info->ordered_extent_lock); 3692 spin_lock(&root->fs_info->ordered_root_lock);
3657 3693
3658 list_splice_init(&t->ordered_operations, &splice); 3694 list_splice_init(&t->ordered_operations, &splice);
3659 while (!list_empty(&splice)) { 3695 while (!list_empty(&splice)) {
@@ -3661,14 +3697,14 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3661 ordered_operations); 3697 ordered_operations);
3662 3698
3663 list_del_init(&btrfs_inode->ordered_operations); 3699 list_del_init(&btrfs_inode->ordered_operations);
3664 spin_unlock(&root->fs_info->ordered_extent_lock); 3700 spin_unlock(&root->fs_info->ordered_root_lock);
3665 3701
3666 btrfs_invalidate_inodes(btrfs_inode->root); 3702 btrfs_invalidate_inodes(btrfs_inode->root);
3667 3703
3668 spin_lock(&root->fs_info->ordered_extent_lock); 3704 spin_lock(&root->fs_info->ordered_root_lock);
3669 } 3705 }
3670 3706
3671 spin_unlock(&root->fs_info->ordered_extent_lock); 3707 spin_unlock(&root->fs_info->ordered_root_lock);
3672 mutex_unlock(&root->fs_info->ordered_operations_mutex); 3708 mutex_unlock(&root->fs_info->ordered_operations_mutex);
3673} 3709}
3674 3710
@@ -3676,15 +3712,36 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3676{ 3712{
3677 struct btrfs_ordered_extent *ordered; 3713 struct btrfs_ordered_extent *ordered;
3678 3714
3679 spin_lock(&root->fs_info->ordered_extent_lock); 3715 spin_lock(&root->ordered_extent_lock);
3680 /* 3716 /*
3681 * This will just short circuit the ordered completion stuff which will 3717 * This will just short circuit the ordered completion stuff which will
3682 * make sure the ordered extent gets properly cleaned up. 3718 * make sure the ordered extent gets properly cleaned up.
3683 */ 3719 */
3684 list_for_each_entry(ordered, &root->fs_info->ordered_extents, 3720 list_for_each_entry(ordered, &root->ordered_extents,
3685 root_extent_list) 3721 root_extent_list)
3686 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); 3722 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
3687 spin_unlock(&root->fs_info->ordered_extent_lock); 3723 spin_unlock(&root->ordered_extent_lock);
3724}
3725
3726static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
3727{
3728 struct btrfs_root *root;
3729 struct list_head splice;
3730
3731 INIT_LIST_HEAD(&splice);
3732
3733 spin_lock(&fs_info->ordered_root_lock);
3734 list_splice_init(&fs_info->ordered_roots, &splice);
3735 while (!list_empty(&splice)) {
3736 root = list_first_entry(&splice, struct btrfs_root,
3737 ordered_root);
3738 list_del_init(&root->ordered_root);
3739
3740 btrfs_destroy_ordered_extents(root);
3741
3742 cond_resched_lock(&fs_info->ordered_root_lock);
3743 }
3744 spin_unlock(&fs_info->ordered_root_lock);
3688} 3745}
3689 3746
3690int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 3747int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
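
Both new *_all_* teardown helpers use the same splice-and-drain idiom: the shared list is emptied in one step under the lock, so the walker owns every spliced entry and can drop the lock around the per-entry work (btrfs_destroy_all_ordered_extents() instead stays under the lock and relies on cond_resched_lock()). A generic sketch, with 'process' a hypothetical per-item callback:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct example_item {
            struct list_head list;
    };

    static void example_drain(struct list_head *shared, spinlock_t *lock,
                              void (*process)(struct example_item *))
    {
            struct example_item *it;
            LIST_HEAD(splice);

            spin_lock(lock);
            list_splice_init(shared, &splice);      /* take the whole list */
            while (!list_empty(&splice)) {
                    it = list_first_entry(&splice, struct example_item, list);
                    list_del_init(&it->list);
                    spin_unlock(lock);

                    process(it);            /* heavy work, lock dropped */

                    spin_lock(lock);
            }
            spin_unlock(lock);
    }
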
@@ -3706,6 +3763,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3706 3763
3707 while ((node = rb_first(&delayed_refs->root)) != NULL) { 3764 while ((node = rb_first(&delayed_refs->root)) != NULL) {
3708 struct btrfs_delayed_ref_head *head = NULL; 3765 struct btrfs_delayed_ref_head *head = NULL;
3766 bool pin_bytes = false;
3709 3767
3710 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 3768 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3711 atomic_set(&ref->refs, 1); 3769 atomic_set(&ref->refs, 1);
@@ -3726,8 +3784,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3726 } 3784 }
3727 3785
3728 if (head->must_insert_reserved) 3786 if (head->must_insert_reserved)
3729 btrfs_pin_extent(root, ref->bytenr, 3787 pin_bytes = true;
3730 ref->num_bytes, 1);
3731 btrfs_free_delayed_extent_op(head->extent_op); 3788 btrfs_free_delayed_extent_op(head->extent_op);
3732 delayed_refs->num_heads--; 3789 delayed_refs->num_heads--;
3733 if (list_empty(&head->cluster)) 3790 if (list_empty(&head->cluster))
@@ -3738,9 +3795,13 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3738 ref->in_tree = 0; 3795 ref->in_tree = 0;
3739 rb_erase(&ref->rb_node, &delayed_refs->root); 3796 rb_erase(&ref->rb_node, &delayed_refs->root);
3740 delayed_refs->num_entries--; 3797 delayed_refs->num_entries--;
3741 if (head)
3742 mutex_unlock(&head->mutex);
3743 spin_unlock(&delayed_refs->lock); 3798 spin_unlock(&delayed_refs->lock);
3799 if (head) {
3800 if (pin_bytes)
3801 btrfs_pin_extent(root, ref->bytenr,
3802 ref->num_bytes, 1);
3803 mutex_unlock(&head->mutex);
3804 }
3744 btrfs_put_delayed_ref(ref); 3805 btrfs_put_delayed_ref(ref);
3745 3806
3746 cond_resched(); 3807 cond_resched();
@@ -3777,24 +3838,49 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
3777 3838
3778 INIT_LIST_HEAD(&splice); 3839 INIT_LIST_HEAD(&splice);
3779 3840
3780 spin_lock(&root->fs_info->delalloc_lock); 3841 spin_lock(&root->delalloc_lock);
3781 list_splice_init(&root->fs_info->delalloc_inodes, &splice); 3842 list_splice_init(&root->delalloc_inodes, &splice);
3782 3843
3783 while (!list_empty(&splice)) { 3844 while (!list_empty(&splice)) {
3784 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 3845 btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
3785 delalloc_inodes); 3846 delalloc_inodes);
3786 3847
3787 list_del_init(&btrfs_inode->delalloc_inodes); 3848 list_del_init(&btrfs_inode->delalloc_inodes);
3788 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, 3849 clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
3789 &btrfs_inode->runtime_flags); 3850 &btrfs_inode->runtime_flags);
3790 spin_unlock(&root->fs_info->delalloc_lock); 3851 spin_unlock(&root->delalloc_lock);
3791 3852
3792 btrfs_invalidate_inodes(btrfs_inode->root); 3853 btrfs_invalidate_inodes(btrfs_inode->root);
3793 3854
3794 spin_lock(&root->fs_info->delalloc_lock); 3855 spin_lock(&root->delalloc_lock);
3795 } 3856 }
3796 3857
3797 spin_unlock(&root->fs_info->delalloc_lock); 3858 spin_unlock(&root->delalloc_lock);
3859}
3860
3861static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
3862{
3863 struct btrfs_root *root;
3864 struct list_head splice;
3865
3866 INIT_LIST_HEAD(&splice);
3867
3868 spin_lock(&fs_info->delalloc_root_lock);
3869 list_splice_init(&fs_info->delalloc_roots, &splice);
3870 while (!list_empty(&splice)) {
3871 root = list_first_entry(&splice, struct btrfs_root,
3872 delalloc_root);
3873 list_del_init(&root->delalloc_root);
3874 root = btrfs_grab_fs_root(root);
3875 BUG_ON(!root);
3876 spin_unlock(&fs_info->delalloc_root_lock);
3877
3878 btrfs_destroy_delalloc_inodes(root);
3879 btrfs_put_fs_root(root);
3880
3881 spin_lock(&fs_info->delalloc_root_lock);
3882 }
3883 spin_unlock(&fs_info->delalloc_root_lock);
3798} 3884}
3799 3885
3800static int btrfs_destroy_marked_extents(struct btrfs_root *root, 3886static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3878,19 +3964,14 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3878 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, 3964 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
3879 cur_trans->dirty_pages.dirty_bytes); 3965 cur_trans->dirty_pages.dirty_bytes);
3880 3966
3881 /* FIXME: cleanup wait for commit */ 3967 cur_trans->state = TRANS_STATE_COMMIT_START;
3882 cur_trans->in_commit = 1;
3883 cur_trans->blocked = 1;
3884 wake_up(&root->fs_info->transaction_blocked_wait); 3968 wake_up(&root->fs_info->transaction_blocked_wait);
3885 3969
3886 btrfs_evict_pending_snapshots(cur_trans); 3970 btrfs_evict_pending_snapshots(cur_trans);
3887 3971
3888 cur_trans->blocked = 0; 3972 cur_trans->state = TRANS_STATE_UNBLOCKED;
3889 wake_up(&root->fs_info->transaction_wait); 3973 wake_up(&root->fs_info->transaction_wait);
3890 3974
3891 cur_trans->commit_done = 1;
3892 wake_up(&cur_trans->commit_wait);
3893
3894 btrfs_destroy_delayed_inodes(root); 3975 btrfs_destroy_delayed_inodes(root);
3895 btrfs_assert_delayed_root_empty(root); 3976 btrfs_assert_delayed_root_empty(root);
3896 3977
@@ -3899,6 +3980,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
3899 btrfs_destroy_pinned_extent(root, 3980 btrfs_destroy_pinned_extent(root,
3900 root->fs_info->pinned_extents); 3981 root->fs_info->pinned_extents);
3901 3982
 3983 cur_trans->state = TRANS_STATE_COMPLETED;
3984 wake_up(&cur_trans->commit_wait);
3985
3902 /* 3986 /*
3903 memset(cur_trans, 0, sizeof(*cur_trans)); 3987 memset(cur_trans, 0, sizeof(*cur_trans));
3904 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 3988 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
@@ -3914,7 +3998,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3914 3998
3915 spin_lock(&root->fs_info->trans_lock); 3999 spin_lock(&root->fs_info->trans_lock);
3916 list_splice_init(&root->fs_info->trans_list, &list); 4000 list_splice_init(&root->fs_info->trans_list, &list);
3917 root->fs_info->trans_no_join = 1; 4001 root->fs_info->running_transaction = NULL;
3918 spin_unlock(&root->fs_info->trans_lock); 4002 spin_unlock(&root->fs_info->trans_lock);
3919 4003
3920 while (!list_empty(&list)) { 4004 while (!list_empty(&list)) {
@@ -3922,37 +4006,31 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3922 4006
3923 btrfs_destroy_ordered_operations(t, root); 4007 btrfs_destroy_ordered_operations(t, root);
3924 4008
3925 btrfs_destroy_ordered_extents(root); 4009 btrfs_destroy_all_ordered_extents(root->fs_info);
3926 4010
3927 btrfs_destroy_delayed_refs(t, root); 4011 btrfs_destroy_delayed_refs(t, root);
3928 4012
3929 /* FIXME: cleanup wait for commit */ 4013 /*
3930 t->in_commit = 1; 4014 * FIXME: cleanup wait for commit
 3931 t->blocked = 1; 4015 * We needn't acquire the lock here: we are in the middle of
 4016 * umount, so no other task will change it.
4017 */
4018 t->state = TRANS_STATE_COMMIT_START;
3932 smp_mb(); 4019 smp_mb();
3933 if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) 4020 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3934 wake_up(&root->fs_info->transaction_blocked_wait); 4021 wake_up(&root->fs_info->transaction_blocked_wait);
3935 4022
3936 btrfs_evict_pending_snapshots(t); 4023 btrfs_evict_pending_snapshots(t);
3937 4024
3938 t->blocked = 0; 4025 t->state = TRANS_STATE_UNBLOCKED;
3939 smp_mb(); 4026 smp_mb();
3940 if (waitqueue_active(&root->fs_info->transaction_wait)) 4027 if (waitqueue_active(&root->fs_info->transaction_wait))
3941 wake_up(&root->fs_info->transaction_wait); 4028 wake_up(&root->fs_info->transaction_wait);
3942 4029
3943 t->commit_done = 1;
3944 smp_mb();
3945 if (waitqueue_active(&t->commit_wait))
3946 wake_up(&t->commit_wait);
3947
3948 btrfs_destroy_delayed_inodes(root); 4030 btrfs_destroy_delayed_inodes(root);
3949 btrfs_assert_delayed_root_empty(root); 4031 btrfs_assert_delayed_root_empty(root);
3950 4032
3951 btrfs_destroy_delalloc_inodes(root); 4033 btrfs_destroy_all_delalloc_inodes(root->fs_info);
3952
3953 spin_lock(&root->fs_info->trans_lock);
3954 root->fs_info->running_transaction = NULL;
3955 spin_unlock(&root->fs_info->trans_lock);
3956 4034
3957 btrfs_destroy_marked_extents(root, &t->dirty_pages, 4035 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3958 EXTENT_DIRTY); 4036 EXTENT_DIRTY);
@@ -3960,15 +4038,17 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3960 btrfs_destroy_pinned_extent(root, 4038 btrfs_destroy_pinned_extent(root,
3961 root->fs_info->pinned_extents); 4039 root->fs_info->pinned_extents);
3962 4040
4041 t->state = TRANS_STATE_COMPLETED;
4042 smp_mb();
4043 if (waitqueue_active(&t->commit_wait))
4044 wake_up(&t->commit_wait);
4045
3963 atomic_set(&t->use_count, 0); 4046 atomic_set(&t->use_count, 0);
3964 list_del_init(&t->list); 4047 list_del_init(&t->list);
3965 memset(t, 0, sizeof(*t)); 4048 memset(t, 0, sizeof(*t));
3966 kmem_cache_free(btrfs_transaction_cachep, t); 4049 kmem_cache_free(btrfs_transaction_cachep, t);
3967 } 4050 }
3968 4051
3969 spin_lock(&root->fs_info->trans_lock);
3970 root->fs_info->trans_no_join = 0;
3971 spin_unlock(&root->fs_info->trans_lock);
3972 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 4052 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3973 4053
3974 return 0; 4054 return 0;
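
The cleanup path now walks the explicit transaction state machine (TRANS_STATE_COMMIT_START -> TRANS_STATE_UNBLOCKED -> TRANS_STATE_COMPLETED) instead of the removed in_commit/blocked/commit_done flags. Each transition repeats the same publish-then-wake step, sketched below with hypothetical names; the barrier orders the state store before the waitqueue_active() read, pairing with a waiter that queues itself and then rechecks the state:

    #include <linux/wait.h>

    enum example_state { EX_RUNNING, EX_UNBLOCKED, EX_COMPLETED };

    struct example_trans {
            enum example_state state;
            wait_queue_head_t commit_wait;
    };

    static void example_set_state(struct example_trans *t, enum example_state s)
    {
            t->state = s;
            smp_mb();       /* publish the state before peeking at waiters */
            if (waitqueue_active(&t->commit_wait))
                    wake_up(&t->commit_wait);
    }
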
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index be69ce1b07a2..b71acd6e1e5b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -63,14 +63,40 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
63int btrfs_commit_super(struct btrfs_root *root); 63int btrfs_commit_super(struct btrfs_root *root);
64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
65 u64 bytenr, u32 blocksize); 65 u64 bytenr, u32 blocksize);
66struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 66struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
67 struct btrfs_key *location); 67 struct btrfs_key *location);
68int btrfs_init_fs_root(struct btrfs_root *root);
69int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
70 struct btrfs_root *root);
68struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 71struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
69 struct btrfs_key *location); 72 struct btrfs_key *location);
70int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 73int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
71void btrfs_btree_balance_dirty(struct btrfs_root *root); 74void btrfs_btree_balance_dirty(struct btrfs_root *root);
72void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); 75void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
73void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 76void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
77 struct btrfs_root *root);
78void btrfs_free_fs_root(struct btrfs_root *root);
79
80/*
81 * This function is used to grab the root, and avoid it is freed when we
82 * access it. But it doesn't ensure that the tree is not dropped.
83 *
84 * If you want to ensure the whole tree is safe, you should use
85 * fs_info->subvol_srcu
86 */
87static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
88{
89 if (atomic_inc_not_zero(&root->refs))
90 return root;
91 return NULL;
92}
93
94static inline void btrfs_put_fs_root(struct btrfs_root *root)
95{
96 if (atomic_dec_and_test(&root->refs))
97 kfree(root);
98}
99
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 100void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 101int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
76 int atomic); 102 int atomic);
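
btrfs_grab_fs_root()/btrfs_put_fs_root() above implement the usual "take a reference only if the object is still live" pattern: atomic_inc_not_zero() fails once the count has dropped to zero, so a dying root cannot be resurrected. A usage sketch (the helper is hypothetical; 'candidate' would come from a lookup that takes no reference of its own, such as a radix-tree or list walk under a spinlock):

    static void example_use_root(struct btrfs_root *candidate)
    {
            struct btrfs_root *root;

            root = btrfs_grab_fs_root(candidate);   /* NULL once refs hit 0 */
            if (!root)
                    return;                         /* already dying */

            /* ... root stays valid here, even after the lookup lock is gone ... */

            btrfs_put_fs_root(root);                /* kfree()s on final put */
    }
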
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 81ee29eeb7ca..4b8691607373 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -82,11 +82,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
82 goto fail; 82 goto fail;
83 } 83 }
84 84
85 if (btrfs_root_refs(&root->root_item) == 0) {
86 err = -ENOENT;
87 goto fail;
88 }
89
90 key.objectid = objectid; 85 key.objectid = objectid;
91 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 86 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
92 key.offset = 0; 87 key.offset = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df472ab1b5ac..1204c8ef6f32 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -24,6 +24,7 @@
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/percpu_counter.h>
27#include "compat.h" 28#include "compat.h"
28#include "hash.h" 29#include "hash.h"
29#include "ctree.h" 30#include "ctree.h"
@@ -2526,6 +2527,51 @@ static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2526 return 0; 2527 return 0;
2527} 2528}
2528 2529
2530static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2531{
2532 u64 num_bytes;
2533
2534 num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2535 sizeof(struct btrfs_extent_inline_ref));
2536 if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2537 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2538
2539 /*
2540 * We don't ever fill up leaves all the way so multiply by 2 just to be
 2541 * closer to what we're really going to want to use.
2542 */
2543 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2544}
2545
2546int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2547 struct btrfs_root *root)
2548{
2549 struct btrfs_block_rsv *global_rsv;
2550 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2551 u64 num_bytes;
2552 int ret = 0;
2553
2554 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2555 num_heads = heads_to_leaves(root, num_heads);
2556 if (num_heads > 1)
2557 num_bytes += (num_heads - 1) * root->leafsize;
2558 num_bytes <<= 1;
2559 global_rsv = &root->fs_info->global_block_rsv;
2560
2561 /*
 2562 * If we can't allocate any more chunks, let's make sure we have _lots_ of
2563 * wiggle room since running delayed refs can create more delayed refs.
2564 */
2565 if (global_rsv->space_info->full)
2566 num_bytes <<= 1;
2567
2568 spin_lock(&global_rsv->lock);
2569 if (global_rsv->reserved <= num_bytes)
2570 ret = 1;
2571 spin_unlock(&global_rsv->lock);
2572 return ret;
2573}
2574
2529/* 2575/*
2530 * this starts processing the delayed reference count updates and 2576 * this starts processing the delayed reference count updates and
2531 * extent insertions we have queued up so far. count can be 2577 * extent insertions we have queued up so far. count can be
@@ -2573,7 +2619,8 @@ progress:
2573 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); 2619 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2574 if (old) { 2620 if (old) {
2575 DEFINE_WAIT(__wait); 2621 DEFINE_WAIT(__wait);
2576 if (delayed_refs->num_entries < 16348) 2622 if (delayed_refs->flushing ||
2623 !btrfs_should_throttle_delayed_refs(trans, root))
2577 return 0; 2624 return 0;
2578 2625
2579 prepare_to_wait(&delayed_refs->wait, &__wait, 2626 prepare_to_wait(&delayed_refs->wait, &__wait,
@@ -2608,7 +2655,7 @@ again:
2608 2655
2609 while (1) { 2656 while (1) {
2610 if (!(run_all || run_most) && 2657 if (!(run_all || run_most) &&
2611 delayed_refs->num_heads_ready < 64) 2658 !btrfs_should_throttle_delayed_refs(trans, root))
2612 break; 2659 break;
2613 2660
2614 /* 2661 /*
@@ -2629,6 +2676,7 @@ again:
2629 spin_unlock(&delayed_refs->lock); 2676 spin_unlock(&delayed_refs->lock);
2630 btrfs_abort_transaction(trans, root, ret); 2677 btrfs_abort_transaction(trans, root, ret);
2631 atomic_dec(&delayed_refs->procs_running_refs); 2678 atomic_dec(&delayed_refs->procs_running_refs);
2679 wake_up(&delayed_refs->wait);
2632 return ret; 2680 return ret;
2633 } 2681 }
2634 2682
@@ -3310,6 +3358,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3310 struct btrfs_space_info *found; 3358 struct btrfs_space_info *found;
3311 int i; 3359 int i;
3312 int factor; 3360 int factor;
3361 int ret;
3313 3362
3314 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3363 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3315 BTRFS_BLOCK_GROUP_RAID10)) 3364 BTRFS_BLOCK_GROUP_RAID10))
@@ -3333,6 +3382,12 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3333 if (!found) 3382 if (!found)
3334 return -ENOMEM; 3383 return -ENOMEM;
3335 3384
3385 ret = percpu_counter_init(&found->total_bytes_pinned, 0);
3386 if (ret) {
3387 kfree(found);
3388 return ret;
3389 }
3390
3336 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 3391 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3337 INIT_LIST_HEAD(&found->block_groups[i]); 3392 INIT_LIST_HEAD(&found->block_groups[i]);
3338 init_rwsem(&found->groups_sem); 3393 init_rwsem(&found->groups_sem);
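
total_bytes_pinned becomes a percpu_counter so hot-path additions stay per-CPU, while the occasional threshold check uses percpu_counter_compare(), which only does the expensive precise sum when the batched central value is too close to the bound to decide. A minimal lifecycle sketch, assuming the 3.x API where percpu_counter_init() takes no GFP argument:

    #include <linux/percpu_counter.h>

    static struct percpu_counter pinned;

    static int example(void)
    {
            int ret;

            ret = percpu_counter_init(&pinned, 0);
            if (ret)
                    return ret;

            percpu_counter_add(&pinned, 1 << 20);   /* cheap per-CPU batch */

            /* Returns <0, 0 or >0, like memcmp(). */
            if (percpu_counter_compare(&pinned, 4096) >= 0) {
                    /* at least 4 KiB accounted as pinned */
            }

            percpu_counter_destroy(&pinned);
            return 0;
    }
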
@@ -3565,10 +3620,11 @@ alloc:
3565 } 3620 }
3566 3621
3567 /* 3622 /*
3568 * If we have less pinned bytes than we want to allocate then 3623 * If we don't have enough pinned space to deal with this
 3569 * don't bother committing the transaction, it won't help us. 3624 * allocation, don't bother committing the transaction.
3570 */ 3625 */
3571 if (data_sinfo->bytes_pinned < bytes) 3626 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
3627 bytes) < 0)
3572 committed = 1; 3628 committed = 1;
3573 spin_unlock(&data_sinfo->lock); 3629 spin_unlock(&data_sinfo->lock);
3574 3630
@@ -3577,6 +3633,7 @@ commit_trans:
3577 if (!committed && 3633 if (!committed &&
3578 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3634 !atomic_read(&root->fs_info->open_ioctl_trans)) {
3579 committed = 1; 3635 committed = 1;
3636
3580 trans = btrfs_join_transaction(root); 3637 trans = btrfs_join_transaction(root);
3581 if (IS_ERR(trans)) 3638 if (IS_ERR(trans))
3582 return PTR_ERR(trans); 3639 return PTR_ERR(trans);
@@ -3609,6 +3666,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3609 3666
3610 data_sinfo = root->fs_info->data_sinfo; 3667 data_sinfo = root->fs_info->data_sinfo;
3611 spin_lock(&data_sinfo->lock); 3668 spin_lock(&data_sinfo->lock);
3669 WARN_ON(data_sinfo->bytes_may_use < bytes);
3612 data_sinfo->bytes_may_use -= bytes; 3670 data_sinfo->bytes_may_use -= bytes;
3613 trace_btrfs_space_reservation(root->fs_info, "space_info", 3671 trace_btrfs_space_reservation(root->fs_info, "space_info",
3614 data_sinfo->flags, bytes, 0); 3672 data_sinfo->flags, bytes, 0);
@@ -3886,12 +3944,11 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3886 unsigned long nr_pages) 3944 unsigned long nr_pages)
3887{ 3945{
3888 struct super_block *sb = root->fs_info->sb; 3946 struct super_block *sb = root->fs_info->sb;
3889 int started;
3890 3947
3891 /* If we can not start writeback, just sync all the delalloc file. */ 3948 if (down_read_trylock(&sb->s_umount)) {
3892 started = try_to_writeback_inodes_sb_nr(sb, nr_pages, 3949 writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
3893 WB_REASON_FS_FREE_SPACE); 3950 up_read(&sb->s_umount);
3894 if (!started) { 3951 } else {
3895 /* 3952 /*
 3896 * We needn't worry about the filesystem going from r/w to r/o though 3953 * We needn't worry about the filesystem going from r/w to r/o though
3897 * we don't acquire ->s_umount mutex, because the filesystem 3954 * we don't acquire ->s_umount mutex, because the filesystem
@@ -3899,9 +3956,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 3899 * the filesystem is readonly (all dirty pages are written to 3956 * the filesystem is readonly (all dirty pages are written to
3900 * the disk). 3957 * the disk).
3901 */ 3958 */
3902 btrfs_start_delalloc_inodes(root, 0); 3959 btrfs_start_all_delalloc_inodes(root->fs_info, 0);
3903 if (!current->journal_info) 3960 if (!current->journal_info)
3904 btrfs_wait_ordered_extents(root, 0); 3961 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3905 } 3962 }
3906} 3963}
3907 3964
@@ -3931,7 +3988,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3931 if (delalloc_bytes == 0) { 3988 if (delalloc_bytes == 0) {
3932 if (trans) 3989 if (trans)
3933 return; 3990 return;
3934 btrfs_wait_ordered_extents(root, 0); 3991 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3935 return; 3992 return;
3936 } 3993 }
3937 3994
@@ -3959,7 +4016,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3959 4016
3960 loops++; 4017 loops++;
3961 if (wait_ordered && !trans) { 4018 if (wait_ordered && !trans) {
3962 btrfs_wait_ordered_extents(root, 0); 4019 btrfs_wait_all_ordered_extents(root->fs_info, 0);
3963 } else { 4020 } else {
3964 time_left = schedule_timeout_killable(1); 4021 time_left = schedule_timeout_killable(1);
3965 if (time_left) 4022 if (time_left)
@@ -3997,7 +4054,8 @@ static int may_commit_transaction(struct btrfs_root *root,
3997 4054
3998 /* See if there is enough pinned space to make this reservation */ 4055 /* See if there is enough pinned space to make this reservation */
3999 spin_lock(&space_info->lock); 4056 spin_lock(&space_info->lock);
4000 if (space_info->bytes_pinned >= bytes) { 4057 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4058 bytes) >= 0) {
4001 spin_unlock(&space_info->lock); 4059 spin_unlock(&space_info->lock);
4002 goto commit; 4060 goto commit;
4003 } 4061 }
@@ -4012,7 +4070,8 @@ static int may_commit_transaction(struct btrfs_root *root,
4012 4070
4013 spin_lock(&space_info->lock); 4071 spin_lock(&space_info->lock);
4014 spin_lock(&delayed_rsv->lock); 4072 spin_lock(&delayed_rsv->lock);
4015 if (space_info->bytes_pinned + delayed_rsv->size < bytes) { 4073 if (percpu_counter_compare(&space_info->total_bytes_pinned,
4074 bytes - delayed_rsv->size) >= 0) {
4016 spin_unlock(&delayed_rsv->lock); 4075 spin_unlock(&delayed_rsv->lock);
4017 spin_unlock(&space_info->lock); 4076 spin_unlock(&space_info->lock);
4018 return -ENOSPC; 4077 return -ENOSPC;
@@ -4297,6 +4356,31 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4297 spin_unlock(&block_rsv->lock); 4356 spin_unlock(&block_rsv->lock);
4298} 4357}
4299 4358
4359int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4360 struct btrfs_block_rsv *dest, u64 num_bytes,
4361 int min_factor)
4362{
4363 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4364 u64 min_bytes;
4365
4366 if (global_rsv->space_info != dest->space_info)
4367 return -ENOSPC;
4368
4369 spin_lock(&global_rsv->lock);
4370 min_bytes = div_factor(global_rsv->size, min_factor);
4371 if (global_rsv->reserved < min_bytes + num_bytes) {
4372 spin_unlock(&global_rsv->lock);
4373 return -ENOSPC;
4374 }
4375 global_rsv->reserved -= num_bytes;
4376 if (global_rsv->reserved < global_rsv->size)
4377 global_rsv->full = 0;
4378 spin_unlock(&global_rsv->lock);
4379
4380 block_rsv_add_bytes(dest, num_bytes, 1);
4381 return 0;
4382}
4383
4300static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, 4384static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4301 struct btrfs_block_rsv *block_rsv, 4385 struct btrfs_block_rsv *block_rsv,
4302 struct btrfs_block_rsv *dest, u64 num_bytes) 4386 struct btrfs_block_rsv *dest, u64 num_bytes)
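
btrfs_cond_migrate_bytes() only steals from the global reserve when at least min_factor tenths of its size would stay backed; div_factor() is assumed here to compute num * factor / 10, as btrfs's helper does. A worked example of the threshold with hypothetical numbers:

    static int example_can_migrate(u64 size, u64 reserved)
    {
            u64 min_bytes = size * 5 / 10;  /* min_factor = 5: keep half */
            u64 num_bytes = 2 << 20;        /* caller wants 2 MiB        */

            /* With size = 10 MiB: allowed only while reserved >= 7 MiB. */
            return reserved >= min_bytes + num_bytes;
    }
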
@@ -5030,14 +5114,14 @@ static int update_block_group(struct btrfs_root *root,
5030 int factor; 5114 int factor;
5031 5115
5032 /* block accounting for super block */ 5116 /* block accounting for super block */
5033 spin_lock(&info->delalloc_lock); 5117 spin_lock(&info->delalloc_root_lock);
5034 old_val = btrfs_super_bytes_used(info->super_copy); 5118 old_val = btrfs_super_bytes_used(info->super_copy);
5035 if (alloc) 5119 if (alloc)
5036 old_val += num_bytes; 5120 old_val += num_bytes;
5037 else 5121 else
5038 old_val -= num_bytes; 5122 old_val -= num_bytes;
5039 btrfs_set_super_bytes_used(info->super_copy, old_val); 5123 btrfs_set_super_bytes_used(info->super_copy, old_val);
5040 spin_unlock(&info->delalloc_lock); 5124 spin_unlock(&info->delalloc_root_lock);
5041 5125
5042 while (total) { 5126 while (total) {
5043 cache = btrfs_lookup_block_group(info, bytenr); 5127 cache = btrfs_lookup_block_group(info, bytenr);
@@ -5189,6 +5273,80 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5189 return ret; 5273 return ret;
5190} 5274}
5191 5275
5276static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5277{
5278 int ret;
5279 struct btrfs_block_group_cache *block_group;
5280 struct btrfs_caching_control *caching_ctl;
5281
5282 block_group = btrfs_lookup_block_group(root->fs_info, start);
5283 if (!block_group)
5284 return -EINVAL;
5285
5286 cache_block_group(block_group, 0);
5287 caching_ctl = get_caching_control(block_group);
5288
5289 if (!caching_ctl) {
5290 /* Logic error */
5291 BUG_ON(!block_group_cache_done(block_group));
5292 ret = btrfs_remove_free_space(block_group, start, num_bytes);
5293 } else {
5294 mutex_lock(&caching_ctl->mutex);
5295
5296 if (start >= caching_ctl->progress) {
5297 ret = add_excluded_extent(root, start, num_bytes);
5298 } else if (start + num_bytes <= caching_ctl->progress) {
5299 ret = btrfs_remove_free_space(block_group,
5300 start, num_bytes);
5301 } else {
5302 num_bytes = caching_ctl->progress - start;
5303 ret = btrfs_remove_free_space(block_group,
5304 start, num_bytes);
5305 if (ret)
5306 goto out_lock;
5307
5308 num_bytes = (start + num_bytes) -
5309 caching_ctl->progress;
5310 start = caching_ctl->progress;
5311 ret = add_excluded_extent(root, start, num_bytes);
5312 }
5313out_lock:
5314 mutex_unlock(&caching_ctl->mutex);
5315 put_caching_control(caching_ctl);
5316 }
5317 btrfs_put_block_group(block_group);
5318 return ret;
5319}
5320
5321int btrfs_exclude_logged_extents(struct btrfs_root *log,
5322 struct extent_buffer *eb)
5323{
5324 struct btrfs_file_extent_item *item;
5325 struct btrfs_key key;
5326 int found_type;
5327 int i;
5328
5329 if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5330 return 0;
5331
5332 for (i = 0; i < btrfs_header_nritems(eb); i++) {
5333 btrfs_item_key_to_cpu(eb, &key, i);
5334 if (key.type != BTRFS_EXTENT_DATA_KEY)
5335 continue;
5336 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5337 found_type = btrfs_file_extent_type(eb, item);
5338 if (found_type == BTRFS_FILE_EXTENT_INLINE)
5339 continue;
5340 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5341 continue;
5342 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5343 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5344 __exclude_logged_extent(log, key.objectid, key.offset);
5345 }
5346
5347 return 0;
5348}
5349
5192/** 5350/**
5193 * btrfs_update_reserved_bytes - update the block_group and space info counters 5351 * btrfs_update_reserved_bytes - update the block_group and space info counters
5194 * @cache: The cache we are manipulating 5352 * @cache: The cache we are manipulating
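
The interesting part of __exclude_logged_extent() is the three-way split against caching_ctl->progress: the caching thread's cursor divides the block group into a region already loaded into the free-space cache, from which the logged extent must be removed, and a region not yet scanned, which must be marked excluded so the cache never adds it. A compilable sketch of the intended split; remove_free_space() and exclude() are invented stand-ins for btrfs_remove_free_space() and add_excluded_extent():

    #include <stdint.h>
    #include <stdio.h>

    static void remove_free_space(uint64_t s, uint64_t l)
    { printf("remove  [%llu,+%llu)\n", (unsigned long long)s, (unsigned long long)l); }
    static void exclude(uint64_t s, uint64_t l)
    { printf("exclude [%llu,+%llu)\n", (unsigned long long)s, (unsigned long long)l); }

    /* A logged extent may sit wholly below the cursor, wholly above it,
     * or straddle it; only the straddling case needs both actions. */
    static void split_against_progress(uint64_t start, uint64_t len,
                                       uint64_t progress)
    {
        if (start >= progress)
            exclude(start, len);             /* wholly unscanned */
        else if (start + len <= progress)
            remove_free_space(start, len);   /* wholly cached */
        else {
            remove_free_space(start, progress - start);
            exclude(progress, start + len - progress);
        }
    }

    int main(void) { split_against_progress(4096, 8192, 8192); return 0; }
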
@@ -5251,6 +5409,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5251 struct btrfs_caching_control *next; 5409 struct btrfs_caching_control *next;
5252 struct btrfs_caching_control *caching_ctl; 5410 struct btrfs_caching_control *caching_ctl;
5253 struct btrfs_block_group_cache *cache; 5411 struct btrfs_block_group_cache *cache;
5412 struct btrfs_space_info *space_info;
5254 5413
5255 down_write(&fs_info->extent_commit_sem); 5414 down_write(&fs_info->extent_commit_sem);
5256 5415
@@ -5273,6 +5432,9 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5273 5432
5274 up_write(&fs_info->extent_commit_sem); 5433 up_write(&fs_info->extent_commit_sem);
5275 5434
5435 list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
5436 percpu_counter_set(&space_info->total_bytes_pinned, 0);
5437
5276 update_global_block_rsv(fs_info); 5438 update_global_block_rsv(fs_info);
5277} 5439}
5278 5440
@@ -5370,6 +5532,27 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5370 return 0; 5532 return 0;
5371} 5533}
5372 5534
5535static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
5536 u64 owner, u64 root_objectid)
5537{
5538 struct btrfs_space_info *space_info;
5539 u64 flags;
5540
5541 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5542 if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
5543 flags = BTRFS_BLOCK_GROUP_SYSTEM;
5544 else
5545 flags = BTRFS_BLOCK_GROUP_METADATA;
5546 } else {
5547 flags = BTRFS_BLOCK_GROUP_DATA;
5548 }
5549
5550 space_info = __find_space_info(fs_info, flags);
5551 BUG_ON(!space_info); /* Logic bug */
5552 percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
5553}
5554
5555
5373static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 5556static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5374 struct btrfs_root *root, 5557 struct btrfs_root *root,
5375 u64 bytenr, u64 num_bytes, u64 parent, 5558 u64 bytenr, u64 num_bytes, u64 parent,
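
Note that add_pinned_bytes() takes the delta as a u64, and the hunk below passes -num_bytes through it when references are actually dropped. That works because the negation wraps modulo 2^64 and percpu_counter_add() takes an s64, so the same bit pattern reads back as a negative delta. A two-line demonstration; the conversion is well-defined on the two's-complement targets the kernel runs on, though implementation-defined in strict pre-C23 ISO C:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t num_bytes = 4096;
        uint64_t delta = -num_bytes;            /* wraps to 2^64 - 4096 */
        int64_t signed_delta = (int64_t)delta;  /* same bits, signed view */

        assert(signed_delta == -4096);
        return 0;
    }
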
@@ -5590,6 +5773,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5590 goto out; 5773 goto out;
5591 } 5774 }
5592 } 5775 }
5776 add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
5777 root_objectid);
5593 } else { 5778 } else {
5594 if (found_extent) { 5779 if (found_extent) {
5595 BUG_ON(is_data && refs_to_drop != 5780 BUG_ON(is_data && refs_to_drop !=
@@ -5713,6 +5898,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5713 u64 parent, int last_ref) 5898 u64 parent, int last_ref)
5714{ 5899{
5715 struct btrfs_block_group_cache *cache = NULL; 5900 struct btrfs_block_group_cache *cache = NULL;
5901 int pin = 1;
5716 int ret; 5902 int ret;
5717 5903
5718 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5904 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -5745,8 +5931,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5745 5931
5746 btrfs_add_free_space(cache, buf->start, buf->len); 5932 btrfs_add_free_space(cache, buf->start, buf->len);
5747 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); 5933 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5934 pin = 0;
5748 } 5935 }
5749out: 5936out:
5937 if (pin)
5938 add_pinned_bytes(root->fs_info, buf->len,
5939 btrfs_header_level(buf),
5940 root->root_key.objectid);
5941
5750 /* 5942 /*
5751 * Deleting the buffer, clear the corrupt flag since it doesn't matter 5943 * Deleting the buffer, clear the corrupt flag since it doesn't matter
5752 * anymore. 5944 * anymore.
@@ -5763,6 +5955,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5763 int ret; 5955 int ret;
5764 struct btrfs_fs_info *fs_info = root->fs_info; 5956 struct btrfs_fs_info *fs_info = root->fs_info;
5765 5957
5958 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
5959
5766 /* 5960 /*
5767 * tree log blocks never actually go into the extent allocation 5961 * tree log blocks never actually go into the extent allocation
5768 * tree, just update pinning info and exit early. 5962 * tree, just update pinning info and exit early.
@@ -6560,52 +6754,26 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6560{ 6754{
6561 int ret; 6755 int ret;
6562 struct btrfs_block_group_cache *block_group; 6756 struct btrfs_block_group_cache *block_group;
6563 struct btrfs_caching_control *caching_ctl;
6564 u64 start = ins->objectid;
6565 u64 num_bytes = ins->offset;
6566
6567 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6568 cache_block_group(block_group, 0);
6569 caching_ctl = get_caching_control(block_group);
6570
6571 if (!caching_ctl) {
6572 BUG_ON(!block_group_cache_done(block_group));
6573 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6574 if (ret)
6575 goto out;
6576 } else {
6577 mutex_lock(&caching_ctl->mutex);
6578
6579 if (start >= caching_ctl->progress) {
6580 ret = add_excluded_extent(root, start, num_bytes);
6581 } else if (start + num_bytes <= caching_ctl->progress) {
6582 ret = btrfs_remove_free_space(block_group,
6583 start, num_bytes);
6584 } else {
6585 num_bytes = caching_ctl->progress - start;
6586 ret = btrfs_remove_free_space(block_group,
6587 start, num_bytes);
6588 if (ret)
6589 goto out_lock;
6590 6757
6591 start = caching_ctl->progress; 6758 /*
6592 num_bytes = ins->objectid + ins->offset - 6759 * Mixed block groups will exclude before processing the log so we only
6593 caching_ctl->progress; 6760 * need to do the exlude dance if this fs isn't mixed.
6594 ret = add_excluded_extent(root, start, num_bytes); 6761 */
6595 } 6762 if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
6596out_lock: 6763 ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
6597 mutex_unlock(&caching_ctl->mutex);
6598 put_caching_control(caching_ctl);
6599 if (ret) 6764 if (ret)
6600 goto out; 6765 return ret;
6601 } 6766 }
6602 6767
6768 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6769 if (!block_group)
6770 return -EINVAL;
6771
6603 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 6772 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6604 RESERVE_ALLOC_NO_ACCOUNT); 6773 RESERVE_ALLOC_NO_ACCOUNT);
6605 BUG_ON(ret); /* logic error */ 6774 BUG_ON(ret); /* logic error */
6606 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 6775 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6607 0, owner, offset, ins, 1); 6776 0, owner, offset, ins, 1);
6608out:
6609 btrfs_put_block_group(block_group); 6777 btrfs_put_block_group(block_group);
6610 return ret; 6778 return ret;
6611} 6779}
@@ -7298,6 +7466,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7298 int err = 0; 7466 int err = 0;
7299 int ret; 7467 int ret;
7300 int level; 7468 int level;
7469 bool root_dropped = false;
7301 7470
7302 path = btrfs_alloc_path(); 7471 path = btrfs_alloc_path();
7303 if (!path) { 7472 if (!path) {
@@ -7355,6 +7524,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7355 while (1) { 7524 while (1) {
7356 btrfs_tree_lock(path->nodes[level]); 7525 btrfs_tree_lock(path->nodes[level]);
7357 btrfs_set_lock_blocking(path->nodes[level]); 7526 btrfs_set_lock_blocking(path->nodes[level]);
7527 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7358 7528
7359 ret = btrfs_lookup_extent_info(trans, root, 7529 ret = btrfs_lookup_extent_info(trans, root,
7360 path->nodes[level]->start, 7530 path->nodes[level]->start,
@@ -7370,6 +7540,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7370 break; 7540 break;
7371 7541
7372 btrfs_tree_unlock(path->nodes[level]); 7542 btrfs_tree_unlock(path->nodes[level]);
7543 path->locks[level] = 0;
7373 WARN_ON(wc->refs[level] != 1); 7544 WARN_ON(wc->refs[level] != 1);
7374 level--; 7545 level--;
7375 } 7546 }
@@ -7384,11 +7555,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7384 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 7555 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7385 7556
7386 while (1) { 7557 while (1) {
7387 if (!for_reloc && btrfs_fs_closing(root->fs_info)) {
7388 pr_debug("btrfs: drop snapshot early exit\n");
7389 err = -EAGAIN;
7390 goto out_end_trans;
7391 }
7392 7558
7393 ret = walk_down_tree(trans, root, path, wc); 7559 ret = walk_down_tree(trans, root, path, wc);
7394 if (ret < 0) { 7560 if (ret < 0) {
@@ -7416,7 +7582,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7416 } 7582 }
7417 7583
7418 BUG_ON(wc->level == 0); 7584 BUG_ON(wc->level == 0);
7419 if (btrfs_should_end_transaction(trans, tree_root)) { 7585 if (btrfs_should_end_transaction(trans, tree_root) ||
7586 (!for_reloc && btrfs_need_cleaner_sleep(root))) {
7420 ret = btrfs_update_root(trans, tree_root, 7587 ret = btrfs_update_root(trans, tree_root,
7421 &root->root_key, 7588 &root->root_key,
7422 root_item); 7589 root_item);
@@ -7427,6 +7594,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7427 } 7594 }
7428 7595
7429 btrfs_end_transaction_throttle(trans, tree_root); 7596 btrfs_end_transaction_throttle(trans, tree_root);
7597 if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
7598 pr_debug("btrfs: drop snapshot early exit\n");
7599 err = -EAGAIN;
7600 goto out_free;
7601 }
7602
7430 trans = btrfs_start_transaction(tree_root, 0); 7603 trans = btrfs_start_transaction(tree_root, 0);
7431 if (IS_ERR(trans)) { 7604 if (IS_ERR(trans)) {
7432 err = PTR_ERR(trans); 7605 err = PTR_ERR(trans);
@@ -7447,8 +7620,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7447 } 7620 }
7448 7621
7449 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 7622 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7450 ret = btrfs_find_last_root(tree_root, root->root_key.objectid, 7623 ret = btrfs_find_root(tree_root, &root->root_key, path,
7451 NULL, NULL); 7624 NULL, NULL);
7452 if (ret < 0) { 7625 if (ret < 0) {
7453 btrfs_abort_transaction(trans, tree_root, ret); 7626 btrfs_abort_transaction(trans, tree_root, ret);
7454 err = ret; 7627 err = ret;
@@ -7465,18 +7638,28 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7465 } 7638 }
7466 7639
7467 if (root->in_radix) { 7640 if (root->in_radix) {
7468 btrfs_free_fs_root(tree_root->fs_info, root); 7641 btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
7469 } else { 7642 } else {
7470 free_extent_buffer(root->node); 7643 free_extent_buffer(root->node);
7471 free_extent_buffer(root->commit_root); 7644 free_extent_buffer(root->commit_root);
7472 kfree(root); 7645 btrfs_put_fs_root(root);
7473 } 7646 }
7647 root_dropped = true;
7474out_end_trans: 7648out_end_trans:
7475 btrfs_end_transaction_throttle(trans, tree_root); 7649 btrfs_end_transaction_throttle(trans, tree_root);
7476out_free: 7650out_free:
7477 kfree(wc); 7651 kfree(wc);
7478 btrfs_free_path(path); 7652 btrfs_free_path(path);
7479out: 7653out:
7654 /*
7655 * So if we need to stop dropping the snapshot for whatever reason we
7656 * need to make sure to add it back to the dead root list so that we
7657 * keep trying to do the work later. This also cleans up roots if we
7658 * don't have it in the radix (like when we recover after a power fail
7659 * or unmount) so we don't leak memory.
7660 */
7661 if (root_dropped == false)
7662 btrfs_add_dead_root(root);
7480 if (err) 7663 if (err)
7481 btrfs_std_error(root->fs_info, err); 7664 btrfs_std_error(root->fs_info, err);
7482 return err; 7665 return err;
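
The root_dropped flag is a common error-path idiom: mark success only once nothing is left to undo, and let a single exit label requeue the work otherwise, which is exactly what the comment above is describing. A generic sketch of the shape; queue_for_retry() is a hypothetical stand-in for btrfs_add_dead_root():

    #include <stdbool.h>
    #include <stdio.h>

    struct work { int id; };

    static void queue_for_retry(struct work *w) { printf("requeue %d\n", w->id); }
    static int step1(struct work *w) { (void)w; return 0; }
    static int step2(struct work *w) { (void)w; return -1; }  /* simulate failure */

    static int do_drop(struct work *w)
    {
        bool done = false;
        int err;

        err = step1(w);
        if (err)
            goto out;
        err = step2(w);
        if (err)
            goto out;
        done = true;             /* set only after the work fully completed */
    out:
        if (!done)
            queue_for_retry(w);  /* retry later instead of leaking w */
        return err;
    }

    int main(void) { struct work w = { 1 }; do_drop(&w); return 0; }
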
@@ -7782,6 +7965,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7782 struct btrfs_space_info *space_info; 7965 struct btrfs_space_info *space_info;
7783 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 7966 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7784 struct btrfs_device *device; 7967 struct btrfs_device *device;
7968 struct btrfs_trans_handle *trans;
7785 u64 min_free; 7969 u64 min_free;
7786 u64 dev_min = 1; 7970 u64 dev_min = 1;
7787 u64 dev_nr = 0; 7971 u64 dev_nr = 0;
@@ -7868,6 +8052,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7868 do_div(min_free, dev_min); 8052 do_div(min_free, dev_min);
7869 } 8053 }
7870 8054
8055 /* We need to do this so that we can look at pending chunks */
8056 trans = btrfs_join_transaction(root);
8057 if (IS_ERR(trans)) {
8058 ret = PTR_ERR(trans);
8059 goto out;
8060 }
8061
7871 mutex_lock(&root->fs_info->chunk_mutex); 8062 mutex_lock(&root->fs_info->chunk_mutex);
7872 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8063 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7873 u64 dev_offset; 8064 u64 dev_offset;
@@ -7878,7 +8069,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7878 */ 8069 */
7879 if (device->total_bytes > device->bytes_used + min_free && 8070 if (device->total_bytes > device->bytes_used + min_free &&
7880 !device->is_tgtdev_for_dev_replace) { 8071 !device->is_tgtdev_for_dev_replace) {
7881 ret = find_free_dev_extent(device, min_free, 8072 ret = find_free_dev_extent(trans, device, min_free,
7882 &dev_offset, NULL); 8073 &dev_offset, NULL);
7883 if (!ret) 8074 if (!ret)
7884 dev_nr++; 8075 dev_nr++;
@@ -7890,6 +8081,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7890 } 8081 }
7891 } 8082 }
7892 mutex_unlock(&root->fs_info->chunk_mutex); 8083 mutex_unlock(&root->fs_info->chunk_mutex);
8084 btrfs_end_transaction(trans, root);
7893out: 8085out:
7894 btrfs_put_block_group(block_group); 8086 btrfs_put_block_group(block_group);
7895 return ret; 8087 return ret;
@@ -8032,6 +8224,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8032 dump_space_info(space_info, 0, 0); 8224 dump_space_info(space_info, 0, 0);
8033 } 8225 }
8034 } 8226 }
8227 percpu_counter_destroy(&space_info->total_bytes_pinned);
8035 list_del(&space_info->list); 8228 list_del(&space_info->list);
8036 kfree(space_info); 8229 kfree(space_info);
8037 } 8230 }
@@ -8254,6 +8447,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
8254 sizeof(item)); 8447 sizeof(item));
8255 if (ret) 8448 if (ret)
8256 btrfs_abort_transaction(trans, extent_root, ret); 8449 btrfs_abort_transaction(trans, extent_root, ret);
8450 ret = btrfs_finish_chunk_alloc(trans, extent_root,
8451 key.objectid, key.offset);
8452 if (ret)
8453 btrfs_abort_transaction(trans, extent_root, ret);
8257 } 8454 }
8258} 8455}
8259 8456
@@ -8591,8 +8788,15 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8591 if (end - start >= range->minlen) { 8788 if (end - start >= range->minlen) {
8592 if (!block_group_cache_done(cache)) { 8789 if (!block_group_cache_done(cache)) {
8593 ret = cache_block_group(cache, 0); 8790 ret = cache_block_group(cache, 0);
8594 if (!ret) 8791 if (ret) {
8595 wait_block_group_cache_done(cache); 8792 btrfs_put_block_group(cache);
8793 break;
8794 }
8795 ret = wait_block_group_cache_done(cache);
8796 if (ret) {
8797 btrfs_put_block_group(cache);
8798 break;
8799 }
8596 } 8800 }
8597 ret = btrfs_trim_block_group(cache, 8801 ret = btrfs_trim_block_group(cache,
8598 &group_trimmed, 8802 &group_trimmed,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e7e7afb4a872..fe443fece851 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -77,10 +77,29 @@ void btrfs_leak_debug_check(void)
77 kmem_cache_free(extent_buffer_cache, eb); 77 kmem_cache_free(extent_buffer_cache, eb);
78 } 78 }
79} 79}
80
81#define btrfs_debug_check_extent_io_range(inode, start, end) \
82 __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
83static inline void __btrfs_debug_check_extent_io_range(const char *caller,
84 struct inode *inode, u64 start, u64 end)
85{
86 u64 isize = i_size_read(inode);
87
88 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
89 printk_ratelimited(KERN_DEBUG
90 "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
91 caller,
92 (unsigned long long)btrfs_ino(inode),
93 (unsigned long long)isize,
94 (unsigned long long)start,
95 (unsigned long long)end);
96 }
97}
80#else 98#else
81#define btrfs_leak_debug_add(new, head) do {} while (0) 99#define btrfs_leak_debug_add(new, head) do {} while (0)
82#define btrfs_leak_debug_del(entry) do {} while (0) 100#define btrfs_leak_debug_del(entry) do {} while (0)
83#define btrfs_leak_debug_check() do {} while (0) 101#define btrfs_leak_debug_check() do {} while (0)
102#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
84#endif 103#endif
85 104
86#define BUFFER_LRU_MAX 64 105#define BUFFER_LRU_MAX 64
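
The btrfs_debug_check_extent_io_range() wrapper uses a small but handy pattern: because the macro body expands inside the caller, __func__ evaluates to the calling function's name, so the helper can report who handed it the odd range without every call site passing a string. The same trick in isolation:

    #include <stdio.h>

    #define check_range(s, e) __check_range(__func__, (s), (e))

    static void __check_range(const char *caller, unsigned long s,
                              unsigned long e)
    {
        if (s > e)
            fprintf(stderr, "%s: bad range [%lu,%lu]\n", caller, s, e);
    }

    static void writer(void)
    {
        check_range(8, 4);   /* prints "writer: bad range [8,4]" */
    }

    int main(void) { writer(); return 0; }
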
@@ -522,6 +541,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
522 int err; 541 int err;
523 int clear = 0; 542 int clear = 0;
524 543
544 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
545
546 if (bits & EXTENT_DELALLOC)
547 bits |= EXTENT_NORESERVE;
548
525 if (delete) 549 if (delete)
526 bits |= ~EXTENT_CTLBITS; 550 bits |= ~EXTENT_CTLBITS;
527 bits |= EXTENT_FIRST_DELALLOC; 551 bits |= EXTENT_FIRST_DELALLOC;
@@ -677,6 +701,8 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
677 struct extent_state *state; 701 struct extent_state *state;
678 struct rb_node *node; 702 struct rb_node *node;
679 703
704 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
705
680 spin_lock(&tree->lock); 706 spin_lock(&tree->lock);
681again: 707again:
682 while (1) { 708 while (1) {
@@ -769,6 +795,8 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
769 u64 last_start; 795 u64 last_start;
770 u64 last_end; 796 u64 last_end;
771 797
798 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
799
772 bits |= EXTENT_FIRST_DELALLOC; 800 bits |= EXTENT_FIRST_DELALLOC;
773again: 801again:
774 if (!prealloc && (mask & __GFP_WAIT)) { 802 if (!prealloc && (mask & __GFP_WAIT)) {
@@ -989,6 +1017,8 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
989 u64 last_start; 1017 u64 last_start;
990 u64 last_end; 1018 u64 last_end;
991 1019
1020 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
1021
992again: 1022again:
993 if (!prealloc && (mask & __GFP_WAIT)) { 1023 if (!prealloc && (mask & __GFP_WAIT)) {
994 prealloc = alloc_extent_state(mask); 1024 prealloc = alloc_extent_state(mask);
@@ -2450,11 +2480,12 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2450 struct extent_state *cached = NULL; 2480 struct extent_state *cached = NULL;
2451 struct extent_state *state; 2481 struct extent_state *state;
2452 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2482 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2483 struct inode *inode = page->mapping->host;
2453 2484
2454 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2485 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2455 "mirror=%lu\n", (u64)bio->bi_sector, err, 2486 "mirror=%lu\n", (u64)bio->bi_sector, err,
2456 io_bio->mirror_num); 2487 io_bio->mirror_num);
2457 tree = &BTRFS_I(page->mapping->host)->io_tree; 2488 tree = &BTRFS_I(inode)->io_tree;
2458 2489
2459 /* We always issue full-page reads, but if some block 2490 /* We always issue full-page reads, but if some block
2460 * in a page fails to read, blk_update_request() will 2491 * in a page fails to read, blk_update_request() will
@@ -2528,6 +2559,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2528 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2559 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2529 2560
2530 if (uptodate) { 2561 if (uptodate) {
2562 loff_t i_size = i_size_read(inode);
2563 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2564 unsigned offset;
2565
2566 /* Zero out the end if this page straddles i_size */
2567 offset = i_size & (PAGE_CACHE_SIZE-1);
2568 if (page->index == end_index && offset)
2569 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2531 SetPageUptodate(page); 2570 SetPageUptodate(page);
2532 } else { 2571 } else {
2533 ClearPageUptodate(page); 2572 ClearPageUptodate(page);
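
The new zeroing in end_bio_extent_readpage() handles a page that straddles i_size: the bytes past EOF are not backed by file data and must not expose stale page contents once the page is marked uptodate. Worked numbers, assuming 4 KiB pages (PAGE_CACHE_SIZE equals PAGE_SIZE here); division and modulo below mirror the shift and mask in the hunk:

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096u

    int main(void)
    {
        /* With i_size = 10000, the last valid page index is 2 and bytes
         * 1808..4095 of that page lie past EOF, so the read completion
         * zeroes them before SetPageUptodate(). */
        uint64_t i_size = 10000;
        uint64_t end_index = i_size / PAGE_SIZE;  /* i_size >> PAGE_CACHE_SHIFT */
        unsigned offset = i_size % PAGE_SIZE;     /* i_size & (PAGE_SIZE - 1) */

        assert(end_index == 2 && offset == 1808);
        /* zero_user_segment(page, offset, PAGE_SIZE) clears [1808, 4096) */
        return 0;
    }
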
@@ -2957,7 +2996,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2957 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2996 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2958 if (page->index > end_index || 2997 if (page->index > end_index ||
2959 (page->index == end_index && !pg_offset)) { 2998 (page->index == end_index && !pg_offset)) {
2960 page->mapping->a_ops->invalidatepage(page, 0); 2999 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
2961 unlock_page(page); 3000 unlock_page(page);
2962 return 0; 3001 return 0;
2963 } 3002 }
@@ -4009,7 +4048,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4009 } 4048 }
4010 4049
4011 while (!end) { 4050 while (!end) {
4012 u64 offset_in_extent; 4051 u64 offset_in_extent = 0;
4013 4052
4014 /* break if the extent we found is outside the range */ 4053 /* break if the extent we found is outside the range */
4015 if (em->start >= max || extent_map_end(em) < off) 4054 if (em->start >= max || extent_map_end(em) < off)
@@ -4025,9 +4064,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4025 4064
4026 /* 4065 /*
4027 * record the offset from the start of the extent 4066 * record the offset from the start of the extent
4028 * for adjusting the disk offset below 4067 * for adjusting the disk offset below. Only do this if the
4068 * extent isn't compressed since our in ram offset may be past
4069 * what we have actually allocated on disk.
4029 */ 4070 */
4030 offset_in_extent = em_start - em->start; 4071 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4072 offset_in_extent = em_start - em->start;
4031 em_end = extent_map_end(em); 4073 em_end = extent_map_end(em);
4032 em_len = em_end - em_start; 4074 em_len = em_end - em_start;
4033 emflags = em->flags; 4075 emflags = em->flags;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 41fb81e7ec53..3b8c4e26e1da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13) 20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14) 21#define EXTENT_DAMAGED (1 << 14)
22#define EXTENT_NORESERVE (1 << 15)
22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 23#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 24#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
24 25
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b193bf324a41..a7bfc9541803 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,8 +34,7 @@
34 34
35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ 35#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
36 sizeof(struct btrfs_ordered_sum)) / \ 36 sizeof(struct btrfs_ordered_sum)) / \
37 sizeof(struct btrfs_sector_sum) * \ 37 sizeof(u32) * (r)->sectorsize)
38 (r)->sectorsize - (r)->sectorsize)
39 38
40int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 39int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 40 struct btrfs_root *root,
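
With btrfs_sector_sum gone, an ordered sum carries a flat u32 array, one crc32c per sector, and MAX_ORDERED_SUM_BYTES is simply how many file bytes one page-sized allocation can describe. A back-of-the-envelope check, assuming a 32-byte struct header and 4 KiB sectors (both assumptions; the real header size depends on the struct layout):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE  4096u
    #define SUM_HEADER 32u   /* assumed size of struct btrfs_ordered_sum */

    int main(void)
    {
        uint32_t sectorsize = 4096;
        /* checksums that fit in one page, times bytes each one covers */
        uint64_t max_bytes = (PAGE_SIZE - SUM_HEADER) / sizeof(uint32_t)
                             * sectorsize;
        /* 1016 * 4096 = 4,161,536 bytes, roughly 4 MiB per ordered sum */
        printf("%llu\n", (unsigned long long)max_bytes);
        return 0;
    }
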
@@ -297,7 +296,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
297 struct btrfs_path *path; 296 struct btrfs_path *path;
298 struct extent_buffer *leaf; 297 struct extent_buffer *leaf;
299 struct btrfs_ordered_sum *sums; 298 struct btrfs_ordered_sum *sums;
300 struct btrfs_sector_sum *sector_sum;
301 struct btrfs_csum_item *item; 299 struct btrfs_csum_item *item;
302 LIST_HEAD(tmplist); 300 LIST_HEAD(tmplist);
303 unsigned long offset; 301 unsigned long offset;
@@ -368,34 +366,28 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
368 struct btrfs_csum_item); 366 struct btrfs_csum_item);
369 while (start < csum_end) { 367 while (start < csum_end) {
370 size = min_t(size_t, csum_end - start, 368 size = min_t(size_t, csum_end - start,
371 MAX_ORDERED_SUM_BYTES(root)); 369 MAX_ORDERED_SUM_BYTES(root));
372 sums = kzalloc(btrfs_ordered_sum_size(root, size), 370 sums = kzalloc(btrfs_ordered_sum_size(root, size),
373 GFP_NOFS); 371 GFP_NOFS);
374 if (!sums) { 372 if (!sums) {
375 ret = -ENOMEM; 373 ret = -ENOMEM;
376 goto fail; 374 goto fail;
377 } 375 }
378 376
379 sector_sum = sums->sums;
380 sums->bytenr = start; 377 sums->bytenr = start;
381 sums->len = size; 378 sums->len = (int)size;
382 379
383 offset = (start - key.offset) >> 380 offset = (start - key.offset) >>
384 root->fs_info->sb->s_blocksize_bits; 381 root->fs_info->sb->s_blocksize_bits;
385 offset *= csum_size; 382 offset *= csum_size;
383 size >>= root->fs_info->sb->s_blocksize_bits;
386 384
387 while (size > 0) { 385 read_extent_buffer(path->nodes[0],
388 read_extent_buffer(path->nodes[0], 386 sums->sums,
389 &sector_sum->sum, 387 ((unsigned long)item) + offset,
390 ((unsigned long)item) + 388 csum_size * size);
391 offset, csum_size); 389
392 sector_sum->bytenr = start; 390 start += root->sectorsize * size;
393
394 size -= root->sectorsize;
395 start += root->sectorsize;
396 offset += csum_size;
397 sector_sum++;
398 }
399 list_add_tail(&sums->list, &tmplist); 391 list_add_tail(&sums->list, &tmplist);
400 } 392 }
401 path->slots[0]++; 393 path->slots[0]++;
@@ -417,23 +409,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
417 struct bio *bio, u64 file_start, int contig) 409 struct bio *bio, u64 file_start, int contig)
418{ 410{
419 struct btrfs_ordered_sum *sums; 411 struct btrfs_ordered_sum *sums;
420 struct btrfs_sector_sum *sector_sum;
421 struct btrfs_ordered_extent *ordered; 412 struct btrfs_ordered_extent *ordered;
422 char *data; 413 char *data;
423 struct bio_vec *bvec = bio->bi_io_vec; 414 struct bio_vec *bvec = bio->bi_io_vec;
424 int bio_index = 0; 415 int bio_index = 0;
416 int index;
425 unsigned long total_bytes = 0; 417 unsigned long total_bytes = 0;
426 unsigned long this_sum_bytes = 0; 418 unsigned long this_sum_bytes = 0;
427 u64 offset; 419 u64 offset;
428 u64 disk_bytenr;
429 420
430 WARN_ON(bio->bi_vcnt <= 0); 421 WARN_ON(bio->bi_vcnt <= 0);
431 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); 422 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
432 if (!sums) 423 if (!sums)
433 return -ENOMEM; 424 return -ENOMEM;
434 425
435 sector_sum = sums->sums;
436 disk_bytenr = (u64)bio->bi_sector << 9;
437 sums->len = bio->bi_size; 426 sums->len = bio->bi_size;
438 INIT_LIST_HEAD(&sums->list); 427 INIT_LIST_HEAD(&sums->list);
439 428
@@ -444,7 +433,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
444 433
445 ordered = btrfs_lookup_ordered_extent(inode, offset); 434 ordered = btrfs_lookup_ordered_extent(inode, offset);
446 BUG_ON(!ordered); /* Logic error */ 435 BUG_ON(!ordered); /* Logic error */
447 sums->bytenr = ordered->start; 436 sums->bytenr = (u64)bio->bi_sector << 9;
437 index = 0;
448 438
449 while (bio_index < bio->bi_vcnt) { 439 while (bio_index < bio->bi_vcnt) {
450 if (!contig) 440 if (!contig)
@@ -463,28 +453,27 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
463 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), 453 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
464 GFP_NOFS); 454 GFP_NOFS);
465 BUG_ON(!sums); /* -ENOMEM */ 455 BUG_ON(!sums); /* -ENOMEM */
466 sector_sum = sums->sums;
467 sums->len = bytes_left; 456 sums->len = bytes_left;
468 ordered = btrfs_lookup_ordered_extent(inode, offset); 457 ordered = btrfs_lookup_ordered_extent(inode, offset);
469 BUG_ON(!ordered); /* Logic error */ 458 BUG_ON(!ordered); /* Logic error */
470 sums->bytenr = ordered->start; 459 sums->bytenr = ((u64)bio->bi_sector << 9) +
460 total_bytes;
461 index = 0;
471 } 462 }
472 463
473 data = kmap_atomic(bvec->bv_page); 464 data = kmap_atomic(bvec->bv_page);
474 sector_sum->sum = ~(u32)0; 465 sums->sums[index] = ~(u32)0;
475 sector_sum->sum = btrfs_csum_data(data + bvec->bv_offset, 466 sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
476 sector_sum->sum, 467 sums->sums[index],
477 bvec->bv_len); 468 bvec->bv_len);
478 kunmap_atomic(data); 469 kunmap_atomic(data);
479 btrfs_csum_final(sector_sum->sum, 470 btrfs_csum_final(sums->sums[index],
480 (char *)&sector_sum->sum); 471 (char *)(sums->sums + index));
481 sector_sum->bytenr = disk_bytenr;
482 472
483 sector_sum++;
484 bio_index++; 473 bio_index++;
474 index++;
485 total_bytes += bvec->bv_len; 475 total_bytes += bvec->bv_len;
486 this_sum_bytes += bvec->bv_len; 476 this_sum_bytes += bvec->bv_len;
487 disk_bytenr += bvec->bv_len;
488 offset += bvec->bv_len; 477 offset += bvec->bv_len;
489 bvec++; 478 bvec++;
490 } 479 }
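
The rewritten loop indexes straight into sums->sums[] instead of walking per-sector (bytenr, sum) pairs; the disk byte number is now implied by sums->bytenr plus the running offset. A self-contained sketch of the per-sector checksumming, with a plain bitwise CRC-32C standing in for btrfs_csum_data()/btrfs_csum_final(); as far as the hunk shows, btrfs seeds with ~0 and stores the inverted CRC, which together give the standard CRC-32C:

    #include <stddef.h>
    #include <stdint.h>

    /* Bitwise CRC-32C (Castagnoli, reflected polynomial 0x82F63B78). */
    static uint32_t crc32c_raw(uint32_t crc, const void *buf, size_t len)
    {
        const uint8_t *p = buf;

        while (len--) {
            crc ^= *p++;
            for (int k = 0; k < 8; k++)
                crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
        }
        return crc;
    }

    /* One u32 checksum per sector, appended in bio order; len is assumed
     * to be a multiple of sectorsize, as the bio_vec lengths are here. */
    static void csum_chunks(const uint8_t *data, size_t len,
                            size_t sectorsize, uint32_t *sums)
    {
        size_t index = 0;

        for (size_t off = 0; off < len; off += sectorsize)
            sums[index++] = ~crc32c_raw(~0u, data + off, sectorsize);
    }
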
@@ -672,62 +661,46 @@ out:
672 return ret; 661 return ret;
673} 662}
674 663
675static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums,
676 struct btrfs_sector_sum *sector_sum,
677 u64 total_bytes, u64 sectorsize)
678{
679 u64 tmp = sectorsize;
680 u64 next_sector = sector_sum->bytenr;
681 struct btrfs_sector_sum *next = sector_sum + 1;
682
683 while ((tmp + total_bytes) < sums->len) {
684 if (next_sector + sectorsize != next->bytenr)
685 break;
686 tmp += sectorsize;
687 next_sector = next->bytenr;
688 next++;
689 }
690 return tmp;
691}
692
693int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 664int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
694 struct btrfs_root *root, 665 struct btrfs_root *root,
695 struct btrfs_ordered_sum *sums) 666 struct btrfs_ordered_sum *sums)
696{ 667{
697 u64 bytenr;
698 int ret;
699 struct btrfs_key file_key; 668 struct btrfs_key file_key;
700 struct btrfs_key found_key; 669 struct btrfs_key found_key;
701 u64 next_offset;
702 u64 total_bytes = 0;
703 int found_next;
704 struct btrfs_path *path; 670 struct btrfs_path *path;
705 struct btrfs_csum_item *item; 671 struct btrfs_csum_item *item;
706 struct btrfs_csum_item *item_end; 672 struct btrfs_csum_item *item_end;
707 struct extent_buffer *leaf = NULL; 673 struct extent_buffer *leaf = NULL;
674 u64 next_offset;
675 u64 total_bytes = 0;
708 u64 csum_offset; 676 u64 csum_offset;
709 struct btrfs_sector_sum *sector_sum; 677 u64 bytenr;
710 u32 nritems; 678 u32 nritems;
711 u32 ins_size; 679 u32 ins_size;
680 int index = 0;
681 int found_next;
682 int ret;
712 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 683 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
713 684
714 path = btrfs_alloc_path(); 685 path = btrfs_alloc_path();
715 if (!path) 686 if (!path)
716 return -ENOMEM; 687 return -ENOMEM;
717
718 sector_sum = sums->sums;
719again: 688again:
720 next_offset = (u64)-1; 689 next_offset = (u64)-1;
721 found_next = 0; 690 found_next = 0;
691 bytenr = sums->bytenr + total_bytes;
722 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 692 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
723 file_key.offset = sector_sum->bytenr; 693 file_key.offset = bytenr;
724 bytenr = sector_sum->bytenr;
725 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 694 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
726 695
727 item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); 696 item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
728 if (!IS_ERR(item)) { 697 if (!IS_ERR(item)) {
729 leaf = path->nodes[0];
730 ret = 0; 698 ret = 0;
699 leaf = path->nodes[0];
700 item_end = btrfs_item_ptr(leaf, path->slots[0],
701 struct btrfs_csum_item);
702 item_end = (struct btrfs_csum_item *)((char *)item_end +
703 btrfs_item_size_nr(leaf, path->slots[0]));
731 goto found; 704 goto found;
732 } 705 }
733 ret = PTR_ERR(item); 706 ret = PTR_ERR(item);
@@ -807,8 +780,7 @@ again:
807 780
808 free_space = btrfs_leaf_free_space(root, leaf) - 781 free_space = btrfs_leaf_free_space(root, leaf) -
809 sizeof(struct btrfs_item) - csum_size; 782 sizeof(struct btrfs_item) - csum_size;
810 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, 783 tmp = sums->len - total_bytes;
811 root->sectorsize);
812 tmp >>= root->fs_info->sb->s_blocksize_bits; 784 tmp >>= root->fs_info->sb->s_blocksize_bits;
813 WARN_ON(tmp < 1); 785 WARN_ON(tmp < 1);
814 786
@@ -822,6 +794,7 @@ again:
822 diff *= csum_size; 794 diff *= csum_size;
823 795
824 btrfs_extend_item(root, path, diff); 796 btrfs_extend_item(root, path, diff);
797 ret = 0;
825 goto csum; 798 goto csum;
826 } 799 }
827 800
@@ -831,8 +804,7 @@ insert:
831 if (found_next) { 804 if (found_next) {
832 u64 tmp; 805 u64 tmp;
833 806
834 tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, 807 tmp = sums->len - total_bytes;
835 root->sectorsize);
836 tmp >>= root->fs_info->sb->s_blocksize_bits; 808 tmp >>= root->fs_info->sb->s_blocksize_bits;
837 tmp = min(tmp, (next_offset - file_key.offset) >> 809 tmp = min(tmp, (next_offset - file_key.offset) >>
838 root->fs_info->sb->s_blocksize_bits); 810 root->fs_info->sb->s_blocksize_bits);
@@ -853,31 +825,25 @@ insert:
853 WARN_ON(1); 825 WARN_ON(1);
854 goto fail_unlock; 826 goto fail_unlock;
855 } 827 }
856csum:
857 leaf = path->nodes[0]; 828 leaf = path->nodes[0];
829csum:
858 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 830 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
859 ret = 0; 831 item_end = (struct btrfs_csum_item *)((unsigned char *)item +
832 btrfs_item_size_nr(leaf, path->slots[0]));
860 item = (struct btrfs_csum_item *)((unsigned char *)item + 833 item = (struct btrfs_csum_item *)((unsigned char *)item +
861 csum_offset * csum_size); 834 csum_offset * csum_size);
862found: 835found:
863 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 836 ins_size = (u32)(sums->len - total_bytes) >>
864 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 837 root->fs_info->sb->s_blocksize_bits;
865 btrfs_item_size_nr(leaf, path->slots[0])); 838 ins_size *= csum_size;
866next_sector: 839 ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
867 840 ins_size);
868 write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size); 841 write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
869 842 ins_size);
870 total_bytes += root->sectorsize; 843
871 sector_sum++; 844 ins_size /= csum_size;
872 if (total_bytes < sums->len) { 845 total_bytes += ins_size * root->sectorsize;
873 item = (struct btrfs_csum_item *)((char *)item + 846 index += ins_size;
874 csum_size);
875 if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
876 sector_sum->bytenr) {
877 bytenr = sector_sum->bytenr;
878 goto next_sector;
879 }
880 }
881 847
882 btrfs_mark_buffer_dirty(path->nodes[0]); 848 btrfs_mark_buffer_dirty(path->nodes[0]);
883 if (total_bytes < sums->len) { 849 if (total_bytes < sums->len) {
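
The insert path now writes a whole run of checksums with one write_extent_buffer() call: ins_size is the byte count of the remaining checksums, clamped to the room left in the leaf item, then converted back to a sector count to advance total_bytes. Worked numbers for 4 KiB blocks and 4-byte crc32c sums:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t csum_size = 4;
        uint64_t remaining = 64 * 1024;                   /* sums->len - total_bytes */
        uint32_t ins_size = (uint32_t)(remaining >> 12);  /* 16 checksums */
        ins_size *= csum_size;                            /* 64 bytes to copy */

        uint32_t item_room = 40;    /* item_end - item for this leaf, say */
        if (ins_size > item_room)
            ins_size = item_room;   /* clamp to the space in the item */

        ins_size /= csum_size;      /* back to a sector count */
        assert(ins_size == 10);     /* 10 sectors = 40 KiB logged; loop again
                                       for the rest */
        return 0;
    }
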
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4205ba752d40..8e686a427ce2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -309,10 +309,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
309 ret = PTR_ERR(inode_root); 309 ret = PTR_ERR(inode_root);
310 goto cleanup; 310 goto cleanup;
311 } 311 }
312 if (btrfs_root_refs(&inode_root->root_item) == 0) {
313 ret = -ENOENT;
314 goto cleanup;
315 }
316 312
317 key.objectid = defrag->ino; 313 key.objectid = defrag->ino;
318 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 314 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -600,20 +596,29 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
600 if (no_splits) 596 if (no_splits)
601 goto next; 597 goto next;
602 598
603 if (em->block_start < EXTENT_MAP_LAST_BYTE && 599 if (em->start < start) {
604 em->start < start) {
605 split->start = em->start; 600 split->start = em->start;
606 split->len = start - em->start; 601 split->len = start - em->start;
607 split->orig_start = em->orig_start;
608 split->block_start = em->block_start;
609 602
610 if (compressed) 603 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
611 split->block_len = em->block_len; 604 split->orig_start = em->orig_start;
612 else 605 split->block_start = em->block_start;
613 split->block_len = split->len; 606
614 split->ram_bytes = em->ram_bytes; 607 if (compressed)
615 split->orig_block_len = max(split->block_len, 608 split->block_len = em->block_len;
616 em->orig_block_len); 609 else
610 split->block_len = split->len;
611 split->orig_block_len = max(split->block_len,
612 em->orig_block_len);
613 split->ram_bytes = em->ram_bytes;
614 } else {
615 split->orig_start = split->start;
616 split->block_len = 0;
617 split->block_start = em->block_start;
618 split->orig_block_len = 0;
619 split->ram_bytes = split->len;
620 }
621
617 split->generation = gen; 622 split->generation = gen;
618 split->bdev = em->bdev; 623 split->bdev = em->bdev;
619 split->flags = flags; 624 split->flags = flags;
@@ -624,8 +629,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
624 split = split2; 629 split = split2;
625 split2 = NULL; 630 split2 = NULL;
626 } 631 }
627 if (em->block_start < EXTENT_MAP_LAST_BYTE && 632 if (testend && em->start + em->len > start + len) {
628 testend && em->start + em->len > start + len) {
629 u64 diff = start + len - em->start; 633 u64 diff = start + len - em->start;
630 634
631 split->start = start + len; 635 split->start = start + len;
@@ -634,18 +638,28 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
634 split->flags = flags; 638 split->flags = flags;
635 split->compress_type = em->compress_type; 639 split->compress_type = em->compress_type;
636 split->generation = gen; 640 split->generation = gen;
637 split->orig_block_len = max(em->block_len, 641
642 if (em->block_start < EXTENT_MAP_LAST_BYTE) {
643 split->orig_block_len = max(em->block_len,
638 em->orig_block_len); 644 em->orig_block_len);
639 split->ram_bytes = em->ram_bytes;
640 645
641 if (compressed) { 646 split->ram_bytes = em->ram_bytes;
642 split->block_len = em->block_len; 647 if (compressed) {
643 split->block_start = em->block_start; 648 split->block_len = em->block_len;
644 split->orig_start = em->orig_start; 649 split->block_start = em->block_start;
650 split->orig_start = em->orig_start;
651 } else {
652 split->block_len = split->len;
653 split->block_start = em->block_start
654 + diff;
655 split->orig_start = em->orig_start;
656 }
645 } else { 657 } else {
646 split->block_len = split->len; 658 split->ram_bytes = split->len;
647 split->block_start = em->block_start + diff; 659 split->orig_start = split->start;
648 split->orig_start = em->orig_start; 660 split->block_len = 0;
661 split->block_start = em->block_start;
662 split->orig_block_len = 0;
649 } 663 }
650 664
651 ret = add_extent_mapping(em_tree, split, modified); 665 ret = add_extent_mapping(em_tree, split, modified);
@@ -1317,6 +1331,56 @@ fail:
1317 1331
1318} 1332}
1319 1333
1334static noinline int check_can_nocow(struct inode *inode, loff_t pos,
1335 size_t *write_bytes)
1336{
1337 struct btrfs_trans_handle *trans;
1338 struct btrfs_root *root = BTRFS_I(inode)->root;
1339 struct btrfs_ordered_extent *ordered;
1340 u64 lockstart, lockend;
1341 u64 num_bytes;
1342 int ret;
1343
1344 lockstart = round_down(pos, root->sectorsize);
1345 lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
1346
1347 while (1) {
1348 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1349 ordered = btrfs_lookup_ordered_range(inode, lockstart,
1350 lockend - lockstart + 1);
1351 if (!ordered) {
1352 break;
1353 }
1354 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1355 btrfs_start_ordered_extent(inode, ordered, 1);
1356 btrfs_put_ordered_extent(ordered);
1357 }
1358
1359 trans = btrfs_join_transaction(root);
1360 if (IS_ERR(trans)) {
1361 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1362 return PTR_ERR(trans);
1363 }
1364
1365 num_bytes = lockend - lockstart + 1;
1366 ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
1367 NULL);
1368 btrfs_end_transaction(trans, root);
1369 if (ret <= 0) {
1370 ret = 0;
1371 } else {
1372 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1373 EXTENT_DIRTY | EXTENT_DELALLOC |
1374 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1375 NULL, GFP_NOFS);
1376 *write_bytes = min_t(size_t, *write_bytes, num_bytes);
1377 }
1378
1379 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
1380
1381 return ret;
1382}
1383
1320static noinline ssize_t __btrfs_buffered_write(struct file *file, 1384static noinline ssize_t __btrfs_buffered_write(struct file *file,
1321 struct iov_iter *i, 1385 struct iov_iter *i,
1322 loff_t pos) 1386 loff_t pos)
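
check_can_nocow() exists for the buffered-write hunks that follow: when the plain data-space reservation fails with ENOSPC on a NODATACOW or preallocated inode, the write may still proceed by overwriting existing extents in place, in which case only metadata needs reserving. A compile-only sketch of that decision flow; the three helpers are stubs whose names merely echo the kernel functions:

    #include <errno.h>
    #include <stdbool.h>
    #include <stddef.h>

    static int reserve_data(size_t bytes) { (void)bytes; return -ENOSPC; }
    static int can_nocow(size_t *bytes) { (void)bytes; return 1; }
    static int reserve_metadata(size_t bytes) { (void)bytes; return 0; }

    static int reserve_for_write(size_t *write_bytes, bool nodatacow,
                                 bool *meta_only)
    {
        int ret = reserve_data(*write_bytes);

        if (ret == -ENOSPC && nodatacow) {
            /* No free data space, but the range may be overwritable in
             * place; can_nocow() may shrink *write_bytes to what is. */
            ret = can_nocow(write_bytes);
            if (ret > 0) {
                *meta_only = true;
                ret = 0;
            } else {
                ret = -ENOSPC;
            }
        }
        if (ret)
            return ret;

        ret = reserve_metadata(*write_bytes);
        /* on failure the caller also drops the data reservation, unless
         * the nocow path skipped it (*meta_only) */
        return ret;
    }
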
@@ -1324,10 +1388,12 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1324 struct inode *inode = file_inode(file); 1388 struct inode *inode = file_inode(file);
1325 struct btrfs_root *root = BTRFS_I(inode)->root; 1389 struct btrfs_root *root = BTRFS_I(inode)->root;
1326 struct page **pages = NULL; 1390 struct page **pages = NULL;
1391 u64 release_bytes = 0;
1327 unsigned long first_index; 1392 unsigned long first_index;
1328 size_t num_written = 0; 1393 size_t num_written = 0;
1329 int nrptrs; 1394 int nrptrs;
1330 int ret = 0; 1395 int ret = 0;
1396 bool only_release_metadata = false;
1331 bool force_page_uptodate = false; 1397 bool force_page_uptodate = false;
1332 1398
1333 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1399 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1348,6 +1414,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1348 offset); 1414 offset);
1349 size_t num_pages = (write_bytes + offset + 1415 size_t num_pages = (write_bytes + offset +
1350 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1416 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1417 size_t reserve_bytes;
1351 size_t dirty_pages; 1418 size_t dirty_pages;
1352 size_t copied; 1419 size_t copied;
1353 1420
@@ -1362,11 +1429,41 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1362 break; 1429 break;
1363 } 1430 }
1364 1431
1365 ret = btrfs_delalloc_reserve_space(inode, 1432 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1366 num_pages << PAGE_CACHE_SHIFT); 1433 ret = btrfs_check_data_free_space(inode, reserve_bytes);
1434 if (ret == -ENOSPC &&
1435 (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1436 BTRFS_INODE_PREALLOC))) {
1437 ret = check_can_nocow(inode, pos, &write_bytes);
1438 if (ret > 0) {
1439 only_release_metadata = true;
1440 /*
1441 * our prealloc extent may be smaller than
1442 * write_bytes, so scale down.
1443 */
1444 num_pages = (write_bytes + offset +
1445 PAGE_CACHE_SIZE - 1) >>
1446 PAGE_CACHE_SHIFT;
1447 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1448 ret = 0;
1449 } else {
1450 ret = -ENOSPC;
1451 }
1452 }
1453
1367 if (ret) 1454 if (ret)
1368 break; 1455 break;
1369 1456
1457 ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
1458 if (ret) {
1459 if (!only_release_metadata)
1460 btrfs_free_reserved_data_space(inode,
1461 reserve_bytes);
1462 break;
1463 }
1464
1465 release_bytes = reserve_bytes;
1466
1370 /* 1467 /*
1371 * This is going to setup the pages array with the number of 1468 * This is going to setup the pages array with the number of
1372 * pages we want, so we don't really need to worry about the 1469 * pages we want, so we don't really need to worry about the
@@ -1375,11 +1472,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1375 ret = prepare_pages(root, file, pages, num_pages, 1472 ret = prepare_pages(root, file, pages, num_pages,
1376 pos, first_index, write_bytes, 1473 pos, first_index, write_bytes,
1377 force_page_uptodate); 1474 force_page_uptodate);
1378 if (ret) { 1475 if (ret)
1379 btrfs_delalloc_release_space(inode,
1380 num_pages << PAGE_CACHE_SHIFT);
1381 break; 1476 break;
1382 }
1383 1477
1384 copied = btrfs_copy_from_user(pos, num_pages, 1478 copied = btrfs_copy_from_user(pos, num_pages,
1385 write_bytes, pages, i); 1479 write_bytes, pages, i);
@@ -1409,30 +1503,46 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1409 * managed to copy. 1503 * managed to copy.
1410 */ 1504 */
1411 if (num_pages > dirty_pages) { 1505 if (num_pages > dirty_pages) {
1506 release_bytes = (num_pages - dirty_pages) <<
1507 PAGE_CACHE_SHIFT;
1412 if (copied > 0) { 1508 if (copied > 0) {
1413 spin_lock(&BTRFS_I(inode)->lock); 1509 spin_lock(&BTRFS_I(inode)->lock);
1414 BTRFS_I(inode)->outstanding_extents++; 1510 BTRFS_I(inode)->outstanding_extents++;
1415 spin_unlock(&BTRFS_I(inode)->lock); 1511 spin_unlock(&BTRFS_I(inode)->lock);
1416 } 1512 }
1417 btrfs_delalloc_release_space(inode, 1513 if (only_release_metadata)
1418 (num_pages - dirty_pages) << 1514 btrfs_delalloc_release_metadata(inode,
1419 PAGE_CACHE_SHIFT); 1515 release_bytes);
1516 else
1517 btrfs_delalloc_release_space(inode,
1518 release_bytes);
1420 } 1519 }
1421 1520
1521 release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
1422 if (copied > 0) { 1522 if (copied > 0) {
1423 ret = btrfs_dirty_pages(root, inode, pages, 1523 ret = btrfs_dirty_pages(root, inode, pages,
1424 dirty_pages, pos, copied, 1524 dirty_pages, pos, copied,
1425 NULL); 1525 NULL);
1426 if (ret) { 1526 if (ret) {
1427 btrfs_delalloc_release_space(inode,
1428 dirty_pages << PAGE_CACHE_SHIFT);
1429 btrfs_drop_pages(pages, num_pages); 1527 btrfs_drop_pages(pages, num_pages);
1430 break; 1528 break;
1431 } 1529 }
1432 } 1530 }
1433 1531
1532 release_bytes = 0;
1434 btrfs_drop_pages(pages, num_pages); 1533 btrfs_drop_pages(pages, num_pages);
1435 1534
1535 if (only_release_metadata && copied > 0) {
1536 u64 lockstart = round_down(pos, root->sectorsize);
1537 u64 lockend = lockstart +
1538 (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1539
1540 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1541 lockend, EXTENT_NORESERVE, NULL,
1542 NULL, GFP_NOFS);
1543 only_release_metadata = false;
1544 }
1545
1436 cond_resched(); 1546 cond_resched();
1437 1547
1438 balance_dirty_pages_ratelimited(inode->i_mapping); 1548 balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1445,6 +1555,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1445 1555
1446 kfree(pages); 1556 kfree(pages);
1447 1557
1558 if (release_bytes) {
1559 if (only_release_metadata)
1560 btrfs_delalloc_release_metadata(inode, release_bytes);
1561 else
1562 btrfs_delalloc_release_space(inode, release_bytes);
1563 }
1564
1448 return num_written ? num_written : ret; 1565 return num_written ? num_written : ret;
1449} 1566}
1450 1567
@@ -2175,12 +2292,6 @@ static long btrfs_fallocate(struct file *file, int mode,
2175 goto out_reserve_fail; 2292 goto out_reserve_fail;
2176 } 2293 }
2177 2294
2178 /*
2179 * wait for ordered IO before we have any locks. We'll loop again
2180 * below with the locks held.
2181 */
2182 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2183
2184 mutex_lock(&inode->i_mutex); 2295 mutex_lock(&inode->i_mutex);
2185 ret = inode_newsize_ok(inode, alloc_end); 2296 ret = inode_newsize_ok(inode, alloc_end);
2186 if (ret) 2297 if (ret)
@@ -2191,8 +2302,23 @@ static long btrfs_fallocate(struct file *file, int mode,
2191 alloc_start); 2302 alloc_start);
2192 if (ret) 2303 if (ret)
2193 goto out; 2304 goto out;
2305 } else {
2306 /*
2307 * If we are fallocating from the end of the file onward we
2308 * need to zero out the end of the page if i_size lands in the
2309 * middle of a page.
2310 */
2311 ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
2312 if (ret)
2313 goto out;
2194 } 2314 }
2195 2315
2316 /*
2317 * wait for ordered IO before we have any locks. We'll loop again
2318 * below with the locks held.
2319 */
2320 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
2321
2196 locked_end = alloc_end - 1; 2322 locked_end = alloc_end - 1;
2197 while (1) { 2323 while (1) {
2198 struct btrfs_ordered_extent *ordered; 2324 struct btrfs_ordered_extent *ordered;
@@ -2425,20 +2551,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
2425 } 2551 }
2426 } 2552 }
2427 2553
2428 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) { 2554 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2429 offset = -EINVAL;
2430 goto out;
2431 }
2432 if (offset > inode->i_sb->s_maxbytes) {
2433 offset = -EINVAL;
2434 goto out;
2435 }
2436
2437 /* Special lock needed here? */
2438 if (offset != file->f_pos) {
2439 file->f_pos = offset;
2440 file->f_version = 0;
2441 }
2442out: 2555out:
2443 mutex_unlock(&inode->i_mutex); 2556 mutex_unlock(&inode->i_mutex);
2444 return offset; 2557 return offset;
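
The llseek conversion folds the open-coded bounds checks and f_pos update into vfs_setpos(). Judging by the code this hunk deletes, the helper behaves roughly like the following userspace rendering; field names are simplified, and negative offsets are only legal for FMODE_UNSIGNED_OFFSET files:

    #include <errno.h>
    #include <stdint.h>

    struct file_pos {
        int64_t f_pos;
        uint64_t f_version;
        int unsigned_offsets;   /* FMODE_UNSIGNED_OFFSET */
    };

    static int64_t setpos_sketch(struct file_pos *f, int64_t offset,
                                 int64_t maxsize)
    {
        if (offset < 0 && !f->unsigned_offsets)
            return -EINVAL;
        if (offset > maxsize)
            return -EINVAL;

        if (offset != f->f_pos) {
            f->f_pos = offset;
            f->f_version = 0;   /* invalidate cached directory position */
        }
        return offset;
    }
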
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e53009657f0e..b21a3cd667d8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -213,7 +213,7 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
213 else 213 else
214 ret = 0; 214 ret = 0;
215 spin_unlock(&rsv->lock); 215 spin_unlock(&rsv->lock);
216 return 0; 216 return ret;
217} 217}
218 218
219int btrfs_truncate_free_space_cache(struct btrfs_root *root, 219int btrfs_truncate_free_space_cache(struct btrfs_root *root,
@@ -3150,6 +3150,8 @@ again:
3150 return 0; 3150 return 0;
3151} 3151}
3152 3152
3153#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
3154
3153/* 3155/*
3154 * This test just does basic sanity checking, making sure we can add an extent 3156
3155 * entry and remove space from either end and the middle, and make sure we can 3157 * entry and remove space from either end and the middle, and make sure we can
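
The test_msg() macro added above leans on two preprocessor features worth noting: adjacent string literals paste the "btrfs: selftest: " prefix onto the caller's format string, and ##__VA_ARGS__ deletes the trailing comma when a call supplies no variadic arguments, which a plain __VA_ARGS__ would not. The same macro shape over printf():

    #include <stdio.h>

    #define test_msg(fmt, ...) printf("btrfs: selftest: " fmt, ##__VA_ARGS__)

    int main(void)
    {
        test_msg("Running extent only tests\n");      /* no varargs: ## trick */
        test_msg("Error removing extent %d\n", -22);  /* with an argument */
        return 0;
    }
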
@@ -3159,63 +3161,63 @@ static int test_extents(struct btrfs_block_group_cache *cache)
3159{ 3161{
3160 int ret = 0; 3162 int ret = 0;
3161 3163
3162 printk(KERN_ERR "Running extent only tests\n"); 3164 test_msg("Running extent only tests\n");
3163 3165
3164 /* First just make sure we can remove an entire entry */ 3166 /* First just make sure we can remove an entire entry */
3165 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); 3167 ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
3166 if (ret) { 3168 if (ret) {
3167 printk(KERN_ERR "Error adding initial extents %d\n", ret); 3169 test_msg("Error adding initial extents %d\n", ret);
3168 return ret; 3170 return ret;
3169 } 3171 }
3170 3172
3171 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); 3173 ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
3172 if (ret) { 3174 if (ret) {
3173 printk(KERN_ERR "Error removing extent %d\n", ret); 3175 test_msg("Error removing extent %d\n", ret);
3174 return ret; 3176 return ret;
3175 } 3177 }
3176 3178
3177 if (check_exists(cache, 0, 4 * 1024 * 1024)) { 3179 if (check_exists(cache, 0, 4 * 1024 * 1024)) {
-		printk(KERN_ERR "Full remove left some lingering space\n");
+		test_msg("Full remove left some lingering space\n");
 		return -1;
 	}

 	/* Ok edge and middle cases now */
 	ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Error adding half extent %d\n", ret);
+		test_msg("Error adding half extent %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Error removing tail end %d\n", ret);
+		test_msg("Error removing tail end %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Error removing front end %d\n", ret);
+		test_msg("Error removing front end %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
 	if (ret) {
-		printk(KERN_ERR "Error removing middle peice %d\n", ret);
+		test_msg("Error removing middle piece %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, 0, 1 * 1024 * 1024)) {
-		printk(KERN_ERR "Still have space at the front\n");
+		test_msg("Still have space at the front\n");
 		return -1;
 	}

 	if (check_exists(cache, 2 * 1024 * 1024, 4096)) {
-		printk(KERN_ERR "Still have space in the middle\n");
+		test_msg("Still have space in the middle\n");
 		return -1;
 	}

 	if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
-		printk(KERN_ERR "Still have space at the end\n");
+		test_msg("Still have space at the end\n");
 		return -1;
 	}

@@ -3230,34 +3232,34 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
 	u64 next_bitmap_offset;
 	int ret;

-	printk(KERN_ERR "Running bitmap only tests\n");
+	test_msg("Running bitmap only tests\n");

 	ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret);
+		test_msg("Couldn't create a bitmap entry %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Error removing bitmap full range %d\n", ret);
+		test_msg("Error removing bitmap full range %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, 0, 4 * 1024 * 1024)) {
-		printk(KERN_ERR "Left some space in bitmap\n");
+		test_msg("Left some space in bitmap\n");
 		return -1;
 	}

 	ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret);
+		test_msg("Couldn't add to our bitmap entry %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret);
+		test_msg("Couldn't remove middle chunk %d\n", ret);
 		return ret;
 	}

@@ -3271,21 +3273,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
 	ret = add_free_space_entry(cache, next_bitmap_offset -
 				   (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add space that straddles two bitmaps"
-		       " %d\n", ret);
+		test_msg("Couldn't add space that straddles two bitmaps %d\n",
+			 ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, next_bitmap_offset -
 				      (1 * 1024 * 1024), 2 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret);
+		test_msg("Couldn't remove overlapping space %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
 			 2 * 1024 * 1024)) {
-		printk(KERN_ERR "Left some space when removing overlapping\n");
+		test_msg("Left some space when removing overlapping\n");
 		return -1;
 	}

@@ -3300,7 +3302,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
 	int ret;

-	printk(KERN_ERR "Running bitmap and extent tests\n");
+	test_msg("Running bitmap and extent tests\n");

 	/*
 	 * First let's do something simple, an extent at the same offset as the
@@ -3309,42 +3311,42 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	 */
 	ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret);
+		test_msg("Couldn't create bitmap entry %d\n", ret);
 		return ret;
 	}

 	ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+		test_msg("Couldn't add extent entry %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Couldn't remove extent entry %d\n", ret);
+		test_msg("Couldn't remove extent entry %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, 0, 1 * 1024 * 1024)) {
-		printk(KERN_ERR "Left remnants after our remove\n");
+		test_msg("Left remnants after our remove\n");
 		return -1;
 	}

 	/* Now to add back the extent entry and remove from the bitmap */
 	ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
 	if (ret) {
-		printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret);
+		test_msg("Couldn't re-add extent entry %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret);
+		test_msg("Couldn't remove from bitmap %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
-		printk(KERN_ERR "Left remnants in the bitmap\n");
+		test_msg("Left remnants in the bitmap\n");
 		return -1;
 	}

@@ -3354,19 +3356,18 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	 */
 	ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret);
+		test_msg("Couldn't add to a bitmap %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret);
+		test_msg("Couldn't remove overlapping space %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
-		printk(KERN_ERR "Left over peices after removing "
-		       "overlapping\n");
+		test_msg("Left over pieces after removing overlapping\n");
 		return -1;
 	}

@@ -3375,24 +3376,24 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	/* Now with the extent entry offset into the bitmap */
 	ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret);
+		test_msg("Couldn't add space to the bitmap %d\n", ret);
 		return ret;
 	}

 	ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret);
+		test_msg("Couldn't add extent to the cache %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Problem removing overlapping space %d\n", ret);
+		test_msg("Problem removing overlapping space %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
-		printk(KERN_ERR "Left something behind when removing space");
+		test_msg("Left something behind when removing space\n");
 		return -1;
 	}

@@ -3410,27 +3411,27 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
 				   4 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add bitmap %d\n", ret);
+		test_msg("Couldn't add bitmap %d\n", ret);
 		return ret;
 	}

 	ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
 				   5 * 1024 * 1024, 0);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+		test_msg("Couldn't add extent entry %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
 				      5 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Failed to free our space %d\n", ret);
+		test_msg("Failed to free our space %d\n", ret);
 		return ret;
 	}

 	if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
 			 5 * 1024 * 1024)) {
-		printk(KERN_ERR "Left stuff over\n");
+		test_msg("Left stuff over\n");
 		return -1;
 	}

@@ -3444,20 +3445,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	 */
 	ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret);
+		test_msg("Couldn't add bitmap entry %d\n", ret);
 		return ret;
 	}

 	ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
 	if (ret) {
-		printk(KERN_ERR "Couldn't add extent entry %d\n", ret);
+		test_msg("Couldn't add extent entry %d\n", ret);
 		return ret;
 	}

 	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
 	if (ret) {
-		printk(KERN_ERR "Error removing bitmap and extent "
-		       "overlapping %d\n", ret);
+		test_msg("Error removing bitmap and extent overlapping %d\n", ret);
 		return ret;
 	}

@@ -3469,11 +3469,11 @@ void btrfs_test_free_space_cache(void)
 {
 	struct btrfs_block_group_cache *cache;

-	printk(KERN_ERR "Running btrfs free space cache tests\n");
+	test_msg("Running btrfs free space cache tests\n");

 	cache = init_test_block_group();
 	if (!cache) {
-		printk(KERN_ERR "Couldn't run the tests\n");
+		test_msg("Couldn't run the tests\n");
 		return;
 	}

@@ -3487,6 +3487,9 @@ out:
 	__btrfs_remove_free_space_cache(cache->free_space_ctl);
 	kfree(cache->free_space_ctl);
 	kfree(cache);
-	printk(KERN_ERR "Free space cache tests finished\n");
+	test_msg("Free space cache tests finished\n");
 }
-#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+#undef test_msg
+#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+void btrfs_test_free_space_cache(void) {}
+#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
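
[Note] The hunk above changes how the self-test entry point is compiled out: instead of fencing the declaration and every call site with #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS, the implementation provides an empty stub when the tests are disabled, so callers can invoke the function unconditionally. A minimal sketch of the pattern (illustrative names, not the btrfs ones):

	/* header: always declared, no #ifdef needed at call sites */
	void mymod_run_sanity_tests(void);

	/* implementation file */
	#ifdef CONFIG_MYMOD_RUN_SANITY_TESTS
	void mymod_run_sanity_tests(void)
	{
		/* real test bodies go here */
	}
	#else /* !CONFIG_MYMOD_RUN_SANITY_TESTS */
	void mymod_run_sanity_tests(void) {}
	#endif

In the disabled configuration the call simply becomes a no-op, which is why the #ifdef around the declaration can be dropped in the header hunk that follows.
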
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 8b7f19f44961..894116b71304 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -113,8 +113,6 @@ int btrfs_return_cluster_to_free_space(
 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 			   u64 *trimmed, u64 start, u64 end, u64 minlen);

-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_free_space_cache(void);
-#endif

 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 17f3064b4a3e..021694c08181 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -42,6 +42,7 @@
 #include <linux/mount.h>
 #include <linux/btrfs.h>
 #include <linux/blkdev.h>
+#include <linux/posix_acl_xattr.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -57,6 +58,7 @@
 #include "free-space-cache.h"
 #include "inode-map.h"
 #include "backref.h"
+#include "hash.h"

 struct btrfs_iget_args {
 	u64 ino;
@@ -701,8 +703,12 @@ retry:
 		async_extent->nr_pages = 0;
 		async_extent->pages = NULL;

-		if (ret == -ENOSPC)
+		if (ret == -ENOSPC) {
+			unlock_extent(io_tree, async_extent->start,
+				      async_extent->start +
+				      async_extent->ram_size - 1);
 			goto retry;
+		}
 		goto out_free;
 	}

@@ -1529,6 +1535,46 @@ static void btrfs_merge_extent_hook(struct inode *inode,
 	spin_unlock(&BTRFS_I(inode)->lock);
 }

+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+				      struct inode *inode)
+{
+	spin_lock(&root->delalloc_lock);
+	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+			      &root->delalloc_inodes);
+		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			&BTRFS_I(inode)->runtime_flags);
+		root->nr_delalloc_inodes++;
+		if (root->nr_delalloc_inodes == 1) {
+			spin_lock(&root->fs_info->delalloc_root_lock);
+			BUG_ON(!list_empty(&root->delalloc_root));
+			list_add_tail(&root->delalloc_root,
+				      &root->fs_info->delalloc_roots);
+			spin_unlock(&root->fs_info->delalloc_root_lock);
+		}
+	}
+	spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+				     struct inode *inode)
+{
+	spin_lock(&root->delalloc_lock);
+	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			  &BTRFS_I(inode)->runtime_flags);
+		root->nr_delalloc_inodes--;
+		if (!root->nr_delalloc_inodes) {
+			spin_lock(&root->fs_info->delalloc_root_lock);
+			BUG_ON(list_empty(&root->delalloc_root));
+			list_del_init(&root->delalloc_root);
+			spin_unlock(&root->fs_info->delalloc_root_lock);
+		}
+	}
+	spin_unlock(&root->delalloc_lock);
+}
+
 /*
  * extent_io.c set_bit_hook, used to track delayed allocation
  * bytes in this file, and to maintain the list of inodes that
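
[Note] The two helpers above move delalloc-inode tracking from one fs-wide list onto per-root lists: most adds and removes contend only on root->delalloc_lock, and the fs-wide delalloc_root_lock is taken only on the 0 -> 1 and 1 -> 0 transitions of nr_delalloc_inodes, when the root itself joins or leaves fs_info->delalloc_roots. A distilled sketch of that two-level pattern in kernel-style C (generic names, assuming <linux/list.h> and <linux/spinlock.h>; this is not the btrfs API):

	struct global {
		spinlock_t lock;		/* guards groups */
		struct list_head groups;	/* groups that have members */
	};

	struct group {
		struct global *global;
		spinlock_t lock;		/* guards members, nr_members */
		struct list_head members;
		struct list_head node;		/* link on global->groups */
		int nr_members;
	};

	static void group_add(struct group *g, struct list_head *item)
	{
		spin_lock(&g->lock);
		if (list_empty(item)) {
			list_add_tail(item, &g->members);
			/* the global lock is touched only on the 0 -> 1 edge */
			if (++g->nr_members == 1) {
				spin_lock(&g->global->lock);
				list_add_tail(&g->node, &g->global->groups);
				spin_unlock(&g->global->lock);
			}
		}
		spin_unlock(&g->lock);
	}

The lock ordering (per-group lock outer, global lock inner) matches the helpers above, so a fs-wide walker must drop the global lock before diving into a root, which is exactly what the __start_delalloc_inodes rework later in this patch does.
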
@@ -1561,16 +1607,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
 		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->delalloc_bytes += len;
 		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-			 &BTRFS_I(inode)->runtime_flags)) {
-			spin_lock(&root->fs_info->delalloc_lock);
-			if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-				list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
-					      &root->fs_info->delalloc_inodes);
-				set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-					&BTRFS_I(inode)->runtime_flags);
-			}
-			spin_unlock(&root->fs_info->delalloc_lock);
-		}
+			 &BTRFS_I(inode)->runtime_flags))
+			btrfs_add_delalloc_inodes(root, inode);
 		spin_unlock(&BTRFS_I(inode)->lock);
 	}
 }
@@ -1604,7 +1642,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 		btrfs_delalloc_release_metadata(inode, len);

 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-		    && do_list)
+		    && do_list && !(state->state & EXTENT_NORESERVE))
 			btrfs_free_reserved_data_space(inode, len);

 		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
@@ -1613,15 +1651,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 		BTRFS_I(inode)->delalloc_bytes -= len;
 		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
 		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-			     &BTRFS_I(inode)->runtime_flags)) {
-			spin_lock(&root->fs_info->delalloc_lock);
-			if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-				list_del_init(&BTRFS_I(inode)->delalloc_inodes);
-				clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-					  &BTRFS_I(inode)->runtime_flags);
-			}
-			spin_unlock(&root->fs_info->delalloc_lock);
-		}
+			     &BTRFS_I(inode)->runtime_flags))
+			btrfs_del_delalloc_inode(root, inode);
 		spin_unlock(&BTRFS_I(inode)->lock);
 	}
 }
@@ -2135,16 +2166,23 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
 		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
 			continue;

-		extent_offset = btrfs_file_extent_offset(leaf, extent);
-		if (key.offset - extent_offset != offset)
+		/*
+		 * 'offset' refers to the exact key.offset,
+		 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
+		 * (key.offset - extent_offset).
+		 */
+		if (key.offset != offset)
 			continue;

+		extent_offset = btrfs_file_extent_offset(leaf, extent);
 		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
+
 		if (extent_offset >= old->extent_offset + old->offset +
 		    old->len || extent_offset + num_bytes <=
 		    old->extent_offset + old->offset)
 			continue;

+		ret = 0;
 		break;
 	}

@@ -2156,7 +2194,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,

 	backref->root_id = root_id;
 	backref->inum = inum;
-	backref->file_pos = offset + extent_offset;
+	backref->file_pos = offset;
 	backref->num_bytes = num_bytes;
 	backref->extent_offset = extent_offset;
 	backref->generation = btrfs_file_extent_generation(leaf, extent);
@@ -2179,7 +2217,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path,
 	new->path = path;

 	list_for_each_entry_safe(old, tmp, &new->head, list) {
-		ret = iterate_inodes_from_logical(old->bytenr, fs_info,
+		ret = iterate_inodes_from_logical(old->bytenr +
+						  old->extent_offset, fs_info,
 						  path, record_one_backref,
 						  old);
 		BUG_ON(ret < 0 && ret != -ENOENT);
@@ -2263,11 +2302,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
 			return 0;
 		return PTR_ERR(root);
 	}
-	if (btrfs_root_refs(&root->root_item) == 0) {
-		srcu_read_unlock(&fs_info->subvol_srcu, index);
-		/* parse ENOENT to 0 */
-		return 0;
-	}

 	/* step 2: get inode */
 	key.objectid = backref->inum;
@@ -3215,13 +3249,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 			/* 1 for the orphan item deletion. */
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
+				iput(inode);
 				ret = PTR_ERR(trans);
 				goto out;
 			}
 			ret = btrfs_orphan_add(trans, inode);
 			btrfs_end_transaction(trans, root);
-			if (ret)
+			if (ret) {
+				iput(inode);
 				goto out;
+			}

 			ret = btrfs_truncate(inode);
 			if (ret)
@@ -3274,8 +3311,17 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 {
 	u32 nritems = btrfs_header_nritems(leaf);
 	struct btrfs_key found_key;
+	static u64 xattr_access = 0;
+	static u64 xattr_default = 0;
 	int scanned = 0;

+	if (!xattr_access) {
+		xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
+					strlen(POSIX_ACL_XATTR_ACCESS));
+		xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
+					strlen(POSIX_ACL_XATTR_DEFAULT));
+	}
+
 	slot++;
 	while (slot < nritems) {
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -3285,8 +3331,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 			return 0;

 		/* we found an xattr, assume we've got an acl */
-		if (found_key.type == BTRFS_XATTR_ITEM_KEY)
-			return 1;
+		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+			if (found_key.offset == xattr_access ||
+			    found_key.offset == xattr_default)
+				return 1;
+		}

 		/*
 		 * we found a key greater than an xattr key, there can't
@@ -3660,53 +3709,20 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	}
 	return ret;
 }
-
-
-/* helper to check if there is any shared block in the path */
-static int check_path_shared(struct btrfs_root *root,
-			     struct btrfs_path *path)
-{
-	struct extent_buffer *eb;
-	int level;
-	u64 refs = 1;
-
-	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
-		int ret;
-
-		if (!path->nodes[level])
-			break;
-		eb = path->nodes[level];
-		if (!btrfs_block_can_be_shared(root, eb))
-			continue;
-		ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1,
-					       &refs, NULL);
-		if (refs > 1)
-			return 1;
-	}
-	return 0;
-}

 /*
  * helper to start transaction for unlink and rmdir.
  *
- * unlink and rmdir are special in btrfs, they do not always free space.
- * so in enospc case, we should make sure they will free space before
- * allowing them to use the global metadata reservation.
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
  */
-static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
-						       struct dentry *dentry)
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct btrfs_path *path;
-	struct btrfs_dir_item *di;
-	struct inode *inode = dentry->d_inode;
-	u64 index;
-	int check_link = 1;
-	int err = -ENOSPC;
 	int ret;
-	u64 ino = btrfs_ino(inode);
-	u64 dir_ino = btrfs_ino(dir);

 	/*
 	 * 1 for the possible orphan item
@@ -3719,158 +3735,23 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
 	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
 		return trans;

-	if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
-		return ERR_PTR(-ENOSPC);
-
-	/* check if there is someone else holds reference */
-	if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
-		return ERR_PTR(-ENOSPC);
-
-	if (atomic_read(&inode->i_count) > 2)
-		return ERR_PTR(-ENOSPC);
-
-	if (xchg(&root->fs_info->enospc_unlink, 1))
-		return ERR_PTR(-ENOSPC);
-
-	path = btrfs_alloc_path();
-	if (!path) {
-		root->fs_info->enospc_unlink = 0;
-		return ERR_PTR(-ENOMEM);
-	}
-
-	/* 1 for the orphan item */
-	trans = btrfs_start_transaction(root, 1);
-	if (IS_ERR(trans)) {
-		btrfs_free_path(path);
-		root->fs_info->enospc_unlink = 0;
-		return trans;
-	}
-
-	path->skip_locking = 1;
-	path->search_commit_root = 1;
-
-	ret = btrfs_lookup_inode(trans, root, path,
-				 &BTRFS_I(dir)->location, 0);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-	if (ret == 0) {
-		if (check_path_shared(root, path))
-			goto out;
-	} else {
-		check_link = 0;
-	}
-	btrfs_release_path(path);
-
-	ret = btrfs_lookup_inode(trans, root, path,
-				 &BTRFS_I(inode)->location, 0);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-	if (ret == 0) {
-		if (check_path_shared(root, path))
-			goto out;
-	} else {
-		check_link = 0;
-	}
-	btrfs_release_path(path);
-
-	if (ret == 0 && S_ISREG(inode->i_mode)) {
-		ret = btrfs_lookup_file_extent(trans, root, path,
-					       ino, (u64)-1, 0);
-		if (ret < 0) {
-			err = ret;
-			goto out;
+	if (PTR_ERR(trans) == -ENOSPC) {
+		u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
+
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans))
+			return trans;
+		ret = btrfs_cond_migrate_bytes(root->fs_info,
+					       &root->fs_info->trans_block_rsv,
+					       num_bytes, 5);
+		if (ret) {
+			btrfs_end_transaction(trans, root);
+			return ERR_PTR(ret);
 		}
-		BUG_ON(ret == 0); /* Corruption */
-		if (check_path_shared(root, path))
-			goto out;
-		btrfs_release_path(path);
-	}
-
-	if (!check_link) {
-		err = 0;
-		goto out;
-	}
-
-	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
-				   dentry->d_name.name, dentry->d_name.len, 0);
-	if (IS_ERR(di)) {
-		err = PTR_ERR(di);
-		goto out;
-	}
-	if (di) {
-		if (check_path_shared(root, path))
-			goto out;
-	} else {
-		err = 0;
-		goto out;
-	}
-	btrfs_release_path(path);
-
-	ret = btrfs_get_inode_ref_index(trans, root, path, dentry->d_name.name,
-					dentry->d_name.len, ino, dir_ino, 0,
-					&index);
-	if (ret) {
-		err = ret;
-		goto out;
-	}
-
-	if (check_path_shared(root, path))
-		goto out;
-
-	btrfs_release_path(path);
-
-	/*
-	 * This is a commit root search, if we can lookup inode item and other
-	 * relative items in the commit root, it means the transaction of
-	 * dir/file creation has been committed, and the dir index item that we
-	 * delay to insert has also been inserted into the commit root. So
-	 * we needn't worry about the delayed insertion of the dir index item
-	 * here.
-	 */
-	di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
-				   dentry->d_name.name, dentry->d_name.len, 0);
-	if (IS_ERR(di)) {
-		err = PTR_ERR(di);
-		goto out;
-	}
-	BUG_ON(ret == -ENOENT);
-	if (check_path_shared(root, path))
-		goto out;
-
-	err = 0;
-out:
-	btrfs_free_path(path);
-	/* Migrate the orphan reservation over */
-	if (!err)
-		err = btrfs_block_rsv_migrate(trans->block_rsv,
-				&root->fs_info->global_block_rsv,
-				trans->bytes_reserved);
-
-	if (err) {
-		btrfs_end_transaction(trans, root);
-		root->fs_info->enospc_unlink = 0;
-		return ERR_PTR(err);
-	}
-
-	trans->block_rsv = &root->fs_info->global_block_rsv;
-	return trans;
-}
-
-static void __unlink_end_trans(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root)
-{
-	if (trans->block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL) {
-		btrfs_block_rsv_release(root, trans->block_rsv,
-					trans->bytes_reserved);
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
-		BUG_ON(!root->fs_info->enospc_unlink);
-		root->fs_info->enospc_unlink = 0;
+		trans->bytes_reserved = num_bytes;
 	}
-	btrfs_end_transaction(trans, root);
+	return trans;
 }

 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3880,7 +3761,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int ret;

-	trans = __unlink_start_trans(dir, dentry);
+	trans = __unlink_start_trans(dir);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);

@@ -3898,7 +3779,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	}

 out:
-	__unlink_end_trans(trans, root);
+	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
 	return ret;
 }
@@ -3995,7 +3876,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
 		return -EPERM;

-	trans = __unlink_start_trans(dir, dentry);
+	trans = __unlink_start_trans(dir);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);

@@ -4017,7 +3898,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!err)
 		btrfs_i_size_write(inode, 0);
 out:
-	__unlink_end_trans(trans, root);
+	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);

 	return err;
@@ -4395,6 +4276,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 	u64 hole_size;
 	int err = 0;

+	/*
+	 * If our size started in the middle of a page we need to zero out the
+	 * rest of the page before we expand the i_size, otherwise we could
+	 * expose stale data.
+	 */
+	err = btrfs_truncate_page(inode, oldsize, 0, 0);
+	if (err)
+		return err;
+
 	if (size <= hole_start)
 		return 0;

@@ -4509,9 +4399,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 	int mask = attr->ia_valid;
 	int ret;

-	if (newsize == oldsize)
-		return 0;
-
 	/*
 	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
 	 * special case where we need to update the times despite not having
@@ -4822,11 +4709,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 		goto out;
 	}

-	if (btrfs_root_refs(&new_root->root_item) == 0) {
-		err = -ENOENT;
-		goto out;
-	}
-
 	*sub_root = new_root;
 	location->objectid = btrfs_root_dirid(&new_root->root_item);
 	location->type = BTRFS_INODE_ITEM_KEY;
@@ -5092,8 +4974,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		if (!(inode->i_sb->s_flags & MS_RDONLY))
 			ret = btrfs_orphan_cleanup(sub_root);
 		up_read(&root->fs_info->cleanup_work_sem);
-		if (ret)
+		if (ret) {
+			iput(inode);
 			inode = ERR_PTR(ret);
+		}
 	}

 	return inode;
@@ -5137,10 +5021,9 @@ unsigned char btrfs_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };

-static int btrfs_real_readdir(struct file *filp, void *dirent,
-			      filldir_t filldir)
+static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
@@ -5161,29 +5044,15 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	char tmp_name[32];
 	char *name_ptr;
 	int name_len;
-	int is_curr = 0;	/* filp->f_pos points to the current index? */
+	int is_curr = 0;	/* ctx->pos points to the current index? */

 	/* FIXME, use a real flag for deciding about the key type */
 	if (root->fs_info->tree_root == root)
 		key_type = BTRFS_DIR_ITEM_KEY;

-	/* special case for "." */
-	if (filp->f_pos == 0) {
-		over = filldir(dirent, ".", 1,
-			       filp->f_pos, btrfs_ino(inode), DT_DIR);
-		if (over)
-			return 0;
-		filp->f_pos = 1;
-	}
-	/* special case for .., just use the back ref */
-	if (filp->f_pos == 1) {
-		u64 pino = parent_ino(filp->f_path.dentry);
-		over = filldir(dirent, "..", 2,
-			       filp->f_pos, pino, DT_DIR);
-		if (over)
-			return 0;
-		filp->f_pos = 2;
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
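
[Note] This hunk is part of the kernel-wide 3.11 conversion from the old ->readdir/filldir callbacks to ->iterate with a struct dir_context: the hand-rolled "." and ".." cases collapse into dir_emit_dots(), and the position lives in ctx->pos rather than filp->f_pos. For reference, a minimal iterate method in the same style, over a hypothetical fixed name table instead of a real on-disk index (the table and inode numbers are placeholders):

	static int example_iterate(struct file *file, struct dir_context *ctx)
	{
		static const char * const names[] = { "alpha", "beta" };

		/* emits "." and ".." as needed and advances ctx->pos to 2 */
		if (!dir_emit_dots(file, ctx))
			return 0;

		while (ctx->pos - 2 < (loff_t)ARRAY_SIZE(names)) {
			const char *name = names[ctx->pos - 2];

			/* dir_emit() returns false once the user buffer is full */
			if (!dir_emit(ctx, name, strlen(name),
				      1000 + ctx->pos, DT_REG))
				return 0;
			ctx->pos++;
		}
		return 0;
	}

A real filesystem would of course report each entry's actual inode number and type, as the btrfs code below does with location.objectid and d_type.
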
@@ -5197,7 +5066,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	}

 	btrfs_set_key_type(&key, key_type);
-	key.offset = filp->f_pos;
+	key.offset = ctx->pos;
 	key.objectid = btrfs_ino(inode);

 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -5223,14 +5092,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			break;
 		if (btrfs_key_type(&found_key) != key_type)
 			break;
-		if (found_key.offset < filp->f_pos)
+		if (found_key.offset < ctx->pos)
 			goto next;
 		if (key_type == BTRFS_DIR_INDEX_KEY &&
 		    btrfs_should_delete_dir_index(&del_list,
 						  found_key.offset))
 			goto next;

-		filp->f_pos = found_key.offset;
+		ctx->pos = found_key.offset;
 		is_curr = 1;

 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
@@ -5274,9 +5143,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 				over = 0;
 				goto skip;
 			}
-			over = filldir(dirent, name_ptr, name_len,
-				       found_key.offset, location.objectid,
-				       d_type);
+			over = !dir_emit(ctx, name_ptr, name_len,
+					 location.objectid, d_type);

 skip:
 			if (name_ptr != tmp_name)
@@ -5295,22 +5163,38 @@ next:

 	if (key_type == BTRFS_DIR_INDEX_KEY) {
 		if (is_curr)
-			filp->f_pos++;
-		ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
-						      &ins_list);
+			ctx->pos++;
+		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
 		if (ret)
 			goto nopos;
 	}

 	/* Reached end of directory/root. Bump pos past the last item. */
-	if (key_type == BTRFS_DIR_INDEX_KEY)
-		/*
-		 * 32-bit glibc will use getdents64, but then strtol -
-		 * so the last number we can serve is this.
-		 */
-		filp->f_pos = 0x7fffffff;
-	else
-		filp->f_pos++;
+	ctx->pos++;
+
+	/*
+	 * Stop new entries from being returned after we return the last
+	 * entry.
+	 *
+	 * New directory entries are assigned a strictly increasing
+	 * offset. This means that new entries created during readdir
+	 * are *guaranteed* to be seen in the future by that readdir.
+	 * This has broken buggy programs which operate on names as
+	 * they're returned by readdir. Until we re-use freed offsets
+	 * we have this hack to stop new entries from being returned
+	 * under the assumption that they'll never reach this huge
+	 * offset.
+	 *
+	 * This is being careful not to overflow 32bit loff_t unless the
+	 * last entry requires it because doing so has broken 32bit apps
+	 * in the past.
+	 */
+	if (key_type == BTRFS_DIR_INDEX_KEY) {
+		if (ctx->pos >= INT_MAX)
+			ctx->pos = LLONG_MAX;
+		else
+			ctx->pos = INT_MAX;
+	}
 nopos:
 	ret = 0;
 err:
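
[Note] The long comment added above justifies parking ctx->pos at a sentinel once the listing is exhausted. Reduced to a standalone helper, the rule is a two-level clamp: stay at INT_MAX so 32-bit getdents users keep working, and spill into the full loff_t range only when an entry at or beyond INT_MAX has genuinely been handed out. A sketch of just that logic (not a btrfs function; assumes the usual INT_MAX/LLONG_MAX limits):

	/*
	 * Park the directory position past the final entry.  Prefer a
	 * 32-bit-safe sentinel; only go to LLONG_MAX once INT_MAX has
	 * actually been used as a real entry offset.
	 */
	static loff_t readdir_end_pos(loff_t pos)
	{
		return pos >= INT_MAX ? LLONG_MAX : INT_MAX;
	}
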
@@ -6518,10 +6402,10 @@ out:
  * returns 1 when the nocow is safe, < 1 on error, 0 if the
  * block must be cow'd
  */
-static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
-				      struct inode *inode, u64 offset, u64 *len,
-				      u64 *orig_start, u64 *orig_block_len,
-				      u64 *ram_bytes)
+noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
+			      struct inode *inode, u64 offset, u64 *len,
+			      u64 *orig_start, u64 *orig_block_len,
+			      u64 *ram_bytes)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -6535,7 +6419,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
 	u64 num_bytes;
 	int slot;
 	int found_type;
-
+	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -6575,18 +6459,28 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
 		/* not a regular extent, must cow */
 		goto out;
 	}
+
+	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+		goto out;
+
 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	if (disk_bytenr == 0)
+		goto out;
+
+	if (btrfs_file_extent_compression(leaf, fi) ||
+	    btrfs_file_extent_encryption(leaf, fi) ||
+	    btrfs_file_extent_other_encoding(leaf, fi))
+		goto out;
+
 	backref_offset = btrfs_file_extent_offset(leaf, fi);

-	*orig_start = key.offset - backref_offset;
-	*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
-	*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+	if (orig_start) {
+		*orig_start = key.offset - backref_offset;
+		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+	}

 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
-	if (extent_end < offset + *len) {
-		/* extent doesn't include our full range, must cow */
-		goto out;
-	}

 	if (btrfs_extent_readonly(root, disk_bytenr))
 		goto out;
@@ -6830,8 +6724,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	if (IS_ERR(trans))
 		goto must_cow;

-	if (can_nocow_odirect(trans, inode, start, &len, &orig_start,
-			      &orig_block_len, &ram_bytes) == 1) {
+	if (can_nocow_extent(trans, inode, start, &len, &orig_start,
+			     &orig_block_len, &ram_bytes) == 1) {
 		if (type == BTRFS_ORDERED_PREALLOC) {
 			free_extent_map(em);
 			em = create_pinned_em(inode, start, len,
@@ -7260,7 +7154,6 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_dio_private *dip;
-	struct bio_vec *bvec = dio_bio->bi_io_vec;
 	struct bio *io_bio;
 	int skip_sum;
 	int write = rw & REQ_WRITE;
@@ -7282,16 +7175,9 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 	}

 	dip->private = dio_bio->bi_private;
-	io_bio->bi_private = dio_bio->bi_private;
 	dip->inode = inode;
 	dip->logical_offset = file_offset;
-
-	dip->bytes = 0;
-	do {
-		dip->bytes += bvec->bv_len;
-		bvec++;
-	} while (bvec <= (dio_bio->bi_io_vec + dio_bio->bi_vcnt - 1));
-
+	dip->bytes = dio_bio->bi_size;
 	dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
 	io_bio->bi_private = dip;
 	dip->errors = 0;
@@ -7390,8 +7276,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	atomic_inc(&inode->i_dio_count);
 	smp_mb__after_atomic_inc();

+	/*
+	 * The generic stuff only does filemap_write_and_wait_range, which isn't
+	 * enough if we've written compressed pages to this area, so we need to
+	 * call btrfs_wait_ordered_range to make absolutely sure that any
+	 * outstanding dirty pages are on disk.
+	 */
+	count = iov_length(iov, nr_segs);
+	btrfs_wait_ordered_range(inode, offset, count);
+
 	if (rw & WRITE) {
-		count = iov_length(iov, nr_segs);
 		/*
 		 * If the write DIO is beyond the EOF, we need update
 		 * the isize, but it is protected by i_mutex. So we can
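
[Note] The comment added in this hunk spells out why btrfs cannot rely on the generic direct-I/O preparation alone: filemap_write_and_wait_range() waits for page writeback, but compressed data is only safely on disk once its ordered extent has completed. Condensed into one hypothetical helper (the function name and shape are ours for illustration; only the two calls are real kernel/btrfs APIs of this era):

	/* sketch: make [off, off + len) truly stable before direct I/O */
	static void flush_range_for_dio(struct inode *inode, u64 off, u64 len)
	{
		/* pushes dirty pages and waits for ordinary writeback... */
		filemap_write_and_wait_range(inode->i_mapping, off,
					     off + len - 1);
		/* ...but compressed extents are durable only once their
		 * ordered extent finishes, so wait for those as well */
		btrfs_wait_ordered_range(inode, off, len);
	}
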
@@ -7510,7 +7404,8 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
 }

-static void btrfs_invalidatepage(struct page *page, unsigned long offset)
+static void btrfs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *tree;
@@ -7710,16 +7605,12 @@ static int btrfs_truncate(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv;
-	int ret;
+	int ret = 0;
 	int err = 0;
 	struct btrfs_trans_handle *trans;
 	u64 mask = root->sectorsize - 1;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);

-	ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
-	if (ret)
-		return ret;
-
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);

@@ -7977,9 +7868,9 @@ void btrfs_destroy_inode(struct inode *inode)
 	 */
 	smp_mb();
 	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
-		spin_lock(&root->fs_info->ordered_extent_lock);
+		spin_lock(&root->fs_info->ordered_root_lock);
 		list_del_init(&BTRFS_I(inode)->ordered_operations);
-		spin_unlock(&root->fs_info->ordered_extent_lock);
+		spin_unlock(&root->fs_info->ordered_root_lock);
 	}

 	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
@@ -8349,7 +8240,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode;
@@ -8358,30 +8249,23 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 	struct list_head splice;
 	int ret = 0;

-	if (root->fs_info->sb->s_flags & MS_RDONLY)
-		return -EROFS;
-
 	INIT_LIST_HEAD(&works);
 	INIT_LIST_HEAD(&splice);

-	spin_lock(&root->fs_info->delalloc_lock);
-	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+	spin_lock(&root->delalloc_lock);
+	list_splice_init(&root->delalloc_inodes, &splice);
 	while (!list_empty(&splice)) {
 		binode = list_entry(splice.next, struct btrfs_inode,
 				    delalloc_inodes);

-		list_del_init(&binode->delalloc_inodes);
-
+		list_move_tail(&binode->delalloc_inodes,
+			       &root->delalloc_inodes);
 		inode = igrab(&binode->vfs_inode);
 		if (!inode) {
-			clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
-				  &binode->runtime_flags);
+			cond_resched_lock(&root->delalloc_lock);
 			continue;
 		}
-
-		list_add_tail(&binode->delalloc_inodes,
-			      &root->fs_info->delalloc_inodes);
-		spin_unlock(&root->fs_info->delalloc_lock);
+		spin_unlock(&root->delalloc_lock);

 		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
 		if (unlikely(!work)) {
@@ -8393,16 +8277,39 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 				   &work->work);

 		cond_resched();
-		spin_lock(&root->fs_info->delalloc_lock);
+		spin_lock(&root->delalloc_lock);
 	}
-	spin_unlock(&root->fs_info->delalloc_lock);
+	spin_unlock(&root->delalloc_lock);

 	list_for_each_entry_safe(work, next, &works, list) {
 		list_del_init(&work->list);
 		btrfs_wait_and_free_delalloc_work(work);
 	}
+	return 0;
+out:
+	list_for_each_entry_safe(work, next, &works, list) {
+		list_del_init(&work->list);
+		btrfs_wait_and_free_delalloc_work(work);
+	}
+
+	if (!list_empty_careful(&splice)) {
+		spin_lock(&root->delalloc_lock);
+		list_splice_tail(&splice, &root->delalloc_inodes);
+		spin_unlock(&root->delalloc_lock);
+	}
+	return ret;
+}
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+	int ret;
+
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;

-	/* the filemap_flush will queue IO into the worker threads, but
+	ret = __start_delalloc_inodes(root, delay_iput);
+	/*
+	 * the filemap_flush will queue IO into the worker threads, but
 	 * we have to make sure the IO is actually started and that
 	 * ordered extents get created before we return
 	 */
@@ -8414,17 +8321,55 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 						    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 	}
 	atomic_dec(&root->fs_info->async_submit_draining);
-	return 0;
-out:
-	list_for_each_entry_safe(work, next, &works, list) {
-		list_del_init(&work->list);
-		btrfs_wait_and_free_delalloc_work(work);
+	return ret;
+}
+
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+				    int delay_iput)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
+	int ret;
+
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&fs_info->delalloc_root_lock);
+	list_splice_init(&fs_info->delalloc_roots, &splice);
+	while (!list_empty(&splice)) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					delalloc_root);
+		root = btrfs_grab_fs_root(root);
+		BUG_ON(!root);
+		list_move_tail(&root->delalloc_root,
+			       &fs_info->delalloc_roots);
+		spin_unlock(&fs_info->delalloc_root_lock);
+
+		ret = __start_delalloc_inodes(root, delay_iput);
+		btrfs_put_fs_root(root);
+		if (ret)
+			goto out;
+
+		spin_lock(&fs_info->delalloc_root_lock);
 	}
+	spin_unlock(&fs_info->delalloc_root_lock);

+	atomic_inc(&fs_info->async_submit_draining);
+	while (atomic_read(&fs_info->nr_async_submits) ||
+	       atomic_read(&fs_info->async_delalloc_pages)) {
+		wait_event(fs_info->async_submit_wait,
+			   (atomic_read(&fs_info->nr_async_submits) == 0 &&
+			    atomic_read(&fs_info->async_delalloc_pages) == 0));
+	}
+	atomic_dec(&fs_info->async_submit_draining);
+	return 0;
+out:
 	if (!list_empty_careful(&splice)) {
-		spin_lock(&root->fs_info->delalloc_lock);
-		list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
-		spin_unlock(&root->fs_info->delalloc_lock);
+		spin_lock(&fs_info->delalloc_root_lock);
+		list_splice_tail(&splice, &fs_info->delalloc_roots);
+		spin_unlock(&fs_info->delalloc_root_lock);
 	}
 	return ret;
 }
@@ -8731,7 +8676,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
 static const struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= btrfs_real_readdir,
+	.iterate	= btrfs_real_readdir,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0f81d67cdc8d..238a05545ee2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -555,6 +555,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	if (!root->ref_cows)
 		return -EINVAL;

+	ret = btrfs_start_delalloc_inodes(root, 0);
+	if (ret)
+		return ret;
+
+	btrfs_wait_ordered_extents(root, 0);
+
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
 	if (!pending_snapshot)
 		return -ENOMEM;
@@ -2354,14 +2360,6 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	if (ret)
 		return ret;

-	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
-			1)) {
-		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-		mnt_drop_write_file(file);
-		return -EINVAL;
-	}
-
-	mutex_lock(&root->fs_info->volume_mutex);
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args)) {
 		ret = PTR_ERR(vol_args);
@@ -2369,12 +2367,20 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	}

 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
-	ret = btrfs_rm_device(root, vol_args->name);

-	kfree(vol_args);
-out:
+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+			1)) {
+		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+		goto out;
+	}
+
+	mutex_lock(&root->fs_info->volume_mutex);
+	ret = btrfs_rm_device(root, vol_args->name);
 	mutex_unlock(&root->fs_info->volume_mutex);
 	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+
+out:
+	kfree(vol_args);
 	mnt_drop_write_file(file);
 	return ret;
 }
@@ -2480,6 +2486,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	int ret;
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
+	int same_inode = 0;

 	/*
 	 * TODO:
@@ -2516,7 +2523,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,

 	ret = -EINVAL;
 	if (src == inode)
-		goto out_fput;
+		same_inode = 1;

 	/* the src must be open for reading */
 	if (!(src_file.file->f_mode & FMODE_READ))
@@ -2547,12 +2554,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	}
 	path->reada = 2;

-	if (inode < src) {
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+	if (!same_inode) {
+		if (inode < src) {
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+			mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+		} else {
+			mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		}
 	} else {
-		mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		mutex_lock(&src->i_mutex);
 	}

 	/* determine range to clone */
@@ -2570,6 +2581,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2570 !IS_ALIGNED(destoff, bs)) 2581 !IS_ALIGNED(destoff, bs))
2571 goto out_unlock; 2582 goto out_unlock;
2572 2583
2584 /* check whether the ranges overlap within the same file */
2585 if (same_inode) {
2586 if (destoff + len > off && destoff < off + len)
2587 goto out_unlock;
2588 }
2589
2573 if (destoff > inode->i_size) { 2590 if (destoff > inode->i_size) {
2574 ret = btrfs_cont_expand(inode, inode->i_size, destoff); 2591 ret = btrfs_cont_expand(inode, inode->i_size, destoff);
2575 if (ret) 2592 if (ret)
@@ -2846,7 +2863,8 @@ out:
2846 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 2863 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
2847out_unlock: 2864out_unlock:
2848 mutex_unlock(&src->i_mutex); 2865 mutex_unlock(&src->i_mutex);
2849 mutex_unlock(&inode->i_mutex); 2866 if (!same_inode)
2867 mutex_unlock(&inode->i_mutex);
2850 vfree(buf); 2868 vfree(buf);
2851 btrfs_free_path(path); 2869 btrfs_free_path(path);
2852out_fput: 2870out_fput:
@@ -2951,11 +2969,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2951 goto out; 2969 goto out;
2952 } 2970 }
2953 2971
2954 if (btrfs_root_refs(&new_root->root_item) == 0) {
2955 ret = -ENOENT;
2956 goto out;
2957 }
2958
2959 path = btrfs_alloc_path(); 2972 path = btrfs_alloc_path();
2960 if (!path) { 2973 if (!path) {
2961 ret = -ENOMEM; 2974 ret = -ENOMEM;
@@ -3719,9 +3732,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3719 break; 3732 break;
3720 } 3733 }
3721 3734
3722 if (copy_to_user(arg, sa, sizeof(*sa)))
3723 ret = -EFAULT;
3724
3725 err = btrfs_commit_transaction(trans, root->fs_info->tree_root); 3735 err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
3726 if (err && !ret) 3736 if (err && !ret)
3727 ret = err; 3737 ret = err;
@@ -3881,7 +3891,7 @@ drop_write:
3881 3891
3882static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) 3892static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
3883{ 3893{
3884 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3894 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3885 struct btrfs_ioctl_quota_rescan_args *qsa; 3895 struct btrfs_ioctl_quota_rescan_args *qsa;
3886 int ret; 3896 int ret;
3887 3897
@@ -3914,7 +3924,7 @@ drop_write:
3914 3924
3915static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) 3925static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
3916{ 3926{
3917 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 3927 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3918 struct btrfs_ioctl_quota_rescan_args *qsa; 3928 struct btrfs_ioctl_quota_rescan_args *qsa;
3919 int ret = 0; 3929 int ret = 0;
3920 3930
@@ -3937,6 +3947,16 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
3937 return ret; 3947 return ret;
3938} 3948}
3939 3949
3950static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
3951{
3952 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3953
3954 if (!capable(CAP_SYS_ADMIN))
3955 return -EPERM;
3956
3957 return btrfs_qgroup_wait_for_completion(root->fs_info);
3958}
3959
3940static long btrfs_ioctl_set_received_subvol(struct file *file, 3960static long btrfs_ioctl_set_received_subvol(struct file *file,
3941 void __user *arg) 3961 void __user *arg)
3942{ 3962{
@@ -4020,7 +4040,7 @@ out:
4020 4040
4021static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) 4041static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
4022{ 4042{
4023 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 4043 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
4024 const char *label = root->fs_info->super_copy->label; 4044 const char *label = root->fs_info->super_copy->label;
4025 size_t len = strnlen(label, BTRFS_LABEL_SIZE); 4045 size_t len = strnlen(label, BTRFS_LABEL_SIZE);
4026 int ret; 4046 int ret;
@@ -4039,7 +4059,7 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
4039 4059
4040static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) 4060static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
4041{ 4061{
4042 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 4062 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
4043 struct btrfs_super_block *super_block = root->fs_info->super_copy; 4063 struct btrfs_super_block *super_block = root->fs_info->super_copy;
4044 struct btrfs_trans_handle *trans; 4064 struct btrfs_trans_handle *trans;
4045 char label[BTRFS_LABEL_SIZE]; 4065 char label[BTRFS_LABEL_SIZE];
@@ -4179,6 +4199,8 @@ long btrfs_ioctl(struct file *file, unsigned int
4179 return btrfs_ioctl_quota_rescan(file, argp); 4199 return btrfs_ioctl_quota_rescan(file, argp);
4180 case BTRFS_IOC_QUOTA_RESCAN_STATUS: 4200 case BTRFS_IOC_QUOTA_RESCAN_STATUS:
4181 return btrfs_ioctl_quota_rescan_status(file, argp); 4201 return btrfs_ioctl_quota_rescan_status(file, argp);
4202 case BTRFS_IOC_QUOTA_RESCAN_WAIT:
4203 return btrfs_ioctl_quota_rescan_wait(file, argp);
4182 case BTRFS_IOC_DEV_REPLACE: 4204 case BTRFS_IOC_DEV_REPLACE:
4183 return btrfs_ioctl_dev_replace(root, argp); 4205 return btrfs_ioctl_dev_replace(root, argp);
4184 case BTRFS_IOC_GET_FSLABEL: 4206 case BTRFS_IOC_GET_FSLABEL:
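
One detail in the clone hunks above is worth pulling out: cloning within a single file is now permitted, but only for non-overlapping ranges. With half-open intervals [off, off+len) and [destoff, destoff+len), the ranges overlap exactly when each starts before the other ends, which is the test added at new lines 2586-2587. A standalone rendering of that predicate (the function name is ours, not btrfs's):

    #include <stdbool.h>
    #include <stdint.h>

    /* half-open intervals overlap iff each starts before the other
     * ends; this mirrors the same-inode check btrfs_ioctl_clone adds */
    static bool clone_ranges_overlap(uint64_t off, uint64_t destoff,
                                     uint64_t len)
    {
            return destoff + len > off && destoff < off + len;
    }

The same series of hunks also drops the two-lock ordering dance for the same-inode case, taking src->i_mutex exactly once and skipping the second unlock on the way out.
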
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 743b86fa4fcb..f93151a98886 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -31,8 +31,8 @@
31 31
32struct workspace { 32struct workspace {
33 void *mem; 33 void *mem;
34 void *buf; /* where compressed data goes */ 34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where decompressed data goes */ 35 void *cbuf; /* where compressed data goes */
36 struct list_head list; 36 struct list_head list;
37}; 37};
38 38
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1ddd728541ee..81369827e514 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -24,6 +24,7 @@
24#include "transaction.h" 24#include "transaction.h"
25#include "btrfs_inode.h" 25#include "btrfs_inode.h"
26#include "extent_io.h" 26#include "extent_io.h"
27#include "disk-io.h"
27 28
28static struct kmem_cache *btrfs_ordered_extent_cache; 29static struct kmem_cache *btrfs_ordered_extent_cache;
29 30
@@ -184,6 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
184 u64 start, u64 len, u64 disk_len, 185 u64 start, u64 len, u64 disk_len,
185 int type, int dio, int compress_type) 186 int type, int dio, int compress_type)
186{ 187{
188 struct btrfs_root *root = BTRFS_I(inode)->root;
187 struct btrfs_ordered_inode_tree *tree; 189 struct btrfs_ordered_inode_tree *tree;
188 struct rb_node *node; 190 struct rb_node *node;
189 struct btrfs_ordered_extent *entry; 191 struct btrfs_ordered_extent *entry;
@@ -227,10 +229,18 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
227 ordered_data_tree_panic(inode, -EEXIST, file_offset); 229 ordered_data_tree_panic(inode, -EEXIST, file_offset);
228 spin_unlock_irq(&tree->lock); 230 spin_unlock_irq(&tree->lock);
229 231
230 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 232 spin_lock(&root->ordered_extent_lock);
231 list_add_tail(&entry->root_extent_list, 233 list_add_tail(&entry->root_extent_list,
232 &BTRFS_I(inode)->root->fs_info->ordered_extents); 234 &root->ordered_extents);
233 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 235 root->nr_ordered_extents++;
236 if (root->nr_ordered_extents == 1) {
237 spin_lock(&root->fs_info->ordered_root_lock);
238 BUG_ON(!list_empty(&root->ordered_root));
239 list_add_tail(&root->ordered_root,
240 &root->fs_info->ordered_roots);
241 spin_unlock(&root->fs_info->ordered_root_lock);
242 }
243 spin_unlock(&root->ordered_extent_lock);
234 244
235 return 0; 245 return 0;
236} 246}
@@ -516,8 +526,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
516 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 526 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
517 spin_unlock_irq(&tree->lock); 527 spin_unlock_irq(&tree->lock);
518 528
519 spin_lock(&root->fs_info->ordered_extent_lock); 529 spin_lock(&root->ordered_extent_lock);
520 list_del_init(&entry->root_extent_list); 530 list_del_init(&entry->root_extent_list);
531 root->nr_ordered_extents--;
521 532
522 trace_btrfs_ordered_extent_remove(inode, entry); 533 trace_btrfs_ordered_extent_remove(inode, entry);
523 534
@@ -530,7 +541,14 @@ void btrfs_remove_ordered_extent(struct inode *inode,
530 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 541 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
531 list_del_init(&BTRFS_I(inode)->ordered_operations); 542 list_del_init(&BTRFS_I(inode)->ordered_operations);
532 } 543 }
533 spin_unlock(&root->fs_info->ordered_extent_lock); 544
545 if (!root->nr_ordered_extents) {
546 spin_lock(&root->fs_info->ordered_root_lock);
547 BUG_ON(list_empty(&root->ordered_root));
548 list_del_init(&root->ordered_root);
549 spin_unlock(&root->fs_info->ordered_root_lock);
550 }
551 spin_unlock(&root->ordered_extent_lock);
534 wake_up(&entry->wait); 552 wake_up(&entry->wait);
535} 553}
536 554
@@ -550,7 +568,6 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
550void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 568void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
551{ 569{
552 struct list_head splice, works; 570 struct list_head splice, works;
553 struct list_head *cur;
554 struct btrfs_ordered_extent *ordered, *next; 571 struct btrfs_ordered_extent *ordered, *next;
555 struct inode *inode; 572 struct inode *inode;
556 573
@@ -558,35 +575,34 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
558 INIT_LIST_HEAD(&works); 575 INIT_LIST_HEAD(&works);
559 576
560 mutex_lock(&root->fs_info->ordered_operations_mutex); 577 mutex_lock(&root->fs_info->ordered_operations_mutex);
561 spin_lock(&root->fs_info->ordered_extent_lock); 578 spin_lock(&root->ordered_extent_lock);
562 list_splice_init(&root->fs_info->ordered_extents, &splice); 579 list_splice_init(&root->ordered_extents, &splice);
563 while (!list_empty(&splice)) { 580 while (!list_empty(&splice)) {
564 cur = splice.next; 581 ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
565 ordered = list_entry(cur, struct btrfs_ordered_extent, 582 root_extent_list);
566 root_extent_list); 583 list_move_tail(&ordered->root_extent_list,
567 list_del_init(&ordered->root_extent_list); 584 &root->ordered_extents);
568 atomic_inc(&ordered->refs);
569
570 /* 585 /*
571 * the inode may be getting freed (in sys_unlink path). 586 * the inode may be getting freed (in sys_unlink path).
572 */ 587 */
573 inode = igrab(ordered->inode); 588 inode = igrab(ordered->inode);
589 if (!inode) {
590 cond_resched_lock(&root->ordered_extent_lock);
591 continue;
592 }
574 593
575 spin_unlock(&root->fs_info->ordered_extent_lock); 594 atomic_inc(&ordered->refs);
595 spin_unlock(&root->ordered_extent_lock);
576 596
577 if (inode) { 597 ordered->flush_work.func = btrfs_run_ordered_extent_work;
578 ordered->flush_work.func = btrfs_run_ordered_extent_work; 598 list_add_tail(&ordered->work_list, &works);
579 list_add_tail(&ordered->work_list, &works); 599 btrfs_queue_worker(&root->fs_info->flush_workers,
580 btrfs_queue_worker(&root->fs_info->flush_workers, 600 &ordered->flush_work);
581 &ordered->flush_work);
582 } else {
583 btrfs_put_ordered_extent(ordered);
584 }
585 601
586 cond_resched(); 602 cond_resched();
587 spin_lock(&root->fs_info->ordered_extent_lock); 603 spin_lock(&root->ordered_extent_lock);
588 } 604 }
589 spin_unlock(&root->fs_info->ordered_extent_lock); 605 spin_unlock(&root->ordered_extent_lock);
590 606
591 list_for_each_entry_safe(ordered, next, &works, work_list) { 607 list_for_each_entry_safe(ordered, next, &works, work_list) {
592 list_del_init(&ordered->work_list); 608 list_del_init(&ordered->work_list);
@@ -604,6 +620,33 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
604 mutex_unlock(&root->fs_info->ordered_operations_mutex); 620 mutex_unlock(&root->fs_info->ordered_operations_mutex);
605} 621}
606 622
623void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
624 int delay_iput)
625{
626 struct btrfs_root *root;
627 struct list_head splice;
628
629 INIT_LIST_HEAD(&splice);
630
631 spin_lock(&fs_info->ordered_root_lock);
632 list_splice_init(&fs_info->ordered_roots, &splice);
633 while (!list_empty(&splice)) {
634 root = list_first_entry(&splice, struct btrfs_root,
635 ordered_root);
636 root = btrfs_grab_fs_root(root);
637 BUG_ON(!root);
638 list_move_tail(&root->ordered_root,
639 &fs_info->ordered_roots);
640 spin_unlock(&fs_info->ordered_root_lock);
641
642 btrfs_wait_ordered_extents(root, delay_iput);
643 btrfs_put_fs_root(root);
644
645 spin_lock(&fs_info->ordered_root_lock);
646 }
647 spin_unlock(&fs_info->ordered_root_lock);
648}
649
607/* 650/*
608 * this is used during transaction commit to write all the inodes 651 * this is used during transaction commit to write all the inodes
609 * added to the ordered operation list. These files must be fully on 652 * added to the ordered operation list. These files must be fully on
@@ -629,7 +672,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
629 INIT_LIST_HEAD(&works); 672 INIT_LIST_HEAD(&works);
630 673
631 mutex_lock(&root->fs_info->ordered_operations_mutex); 674 mutex_lock(&root->fs_info->ordered_operations_mutex);
632 spin_lock(&root->fs_info->ordered_extent_lock); 675 spin_lock(&root->fs_info->ordered_root_lock);
633 list_splice_init(&cur_trans->ordered_operations, &splice); 676 list_splice_init(&cur_trans->ordered_operations, &splice);
634 while (!list_empty(&splice)) { 677 while (!list_empty(&splice)) {
635 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 678 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
@@ -648,17 +691,17 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
648 if (!wait) 691 if (!wait)
649 list_add_tail(&BTRFS_I(inode)->ordered_operations, 692 list_add_tail(&BTRFS_I(inode)->ordered_operations,
650 &cur_trans->ordered_operations); 693 &cur_trans->ordered_operations);
651 spin_unlock(&root->fs_info->ordered_extent_lock); 694 spin_unlock(&root->fs_info->ordered_root_lock);
652 695
653 work = btrfs_alloc_delalloc_work(inode, wait, 1); 696 work = btrfs_alloc_delalloc_work(inode, wait, 1);
654 if (!work) { 697 if (!work) {
655 spin_lock(&root->fs_info->ordered_extent_lock); 698 spin_lock(&root->fs_info->ordered_root_lock);
656 if (list_empty(&BTRFS_I(inode)->ordered_operations)) 699 if (list_empty(&BTRFS_I(inode)->ordered_operations))
657 list_add_tail(&btrfs_inode->ordered_operations, 700 list_add_tail(&btrfs_inode->ordered_operations,
658 &splice); 701 &splice);
659 list_splice_tail(&splice, 702 list_splice_tail(&splice,
660 &cur_trans->ordered_operations); 703 &cur_trans->ordered_operations);
661 spin_unlock(&root->fs_info->ordered_extent_lock); 704 spin_unlock(&root->fs_info->ordered_root_lock);
662 ret = -ENOMEM; 705 ret = -ENOMEM;
663 goto out; 706 goto out;
664 } 707 }
@@ -667,9 +710,9 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
667 &work->work); 710 &work->work);
668 711
669 cond_resched(); 712 cond_resched();
670 spin_lock(&root->fs_info->ordered_extent_lock); 713 spin_lock(&root->fs_info->ordered_root_lock);
671 } 714 }
672 spin_unlock(&root->fs_info->ordered_extent_lock); 715 spin_unlock(&root->fs_info->ordered_root_lock);
673out: 716out:
674 list_for_each_entry_safe(work, next, &works, list) { 717 list_for_each_entry_safe(work, next, &works, list) {
675 list_del_init(&work->list); 718 list_del_init(&work->list);
@@ -989,7 +1032,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
989 u32 *sum, int len) 1032 u32 *sum, int len)
990{ 1033{
991 struct btrfs_ordered_sum *ordered_sum; 1034 struct btrfs_ordered_sum *ordered_sum;
992 struct btrfs_sector_sum *sector_sums;
993 struct btrfs_ordered_extent *ordered; 1035 struct btrfs_ordered_extent *ordered;
994 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; 1036 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
995 unsigned long num_sectors; 1037 unsigned long num_sectors;
@@ -1007,18 +1049,16 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
1007 disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { 1049 disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
1008 i = (disk_bytenr - ordered_sum->bytenr) >> 1050 i = (disk_bytenr - ordered_sum->bytenr) >>
1009 inode->i_sb->s_blocksize_bits; 1051 inode->i_sb->s_blocksize_bits;
1010 sector_sums = ordered_sum->sums + i;
1011 num_sectors = ordered_sum->len >> 1052 num_sectors = ordered_sum->len >>
1012 inode->i_sb->s_blocksize_bits; 1053 inode->i_sb->s_blocksize_bits;
1013 for (; i < num_sectors; i++) { 1054 num_sectors = min_t(int, len - index, num_sectors - i);
1014 if (sector_sums[i].bytenr == disk_bytenr) { 1055 memcpy(sum + index, ordered_sum->sums + i,
1015 sum[index] = sector_sums[i].sum; 1056 num_sectors);
1016 index++; 1057
1017 if (index == len) 1058 index += (int)num_sectors;
1018 goto out; 1059 if (index == len)
1019 disk_bytenr += sectorsize; 1060 goto out;
1020 } 1061 disk_bytenr += num_sectors * sectorsize;
1021 }
1022 } 1062 }
1023 } 1063 }
1024out: 1064out:
@@ -1055,12 +1095,12 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
1055 if (last_mod < root->fs_info->last_trans_committed) 1095 if (last_mod < root->fs_info->last_trans_committed)
1056 return; 1096 return;
1057 1097
1058 spin_lock(&root->fs_info->ordered_extent_lock); 1098 spin_lock(&root->fs_info->ordered_root_lock);
1059 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 1099 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
1060 list_add_tail(&BTRFS_I(inode)->ordered_operations, 1100 list_add_tail(&BTRFS_I(inode)->ordered_operations,
1061 &cur_trans->ordered_operations); 1101 &cur_trans->ordered_operations);
1062 } 1102 }
1063 spin_unlock(&root->fs_info->ordered_extent_lock); 1103 spin_unlock(&root->fs_info->ordered_root_lock);
1064} 1104}
1065 1105
1066int __init ordered_data_init(void) 1106int __init ordered_data_init(void)
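
The theme running through the ordered-data.c changes is moving ordered-extent bookkeeping from one fs_info-global list to per-root lists, with fs_info->ordered_roots tracking only the roots that currently have work pending. Condensing the add and remove paths above into a skeleton (locking as in the diff, BUG_ON checks and surrounding code elided):

    /* first ordered extent on a root publishes the root globally */
    spin_lock(&root->ordered_extent_lock);
    list_add_tail(&entry->root_extent_list, &root->ordered_extents);
    if (++root->nr_ordered_extents == 1) {
            spin_lock(&root->fs_info->ordered_root_lock);
            list_add_tail(&root->ordered_root,
                          &root->fs_info->ordered_roots);
            spin_unlock(&root->fs_info->ordered_root_lock);
    }
    spin_unlock(&root->ordered_extent_lock);

    /* removing the last one takes the root back off the global list */
    spin_lock(&root->ordered_extent_lock);
    list_del_init(&entry->root_extent_list);
    if (--root->nr_ordered_extents == 0) {
            spin_lock(&root->fs_info->ordered_root_lock);
            list_del_init(&root->ordered_root);
            spin_unlock(&root->fs_info->ordered_root_lock);
    }
    spin_unlock(&root->ordered_extent_lock);

btrfs_wait_all_ordered_extents() then walks ordered_roots instead of every extent in the filesystem, grabbing a root reference before dropping the lock so the root cannot vanish while it waits.
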
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 58b0e3b0ebad..68844d59ee6f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -26,18 +26,6 @@ struct btrfs_ordered_inode_tree {
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
28 28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list into the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 /* bytenr on disk */
37 u64 bytenr;
38 u32 sum;
39};
40
41struct btrfs_ordered_sum { 29struct btrfs_ordered_sum {
42 /* bytenr is the start of this extent on disk */ 30 /* bytenr is the start of this extent on disk */
43 u64 bytenr; 31 u64 bytenr;
@@ -45,10 +33,10 @@ struct btrfs_ordered_sum {
45 /* 33 /*
46 * this is the length in bytes covered by the sums array below. 34 * this is the length in bytes covered by the sums array below.
47 */ 35 */
48 unsigned long len; 36 int len;
49 struct list_head list; 37 struct list_head list;
50 /* last field is a variable length array of btrfs_sector_sums */ 38 /* last field is a variable length array of csums */
51 struct btrfs_sector_sum sums[]; 39 u32 sums[];
52}; 40};
53 41
54/* 42/*
@@ -149,11 +137,8 @@ struct btrfs_ordered_extent {
149static inline int btrfs_ordered_sum_size(struct btrfs_root *root, 137static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
150 unsigned long bytes) 138 unsigned long bytes)
151{ 139{
152 unsigned long num_sectors = (bytes + root->sectorsize - 1) / 140 int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
153 root->sectorsize; 141 return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
154 num_sectors++;
155 return sizeof(struct btrfs_ordered_sum) +
156 num_sectors * sizeof(struct btrfs_sector_sum);
157} 142}
158 143
159static inline void 144static inline void
@@ -204,6 +189,8 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
204 struct btrfs_root *root, 189 struct btrfs_root *root,
205 struct inode *inode); 190 struct inode *inode);
206void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); 191void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput);
192void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info,
193 int delay_iput);
207void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); 194void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
208void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 195void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
209void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 196void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
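
With struct btrfs_sector_sum gone, each checksum is a bare u32 and the size helper becomes plain arithmetic: round the byte count up to whole sectors, then charge one u32 per sector on top of the header. For a 4096-byte sectorsize, 8192 bytes of data cost sizeof(struct btrfs_ordered_sum) + 2 * sizeof(u32). A userspace rendering of the same computation, with DIV_ROUND_UP spelled out:

    #include <stddef.h>
    #include <stdint.h>

    /* mirrors the reworked btrfs_ordered_sum_size(): header plus one
     * u32 checksum per (rounded-up) sector */
    static size_t ordered_sum_size(size_t header_size, uint32_t sectorsize,
                                   unsigned long bytes)
    {
            unsigned long num_sectors = (bytes + sectorsize - 1) / sectorsize;

            return header_size + num_sectors * sizeof(uint32_t);
    }

Note that the old helper also incremented the sector count by one after rounding; the DIV_ROUND_UP version drops that spare entry, so each allocation shrinks slightly as well.
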
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9d49c586995a..1280eff8af56 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -98,13 +98,10 @@ struct btrfs_qgroup_list {
98 struct btrfs_qgroup *member; 98 struct btrfs_qgroup *member;
99}; 99};
100 100
101struct qgroup_rescan { 101static int
102 struct btrfs_work work; 102qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
103 struct btrfs_fs_info *fs_info; 103 int init_flags);
104}; 104static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
105
106static void qgroup_rescan_start(struct btrfs_fs_info *fs_info,
107 struct qgroup_rescan *qscan);
108 105
109/* must be called with qgroup_ioctl_lock held */ 106/* must be called with qgroup_ioctl_lock held */
110static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, 107static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -255,10 +252,17 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
255 int slot; 252 int slot;
256 int ret = 0; 253 int ret = 0;
257 u64 flags = 0; 254 u64 flags = 0;
255 u64 rescan_progress = 0;
258 256
259 if (!fs_info->quota_enabled) 257 if (!fs_info->quota_enabled)
260 return 0; 258 return 0;
261 259
260 fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
261 if (!fs_info->qgroup_ulist) {
262 ret = -ENOMEM;
263 goto out;
264 }
265
262 path = btrfs_alloc_path(); 266 path = btrfs_alloc_path();
263 if (!path) { 267 if (!path) {
264 ret = -ENOMEM; 268 ret = -ENOMEM;
@@ -306,20 +310,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
306 } 310 }
307 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, 311 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
308 ptr); 312 ptr);
309 fs_info->qgroup_rescan_progress.objectid = 313 rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
310 btrfs_qgroup_status_rescan(l, ptr);
311 if (fs_info->qgroup_flags &
312 BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
313 struct qgroup_rescan *qscan =
314 kmalloc(sizeof(*qscan), GFP_NOFS);
315 if (!qscan) {
316 ret = -ENOMEM;
317 goto out;
318 }
319 fs_info->qgroup_rescan_progress.type = 0;
320 fs_info->qgroup_rescan_progress.offset = 0;
321 qgroup_rescan_start(fs_info, qscan);
322 }
323 goto next1; 314 goto next1;
324 } 315 }
325 316
@@ -421,9 +412,18 @@ out:
421 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { 412 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) {
422 fs_info->quota_enabled = 0; 413 fs_info->quota_enabled = 0;
423 fs_info->pending_quota_state = 0; 414 fs_info->pending_quota_state = 0;
415 } else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
416 ret >= 0) {
417 ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
424 } 418 }
425 btrfs_free_path(path); 419 btrfs_free_path(path);
426 420
421 if (ret < 0) {
422 ulist_free(fs_info->qgroup_ulist);
423 fs_info->qgroup_ulist = NULL;
424 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
425 }
426
427 return ret < 0 ? ret : 0; 427 return ret < 0 ? ret : 0;
428} 428}
429 429
@@ -460,6 +460,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
460 } 460 }
461 kfree(qgroup); 461 kfree(qgroup);
462 } 462 }
463 ulist_free(fs_info->qgroup_ulist);
463} 464}
464 465
465static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, 466static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
@@ -819,6 +820,12 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
819 goto out; 820 goto out;
820 } 821 }
821 822
823 fs_info->qgroup_ulist = ulist_alloc(GFP_NOFS);
824 if (!fs_info->qgroup_ulist) {
825 ret = -ENOMEM;
826 goto out;
827 }
828
822 /* 829 /*
823 * initially create the quota tree 830 * initially create the quota tree
824 */ 831 */
@@ -916,6 +923,10 @@ out_free_root:
916 kfree(quota_root); 923 kfree(quota_root);
917 } 924 }
918out: 925out:
926 if (ret) {
927 ulist_free(fs_info->qgroup_ulist);
928 fs_info->qgroup_ulist = NULL;
929 }
919 mutex_unlock(&fs_info->qgroup_ioctl_lock); 930 mutex_unlock(&fs_info->qgroup_ioctl_lock);
920 return ret; 931 return ret;
921} 932}
@@ -1355,7 +1366,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1355 u64 ref_root; 1366 u64 ref_root;
1356 struct btrfs_qgroup *qgroup; 1367 struct btrfs_qgroup *qgroup;
1357 struct ulist *roots = NULL; 1368 struct ulist *roots = NULL;
1358 struct ulist *tmp = NULL;
1359 u64 seq; 1369 u64 seq;
1360 int ret = 0; 1370 int ret = 0;
1361 int sgn; 1371 int sgn;
@@ -1428,14 +1438,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1428 if (ret < 0) 1438 if (ret < 0)
1429 return ret; 1439 return ret;
1430 1440
1431 mutex_lock(&fs_info->qgroup_rescan_lock);
1432 spin_lock(&fs_info->qgroup_lock); 1441 spin_lock(&fs_info->qgroup_lock);
1433 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
1434 if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
1435 ret = 0;
1436 goto unlock;
1437 }
1438 }
1439 1442
1440 quota_root = fs_info->quota_root; 1443 quota_root = fs_info->quota_root;
1441 if (!quota_root) 1444 if (!quota_root)
@@ -1448,39 +1451,34 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1448 /* 1451 /*
1449 * step 1: for each old ref, visit all nodes once and inc refcnt 1452 * step 1: for each old ref, visit all nodes once and inc refcnt
1450 */ 1453 */
1451 tmp = ulist_alloc(GFP_ATOMIC); 1454 ulist_reinit(fs_info->qgroup_ulist);
1452 if (!tmp) {
1453 ret = -ENOMEM;
1454 goto unlock;
1455 }
1456 seq = fs_info->qgroup_seq; 1455 seq = fs_info->qgroup_seq;
1457 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ 1456 fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
1458 1457
1459 ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); 1458 ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
1459 seq);
1460 if (ret) 1460 if (ret)
1461 goto unlock; 1461 goto unlock;
1462 1462
1463 /* 1463 /*
1464 * step 2: walk from the new root 1464 * step 2: walk from the new root
1465 */ 1465 */
1466 ret = qgroup_account_ref_step2(fs_info, roots, tmp, seq, sgn, 1466 ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
1467 node->num_bytes, qgroup); 1467 seq, sgn, node->num_bytes, qgroup);
1468 if (ret) 1468 if (ret)
1469 goto unlock; 1469 goto unlock;
1470 1470
1471 /* 1471 /*
1472 * step 3: walk again from old refs 1472 * step 3: walk again from old refs
1473 */ 1473 */
1474 ret = qgroup_account_ref_step3(fs_info, roots, tmp, seq, sgn, 1474 ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
1475 node->num_bytes); 1475 seq, sgn, node->num_bytes);
1476 if (ret) 1476 if (ret)
1477 goto unlock; 1477 goto unlock;
1478 1478
1479unlock: 1479unlock:
1480 spin_unlock(&fs_info->qgroup_lock); 1480 spin_unlock(&fs_info->qgroup_lock);
1481 mutex_unlock(&fs_info->qgroup_rescan_lock);
1482 ulist_free(roots); 1481 ulist_free(roots);
1483 ulist_free(tmp);
1484 1482
1485 return ret; 1483 return ret;
1486} 1484}
@@ -1527,9 +1525,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1527 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1525 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1528 1526
1529 if (!ret && start_rescan_worker) { 1527 if (!ret && start_rescan_worker) {
1530 ret = btrfs_qgroup_rescan(fs_info); 1528 ret = qgroup_rescan_init(fs_info, 0, 1);
1531 if (ret) 1529 if (!ret) {
1532 pr_err("btrfs: start rescan quota failed: %d\n", ret); 1530 qgroup_rescan_zero_tracking(fs_info);
1531 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
1532 &fs_info->qgroup_rescan_work);
1533 }
1533 ret = 0; 1534 ret = 0;
1534 } 1535 }
1535 1536
@@ -1720,7 +1721,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1720 struct btrfs_fs_info *fs_info = root->fs_info; 1721 struct btrfs_fs_info *fs_info = root->fs_info;
1721 u64 ref_root = root->root_key.objectid; 1722 u64 ref_root = root->root_key.objectid;
1722 int ret = 0; 1723 int ret = 0;
1723 struct ulist *ulist = NULL;
1724 struct ulist_node *unode; 1724 struct ulist_node *unode;
1725 struct ulist_iterator uiter; 1725 struct ulist_iterator uiter;
1726 1726
@@ -1743,17 +1743,13 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1743 * in a first step, we check all affected qgroups if any limits would 1743 * in a first step, we check all affected qgroups if any limits would
1744 * be exceeded 1744 * be exceeded
1745 */ 1745 */
1746 ulist = ulist_alloc(GFP_ATOMIC); 1746 ulist_reinit(fs_info->qgroup_ulist);
1747 if (!ulist) { 1747 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
1748 ret = -ENOMEM;
1749 goto out;
1750 }
1751 ret = ulist_add(ulist, qgroup->qgroupid,
1752 (uintptr_t)qgroup, GFP_ATOMIC); 1748 (uintptr_t)qgroup, GFP_ATOMIC);
1753 if (ret < 0) 1749 if (ret < 0)
1754 goto out; 1750 goto out;
1755 ULIST_ITER_INIT(&uiter); 1751 ULIST_ITER_INIT(&uiter);
1756 while ((unode = ulist_next(ulist, &uiter))) { 1752 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1757 struct btrfs_qgroup *qg; 1753 struct btrfs_qgroup *qg;
1758 struct btrfs_qgroup_list *glist; 1754 struct btrfs_qgroup_list *glist;
1759 1755
@@ -1774,7 +1770,8 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1774 } 1770 }
1775 1771
1776 list_for_each_entry(glist, &qg->groups, next_group) { 1772 list_for_each_entry(glist, &qg->groups, next_group) {
1777 ret = ulist_add(ulist, glist->group->qgroupid, 1773 ret = ulist_add(fs_info->qgroup_ulist,
1774 glist->group->qgroupid,
1778 (uintptr_t)glist->group, GFP_ATOMIC); 1775 (uintptr_t)glist->group, GFP_ATOMIC);
1779 if (ret < 0) 1776 if (ret < 0)
1780 goto out; 1777 goto out;
@@ -1785,7 +1782,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1785 * no limits exceeded, now record the reservation into all qgroups 1782 * no limits exceeded, now record the reservation into all qgroups
1786 */ 1783 */
1787 ULIST_ITER_INIT(&uiter); 1784 ULIST_ITER_INIT(&uiter);
1788 while ((unode = ulist_next(ulist, &uiter))) { 1785 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1789 struct btrfs_qgroup *qg; 1786 struct btrfs_qgroup *qg;
1790 1787
1791 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; 1788 qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
@@ -1795,8 +1792,6 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
1795 1792
1796out: 1793out:
1797 spin_unlock(&fs_info->qgroup_lock); 1794 spin_unlock(&fs_info->qgroup_lock);
1798 ulist_free(ulist);
1799
1800 return ret; 1795 return ret;
1801} 1796}
1802 1797
@@ -1805,7 +1800,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1805 struct btrfs_root *quota_root; 1800 struct btrfs_root *quota_root;
1806 struct btrfs_qgroup *qgroup; 1801 struct btrfs_qgroup *qgroup;
1807 struct btrfs_fs_info *fs_info = root->fs_info; 1802 struct btrfs_fs_info *fs_info = root->fs_info;
1808 struct ulist *ulist = NULL;
1809 struct ulist_node *unode; 1803 struct ulist_node *unode;
1810 struct ulist_iterator uiter; 1804 struct ulist_iterator uiter;
1811 u64 ref_root = root->root_key.objectid; 1805 u64 ref_root = root->root_key.objectid;
@@ -1827,17 +1821,13 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1827 if (!qgroup) 1821 if (!qgroup)
1828 goto out; 1822 goto out;
1829 1823
1830 ulist = ulist_alloc(GFP_ATOMIC); 1824 ulist_reinit(fs_info->qgroup_ulist);
1831 if (!ulist) { 1825 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
1832 btrfs_std_error(fs_info, -ENOMEM);
1833 goto out;
1834 }
1835 ret = ulist_add(ulist, qgroup->qgroupid,
1836 (uintptr_t)qgroup, GFP_ATOMIC); 1826 (uintptr_t)qgroup, GFP_ATOMIC);
1837 if (ret < 0) 1827 if (ret < 0)
1838 goto out; 1828 goto out;
1839 ULIST_ITER_INIT(&uiter); 1829 ULIST_ITER_INIT(&uiter);
1840 while ((unode = ulist_next(ulist, &uiter))) { 1830 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
1841 struct btrfs_qgroup *qg; 1831 struct btrfs_qgroup *qg;
1842 struct btrfs_qgroup_list *glist; 1832 struct btrfs_qgroup_list *glist;
1843 1833
@@ -1846,7 +1836,8 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1846 qg->reserved -= num_bytes; 1836 qg->reserved -= num_bytes;
1847 1837
1848 list_for_each_entry(glist, &qg->groups, next_group) { 1838 list_for_each_entry(glist, &qg->groups, next_group) {
1849 ret = ulist_add(ulist, glist->group->qgroupid, 1839 ret = ulist_add(fs_info->qgroup_ulist,
1840 glist->group->qgroupid,
1850 (uintptr_t)glist->group, GFP_ATOMIC); 1841 (uintptr_t)glist->group, GFP_ATOMIC);
1851 if (ret < 0) 1842 if (ret < 0)
1852 goto out; 1843 goto out;
@@ -1855,7 +1846,6 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
1855 1846
1856out: 1847out:
1857 spin_unlock(&fs_info->qgroup_lock); 1848 spin_unlock(&fs_info->qgroup_lock);
1858 ulist_free(ulist);
1859} 1849}
1860 1850
1861void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) 1851void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
@@ -1874,12 +1864,11 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1874 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. 1864 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared.
1875 */ 1865 */
1876static int 1866static int
1877qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path, 1867qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1878 struct btrfs_trans_handle *trans, struct ulist *tmp, 1868 struct btrfs_trans_handle *trans, struct ulist *tmp,
1879 struct extent_buffer *scratch_leaf) 1869 struct extent_buffer *scratch_leaf)
1880{ 1870{
1881 struct btrfs_key found; 1871 struct btrfs_key found;
1882 struct btrfs_fs_info *fs_info = qscan->fs_info;
1883 struct ulist *roots = NULL; 1872 struct ulist *roots = NULL;
1884 struct ulist_node *unode; 1873 struct ulist_node *unode;
1885 struct ulist_iterator uiter; 1874 struct ulist_iterator uiter;
@@ -2007,11 +1996,10 @@ out:
2007 1996
2008static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 1997static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2009{ 1998{
2010 struct qgroup_rescan *qscan = container_of(work, struct qgroup_rescan, 1999 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
2011 work); 2000 qgroup_rescan_work);
2012 struct btrfs_path *path; 2001 struct btrfs_path *path;
2013 struct btrfs_trans_handle *trans = NULL; 2002 struct btrfs_trans_handle *trans = NULL;
2014 struct btrfs_fs_info *fs_info = qscan->fs_info;
2015 struct ulist *tmp = NULL; 2003 struct ulist *tmp = NULL;
2016 struct extent_buffer *scratch_leaf = NULL; 2004 struct extent_buffer *scratch_leaf = NULL;
2017 int err = -ENOMEM; 2005 int err = -ENOMEM;
@@ -2036,7 +2024,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2036 if (!fs_info->quota_enabled) { 2024 if (!fs_info->quota_enabled) {
2037 err = -EINTR; 2025 err = -EINTR;
2038 } else { 2026 } else {
2039 err = qgroup_rescan_leaf(qscan, path, trans, 2027 err = qgroup_rescan_leaf(fs_info, path, trans,
2040 tmp, scratch_leaf); 2028 tmp, scratch_leaf);
2041 } 2029 }
2042 if (err > 0) 2030 if (err > 0)
@@ -2049,7 +2037,6 @@ out:
2049 kfree(scratch_leaf); 2037 kfree(scratch_leaf);
2050 ulist_free(tmp); 2038 ulist_free(tmp);
2051 btrfs_free_path(path); 2039 btrfs_free_path(path);
2052 kfree(qscan);
2053 2040
2054 mutex_lock(&fs_info->qgroup_rescan_lock); 2041 mutex_lock(&fs_info->qgroup_rescan_lock);
2055 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 2042 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
@@ -2068,47 +2055,74 @@ out:
2068 } else { 2055 } else {
2069 pr_err("btrfs: qgroup scan failed with %d\n", err); 2056 pr_err("btrfs: qgroup scan failed with %d\n", err);
2070 } 2057 }
2071}
2072 2058
2073static void 2059 complete_all(&fs_info->qgroup_rescan_completion);
2074qgroup_rescan_start(struct btrfs_fs_info *fs_info, struct qgroup_rescan *qscan)
2075{
2076 memset(&qscan->work, 0, sizeof(qscan->work));
2077 qscan->work.func = btrfs_qgroup_rescan_worker;
2078 qscan->fs_info = fs_info;
2079
2080 pr_info("btrfs: qgroup scan started\n");
2081 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, &qscan->work);
2082} 2060}
2083 2061
2084int 2062/*
2085btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 2063 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
2064 * memory required for the rescan context.
2065 */
2066static int
2067qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2068 int init_flags)
2086{ 2069{
2087 int ret = 0; 2070 int ret = 0;
2088 struct rb_node *n;
2089 struct btrfs_qgroup *qgroup;
2090 struct qgroup_rescan *qscan = kmalloc(sizeof(*qscan), GFP_NOFS);
2091 2071
2092 if (!qscan) 2072 if (!init_flags &&
2093 return -ENOMEM; 2073 (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
2074 !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
2075 ret = -EINVAL;
2076 goto err;
2077 }
2094 2078
2095 mutex_lock(&fs_info->qgroup_rescan_lock); 2079 mutex_lock(&fs_info->qgroup_rescan_lock);
2096 spin_lock(&fs_info->qgroup_lock); 2080 spin_lock(&fs_info->qgroup_lock);
2097 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 2081
2098 ret = -EINPROGRESS; 2082 if (init_flags) {
2099 else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) 2083 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2100 ret = -EINVAL; 2084 ret = -EINPROGRESS;
2101 if (ret) { 2085 else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
2102 spin_unlock(&fs_info->qgroup_lock); 2086 ret = -EINVAL;
2103 mutex_unlock(&fs_info->qgroup_rescan_lock); 2087
2104 kfree(qscan); 2088 if (ret) {
2105 return ret; 2089 spin_unlock(&fs_info->qgroup_lock);
2090 mutex_unlock(&fs_info->qgroup_rescan_lock);
2091 goto err;
2092 }
2093
2094 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2106 } 2095 }
2107 2096
2108 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2109 memset(&fs_info->qgroup_rescan_progress, 0, 2097 memset(&fs_info->qgroup_rescan_progress, 0,
2110 sizeof(fs_info->qgroup_rescan_progress)); 2098 sizeof(fs_info->qgroup_rescan_progress));
2099 fs_info->qgroup_rescan_progress.objectid = progress_objectid;
2100
2101 spin_unlock(&fs_info->qgroup_lock);
2102 mutex_unlock(&fs_info->qgroup_rescan_lock);
2103
2104 init_completion(&fs_info->qgroup_rescan_completion);
2105
2106 memset(&fs_info->qgroup_rescan_work, 0,
2107 sizeof(fs_info->qgroup_rescan_work));
2108 fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
2109
2110 if (ret) {
2111err:
2112 pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret);
2113 return ret;
2114 }
2115
2116 return 0;
2117}
2118
2119static void
2120qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
2121{
2122 struct rb_node *n;
2123 struct btrfs_qgroup *qgroup;
2111 2124
2125 spin_lock(&fs_info->qgroup_lock);
2112 /* clear all current qgroup tracking information */ 2126 /* clear all current qgroup tracking information */
2113 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 2127 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
2114 qgroup = rb_entry(n, struct btrfs_qgroup, node); 2128 qgroup = rb_entry(n, struct btrfs_qgroup, node);
@@ -2118,9 +2132,74 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2118 qgroup->excl_cmpr = 0; 2132 qgroup->excl_cmpr = 0;
2119 } 2133 }
2120 spin_unlock(&fs_info->qgroup_lock); 2134 spin_unlock(&fs_info->qgroup_lock);
2121 mutex_unlock(&fs_info->qgroup_rescan_lock); 2135}
2136
2137int
2138btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2139{
2140 int ret = 0;
2141 struct btrfs_trans_handle *trans;
2122 2142
2123 qgroup_rescan_start(fs_info, qscan); 2143 ret = qgroup_rescan_init(fs_info, 0, 1);
2144 if (ret)
2145 return ret;
2146
2147 /*
2148 * We have set the rescan_progress to 0, which means no more
2149 * delayed refs will be accounted by btrfs_qgroup_account_ref.
2150 * However, btrfs_qgroup_account_ref may already be past its call
2151 * to btrfs_find_all_roots, in which case it would still do the
2152 * accounting.
2153 * To solve this, we commit the transaction, which ensures that all
2154 * delayed refs are run; only after that do we clear all tracking
2155 * information for a clean start.
2156 */
2157
2158 trans = btrfs_join_transaction(fs_info->fs_root);
2159 if (IS_ERR(trans)) {
2160 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2161 return PTR_ERR(trans);
2162 }
2163 ret = btrfs_commit_transaction(trans, fs_info->fs_root);
2164 if (ret) {
2165 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2166 return ret;
2167 }
2168
2169 qgroup_rescan_zero_tracking(fs_info);
2170
2171 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
2172 &fs_info->qgroup_rescan_work);
2124 2173
2125 return 0; 2174 return 0;
2126} 2175}
2176
2177int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
2178{
2179 int running;
2180 int ret = 0;
2181
2182 mutex_lock(&fs_info->qgroup_rescan_lock);
2183 spin_lock(&fs_info->qgroup_lock);
2184 running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2185 spin_unlock(&fs_info->qgroup_lock);
2186 mutex_unlock(&fs_info->qgroup_rescan_lock);
2187
2188 if (running)
2189 ret = wait_for_completion_interruptible(
2190 &fs_info->qgroup_rescan_completion);
2191
2192 return ret;
2193}
2194
2195/*
2196 * this is only called from open_ctree where we're still single threaded, thus
2197 * locking is omitted here.
2198 */
2199void
2200btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2201{
2202 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2203 btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
2204 &fs_info->qgroup_rescan_work);
2205}
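
The qgroup rescan rework above retires the kmalloc'd struct qgroup_rescan in favour of state embedded in fs_info, and splits startup into three stages: qgroup_rescan_init() validates flags and records the resume point, qgroup_rescan_zero_tracking() wipes per-qgroup counters, and only then is the work item queued. A completion object lets the new BTRFS_IOC_QUOTA_RESCAN_WAIT ioctl block until the worker finishes. The handshake in outline (fragments condensed from the diff):

    /* starting a rescan (btrfs_qgroup_rescan / btrfs_run_qgroups) */
    ret = qgroup_rescan_init(fs_info, 0, 1);
    if (!ret) {
            qgroup_rescan_zero_tracking(fs_info);
            btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
                               &fs_info->qgroup_rescan_work);
    }

    /* waiting for it (BTRFS_IOC_QUOTA_RESCAN_WAIT); the worker calls
     * complete_all(&fs_info->qgroup_rescan_completion) when done */
    if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
            ret = wait_for_completion_interruptible(
                            &fs_info->qgroup_rescan_completion);

Resuming after a crash reuses the same embedded work item: btrfs_qgroup_rescan_resume() simply requeues it if the RESCAN flag survived in the on-disk status item.
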
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4febca4fc2de..12096496cc99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1305,6 +1305,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1305 struct extent_buffer *eb; 1305 struct extent_buffer *eb;
1306 struct btrfs_root_item *root_item; 1306 struct btrfs_root_item *root_item;
1307 struct btrfs_key root_key; 1307 struct btrfs_key root_key;
1308 u64 last_snap = 0;
1308 int ret; 1309 int ret;
1309 1310
1310 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1311 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1320,6 +1321,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1320 BTRFS_TREE_RELOC_OBJECTID); 1321 BTRFS_TREE_RELOC_OBJECTID);
1321 BUG_ON(ret); 1322 BUG_ON(ret);
1322 1323
1324 last_snap = btrfs_root_last_snapshot(&root->root_item);
1323 btrfs_set_root_last_snapshot(&root->root_item, 1325 btrfs_set_root_last_snapshot(&root->root_item,
1324 trans->transid - 1); 1326 trans->transid - 1);
1325 } else { 1327 } else {
@@ -1345,6 +1347,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1345 memset(&root_item->drop_progress, 0, 1347 memset(&root_item->drop_progress, 0,
1346 sizeof(struct btrfs_disk_key)); 1348 sizeof(struct btrfs_disk_key));
1347 root_item->drop_level = 0; 1349 root_item->drop_level = 0;
1350 /*
1351 * abuse rtransid, it is safe because it is impossible to
1352 * receive data into a relocation tree.
1353 */
1354 btrfs_set_root_rtransid(root_item, last_snap);
1355 btrfs_set_root_otransid(root_item, trans->transid);
1348 } 1356 }
1349 1357
1350 btrfs_tree_unlock(eb); 1358 btrfs_tree_unlock(eb);
@@ -1355,8 +1363,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1355 BUG_ON(ret); 1363 BUG_ON(ret);
1356 kfree(root_item); 1364 kfree(root_item);
1357 1365
1358 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 1366 reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
1359 &root_key);
1360 BUG_ON(IS_ERR(reloc_root)); 1367 BUG_ON(IS_ERR(reloc_root));
1361 reloc_root->last_trans = trans->transid; 1368 reloc_root->last_trans = trans->transid;
1362 return reloc_root; 1369 return reloc_root;
@@ -2273,8 +2280,12 @@ void free_reloc_roots(struct list_head *list)
2273static noinline_for_stack 2280static noinline_for_stack
2274int merge_reloc_roots(struct reloc_control *rc) 2281int merge_reloc_roots(struct reloc_control *rc)
2275{ 2282{
2283 struct btrfs_trans_handle *trans;
2276 struct btrfs_root *root; 2284 struct btrfs_root *root;
2277 struct btrfs_root *reloc_root; 2285 struct btrfs_root *reloc_root;
2286 u64 last_snap;
2287 u64 otransid;
2288 u64 objectid;
2278 LIST_HEAD(reloc_roots); 2289 LIST_HEAD(reloc_roots);
2279 int found = 0; 2290 int found = 0;
2280 int ret = 0; 2291 int ret = 0;
@@ -2308,12 +2319,44 @@ again:
2308 } else { 2319 } else {
2309 list_del_init(&reloc_root->root_list); 2320 list_del_init(&reloc_root->root_list);
2310 } 2321 }
2322
2323 /*
2324 * we keep the old last snapshot transid in rtransid when we
2325 * created the relocation tree.
2326 */
2327 last_snap = btrfs_root_rtransid(&reloc_root->root_item);
2328 otransid = btrfs_root_otransid(&reloc_root->root_item);
2329 objectid = reloc_root->root_key.offset;
2330
2311 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); 2331 ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2312 if (ret < 0) { 2332 if (ret < 0) {
2313 if (list_empty(&reloc_root->root_list)) 2333 if (list_empty(&reloc_root->root_list))
2314 list_add_tail(&reloc_root->root_list, 2334 list_add_tail(&reloc_root->root_list,
2315 &reloc_roots); 2335 &reloc_roots);
2316 goto out; 2336 goto out;
2337 } else if (!ret) {
2338 /*
2339 * recover the last snapshot transid to avoid
2340 * the space balance breaking NOCOW.
2341 */
2342 root = read_fs_root(rc->extent_root->fs_info,
2343 objectid);
2344 if (IS_ERR(root))
2345 continue;
2346
2347 if (btrfs_root_refs(&root->root_item) == 0)
2348 continue;
2349
2350 trans = btrfs_join_transaction(root);
2351 BUG_ON(IS_ERR(trans));
2352
2353 /* Check if the fs/file tree was snapshotted or not. */
2354 if (btrfs_root_last_snapshot(&root->root_item) ==
2355 otransid - 1)
2356 btrfs_set_root_last_snapshot(&root->root_item,
2357 last_snap);
2358
2359 btrfs_end_transaction(trans, root);
2317 } 2360 }
2318 } 2361 }
2319 2362
@@ -3266,6 +3309,8 @@ static int __add_tree_block(struct reloc_control *rc,
3266 struct btrfs_path *path; 3309 struct btrfs_path *path;
3267 struct btrfs_key key; 3310 struct btrfs_key key;
3268 int ret; 3311 int ret;
3312 bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3313 SKINNY_METADATA);
3269 3314
3270 if (tree_block_processed(bytenr, blocksize, rc)) 3315 if (tree_block_processed(bytenr, blocksize, rc))
3271 return 0; 3316 return 0;
@@ -3276,10 +3321,15 @@ static int __add_tree_block(struct reloc_control *rc,
3276 path = btrfs_alloc_path(); 3321 path = btrfs_alloc_path();
3277 if (!path) 3322 if (!path)
3278 return -ENOMEM; 3323 return -ENOMEM;
3279 3324again:
3280 key.objectid = bytenr; 3325 key.objectid = bytenr;
3281 key.type = BTRFS_EXTENT_ITEM_KEY; 3326 if (skinny) {
3282 key.offset = blocksize; 3327 key.type = BTRFS_METADATA_ITEM_KEY;
3328 key.offset = (u64)-1;
3329 } else {
3330 key.type = BTRFS_EXTENT_ITEM_KEY;
3331 key.offset = blocksize;
3332 }
3283 3333
3284 path->search_commit_root = 1; 3334 path->search_commit_root = 1;
3285 path->skip_locking = 1; 3335 path->skip_locking = 1;
@@ -3287,11 +3337,23 @@ static int __add_tree_block(struct reloc_control *rc,
3287 if (ret < 0) 3337 if (ret < 0)
3288 goto out; 3338 goto out;
3289 3339
3290 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3340 if (ret > 0 && skinny) {
3291 if (ret > 0) { 3341 if (path->slots[0]) {
3292 if (key.objectid == bytenr && 3342 path->slots[0]--;
3293 key.type == BTRFS_METADATA_ITEM_KEY) 3343 btrfs_item_key_to_cpu(path->nodes[0], &key,
3294 ret = 0; 3344 path->slots[0]);
3345 if (key.objectid == bytenr &&
3346 (key.type == BTRFS_METADATA_ITEM_KEY ||
3347 (key.type == BTRFS_EXTENT_ITEM_KEY &&
3348 key.offset == blocksize)))
3349 ret = 0;
3350 }
3351
3352 if (ret) {
3353 skinny = false;
3354 btrfs_release_path(path);
3355 goto again;
3356 }
3295 } 3357 }
3296 BUG_ON(ret); 3358 BUG_ON(ret);
3297 3359
@@ -4160,12 +4222,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4160 (unsigned long long)rc->block_group->key.objectid, 4222 (unsigned long long)rc->block_group->key.objectid,
4161 (unsigned long long)rc->block_group->flags); 4223 (unsigned long long)rc->block_group->flags);
4162 4224
4163 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4225 ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
4164 if (ret < 0) { 4226 if (ret < 0) {
4165 err = ret; 4227 err = ret;
4166 goto out; 4228 goto out;
4167 } 4229 }
4168 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4230 btrfs_wait_all_ordered_extents(fs_info, 0);
4169 4231
4170 while (1) { 4232 while (1) {
4171 mutex_lock(&fs_info->cleaner_mutex); 4233 mutex_lock(&fs_info->cleaner_mutex);
@@ -4277,7 +4339,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4277 key.type != BTRFS_ROOT_ITEM_KEY) 4339 key.type != BTRFS_ROOT_ITEM_KEY)
4278 break; 4340 break;
4279 4341
4280 reloc_root = btrfs_read_fs_root_no_radix(root, &key); 4342 reloc_root = btrfs_read_fs_root(root, &key);
4281 if (IS_ERR(reloc_root)) { 4343 if (IS_ERR(reloc_root)) {
4282 err = PTR_ERR(reloc_root); 4344 err = PTR_ERR(reloc_root);
4283 goto out; 4345 goto out;
@@ -4396,10 +4458,8 @@ out:
4396int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) 4458int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4397{ 4459{
4398 struct btrfs_ordered_sum *sums; 4460 struct btrfs_ordered_sum *sums;
4399 struct btrfs_sector_sum *sector_sum;
4400 struct btrfs_ordered_extent *ordered; 4461 struct btrfs_ordered_extent *ordered;
4401 struct btrfs_root *root = BTRFS_I(inode)->root; 4462 struct btrfs_root *root = BTRFS_I(inode)->root;
4402 size_t offset;
4403 int ret; 4463 int ret;
4404 u64 disk_bytenr; 4464 u64 disk_bytenr;
4405 LIST_HEAD(list); 4465 LIST_HEAD(list);
@@ -4413,19 +4473,13 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4413 if (ret) 4473 if (ret)
4414 goto out; 4474 goto out;
4415 4475
4476 disk_bytenr = ordered->start;
4416 while (!list_empty(&list)) { 4477 while (!list_empty(&list)) {
4417 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 4478 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4418 list_del_init(&sums->list); 4479 list_del_init(&sums->list);
4419 4480
4420 sector_sum = sums->sums; 4481 sums->bytenr = disk_bytenr;
4421 sums->bytenr = ordered->start; 4482 disk_bytenr += sums->len;
4422
4423 offset = 0;
4424 while (offset < sums->len) {
4425 sector_sum->bytenr += ordered->start - disk_bytenr;
4426 sector_sum++;
4427 offset += root->sectorsize;
4428 }
4429 4483
4430 btrfs_add_ordered_sum(inode, ordered, sums); 4484 btrfs_add_ordered_sum(inode, ordered, sums);
4431 } 4485 }
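
The rtransid trick above exists because NOCOW overwrites are only legal while the file extent's generation is newer than the root's last_snapshot, and relocation bumps last_snapshot when it creates the reloc tree; without the fix, every balance silently turned NOCOW files into COW ones. Since a relocation tree can never receive data, its rtransid field is free to park the old value. The two sites, reduced to their essence:

    /* create_reloc_root(): bump the source root's last_snapshot, but
     * stash the previous value in the reloc root item's rtransid */
    last_snap = btrfs_root_last_snapshot(&root->root_item);
    btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
    /* ... reloc root item initialized ... */
    btrfs_set_root_rtransid(root_item, last_snap);
    btrfs_set_root_otransid(root_item, trans->transid);

    /* merge_reloc_roots(): restore it, unless a real snapshot of the
     * fs/file tree happened while the balance was running */
    if (btrfs_root_last_snapshot(&root->root_item) == otransid - 1)
            btrfs_set_root_last_snapshot(&root->root_item, last_snap);
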
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 5bf1ed57f178..ffb1036ef10d 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -64,52 +64,59 @@ void btrfs_read_root_item(struct extent_buffer *eb, int slot,
64} 64}
65 65
66/* 66/*
67 * lookup the root with the highest offset for a given objectid. The key we do 67 * btrfs_find_root - lookup the root by the key.
68 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 68 * root: the root of the root tree
69 * on error. 69 * search_key: the key to search
70 * path: the path we search
71 * root_item: the root item of the tree we look for
72 * root_key: the real key of the tree we look for
73 *
74 * If ->offset of 'search_key' is -1ULL, it means we are not sure of the
75 * offset of the search key; just look up the root with the highest offset
76 * for the given objectid.
77 *
78 * Returns 0 if we find something, > 0 if not, < 0 on error.
70 */ 79 */
71int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, 80int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
72 struct btrfs_root_item *item, struct btrfs_key *key) 81 struct btrfs_path *path, struct btrfs_root_item *root_item,
82 struct btrfs_key *root_key)
73{ 83{
74 struct btrfs_path *path;
75 struct btrfs_key search_key;
76 struct btrfs_key found_key; 84 struct btrfs_key found_key;
77 struct extent_buffer *l; 85 struct extent_buffer *l;
78 int ret; 86 int ret;
79 int slot; 87 int slot;
80 88
81 search_key.objectid = objectid; 89 ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
82 search_key.type = BTRFS_ROOT_ITEM_KEY;
83 search_key.offset = (u64)-1;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
89 if (ret < 0) 90 if (ret < 0)
90 goto out; 91 return ret;
91 92
92 BUG_ON(ret == 0); 93 if (search_key->offset != -1ULL) { /* the search key is exact */
93 if (path->slots[0] == 0) { 94 if (ret > 0)
94 ret = 1; 95 goto out;
95 goto out; 96 } else {
97 BUG_ON(ret == 0); /* Logical error */
98 if (path->slots[0] == 0)
99 goto out;
100 path->slots[0]--;
101 ret = 0;
96 } 102 }
103
97 l = path->nodes[0]; 104 l = path->nodes[0];
98 slot = path->slots[0] - 1; 105 slot = path->slots[0];
106
99 btrfs_item_key_to_cpu(l, &found_key, slot); 107 btrfs_item_key_to_cpu(l, &found_key, slot);
100 if (found_key.objectid != objectid || 108 if (found_key.objectid != search_key->objectid ||
101 found_key.type != BTRFS_ROOT_ITEM_KEY) { 109 found_key.type != BTRFS_ROOT_ITEM_KEY) {
102 ret = 1; 110 ret = 1;
103 goto out; 111 goto out;
104 } 112 }
105 if (item)
106 btrfs_read_root_item(l, slot, item);
107 if (key)
108 memcpy(key, &found_key, sizeof(found_key));
109 113
110 ret = 0; 114 if (root_item)
115 btrfs_read_root_item(l, slot, root_item);
116 if (root_key)
117 memcpy(root_key, &found_key, sizeof(found_key));
111out: 118out:
112 btrfs_free_path(path); 119 btrfs_release_path(path);
113 return ret; 120 return ret;
114} 121}
115 122
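
btrfs_find_root() now folds the old find-last-root behaviour into a general lookup: an exact key finds one root item, while ->offset == -1ULL lands just past the newest item for the objectid and steps back one slot. A caller-side sketch of the highest-offset mode under the new signature (the caller supplies and frees the path):

    struct btrfs_key key = {
            .objectid = objectid,
            .type = BTRFS_ROOT_ITEM_KEY,
            .offset = (u64)-1,      /* "not sure": take the newest item */
    };

    ret = btrfs_find_root(tree_root, &key, path, &root_item, &root_key);
    /* 0: root_item/root_key filled in; > 0: no such root; < 0: error */
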
@@ -212,86 +219,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
212 return btrfs_insert_item(trans, root, key, item, sizeof(*item)); 219 return btrfs_insert_item(trans, root, key, item, sizeof(*item));
213} 220}
214 221
215/*
216 * at mount time we want to find all the old transaction snapshots that were in
217 * the process of being deleted if we crashed. This is any root item with an
218 * offset lower than the latest root. They need to be queued for deletion to
219 * finish what was happening when we crashed.
220 */
221int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
222{
223 struct btrfs_root *dead_root;
224 struct btrfs_root_item *ri;
225 struct btrfs_key key;
226 struct btrfs_key found_key;
227 struct btrfs_path *path;
228 int ret;
229 u32 nritems;
230 struct extent_buffer *leaf;
231 int slot;
232
233 key.objectid = objectid;
234 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
235 key.offset = 0;
236 path = btrfs_alloc_path();
237 if (!path)
238 return -ENOMEM;
239
240again:
241 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
242 if (ret < 0)
243 goto err;
244 while (1) {
245 leaf = path->nodes[0];
246 nritems = btrfs_header_nritems(leaf);
247 slot = path->slots[0];
248 if (slot >= nritems) {
249 ret = btrfs_next_leaf(root, path);
250 if (ret)
251 break;
252 leaf = path->nodes[0];
253 nritems = btrfs_header_nritems(leaf);
254 slot = path->slots[0];
255 }
256 btrfs_item_key_to_cpu(leaf, &key, slot);
257 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
258 goto next;
259
260 if (key.objectid < objectid)
261 goto next;
262
263 if (key.objectid > objectid)
264 break;
265
266 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
267 if (btrfs_disk_root_refs(leaf, ri) != 0)
268 goto next;
269
270 memcpy(&found_key, &key, sizeof(key));
271 key.offset++;
272 btrfs_release_path(path);
273 dead_root =
274 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
275 &found_key);
276 if (IS_ERR(dead_root)) {
277 ret = PTR_ERR(dead_root);
278 goto err;
279 }
280
281 ret = btrfs_add_dead_root(dead_root);
282 if (ret)
283 goto err;
284 goto again;
285next:
286 slot++;
287 path->slots[0]++;
288 }
289 ret = 0;
290err:
291 btrfs_free_path(path);
292 return ret;
293}
294
295int btrfs_find_orphan_roots(struct btrfs_root *tree_root) 222int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
296{ 223{
297 struct extent_buffer *leaf; 224 struct extent_buffer *leaf;
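
The helper removed above (its work moves into btrfs_find_orphan_roots below) was a textbook instance of the search-then-walk idiom. A condensed, schematic restatement of that loop (kernel-style fragment, not a buildable unit):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto err;
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			/* leaf exhausted: hop to the next one */
			ret = btrfs_next_leaf(root, path);
			if (ret)	/* > 0: no more leaves, < 0: error */
				break;
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* ... filter on key.objectid/type and process the item ... */
		path->slots[0]++;
	}
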
@@ -301,6 +228,10 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
301 struct btrfs_root *root; 228 struct btrfs_root *root;
302 int err = 0; 229 int err = 0;
303 int ret; 230 int ret;
231 bool can_recover = true;
232
233 if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
234 can_recover = false;
304 235
305 path = btrfs_alloc_path(); 236 path = btrfs_alloc_path();
306 if (!path) 237 if (!path)
@@ -340,20 +271,52 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
340 root_key.objectid = key.offset; 271 root_key.objectid = key.offset;
341 key.offset++; 272 key.offset++;
342 273
343 root = btrfs_read_fs_root_no_name(tree_root->fs_info, 274 root = btrfs_read_fs_root(tree_root, &root_key);
344 &root_key); 275 err = PTR_RET(root);
345 if (!IS_ERR(root)) 276 if (err && err != -ENOENT) {
277 break;
278 } else if (err == -ENOENT) {
279 struct btrfs_trans_handle *trans;
280
281 btrfs_release_path(path);
282
283 trans = btrfs_join_transaction(tree_root);
284 if (IS_ERR(trans)) {
285 err = PTR_ERR(trans);
286 btrfs_error(tree_root->fs_info, err,
287 "Failed to start trans to delete "
288 "orphan item");
289 break;
290 }
291 err = btrfs_del_orphan_item(trans, tree_root,
292 root_key.objectid);
293 btrfs_end_transaction(trans, tree_root);
294 if (err) {
295 btrfs_error(tree_root->fs_info, err,
296 "Failed to delete root orphan "
297 "item");
298 break;
299 }
346 continue; 300 continue;
301 }
347 302
348 ret = PTR_ERR(root); 303 if (btrfs_root_refs(&root->root_item) == 0) {
349 if (ret != -ENOENT) { 304 btrfs_add_dead_root(root);
350 err = ret; 305 continue;
306 }
307
308 err = btrfs_init_fs_root(root);
309 if (err) {
310 btrfs_free_fs_root(root);
351 break; 311 break;
352 } 312 }
353 313
354 ret = btrfs_find_dead_roots(tree_root, root_key.objectid); 314 root->orphan_item_inserted = 1;
355 if (ret) { 315
356 err = ret; 316 err = btrfs_insert_fs_root(root->fs_info, root);
317 if (err) {
318 BUG_ON(err == -EEXIST);
319 btrfs_free_fs_root(root);
357 break; 320 break;
358 } 321 }
359 } 322 }
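
One detail worth calling out in the rewritten loop: err = PTR_RET(root) folds the ERR_PTR-encoded return of btrfs_read_fs_root() into a plain errno, with 0 standing for a valid pointer, so a single integer test covers both cases. Schematically (PTR_RET is the 3.11-era name; mainline later renamed it PTR_ERR_OR_ZERO):

	#include <linux/err.h>

	static inline int ptr_ret_demo(const void *ptr)
	{
		if (IS_ERR(ptr))
			return PTR_ERR(ptr);	/* negative errno */
		return 0;			/* valid pointer */
	}
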
@@ -368,8 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
368{ 331{
369 struct btrfs_path *path; 332 struct btrfs_path *path;
370 int ret; 333 int ret;
371 struct btrfs_root_item *ri;
372 struct extent_buffer *leaf;
373 334
374 path = btrfs_alloc_path(); 335 path = btrfs_alloc_path();
375 if (!path) 336 if (!path)
@@ -379,8 +340,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
379 goto out; 340 goto out;
380 341
381 BUG_ON(ret != 0); 342 BUG_ON(ret != 0);
382 leaf = path->nodes[0];
383 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
384 343
385 ret = btrfs_del_item(trans, root, path); 344 ret = btrfs_del_item(trans, root, path);
386out: 345out:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 79bd479317cb..64a157becbe5 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2126,8 +2126,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2126 u8 *csum) 2126 u8 *csum)
2127{ 2127{
2128 struct btrfs_ordered_sum *sum = NULL; 2128 struct btrfs_ordered_sum *sum = NULL;
2129 int ret = 0; 2129 unsigned long index;
2130 unsigned long i;
2131 unsigned long num_sectors; 2130 unsigned long num_sectors;
2132 2131
2133 while (!list_empty(&sctx->csum_list)) { 2132 while (!list_empty(&sctx->csum_list)) {
@@ -2146,19 +2145,14 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2146 if (!sum) 2145 if (!sum)
2147 return 0; 2146 return 0;
2148 2147
2148 index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2149 num_sectors = sum->len / sctx->sectorsize; 2149 num_sectors = sum->len / sctx->sectorsize;
2150 for (i = 0; i < num_sectors; ++i) { 2150 memcpy(csum, sum->sums + index, sctx->csum_size);
2151 if (sum->sums[i].bytenr == logical) { 2151 if (index == num_sectors - 1) {
2152 memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
2153 ret = 1;
2154 break;
2155 }
2156 }
2157 if (ret && i == num_sectors - 1) {
2158 list_del(&sum->list); 2152 list_del(&sum->list);
2159 kfree(sum); 2153 kfree(sum);
2160 } 2154 }
2161 return ret; 2155 return 1;
2162} 2156}
2163 2157
2164/* scrub extent tries to collect up to 64 kB for each bio */ 2158/* scrub extent tries to collect up to 64 kB for each bio */
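
The scrub_find_csum() rewrite above replaces a per-sector linear scan with direct indexing into the flat checksum array: the sector index is just the byte distance from the start of the ordered sum divided by the sector size. A runnable arithmetic check (values invented for illustration):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t bytenr = 0x10000;	/* start of the csummed range */
		uint64_t logical = 0x12000;	/* sector we need the csum for */
		uint32_t sectorsize = 4096;

		uint32_t index = (uint32_t)(logical - bytenr) / sectorsize;

		assert(index == 2);		/* third sector of the range */
		return 0;
	}
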
@@ -2501,10 +2495,11 @@ again:
2501 ret = scrub_extent(sctx, extent_logical, extent_len, 2495 ret = scrub_extent(sctx, extent_logical, extent_len,
2502 extent_physical, extent_dev, flags, 2496 extent_physical, extent_dev, flags,
2503 generation, extent_mirror_num, 2497 generation, extent_mirror_num,
2504 extent_physical); 2498 extent_logical - logical + physical);
2505 if (ret) 2499 if (ret)
2506 goto out; 2500 goto out;
2507 2501
2502 scrub_free_csums(sctx);
2508 if (extent_logical + extent_len < 2503 if (extent_logical + extent_len <
2509 key.objectid + bytes) { 2504 key.objectid + bytes) {
2510 logical += increment; 2505 logical += increment;
@@ -3204,16 +3199,18 @@ out:
3204 3199
3205static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) 3200static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3206{ 3201{
3207 unsigned long index;
3208 struct scrub_copy_nocow_ctx *nocow_ctx = ctx; 3202 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3209 int ret = 0; 3203 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3210 struct btrfs_key key; 3204 struct btrfs_key key;
3211 struct inode *inode = NULL; 3205 struct inode *inode;
3206 struct page *page;
3212 struct btrfs_root *local_root; 3207 struct btrfs_root *local_root;
3213 u64 physical_for_dev_replace; 3208 u64 physical_for_dev_replace;
3214 u64 len; 3209 u64 len;
3215 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; 3210 unsigned long index;
3216 int srcu_index; 3211 int srcu_index;
3212 int ret;
3213 int err;
3217 3214
3218 key.objectid = root; 3215 key.objectid = root;
3219 key.type = BTRFS_ROOT_ITEM_KEY; 3216 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3227,6 +3224,11 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3227 return PTR_ERR(local_root); 3224 return PTR_ERR(local_root);
3228 } 3225 }
3229 3226
3227 if (btrfs_root_refs(&local_root->root_item) == 0) {
3228 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3229 return -ENOENT;
3230 }
3231
3230 key.type = BTRFS_INODE_ITEM_KEY; 3232 key.type = BTRFS_INODE_ITEM_KEY;
3231 key.objectid = inum; 3233 key.objectid = inum;
3232 key.offset = 0; 3234 key.offset = 0;
@@ -3235,19 +3237,21 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3235 if (IS_ERR(inode)) 3237 if (IS_ERR(inode))
3236 return PTR_ERR(inode); 3238 return PTR_ERR(inode);
3237 3239
3240 /* Avoid truncate/dio/punch hole... */
3241 mutex_lock(&inode->i_mutex);
3242 inode_dio_wait(inode);
3243
3244 ret = 0;
3238 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 3245 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3239 len = nocow_ctx->len; 3246 len = nocow_ctx->len;
3240 while (len >= PAGE_CACHE_SIZE) { 3247 while (len >= PAGE_CACHE_SIZE) {
3241 struct page *page = NULL;
3242 int ret_sub;
3243
3244 index = offset >> PAGE_CACHE_SHIFT; 3248 index = offset >> PAGE_CACHE_SHIFT;
3245 3249again:
3246 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 3250 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3247 if (!page) { 3251 if (!page) {
3248 pr_err("find_or_create_page() failed\n"); 3252 pr_err("find_or_create_page() failed\n");
3249 ret = -ENOMEM; 3253 ret = -ENOMEM;
3250 goto next_page; 3254 goto out;
3251 } 3255 }
3252 3256
3253 if (PageUptodate(page)) { 3257 if (PageUptodate(page)) {
@@ -3255,39 +3259,49 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3255 goto next_page; 3259 goto next_page;
3256 } else { 3260 } else {
3257 ClearPageError(page); 3261 ClearPageError(page);
3258 ret_sub = extent_read_full_page(&BTRFS_I(inode)-> 3262 err = extent_read_full_page(&BTRFS_I(inode)->
3259 io_tree, 3263 io_tree,
3260 page, btrfs_get_extent, 3264 page, btrfs_get_extent,
3261 nocow_ctx->mirror_num); 3265 nocow_ctx->mirror_num);
3262 if (ret_sub) { 3266 if (err) {
3263 ret = ret_sub; 3267 ret = err;
3264 goto next_page; 3268 goto next_page;
3265 } 3269 }
3266 wait_on_page_locked(page); 3270
3271 lock_page(page);
3272 /*
3273 * If the page has been removed from the page cache,
3274 * the data on it is meaningless, because it may be
3275 * the old one; the new data may be written into a new
3276 * page in the page cache.
3277 */
3278 if (page->mapping != inode->i_mapping) {
3279 page_cache_release(page);
3280 goto again;
3281 }
3267 if (!PageUptodate(page)) { 3282 if (!PageUptodate(page)) {
3268 ret = -EIO; 3283 ret = -EIO;
3269 goto next_page; 3284 goto next_page;
3270 } 3285 }
3271 } 3286 }
3272 ret_sub = write_page_nocow(nocow_ctx->sctx, 3287 err = write_page_nocow(nocow_ctx->sctx,
3273 physical_for_dev_replace, page); 3288 physical_for_dev_replace, page);
3274 if (ret_sub) { 3289 if (err)
3275 ret = ret_sub; 3290 ret = err;
3276 goto next_page;
3277 }
3278
3279next_page: 3291next_page:
3280 if (page) { 3292 unlock_page(page);
3281 unlock_page(page); 3293 page_cache_release(page);
3282 put_page(page); 3294
3283 } 3295 if (ret)
3296 break;
3297
3284 offset += PAGE_CACHE_SIZE; 3298 offset += PAGE_CACHE_SIZE;
3285 physical_for_dev_replace += PAGE_CACHE_SIZE; 3299 physical_for_dev_replace += PAGE_CACHE_SIZE;
3286 len -= PAGE_CACHE_SIZE; 3300 len -= PAGE_CACHE_SIZE;
3287 } 3301 }
3288 3302out:
3289 if (inode) 3303 mutex_unlock(&inode->i_mutex);
3290 iput(inode); 3304 iput(inode);
3291 return ret; 3305 return ret;
3292} 3306}
3293 3307
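
The lock_page()/mapping re-check added above is the standard defence against racing truncate: the page was unlocked while the read completed, so it may have been dropped from the page cache in the meantime. A schematic helper capturing the pattern (kernel-style sketch assuming 3.11-era page-cache APIs, not code from this file):

	static bool page_still_attached(struct address_space *mapping,
					struct page *page)
	{
		lock_page(page);
		if (page->mapping != mapping) {
			/* truncated while we slept: contents are stale */
			unlock_page(page);
			page_cache_release(page);
			return false;	/* caller retries the lookup */
		}
		return true;		/* page is returned locked */
	}
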
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ff40f1c00ce3..d3f3b43cae0b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -158,7 +158,7 @@ static void fs_path_reset(struct fs_path *p)
158 } 158 }
159} 159}
160 160
161static struct fs_path *fs_path_alloc(struct send_ctx *sctx) 161static struct fs_path *fs_path_alloc(void)
162{ 162{
163 struct fs_path *p; 163 struct fs_path *p;
164 164
@@ -173,11 +173,11 @@ static struct fs_path *fs_path_alloc(struct send_ctx *sctx)
173 return p; 173 return p;
174} 174}
175 175
176static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx) 176static struct fs_path *fs_path_alloc_reversed(void)
177{ 177{
178 struct fs_path *p; 178 struct fs_path *p;
179 179
180 p = fs_path_alloc(sctx); 180 p = fs_path_alloc();
181 if (!p) 181 if (!p)
182 return NULL; 182 return NULL;
183 p->reversed = 1; 183 p->reversed = 1;
@@ -185,7 +185,7 @@ static struct fs_path *fs_path_alloc_reversed(struct send_ctx *sctx)
185 return p; 185 return p;
186} 186}
187 187
188static void fs_path_free(struct send_ctx *sctx, struct fs_path *p) 188static void fs_path_free(struct fs_path *p)
189{ 189{
190 if (!p) 190 if (!p)
191 return; 191 return;
@@ -753,8 +753,7 @@ typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
753 * 753 *
754 * path must point to the INODE_REF or INODE_EXTREF when called. 754 * path must point to the INODE_REF or INODE_EXTREF when called.
755 */ 755 */
756static int iterate_inode_ref(struct send_ctx *sctx, 756static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
757 struct btrfs_root *root, struct btrfs_path *path,
758 struct btrfs_key *found_key, int resolve, 757 struct btrfs_key *found_key, int resolve,
759 iterate_inode_ref_t iterate, void *ctx) 758 iterate_inode_ref_t iterate, void *ctx)
760{ 759{
@@ -777,13 +776,13 @@ static int iterate_inode_ref(struct send_ctx *sctx,
777 unsigned long elem_size; 776 unsigned long elem_size;
778 unsigned long ptr; 777 unsigned long ptr;
779 778
780 p = fs_path_alloc_reversed(sctx); 779 p = fs_path_alloc_reversed();
781 if (!p) 780 if (!p)
782 return -ENOMEM; 781 return -ENOMEM;
783 782
784 tmp_path = alloc_path_for_send(); 783 tmp_path = alloc_path_for_send();
785 if (!tmp_path) { 784 if (!tmp_path) {
786 fs_path_free(sctx, p); 785 fs_path_free(p);
787 return -ENOMEM; 786 return -ENOMEM;
788 } 787 }
789 788
@@ -858,7 +857,7 @@ static int iterate_inode_ref(struct send_ctx *sctx,
858 857
859out: 858out:
860 btrfs_free_path(tmp_path); 859 btrfs_free_path(tmp_path);
861 fs_path_free(sctx, p); 860 fs_path_free(p);
862 return ret; 861 return ret;
863} 862}
864 863
@@ -874,8 +873,7 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
874 * 873 *
875 * path must point to the dir item when called. 874 * path must point to the dir item when called.
876 */ 875 */
877static int iterate_dir_item(struct send_ctx *sctx, 876static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
878 struct btrfs_root *root, struct btrfs_path *path,
879 struct btrfs_key *found_key, 877 struct btrfs_key *found_key,
880 iterate_dir_item_t iterate, void *ctx) 878 iterate_dir_item_t iterate, void *ctx)
881{ 879{
@@ -990,7 +988,7 @@ static int __copy_first_ref(int num, u64 dir, int index,
990 * Retrieve the first path of an inode. If an inode has more than one 988
991 * ref/hardlink, this is ignored. 989 * ref/hardlink, this is ignored.
992 */ 990 */
993static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root, 991static int get_inode_path(struct btrfs_root *root,
994 u64 ino, struct fs_path *path) 992 u64 ino, struct fs_path *path)
995{ 993{
996 int ret; 994 int ret;
@@ -1022,8 +1020,8 @@ static int get_inode_path(struct send_ctx *sctx, struct btrfs_root *root,
1022 goto out; 1020 goto out;
1023 } 1021 }
1024 1022
1025 ret = iterate_inode_ref(sctx, root, p, &found_key, 1, 1023 ret = iterate_inode_ref(root, p, &found_key, 1,
1026 __copy_first_ref, path); 1024 __copy_first_ref, path);
1027 if (ret < 0) 1025 if (ret < 0)
1028 goto out; 1026 goto out;
1029 ret = 0; 1027 ret = 0;
@@ -1314,8 +1312,7 @@ out:
1314 return ret; 1312 return ret;
1315} 1313}
1316 1314
1317static int read_symlink(struct send_ctx *sctx, 1315static int read_symlink(struct btrfs_root *root,
1318 struct btrfs_root *root,
1319 u64 ino, 1316 u64 ino,
1320 struct fs_path *dest) 1317 struct fs_path *dest)
1321{ 1318{
@@ -1562,8 +1559,7 @@ out:
1562 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir, 1559 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1563 * generation of the parent dir and the name of the dir entry. 1560 * generation of the parent dir and the name of the dir entry.
1564 */ 1561 */
1565static int get_first_ref(struct send_ctx *sctx, 1562static int get_first_ref(struct btrfs_root *root, u64 ino,
1566 struct btrfs_root *root, u64 ino,
1567 u64 *dir, u64 *dir_gen, struct fs_path *name) 1563 u64 *dir, u64 *dir_gen, struct fs_path *name)
1568{ 1564{
1569 int ret; 1565 int ret;
@@ -1628,8 +1624,7 @@ out:
1628 return ret; 1624 return ret;
1629} 1625}
1630 1626
1631static int is_first_ref(struct send_ctx *sctx, 1627static int is_first_ref(struct btrfs_root *root,
1632 struct btrfs_root *root,
1633 u64 ino, u64 dir, 1628 u64 ino, u64 dir,
1634 const char *name, int name_len) 1629 const char *name, int name_len)
1635{ 1630{
@@ -1638,11 +1633,11 @@ static int is_first_ref(struct send_ctx *sctx,
1638 u64 tmp_dir; 1633 u64 tmp_dir;
1639 u64 tmp_dir_gen; 1634 u64 tmp_dir_gen;
1640 1635
1641 tmp_name = fs_path_alloc(sctx); 1636 tmp_name = fs_path_alloc();
1642 if (!tmp_name) 1637 if (!tmp_name)
1643 return -ENOMEM; 1638 return -ENOMEM;
1644 1639
1645 ret = get_first_ref(sctx, root, ino, &tmp_dir, &tmp_dir_gen, tmp_name); 1640 ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
1646 if (ret < 0) 1641 if (ret < 0)
1647 goto out; 1642 goto out;
1648 1643
@@ -1654,7 +1649,7 @@ static int is_first_ref(struct send_ctx *sctx,
1654 ret = !memcmp(tmp_name->start, name, name_len); 1649 ret = !memcmp(tmp_name->start, name, name_len);
1655 1650
1656out: 1651out:
1657 fs_path_free(sctx, tmp_name); 1652 fs_path_free(tmp_name);
1658 return ret; 1653 return ret;
1659} 1654}
1660 1655
@@ -1783,11 +1778,11 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1783 if (!sctx->parent_root) 1778 if (!sctx->parent_root)
1784 goto out; 1779 goto out;
1785 1780
1786 name = fs_path_alloc(sctx); 1781 name = fs_path_alloc();
1787 if (!name) 1782 if (!name)
1788 return -ENOMEM; 1783 return -ENOMEM;
1789 1784
1790 ret = get_first_ref(sctx, sctx->parent_root, ino, &dir, &dir_gen, name); 1785 ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
1791 if (ret < 0) 1786 if (ret < 0)
1792 goto out; 1787 goto out;
1793 1788
@@ -1795,7 +1790,7 @@ static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1795 name->start, fs_path_len(name)); 1790 name->start, fs_path_len(name));
1796 1791
1797out: 1792out:
1798 fs_path_free(sctx, name); 1793 fs_path_free(name);
1799 return ret; 1794 return ret;
1800} 1795}
1801 1796
@@ -1979,11 +1974,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1979 * send_root or parent_root for ref lookup. 1974 * send_root or parent_root for ref lookup.
1980 */ 1975 */
1981 if (ino < sctx->send_progress) 1976 if (ino < sctx->send_progress)
1982 ret = get_first_ref(sctx, sctx->send_root, ino, 1977 ret = get_first_ref(sctx->send_root, ino,
1983 parent_ino, parent_gen, dest); 1978 parent_ino, parent_gen, dest);
1984 else 1979 else
1985 ret = get_first_ref(sctx, sctx->parent_root, ino, 1980 ret = get_first_ref(sctx->parent_root, ino,
1986 parent_ino, parent_gen, dest); 1981 parent_ino, parent_gen, dest);
1987 if (ret < 0) 1982 if (ret < 0)
1988 goto out; 1983 goto out;
1989 1984
@@ -2070,7 +2065,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2070 u64 parent_gen = 0; 2065 u64 parent_gen = 0;
2071 int stop = 0; 2066 int stop = 0;
2072 2067
2073 name = fs_path_alloc(sctx); 2068 name = fs_path_alloc();
2074 if (!name) { 2069 if (!name) {
2075 ret = -ENOMEM; 2070 ret = -ENOMEM;
2076 goto out; 2071 goto out;
@@ -2098,7 +2093,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2098 } 2093 }
2099 2094
2100out: 2095out:
2101 fs_path_free(sctx, name); 2096 fs_path_free(name);
2102 if (!ret) 2097 if (!ret)
2103 fs_path_unreverse(dest); 2098 fs_path_unreverse(dest);
2104 return ret; 2099 return ret;
@@ -2263,7 +2258,7 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2263 2258
2264verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size); 2259verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2265 2260
2266 p = fs_path_alloc(sctx); 2261 p = fs_path_alloc();
2267 if (!p) 2262 if (!p)
2268 return -ENOMEM; 2263 return -ENOMEM;
2269 2264
@@ -2281,7 +2276,7 @@ verbose_printk("btrfs: send_truncate %llu size=%llu\n", ino, size);
2281 2276
2282tlv_put_failure: 2277tlv_put_failure:
2283out: 2278out:
2284 fs_path_free(sctx, p); 2279 fs_path_free(p);
2285 return ret; 2280 return ret;
2286} 2281}
2287 2282
@@ -2292,7 +2287,7 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2292 2287
2293verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode); 2288verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2294 2289
2295 p = fs_path_alloc(sctx); 2290 p = fs_path_alloc();
2296 if (!p) 2291 if (!p)
2297 return -ENOMEM; 2292 return -ENOMEM;
2298 2293
@@ -2310,7 +2305,7 @@ verbose_printk("btrfs: send_chmod %llu mode=%llu\n", ino, mode);
2310 2305
2311tlv_put_failure: 2306tlv_put_failure:
2312out: 2307out:
2313 fs_path_free(sctx, p); 2308 fs_path_free(p);
2314 return ret; 2309 return ret;
2315} 2310}
2316 2311
@@ -2321,7 +2316,7 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2321 2316
2322verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid); 2317verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2323 2318
2324 p = fs_path_alloc(sctx); 2319 p = fs_path_alloc();
2325 if (!p) 2320 if (!p)
2326 return -ENOMEM; 2321 return -ENOMEM;
2327 2322
@@ -2340,7 +2335,7 @@ verbose_printk("btrfs: send_chown %llu uid=%llu, gid=%llu\n", ino, uid, gid);
2340 2335
2341tlv_put_failure: 2336tlv_put_failure:
2342out: 2337out:
2343 fs_path_free(sctx, p); 2338 fs_path_free(p);
2344 return ret; 2339 return ret;
2345} 2340}
2346 2341
@@ -2356,7 +2351,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2356 2351
2357verbose_printk("btrfs: send_utimes %llu\n", ino); 2352verbose_printk("btrfs: send_utimes %llu\n", ino);
2358 2353
2359 p = fs_path_alloc(sctx); 2354 p = fs_path_alloc();
2360 if (!p) 2355 if (!p)
2361 return -ENOMEM; 2356 return -ENOMEM;
2362 2357
@@ -2397,7 +2392,7 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
2397 2392
2398tlv_put_failure: 2393tlv_put_failure:
2399out: 2394out:
2400 fs_path_free(sctx, p); 2395 fs_path_free(p);
2401 btrfs_free_path(path); 2396 btrfs_free_path(path);
2402 return ret; 2397 return ret;
2403} 2398}
@@ -2418,7 +2413,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
2418 2413
2419verbose_printk("btrfs: send_create_inode %llu\n", ino); 2414verbose_printk("btrfs: send_create_inode %llu\n", ino);
2420 2415
2421 p = fs_path_alloc(sctx); 2416 p = fs_path_alloc();
2422 if (!p) 2417 if (!p)
2423 return -ENOMEM; 2418 return -ENOMEM;
2424 2419
@@ -2459,7 +2454,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2459 2454
2460 if (S_ISLNK(mode)) { 2455 if (S_ISLNK(mode)) {
2461 fs_path_reset(p); 2456 fs_path_reset(p);
2462 ret = read_symlink(sctx, sctx->send_root, ino, p); 2457 ret = read_symlink(sctx->send_root, ino, p);
2463 if (ret < 0) 2458 if (ret < 0)
2464 goto out; 2459 goto out;
2465 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p); 2460 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
@@ -2476,7 +2471,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2476 2471
2477tlv_put_failure: 2472tlv_put_failure:
2478out: 2473out:
2479 fs_path_free(sctx, p); 2474 fs_path_free(p);
2480 return ret; 2475 return ret;
2481} 2476}
2482 2477
@@ -2615,13 +2610,13 @@ static int record_ref(struct list_head *head, u64 dir,
2615 return 0; 2610 return 0;
2616} 2611}
2617 2612
2618static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head) 2613static void __free_recorded_refs(struct list_head *head)
2619{ 2614{
2620 struct recorded_ref *cur; 2615 struct recorded_ref *cur;
2621 2616
2622 while (!list_empty(head)) { 2617 while (!list_empty(head)) {
2623 cur = list_entry(head->next, struct recorded_ref, list); 2618 cur = list_entry(head->next, struct recorded_ref, list);
2624 fs_path_free(sctx, cur->full_path); 2619 fs_path_free(cur->full_path);
2625 list_del(&cur->list); 2620 list_del(&cur->list);
2626 kfree(cur); 2621 kfree(cur);
2627 } 2622 }
@@ -2629,8 +2624,8 @@ static void __free_recorded_refs(struct send_ctx *sctx, struct list_head *head)
2629 2624
2630static void free_recorded_refs(struct send_ctx *sctx) 2625static void free_recorded_refs(struct send_ctx *sctx)
2631{ 2626{
2632 __free_recorded_refs(sctx, &sctx->new_refs); 2627 __free_recorded_refs(&sctx->new_refs);
2633 __free_recorded_refs(sctx, &sctx->deleted_refs); 2628 __free_recorded_refs(&sctx->deleted_refs);
2634} 2629}
2635 2630
2636/* 2631/*
@@ -2644,7 +2639,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2644 int ret; 2639 int ret;
2645 struct fs_path *orphan; 2640 struct fs_path *orphan;
2646 2641
2647 orphan = fs_path_alloc(sctx); 2642 orphan = fs_path_alloc();
2648 if (!orphan) 2643 if (!orphan)
2649 return -ENOMEM; 2644 return -ENOMEM;
2650 2645
@@ -2655,7 +2650,7 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2655 ret = send_rename(sctx, path, orphan); 2650 ret = send_rename(sctx, path, orphan);
2656 2651
2657out: 2652out:
2658 fs_path_free(sctx, orphan); 2653 fs_path_free(orphan);
2659 return ret; 2654 return ret;
2660} 2655}
2661 2656
@@ -2746,7 +2741,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2746 */ 2741 */
2747 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); 2742 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
2748 2743
2749 valid_path = fs_path_alloc(sctx); 2744 valid_path = fs_path_alloc();
2750 if (!valid_path) { 2745 if (!valid_path) {
2751 ret = -ENOMEM; 2746 ret = -ENOMEM;
2752 goto out; 2747 goto out;
@@ -2843,9 +2838,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2843 if (ret < 0) 2838 if (ret < 0)
2844 goto out; 2839 goto out;
2845 if (ret) { 2840 if (ret) {
2846 ret = is_first_ref(sctx, sctx->parent_root, 2841 ret = is_first_ref(sctx->parent_root,
2847 ow_inode, cur->dir, cur->name, 2842 ow_inode, cur->dir, cur->name,
2848 cur->name_len); 2843 cur->name_len);
2849 if (ret < 0) 2844 if (ret < 0)
2850 goto out; 2845 goto out;
2851 if (ret) { 2846 if (ret) {
@@ -3024,7 +3019,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3024out: 3019out:
3025 free_recorded_refs(sctx); 3020 free_recorded_refs(sctx);
3026 ulist_free(check_dirs); 3021 ulist_free(check_dirs);
3027 fs_path_free(sctx, valid_path); 3022 fs_path_free(valid_path);
3028 return ret; 3023 return ret;
3029} 3024}
3030 3025
@@ -3037,7 +3032,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3037 struct fs_path *p; 3032 struct fs_path *p;
3038 u64 gen; 3033 u64 gen;
3039 3034
3040 p = fs_path_alloc(sctx); 3035 p = fs_path_alloc();
3041 if (!p) 3036 if (!p)
3042 return -ENOMEM; 3037 return -ENOMEM;
3043 3038
@@ -3057,7 +3052,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3057 3052
3058out: 3053out:
3059 if (ret) 3054 if (ret)
3060 fs_path_free(sctx, p); 3055 fs_path_free(p);
3061 return ret; 3056 return ret;
3062} 3057}
3063 3058
@@ -3070,7 +3065,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3070 struct fs_path *p; 3065 struct fs_path *p;
3071 u64 gen; 3066 u64 gen;
3072 3067
3073 p = fs_path_alloc(sctx); 3068 p = fs_path_alloc();
3074 if (!p) 3069 if (!p)
3075 return -ENOMEM; 3070 return -ENOMEM;
3076 3071
@@ -3090,7 +3085,7 @@ static int __record_deleted_ref(int num, u64 dir, int index,
3090 3085
3091out: 3086out:
3092 if (ret) 3087 if (ret)
3093 fs_path_free(sctx, p); 3088 fs_path_free(p);
3094 return ret; 3089 return ret;
3095} 3090}
3096 3091
@@ -3098,8 +3093,8 @@ static int record_new_ref(struct send_ctx *sctx)
3098{ 3093{
3099 int ret; 3094 int ret;
3100 3095
3101 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, 3096 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
3102 sctx->cmp_key, 0, __record_new_ref, sctx); 3097 sctx->cmp_key, 0, __record_new_ref, sctx);
3103 if (ret < 0) 3098 if (ret < 0)
3104 goto out; 3099 goto out;
3105 ret = 0; 3100 ret = 0;
@@ -3112,8 +3107,8 @@ static int record_deleted_ref(struct send_ctx *sctx)
3112{ 3107{
3113 int ret; 3108 int ret;
3114 3109
3115 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, 3110 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
3116 sctx->cmp_key, 0, __record_deleted_ref, sctx); 3111 sctx->cmp_key, 0, __record_deleted_ref, sctx);
3117 if (ret < 0) 3112 if (ret < 0)
3118 goto out; 3113 goto out;
3119 ret = 0; 3114 ret = 0;
@@ -3142,8 +3137,7 @@ static int __find_iref(int num, u64 dir, int index,
3142 return 0; 3137 return 0;
3143} 3138}
3144 3139
3145static int find_iref(struct send_ctx *sctx, 3140static int find_iref(struct btrfs_root *root,
3146 struct btrfs_root *root,
3147 struct btrfs_path *path, 3141 struct btrfs_path *path,
3148 struct btrfs_key *key, 3142 struct btrfs_key *key,
3149 u64 dir, struct fs_path *name) 3143 u64 dir, struct fs_path *name)
@@ -3155,7 +3149,7 @@ static int find_iref(struct send_ctx *sctx,
3155 ctx.name = name; 3149 ctx.name = name;
3156 ctx.found_idx = -1; 3150 ctx.found_idx = -1;
3157 3151
3158 ret = iterate_inode_ref(sctx, root, path, key, 0, __find_iref, &ctx); 3152 ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
3159 if (ret < 0) 3153 if (ret < 0)
3160 return ret; 3154 return ret;
3161 3155
@@ -3172,7 +3166,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
3172 int ret; 3166 int ret;
3173 struct send_ctx *sctx = ctx; 3167 struct send_ctx *sctx = ctx;
3174 3168
3175 ret = find_iref(sctx, sctx->parent_root, sctx->right_path, 3169 ret = find_iref(sctx->parent_root, sctx->right_path,
3176 sctx->cmp_key, dir, name); 3170 sctx->cmp_key, dir, name);
3177 if (ret == -ENOENT) 3171 if (ret == -ENOENT)
3178 ret = __record_new_ref(num, dir, index, name, sctx); 3172 ret = __record_new_ref(num, dir, index, name, sctx);
@@ -3189,7 +3183,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
3189 int ret; 3183 int ret;
3190 struct send_ctx *sctx = ctx; 3184 struct send_ctx *sctx = ctx;
3191 3185
3192 ret = find_iref(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, 3186 ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
3193 dir, name); 3187 dir, name);
3194 if (ret == -ENOENT) 3188 if (ret == -ENOENT)
3195 ret = __record_deleted_ref(num, dir, index, name, sctx); 3189 ret = __record_deleted_ref(num, dir, index, name, sctx);
@@ -3203,11 +3197,11 @@ static int record_changed_ref(struct send_ctx *sctx)
3203{ 3197{
3204 int ret = 0; 3198 int ret = 0;
3205 3199
3206 ret = iterate_inode_ref(sctx, sctx->send_root, sctx->left_path, 3200 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
3207 sctx->cmp_key, 0, __record_changed_new_ref, sctx); 3201 sctx->cmp_key, 0, __record_changed_new_ref, sctx);
3208 if (ret < 0) 3202 if (ret < 0)
3209 goto out; 3203 goto out;
3210 ret = iterate_inode_ref(sctx, sctx->parent_root, sctx->right_path, 3204 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
3211 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); 3205 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
3212 if (ret < 0) 3206 if (ret < 0)
3213 goto out; 3207 goto out;
@@ -3266,8 +3260,7 @@ static int process_all_refs(struct send_ctx *sctx,
3266 found_key.type != BTRFS_INODE_EXTREF_KEY)) 3260 found_key.type != BTRFS_INODE_EXTREF_KEY))
3267 break; 3261 break;
3268 3262
3269 ret = iterate_inode_ref(sctx, root, path, &found_key, 0, cb, 3263 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3270 sctx);
3271 btrfs_release_path(path); 3264 btrfs_release_path(path);
3272 if (ret < 0) 3265 if (ret < 0)
3273 goto out; 3266 goto out;
@@ -3335,7 +3328,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
3335 struct fs_path *p; 3328 struct fs_path *p;
3336 posix_acl_xattr_header dummy_acl; 3329 posix_acl_xattr_header dummy_acl;
3337 3330
3338 p = fs_path_alloc(sctx); 3331 p = fs_path_alloc();
3339 if (!p) 3332 if (!p)
3340 return -ENOMEM; 3333 return -ENOMEM;
3341 3334
@@ -3362,7 +3355,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
3362 ret = send_set_xattr(sctx, p, name, name_len, data, data_len); 3355 ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
3363 3356
3364out: 3357out:
3365 fs_path_free(sctx, p); 3358 fs_path_free(p);
3366 return ret; 3359 return ret;
3367} 3360}
3368 3361
@@ -3375,7 +3368,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3375 struct send_ctx *sctx = ctx; 3368 struct send_ctx *sctx = ctx;
3376 struct fs_path *p; 3369 struct fs_path *p;
3377 3370
3378 p = fs_path_alloc(sctx); 3371 p = fs_path_alloc();
3379 if (!p) 3372 if (!p)
3380 return -ENOMEM; 3373 return -ENOMEM;
3381 3374
@@ -3386,7 +3379,7 @@ static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
3386 ret = send_remove_xattr(sctx, p, name, name_len); 3379 ret = send_remove_xattr(sctx, p, name, name_len);
3387 3380
3388out: 3381out:
3389 fs_path_free(sctx, p); 3382 fs_path_free(p);
3390 return ret; 3383 return ret;
3391} 3384}
3392 3385
@@ -3394,8 +3387,8 @@ static int process_new_xattr(struct send_ctx *sctx)
3394{ 3387{
3395 int ret = 0; 3388 int ret = 0;
3396 3389
3397 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, 3390 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
3398 sctx->cmp_key, __process_new_xattr, sctx); 3391 sctx->cmp_key, __process_new_xattr, sctx);
3399 3392
3400 return ret; 3393 return ret;
3401} 3394}
@@ -3404,8 +3397,8 @@ static int process_deleted_xattr(struct send_ctx *sctx)
3404{ 3397{
3405 int ret; 3398 int ret;
3406 3399
3407 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, 3400 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
3408 sctx->cmp_key, __process_deleted_xattr, sctx); 3401 sctx->cmp_key, __process_deleted_xattr, sctx);
3409 3402
3410 return ret; 3403 return ret;
3411} 3404}
@@ -3429,17 +3422,15 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
3429 strncmp(name, ctx->name, name_len) == 0) { 3422 strncmp(name, ctx->name, name_len) == 0) {
3430 ctx->found_idx = num; 3423 ctx->found_idx = num;
3431 ctx->found_data_len = data_len; 3424 ctx->found_data_len = data_len;
3432 ctx->found_data = kmalloc(data_len, GFP_NOFS); 3425 ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
3433 if (!ctx->found_data) 3426 if (!ctx->found_data)
3434 return -ENOMEM; 3427 return -ENOMEM;
3435 memcpy(ctx->found_data, data, data_len);
3436 return 1; 3428 return 1;
3437 } 3429 }
3438 return 0; 3430 return 0;
3439} 3431}
3440 3432
3441static int find_xattr(struct send_ctx *sctx, 3433static int find_xattr(struct btrfs_root *root,
3442 struct btrfs_root *root,
3443 struct btrfs_path *path, 3434 struct btrfs_path *path,
3444 struct btrfs_key *key, 3435 struct btrfs_key *key,
3445 const char *name, int name_len, 3436 const char *name, int name_len,
@@ -3454,7 +3445,7 @@ static int find_xattr(struct send_ctx *sctx,
3454 ctx.found_data = NULL; 3445 ctx.found_data = NULL;
3455 ctx.found_data_len = 0; 3446 ctx.found_data_len = 0;
3456 3447
3457 ret = iterate_dir_item(sctx, root, path, key, __find_xattr, &ctx); 3448 ret = iterate_dir_item(root, path, key, __find_xattr, &ctx);
3458 if (ret < 0) 3449 if (ret < 0)
3459 return ret; 3450 return ret;
3460 3451
@@ -3480,9 +3471,9 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
3480 char *found_data = NULL; 3471 char *found_data = NULL;
3481 int found_data_len = 0; 3472 int found_data_len = 0;
3482 3473
3483 ret = find_xattr(sctx, sctx->parent_root, sctx->right_path, 3474 ret = find_xattr(sctx->parent_root, sctx->right_path,
3484 sctx->cmp_key, name, name_len, &found_data, 3475 sctx->cmp_key, name, name_len, &found_data,
3485 &found_data_len); 3476 &found_data_len);
3486 if (ret == -ENOENT) { 3477 if (ret == -ENOENT) {
3487 ret = __process_new_xattr(num, di_key, name, name_len, data, 3478 ret = __process_new_xattr(num, di_key, name, name_len, data,
3488 data_len, type, ctx); 3479 data_len, type, ctx);
@@ -3508,8 +3499,8 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
3508 int ret; 3499 int ret;
3509 struct send_ctx *sctx = ctx; 3500 struct send_ctx *sctx = ctx;
3510 3501
3511 ret = find_xattr(sctx, sctx->send_root, sctx->left_path, sctx->cmp_key, 3502 ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
3512 name, name_len, NULL, NULL); 3503 name, name_len, NULL, NULL);
3513 if (ret == -ENOENT) 3504 if (ret == -ENOENT)
3514 ret = __process_deleted_xattr(num, di_key, name, name_len, data, 3505 ret = __process_deleted_xattr(num, di_key, name, name_len, data,
3515 data_len, type, ctx); 3506 data_len, type, ctx);
@@ -3523,11 +3514,11 @@ static int process_changed_xattr(struct send_ctx *sctx)
3523{ 3514{
3524 int ret = 0; 3515 int ret = 0;
3525 3516
3526 ret = iterate_dir_item(sctx, sctx->send_root, sctx->left_path, 3517 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
3527 sctx->cmp_key, __process_changed_new_xattr, sctx); 3518 sctx->cmp_key, __process_changed_new_xattr, sctx);
3528 if (ret < 0) 3519 if (ret < 0)
3529 goto out; 3520 goto out;
3530 ret = iterate_dir_item(sctx, sctx->parent_root, sctx->right_path, 3521 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
3531 sctx->cmp_key, __process_changed_deleted_xattr, sctx); 3522 sctx->cmp_key, __process_changed_deleted_xattr, sctx);
3532 3523
3533out: 3524out:
@@ -3572,8 +3563,8 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3572 goto out; 3563 goto out;
3573 } 3564 }
3574 3565
3575 ret = iterate_dir_item(sctx, root, path, &found_key, 3566 ret = iterate_dir_item(root, path, &found_key,
3576 __process_new_xattr, sctx); 3567 __process_new_xattr, sctx);
3577 if (ret < 0) 3568 if (ret < 0)
3578 goto out; 3569 goto out;
3579 3570
@@ -3598,7 +3589,7 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
3598 int num_read = 0; 3589 int num_read = 0;
3599 mm_segment_t old_fs; 3590 mm_segment_t old_fs;
3600 3591
3601 p = fs_path_alloc(sctx); 3592 p = fs_path_alloc();
3602 if (!p) 3593 if (!p)
3603 return -ENOMEM; 3594 return -ENOMEM;
3604 3595
@@ -3640,7 +3631,7 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
3640 3631
3641tlv_put_failure: 3632tlv_put_failure:
3642out: 3633out:
3643 fs_path_free(sctx, p); 3634 fs_path_free(p);
3644 set_fs(old_fs); 3635 set_fs(old_fs);
3645 if (ret < 0) 3636 if (ret < 0)
3646 return ret; 3637 return ret;
@@ -3663,7 +3654,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3663 clone_root->root->objectid, clone_root->ino, 3654 clone_root->root->objectid, clone_root->ino,
3664 clone_root->offset); 3655 clone_root->offset);
3665 3656
3666 p = fs_path_alloc(sctx); 3657 p = fs_path_alloc();
3667 if (!p) 3658 if (!p)
3668 return -ENOMEM; 3659 return -ENOMEM;
3669 3660
@@ -3686,8 +3677,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3686 goto out; 3677 goto out;
3687 ret = get_cur_path(sctx, clone_root->ino, gen, p); 3678 ret = get_cur_path(sctx, clone_root->ino, gen, p);
3688 } else { 3679 } else {
3689 ret = get_inode_path(sctx, clone_root->root, 3680 ret = get_inode_path(clone_root->root, clone_root->ino, p);
3690 clone_root->ino, p);
3691 } 3681 }
3692 if (ret < 0) 3682 if (ret < 0)
3693 goto out; 3683 goto out;
@@ -3704,7 +3694,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3704 3694
3705tlv_put_failure: 3695tlv_put_failure:
3706out: 3696out:
3707 fs_path_free(sctx, p); 3697 fs_path_free(p);
3708 return ret; 3698 return ret;
3709} 3699}
3710 3700
@@ -3717,7 +3707,7 @@ static int send_update_extent(struct send_ctx *sctx,
3717 int ret = 0; 3707 int ret = 0;
3718 struct fs_path *p; 3708 struct fs_path *p;
3719 3709
3720 p = fs_path_alloc(sctx); 3710 p = fs_path_alloc();
3721 if (!p) 3711 if (!p)
3722 return -ENOMEM; 3712 return -ENOMEM;
3723 3713
@@ -3737,7 +3727,7 @@ static int send_update_extent(struct send_ctx *sctx,
3737 3727
3738tlv_put_failure: 3728tlv_put_failure:
3739out: 3729out:
3740 fs_path_free(sctx, p); 3730 fs_path_free(p);
3741 return ret; 3731 return ret;
3742} 3732}
3743 3733
@@ -4579,6 +4569,41 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4579 send_root = BTRFS_I(file_inode(mnt_file))->root; 4569 send_root = BTRFS_I(file_inode(mnt_file))->root;
4580 fs_info = send_root->fs_info; 4570 fs_info = send_root->fs_info;
4581 4571
4572 /*
4573 * This is done when we look up the root; it should already be complete
4574 * by the time we get here.
4575 */
4576 WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
4577
4578 /*
4579 * If we just created this root, we need to make sure that the orphan
4580 * cleanup has been done and committed, since we search the commit root.
4581 * So check its commit root transid against our otransid and, if they match,
4582 * commit the transaction to make sure everything is updated.
4583 */
4584 down_read(&send_root->fs_info->extent_commit_sem);
4585 if (btrfs_header_generation(send_root->commit_root) ==
4586 btrfs_root_otransid(&send_root->root_item)) {
4587 struct btrfs_trans_handle *trans;
4588
4589 up_read(&send_root->fs_info->extent_commit_sem);
4590
4591 trans = btrfs_attach_transaction_barrier(send_root);
4592 if (IS_ERR(trans)) {
4593 if (PTR_ERR(trans) != -ENOENT) {
4594 ret = PTR_ERR(trans);
4595 goto out;
4596 }
4597 /* ENOENT means there's no transaction */
4598 } else {
4599 ret = btrfs_commit_transaction(trans, send_root);
4600 if (ret)
4601 goto out;
4602 }
4603 } else {
4604 up_read(&send_root->fs_info->extent_commit_sem);
4605 }
4606
4582 arg = memdup_user(arg_, sizeof(*arg)); 4607 arg = memdup_user(arg_, sizeof(*arg));
4583 if (IS_ERR(arg)) { 4608 if (IS_ERR(arg)) {
4584 ret = PTR_ERR(arg); 4609 ret = PTR_ERR(arg);
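
The block added above boils down to "commit the running transaction if the root's orphan cleanup may only exist in a not-yet-committed transaction". The attach-or-skip part is a small pattern of its own (hypothetical helper name; the btrfs_* calls are the real 3.11 API):

	static int demo_commit_running_transaction(struct btrfs_root *root)
	{
		struct btrfs_trans_handle *trans;

		trans = btrfs_attach_transaction_barrier(root);
		if (IS_ERR(trans)) {
			/* -ENOENT just means nothing is running */
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			return PTR_ERR(trans);
		}
		return btrfs_commit_transaction(trans, root);
	}
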
@@ -4663,10 +4688,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4663 key.type = BTRFS_ROOT_ITEM_KEY; 4688 key.type = BTRFS_ROOT_ITEM_KEY;
4664 key.offset = (u64)-1; 4689 key.offset = (u64)-1;
4665 clone_root = btrfs_read_fs_root_no_name(fs_info, &key); 4690 clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
4666 if (!clone_root) {
4667 ret = -EINVAL;
4668 goto out;
4669 }
4670 if (IS_ERR(clone_root)) { 4691 if (IS_ERR(clone_root)) {
4671 ret = PTR_ERR(clone_root); 4692 ret = PTR_ERR(clone_root);
4672 goto out; 4693 goto out;
@@ -4682,8 +4703,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4682 key.type = BTRFS_ROOT_ITEM_KEY; 4703 key.type = BTRFS_ROOT_ITEM_KEY;
4683 key.offset = (u64)-1; 4704 key.offset = (u64)-1;
4684 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); 4705 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
4685 if (!sctx->parent_root) { 4706 if (IS_ERR(sctx->parent_root)) {
4686 ret = -EINVAL; 4707 ret = PTR_ERR(sctx->parent_root);
4687 goto out; 4708 goto out;
4688 } 4709 }
4689 } 4710 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f0857e092a3c..8eb6191d86da 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,6 @@
51#include "print-tree.h" 51#include "print-tree.h"
52#include "xattr.h" 52#include "xattr.h"
53#include "volumes.h" 53#include "volumes.h"
54#include "version.h"
55#include "export.h" 54#include "export.h"
56#include "compression.h" 55#include "compression.h"
57#include "rcu-string.h" 56#include "rcu-string.h"
@@ -266,6 +265,9 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
266 return; 265 return;
267 } 266 }
268 ACCESS_ONCE(trans->transaction->aborted) = errno; 267 ACCESS_ONCE(trans->transaction->aborted) = errno;
268 /* Wake up anybody who may be waiting on this transaction */
269 wake_up(&root->fs_info->transaction_wait);
270 wake_up(&root->fs_info->transaction_blocked_wait);
269 __btrfs_std_error(root->fs_info, function, line, errno, NULL); 271 __btrfs_std_error(root->fs_info, function, line, errno, NULL);
270} 272}
271/* 273/*
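
These wake_up() calls matter because a task sleeping in wait_event() only re-tests its condition when its queue is kicked; pairing the abort with an ->aborted clause in the condition, as wait_current_trans() does in the transaction.c hunk later in this merge, is what keeps sleepers from hanging on a dead transaction:

	/* schematic waiter, mirroring wait_current_trans() below */
	wait_event(root->fs_info->transaction_wait,
		   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
		   cur_trans->aborted);
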
@@ -776,9 +778,6 @@ find_root:
776 if (IS_ERR(new_root)) 778 if (IS_ERR(new_root))
777 return ERR_CAST(new_root); 779 return ERR_CAST(new_root);
778 780
779 if (btrfs_root_refs(&new_root->root_item) == 0)
780 return ERR_PTR(-ENOENT);
781
782 dir_id = btrfs_root_dirid(&new_root->root_item); 781 dir_id = btrfs_root_dirid(&new_root->root_item);
783setup_root: 782setup_root:
784 location.objectid = dir_id; 783 location.objectid = dir_id;
@@ -866,7 +865,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
866 return 0; 865 return 0;
867 } 866 }
868 867
869 btrfs_wait_ordered_extents(root, 1); 868 btrfs_wait_all_ordered_extents(fs_info, 1);
870 869
871 trans = btrfs_attach_transaction_barrier(root); 870 trans = btrfs_attach_transaction_barrier(root);
872 if (IS_ERR(trans)) { 871 if (IS_ERR(trans)) {
@@ -1685,6 +1684,18 @@ static void btrfs_interface_exit(void)
1685 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); 1684 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
1686} 1685}
1687 1686
1687static void btrfs_print_info(void)
1688{
1689 printk(KERN_INFO "Btrfs loaded"
1690#ifdef CONFIG_BTRFS_DEBUG
1691 ", debug=on"
1692#endif
1693#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1694 ", integrity-checker=on"
1695#endif
1696 "\n");
1697}
1698
1688static int __init init_btrfs_fs(void) 1699static int __init init_btrfs_fs(void)
1689{ 1700{
1690 int err; 1701 int err;
@@ -1733,11 +1744,9 @@ static int __init init_btrfs_fs(void)
1733 1744
1734 btrfs_init_lockdep(); 1745 btrfs_init_lockdep();
1735 1746
1736#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1747 btrfs_print_info();
1737 btrfs_test_free_space_cache(); 1748 btrfs_test_free_space_cache();
1738#endif
1739 1749
1740 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
1741 return 0; 1750 return 0;
1742 1751
1743unregister_ioctl: 1752unregister_ioctl:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0544587d74f4..af1931a5960d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -34,12 +34,43 @@
34 34
35#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
36 36
37static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
38 [TRANS_STATE_RUNNING] = 0U,
39 [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
40 __TRANS_START),
41 [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
42 __TRANS_START |
43 __TRANS_ATTACH),
44 [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
45 __TRANS_START |
46 __TRANS_ATTACH |
47 __TRANS_JOIN),
48 [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
49 __TRANS_START |
50 __TRANS_ATTACH |
51 __TRANS_JOIN |
52 __TRANS_JOIN_NOLOCK),
53 [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
54 __TRANS_START |
55 __TRANS_ATTACH |
56 __TRANS_JOIN |
57 __TRANS_JOIN_NOLOCK),
58};
59
37static void put_transaction(struct btrfs_transaction *transaction) 60static void put_transaction(struct btrfs_transaction *transaction)
38{ 61{
39 WARN_ON(atomic_read(&transaction->use_count) == 0); 62 WARN_ON(atomic_read(&transaction->use_count) == 0);
40 if (atomic_dec_and_test(&transaction->use_count)) { 63 if (atomic_dec_and_test(&transaction->use_count)) {
41 BUG_ON(!list_empty(&transaction->list)); 64 BUG_ON(!list_empty(&transaction->list));
42 WARN_ON(transaction->delayed_refs.root.rb_node); 65 WARN_ON(transaction->delayed_refs.root.rb_node);
66 while (!list_empty(&transaction->pending_chunks)) {
67 struct extent_map *em;
68
69 em = list_first_entry(&transaction->pending_chunks,
70 struct extent_map, list);
71 list_del_init(&em->list);
72 free_extent_map(em);
73 }
43 kmem_cache_free(btrfs_transaction_cachep, transaction); 74 kmem_cache_free(btrfs_transaction_cachep, transaction);
44 } 75 }
45} 76}
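
The new table makes "may this handle type join the running transaction?" a single mask test (see the join_transaction() hunk below). A sketch of that test plus worked examples read straight off the table (helper name is illustrative):

	static inline bool demo_trans_blocked(unsigned int state,
					      unsigned int type)
	{
		return (btrfs_blocked_trans_types[state] & type) != 0;
	}

	/*
	 * demo_trans_blocked(TRANS_STATE_COMMIT_START, __TRANS_JOIN)        -> false
	 * demo_trans_blocked(TRANS_STATE_COMMIT_DOING, __TRANS_JOIN)        -> true
	 * demo_trans_blocked(TRANS_STATE_COMMIT_DOING, __TRANS_JOIN_NOLOCK) -> false
	 */
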
@@ -50,18 +81,35 @@ static noinline void switch_commit_root(struct btrfs_root *root)
50 root->commit_root = btrfs_root_node(root); 81 root->commit_root = btrfs_root_node(root);
51} 82}
52 83
53static inline int can_join_transaction(struct btrfs_transaction *trans, 84static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
54 int type) 85 unsigned int type)
86{
87 if (type & TRANS_EXTWRITERS)
88 atomic_inc(&trans->num_extwriters);
89}
90
91static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
92 unsigned int type)
93{
94 if (type & TRANS_EXTWRITERS)
95 atomic_dec(&trans->num_extwriters);
96}
97
98static inline void extwriter_counter_init(struct btrfs_transaction *trans,
99 unsigned int type)
100{
101 atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
102}
103
104static inline int extwriter_counter_read(struct btrfs_transaction *trans)
55{ 105{
56 return !(trans->in_commit && 106 return atomic_read(&trans->num_extwriters);
57 type != TRANS_JOIN &&
58 type != TRANS_JOIN_NOLOCK);
59} 107}
60 108
61/* 109/*
62 * either allocate a new transaction or hop into the existing one 110 * either allocate a new transaction or hop into the existing one
63 */ 111 */
64static noinline int join_transaction(struct btrfs_root *root, int type) 112static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
65{ 113{
66 struct btrfs_transaction *cur_trans; 114 struct btrfs_transaction *cur_trans;
67 struct btrfs_fs_info *fs_info = root->fs_info; 115 struct btrfs_fs_info *fs_info = root->fs_info;
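
The extwriter counter only tracks handle types in TRANS_EXTWRITERS, i.e. writers entering from outside the commit path; internal JOIN/ATTACH handles are deliberately excluded. A sketch of the intended consumer, assuming the commit path drains external writers before flushing (the exact call site lies outside this hunk):

	/* wait until every external writer has detached from the trans */
	wait_event(cur_trans->writer_wait,
		   extwriter_counter_read(cur_trans) == 0);
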
@@ -74,32 +122,19 @@ loop:
74 return -EROFS; 122 return -EROFS;
75 } 123 }
76 124
77 if (fs_info->trans_no_join) {
78 /*
79 * If we are JOIN_NOLOCK we're already committing a current
80 * transaction, we just need a handle to deal with something
81 * when committing the transaction, such as inode cache and
82 * space cache. It is a special case.
83 */
84 if (type != TRANS_JOIN_NOLOCK) {
85 spin_unlock(&fs_info->trans_lock);
86 return -EBUSY;
87 }
88 }
89
90 cur_trans = fs_info->running_transaction; 125 cur_trans = fs_info->running_transaction;
91 if (cur_trans) { 126 if (cur_trans) {
92 if (cur_trans->aborted) { 127 if (cur_trans->aborted) {
93 spin_unlock(&fs_info->trans_lock); 128 spin_unlock(&fs_info->trans_lock);
94 return cur_trans->aborted; 129 return cur_trans->aborted;
95 } 130 }
96 if (!can_join_transaction(cur_trans, type)) { 131 if (btrfs_blocked_trans_types[cur_trans->state] & type) {
97 spin_unlock(&fs_info->trans_lock); 132 spin_unlock(&fs_info->trans_lock);
98 return -EBUSY; 133 return -EBUSY;
99 } 134 }
100 atomic_inc(&cur_trans->use_count); 135 atomic_inc(&cur_trans->use_count);
101 atomic_inc(&cur_trans->num_writers); 136 atomic_inc(&cur_trans->num_writers);
102 cur_trans->num_joined++; 137 extwriter_counter_inc(cur_trans, type);
103 spin_unlock(&fs_info->trans_lock); 138 spin_unlock(&fs_info->trans_lock);
104 return 0; 139 return 0;
105 } 140 }
@@ -112,6 +147,12 @@ loop:
112 if (type == TRANS_ATTACH) 147 if (type == TRANS_ATTACH)
113 return -ENOENT; 148 return -ENOENT;
114 149
150 /*
151 * JOIN_NOLOCK only happens during the transaction commit, so
152 * it is impossible that ->running_transaction is NULL
153 */
154 BUG_ON(type == TRANS_JOIN_NOLOCK);
155
115 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 156 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
116 if (!cur_trans) 157 if (!cur_trans)
117 return -ENOMEM; 158 return -ENOMEM;
@@ -120,7 +161,7 @@ loop:
120 if (fs_info->running_transaction) { 161 if (fs_info->running_transaction) {
121 /* 162 /*
122 * someone started a transaction after we unlocked. Make sure 163 * someone started a transaction after we unlocked. Make sure
123 * to redo the trans_no_join checks above 164 * to redo the checks above
124 */ 165 */
125 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 166 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
126 goto loop; 167 goto loop;
@@ -131,17 +172,15 @@ loop:
131 } 172 }
132 173
133 atomic_set(&cur_trans->num_writers, 1); 174 atomic_set(&cur_trans->num_writers, 1);
134 cur_trans->num_joined = 0; 175 extwriter_counter_init(cur_trans, type);
135 init_waitqueue_head(&cur_trans->writer_wait); 176 init_waitqueue_head(&cur_trans->writer_wait);
136 init_waitqueue_head(&cur_trans->commit_wait); 177 init_waitqueue_head(&cur_trans->commit_wait);
137 cur_trans->in_commit = 0; 178 cur_trans->state = TRANS_STATE_RUNNING;
138 cur_trans->blocked = 0;
139 /* 179 /*
140 * One for this trans handle, one so it will live on until we 180 * One for this trans handle, one so it will live on until we
141 * commit the transaction. 181 * commit the transaction.
142 */ 182 */
143 atomic_set(&cur_trans->use_count, 2); 183 atomic_set(&cur_trans->use_count, 2);
144 cur_trans->commit_done = 0;
145 cur_trans->start_time = get_seconds(); 184 cur_trans->start_time = get_seconds();
146 185
147 cur_trans->delayed_refs.root = RB_ROOT; 186 cur_trans->delayed_refs.root = RB_ROOT;
@@ -164,7 +203,6 @@ loop:
164 "creating a fresh transaction\n"); 203 "creating a fresh transaction\n");
165 atomic64_set(&fs_info->tree_mod_seq, 0); 204 atomic64_set(&fs_info->tree_mod_seq, 0);
166 205
167 spin_lock_init(&cur_trans->commit_lock);
168 spin_lock_init(&cur_trans->delayed_refs.lock); 206 spin_lock_init(&cur_trans->delayed_refs.lock);
169 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); 207 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
170 atomic_set(&cur_trans->delayed_refs.ref_seq, 0); 208 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
@@ -172,6 +210,7 @@ loop:
172 210
173 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 211 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
174 INIT_LIST_HEAD(&cur_trans->ordered_operations); 212 INIT_LIST_HEAD(&cur_trans->ordered_operations);
213 INIT_LIST_HEAD(&cur_trans->pending_chunks);
175 list_add_tail(&cur_trans->list, &fs_info->trans_list); 214 list_add_tail(&cur_trans->list, &fs_info->trans_list);
176 extent_io_tree_init(&cur_trans->dirty_pages, 215 extent_io_tree_init(&cur_trans->dirty_pages,
177 fs_info->btree_inode->i_mapping); 216 fs_info->btree_inode->i_mapping);
@@ -269,6 +308,13 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
269 return 0; 308 return 0;
270} 309}
271 310
311static inline int is_transaction_blocked(struct btrfs_transaction *trans)
312{
313 return (trans->state >= TRANS_STATE_BLOCKED &&
314 trans->state < TRANS_STATE_UNBLOCKED &&
315 !trans->aborted);
316}
317
272/* wait for commit against the current transaction to become unblocked 318/* wait for commit against the current transaction to become unblocked
273 * when this is done, it is safe to start a new transaction, but the current 319 * when this is done, it is safe to start a new transaction, but the current
274 * transaction might not be fully on disk. 320 * transaction might not be fully on disk.
@@ -279,12 +325,13 @@ static void wait_current_trans(struct btrfs_root *root)
279 325
280 spin_lock(&root->fs_info->trans_lock); 326 spin_lock(&root->fs_info->trans_lock);
281 cur_trans = root->fs_info->running_transaction; 327 cur_trans = root->fs_info->running_transaction;
282 if (cur_trans && cur_trans->blocked) { 328 if (cur_trans && is_transaction_blocked(cur_trans)) {
283 atomic_inc(&cur_trans->use_count); 329 atomic_inc(&cur_trans->use_count);
284 spin_unlock(&root->fs_info->trans_lock); 330 spin_unlock(&root->fs_info->trans_lock);
285 331
286 wait_event(root->fs_info->transaction_wait, 332 wait_event(root->fs_info->transaction_wait,
287 !cur_trans->blocked); 333 cur_trans->state >= TRANS_STATE_UNBLOCKED ||
334 cur_trans->aborted);
288 put_transaction(cur_trans); 335 put_transaction(cur_trans);
289 } else { 336 } else {
290 spin_unlock(&root->fs_info->trans_lock); 337 spin_unlock(&root->fs_info->trans_lock);
@@ -307,7 +354,7 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
307} 354}
308 355
309static struct btrfs_trans_handle * 356static struct btrfs_trans_handle *
310start_transaction(struct btrfs_root *root, u64 num_items, int type, 357start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
311 enum btrfs_reserve_flush_enum flush) 358 enum btrfs_reserve_flush_enum flush)
312{ 359{
313 struct btrfs_trans_handle *h; 360 struct btrfs_trans_handle *h;
@@ -320,7 +367,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
320 return ERR_PTR(-EROFS); 367 return ERR_PTR(-EROFS);
321 368
322 if (current->journal_info) { 369 if (current->journal_info) {
323 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 370 WARN_ON(type & TRANS_EXTWRITERS);
324 h = current->journal_info; 371 h = current->journal_info;
325 h->use_count++; 372 h->use_count++;
326 WARN_ON(h->use_count > 2); 373 WARN_ON(h->use_count > 2);
@@ -366,7 +413,7 @@ again:
366 * If we are ATTACH, it means we just want to catch the current 413 * If we are ATTACH, it means we just want to catch the current
367 * transaction and commit it, so we needn't do sb_start_intwrite(). 414 * transaction and commit it, so we needn't do sb_start_intwrite().
368 */ 415 */
369 if (type < TRANS_JOIN_NOLOCK) 416 if (type & __TRANS_FREEZABLE)
370 sb_start_intwrite(root->fs_info->sb); 417 sb_start_intwrite(root->fs_info->sb);
371 418
372 if (may_wait_transaction(root, type)) 419 if (may_wait_transaction(root, type))
@@ -408,7 +455,8 @@ again:
408 INIT_LIST_HEAD(&h->new_bgs); 455 INIT_LIST_HEAD(&h->new_bgs);
409 456
410 smp_mb(); 457 smp_mb();
411 if (cur_trans->blocked && may_wait_transaction(root, type)) { 458 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
459 may_wait_transaction(root, type)) {
412 btrfs_commit_transaction(h, root); 460 btrfs_commit_transaction(h, root);
413 goto again; 461 goto again;
414 } 462 }
@@ -429,7 +477,7 @@ got_it:
429 return h; 477 return h;
430 478
431join_fail: 479join_fail:
432 if (type < TRANS_JOIN_NOLOCK) 480 if (type & __TRANS_FREEZABLE)
433 sb_end_intwrite(root->fs_info->sb); 481 sb_end_intwrite(root->fs_info->sb);
434 kmem_cache_free(btrfs_trans_handle_cachep, h); 482 kmem_cache_free(btrfs_trans_handle_cachep, h);
435alloc_fail: 483alloc_fail:
@@ -490,7 +538,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
490} 538}
491 539
492/* 540/*
493 * btrfs_attach_transaction() - catch the running transaction 541 * btrfs_attach_transaction_barrier() - catch the running transaction
494 * 542 *
495 * It is similar to the above function, the difference is that this one 543 * It is similar to the above function, the difference is that this one
496 * will wait for all the inactive transactions until they fully 544 * will wait for all the inactive transactions until they fully
@@ -512,7 +560,7 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
512static noinline void wait_for_commit(struct btrfs_root *root, 560static noinline void wait_for_commit(struct btrfs_root *root,
513 struct btrfs_transaction *commit) 561 struct btrfs_transaction *commit)
514{ 562{
515 wait_event(commit->commit_wait, commit->commit_done); 563 wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
516} 564}
517 565
518int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 566int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -548,8 +596,8 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
548 spin_lock(&root->fs_info->trans_lock); 596 spin_lock(&root->fs_info->trans_lock);
549 list_for_each_entry_reverse(t, &root->fs_info->trans_list, 597 list_for_each_entry_reverse(t, &root->fs_info->trans_list,
550 list) { 598 list) {
551 if (t->in_commit) { 599 if (t->state >= TRANS_STATE_COMMIT_START) {
552 if (t->commit_done) 600 if (t->state == TRANS_STATE_COMPLETED)
553 break; 601 break;
554 cur_trans = t; 602 cur_trans = t;
555 atomic_inc(&cur_trans->use_count); 603 atomic_inc(&cur_trans->use_count);
@@ -576,10 +624,11 @@ void btrfs_throttle(struct btrfs_root *root)
576static int should_end_transaction(struct btrfs_trans_handle *trans, 624static int should_end_transaction(struct btrfs_trans_handle *trans,
577 struct btrfs_root *root) 625 struct btrfs_root *root)
578{ 626{
579 int ret; 627 if (root->fs_info->global_block_rsv.space_info->full &&
628 btrfs_should_throttle_delayed_refs(trans, root))
629 return 1;
580 630
581 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); 631 return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
582 return ret ? 1 : 0;
583} 632}
584 633
585int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, 634int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
@@ -590,7 +639,8 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
590 int err; 639 int err;
591 640
592 smp_mb(); 641 smp_mb();
593 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 642 if (cur_trans->state >= TRANS_STATE_BLOCKED ||
643 cur_trans->delayed_refs.flushing)
594 return 1; 644 return 1;
595 645
596 updates = trans->delayed_ref_updates; 646 updates = trans->delayed_ref_updates;
@@ -609,7 +659,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
609{ 659{
610 struct btrfs_transaction *cur_trans = trans->transaction; 660 struct btrfs_transaction *cur_trans = trans->transaction;
611 struct btrfs_fs_info *info = root->fs_info; 661 struct btrfs_fs_info *info = root->fs_info;
612 int count = 0; 662 unsigned long cur = trans->delayed_ref_updates;
613 int lock = (trans->type != TRANS_JOIN_NOLOCK); 663 int lock = (trans->type != TRANS_JOIN_NOLOCK);
614 int err = 0; 664 int err = 0;
615 665
@@ -638,17 +688,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
638 if (!list_empty(&trans->new_bgs)) 688 if (!list_empty(&trans->new_bgs))
639 btrfs_create_pending_block_groups(trans, root); 689 btrfs_create_pending_block_groups(trans, root);
640 690
641 while (count < 1) { 691 trans->delayed_ref_updates = 0;
642 unsigned long cur = trans->delayed_ref_updates; 692 if (btrfs_should_throttle_delayed_refs(trans, root)) {
693 cur = max_t(unsigned long, cur, 1);
643 trans->delayed_ref_updates = 0; 694 trans->delayed_ref_updates = 0;
644 if (cur && 695 btrfs_run_delayed_refs(trans, root, cur);
645 trans->transaction->delayed_refs.num_heads_ready > 64) {
646 trans->delayed_ref_updates = 0;
647 btrfs_run_delayed_refs(trans, root, cur);
648 } else {
649 break;
650 }
651 count++;
652 } 696 }
653 697
654 btrfs_trans_release_metadata(trans, root); 698 btrfs_trans_release_metadata(trans, root);
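The hunk above collapses the old single-pass loop, which ran delayed refs only when more than 64 heads were ready, into one throttle-driven pass. A condensed sketch of the new flow with commentary, using only names from this diff:

        unsigned long cur = trans->delayed_ref_updates; /* refs queued by this handle */

        trans->delayed_ref_updates = 0;
        if (btrfs_should_throttle_delayed_refs(trans, root)) {
                cur = max_t(unsigned long, cur, 1); /* always make some progress */
                btrfs_run_delayed_refs(trans, root, cur);
        }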
@@ -658,12 +702,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
658 btrfs_create_pending_block_groups(trans, root); 702 btrfs_create_pending_block_groups(trans, root);
659 703
660 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 704 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
661 should_end_transaction(trans, root)) { 705 should_end_transaction(trans, root) &&
662 trans->transaction->blocked = 1; 706 ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
663 smp_wmb(); 707 spin_lock(&info->trans_lock);
708 if (cur_trans->state == TRANS_STATE_RUNNING)
709 cur_trans->state = TRANS_STATE_BLOCKED;
710 spin_unlock(&info->trans_lock);
664 } 711 }
665 712
666 if (lock && cur_trans->blocked && !cur_trans->in_commit) { 713 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
667 if (throttle) { 714 if (throttle) {
668 /* 715 /*
669 * We may race with somebody else here so end up having 716 * We may race with somebody else here so end up having
@@ -677,12 +724,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
677 } 724 }
678 } 725 }
679 726
680 if (trans->type < TRANS_JOIN_NOLOCK) 727 if (trans->type & __TRANS_FREEZABLE)
681 sb_end_intwrite(root->fs_info->sb); 728 sb_end_intwrite(root->fs_info->sb);
682 729
683 WARN_ON(cur_trans != info->running_transaction); 730 WARN_ON(cur_trans != info->running_transaction);
684 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 731 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
685 atomic_dec(&cur_trans->num_writers); 732 atomic_dec(&cur_trans->num_writers);
733 extwriter_counter_dec(cur_trans, trans->type);
686 734
687 smp_mb(); 735 smp_mb();
688 if (waitqueue_active(&cur_trans->writer_wait)) 736 if (waitqueue_active(&cur_trans->writer_wait))
@@ -736,9 +784,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
736 struct extent_state *cached_state = NULL; 784 struct extent_state *cached_state = NULL;
737 u64 start = 0; 785 u64 start = 0;
738 u64 end; 786 u64 end;
739 struct blk_plug plug;
740 787
741 blk_start_plug(&plug);
742 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 788 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
743 mark, &cached_state)) { 789 mark, &cached_state)) {
744 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 790 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -752,7 +798,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
752 } 798 }
753 if (err) 799 if (err)
754 werr = err; 800 werr = err;
755 blk_finish_plug(&plug);
756 return werr; 801 return werr;
757} 802}
758 803
@@ -797,8 +842,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
797{ 842{
798 int ret; 843 int ret;
799 int ret2; 844 int ret2;
845 struct blk_plug plug;
800 846
847 blk_start_plug(&plug);
801 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 848 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
849 blk_finish_plug(&plug);
802 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 850 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
803 851
804 if (ret) 852 if (ret)
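These two hunks move the blk_plug out of btrfs_write_marked_extents() and into this caller, so the plug covers only bio submission and is flushed before anyone waits on the pages. The underlying pattern, as a standalone sketch (submit_io() and wait_io() are hypothetical stand-ins):

        struct blk_plug plug;

        blk_start_plug(&plug);  /* start batching bios built by this task */
        submit_io();            /* submission only; never sleep under the plug */
        blk_finish_plug(&plug); /* hand the batched bios to the device */
        wait_io();              /* now it is safe to wait for completion */

The tree-log.c changes later in this diff depend on this split: btrfs_sync_log() holds a single plug across the writes of both the log tree and the log root tree before waiting on either.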
@@ -935,12 +983,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
935 * a dirty root struct and adds it into the list of dead roots that need to 983 * a dirty root struct and adds it into the list of dead roots that need to
936 * be deleted 984 * be deleted
937 */ 985 */
938int btrfs_add_dead_root(struct btrfs_root *root) 986void btrfs_add_dead_root(struct btrfs_root *root)
939{ 987{
940 spin_lock(&root->fs_info->trans_lock); 988 spin_lock(&root->fs_info->trans_lock);
941 list_add_tail(&root->root_list, &root->fs_info->dead_roots); 989 if (list_empty(&root->root_list))
990 list_add_tail(&root->root_list, &root->fs_info->dead_roots);
942 spin_unlock(&root->fs_info->trans_lock); 991 spin_unlock(&root->fs_info->trans_lock);
943 return 0;
944} 992}
945 993
946/* 994/*
@@ -1318,20 +1366,26 @@ static void update_super_roots(struct btrfs_root *root)
1318 1366
1319int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1367int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
1320{ 1368{
1369 struct btrfs_transaction *trans;
1321 int ret = 0; 1370 int ret = 0;
1371
1322 spin_lock(&info->trans_lock); 1372 spin_lock(&info->trans_lock);
1323 if (info->running_transaction) 1373 trans = info->running_transaction;
1324 ret = info->running_transaction->in_commit; 1374 if (trans)
1375 ret = (trans->state >= TRANS_STATE_COMMIT_START);
1325 spin_unlock(&info->trans_lock); 1376 spin_unlock(&info->trans_lock);
1326 return ret; 1377 return ret;
1327} 1378}
1328 1379
1329int btrfs_transaction_blocked(struct btrfs_fs_info *info) 1380int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1330{ 1381{
1382 struct btrfs_transaction *trans;
1331 int ret = 0; 1383 int ret = 0;
1384
1332 spin_lock(&info->trans_lock); 1385 spin_lock(&info->trans_lock);
1333 if (info->running_transaction) 1386 trans = info->running_transaction;
1334 ret = info->running_transaction->blocked; 1387 if (trans)
1388 ret = is_transaction_blocked(trans);
1335 spin_unlock(&info->trans_lock); 1389 spin_unlock(&info->trans_lock);
1336 return ret; 1390 return ret;
1337} 1391}
@@ -1343,7 +1397,9 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1343static void wait_current_trans_commit_start(struct btrfs_root *root, 1397static void wait_current_trans_commit_start(struct btrfs_root *root,
1344 struct btrfs_transaction *trans) 1398 struct btrfs_transaction *trans)
1345{ 1399{
1346 wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); 1400 wait_event(root->fs_info->transaction_blocked_wait,
1401 trans->state >= TRANS_STATE_COMMIT_START ||
1402 trans->aborted);
1347} 1403}
1348 1404
1349/* 1405/*
@@ -1354,7 +1410,8 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1354 struct btrfs_transaction *trans) 1410 struct btrfs_transaction *trans)
1355{ 1411{
1356 wait_event(root->fs_info->transaction_wait, 1412 wait_event(root->fs_info->transaction_wait,
1357 trans->commit_done || (trans->in_commit && !trans->blocked)); 1413 trans->state >= TRANS_STATE_UNBLOCKED ||
1414 trans->aborted);
1358} 1415}
1359 1416
1360/* 1417/*
@@ -1450,26 +1507,31 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1450 1507
1451 spin_lock(&root->fs_info->trans_lock); 1508 spin_lock(&root->fs_info->trans_lock);
1452 1509
1453 if (list_empty(&cur_trans->list)) { 1510 /*
1454 spin_unlock(&root->fs_info->trans_lock); 1511 * If the transaction is removed from the list, it means this
1455 btrfs_end_transaction(trans, root); 1512 * transaction has been committed successfully, so the cleanup
1456 return; 1513 * function must never be called for it.
1457 } 1514 */
1515 BUG_ON(list_empty(&cur_trans->list));
1458 1516
1459 list_del_init(&cur_trans->list); 1517 list_del_init(&cur_trans->list);
1460 if (cur_trans == root->fs_info->running_transaction) { 1518 if (cur_trans == root->fs_info->running_transaction) {
1461 root->fs_info->trans_no_join = 1; 1519 cur_trans->state = TRANS_STATE_COMMIT_DOING;
1462 spin_unlock(&root->fs_info->trans_lock); 1520 spin_unlock(&root->fs_info->trans_lock);
1463 wait_event(cur_trans->writer_wait, 1521 wait_event(cur_trans->writer_wait,
1464 atomic_read(&cur_trans->num_writers) == 1); 1522 atomic_read(&cur_trans->num_writers) == 1);
1465 1523
1466 spin_lock(&root->fs_info->trans_lock); 1524 spin_lock(&root->fs_info->trans_lock);
1467 root->fs_info->running_transaction = NULL;
1468 } 1525 }
1469 spin_unlock(&root->fs_info->trans_lock); 1526 spin_unlock(&root->fs_info->trans_lock);
1470 1527
1471 btrfs_cleanup_one_transaction(trans->transaction, root); 1528 btrfs_cleanup_one_transaction(trans->transaction, root);
1472 1529
1530 spin_lock(&root->fs_info->trans_lock);
1531 if (cur_trans == root->fs_info->running_transaction)
1532 root->fs_info->running_transaction = NULL;
1533 spin_unlock(&root->fs_info->trans_lock);
1534
1473 put_transaction(cur_trans); 1535 put_transaction(cur_trans);
1474 put_transaction(cur_trans); 1536 put_transaction(cur_trans);
1475 1537
@@ -1481,33 +1543,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1481 current->journal_info = NULL; 1543 current->journal_info = NULL;
1482 1544
1483 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1545 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1484
1485 spin_lock(&root->fs_info->trans_lock);
1486 root->fs_info->trans_no_join = 0;
1487 spin_unlock(&root->fs_info->trans_lock);
1488} 1546}
1489 1547
1490static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, 1548static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1491 struct btrfs_root *root) 1549 struct btrfs_root *root)
1492{ 1550{
1493 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1494 int snap_pending = 0;
1495 int ret; 1551 int ret;
1496 1552
1497 if (!flush_on_commit) {
1498 spin_lock(&root->fs_info->trans_lock);
1499 if (!list_empty(&trans->transaction->pending_snapshots))
1500 snap_pending = 1;
1501 spin_unlock(&root->fs_info->trans_lock);
1502 }
1503
1504 if (flush_on_commit || snap_pending) {
1505 ret = btrfs_start_delalloc_inodes(root, 1);
1506 if (ret)
1507 return ret;
1508 btrfs_wait_ordered_extents(root, 1);
1509 }
1510
1511 ret = btrfs_run_delayed_items(trans, root); 1553 ret = btrfs_run_delayed_items(trans, root);
1512 if (ret) 1554 if (ret)
1513 return ret; 1555 return ret;
@@ -1531,23 +1573,25 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1531 return ret; 1573 return ret;
1532} 1574}
1533 1575
1534/* 1576static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1535 * btrfs_transaction state sequence: 1577{
1536 * in_commit = 0, blocked = 0 (initial) 1578 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1537 * in_commit = 1, blocked = 1 1579 return btrfs_start_all_delalloc_inodes(fs_info, 1);
1538 * blocked = 0 1580 return 0;
1539 * commit_done = 1 1581}
1540 */ 1582
1583static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1584{
1585 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1586 btrfs_wait_all_ordered_extents(fs_info, 1);
1587}
1588
1541int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1589int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1542 struct btrfs_root *root) 1590 struct btrfs_root *root)
1543{ 1591{
1544 unsigned long joined = 0;
1545 struct btrfs_transaction *cur_trans = trans->transaction; 1592 struct btrfs_transaction *cur_trans = trans->transaction;
1546 struct btrfs_transaction *prev_trans = NULL; 1593 struct btrfs_transaction *prev_trans = NULL;
1547 DEFINE_WAIT(wait);
1548 int ret; 1594 int ret;
1549 int should_grow = 0;
1550 unsigned long now = get_seconds();
1551 1595
1552 ret = btrfs_run_ordered_operations(trans, root, 0); 1596 ret = btrfs_run_ordered_operations(trans, root, 0);
1553 if (ret) { 1597 if (ret) {
@@ -1586,6 +1630,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1586 * start sending their work down. 1630 * start sending their work down.
1587 */ 1631 */
1588 cur_trans->delayed_refs.flushing = 1; 1632 cur_trans->delayed_refs.flushing = 1;
1633 smp_wmb();
1589 1634
1590 if (!list_empty(&trans->new_bgs)) 1635 if (!list_empty(&trans->new_bgs))
1591 btrfs_create_pending_block_groups(trans, root); 1636 btrfs_create_pending_block_groups(trans, root);
@@ -1596,9 +1641,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1596 return ret; 1641 return ret;
1597 } 1642 }
1598 1643
1599 spin_lock(&cur_trans->commit_lock); 1644 spin_lock(&root->fs_info->trans_lock);
1600 if (cur_trans->in_commit) { 1645 if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
1601 spin_unlock(&cur_trans->commit_lock); 1646 spin_unlock(&root->fs_info->trans_lock);
1602 atomic_inc(&cur_trans->use_count); 1647 atomic_inc(&cur_trans->use_count);
1603 ret = btrfs_end_transaction(trans, root); 1648 ret = btrfs_end_transaction(trans, root);
1604 1649
@@ -1609,16 +1654,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1609 return ret; 1654 return ret;
1610 } 1655 }
1611 1656
1612 trans->transaction->in_commit = 1; 1657 cur_trans->state = TRANS_STATE_COMMIT_START;
1613 trans->transaction->blocked = 1;
1614 spin_unlock(&cur_trans->commit_lock);
1615 wake_up(&root->fs_info->transaction_blocked_wait); 1658 wake_up(&root->fs_info->transaction_blocked_wait);
1616 1659
1617 spin_lock(&root->fs_info->trans_lock);
1618 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1660 if (cur_trans->list.prev != &root->fs_info->trans_list) {
1619 prev_trans = list_entry(cur_trans->list.prev, 1661 prev_trans = list_entry(cur_trans->list.prev,
1620 struct btrfs_transaction, list); 1662 struct btrfs_transaction, list);
1621 if (!prev_trans->commit_done) { 1663 if (prev_trans->state != TRANS_STATE_COMPLETED) {
1622 atomic_inc(&prev_trans->use_count); 1664 atomic_inc(&prev_trans->use_count);
1623 spin_unlock(&root->fs_info->trans_lock); 1665 spin_unlock(&root->fs_info->trans_lock);
1624 1666
@@ -1632,42 +1674,32 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1632 spin_unlock(&root->fs_info->trans_lock); 1674 spin_unlock(&root->fs_info->trans_lock);
1633 } 1675 }
1634 1676
1635 if (!btrfs_test_opt(root, SSD) && 1677 extwriter_counter_dec(cur_trans, trans->type);
1636 (now < cur_trans->start_time || now - cur_trans->start_time < 1))
1637 should_grow = 1;
1638
1639 do {
1640 joined = cur_trans->num_joined;
1641
1642 WARN_ON(cur_trans != trans->transaction);
1643
1644 ret = btrfs_flush_all_pending_stuffs(trans, root);
1645 if (ret)
1646 goto cleanup_transaction;
1647 1678
1648 prepare_to_wait(&cur_trans->writer_wait, &wait, 1679 ret = btrfs_start_delalloc_flush(root->fs_info);
1649 TASK_UNINTERRUPTIBLE); 1680 if (ret)
1681 goto cleanup_transaction;
1650 1682
1651 if (atomic_read(&cur_trans->num_writers) > 1) 1683 ret = btrfs_flush_all_pending_stuffs(trans, root);
1652 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1684 if (ret)
1653 else if (should_grow) 1685 goto cleanup_transaction;
1654 schedule_timeout(1);
1655 1686
1656 finish_wait(&cur_trans->writer_wait, &wait); 1687 wait_event(cur_trans->writer_wait,
1657 } while (atomic_read(&cur_trans->num_writers) > 1 || 1688 extwriter_counter_read(cur_trans) == 0);
1658 (should_grow && cur_trans->num_joined != joined));
1659 1689
1690 /* some pending work might be added after the previous flush. */
1660 ret = btrfs_flush_all_pending_stuffs(trans, root); 1691 ret = btrfs_flush_all_pending_stuffs(trans, root);
1661 if (ret) 1692 if (ret)
1662 goto cleanup_transaction; 1693 goto cleanup_transaction;
1663 1694
1695 btrfs_wait_delalloc_flush(root->fs_info);
1664 /* 1696 /*
1665 * Ok now we need to make sure to block out any other joins while we 1697 * Ok now we need to make sure to block out any other joins while we
1666 * commit the transaction. We could have started a join before setting 1698 * commit the transaction. We could have started a join before setting
1667 * no_join so make sure to wait for num_writers to == 1 again. 1699 * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
1668 */ 1700 */
1669 spin_lock(&root->fs_info->trans_lock); 1701 spin_lock(&root->fs_info->trans_lock);
1670 root->fs_info->trans_no_join = 1; 1702 cur_trans->state = TRANS_STATE_COMMIT_DOING;
1671 spin_unlock(&root->fs_info->trans_lock); 1703 spin_unlock(&root->fs_info->trans_lock);
1672 wait_event(cur_trans->writer_wait, 1704 wait_event(cur_trans->writer_wait,
1673 atomic_read(&cur_trans->num_writers) == 1); 1705 atomic_read(&cur_trans->num_writers) == 1);
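The rewritten prologue above replaces the should_grow/num_joined polling loop with the external-writer count. A condensed sketch of the new ordering, names as in this diff, error handling elided:

        extwriter_counter_dec(cur_trans, trans->type); /* this handle no longer counts */
        ret = btrfs_start_delalloc_flush(root->fs_info); /* kick writeback early */
        ret = btrfs_flush_all_pending_stuffs(trans, root);
        wait_event(cur_trans->writer_wait,
                   extwriter_counter_read(cur_trans) == 0); /* externals drained */
        ret = btrfs_flush_all_pending_stuffs(trans, root); /* catch late additions */
        btrfs_wait_delalloc_flush(root->fs_info); /* wait for the ordered IO */

Since only USERSPACE, START and ATTACH handles count as external writers (TRANS_EXTWRITERS in transaction.h below), joins issued by the commit machinery itself cannot stall the wait_event().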
@@ -1794,10 +1826,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1794 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, 1826 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1795 sizeof(*root->fs_info->super_copy)); 1827 sizeof(*root->fs_info->super_copy));
1796 1828
1797 trans->transaction->blocked = 0;
1798 spin_lock(&root->fs_info->trans_lock); 1829 spin_lock(&root->fs_info->trans_lock);
1830 cur_trans->state = TRANS_STATE_UNBLOCKED;
1799 root->fs_info->running_transaction = NULL; 1831 root->fs_info->running_transaction = NULL;
1800 root->fs_info->trans_no_join = 0;
1801 spin_unlock(&root->fs_info->trans_lock); 1832 spin_unlock(&root->fs_info->trans_lock);
1802 mutex_unlock(&root->fs_info->reloc_mutex); 1833 mutex_unlock(&root->fs_info->reloc_mutex);
1803 1834
@@ -1825,10 +1856,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1825 1856
1826 btrfs_finish_extent_commit(trans, root); 1857 btrfs_finish_extent_commit(trans, root);
1827 1858
1828 cur_trans->commit_done = 1;
1829
1830 root->fs_info->last_trans_committed = cur_trans->transid; 1859 root->fs_info->last_trans_committed = cur_trans->transid;
1831 1860 /*
1861 * We needn't acquire the lock here because there is no other task
1862 * which can change it.
1863 */
1864 cur_trans->state = TRANS_STATE_COMPLETED;
1832 wake_up(&cur_trans->commit_wait); 1865 wake_up(&cur_trans->commit_wait);
1833 1866
1834 spin_lock(&root->fs_info->trans_lock); 1867 spin_lock(&root->fs_info->trans_lock);
@@ -1838,7 +1871,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1838 put_transaction(cur_trans); 1871 put_transaction(cur_trans);
1839 put_transaction(cur_trans); 1872 put_transaction(cur_trans);
1840 1873
1841 if (trans->type < TRANS_JOIN_NOLOCK) 1874 if (trans->type & __TRANS_FREEZABLE)
1842 sb_end_intwrite(root->fs_info->sb); 1875 sb_end_intwrite(root->fs_info->sb);
1843 1876
1844 trace_btrfs_transaction_commit(root); 1877 trace_btrfs_transaction_commit(root);
@@ -1885,11 +1918,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1885 int ret; 1918 int ret;
1886 struct btrfs_fs_info *fs_info = root->fs_info; 1919 struct btrfs_fs_info *fs_info = root->fs_info;
1887 1920
1888 if (fs_info->sb->s_flags & MS_RDONLY) {
1889 pr_debug("btrfs: cleaner called for RO fs!\n");
1890 return 0;
1891 }
1892
1893 spin_lock(&fs_info->trans_lock); 1921 spin_lock(&fs_info->trans_lock);
1894 if (list_empty(&fs_info->dead_roots)) { 1922 if (list_empty(&fs_info->dead_roots)) {
1895 spin_unlock(&fs_info->trans_lock); 1923 spin_unlock(&fs_info->trans_lock);
@@ -1897,7 +1925,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1897 } 1925 }
1898 root = list_first_entry(&fs_info->dead_roots, 1926 root = list_first_entry(&fs_info->dead_roots,
1899 struct btrfs_root, root_list); 1927 struct btrfs_root, root_list);
1900 list_del(&root->root_list); 1928 list_del_init(&root->root_list);
1901 spin_unlock(&fs_info->trans_lock); 1929 spin_unlock(&fs_info->trans_lock);
1902 1930
1903 pr_debug("btrfs: cleaner removing %llu\n", 1931 pr_debug("btrfs: cleaner removing %llu\n",
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 24c97335a59f..defbc4269897 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -22,21 +22,33 @@
22#include "delayed-ref.h" 22#include "delayed-ref.h"
23#include "ctree.h" 23#include "ctree.h"
24 24
25enum btrfs_trans_state {
26 TRANS_STATE_RUNNING = 0,
27 TRANS_STATE_BLOCKED = 1,
28 TRANS_STATE_COMMIT_START = 2,
29 TRANS_STATE_COMMIT_DOING = 3,
30 TRANS_STATE_UNBLOCKED = 4,
31 TRANS_STATE_COMPLETED = 5,
32 TRANS_STATE_MAX = 6,
33};
34
25struct btrfs_transaction { 35struct btrfs_transaction {
26 u64 transid; 36 u64 transid;
27 /* 37 /*
38 * total external writers (USERSPACE/START/ATTACH) in this
39 * transaction; it must be zero before the transaction can
40 * be committed
41 */
42 atomic_t num_extwriters;
43 /*
28 * total writers in this transaction; it must be zero before the 44 * total writers in this transaction; it must be zero before the
29 * transaction can end 45 * transaction can end
30 */ 46 */
31 atomic_t num_writers; 47 atomic_t num_writers;
32 atomic_t use_count; 48 atomic_t use_count;
33 49
34 unsigned long num_joined; 50 /* Protected by fs_info->trans_lock; hold it when changing the state. */
35 51 enum btrfs_trans_state state;
36 spinlock_t commit_lock;
37 int in_commit;
38 int commit_done;
39 int blocked;
40 struct list_head list; 52 struct list_head list;
41 struct extent_io_tree dirty_pages; 53 struct extent_io_tree dirty_pages;
42 unsigned long start_time; 54 unsigned long start_time;
@@ -44,17 +56,27 @@ struct btrfs_transaction {
44 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
45 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
46 struct list_head ordered_operations; 58 struct list_head ordered_operations;
59 struct list_head pending_chunks;
47 struct btrfs_delayed_ref_root delayed_refs; 60 struct btrfs_delayed_ref_root delayed_refs;
48 int aborted; 61 int aborted;
49}; 62};
50 63
51enum btrfs_trans_type { 64#define __TRANS_FREEZABLE (1U << 0)
52 TRANS_START, 65
53 TRANS_JOIN, 66#define __TRANS_USERSPACE (1U << 8)
54 TRANS_USERSPACE, 67#define __TRANS_START (1U << 9)
55 TRANS_JOIN_NOLOCK, 68#define __TRANS_ATTACH (1U << 10)
56 TRANS_ATTACH, 69#define __TRANS_JOIN (1U << 11)
57}; 70#define __TRANS_JOIN_NOLOCK (1U << 12)
71
72#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
73#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
74#define TRANS_ATTACH (__TRANS_ATTACH)
75#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
76#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
77
78#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
79 __TRANS_ATTACH)
58 80
59struct btrfs_trans_handle { 81struct btrfs_trans_handle {
60 u64 transid; 82 u64 transid;
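The bit-flag scheme above replaces ordinal comparisons such as 'type < TRANS_JOIN_NOLOCK' with explicit property tests. A standalone user-space demonstration of the classification (values copied from the defines above; not kernel code):

        #include <stdio.h>

        #define __TRANS_FREEZABLE   (1U << 0)
        #define __TRANS_USERSPACE   (1U << 8)
        #define __TRANS_START       (1U << 9)
        #define __TRANS_ATTACH      (1U << 10)
        #define __TRANS_JOIN        (1U << 11)
        #define __TRANS_JOIN_NOLOCK (1U << 12)

        #define TRANS_USERSPACE   (__TRANS_USERSPACE | __TRANS_FREEZABLE)
        #define TRANS_START       (__TRANS_START | __TRANS_FREEZABLE)
        #define TRANS_ATTACH      (__TRANS_ATTACH)
        #define TRANS_JOIN        (__TRANS_JOIN | __TRANS_FREEZABLE)
        #define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
        #define TRANS_EXTWRITERS  (__TRANS_USERSPACE | __TRANS_START | \
                                   __TRANS_ATTACH)

        int main(void)
        {
                const unsigned int t[] = { TRANS_USERSPACE, TRANS_START,
                                           TRANS_ATTACH, TRANS_JOIN,
                                           TRANS_JOIN_NOLOCK };
                const char *n[] = { "USERSPACE", "START", "ATTACH",
                                    "JOIN", "JOIN_NOLOCK" };
                int i;

                for (i = 0; i < 5; i++) /* freezable => sb_start_intwrite() */
                        printf("%-11s freezable=%d extwriter=%d\n", n[i],
                               !!(t[i] & __TRANS_FREEZABLE),
                               !!(t[i] & TRANS_EXTWRITERS));
                return 0;
        }

USERSPACE, START and JOIN are freezable (they take sb_start_intwrite()); USERSPACE, START and ATTACH are external writers; JOIN_NOLOCK is neither.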
@@ -70,7 +92,7 @@ struct btrfs_trans_handle {
70 short aborted; 92 short aborted;
71 short adding_csums; 93 short adding_csums;
72 bool allocating_chunk; 94 bool allocating_chunk;
73 enum btrfs_trans_type type; 95 unsigned int type;
74 /* 96 /*
75 * this root is only needed to validate that the root passed to 97 * this root is only needed to validate that the root passed to
76 * start_transaction is the same as the one passed to end_transaction. 98 * start_transaction is the same as the one passed to end_transaction.
@@ -121,7 +143,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
121int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 143int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
122 struct btrfs_root *root); 144 struct btrfs_root *root);
123 145
124int btrfs_add_dead_root(struct btrfs_root *root); 146void btrfs_add_dead_root(struct btrfs_root *root);
125int btrfs_defrag_root(struct btrfs_root *root); 147int btrfs_defrag_root(struct btrfs_root *root);
126int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); 148int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
127int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 149int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c276ac9a0ec3..ff60d8978ae2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -18,6 +18,7 @@
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/blkdev.h>
21#include <linux/list_sort.h> 22#include <linux/list_sort.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "transaction.h" 24#include "transaction.h"
@@ -279,11 +280,23 @@ static int process_one_buffer(struct btrfs_root *log,
279{ 280{
280 int ret = 0; 281 int ret = 0;
281 282
283 /*
284 * If this fs is mixed then we need to be able to process the leaves to
285 * pin down any logged extents, so we have to read the block.
286 */
287 if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
288 ret = btrfs_read_buffer(eb, gen);
289 if (ret)
290 return ret;
291 }
292
282 if (wc->pin) 293 if (wc->pin)
283 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, 294 ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
284 eb->start, eb->len); 295 eb->start, eb->len);
285 296
286 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { 297 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
298 if (wc->pin && btrfs_header_level(eb) == 0)
299 ret = btrfs_exclude_logged_extents(log, eb);
287 if (wc->write) 300 if (wc->write)
288 btrfs_write_tree_block(eb); 301 btrfs_write_tree_block(eb);
289 if (wc->wait) 302 if (wc->wait)
@@ -2016,13 +2029,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2016 eb, i, &key); 2029 eb, i, &key);
2017 if (ret) 2030 if (ret)
2018 break; 2031 break;
2019 } else if (key.type == BTRFS_INODE_REF_KEY) { 2032 } else if (key.type == BTRFS_INODE_REF_KEY ||
2020 ret = add_inode_ref(wc->trans, root, log, path, 2033 key.type == BTRFS_INODE_EXTREF_KEY) {
2021 eb, i, &key);
2022 if (ret && ret != -ENOENT)
2023 break;
2024 ret = 0;
2025 } else if (key.type == BTRFS_INODE_EXTREF_KEY) {
2026 ret = add_inode_ref(wc->trans, root, log, path, 2034 ret = add_inode_ref(wc->trans, root, log, path,
2027 eb, i, &key); 2035 eb, i, &key);
2028 if (ret && ret != -ENOENT) 2036 if (ret && ret != -ENOENT)
@@ -2358,6 +2366,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2358 struct btrfs_root *log = root->log_root; 2366 struct btrfs_root *log = root->log_root;
2359 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2367 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2360 unsigned long log_transid = 0; 2368 unsigned long log_transid = 0;
2369 struct blk_plug plug;
2361 2370
2362 mutex_lock(&root->log_mutex); 2371 mutex_lock(&root->log_mutex);
2363 log_transid = root->log_transid; 2372 log_transid = root->log_transid;
@@ -2401,8 +2410,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2401 /* we start IO on all the marked extents here, but we don't actually 2410 /* we start IO on all the marked extents here, but we don't actually
2402 * wait for them until later. 2411 * wait for them until later.
2403 */ 2412 */
2413 blk_start_plug(&plug);
2404 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); 2414 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
2405 if (ret) { 2415 if (ret) {
2416 blk_finish_plug(&plug);
2406 btrfs_abort_transaction(trans, root, ret); 2417 btrfs_abort_transaction(trans, root, ret);
2407 btrfs_free_logged_extents(log, log_transid); 2418 btrfs_free_logged_extents(log, log_transid);
2408 mutex_unlock(&root->log_mutex); 2419 mutex_unlock(&root->log_mutex);
@@ -2437,6 +2448,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2437 } 2448 }
2438 2449
2439 if (ret) { 2450 if (ret) {
2451 blk_finish_plug(&plug);
2440 if (ret != -ENOSPC) { 2452 if (ret != -ENOSPC) {
2441 btrfs_abort_transaction(trans, root, ret); 2453 btrfs_abort_transaction(trans, root, ret);
2442 mutex_unlock(&log_root_tree->log_mutex); 2454 mutex_unlock(&log_root_tree->log_mutex);
@@ -2452,6 +2464,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2452 2464
2453 index2 = log_root_tree->log_transid % 2; 2465 index2 = log_root_tree->log_transid % 2;
2454 if (atomic_read(&log_root_tree->log_commit[index2])) { 2466 if (atomic_read(&log_root_tree->log_commit[index2])) {
2467 blk_finish_plug(&plug);
2455 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2468 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2456 wait_log_commit(trans, log_root_tree, 2469 wait_log_commit(trans, log_root_tree,
2457 log_root_tree->log_transid); 2470 log_root_tree->log_transid);
@@ -2474,6 +2487,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2474 * check the full commit flag again 2487 * check the full commit flag again
2475 */ 2488 */
2476 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2489 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2490 blk_finish_plug(&plug);
2477 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2491 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2478 btrfs_free_logged_extents(log, log_transid); 2492 btrfs_free_logged_extents(log, log_transid);
2479 mutex_unlock(&log_root_tree->log_mutex); 2493 mutex_unlock(&log_root_tree->log_mutex);
@@ -2481,9 +2495,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2481 goto out_wake_log_root; 2495 goto out_wake_log_root;
2482 } 2496 }
2483 2497
2484 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2498 ret = btrfs_write_marked_extents(log_root_tree,
2485 &log_root_tree->dirty_log_pages, 2499 &log_root_tree->dirty_log_pages,
2486 EXTENT_DIRTY | EXTENT_NEW); 2500 EXTENT_DIRTY | EXTENT_NEW);
2501 blk_finish_plug(&plug);
2487 if (ret) { 2502 if (ret) {
2488 btrfs_abort_transaction(trans, root, ret); 2503 btrfs_abort_transaction(trans, root, ret);
2489 btrfs_free_logged_extents(log, log_transid); 2504 btrfs_free_logged_extents(log, log_transid);
@@ -2491,6 +2506,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2491 goto out_wake_log_root; 2506 goto out_wake_log_root;
2492 } 2507 }
2493 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2508 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2509 btrfs_wait_marked_extents(log_root_tree,
2510 &log_root_tree->dirty_log_pages,
2511 EXTENT_NEW | EXTENT_DIRTY);
2494 btrfs_wait_logged_extents(log, log_transid); 2512 btrfs_wait_logged_extents(log, log_transid);
2495 2513
2496 btrfs_set_super_log_root(root->fs_info->super_for_commit, 2514 btrfs_set_super_log_root(root->fs_info->super_for_commit,
@@ -3728,8 +3746,9 @@ next_slot:
3728 } 3746 }
3729 3747
3730log_extents: 3748log_extents:
3749 btrfs_release_path(path);
3750 btrfs_release_path(dst_path);
3731 if (fast_search) { 3751 if (fast_search) {
3732 btrfs_release_path(dst_path);
3733 ret = btrfs_log_changed_extents(trans, root, inode, dst_path); 3752 ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3734 if (ret) { 3753 if (ret) {
3735 err = ret; 3754 err = ret;
@@ -3746,8 +3765,6 @@ log_extents:
3746 } 3765 }
3747 3766
3748 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3767 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
3749 btrfs_release_path(path);
3750 btrfs_release_path(dst_path);
3751 ret = log_directory_changes(trans, root, inode, path, dst_path); 3768 ret = log_directory_changes(trans, root, inode, path, dst_path);
3752 if (ret) { 3769 if (ret) {
3753 err = ret; 3770 err = ret;
@@ -4016,8 +4033,7 @@ again:
4016 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4033 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4017 break; 4034 break;
4018 4035
4019 log = btrfs_read_fs_root_no_radix(log_root_tree, 4036 log = btrfs_read_fs_root(log_root_tree, &found_key);
4020 &found_key);
4021 if (IS_ERR(log)) { 4037 if (IS_ERR(log)) {
4022 ret = PTR_ERR(log); 4038 ret = PTR_ERR(log);
4023 btrfs_error(fs_info, ret, 4039 btrfs_error(fs_info, ret,
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 7b417e20efe2..b0a523b2c60e 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -205,6 +205,10 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
205 u64 new_alloced = ulist->nodes_alloced + 128; 205 u64 new_alloced = ulist->nodes_alloced + 128;
206 struct ulist_node *new_nodes; 206 struct ulist_node *new_nodes;
207 void *old = NULL; 207 void *old = NULL;
208 int i;
209
210 for (i = 0; i < ulist->nnodes; i++)
211 rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
208 212
209 /* 213 /*
210 * if nodes_alloced == ULIST_SIZE no memory has been allocated 214 * if nodes_alloced == ULIST_SIZE no memory has been allocated
@@ -224,6 +228,17 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
224 228
225 ulist->nodes = new_nodes; 229 ulist->nodes = new_nodes;
226 ulist->nodes_alloced = new_alloced; 230 ulist->nodes_alloced = new_alloced;
231
232 /*
233 * krealloc moves the array with memcpy, which leaves the rb_node
234 * links pointing into the old allocation, so we have to rebuild
235 * the tree ourselves. Otherwise we may be bitten by crashes.
236 */
237 for (i = 0; i < ulist->nnodes; i++) {
238 ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
239 if (ret < 0)
240 return ret;
241 }
227 } 242 }
228 ulist->nodes[ulist->nnodes].val = val; 243 ulist->nodes[ulist->nnodes].val = val;
229 ulist->nodes[ulist->nnodes].aux = aux; 244 ulist->nodes[ulist->nnodes].aux = aux;
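This hunk works around an intrusive-container pitfall: the rb_nodes live inside the krealloc'ed array, so after krealloc moves the array, the tree's parent/child pointers (and the root) still reference the old allocation. A toy user-space illustration of the failure mode, not btrfs code:

        #include <stdlib.h>

        struct elem { struct elem *next; long val; }; /* intrusive link */

        /* BROKEN unless all links are rebuilt: realloc may move the
         * array, and every ->next captured before the move now points
         * into the freed block (use-after-free on the next traversal). */
        static struct elem *grow(struct elem *arr, size_t new_n)
        {
                return realloc(arr, new_n * sizeof(*arr));
        }

Hence the pattern above: erase every node from the rbtree first, krealloc, then re-insert them all with ulist_rbtree_insert() so the links are rebuilt against the new addresses.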
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
deleted file mode 100644
index 9bf3946d5ef2..000000000000
--- a/fs/btrfs/version.h
+++ /dev/null
@@ -1,4 +0,0 @@
1#ifndef __BTRFS_VERSION_H
2#define __BTRFS_VERSION_H
3#define BTRFS_BUILD_VERSION "Btrfs"
4#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bffb9174afb..78b871753cb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -982,6 +982,35 @@ out:
982 return ret; 982 return ret;
983} 983}
984 984
985static int contains_pending_extent(struct btrfs_trans_handle *trans,
986 struct btrfs_device *device,
987 u64 *start, u64 len)
988{
989 struct extent_map *em;
990 int ret = 0;
991
992 list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
993 struct map_lookup *map;
994 int i;
995
996 map = (struct map_lookup *)em->bdev;
997 for (i = 0; i < map->num_stripes; i++) {
998 if (map->stripes[i].dev != device)
999 continue;
1000 if (map->stripes[i].physical >= *start + len ||
1001 map->stripes[i].physical + em->orig_block_len <=
1002 *start)
1003 continue;
1004 *start = map->stripes[i].physical +
1005 em->orig_block_len;
1006 ret = 1;
1007 }
1008 }
1009
1010 return ret;
1011}
1012
1013
985/* 1014/*
986 * find_free_dev_extent - find free space in the specified device 1015 * find_free_dev_extent - find free space in the specified device
987 * @device: the device which we search the free space in 1016 * @device: the device which we search the free space in
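contains_pending_extent() closes a window opened elsewhere in this diff: dev extents for new chunks are now written only at commit time, and find_free_dev_extent() searches the commit root, so a pending chunk's stripes are invisible to the on-disk search and could be handed out twice. The core interval test, as a standalone sketch:

        #include <stdbool.h>
        #include <stdint.h>

        /* A stripe at [physical, physical + chunk_len) overlaps a hole
         * [*start, *start + len) unless it lies wholly before or after;
         * on overlap, restart the search just past the pending stripe. */
        static bool bump_past_stripe(uint64_t physical, uint64_t chunk_len,
                                     uint64_t *start, uint64_t len)
        {
                if (physical >= *start + len ||
                    physical + chunk_len <= *start)
                        return false;          /* disjoint: hole usable */
                *start = physical + chunk_len; /* skip the pending chunk */
                return true;
        }

In the kernel version, em->orig_block_len plays the role of chunk_len: it is set to the stripe size when the chunk's extent map is created (see the __btrfs_alloc_chunk hunk below).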
@@ -1002,7 +1031,8 @@ out:
1002 * But if we don't find suitable free space, it is used to store the size of 1031 * But if we don't find suitable free space, it is used to store the size of
1003 * the max free space. 1032 * the max free space.
1004 */ 1033 */
1005int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1034int find_free_dev_extent(struct btrfs_trans_handle *trans,
1035 struct btrfs_device *device, u64 num_bytes,
1006 u64 *start, u64 *len) 1036 u64 *start, u64 *len)
1007{ 1037{
1008 struct btrfs_key key; 1038 struct btrfs_key key;
@@ -1026,21 +1056,22 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1026 */ 1056 */
1027 search_start = max(root->fs_info->alloc_start, 1024ull * 1024); 1057 search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
1028 1058
1059 path = btrfs_alloc_path();
1060 if (!path)
1061 return -ENOMEM;
1062again:
1029 max_hole_start = search_start; 1063 max_hole_start = search_start;
1030 max_hole_size = 0; 1064 max_hole_size = 0;
1031 hole_size = 0; 1065 hole_size = 0;
1032 1066
1033 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { 1067 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
1034 ret = -ENOSPC; 1068 ret = -ENOSPC;
1035 goto error; 1069 goto out;
1036 } 1070 }
1037 1071
1038 path = btrfs_alloc_path();
1039 if (!path) {
1040 ret = -ENOMEM;
1041 goto error;
1042 }
1043 path->reada = 2; 1072 path->reada = 2;
1073 path->search_commit_root = 1;
1074 path->skip_locking = 1;
1044 1075
1045 key.objectid = device->devid; 1076 key.objectid = device->devid;
1046 key.offset = search_start; 1077 key.offset = search_start;
@@ -1081,6 +1112,15 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1081 if (key.offset > search_start) { 1112 if (key.offset > search_start) {
1082 hole_size = key.offset - search_start; 1113 hole_size = key.offset - search_start;
1083 1114
1115 /*
1116 * Have to check before we set max_hole_start, otherwise
1117 * we could end up sending back this offset anyway.
1118 */
1119 if (contains_pending_extent(trans, device,
1120 &search_start,
1121 hole_size))
1122 hole_size = 0;
1123
1084 if (hole_size > max_hole_size) { 1124 if (hole_size > max_hole_size) {
1085 max_hole_start = search_start; 1125 max_hole_start = search_start;
1086 max_hole_size = hole_size; 1126 max_hole_size = hole_size;
@@ -1124,6 +1164,11 @@ next:
1124 max_hole_size = hole_size; 1164 max_hole_size = hole_size;
1125 } 1165 }
1126 1166
1167 if (contains_pending_extent(trans, device, &search_start, hole_size)) {
1168 btrfs_release_path(path);
1169 goto again;
1170 }
1171
1127 /* See above. */ 1172 /* See above. */
1128 if (hole_size < num_bytes) 1173 if (hole_size < num_bytes)
1129 ret = -ENOSPC; 1174 ret = -ENOSPC;
@@ -1132,7 +1177,6 @@ next:
1132 1177
1133out: 1178out:
1134 btrfs_free_path(path); 1179 btrfs_free_path(path);
1135error:
1136 *start = max_hole_start; 1180 *start = max_hole_start;
1137 if (len) 1181 if (len)
1138 *len = max_hole_size; 1182 *len = max_hole_size;
@@ -1244,47 +1288,22 @@ out:
1244 return ret; 1288 return ret;
1245} 1289}
1246 1290
1247static noinline int find_next_chunk(struct btrfs_root *root, 1291static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1248 u64 objectid, u64 *offset)
1249{ 1292{
1250 struct btrfs_path *path; 1293 struct extent_map_tree *em_tree;
1251 int ret; 1294 struct extent_map *em;
1252 struct btrfs_key key; 1295 struct rb_node *n;
1253 struct btrfs_chunk *chunk; 1296 u64 ret = 0;
1254 struct btrfs_key found_key;
1255
1256 path = btrfs_alloc_path();
1257 if (!path)
1258 return -ENOMEM;
1259
1260 key.objectid = objectid;
1261 key.offset = (u64)-1;
1262 key.type = BTRFS_CHUNK_ITEM_KEY;
1263
1264 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1265 if (ret < 0)
1266 goto error;
1267
1268 BUG_ON(ret == 0); /* Corruption */
1269 1297
1270 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); 1298 em_tree = &fs_info->mapping_tree.map_tree;
1271 if (ret) { 1299 read_lock(&em_tree->lock);
1272 *offset = 0; 1300 n = rb_last(&em_tree->map);
1273 } else { 1301 if (n) {
1274 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1302 em = rb_entry(n, struct extent_map, rb_node);
1275 path->slots[0]); 1303 ret = em->start + em->len;
1276 if (found_key.objectid != objectid)
1277 *offset = 0;
1278 else {
1279 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1280 struct btrfs_chunk);
1281 *offset = found_key.offset +
1282 btrfs_chunk_length(path->nodes[0], chunk);
1283 }
1284 } 1304 }
1285 ret = 0; 1305 read_unlock(&em_tree->lock);
1286error: 1306
1287 btrfs_free_path(path);
1288 return ret; 1307 return ret;
1289} 1308}
1290 1309
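find_next_chunk() no longer walks the on-disk chunk tree: chunk items are inserted only at commit time by btrfs_finish_chunk_alloc() (below), so the authoritative record of allocated logical space is the in-memory mapping tree. Annotated sketch of the new lookup, names as in this hunk:

        read_lock(&em_tree->lock);
        n = rb_last(&em_tree->map);        /* rightmost node = highest start */
        if (n) {
                em = rb_entry(n, struct extent_map, rb_node);
                ret = em->start + em->len; /* first logical byte past it */
        }
        read_unlock(&em_tree->lock);

This also means a freshly reserved but uncommitted chunk is already visible to the next caller, which is what keeps two allocations in one transaction from picking the same logical start.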
@@ -1462,31 +1481,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1462 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1481 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1463 1482
1464 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1483 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1465 printk(KERN_ERR "btrfs: unable to go below four devices " 1484 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
1466 "on raid10\n");
1467 ret = -EINVAL;
1468 goto out; 1485 goto out;
1469 } 1486 }
1470 1487
1471 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1488 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1472 printk(KERN_ERR "btrfs: unable to go below two " 1489 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1473 "devices on raid1\n");
1474 ret = -EINVAL;
1475 goto out; 1490 goto out;
1476 } 1491 }
1477 1492
1478 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1493 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1479 root->fs_info->fs_devices->rw_devices <= 2) { 1494 root->fs_info->fs_devices->rw_devices <= 2) {
1480 printk(KERN_ERR "btrfs: unable to go below two " 1495 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
1481 "devices on raid5\n");
1482 ret = -EINVAL;
1483 goto out; 1496 goto out;
1484 } 1497 }
1485 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1498 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1486 root->fs_info->fs_devices->rw_devices <= 3) { 1499 root->fs_info->fs_devices->rw_devices <= 3) {
1487 printk(KERN_ERR "btrfs: unable to go below three " 1500 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1488 "devices on raid6\n");
1489 ret = -EINVAL;
1490 goto out; 1501 goto out;
1491 } 1502 }
1492 1503
@@ -1512,8 +1523,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1512 bh = NULL; 1523 bh = NULL;
1513 disk_super = NULL; 1524 disk_super = NULL;
1514 if (!device) { 1525 if (!device) {
1515 printk(KERN_ERR "btrfs: no missing devices found to " 1526 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1516 "remove\n");
1517 goto out; 1527 goto out;
1518 } 1528 }
1519 } else { 1529 } else {
@@ -1535,15 +1545,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1535 } 1545 }
1536 1546
1537 if (device->is_tgtdev_for_dev_replace) { 1547 if (device->is_tgtdev_for_dev_replace) {
1538 pr_err("btrfs: unable to remove the dev_replace target dev\n"); 1548 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1539 ret = -EINVAL;
1540 goto error_brelse; 1549 goto error_brelse;
1541 } 1550 }
1542 1551
1543 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1552 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1544 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1553 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1545 "device\n");
1546 ret = -EINVAL;
1547 goto error_brelse; 1554 goto error_brelse;
1548 } 1555 }
1549 1556
@@ -3295,10 +3302,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3295 } 3302 }
3296 3303
3297 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 3304 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3298 if (IS_ERR(tsk)) 3305 return PTR_RET(tsk);
3299 return PTR_ERR(tsk);
3300
3301 return 0;
3302} 3306}
3303 3307
3304int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 3308int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
@@ -3681,10 +3685,8 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3681} 3685}
3682 3686
3683static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3687static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3684 struct btrfs_root *extent_root, 3688 struct btrfs_root *extent_root, u64 start,
3685 struct map_lookup **map_ret, 3689 u64 type)
3686 u64 *num_bytes_out, u64 *stripe_size_out,
3687 u64 start, u64 type)
3688{ 3690{
3689 struct btrfs_fs_info *info = extent_root->fs_info; 3691 struct btrfs_fs_info *info = extent_root->fs_info;
3690 struct btrfs_fs_devices *fs_devices = info->fs_devices; 3692 struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -3791,7 +3793,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3791 if (total_avail == 0) 3793 if (total_avail == 0)
3792 continue; 3794 continue;
3793 3795
3794 ret = find_free_dev_extent(device, 3796 ret = find_free_dev_extent(trans, device,
3795 max_stripe_size * dev_stripes, 3797 max_stripe_size * dev_stripes,
3796 &dev_offset, &max_avail); 3798 &dev_offset, &max_avail);
3797 if (ret && ret != -ENOSPC) 3799 if (ret && ret != -ENOSPC)
@@ -3903,12 +3905,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3903 map->type = type; 3905 map->type = type;
3904 map->sub_stripes = sub_stripes; 3906 map->sub_stripes = sub_stripes;
3905 3907
3906 *map_ret = map;
3907 num_bytes = stripe_size * data_stripes; 3908 num_bytes = stripe_size * data_stripes;
3908 3909
3909 *stripe_size_out = stripe_size;
3910 *num_bytes_out = num_bytes;
3911
3912 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes); 3910 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3913 3911
3914 em = alloc_extent_map(); 3912 em = alloc_extent_map();
@@ -3921,38 +3919,26 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3921 em->len = num_bytes; 3919 em->len = num_bytes;
3922 em->block_start = 0; 3920 em->block_start = 0;
3923 em->block_len = em->len; 3921 em->block_len = em->len;
3922 em->orig_block_len = stripe_size;
3924 3923
3925 em_tree = &extent_root->fs_info->mapping_tree.map_tree; 3924 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3926 write_lock(&em_tree->lock); 3925 write_lock(&em_tree->lock);
3927 ret = add_extent_mapping(em_tree, em, 0); 3926 ret = add_extent_mapping(em_tree, em, 0);
3927 if (!ret) {
3928 list_add_tail(&em->list, &trans->transaction->pending_chunks);
3929 atomic_inc(&em->refs);
3930 }
3928 write_unlock(&em_tree->lock); 3931 write_unlock(&em_tree->lock);
3929 if (ret) { 3932 if (ret) {
3930 free_extent_map(em); 3933 free_extent_map(em);
3931 goto error; 3934 goto error;
3932 } 3935 }
3933 3936
3934 for (i = 0; i < map->num_stripes; ++i) {
3935 struct btrfs_device *device;
3936 u64 dev_offset;
3937
3938 device = map->stripes[i].dev;
3939 dev_offset = map->stripes[i].physical;
3940
3941 ret = btrfs_alloc_dev_extent(trans, device,
3942 info->chunk_root->root_key.objectid,
3943 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3944 start, dev_offset, stripe_size);
3945 if (ret)
3946 goto error_dev_extent;
3947 }
3948
3949 ret = btrfs_make_block_group(trans, extent_root, 0, type, 3937 ret = btrfs_make_block_group(trans, extent_root, 0, type,
3950 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 3938 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3951 start, num_bytes); 3939 start, num_bytes);
3952 if (ret) { 3940 if (ret)
3953 i = map->num_stripes - 1; 3941 goto error_del_extent;
3954 goto error_dev_extent;
3955 }
3956 3942
3957 free_extent_map(em); 3943 free_extent_map(em);
3958 check_raid56_incompat_flag(extent_root->fs_info, type); 3944 check_raid56_incompat_flag(extent_root->fs_info, type);
@@ -3960,18 +3946,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3960 kfree(devices_info); 3946 kfree(devices_info);
3961 return 0; 3947 return 0;
3962 3948
3963error_dev_extent: 3949error_del_extent:
3964 for (; i >= 0; i--) {
3965 struct btrfs_device *device;
3966 int err;
3967
3968 device = map->stripes[i].dev;
3969 err = btrfs_free_dev_extent(trans, device, start);
3970 if (err) {
3971 btrfs_abort_transaction(trans, extent_root, err);
3972 break;
3973 }
3974 }
3975 write_lock(&em_tree->lock); 3950 write_lock(&em_tree->lock);
3976 remove_extent_mapping(em_tree, em); 3951 remove_extent_mapping(em_tree, em);
3977 write_unlock(&em_tree->lock); 3952 write_unlock(&em_tree->lock);
@@ -3986,33 +3961,68 @@ error:
3986 return ret; 3961 return ret;
3987} 3962}
3988 3963
3989static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 3964int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
3990 struct btrfs_root *extent_root, 3965 struct btrfs_root *extent_root,
3991 struct map_lookup *map, u64 chunk_offset, 3966 u64 chunk_offset, u64 chunk_size)
3992 u64 chunk_size, u64 stripe_size)
3993{ 3967{
3994 u64 dev_offset;
3995 struct btrfs_key key; 3968 struct btrfs_key key;
3996 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; 3969 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3997 struct btrfs_device *device; 3970 struct btrfs_device *device;
3998 struct btrfs_chunk *chunk; 3971 struct btrfs_chunk *chunk;
3999 struct btrfs_stripe *stripe; 3972 struct btrfs_stripe *stripe;
4000 size_t item_size = btrfs_chunk_item_size(map->num_stripes); 3973 struct extent_map_tree *em_tree;
4001 int index = 0; 3974 struct extent_map *em;
3975 struct map_lookup *map;
3976 size_t item_size;
3977 u64 dev_offset;
3978 u64 stripe_size;
3979 int i = 0;
4002 int ret; 3980 int ret;
4003 3981
3982 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3983 read_lock(&em_tree->lock);
3984 em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
3985 read_unlock(&em_tree->lock);
3986
3987 if (!em) {
3988 btrfs_crit(extent_root->fs_info, "unable to find logical "
3989 "%Lu len %Lu", chunk_offset, chunk_size);
3990 return -EINVAL;
3991 }
3992
3993 if (em->start != chunk_offset || em->len != chunk_size) {
3994 btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
3995 " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
3996 chunk_size, em->start, em->len);
3997 free_extent_map(em);
3998 return -EINVAL;
3999 }
4000
4001 map = (struct map_lookup *)em->bdev;
4002 item_size = btrfs_chunk_item_size(map->num_stripes);
4003 stripe_size = em->orig_block_len;
4004
4004 chunk = kzalloc(item_size, GFP_NOFS); 4005 chunk = kzalloc(item_size, GFP_NOFS);
4005 if (!chunk) 4006 if (!chunk) {
4006 return -ENOMEM; 4007 ret = -ENOMEM;
4008 goto out;
4009 }
4010
4011 for (i = 0; i < map->num_stripes; i++) {
4012 device = map->stripes[i].dev;
4013 dev_offset = map->stripes[i].physical;
4007 4014
4008 index = 0;
4009 while (index < map->num_stripes) {
4010 device = map->stripes[index].dev;
4011 device->bytes_used += stripe_size; 4015 device->bytes_used += stripe_size;
4012 ret = btrfs_update_device(trans, device); 4016 ret = btrfs_update_device(trans, device);
4013 if (ret) 4017 if (ret)
4014 goto out_free; 4018 goto out;
4015 index++; 4019 ret = btrfs_alloc_dev_extent(trans, device,
4020 chunk_root->root_key.objectid,
4021 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4022 chunk_offset, dev_offset,
4023 stripe_size);
4024 if (ret)
4025 goto out;
4016 } 4026 }
4017 4027
4018 spin_lock(&extent_root->fs_info->free_chunk_lock); 4028 spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -4020,17 +4030,15 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4020 map->num_stripes); 4030 map->num_stripes);
4021 spin_unlock(&extent_root->fs_info->free_chunk_lock); 4031 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4022 4032
4023 index = 0;
4024 stripe = &chunk->stripe; 4033 stripe = &chunk->stripe;
4025 while (index < map->num_stripes) { 4034 for (i = 0; i < map->num_stripes; i++) {
4026 device = map->stripes[index].dev; 4035 device = map->stripes[i].dev;
4027 dev_offset = map->stripes[index].physical; 4036 dev_offset = map->stripes[i].physical;
4028 4037
4029 btrfs_set_stack_stripe_devid(stripe, device->devid); 4038 btrfs_set_stack_stripe_devid(stripe, device->devid);
4030 btrfs_set_stack_stripe_offset(stripe, dev_offset); 4039 btrfs_set_stack_stripe_offset(stripe, dev_offset);
4031 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 4040 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4032 stripe++; 4041 stripe++;
4033 index++;
4034 } 4042 }
4035 4043
4036 btrfs_set_stack_chunk_length(chunk, chunk_size); 4044 btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -4048,7 +4056,6 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4048 key.offset = chunk_offset; 4056 key.offset = chunk_offset;
4049 4057
4050 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 4058 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4051
4052 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 4059 if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4053 /* 4060 /*
4054 * TODO: Cleanup of inserted chunk root in case of 4061 * TODO: Cleanup of inserted chunk root in case of
@@ -4058,8 +4065,9 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
4058 item_size); 4065 item_size);
4059 } 4066 }
4060 4067
4061out_free: 4068out:
4062 kfree(chunk); 4069 kfree(chunk);
4070 free_extent_map(em);
4063 return ret; 4071 return ret;
4064} 4072}
4065 4073
@@ -4074,27 +4082,9 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4074 struct btrfs_root *extent_root, u64 type) 4082 struct btrfs_root *extent_root, u64 type)
4075{ 4083{
4076 u64 chunk_offset; 4084 u64 chunk_offset;
4077 u64 chunk_size;
4078 u64 stripe_size;
4079 struct map_lookup *map;
4080 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
4081 int ret;
4082
4083 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4084 &chunk_offset);
4085 if (ret)
4086 return ret;
4087 4085
4088 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4086 chunk_offset = find_next_chunk(extent_root->fs_info);
4089 &stripe_size, chunk_offset, type); 4087 return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
4090 if (ret)
4091 return ret;
4092
4093 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4094 chunk_size, stripe_size);
4095 if (ret)
4096 return ret;
4097 return 0;
4098} 4088}
4099 4089
4100static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, 4090static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -4103,66 +4093,31 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4103{ 4093{
4104 u64 chunk_offset; 4094 u64 chunk_offset;
4105 u64 sys_chunk_offset; 4095 u64 sys_chunk_offset;
4106 u64 chunk_size;
4107 u64 sys_chunk_size;
4108 u64 stripe_size;
4109 u64 sys_stripe_size;
4110 u64 alloc_profile; 4096 u64 alloc_profile;
4111 struct map_lookup *map;
4112 struct map_lookup *sys_map;
4113 struct btrfs_fs_info *fs_info = root->fs_info; 4097 struct btrfs_fs_info *fs_info = root->fs_info;
4114 struct btrfs_root *extent_root = fs_info->extent_root; 4098 struct btrfs_root *extent_root = fs_info->extent_root;
4115 int ret; 4099 int ret;
4116 4100
4117 ret = find_next_chunk(fs_info->chunk_root, 4101 chunk_offset = find_next_chunk(fs_info);
4118 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
4119 if (ret)
4120 return ret;
4121
4122 alloc_profile = btrfs_get_alloc_profile(extent_root, 0); 4102 alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
4123 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 4103 ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
4124 &stripe_size, chunk_offset, alloc_profile); 4104 alloc_profile);
4125 if (ret) 4105 if (ret)
4126 return ret; 4106 return ret;
4127 4107
4128 sys_chunk_offset = chunk_offset + chunk_size; 4108 sys_chunk_offset = find_next_chunk(root->fs_info);
4129
4130 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4109 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4131 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 4110 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4132 &sys_chunk_size, &sys_stripe_size, 4111 alloc_profile);
4133 sys_chunk_offset, alloc_profile);
4134 if (ret) { 4112 if (ret) {
4135 btrfs_abort_transaction(trans, root, ret); 4113 btrfs_abort_transaction(trans, root, ret);
4136 goto out; 4114 goto out;
4137 } 4115 }
4138 4116
4139 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4117 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
4140 if (ret) {
4141 btrfs_abort_transaction(trans, root, ret);
4142 goto out;
4143 }
4144
4145 /*
4146 * Modifying chunk tree needs allocating new blocks from both
4147 * system block group and metadata block group. So we only can
4148 * do operations require modifying the chunk tree after both
4149 * block groups were created.
4150 */
4151 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
4152 chunk_size, stripe_size);
4153 if (ret) {
4154 btrfs_abort_transaction(trans, root, ret);
4155 goto out;
4156 }
4157
4158 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
4159 sys_chunk_offset, sys_chunk_size,
4160 sys_stripe_size);
4161 if (ret) 4118 if (ret)
4162 btrfs_abort_transaction(trans, root, ret); 4119 btrfs_abort_transaction(trans, root, ret);
4163
4164out: 4120out:
4165
4166 return ret; 4121 return ret;
4167} 4122}
4168 4123
@@ -4435,9 +4390,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4435 map = (struct map_lookup *)em->bdev; 4390 map = (struct map_lookup *)em->bdev;
4436 offset = logical - em->start; 4391 offset = logical - em->start;
4437 4392
4438 if (mirror_num > map->num_stripes)
4439 mirror_num = 0;
4440
4441 stripe_len = map->stripe_len; 4393 stripe_len = map->stripe_len;
4442 stripe_nr = offset; 4394 stripe_nr = offset;
4443 /* 4395 /*
@@ -5367,7 +5319,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5367 return NULL; 5319 return NULL;
5368 list_add(&device->dev_list, 5320 list_add(&device->dev_list,
5369 &fs_devices->devices); 5321 &fs_devices->devices);
5370 device->dev_root = root->fs_info->dev_root;
5371 device->devid = devid; 5322 device->devid = devid;
5372 device->work.func = pending_bios_fn; 5323 device->work.func = pending_bios_fn;
5373 device->fs_devices = fs_devices; 5324 device->fs_devices = fs_devices;
@@ -5593,7 +5544,6 @@ static int read_one_dev(struct btrfs_root *root,
5593 } 5544 }
5594 5545
5595 fill_device_from_item(leaf, dev_item, device); 5546 fill_device_from_item(leaf, dev_item, device);
5596 device->dev_root = root->fs_info->dev_root;
5597 device->in_fs_metadata = 1; 5547 device->in_fs_metadata = 1;
5598 if (device->writeable && !device->is_tgtdev_for_dev_replace) { 5548 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
5599 device->fs_devices->total_rw_bytes += device->total_bytes; 5549 device->fs_devices->total_rw_bytes += device->total_bytes;
@@ -5751,6 +5701,17 @@ error:
5751 return ret; 5701 return ret;
5752} 5702}
5753 5703
5704void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
5705{
5706 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
5707 struct btrfs_device *device;
5708
5709 mutex_lock(&fs_devices->device_list_mutex);
5710 list_for_each_entry(device, &fs_devices->devices, dev_list)
5711 device->dev_root = fs_info->dev_root;
5712 mutex_unlock(&fs_devices->device_list_mutex);
5713}
5714
5754static void __btrfs_reset_dev_stats(struct btrfs_device *dev) 5715static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
5755{ 5716{
5756 int i; 5717 int i;
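
The net effect of the volumes.c changes above is a two-phase chunk allocation: the caller picks an offset and reserves the mapping in memory, and the on-disk chunk item is written later by btrfs_finish_chunk_alloc(), which re-derives everything it needs from the cached extent map instead of carrying map/size state between calls. A condensed sketch of the resulting flow (not the verbatim committed code; the deferred call site lies outside these hunks):

	/* phase 1: at allocation time, no chunk item is inserted yet */
	chunk_offset = find_next_chunk(fs_info);
	ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
	/* __btrfs_alloc_chunk() records a map_lookup in the extent map tree */

	/* phase 2: later in the transaction, re-read that mapping */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
	read_unlock(&em_tree->lock);
	if (!em)
		return -EINVAL;			/* mapping vanished */
	if (em->start != chunk_offset || em->len != chunk_size) {
		free_extent_map(em);		/* drop the lookup reference */
		return -EINVAL;			/* stale or overlapping entry */
	}

	/* ... and build the chunk item from the cached mapping */
	map = (struct map_lookup *)em->bdev;
	item_size = btrfs_chunk_item_size(map->num_stripes);
	stripe_size = em->orig_block_len;
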
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f6247e2a47f7..86705583480d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -316,11 +316,13 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
316int btrfs_pause_balance(struct btrfs_fs_info *fs_info); 316int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
317int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); 317int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
318int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 318int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
319int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 319int find_free_dev_extent(struct btrfs_trans_handle *trans,
320 struct btrfs_device *device, u64 num_bytes,
320 u64 *start, u64 *max_avail); 321 u64 *start, u64 *max_avail);
321void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); 322void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
322int btrfs_get_dev_stats(struct btrfs_root *root, 323int btrfs_get_dev_stats(struct btrfs_root *root,
323 struct btrfs_ioctl_get_dev_stats *stats); 324 struct btrfs_ioctl_get_dev_stats *stats);
325void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
324int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); 326int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
325int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, 327int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
326 struct btrfs_fs_info *fs_info); 328 struct btrfs_fs_info *fs_info);
@@ -336,6 +338,9 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
336unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 338unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
337 struct btrfs_mapping_tree *map_tree, 339 struct btrfs_mapping_tree *map_tree,
338 u64 logical); 340 u64 logical);
341int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
342 struct btrfs_root *extent_root,
343 u64 chunk_offset, u64 chunk_size);
339static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 344static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
340 int index) 345 int index)
341{ 346{
diff --git a/fs/buffer.c b/fs/buffer.c
index d2a4d1bb2d57..4d7433534f5c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -83,6 +83,40 @@ void unlock_buffer(struct buffer_head *bh)
83EXPORT_SYMBOL(unlock_buffer); 83EXPORT_SYMBOL(unlock_buffer);
84 84
85/* 85/*
 86 * Returns whether the page has dirty or writeback buffers. If all the
 87 * buffers are unlocked and clean then the PageDirty information is stale.
 88 * If any of the buffers are locked, it is assumed they are locked for IO.
89 */
90void buffer_check_dirty_writeback(struct page *page,
91 bool *dirty, bool *writeback)
92{
93 struct buffer_head *head, *bh;
94 *dirty = false;
95 *writeback = false;
96
97 BUG_ON(!PageLocked(page));
98
99 if (!page_has_buffers(page))
100 return;
101
102 if (PageWriteback(page))
103 *writeback = true;
104
105 head = page_buffers(page);
106 bh = head;
107 do {
108 if (buffer_locked(bh))
109 *writeback = true;
110
111 if (buffer_dirty(bh))
112 *dirty = true;
113
114 bh = bh->b_this_page;
115 } while (bh != head);
116}
117EXPORT_SYMBOL(buffer_check_dirty_writeback);
118
119/*
86 * Block until a buffer comes unlocked. This doesn't stop it 120 * Block until a buffer comes unlocked. This doesn't stop it
87 * from becoming locked again - you have to lock it yourself 121 * from becoming locked again - you have to lock it yourself
88 * if you want to preserve its state. 122 * if you want to preserve its state.
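
buffer_check_dirty_writeback() exists so that page reclaim can ask a buffer-backed mapping for a page's effective dirty/writeback state instead of trusting possibly-stale page flags. A minimal sketch of a consumer, assuming the ->is_dirty_writeback address_space operation that this helper is wired into elsewhere in this merge:

	static void page_check_dirty_writeback(struct page *page,
					       bool *dirty, bool *writeback)
	{
		struct address_space *mapping;

		/* default: believe the page flags */
		*dirty = PageDirty(page);
		*writeback = PageWriteback(page);

		/* a buffer-backed mapping may know better */
		mapping = page_mapping(page);
		if (mapping && mapping->a_ops->is_dirty_writeback)
			mapping->a_ops->is_dirty_writeback(page, dirty,
							   writeback);
	}
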
@@ -1454,7 +1488,8 @@ static void discard_buffer(struct buffer_head * bh)
1454 * block_invalidatepage - invalidate part or all of a buffer-backed page 1488 * block_invalidatepage - invalidate part or all of a buffer-backed page
1455 * 1489 *
1456 * @page: the page which is affected 1490 * @page: the page which is affected
1457 * @offset: the index of the truncation point 1491 * @offset: start of the range to invalidate
1492 * @length: length of the range to invalidate
1458 * 1493 *
1459 * block_invalidatepage() is called when all or part of the page has become 1494 * block_invalidatepage() is called when all or part of the page has become
1460 * invalidated by a truncate operation. 1495 * invalidated by a truncate operation.
@@ -1465,15 +1500,22 @@ static void discard_buffer(struct buffer_head * bh)
1465 * point. Because the caller is about to free (and possibly reuse) those 1500 * point. Because the caller is about to free (and possibly reuse) those
1466 * blocks on-disk. 1501 * blocks on-disk.
1467 */ 1502 */
1468void block_invalidatepage(struct page *page, unsigned long offset) 1503void block_invalidatepage(struct page *page, unsigned int offset,
1504 unsigned int length)
1469{ 1505{
1470 struct buffer_head *head, *bh, *next; 1506 struct buffer_head *head, *bh, *next;
1471 unsigned int curr_off = 0; 1507 unsigned int curr_off = 0;
1508 unsigned int stop = length + offset;
1472 1509
1473 BUG_ON(!PageLocked(page)); 1510 BUG_ON(!PageLocked(page));
1474 if (!page_has_buffers(page)) 1511 if (!page_has_buffers(page))
1475 goto out; 1512 goto out;
1476 1513
1514 /*
1515 * Check for overflow
1516 */
1517 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
1518
1477 head = page_buffers(page); 1519 head = page_buffers(page);
1478 bh = head; 1520 bh = head;
1479 do { 1521 do {
@@ -1481,6 +1523,12 @@ void block_invalidatepage(struct page *page, unsigned long offset)
1481 next = bh->b_this_page; 1523 next = bh->b_this_page;
1482 1524
1483 /* 1525 /*
 1526 * Are we still fully in range?
1527 */
1528 if (next_off > stop)
1529 goto out;
1530
1531 /*
1484 * is this block fully invalidated? 1532 * is this block fully invalidated?
1485 */ 1533 */
1486 if (offset <= curr_off) 1534 if (offset <= curr_off)
@@ -1501,6 +1549,7 @@ out:
1501} 1549}
1502EXPORT_SYMBOL(block_invalidatepage); 1550EXPORT_SYMBOL(block_invalidatepage);
1503 1551
1552
1504/* 1553/*
1505 * We attach and possibly dirty the buffers atomically wrt 1554 * We attach and possibly dirty the buffers atomically wrt
1506 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers 1555 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
@@ -2841,7 +2890,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2841 * they may have been added in ext3_writepage(). Make them 2890 * they may have been added in ext3_writepage(). Make them
2842 * freeable here, so the page does not leak. 2891 * freeable here, so the page does not leak.
2843 */ 2892 */
2844 do_invalidatepage(page, 0); 2893 do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
2845 unlock_page(page); 2894 unlock_page(page);
2846 return 0; /* don't care */ 2895 return 0; /* don't care */
2847 } 2896 }
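
With invalidatepage now taking an (offset, length) pair, an implementation can distinguish a partial punch from the whole-page invalidation that releases per-page private state; the ceph conversion further down does exactly this. A minimal buffer-backed implementation under the new signature (the function name is illustrative):

	static void example_invalidatepage(struct page *page,
					   unsigned int offset,
					   unsigned int length)
	{
		/* only a full-page invalidation may drop per-page state */
		if (offset == 0 && length == PAGE_CACHE_SIZE)
			ClearPageChecked(page);

		/* discard buffers inside [offset, offset + length) */
		block_invalidatepage(page, offset, length);
	}
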
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 746ce532e130..d4c1206af9fc 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -13,8 +13,6 @@
13#include <linux/mount.h> 13#include <linux/mount.h>
14#include "internal.h" 14#include "internal.h"
15 15
16#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
17
18struct cachefiles_lookup_data { 16struct cachefiles_lookup_data {
19 struct cachefiles_xattr *auxdata; /* auxiliary data */ 17 struct cachefiles_xattr *auxdata; /* auxiliary data */
20 char *key; /* key path */ 18 char *key; /* key path */
@@ -212,20 +210,29 @@ static void cachefiles_update_object(struct fscache_object *_object)
212 object = container_of(_object, struct cachefiles_object, fscache); 210 object = container_of(_object, struct cachefiles_object, fscache);
213 cache = container_of(object->fscache.cache, struct cachefiles_cache, 211 cache = container_of(object->fscache.cache, struct cachefiles_cache,
214 cache); 212 cache);
213
214 if (!fscache_use_cookie(_object)) {
215 _leave(" [relinq]");
216 return;
217 }
218
215 cookie = object->fscache.cookie; 219 cookie = object->fscache.cookie;
216 220
217 if (!cookie->def->get_aux) { 221 if (!cookie->def->get_aux) {
222 fscache_unuse_cookie(_object);
218 _leave(" [no aux]"); 223 _leave(" [no aux]");
219 return; 224 return;
220 } 225 }
221 226
222 auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp); 227 auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
223 if (!auxdata) { 228 if (!auxdata) {
229 fscache_unuse_cookie(_object);
224 _leave(" [nomem]"); 230 _leave(" [nomem]");
225 return; 231 return;
226 } 232 }
227 233
228 auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511); 234 auxlen = cookie->def->get_aux(cookie->netfs_data, auxdata->data, 511);
235 fscache_unuse_cookie(_object);
229 ASSERTCMP(auxlen, <, 511); 236 ASSERTCMP(auxlen, <, 511);
230 237
231 auxdata->len = auxlen + 1; 238 auxdata->len = auxlen + 1;
@@ -263,7 +270,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
263#endif 270#endif
264 271
265 /* delete retired objects */ 272 /* delete retired objects */
266 if (object->fscache.state == FSCACHE_OBJECT_RECYCLING && 273 if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
267 _object != cache->cache.fsdef 274 _object != cache->cache.fsdef
268 ) { 275 ) {
269 _debug("- retire object OBJ%x", object->fscache.debug_id); 276 _debug("- retire object OBJ%x", object->fscache.debug_id);
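
The shape of the cachefiles_update_object() fix is a pin/unpin bracket around every cookie dereference: fscache_use_cookie() fails if the cookie is already being relinquished, and each early return must pair with fscache_unuse_cookie(). Reduced to its skeleton:

	if (!fscache_use_cookie(_object))
		return;				/* cookie going away */

	cookie = object->fscache.cookie;
	if (!cookie->def->get_aux) {
		fscache_unuse_cookie(_object);	/* drop pin on early exit */
		return;
	}

	/* ... safe to use the cookie here ... */
	fscache_unuse_cookie(_object);		/* drop pin on success path */
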
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 8c01c5fcdf75..25badd1aec5c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -38,7 +38,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
38 printk(KERN_ERR "%sobject: OBJ%x\n", 38 printk(KERN_ERR "%sobject: OBJ%x\n",
39 prefix, object->fscache.debug_id); 39 prefix, object->fscache.debug_id);
40 printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", 40 printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
41 prefix, fscache_object_states[object->fscache.state], 41 prefix, object->fscache.state->name,
42 object->fscache.flags, work_busy(&object->fscache.work), 42 object->fscache.flags, work_busy(&object->fscache.work),
43 object->fscache.events, object->fscache.event_mask); 43 object->fscache.events, object->fscache.event_mask);
44 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", 44 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -127,10 +127,10 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
127found_dentry: 127found_dentry:
128 kdebug("preemptive burial: OBJ%x [%s] %p", 128 kdebug("preemptive burial: OBJ%x [%s] %p",
129 object->fscache.debug_id, 129 object->fscache.debug_id,
130 fscache_object_states[object->fscache.state], 130 object->fscache.state->name,
131 dentry); 131 dentry);
132 132
133 if (object->fscache.state < FSCACHE_OBJECT_DYING) { 133 if (fscache_object_is_live(&object->fscache)) {
134 printk(KERN_ERR "\n"); 134 printk(KERN_ERR "\n");
135 printk(KERN_ERR "CacheFiles: Error:" 135 printk(KERN_ERR "CacheFiles: Error:"
136 " Can't preemptively bury live object\n"); 136 " Can't preemptively bury live object\n");
@@ -192,7 +192,7 @@ try_again:
192 /* an old object from a previous incarnation is hogging the slot - we 192 /* an old object from a previous incarnation is hogging the slot - we
193 * need to wait for it to be destroyed */ 193 * need to wait for it to be destroyed */
194wait_for_old_object: 194wait_for_old_object:
195 if (xobject->fscache.state < FSCACHE_OBJECT_DYING) { 195 if (fscache_object_is_live(&object->fscache)) {
196 printk(KERN_ERR "\n"); 196 printk(KERN_ERR "\n");
197 printk(KERN_ERR "CacheFiles: Error:" 197 printk(KERN_ERR "CacheFiles: Error:"
198 " Unexpected object collision\n"); 198 " Unexpected object collision\n");
@@ -836,7 +836,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
836 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); 836 // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename);
837 837
838 /* look up the victim */ 838 /* look up the victim */
839 mutex_lock_nested(&dir->d_inode->i_mutex, 1); 839 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
840 840
841 start = jiffies; 841 start = jiffies;
842 victim = lookup_one_len(filename, dir, strlen(filename)); 842 victim = lookup_one_len(filename, dir, strlen(filename));
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 317f9ee9c991..ebaff368120d 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -12,6 +12,7 @@
12#include <linux/mount.h> 12#include <linux/mount.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/swap.h>
15#include "internal.h" 16#include "internal.h"
16 17
17/* 18/*
@@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
227 */ 228 */
228static int cachefiles_read_backing_file_one(struct cachefiles_object *object, 229static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
229 struct fscache_retrieval *op, 230 struct fscache_retrieval *op,
230 struct page *netpage, 231 struct page *netpage)
231 struct pagevec *pagevec)
232{ 232{
233 struct cachefiles_one_read *monitor; 233 struct cachefiles_one_read *monitor;
234 struct address_space *bmapping; 234 struct address_space *bmapping;
@@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
237 237
238 _enter(""); 238 _enter("");
239 239
240 pagevec_reinit(pagevec);
241
242 _debug("read back %p{%lu,%d}", 240 _debug("read back %p{%lu,%d}",
243 netpage, netpage->index, page_count(netpage)); 241 netpage, netpage->index, page_count(netpage));
244 242
@@ -283,9 +281,7 @@ installed_new_backing_page:
283 backpage = newpage; 281 backpage = newpage;
284 newpage = NULL; 282 newpage = NULL;
285 283
286 page_cache_get(backpage); 284 lru_cache_add_file(backpage);
287 pagevec_add(pagevec, backpage);
288 __pagevec_lru_add_file(pagevec);
289 285
290read_backing_page: 286read_backing_page:
291 ret = bmapping->a_ops->readpage(NULL, backpage); 287 ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
452 if (block) { 448 if (block) {
453 /* submit the apparently valid page to the backing fs to be 449 /* submit the apparently valid page to the backing fs to be
454 * read from disk */ 450 * read from disk */
455 ret = cachefiles_read_backing_file_one(object, op, page, 451 ret = cachefiles_read_backing_file_one(object, op, page);
456 &pagevec);
457 } else if (cachefiles_has_space(cache, 0, 1) == 0) { 452 } else if (cachefiles_has_space(cache, 0, 1) == 0) {
458 /* there's space in the cache we can use */ 453 /* there's space in the cache we can use */
459 fscache_mark_page_cached(op, page); 454 fscache_mark_page_cached(op, page);
@@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
482{ 477{
483 struct cachefiles_one_read *monitor = NULL; 478 struct cachefiles_one_read *monitor = NULL;
484 struct address_space *bmapping = object->backer->d_inode->i_mapping; 479 struct address_space *bmapping = object->backer->d_inode->i_mapping;
485 struct pagevec lru_pvec;
486 struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; 480 struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
487 int ret = 0; 481 int ret = 0;
488 482
489 _enter(""); 483 _enter("");
490 484
491 pagevec_init(&lru_pvec, 0);
492
493 list_for_each_entry_safe(netpage, _n, list, lru) { 485 list_for_each_entry_safe(netpage, _n, list, lru) {
494 list_del(&netpage->lru); 486 list_del(&netpage->lru);
495 487
@@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
534 backpage = newpage; 526 backpage = newpage;
535 newpage = NULL; 527 newpage = NULL;
536 528
537 page_cache_get(backpage); 529 lru_cache_add_file(backpage);
538 if (!pagevec_add(&lru_pvec, backpage))
539 __pagevec_lru_add_file(&lru_pvec);
540 530
541 reread_backing_page: 531 reread_backing_page:
542 ret = bmapping->a_ops->readpage(NULL, backpage); 532 ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
559 goto nomem; 549 goto nomem;
560 } 550 }
561 551
562 page_cache_get(netpage); 552 lru_cache_add_file(netpage);
563 if (!pagevec_add(&lru_pvec, netpage))
564 __pagevec_lru_add_file(&lru_pvec);
565 553
566 /* install a monitor */ 554 /* install a monitor */
567 page_cache_get(netpage); 555 page_cache_get(netpage);
@@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
643 631
644 fscache_mark_page_cached(op, netpage); 632 fscache_mark_page_cached(op, netpage);
645 633
646 page_cache_get(netpage); 634 lru_cache_add_file(netpage);
647 if (!pagevec_add(&lru_pvec, netpage))
648 __pagevec_lru_add_file(&lru_pvec);
649 635
650 /* the netpage is unlocked and marked up to date here */ 636 /* the netpage is unlocked and marked up to date here */
651 fscache_end_io(op, netpage, 0); 637 fscache_end_io(op, netpage, 0);
@@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
661 647
662out: 648out:
663 /* tidy up */ 649 /* tidy up */
664 pagevec_lru_add_file(&lru_pvec);
665
666 if (newpage) 650 if (newpage)
667 page_cache_release(newpage); 651 page_cache_release(newpage);
668 if (netpage) 652 if (netpage)
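
All of the rdwr.c hunks above replace the same idiom: open-coded pagevec batching, where the caller takes an extra page reference, queues pages locally, and must remember a final flush on every exit path, with a single lru_cache_add_file() call that handles referencing and batching internally. Before and after, schematically:

	/* before: reference, batch, and flush by hand */
	page_cache_get(page);
	if (!pagevec_add(&lru_pvec, page))
		__pagevec_lru_add_file(&lru_pvec);
	/* ... and on every exit path: */
	pagevec_lru_add_file(&lru_pvec);

	/* after: one call, nothing to flush or leak */
	lru_cache_add_file(page);
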
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 73b46288b54b..2476e5162609 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -109,13 +109,12 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object,
109 struct dentry *dentry = object->dentry; 109 struct dentry *dentry = object->dentry;
110 int ret; 110 int ret;
111 111
112 ASSERT(object->fscache.cookie);
113 ASSERT(dentry); 112 ASSERT(dentry);
114 113
115 _enter("%p,#%d", object, auxdata->len); 114 _enter("%p,#%d", object, auxdata->len);
116 115
117 /* attempt to install the cache metadata directly */ 116 /* attempt to install the cache metadata directly */
118 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); 117 _debug("SET #%u", auxdata->len);
119 118
120 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, 119 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
121 &auxdata->type, auxdata->len, 120 &auxdata->type, auxdata->len,
@@ -138,13 +137,12 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object,
138 struct dentry *dentry = object->dentry; 137 struct dentry *dentry = object->dentry;
139 int ret; 138 int ret;
140 139
141 ASSERT(object->fscache.cookie);
142 ASSERT(dentry); 140 ASSERT(dentry);
143 141
144 _enter("%p,#%d", object, auxdata->len); 142 _enter("%p,#%d", object, auxdata->len);
145 143
146 /* attempt to install the cache metadata directly */ 144 /* attempt to install the cache metadata directly */
147 _debug("SET %s #%u", object->fscache.cookie->def->name, auxdata->len); 145 _debug("SET #%u", auxdata->len);
148 146
149 ret = vfs_setxattr(dentry, cachefiles_xattr_cache, 147 ret = vfs_setxattr(dentry, cachefiles_xattr_cache,
150 &auxdata->type, auxdata->len, 148 &auxdata->type, auxdata->len,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f1d6c60ab229..722585cd5c7e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,8 @@ static int ceph_set_page_dirty(struct page *page)
143 * dirty page counters appropriately. Only called if there is private 143 * dirty page counters appropriately. Only called if there is private
144 * data on the page. 144 * data on the page.
145 */ 145 */
146static void ceph_invalidatepage(struct page *page, unsigned long offset) 146static void ceph_invalidatepage(struct page *page, unsigned int offset,
147 unsigned int length)
147{ 148{
148 struct inode *inode; 149 struct inode *inode;
149 struct ceph_inode_info *ci; 150 struct ceph_inode_info *ci;
@@ -159,20 +160,20 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
159 if (!PageDirty(page)) 160 if (!PageDirty(page))
160 pr_err("%p invalidatepage %p page not dirty\n", inode, page); 161 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
161 162
162 if (offset == 0) 163 if (offset == 0 && length == PAGE_CACHE_SIZE)
163 ClearPageChecked(page); 164 ClearPageChecked(page);
164 165
165 ci = ceph_inode(inode); 166 ci = ceph_inode(inode);
166 if (offset == 0) { 167 if (offset == 0 && length == PAGE_CACHE_SIZE) {
167 dout("%p invalidatepage %p idx %lu full dirty page %lu\n", 168 dout("%p invalidatepage %p idx %lu full dirty page\n",
168 inode, page, page->index, offset); 169 inode, page, page->index);
169 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 170 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
170 ceph_put_snap_context(snapc); 171 ceph_put_snap_context(snapc);
171 page->private = 0; 172 page->private = 0;
172 ClearPagePrivate(page); 173 ClearPagePrivate(page);
173 } else { 174 } else {
174 dout("%p invalidatepage %p idx %lu partial dirty page\n", 175 dout("%p invalidatepage %p idx %lu partial dirty page %u(%u)\n",
175 inode, page, page->index); 176 inode, page, page->index, offset, length);
176 } 177 }
177} 178}
178 179
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0e4da4a9c213..868b61d56cac 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -111,11 +111,10 @@ static unsigned fpos_off(loff_t p)
111 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by 111 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
112 * the MDS if/when the directory is modified). 112 * the MDS if/when the directory is modified).
113 */ 113 */
114static int __dcache_readdir(struct file *filp, 114static int __dcache_readdir(struct file *file, struct dir_context *ctx)
115 void *dirent, filldir_t filldir)
116{ 115{
117 struct ceph_file_info *fi = filp->private_data; 116 struct ceph_file_info *fi = file->private_data;
118 struct dentry *parent = filp->f_dentry; 117 struct dentry *parent = file->f_dentry;
119 struct inode *dir = parent->d_inode; 118 struct inode *dir = parent->d_inode;
120 struct list_head *p; 119 struct list_head *p;
121 struct dentry *dentry, *last; 120 struct dentry *dentry, *last;
@@ -126,14 +125,14 @@ static int __dcache_readdir(struct file *filp,
126 last = fi->dentry; 125 last = fi->dentry;
127 fi->dentry = NULL; 126 fi->dentry = NULL;
128 127
129 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, 128 dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
130 last); 129 last);
131 130
132 spin_lock(&parent->d_lock); 131 spin_lock(&parent->d_lock);
133 132
134 /* start at beginning? */ 133 /* start at beginning? */
135 if (filp->f_pos == 2 || last == NULL || 134 if (ctx->pos == 2 || last == NULL ||
136 filp->f_pos < ceph_dentry(last)->offset) { 135 ctx->pos < ceph_dentry(last)->offset) {
137 if (list_empty(&parent->d_subdirs)) 136 if (list_empty(&parent->d_subdirs))
138 goto out_unlock; 137 goto out_unlock;
139 p = parent->d_subdirs.prev; 138 p = parent->d_subdirs.prev;
@@ -157,11 +156,11 @@ more:
157 if (!d_unhashed(dentry) && dentry->d_inode && 156 if (!d_unhashed(dentry) && dentry->d_inode &&
158 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 157 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
159 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 158 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
160 filp->f_pos <= di->offset) 159 ctx->pos <= di->offset)
161 break; 160 break;
162 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, 161 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
163 dentry->d_name.len, dentry->d_name.name, di->offset, 162 dentry->d_name.len, dentry->d_name.name, di->offset,
164 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", 163 ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
165 !dentry->d_inode ? " null" : ""); 164 !dentry->d_inode ? " null" : "");
166 spin_unlock(&dentry->d_lock); 165 spin_unlock(&dentry->d_lock);
167 p = p->prev; 166 p = p->prev;
@@ -173,29 +172,27 @@ more:
173 spin_unlock(&dentry->d_lock); 172 spin_unlock(&dentry->d_lock);
174 spin_unlock(&parent->d_lock); 173 spin_unlock(&parent->d_lock);
175 174
176 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 175 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
177 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 176 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
178 filp->f_pos = di->offset; 177 ctx->pos = di->offset;
179 err = filldir(dirent, dentry->d_name.name, 178 if (!dir_emit(ctx, dentry->d_name.name,
180 dentry->d_name.len, di->offset, 179 dentry->d_name.len,
181 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), 180 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
182 dentry->d_inode->i_mode >> 12); 181 dentry->d_inode->i_mode >> 12)) {
183 182 if (last) {
184 if (last) {
185 if (err < 0) {
186 /* remember our position */ 183 /* remember our position */
187 fi->dentry = last; 184 fi->dentry = last;
188 fi->next_offset = di->offset; 185 fi->next_offset = di->offset;
189 } else {
190 dput(last);
191 } 186 }
187 dput(dentry);
188 return 0;
192 } 189 }
193 last = dentry;
194 190
195 if (err < 0) 191 if (last)
196 goto out; 192 dput(last);
193 last = dentry;
197 194
198 filp->f_pos++; 195 ctx->pos++;
199 196
200 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 197 /* make sure a dentry wasn't dropped while we didn't have parent lock */
201 if (!ceph_dir_is_complete(dir)) { 198 if (!ceph_dir_is_complete(dir)) {
@@ -235,59 +232,59 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
235 return 0; 232 return 0;
236} 233}
237 234
238static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) 235static int ceph_readdir(struct file *file, struct dir_context *ctx)
239{ 236{
240 struct ceph_file_info *fi = filp->private_data; 237 struct ceph_file_info *fi = file->private_data;
241 struct inode *inode = file_inode(filp); 238 struct inode *inode = file_inode(file);
242 struct ceph_inode_info *ci = ceph_inode(inode); 239 struct ceph_inode_info *ci = ceph_inode(inode);
243 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 240 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
244 struct ceph_mds_client *mdsc = fsc->mdsc; 241 struct ceph_mds_client *mdsc = fsc->mdsc;
245 unsigned frag = fpos_frag(filp->f_pos); 242 unsigned frag = fpos_frag(ctx->pos);
246 int off = fpos_off(filp->f_pos); 243 int off = fpos_off(ctx->pos);
247 int err; 244 int err;
248 u32 ftype; 245 u32 ftype;
249 struct ceph_mds_reply_info_parsed *rinfo; 246 struct ceph_mds_reply_info_parsed *rinfo;
250 const int max_entries = fsc->mount_options->max_readdir; 247 const int max_entries = fsc->mount_options->max_readdir;
251 const int max_bytes = fsc->mount_options->max_readdir_bytes; 248 const int max_bytes = fsc->mount_options->max_readdir_bytes;
252 249
253 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 250 dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
254 if (fi->flags & CEPH_F_ATEND) 251 if (fi->flags & CEPH_F_ATEND)
255 return 0; 252 return 0;
256 253
257 /* always start with . and .. */ 254 /* always start with . and .. */
258 if (filp->f_pos == 0) { 255 if (ctx->pos == 0) {
259 /* note dir version at start of readdir so we can tell 256 /* note dir version at start of readdir so we can tell
260 * if any dentries get dropped */ 257 * if any dentries get dropped */
261 fi->dir_release_count = atomic_read(&ci->i_release_count); 258 fi->dir_release_count = atomic_read(&ci->i_release_count);
262 259
263 dout("readdir off 0 -> '.'\n"); 260 dout("readdir off 0 -> '.'\n");
264 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), 261 if (!dir_emit(ctx, ".", 1,
265 ceph_translate_ino(inode->i_sb, inode->i_ino), 262 ceph_translate_ino(inode->i_sb, inode->i_ino),
266 inode->i_mode >> 12) < 0) 263 inode->i_mode >> 12))
267 return 0; 264 return 0;
268 filp->f_pos = 1; 265 ctx->pos = 1;
269 off = 1; 266 off = 1;
270 } 267 }
271 if (filp->f_pos == 1) { 268 if (ctx->pos == 1) {
272 ino_t ino = parent_ino(filp->f_dentry); 269 ino_t ino = parent_ino(file->f_dentry);
273 dout("readdir off 1 -> '..'\n"); 270 dout("readdir off 1 -> '..'\n");
274 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), 271 if (!dir_emit(ctx, "..", 2,
275 ceph_translate_ino(inode->i_sb, ino), 272 ceph_translate_ino(inode->i_sb, ino),
276 inode->i_mode >> 12) < 0) 273 inode->i_mode >> 12))
277 return 0; 274 return 0;
278 filp->f_pos = 2; 275 ctx->pos = 2;
279 off = 2; 276 off = 2;
280 } 277 }
281 278
282 /* can we use the dcache? */ 279 /* can we use the dcache? */
283 spin_lock(&ci->i_ceph_lock); 280 spin_lock(&ci->i_ceph_lock);
284 if ((filp->f_pos == 2 || fi->dentry) && 281 if ((ctx->pos == 2 || fi->dentry) &&
285 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 282 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
286 ceph_snap(inode) != CEPH_SNAPDIR && 283 ceph_snap(inode) != CEPH_SNAPDIR &&
287 __ceph_dir_is_complete(ci) && 284 __ceph_dir_is_complete(ci) &&
288 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 285 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
289 spin_unlock(&ci->i_ceph_lock); 286 spin_unlock(&ci->i_ceph_lock);
290 err = __dcache_readdir(filp, dirent, filldir); 287 err = __dcache_readdir(file, ctx);
291 if (err != -EAGAIN) 288 if (err != -EAGAIN)
292 return err; 289 return err;
293 } else { 290 } else {
@@ -327,7 +324,7 @@ more:
327 return PTR_ERR(req); 324 return PTR_ERR(req);
328 req->r_inode = inode; 325 req->r_inode = inode;
329 ihold(inode); 326 ihold(inode);
330 req->r_dentry = dget(filp->f_dentry); 327 req->r_dentry = dget(file->f_dentry);
331 /* hints to request -> mds selection code */ 328 /* hints to request -> mds selection code */
332 req->r_direct_mode = USE_AUTH_MDS; 329 req->r_direct_mode = USE_AUTH_MDS;
333 req->r_direct_hash = ceph_frag_value(frag); 330 req->r_direct_hash = ceph_frag_value(frag);
@@ -379,15 +376,16 @@ more:
379 rinfo = &fi->last_readdir->r_reply_info; 376 rinfo = &fi->last_readdir->r_reply_info;
380 dout("readdir frag %x num %d off %d chunkoff %d\n", frag, 377 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
381 rinfo->dir_nr, off, fi->offset); 378 rinfo->dir_nr, off, fi->offset);
379
380 ctx->pos = ceph_make_fpos(frag, off);
382 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { 381 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
383 u64 pos = ceph_make_fpos(frag, off);
384 struct ceph_mds_reply_inode *in = 382 struct ceph_mds_reply_inode *in =
385 rinfo->dir_in[off - fi->offset].in; 383 rinfo->dir_in[off - fi->offset].in;
386 struct ceph_vino vino; 384 struct ceph_vino vino;
387 ino_t ino; 385 ino_t ino;
388 386
389 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 387 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
390 off, off - fi->offset, rinfo->dir_nr, pos, 388 off, off - fi->offset, rinfo->dir_nr, ctx->pos,
391 rinfo->dir_dname_len[off - fi->offset], 389 rinfo->dir_dname_len[off - fi->offset],
392 rinfo->dir_dname[off - fi->offset], in); 390 rinfo->dir_dname[off - fi->offset], in);
393 BUG_ON(!in); 391 BUG_ON(!in);
@@ -395,16 +393,15 @@ more:
395 vino.ino = le64_to_cpu(in->ino); 393 vino.ino = le64_to_cpu(in->ino);
396 vino.snap = le64_to_cpu(in->snapid); 394 vino.snap = le64_to_cpu(in->snapid);
397 ino = ceph_vino_to_ino(vino); 395 ino = ceph_vino_to_ino(vino);
398 if (filldir(dirent, 396 if (!dir_emit(ctx,
399 rinfo->dir_dname[off - fi->offset], 397 rinfo->dir_dname[off - fi->offset],
400 rinfo->dir_dname_len[off - fi->offset], 398 rinfo->dir_dname_len[off - fi->offset],
401 pos, 399 ceph_translate_ino(inode->i_sb, ino), ftype)) {
402 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
403 dout("filldir stopping us...\n"); 400 dout("filldir stopping us...\n");
404 return 0; 401 return 0;
405 } 402 }
406 off++; 403 off++;
407 filp->f_pos = pos + 1; 404 ctx->pos++;
408 } 405 }
409 406
410 if (fi->last_name) { 407 if (fi->last_name) {
@@ -417,7 +414,7 @@ more:
417 if (!ceph_frag_is_rightmost(frag)) { 414 if (!ceph_frag_is_rightmost(frag)) {
418 frag = ceph_frag_next(frag); 415 frag = ceph_frag_next(frag);
419 off = 0; 416 off = 0;
420 filp->f_pos = ceph_make_fpos(frag, off); 417 ctx->pos = ceph_make_fpos(frag, off);
421 dout("readdir next frag is %x\n", frag); 418 dout("readdir next frag is %x\n", frag);
422 goto more; 419 goto more;
423 } 420 }
@@ -432,11 +429,11 @@ more:
432 if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { 429 if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
433 dout(" marking %p complete\n", inode); 430 dout(" marking %p complete\n", inode);
434 __ceph_dir_set_complete(ci, fi->dir_release_count); 431 __ceph_dir_set_complete(ci, fi->dir_release_count);
435 ci->i_max_offset = filp->f_pos; 432 ci->i_max_offset = ctx->pos;
436 } 433 }
437 spin_unlock(&ci->i_ceph_lock); 434 spin_unlock(&ci->i_ceph_lock);
438 435
439 dout("readdir %p filp %p done.\n", inode, filp); 436 dout("readdir %p file %p done.\n", inode, file);
440 return 0; 437 return 0;
441} 438}
442 439
@@ -1270,7 +1267,7 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
1270 1267
1271const struct file_operations ceph_dir_fops = { 1268const struct file_operations ceph_dir_fops = {
1272 .read = ceph_read_dir, 1269 .read = ceph_read_dir,
1273 .readdir = ceph_readdir, 1270 .iterate = ceph_readdir,
1274 .llseek = ceph_dir_llseek, 1271 .llseek = ceph_dir_llseek,
1275 .open = ceph_open, 1272 .open = ceph_open,
1276 .release = ceph_release, 1273 .release = ceph_release,
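
The ceph conversion follows the standard 3.11 pattern for moving from ->readdir() to ->iterate(): the (dirent, filldir) pair becomes a struct dir_context, ctx->pos supersedes file->f_pos, and dir_emit() returns false (rather than a negative error) when the destination buffer is full. A minimal ->iterate() in that style, for illustration only (ceph itself emits "." and ".." by hand):

	static int example_iterate(struct file *file, struct dir_context *ctx)
	{
		/* emits "." and ".." as needed and leaves ctx->pos at 2 */
		if (!dir_emit_dots(file, ctx))
			return 0;

		if (ctx->pos == 2) {
			/* false means the caller's buffer is full: stop */
			if (!dir_emit(ctx, "example", 7, 1234, DT_REG))
				return 0;
			ctx->pos++;
		}
		return 0;
	}
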
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a17ffe4ec3ca..bc0735498d29 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -861,16 +861,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
861 break; 861 break;
862 } 862 }
863 863
864 if (offset < 0 || offset > inode->i_sb->s_maxbytes) { 864 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
865 offset = -EINVAL;
866 goto out;
867 }
868
869 /* Special lock needed here? */
870 if (offset != file->f_pos) {
871 file->f_pos = offset;
872 file->f_version = 0;
873 }
874 865
875out: 866out:
876 mutex_unlock(&inode->i_mutex); 867 mutex_unlock(&inode->i_mutex);
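
vfs_setpos(), added in the same cycle, centralizes what ceph_llseek and many other llseek implementations used to open-code. Its behaviour is essentially the deleted lines above; as a sketch of the helper's logic (not its exact source):

	if (offset < 0 || offset > maxsize)
		return -EINVAL;		/* out of range for this fs */

	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;	/* force cached state revalidation */
	}
	return offset;
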
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 3b0abed667c2..98b6e50bde04 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -911,8 +911,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
911 } else if (realdn) { 911 } else if (realdn) {
912 dout("dn %p (%d) spliced with %p (%d) " 912 dout("dn %p (%d) spliced with %p (%d) "
913 "inode %p ino %llx.%llx\n", 913 "inode %p ino %llx.%llx\n",
914 dn, dn->d_count, 914 dn, d_count(dn),
915 realdn, realdn->d_count, 915 realdn, d_count(realdn),
916 realdn->d_inode, ceph_vinop(realdn->d_inode)); 916 realdn->d_inode, ceph_vinop(realdn->d_inode));
917 dput(dn); 917 dput(dn);
918 dn = realdn; 918 dn = realdn;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 89788515a63d..ae6d14e82b0f 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -192,7 +192,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
192 192
193/** 193/**
194 * Encode the flock and fcntl locks for the given inode into the ceph_filelock 194 * Encode the flock and fcntl locks for the given inode into the ceph_filelock
195 * array. Must be called with lock_flocks() already held. 195 * array. Must be called with inode->i_lock already held.
196 * If we encounter more of a specific lock type than expected, return -ENOSPC. 196 * If we encounter more of a specific lock type than expected, return -ENOSPC.
197 */ 197 */
198int ceph_encode_locks_to_buffer(struct inode *inode, 198int ceph_encode_locks_to_buffer(struct inode *inode,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index cbf08203e00d..603786b564be 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1588,7 +1588,7 @@ retry:
1588 *base = ceph_ino(temp->d_inode); 1588 *base = ceph_ino(temp->d_inode);
1589 *plen = len; 1589 *plen = len;
1590 dout("build_path on %p %d built %llx '%.*s'\n", 1590 dout("build_path on %p %d built %llx '%.*s'\n",
1591 dentry, dentry->d_count, *base, len, path); 1591 dentry, d_count(dentry), *base, len, path);
1592 return path; 1592 return path;
1593} 1593}
1594 1594
@@ -2517,20 +2517,20 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2517 struct ceph_filelock *flocks; 2517 struct ceph_filelock *flocks;
2518 2518
2519encode_again: 2519encode_again:
2520 lock_flocks(); 2520 spin_lock(&inode->i_lock);
2521 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2521 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2522 unlock_flocks(); 2522 spin_unlock(&inode->i_lock);
2523 flocks = kmalloc((num_fcntl_locks+num_flock_locks) * 2523 flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
2524 sizeof(struct ceph_filelock), GFP_NOFS); 2524 sizeof(struct ceph_filelock), GFP_NOFS);
2525 if (!flocks) { 2525 if (!flocks) {
2526 err = -ENOMEM; 2526 err = -ENOMEM;
2527 goto out_free; 2527 goto out_free;
2528 } 2528 }
2529 lock_flocks(); 2529 spin_lock(&inode->i_lock);
2530 err = ceph_encode_locks_to_buffer(inode, flocks, 2530 err = ceph_encode_locks_to_buffer(inode, flocks,
2531 num_fcntl_locks, 2531 num_fcntl_locks,
2532 num_flock_locks); 2532 num_flock_locks);
2533 unlock_flocks(); 2533 spin_unlock(&inode->i_lock);
2534 if (err) { 2534 if (err) {
2535 kfree(flocks); 2535 kfree(flocks);
2536 if (err == -ENOSPC) 2536 if (err == -ENOSPC)
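
Replacing the global lock_flocks() with inode->i_lock keeps encode_caps_cb()'s count/alloc/retry loop intact: the lock counts are sampled under the spinlock, the buffer is allocated with GFP_NOFS outside it, and -ENOSPC from the encoder means locks were added in the window, so the whole sequence repeats. The loop, reduced:

	encode_again:
		spin_lock(&inode->i_lock);
		ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
		spin_unlock(&inode->i_lock);

		/* cannot allocate while holding a spinlock */
		flocks = kmalloc((num_fcntl_locks + num_flock_locks) *
				 sizeof(struct ceph_filelock), GFP_NOFS);
		if (!flocks)
			return -ENOMEM;

		spin_lock(&inode->i_lock);
		err = ceph_encode_locks_to_buffer(inode, flocks,
						  num_fcntl_locks,
						  num_flock_locks);
		spin_unlock(&inode->i_lock);
		if (err == -ENOSPC) {
			kfree(flocks);	/* more locks appeared: resize */
			goto encode_again;
		}
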
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2906ee276408..603f18a65c12 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -10,6 +10,7 @@ config CIFS
10 select CRYPTO_ECB 10 select CRYPTO_ECB
11 select CRYPTO_DES 11 select CRYPTO_DES
12 select CRYPTO_SHA256 12 select CRYPTO_SHA256
13 select CRYPTO_CMAC
13 help 14 help
14 This is the client VFS module for the Common Internet File System 15 This is the client VFS module for the Common Internet File System
15 (CIFS) protocol which is the successor to the Server Message Block 16 (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index d59748346020..f3ac4154cbb6 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -213,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
213 tcon->nativeFileSystem); 213 tcon->nativeFileSystem);
214 } 214 }
215 seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" 215 seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
216 "\nPathComponentMax: %d Status: 0x%d", 216 "\n\tPathComponentMax: %d Status: 0x%d",
217 le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), 217 le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
218 le32_to_cpu(tcon->fsAttrInfo.Attributes), 218 le32_to_cpu(tcon->fsAttrInfo.Attributes),
219 le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), 219 le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
@@ -224,6 +224,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
224 seq_puts(m, " type: CDROM "); 224 seq_puts(m, " type: CDROM ");
225 else 225 else
226 seq_printf(m, " type: %d ", dev_type); 226 seq_printf(m, " type: %d ", dev_type);
227 if (server->ops->dump_share_caps)
228 server->ops->dump_share_caps(m, tcon);
227 229
228 if (tcon->need_reconnect) 230 if (tcon->need_reconnect)
229 seq_puts(m, "\tDISCONNECTED "); 231 seq_puts(m, "\tDISCONNECTED ");
@@ -595,9 +597,36 @@ static int cifs_security_flags_proc_open(struct inode *inode, struct file *file)
595 return single_open(file, cifs_security_flags_proc_show, NULL); 597 return single_open(file, cifs_security_flags_proc_show, NULL);
596} 598}
597 599
600/*
 601 * Ensure that if someone sets a MUST flag, we disable all other MAY
 602 * flags except for the ones corresponding to the given MUST flag. If there
 603 * are multiple MUST flags, prefer the more secure ones.
604 */
605static void
606cifs_security_flags_handle_must_flags(unsigned int *flags)
607{
608 unsigned int signflags = *flags & CIFSSEC_MUST_SIGN;
609
610 if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
611 *flags = CIFSSEC_MUST_KRB5;
612 else if ((*flags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
613 *flags = CIFSSEC_MUST_NTLMSSP;
614 else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
615 *flags = CIFSSEC_MUST_NTLMV2;
616 else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
617 *flags = CIFSSEC_MUST_NTLM;
618 else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
619 *flags = CIFSSEC_MUST_LANMAN;
620 else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
621 *flags = CIFSSEC_MUST_PLNTXT;
622
623 *flags |= signflags;
624}
625
598static ssize_t cifs_security_flags_proc_write(struct file *file, 626static ssize_t cifs_security_flags_proc_write(struct file *file,
599 const char __user *buffer, size_t count, loff_t *ppos) 627 const char __user *buffer, size_t count, loff_t *ppos)
600{ 628{
629 int rc;
601 unsigned int flags; 630 unsigned int flags;
602 char flags_string[12]; 631 char flags_string[12];
603 char c; 632 char c;
@@ -620,26 +649,35 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
620 global_secflags = CIFSSEC_MAX; 649 global_secflags = CIFSSEC_MAX;
621 return count; 650 return count;
622 } else if (!isdigit(c)) { 651 } else if (!isdigit(c)) {
623 cifs_dbg(VFS, "invalid flag %c\n", c); 652 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
653 flags_string);
624 return -EINVAL; 654 return -EINVAL;
625 } 655 }
626 } 656 }
627 /* else we have a number */
628 657
629 flags = simple_strtoul(flags_string, NULL, 0); 658 /* else we have a number */
659 rc = kstrtouint(flags_string, 0, &flags);
660 if (rc) {
661 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
662 flags_string);
663 return rc;
664 }
630 665
631 cifs_dbg(FYI, "sec flags 0x%x\n", flags); 666 cifs_dbg(FYI, "sec flags 0x%x\n", flags);
632 667
633 if (flags <= 0) { 668 if (flags == 0) {
634 cifs_dbg(VFS, "invalid security flags %s\n", flags_string); 669 cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", flags_string);
635 return -EINVAL; 670 return -EINVAL;
636 } 671 }
637 672
638 if (flags & ~CIFSSEC_MASK) { 673 if (flags & ~CIFSSEC_MASK) {
639 cifs_dbg(VFS, "attempt to set unsupported security flags 0x%x\n", 674 cifs_dbg(VFS, "Unsupported security flags: 0x%x\n",
640 flags & ~CIFSSEC_MASK); 675 flags & ~CIFSSEC_MASK);
641 return -EINVAL; 676 return -EINVAL;
642 } 677 }
678
679 cifs_security_flags_handle_must_flags(&flags);
680
643 /* flags look ok - update the global security flags for cifs module */ 681 /* flags look ok - update the global security flags for cifs module */
644 global_secflags = flags; 682 global_secflags = flags;
645 if (global_secflags & CIFSSEC_MUST_SIGN) { 683 if (global_secflags & CIFSSEC_MUST_SIGN) {
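
The masking in cifs_security_flags_handle_must_flags() relies on each CIFSSEC_MUST_* constant being the matching CIFSSEC_MAY_* bit plus a dedicated MUST bit, so (*flags & MUST_X) == MUST_X tests both bits at once and the plain assignment drops every other mechanism. A worked example; the hex values follow the cifsglob.h encoding of this era and are shown for illustration:

	/* assume CIFSSEC_MAY_SIGN == 0x00001, CIFSSEC_MUST_KRB5 == 0x08008 */
	unsigned int flags = 0x0800f;	/* SIGN|NTLM|NTLMV2 MAY + MUST_KRB5 */
	unsigned int signflags = flags & CIFSSEC_MUST_SIGN;	/* 0x00001 */

	/* MUST_KRB5 wins: one assignment clears the NTLM/NTLMV2 MAY bits */
	flags = CIFSSEC_MUST_KRB5;	/* 0x08008 */
	flags |= signflags;		/* 0x08009: signing choice survives */
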
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 4fb097468e21..fe8d6276410a 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -327,14 +327,14 @@ UniToupper(register wchar_t uc)
327/* 327/*
328 * UniStrupr: Upper case a unicode string 328 * UniStrupr: Upper case a unicode string
329 */ 329 */
330static inline wchar_t * 330static inline __le16 *
331UniStrupr(register wchar_t *upin) 331UniStrupr(register __le16 *upin)
332{ 332{
333 register wchar_t *up; 333 register __le16 *up;
334 334
335 up = upin; 335 up = upin;
336 while (*up) { /* For all characters */ 336 while (*up) { /* For all characters */
337 *up = UniToupper(*up); 337 *up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
338 up++; 338 up++;
339 } 339 }
340 return upin; /* Return input pointer */ 340 return upin; /* Return input pointer */
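
The strings reaching UniStrupr() are on-the-wire UTF-16LE, not host-order wchar_t: on a big-endian machine the old version would case-map byte-swapped code units and corrupt the uppercased key material fed into the NTLMv2 hash below. The safe per-character round trip is:

	__le16 *up = upin;

	while (*up) {
		/* wire order -> CPU order, case-map, back to wire order */
		*up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
		up++;
	}
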
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 71436d1fca13..fc6f4f3a1a9d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/cifsencrypt.c 2 * fs/cifs/cifsencrypt.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2005,2006 4 * Copyright (C) International Business Machines Corp., 2005,2013
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -31,6 +31,37 @@
31#include <linux/random.h> 31#include <linux/random.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33 33
34static int
35cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
36{
37 int rc;
38 unsigned int size;
39
40 if (server->secmech.sdescmd5 != NULL)
41 return 0; /* already allocated */
42
43 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
44 if (IS_ERR(server->secmech.md5)) {
45 cifs_dbg(VFS, "could not allocate crypto md5\n");
46 rc = PTR_ERR(server->secmech.md5);
47 server->secmech.md5 = NULL;
48 return rc;
49 }
50
51 size = sizeof(struct shash_desc) +
52 crypto_shash_descsize(server->secmech.md5);
53 server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL);
54 if (!server->secmech.sdescmd5) {
55 crypto_free_shash(server->secmech.md5);
56 server->secmech.md5 = NULL;
57 return -ENOMEM;
58 }
59 server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
60 server->secmech.sdescmd5->shash.flags = 0x0;
61
62 return 0;
63}
64
34/* 65/*
35 * Calculate and return the CIFS signature based on the mac key and SMB PDU. 66 * Calculate and return the CIFS signature based on the mac key and SMB PDU.
36 * The 16 byte signature must be allocated by the caller. Note we only use the 67 * The 16 byte signature must be allocated by the caller. Note we only use the
@@ -50,8 +81,11 @@ static int cifs_calc_signature(struct smb_rqst *rqst,
50 return -EINVAL; 81 return -EINVAL;
51 82
52 if (!server->secmech.sdescmd5) { 83 if (!server->secmech.sdescmd5) {
53 cifs_dbg(VFS, "%s: Can't generate signature\n", __func__); 84 rc = cifs_crypto_shash_md5_allocate(server);
54 return -1; 85 if (rc) {
86 cifs_dbg(VFS, "%s: Can't alloc md5 crypto\n", __func__);
87 return -1;
88 }
55 } 89 }
56 90
57 rc = crypto_shash_init(&server->secmech.sdescmd5->shash); 91 rc = crypto_shash_init(&server->secmech.sdescmd5->shash);
@@ -276,7 +310,6 @@ int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
276 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); 310 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
277 311
278 if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) { 312 if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
279 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
280 memcpy(lnm_session_key, password_with_pad, 313 memcpy(lnm_session_key, password_with_pad,
281 CIFS_ENCPWD_SIZE); 314 CIFS_ENCPWD_SIZE);
282 return 0; 315 return 0;
@@ -389,7 +422,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
389 if (blobptr + attrsize > blobend) 422 if (blobptr + attrsize > blobend)
390 break; 423 break;
391 if (type == NTLMSSP_AV_NB_DOMAIN_NAME) { 424 if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
392 if (!attrsize) 425 if (!attrsize || attrsize >= CIFS_MAX_DOMAINNAME_LEN)
393 break; 426 break;
394 if (!ses->domainName) { 427 if (!ses->domainName) {
395 ses->domainName = 428 ses->domainName =
@@ -414,7 +447,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
414 int rc = 0; 447 int rc = 0;
415 int len; 448 int len;
416 char nt_hash[CIFS_NTHASH_SIZE]; 449 char nt_hash[CIFS_NTHASH_SIZE];
417 wchar_t *user; 450 __le16 *user;
418 wchar_t *domain; 451 wchar_t *domain;
419 wchar_t *server; 452 wchar_t *server;
420 453
@@ -439,7 +472,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
439 return rc; 472 return rc;
440 } 473 }
441 474
442 /* convert ses->user_name to unicode and uppercase */ 475 /* convert ses->user_name to unicode */
443 len = ses->user_name ? strlen(ses->user_name) : 0; 476 len = ses->user_name ? strlen(ses->user_name) : 0;
444 user = kmalloc(2 + (len * 2), GFP_KERNEL); 477 user = kmalloc(2 + (len * 2), GFP_KERNEL);
445 if (user == NULL) { 478 if (user == NULL) {
@@ -448,7 +481,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
448 } 481 }
449 482
450 if (len) { 483 if (len) {
451 len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp); 484 len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
452 UniStrupr(user); 485 UniStrupr(user);
453 } else { 486 } else {
454 memset(user, '\0', 2); 487 memset(user, '\0', 2);
@@ -536,7 +569,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
536 return rc; 569 return rc;
537 } 570 }
538 571
539 if (ses->server->secType == RawNTLMSSP) 572 if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
540 memcpy(ses->auth_key.response + offset, 573 memcpy(ses->auth_key.response + offset,
541 ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); 574 ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
542 else 575 else
@@ -557,6 +590,36 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
557 return rc; 590 return rc;
558} 591}
559 592
593static int crypto_hmacmd5_alloc(struct TCP_Server_Info *server)
594{
595 int rc;
596 unsigned int size;
597
598 /* check if already allocated */
599 if (server->secmech.sdeschmacmd5)
600 return 0;
601
602 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
603 if (IS_ERR(server->secmech.hmacmd5)) {
604 cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
605 rc = PTR_ERR(server->secmech.hmacmd5);
606 server->secmech.hmacmd5 = NULL;
607 return rc;
608 }
609
610 size = sizeof(struct shash_desc) +
611 crypto_shash_descsize(server->secmech.hmacmd5);
612 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
613 if (!server->secmech.sdeschmacmd5) {
614 crypto_free_shash(server->secmech.hmacmd5);
615 server->secmech.hmacmd5 = NULL;
616 return -ENOMEM;
617 }
618 server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
619 server->secmech.sdeschmacmd5->shash.flags = 0x0;
620
621 return 0;
622}
560 623
561int 624int
562setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) 625setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
@@ -568,7 +631,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
568 char ntlmv2_hash[16]; 631 char ntlmv2_hash[16];
569 unsigned char *tiblob = NULL; /* target info blob */ 632 unsigned char *tiblob = NULL; /* target info blob */
570 633
571 if (ses->server->secType == RawNTLMSSP) { 634 if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) {
572 if (!ses->domainName) { 635 if (!ses->domainName) {
573 rc = find_domain_name(ses, nls_cp); 636 rc = find_domain_name(ses, nls_cp);
574 if (rc) { 637 if (rc) {
@@ -607,6 +670,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
607 670
608 memcpy(ses->auth_key.response + baselen, tiblob, tilen); 671 memcpy(ses->auth_key.response + baselen, tiblob, tilen);
609 672
673 rc = crypto_hmacmd5_alloc(ses->server);
674 if (rc) {
675 cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc);
676 goto setup_ntlmv2_rsp_ret;
677 }
678
610 /* calculate ntlmv2_hash */ 679 /* calculate ntlmv2_hash */
611 rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); 680 rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
612 if (rc) { 681 if (rc) {
@@ -706,94 +775,32 @@ calc_seckey(struct cifs_ses *ses)
706void 775void
707cifs_crypto_shash_release(struct TCP_Server_Info *server) 776cifs_crypto_shash_release(struct TCP_Server_Info *server)
708{ 777{
709 if (server->secmech.hmacsha256) 778 if (server->secmech.cmacaes) {
710 crypto_free_shash(server->secmech.hmacsha256); 779 crypto_free_shash(server->secmech.cmacaes);
711 780 server->secmech.cmacaes = NULL;
712 if (server->secmech.md5)
713 crypto_free_shash(server->secmech.md5);
714
715 if (server->secmech.hmacmd5)
716 crypto_free_shash(server->secmech.hmacmd5);
717
718 kfree(server->secmech.sdeschmacsha256);
719
720 kfree(server->secmech.sdeschmacmd5);
721
722 kfree(server->secmech.sdescmd5);
723}
724
725int
726cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
727{
728 int rc;
729 unsigned int size;
730
731 server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
732 if (IS_ERR(server->secmech.hmacmd5)) {
733 cifs_dbg(VFS, "could not allocate crypto hmacmd5\n");
734 return PTR_ERR(server->secmech.hmacmd5);
735 }
736
737 server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
738 if (IS_ERR(server->secmech.md5)) {
739 cifs_dbg(VFS, "could not allocate crypto md5\n");
740 rc = PTR_ERR(server->secmech.md5);
741 goto crypto_allocate_md5_fail;
742 } 781 }
743 782
744 server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0); 783 if (server->secmech.hmacsha256) {
745 if (IS_ERR(server->secmech.hmacsha256)) { 784 crypto_free_shash(server->secmech.hmacsha256);
746 cifs_dbg(VFS, "could not allocate crypto hmacsha256\n"); 785 server->secmech.hmacsha256 = NULL;
747 rc = PTR_ERR(server->secmech.hmacsha256);
748 goto crypto_allocate_hmacsha256_fail;
749 }
750
751 size = sizeof(struct shash_desc) +
752 crypto_shash_descsize(server->secmech.hmacmd5);
753 server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
754 if (!server->secmech.sdeschmacmd5) {
755 rc = -ENOMEM;
756 goto crypto_allocate_hmacmd5_sdesc_fail;
757 } 786 }
758 server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5;
759 server->secmech.sdeschmacmd5->shash.flags = 0x0;
760 787
761 size = sizeof(struct shash_desc) + 788 if (server->secmech.md5) {
762 crypto_shash_descsize(server->secmech.md5); 789 crypto_free_shash(server->secmech.md5);
763 server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); 790 server->secmech.md5 = NULL;
764 if (!server->secmech.sdescmd5) {
765 rc = -ENOMEM;
766 goto crypto_allocate_md5_sdesc_fail;
767 } 791 }
768 server->secmech.sdescmd5->shash.tfm = server->secmech.md5;
769 server->secmech.sdescmd5->shash.flags = 0x0;
770 792
771 size = sizeof(struct shash_desc) + 793 if (server->secmech.hmacmd5) {
772 crypto_shash_descsize(server->secmech.hmacsha256); 794 crypto_free_shash(server->secmech.hmacmd5);
773 server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL); 795 server->secmech.hmacmd5 = NULL;
774 if (!server->secmech.sdeschmacsha256) {
775 rc = -ENOMEM;
776 goto crypto_allocate_hmacsha256_sdesc_fail;
777 } 796 }
778 server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
779 server->secmech.sdeschmacsha256->shash.flags = 0x0;
780
781 return 0;
782
783crypto_allocate_hmacsha256_sdesc_fail:
784 kfree(server->secmech.sdescmd5);
785 797
786crypto_allocate_md5_sdesc_fail: 798 kfree(server->secmech.sdesccmacaes);
799 server->secmech.sdesccmacaes = NULL;
800 kfree(server->secmech.sdeschmacsha256);
801 server->secmech.sdeschmacsha256 = NULL;
787 kfree(server->secmech.sdeschmacmd5); 802 kfree(server->secmech.sdeschmacmd5);
788 803 server->secmech.sdeschmacmd5 = NULL;
789crypto_allocate_hmacmd5_sdesc_fail: 804 kfree(server->secmech.sdescmd5);
790 crypto_free_shash(server->secmech.hmacsha256); 805 server->secmech.sdescmd5 = NULL;
791
792crypto_allocate_hmacsha256_fail:
793 crypto_free_shash(server->secmech.md5);
794
795crypto_allocate_md5_fail:
796 crypto_free_shash(server->secmech.hmacmd5);
797
798 return rc;
799} 806}
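
The cifsencrypt.c hunks above replace the single up-front cifs_crypto_shash_allocate() with allocate-on-first-use helpers such as crypto_hmacmd5_alloc(), and make the release path NULL each pointer so it can run safely at any point. Below is a minimal sketch of that same pattern, not part of the patch: it reuses cifs's struct sdesc (a shash_desc plus an opaque context buffer) and the helper name is hypothetical.

	/*
	 * Illustrative sketch: allocate a shash tfm and its descriptor once,
	 * on first use. Callers may invoke this repeatedly; it is a no-op
	 * when the descriptor already exists.
	 */
	static int lazy_shash_alloc(const char *alg, struct crypto_shash **tfm,
				    struct sdesc **sdesc)
	{
		unsigned int size;
		int rc;

		if (*sdesc)		/* already allocated */
			return 0;

		*tfm = crypto_alloc_shash(alg, 0, 0);
		if (IS_ERR(*tfm)) {
			rc = PTR_ERR(*tfm);
			*tfm = NULL;
			return rc;
		}

		size = sizeof(struct shash_desc) + crypto_shash_descsize(*tfm);
		*sdesc = kmalloc(size, GFP_KERNEL);
		if (!*sdesc) {
			crypto_free_shash(*tfm);
			*tfm = NULL;
			return -ENOMEM;
		}
		(*sdesc)->shash.tfm = *tfm;
		(*sdesc)->shash.flags = 0;
		return 0;
	}

Allocating lazily means a mount that never does NTLMv2 authentication never pays for the HMAC-MD5 machinery, and because failure paths leave both pointers NULL, cifs_crypto_shash_release() stays idempotent.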
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 3752b9f6d9e4..85ea98d139fc 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -147,18 +147,17 @@ cifs_read_super(struct super_block *sb)
 		goto out_no_root;
 	}
 
+	if (cifs_sb_master_tcon(cifs_sb)->nocase)
+		sb->s_d_op = &cifs_ci_dentry_ops;
+	else
+		sb->s_d_op = &cifs_dentry_ops;
+
 	sb->s_root = d_make_root(inode);
 	if (!sb->s_root) {
 		rc = -ENOMEM;
 		goto out_no_root;
 	}
 
-	/* do that *after* d_make_root() - we want NULL ->d_op for root here */
-	if (cifs_sb_master_tcon(cifs_sb)->nocase)
-		sb->s_d_op = &cifs_ci_dentry_ops;
-	else
-		sb->s_d_op = &cifs_dentry_ops;
-
 #ifdef CONFIG_CIFS_NFSD_EXPORT
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
 		cifs_dbg(FYI, "export ops supported\n");
@@ -312,11 +311,14 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
 }
 
 static void
-cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
+cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
 {
+	if (ses->sectype == Unspecified)
+		return;
+
 	seq_printf(s, ",sec=");
 
-	switch (server->secType) {
+	switch (ses->sectype) {
 	case LANMAN:
 		seq_printf(s, "lanman");
 		break;
@@ -338,7 +340,7 @@ cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server)
 		break;
 	}
 
-	if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+	if (ses->sign)
 		seq_printf(s, "i");
 }
 
@@ -369,7 +371,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 	srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
 
 	seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
-	cifs_show_security(s, tcon->ses->server);
+	cifs_show_security(s, tcon->ses);
 	cifs_show_cache_flavor(s, cifs_sb);
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
@@ -765,7 +767,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
 
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 {
-	/* note that this is called by vfs setlease with lock_flocks held
+	/* note that this is called by vfs setlease with i_lock held
 	   to protect *lease from going away */
 	struct inode *inode = file_inode(file);
 	struct cifsFileInfo *cfile = file->private_data;
@@ -968,7 +970,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 };
 
 const struct file_operations cifs_dir_ops = {
-	.readdir = cifs_readdir,
+	.iterate = cifs_readdir,
 	.release = cifs_closedir,
 	.read = generic_read_dir,
 	.unlocked_ioctl = cifs_ioctl,
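
The cifs_read_super() hunk moves the sb->s_d_op assignment ahead of d_make_root(). Since the VFS copies sb->s_d_op into each dentry as it is allocated, the root dentry now picks up the same (possibly case-insensitive) dentry operations as the rest of the tree, where the old code deliberately left the root with a NULL ->d_op. A compressed sketch of the resulting shape, with hypothetical helper names:

	/*
	 * Illustrative only: default dentry ops must be chosen before the
	 * root dentry is created for them to apply to the root as well.
	 */
	static int example_fill_super(struct super_block *sb, bool nocase)
	{
		sb->s_d_op = nocase ? &example_ci_dentry_ops
				    : &example_dentry_ops;
		sb->s_root = d_make_root(example_get_root_inode(sb));
		return sb->s_root ? 0 : -ENOMEM;
	}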
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0e32c3446ce9..ea723a5e8226 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -101,7 +101,7 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
-extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
+extern int cifs_readdir(struct file *file, struct dir_context *ctx);
 
 /* Functions related to dir entries */
 extern const struct dentry_operations cifs_dentry_ops;
@@ -132,5 +132,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION "2.0"
+#define CIFS_VERSION "2.01"
 #endif /* _CIFSFS_H */
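
The cifs_readdir() prototype change above (together with the .readdir to .iterate switch in cifsfs.c) is part of the kernel-wide 3.11 conversion of directory reading: the position now lives in ctx->pos and entries are emitted through dir_emit() instead of an opaque filldir callback. A self-contained sketch of the new calling convention, not cifs code, with made-up names and inode numbers:

	#include <linux/fs.h>
	#include <linux/kernel.h>
	#include <linux/string.h>

	/* Illustrative ->iterate body. */
	static int example_iterate(struct file *file, struct dir_context *ctx)
	{
		static const char * const names[] = { "alpha", "beta" };

		if (!dir_emit_dots(file, ctx))	/* emits "." and ".." */
			return 0;

		for (; (size_t)(ctx->pos - 2) < ARRAY_SIZE(names); ctx->pos++) {
			const char *name = names[ctx->pos - 2];

			/* dir_emit() returns false when the buffer is full */
			if (!dir_emit(ctx, name, strlen(name),
				      100 + ctx->pos, DT_REG))
				return 0;
		}
		return 0;
	}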
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4f07f6fbe494..52ca861ed35e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -44,6 +44,7 @@
 #define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1)
 #define MAX_SERVER_SIZE 15
 #define MAX_SHARE_SIZE 80
+#define CIFS_MAX_DOMAINNAME_LEN 256 /* max domain name length */
 #define MAX_USERNAME_SIZE 256	/* reasonable maximum for current servers */
 #define MAX_PASSWORD_SIZE 512	/* max for windows seems to be 256 wide chars */
 
@@ -101,20 +102,14 @@ enum statusEnum {
 };
 
 enum securityEnum {
-	LANMAN = 0,		/* Legacy LANMAN auth */
+	Unspecified = 0,	/* not specified */
+	LANMAN,			/* Legacy LANMAN auth */
 	NTLM,			/* Legacy NTLM012 auth with NTLM hash */
 	NTLMv2,			/* Legacy NTLM auth with NTLMv2 hash */
 	RawNTLMSSP,		/* NTLMSSP without SPNEGO, NTLMv2 hash */
-/*	NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
 	Kerberos,		/* Kerberos via SPNEGO */
 };
 
-enum protocolEnum {
-	TCP = 0,
-	SCTP
-	/* Netbios frames protocol not supported at this time */
-};
-
 struct session_key {
 	unsigned int len;
 	char *response;
@@ -131,9 +126,11 @@ struct cifs_secmech {
 	struct crypto_shash *hmacmd5; /* hmac-md5 hash function */
 	struct crypto_shash *md5; /* md5 hash function */
 	struct crypto_shash *hmacsha256; /* hmac-sha256 hash function */
+	struct crypto_shash *cmacaes; /* block-cipher based MAC function */
 	struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */
 	struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
 	struct sdesc *sdeschmacsha256; /* ctxt to generate smb2 signature */
+	struct sdesc *sdesccmacaes; /* ctxt to generate smb3 signature */
 };
 
 /* per smb session structure/fields */
@@ -181,6 +178,7 @@ enum smb_version {
 	Smb_20,
 	Smb_21,
 	Smb_30,
+	Smb_302,
 };
 
 struct mid_q_entry;
@@ -197,6 +195,7 @@ struct cifs_writedata;
 struct cifs_io_parms;
 struct cifs_search_info;
 struct cifsInodeInfo;
+struct cifs_open_parms;
 
 struct smb_version_operations {
 	int (*send_cancel)(struct TCP_Server_Info *, void *,
@@ -228,6 +227,7 @@ struct smb_version_operations {
 	void (*dump_detail)(void *);
 	void (*clear_stats)(struct cifs_tcon *);
 	void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
+	void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
 	/* verify the message */
 	int (*check_message)(char *, unsigned int);
 	bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
@@ -309,9 +309,8 @@ struct smb_version_operations {
 		     const char *, const char *,
 		     struct cifs_sb_info *);
 	/* open a file for non-posix mounts */
-	int (*open)(const unsigned int, struct cifs_tcon *, const char *, int,
-		    int, int, struct cifs_fid *, __u32 *, FILE_ALL_INFO *,
-		    struct cifs_sb_info *);
+	int (*open)(const unsigned int, struct cifs_open_parms *,
+		    __u32 *, FILE_ALL_INFO *);
 	/* set fid protocol-specific info */
 	void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32);
 	/* close a file */
@@ -367,8 +366,13 @@ struct smb_version_operations {
 	void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
 	/* generate new lease key */
 	void (*new_lease_key)(struct cifs_fid *fid);
+	/* The next two functions will need to be changed to per smb session */
+	void (*generate_signingkey)(struct TCP_Server_Info *server);
 	int (*calc_signature)(struct smb_rqst *rqst,
 				   struct TCP_Server_Info *server);
+	int (*query_mf_symlink)(const unsigned char *path, char *pbuf,
+			unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
+			unsigned int xid);
 };
 
 struct smb_version_values {
@@ -387,6 +391,8 @@ struct smb_version_values {
 	unsigned int	cap_nt_find;
 	unsigned int	cap_large_files;
 	unsigned int	oplock_read;
+	__u16		signing_enabled;
+	__u16		signing_required;
 };
 
 #define HEADER_SIZE(server) (server->vals->header_size)
@@ -407,7 +413,8 @@ struct smb_vol {
 	kgid_t	backupgid;
 	umode_t	file_mode;
 	umode_t	dir_mode;
-	unsigned secFlg;
+	enum securityEnum sectype; /* sectype requested via mnt opts */
+	bool	sign; /* was signing requested via mnt opts? */
 	bool	retry:1;
 	bool	intr:1;
 	bool	setuids:1;
@@ -441,6 +448,7 @@ struct smb_vol {
 	bool	mfsymlinks:1; /* use Minshall+French Symlinks */
 	bool	multiuser:1;
 	bool	rwpidforward:1; /* pid forward for read/write operations */
+	bool	nosharesock;
 	unsigned int rsize;
 	unsigned int wsize;
 	bool	sockopt_tcp_nodelay:1;
@@ -514,6 +522,7 @@ struct TCP_Server_Info {
 	struct task_struct *tsk;
 	char server_GUID[16];
 	__u16 sec_mode;
+	bool sign; /* is signing enabled on this connection? */
 	bool session_estab; /* mark when very first sess is established */
 #ifdef CONFIG_CIFS_SMB2
 	int echo_credits; /* echo reserved slots */
@@ -521,7 +530,6 @@ struct TCP_Server_Info {
 	bool echoes:1; /* enable echoes */
 #endif
 	u16 dialect; /* dialect index that server chose */
-	enum securityEnum secType;
 	bool oplocks:1; /* enable oplocks */
 	unsigned int maxReq;	/* Clients should submit no more */
 	/* than maxReq distinct unanswered SMBs to the server when using */
@@ -540,12 +548,17 @@ struct TCP_Server_Info {
 	int timeAdj; /* Adjust for difference in server time zone in sec */
 	__u64 CurrentMid; /* multiplex id - rotating counter */
 	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
+	char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
 	__u32 sequence_number; /* for signing, protected by srv_mutex */
 	struct session_key session_key;
 	unsigned long lstrp; /* when we got last response from this server */
 	struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
+#define	CIFS_NEGFLAVOR_LANMAN	0	/* wct == 13, LANMAN */
+#define	CIFS_NEGFLAVOR_UNENCAP	1	/* wct == 17, but no ext_sec */
+#define	CIFS_NEGFLAVOR_EXTENDED	2	/* wct == 17, ext_sec bit set */
+	char	negflavor;	/* NEGOTIATE response flavor */
 	/* extended security flavors that server supports */
 	bool	sec_ntlmssp;		/* supports NTLMSSP */
 	bool	sec_kerberosu2u;	/* supports U2U Kerberos */
@@ -697,7 +710,6 @@ struct cifs_ses {
 	enum statusEnum status;
 	unsigned overrideSecFlg; /* if non-zero override global sec flags */
 	__u16 ipc_tid;		/* special tid for connection to IPC share */
-	__u16 flags;
 	__u16 vcnum;
 	char *serverOS;		/* name of operating system underlying server */
 	char *serverNOS;	/* name of network operating system of server */
@@ -714,21 +726,14 @@ struct cifs_ses {
 	char *password;
 	struct session_key auth_key;
 	struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
+	enum securityEnum sectype; /* what security flavor was specified? */
+	bool sign;		/* is signing required? */
 	bool need_reconnect:1; /* connection reset, uid now invalid */
 #ifdef CONFIG_CIFS_SMB2
 	__u16 session_flags;
#endif /* CONFIG_CIFS_SMB2 */
 };
 
-/* no more than one of the following three session flags may be set */
-#define CIFS_SES_NT4 1
-#define CIFS_SES_OS2 2
-#define CIFS_SES_W9X 4
-/* following flag is set for old servers such as OS2 (and Win95?)
-   which do not negotiate NTLM or POSIX dialects, but instead
-   negotiate one of the older LANMAN dialects */
-#define CIFS_SES_LANMAN 8
-
 static inline bool
 cap_unix(struct cifs_ses *ses)
 {
@@ -816,7 +821,7 @@ struct cifs_tcon {
 #ifdef CONFIG_CIFS_SMB2
 	bool print:1;		/* set if connection to printer share */
 	bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
-	__u32 capabilities;
+	__le32 capabilities;
 	__u32 share_flags;
 	__u32 maximal_access;
 	__u32 vol_serial_number;
@@ -911,6 +916,17 @@ struct cifs_search_info {
 	bool smallBuf:1; /* so we know which buf_release function to call */
 };
 
+struct cifs_open_parms {
+	struct cifs_tcon *tcon;
+	struct cifs_sb_info *cifs_sb;
+	int disposition;
+	int desired_access;
+	int create_options;
+	const char *path;
+	struct cifs_fid *fid;
+	bool reconnect:1;
+};
+
 struct cifs_fid {
 	__u16 netfid;
 #ifdef CONFIG_CIFS_SMB2
@@ -1348,7 +1364,7 @@ require use of the stronger protocol */
 #define CIFSSEC_MUST_SEAL	0x40040 /* not supported yet */
 #define CIFSSEC_MUST_NTLMSSP	0x80080 /* raw ntlmssp with ntlmv2 */
 
-#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP)
+#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
 #define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
 #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
 /*
@@ -1494,4 +1510,7 @@ extern struct smb_version_values smb21_values;
 #define SMB30_VERSION_STRING	"3.0"
 extern struct smb_version_operations smb30_operations;
 extern struct smb_version_values smb30_values;
+#define SMB302_VERSION_STRING	"3.02"
+/*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */
+extern struct smb_version_values smb302_values;
 #endif /* _CIFS_GLOB_H */
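
One detail of the CIFSSEC_* scheme that this patch keeps relying on: each MUST flag is defined as a superset of the corresponding MAY flag (CIFSSEC_MUST_NTLMSSP, 0x80080 above, carries the MAY bit 0x00080), which is why "must" is tested with a masked compare rather than a plain AND. A standalone illustration with hypothetical flag values following the same convention:

	/* Illustrative values mirroring the CIFSSEC MAY/MUST pairing. */
	#define EX_MAY_SIGN	0x00001
	#define EX_MUST_SIGN	0x01001	/* includes the MAY bit */

	static int ex_may_sign(unsigned int flags)
	{
		return (flags & EX_MAY_SIGN) != 0;	/* true for MUST too */
	}

	static int ex_must_sign(unsigned int flags)
	{
		return (flags & EX_MUST_SIGN) == EX_MUST_SIGN;
	}

This is the same shape of test that cifs_enable_signing() in the cifssmb.c hunks below applies to global_secflags.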
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index e996ff6b26d1..11ca24a8e054 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -142,6 +142,11 @@
  */
 #define CIFS_SESS_KEY_SIZE (16)
 
+/*
+ * Size of the smb3 signing key
+ */
+#define SMB3_SIGN_KEY_SIZE (16)
+
 #define CIFS_CLIENT_CHALLENGE_SIZE (8)
 #define CIFS_SERVER_CHALLENGE_SIZE (8)
 #define CIFS_HMAC_MD5_HASH_SIZE (16)
@@ -531,7 +536,7 @@ typedef struct lanman_neg_rsp {
 #define READ_RAW_ENABLE 1
 #define WRITE_RAW_ENABLE 2
 #define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE)
-
+#define SMB1_CLIENT_GUID_SIZE (16)
 typedef struct negotiate_rsp {
 	struct smb_hdr hdr;	/* wct = 17 */
 	__le16 DialectIndex; /* 0xFFFF = no dialect acceptable */
@@ -553,7 +558,7 @@ typedef struct negotiate_rsp {
 		/* followed by 16 bytes of server GUID */
 		/* then security blob if cap_extended_security negotiated */
 		struct {
-			unsigned char GUID[16];
+			unsigned char GUID[SMB1_CLIENT_GUID_SIZE];
 			unsigned char SecurityBlob[1];
 		} __attribute__((packed)) extended_response;
 	} __attribute__((packed)) u;
@@ -1315,6 +1320,14 @@ typedef struct smb_com_ntransact_rsp {
 	/* parms and data follow */
 } __attribute__((packed)) NTRANSACT_RSP;
 
+/* See MS-SMB 2.2.7.2.1.1 */
+struct srv_copychunk {
+	__le64 SourceOffset;
+	__le64 DestinationOffset;
+	__le32 CopyLength;
+	__u32 Reserved;
+} __packed;
+
 typedef struct smb_com_transaction_ioctl_req {
 	struct smb_hdr hdr;	/* wct = 23 */
 	__u8 MaxSetupCount;
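
struct srv_copychunk added above is an on-the-wire structure: the __le64/__le32 fields are little-endian regardless of host byte order, and __packed keeps the compiler from inserting padding, so the struct occupies exactly 24 bytes. Fields are therefore filled through the endian helpers; a hypothetical marshalling helper, assuming the usual kernel headers:

	/* Illustrative: filling a copychunk entry for the wire. */
	static void fill_copychunk(struct srv_copychunk *chunk,
				   u64 src_off, u64 dst_off, u32 len)
	{
		chunk->SourceOffset = cpu_to_le64(src_off);
		chunk->DestinationOffset = cpu_to_le64(dst_off);
		chunk->CopyLength = cpu_to_le32(len);
		chunk->Reserved = 0;
	}

On a little-endian host the conversions compile away; on big-endian hosts they byte-swap, and sparse flags any direct mixing of __le types with native integers.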
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index dda188a94332..b29a012bed33 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -118,6 +118,8 @@ extern void header_assemble(struct smb_hdr *, char /* command */ ,
 extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifs_ses *ses,
 				void **request_buf);
+extern enum securityEnum select_sectype(struct TCP_Server_Info *server,
+				enum securityEnum requested);
 extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 			  const struct nls_table *nls_cp);
 extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
@@ -212,6 +214,7 @@ extern int cifs_negotiate_protocol(const unsigned int xid,
 				   struct cifs_ses *ses);
 extern int cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
 			      struct nls_table *nls_info);
+extern int cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required);
 extern int CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses);
 
 extern int CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
@@ -430,9 +433,9 @@ extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
 			const struct nls_table *);
 extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
 extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
-extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
 extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
 extern int calc_seckey(struct cifs_ses *);
+extern void generate_smb3signingkey(struct TCP_Server_Info *);
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern int calc_lanman_hash(const char *password, const char *cryptkey,
@@ -494,5 +497,7 @@ void cifs_writev_complete(struct work_struct *work);
 struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
 						work_func_t complete);
 void cifs_writedata_release(struct kref *refcount);
-
+int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
+			unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
+			unsigned int xid);
 #endif /* _CIFSPROTO_H */
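
select_sectype(), newly exported above, encapsulates the mapping from the NEGOTIATE flavor to an authentication type: an explicit request comes back unchanged only if the connection can satisfy it, otherwise Unspecified signals a mismatch (the match_security() hunk in connect.c below relies on exactly that contract). The following is a simplified sketch of the idea, not the real sess.c implementation; the defaults and preference order are assumptions:

	/* Illustrative approximation of the select_sectype() contract. */
	static enum securityEnum
	example_select_sectype(struct TCP_Server_Info *server,
			       enum securityEnum requested)
	{
		switch (server->negflavor) {
		case CIFS_NEGFLAVOR_EXTENDED:
			switch (requested) {
			case Kerberos:
			case RawNTLMSSP:
				return requested;
			case Unspecified:
				if (server->sec_kerberos ||
				    server->sec_mskerberos)
					return Kerberos; /* assumed preference */
				if (server->sec_ntlmssp)
					return RawNTLMSSP;
				/* Fallthrough */
			default:
				return Unspecified;
			}
		case CIFS_NEGFLAVOR_UNENCAP:
			switch (requested) {
			case NTLM:
			case NTLMv2:
				return requested;
			case Unspecified:
				return NTLMv2;	/* assumed default */
			default:
				return Unspecified;
			}
		default:	/* CIFS_NEGFLAVOR_LANMAN */
			return requested == LANMAN ? LANMAN : Unspecified;
		}
	}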
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a58dc77cc443..a89c4cb4e6cf 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -367,6 +367,185 @@ vt2_err:
 	return -EINVAL;
 }
 
+static int
+decode_ext_sec_blob(struct cifs_ses *ses, NEGOTIATE_RSP *pSMBr)
+{
+	int	rc = 0;
+	u16	count;
+	char	*guid = pSMBr->u.extended_response.GUID;
+	struct TCP_Server_Info *server = ses->server;
+
+	count = get_bcc(&pSMBr->hdr);
+	if (count < SMB1_CLIENT_GUID_SIZE)
+		return -EIO;
+
+	spin_lock(&cifs_tcp_ses_lock);
+	if (server->srv_count > 1) {
+		spin_unlock(&cifs_tcp_ses_lock);
+		if (memcmp(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE) != 0) {
+			cifs_dbg(FYI, "server UID changed\n");
+			memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+		}
+	} else {
+		spin_unlock(&cifs_tcp_ses_lock);
+		memcpy(server->server_GUID, guid, SMB1_CLIENT_GUID_SIZE);
+	}
+
+	if (count == SMB1_CLIENT_GUID_SIZE) {
+		server->sec_ntlmssp = true;
+	} else {
+		count -= SMB1_CLIENT_GUID_SIZE;
+		rc = decode_negTokenInit(
+			pSMBr->u.extended_response.SecurityBlob, count, server);
+		if (rc != 1)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+int
+cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
+{
+	bool srv_sign_required = server->sec_mode & server->vals->signing_required;
+	bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled;
+	bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN;
+
+	/*
+	 * Is signing required by mnt options? If not then check
+	 * global_secflags to see if it is there.
+	 */
+	if (!mnt_sign_required)
+		mnt_sign_required = ((global_secflags & CIFSSEC_MUST_SIGN) ==
+						CIFSSEC_MUST_SIGN);
+
+	/*
+	 * If signing is required then it's automatically enabled too,
+	 * otherwise, check to see if the secflags allow it.
+	 */
+	mnt_sign_enabled = mnt_sign_required ? mnt_sign_required :
+				(global_secflags & CIFSSEC_MAY_SIGN);
+
+	/* If server requires signing, does client allow it? */
+	if (srv_sign_required) {
+		if (!mnt_sign_enabled) {
+			cifs_dbg(VFS, "Server requires signing, but it's disabled in SecurityFlags!");
+			return -ENOTSUPP;
+		}
+		server->sign = true;
+	}
+
+	/* If client requires signing, does server allow it? */
+	if (mnt_sign_required) {
+		if (!srv_sign_enabled) {
+			cifs_dbg(VFS, "Server does not support signing!");
+			return -ENOTSUPP;
+		}
+		server->sign = true;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+static int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+	__s16 tmp;
+	struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
+
+	if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT)
+		return -EOPNOTSUPP;
+
+	server->sec_mode = le16_to_cpu(rsp->SecurityMode);
+	server->maxReq = min_t(unsigned int,
+			       le16_to_cpu(rsp->MaxMpxCount),
+			       cifs_max_pending);
+	set_credits(server, server->maxReq);
+	server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
+	server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
+	/* even though we do not use raw we might as well set this
+	accurately, in case we ever find a need for it */
+	if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
+		server->max_rw = 0xFF00;
+		server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
+	} else {
+		server->max_rw = 0;/* do not need to use raw anyway */
+		server->capabilities = CAP_MPX_MODE;
+	}
+	tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
+	if (tmp == -1) {
+		/* OS/2 often does not set timezone therefore
+		 * we must use server time to calc time zone.
+		 * Could deviate slightly from the right zone.
+		 * Smallest defined timezone difference is 15 minutes
+		 * (i.e. Nepal).  Rounding up/down is done to match
+		 * this requirement.
+		 */
+		int val, seconds, remain, result;
+		struct timespec ts, utc;
+		utc = CURRENT_TIME;
+		ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
+				    rsp->SrvTime.Time, 0);
+		cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
+			 (int)ts.tv_sec, (int)utc.tv_sec,
+			 (int)(utc.tv_sec - ts.tv_sec));
+		val = (int)(utc.tv_sec - ts.tv_sec);
+		seconds = abs(val);
+		result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
+		remain = seconds % MIN_TZ_ADJ;
+		if (remain >= (MIN_TZ_ADJ / 2))
+			result += MIN_TZ_ADJ;
+		if (val < 0)
+			result = -result;
+		server->timeAdj = result;
+	} else {
+		server->timeAdj = (int)tmp;
+		server->timeAdj *= 60; /* also in seconds */
+	}
+	cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
+
+
+	/* BB get server time for time conversions and add
+	code to use it and timezone since this is not UTC */
+
+	if (rsp->EncryptionKeyLength ==
+			cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
+		memcpy(server->cryptkey, rsp->EncryptionKey,
+		       CIFS_CRYPTO_KEY_SIZE);
+	} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
+		return -EIO; /* need cryptkey unless plain text */
+	}
+
+	cifs_dbg(FYI, "LANMAN negotiated\n");
+	return 0;
+}
+#else
+static inline int
+decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
+{
+	cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
+	return -EOPNOTSUPP;
+}
+#endif
+
+static bool
+should_set_ext_sec_flag(enum securityEnum sectype)
+{
+	switch (sectype) {
+	case RawNTLMSSP:
+	case Kerberos:
+		return true;
+	case Unspecified:
+		if (global_secflags &
+		    (CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP))
+			return true;
+		/* Fallthrough */
+	default:
+		return false;
+	}
+}
+
 int
 CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 {
@@ -375,41 +554,24 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 	int rc = 0;
 	int bytes_returned;
 	int i;
-	struct TCP_Server_Info *server;
+	struct TCP_Server_Info *server = ses->server;
 	u16 count;
-	unsigned int secFlags;
 
-	if (ses->server)
-		server = ses->server;
-	else {
-		rc = -EIO;
-		return rc;
+	if (!server) {
+		WARN(1, "%s: server is NULL!\n", __func__);
+		return -EIO;
 	}
+
 	rc = smb_init(SMB_COM_NEGOTIATE, 0, NULL /* no tcon yet */ ,
 		      (void **) &pSMB, (void **) &pSMBr);
 	if (rc)
 		return rc;
 
-	/* if any of auth flags (ie not sign or seal) are overriden use them */
-	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
-		secFlags = ses->overrideSecFlg;  /* BB FIXME fix sign flags? */
-	else /* if override flags set only sign/seal OR them with global auth */
-		secFlags = global_secflags | ses->overrideSecFlg;
-
-	cifs_dbg(FYI, "secFlags 0x%x\n", secFlags);
-
 	pSMB->hdr.Mid = get_next_mid(server);
 	pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
 
-	if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
-		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
-		cifs_dbg(FYI, "Kerberos only mechanism, enable extended security\n");
-		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	} else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
-		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
-		cifs_dbg(FYI, "NTLMSSP only mechanism, enable extended security\n");
+	if (should_set_ext_sec_flag(ses->sectype)) {
+		cifs_dbg(FYI, "Requesting extended security.");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	}
 
@@ -436,127 +598,21 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 		could not negotiate a common dialect */
 		rc = -EOPNOTSUPP;
 		goto neg_err_exit;
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-	} else if ((pSMBr->hdr.WordCount == 13)
-			&& ((server->dialect == LANMAN_PROT)
-			    || (server->dialect == LANMAN2_PROT))) {
-		__s16 tmp;
-		struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
-
-		if ((secFlags & CIFSSEC_MAY_LANMAN) ||
-			(secFlags & CIFSSEC_MAY_PLNTXT))
-			server->secType = LANMAN;
-		else {
-			cifs_dbg(VFS, "mount failed weak security disabled in /proc/fs/cifs/SecurityFlags\n");
-			rc = -EOPNOTSUPP;
-			goto neg_err_exit;
-		}
-		server->sec_mode = le16_to_cpu(rsp->SecurityMode);
-		server->maxReq = min_t(unsigned int,
-				       le16_to_cpu(rsp->MaxMpxCount),
-				       cifs_max_pending);
-		set_credits(server, server->maxReq);
-		server->maxBuf = le16_to_cpu(rsp->MaxBufSize);
-		server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
-		/* even though we do not use raw we might as well set this
-		accurately, in case we ever find a need for it */
-		if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
-			server->max_rw = 0xFF00;
-			server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
-		} else {
-			server->max_rw = 0;/* do not need to use raw anyway */
-			server->capabilities = CAP_MPX_MODE;
-		}
-		tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
-		if (tmp == -1) {
-			/* OS/2 often does not set timezone therefore
-			 * we must use server time to calc time zone.
-			 * Could deviate slightly from the right zone.
-			 * Smallest defined timezone difference is 15 minutes
-			 * (i.e. Nepal).  Rounding up/down is done to match
-			 * this requirement.
-			 */
-			int val, seconds, remain, result;
-			struct timespec ts, utc;
-			utc = CURRENT_TIME;
-			ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
-					    rsp->SrvTime.Time, 0);
-			cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n",
-				 (int)ts.tv_sec, (int)utc.tv_sec,
-				 (int)(utc.tv_sec - ts.tv_sec));
-			val = (int)(utc.tv_sec - ts.tv_sec);
-			seconds = abs(val);
-			result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
-			remain = seconds % MIN_TZ_ADJ;
-			if (remain >= (MIN_TZ_ADJ / 2))
-				result += MIN_TZ_ADJ;
-			if (val < 0)
-				result = -result;
-			server->timeAdj = result;
-		} else {
-			server->timeAdj = (int)tmp;
-			server->timeAdj *= 60; /* also in seconds */
-		}
-		cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj);
-
-
-		/* BB get server time for time conversions and add
-		code to use it and timezone since this is not UTC */
-
-		if (rsp->EncryptionKeyLength ==
-				cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
-			memcpy(ses->server->cryptkey, rsp->EncryptionKey,
-			       CIFS_CRYPTO_KEY_SIZE);
-		} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
-			rc = -EIO; /* need cryptkey unless plain text */
-			goto neg_err_exit;
-		}
-
-		cifs_dbg(FYI, "LANMAN negotiated\n");
-		/* we will not end up setting signing flags - as no signing
-		was in LANMAN and server did not return the flags on */
-		goto signing_check;
-#else /* weak security disabled */
 	} else if (pSMBr->hdr.WordCount == 13) {
-		cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n");
-		rc = -EOPNOTSUPP;
-#endif /* WEAK_PW_HASH */
-		goto neg_err_exit;
+		server->negflavor = CIFS_NEGFLAVOR_LANMAN;
+		rc = decode_lanman_negprot_rsp(server, pSMBr);
+		goto signing_check;
 	} else if (pSMBr->hdr.WordCount != 17) {
 		/* unknown wct */
 		rc = -EOPNOTSUPP;
 		goto neg_err_exit;
 	}
-	/* else wct == 17 NTLM */
+	/* else wct == 17, NTLM or better */
+
 	server->sec_mode = pSMBr->SecurityMode;
 	if ((server->sec_mode & SECMODE_USER) == 0)
 		cifs_dbg(FYI, "share mode security\n");
 
-	if ((server->sec_mode & SECMODE_PW_ENCRYPT) == 0)
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-		if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
-#endif /* CIFS_WEAK_PW_HASH */
-			cifs_dbg(VFS, "Server requests plain text password but client support disabled\n");
-
-	if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
-		server->secType = NTLMv2;
-	else if (secFlags & CIFSSEC_MAY_NTLM)
-		server->secType = NTLM;
-	else if (secFlags & CIFSSEC_MAY_NTLMV2)
-		server->secType = NTLMv2;
-	else if (secFlags & CIFSSEC_MAY_KRB5)
-		server->secType = Kerberos;
-	else if (secFlags & CIFSSEC_MAY_NTLMSSP)
-		server->secType = RawNTLMSSP;
-	else if (secFlags & CIFSSEC_MAY_LANMAN)
-		server->secType = LANMAN;
-	else {
-		rc = -EOPNOTSUPP;
-		cifs_dbg(VFS, "Invalid security type\n");
-		goto neg_err_exit;
-	}
-	/* else ... any others ...? */
-
 	/* one byte, so no need to convert this or EncryptionKeyLen from
 	   little endian */
 	server->maxReq = min_t(unsigned int, le16_to_cpu(pSMBr->MaxMpxCount),
@@ -569,90 +625,26 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
 	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
 	server->timeAdj *= 60;
+
 	if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
+		server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
 		memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey,
 		       CIFS_CRYPTO_KEY_SIZE);
 	} else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC ||
 			server->capabilities & CAP_EXTENDED_SECURITY) &&
 				(pSMBr->EncryptionKeyLength == 0)) {
-		/* decode security blob */
-		count = get_bcc(&pSMBr->hdr);
-		if (count < 16) {
-			rc = -EIO;
-			goto neg_err_exit;
-		}
-		spin_lock(&cifs_tcp_ses_lock);
-		if (server->srv_count > 1) {
-			spin_unlock(&cifs_tcp_ses_lock);
-			if (memcmp(server->server_GUID,
-				   pSMBr->u.extended_response.
-				   GUID, 16) != 0) {
-				cifs_dbg(FYI, "server UID changed\n");
-				memcpy(server->server_GUID,
-				       pSMBr->u.extended_response.GUID,
-				       16);
-			}
-		} else {
-			spin_unlock(&cifs_tcp_ses_lock);
-			memcpy(server->server_GUID,
-			       pSMBr->u.extended_response.GUID, 16);
-		}
-
-		if (count == 16) {
-			server->secType = RawNTLMSSP;
-		} else {
-			rc = decode_negTokenInit(pSMBr->u.extended_response.
-						 SecurityBlob, count - 16,
-						 server);
-			if (rc == 1)
-				rc = 0;
-			else
-				rc = -EINVAL;
-			if (server->secType == Kerberos) {
-				if (!server->sec_kerberos &&
-				    !server->sec_mskerberos)
-					rc = -EOPNOTSUPP;
-			} else if (server->secType == RawNTLMSSP) {
-				if (!server->sec_ntlmssp)
-					rc = -EOPNOTSUPP;
-			} else
-				rc = -EOPNOTSUPP;
-		}
+		server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
+		rc = decode_ext_sec_blob(ses, pSMBr);
 	} else if (server->sec_mode & SECMODE_PW_ENCRYPT) {
 		rc = -EIO; /* no crypt key only if plain text pwd */
-		goto neg_err_exit;
-	} else
-		server->capabilities &= ~CAP_EXTENDED_SECURITY;
-
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
-signing_check:
-#endif
-	if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
-		/* MUST_SIGN already includes the MAY_SIGN FLAG
-		   so if this is zero it means that signing is disabled */
-		cifs_dbg(FYI, "Signing disabled\n");
-		if (server->sec_mode & SECMODE_SIGN_REQUIRED) {
-			cifs_dbg(VFS, "Server requires packet signing to be enabled in /proc/fs/cifs/SecurityFlags\n");
-			rc = -EOPNOTSUPP;
-		}
-		server->sec_mode &=
-			~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
-	} else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
-		/* signing required */
-		cifs_dbg(FYI, "Must sign - secFlags 0x%x\n", secFlags);
-		if ((server->sec_mode &
-		     (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
-			cifs_dbg(VFS, "signing required but server lacks support\n");
-			rc = -EOPNOTSUPP;
-		} else
-			server->sec_mode |= SECMODE_SIGN_REQUIRED;
 	} else {
-		/* signing optional ie CIFSSEC_MAY_SIGN */
-		if ((server->sec_mode & SECMODE_SIGN_REQUIRED) == 0)
-			server->sec_mode &=
-				~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
+		server->negflavor = CIFS_NEGFLAVOR_UNENCAP;
+		server->capabilities &= ~CAP_EXTENDED_SECURITY;
 	}
 
+signing_check:
+	if (!rc)
+		rc = cifs_enable_signing(server, ses->sign);
 neg_err_exit:
 	cifs_buf_release(pSMB);
 
@@ -777,9 +769,8 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses)
 
 	pSMB->hdr.Mid = get_next_mid(ses->server);
 
-	if (ses->server->sec_mode &
-		   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-		pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+	if (ses->server->sign)
+		pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 
 	pSMB->hdr.Uid = ses->Suid;
 
@@ -1540,8 +1531,7 @@ cifs_readv_callback(struct mid_q_entry *mid)
 	switch (mid->mid_state) {
 	case MID_RESPONSE_RECEIVED:
 		/* result already set, check signature */
-		if (server->sec_mode &
-		    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+		if (server->sign) {
 			int rc = 0;
 
 			rc = cifs_verify_signature(&rqst, server,
@@ -3940,6 +3930,7 @@ QFileInfoRetry:
 	pSMB->Pad = 0;
 	pSMB->Fid = netfid;
 	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->t2.ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -4108,6 +4099,7 @@ UnixQFileInfoRetry:
 	pSMB->Pad = 0;
 	pSMB->Fid = netfid;
 	inc_rfc1001_len(pSMB, byte_count);
+	pSMB->t2.ByteCount = cpu_to_le16(byte_count);
 
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -4794,11 +4786,8 @@ getDFSRetry:
 		strncpy(pSMB->RequestFileName, search_name, name_len);
 	}
 
-	if (ses->server) {
-		if (ses->server->sec_mode &
-		   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-			pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-	}
+	if (ses->server && ses->server->sign)
+		pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 
 	pSMB->hdr.Uid = ses->Suid;
 
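
The net effect of cifs_enable_signing() above is easiest to see as a table, where the client column folds together the mount option and global_secflags:

	client        server         outcome
	------        ------         -------
	disabled      required       -ENOTSUPP (mount fails)
	required      not enabled    -ENOTSUPP (mount fails)
	required      enabled        server->sign = true
	allowed       required       server->sign = true
	allowed       optional       signing left off

Every later per-packet decision in this patch then keys off the single server->sign boolean (see the CIFSSMBLogoff(), cifs_readv_callback() and getDFSRetry hunks above) instead of re-deriving it from the sec_mode flag pair each time.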
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e3bc39bb9d12..d67c550c4980 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -85,7 +85,7 @@ enum {
 	Opt_acl, Opt_noacl, Opt_locallease,
 	Opt_sign, Opt_seal, Opt_noac,
 	Opt_fsc, Opt_mfsymlinks,
-	Opt_multiuser, Opt_sloppy,
+	Opt_multiuser, Opt_sloppy, Opt_nosharesock,
 
 	/* Mount options which take numeric value */
 	Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -165,6 +165,7 @@ static const match_table_t cifs_mount_option_tokens = {
 	{ Opt_mfsymlinks, "mfsymlinks" },
 	{ Opt_multiuser, "multiuser" },
 	{ Opt_sloppy, "sloppy" },
+	{ Opt_nosharesock, "nosharesock" },
 
 	{ Opt_backupuid, "backupuid=%s" },
 	{ Opt_backupgid, "backupgid=%s" },
@@ -275,6 +276,7 @@ static const match_table_t cifs_smb_version_tokens = {
 	{ Smb_20, SMB20_VERSION_STRING},
 	{ Smb_21, SMB21_VERSION_STRING },
 	{ Smb_30, SMB30_VERSION_STRING },
+	{ Smb_302, SMB302_VERSION_STRING },
 };
 
 static int ip_connect(struct TCP_Server_Info *server);
@@ -1024,44 +1026,48 @@ static int cifs_parse_security_flavors(char *value,
 
 	substring_t args[MAX_OPT_ARGS];
 
+	/*
+	 * With mount options, the last one should win. Reset any existing
+	 * settings back to default.
+	 */
+	vol->sectype = Unspecified;
+	vol->sign = false;
+
 	switch (match_token(value, cifs_secflavor_tokens, args)) {
-	case Opt_sec_krb5:
-		vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_SIGN;
-		break;
-	case Opt_sec_krb5i:
-		vol->secFlg |= CIFSSEC_MAY_KRB5 | CIFSSEC_MUST_SIGN;
-		break;
 	case Opt_sec_krb5p:
-		/* vol->secFlg |= CIFSSEC_MUST_SEAL | CIFSSEC_MAY_KRB5; */
-		cifs_dbg(VFS, "Krb5 cifs privacy not supported\n");
-		break;
-	case Opt_sec_ntlmssp:
-		vol->secFlg |= CIFSSEC_MAY_NTLMSSP;
+		cifs_dbg(VFS, "sec=krb5p is not supported!\n");
+		return 1;
+	case Opt_sec_krb5i:
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_sec_krb5:
+		vol->sectype = Kerberos;
 		break;
 	case Opt_sec_ntlmsspi:
-		vol->secFlg |= CIFSSEC_MAY_NTLMSSP | CIFSSEC_MUST_SIGN;
-		break;
-	case Opt_ntlm:
-		/* ntlm is default so can be turned off too */
-		vol->secFlg |= CIFSSEC_MAY_NTLM;
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_sec_ntlmssp:
+		vol->sectype = RawNTLMSSP;
 		break;
 	case Opt_sec_ntlmi:
-		vol->secFlg |= CIFSSEC_MAY_NTLM | CIFSSEC_MUST_SIGN;
-		break;
-	case Opt_sec_ntlmv2:
-		vol->secFlg |= CIFSSEC_MAY_NTLMV2;
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_ntlm:
+		vol->sectype = NTLM;
 		break;
 	case Opt_sec_ntlmv2i:
-		vol->secFlg |= CIFSSEC_MAY_NTLMV2 | CIFSSEC_MUST_SIGN;
+		vol->sign = true;
+		/* Fallthrough */
+	case Opt_sec_ntlmv2:
+		vol->sectype = NTLMv2;
 		break;
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	case Opt_sec_lanman:
-		vol->secFlg |= CIFSSEC_MAY_LANMAN;
+		vol->sectype = LANMAN;
 		break;
 #endif
 	case Opt_sec_none:
 		vol->nullauth = 1;
-		vol->secFlg |= CIFSSEC_MAY_NTLM;
 		break;
 	default:
 		cifs_dbg(VFS, "bad security option: %s\n", value);
@@ -1119,6 +1125,10 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
 		vol->ops = &smb30_operations;
 		vol->vals = &smb30_values;
 		break;
+	case Smb_302:
+		vol->ops = &smb30_operations; /* currently identical with 3.0 */
+		vol->vals = &smb302_values;
+		break;
 #endif
 	default:
 		cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value);
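
With the Smb_302 token wired into both tables, the new dialect is requested like any other vers= value at mount time, e.g. (server, share and user hypothetical):

	mount -t cifs //srv/share /mnt -o vers=3.02,username=user

As the comment in the hunk notes, vers=3.02 currently reuses smb30_operations wholesale and only substitutes smb302_values, so it differs from vers=3.0 only in what that values table changes, such as the negotiated dialect identifier and version string.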
@@ -1424,7 +1434,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			vol->local_lease = 1;
 			break;
 		case Opt_sign:
-			vol->secFlg |= CIFSSEC_MUST_SIGN;
+			vol->sign = true;
 			break;
 		case Opt_seal:
 			/* we do not do the following in secFlags because seal
@@ -1455,6 +1465,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 		case Opt_sloppy:
 			sloppy = true;
 			break;
+		case Opt_nosharesock:
+			vol->nosharesock = true;
+			break;
 
 		/* Numeric Values */
 		case Opt_backupuid:
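
nosharesock only sets a flag here; its teeth are in match_server() below, which refuses to match an existing TCP_Server_Info when the flag is set, so the mount gets a dedicated socket and SMB session instead of piggybacking on an existing one. For example (hypothetical shares), the second mount below will not share the first mount's TCP connection:

	mount -t cifs //srv/a /mnt/a -o username=user
	mount -t cifs //srv/b /mnt/b -o username=user,nosharesock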
@@ -1662,7 +1675,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (strnlen(string, 256) == 256) {
+			if (strnlen(string, CIFS_MAX_DOMAINNAME_LEN)
+					== CIFS_MAX_DOMAINNAME_LEN) {
 				printk(KERN_WARNING "CIFS: domain name too"
 						    " long\n");
 				goto cifs_parse_mount_err;
@@ -1978,47 +1992,21 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
 static bool
 match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
-	unsigned int secFlags;
-
-	if (vol->secFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
-		secFlags = vol->secFlg;
-	else
-		secFlags = global_secflags | vol->secFlg;
-
-	switch (server->secType) {
-	case LANMAN:
-		if (!(secFlags & (CIFSSEC_MAY_LANMAN|CIFSSEC_MAY_PLNTXT)))
-			return false;
-		break;
-	case NTLMv2:
-		if (!(secFlags & CIFSSEC_MAY_NTLMV2))
-			return false;
-		break;
-	case NTLM:
-		if (!(secFlags & CIFSSEC_MAY_NTLM))
-			return false;
-		break;
-	case Kerberos:
-		if (!(secFlags & CIFSSEC_MAY_KRB5))
-			return false;
-		break;
-	case RawNTLMSSP:
-		if (!(secFlags & CIFSSEC_MAY_NTLMSSP))
-			return false;
-		break;
-	default:
-		/* shouldn't happen */
+	/*
+	 * The select_sectype function should either return the vol->sectype
+	 * that was specified, or "Unspecified" if that sectype was not
+	 * compatible with the given NEGOTIATE request.
+	 */
+	if (select_sectype(server, vol->sectype) == Unspecified)
 		return false;
-	}
 
-	/* now check if signing mode is acceptable */
-	if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
-	    (server->sec_mode & SECMODE_SIGN_REQUIRED))
-			return false;
-	else if (((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) &&
-		 (server->sec_mode &
-		  (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)) == 0)
+	/*
+	 * Now check if signing mode is acceptable. No need to check
+	 * global_secflags at this point since if MUST_SIGN is set then
+	 * the server->sign had better be too.
+	 */
+	if (vol->sign && !server->sign)
 		return false;
 
 	return true;
 }
@@ -2027,6 +2015,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
 {
 	struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
 
+	if (vol->nosharesock)
+		return 0;
+
 	if ((server->vals != vol->vals) || (server->ops != vol->ops))
 		return 0;
 
@@ -2118,12 +2109,6 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		goto out_err;
 	}
 
-	rc = cifs_crypto_shash_allocate(tcp_ses);
-	if (rc) {
-		cifs_dbg(VFS, "could not setup hash structures rc %d\n", rc);
-		goto out_err;
-	}
-
 	tcp_ses->ops = volume_info->ops;
 	tcp_ses->vals = volume_info->vals;
 	cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
@@ -2216,7 +2201,11 @@ out_err:
 
 static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
 {
-	switch (ses->server->secType) {
+	if (vol->sectype != Unspecified &&
+	    vol->sectype != ses->sectype)
+		return 0;
+
+	switch (ses->sectype) {
 	case Kerberos:
 		if (!uid_eq(vol->cred_uid, ses->cred_uid))
 			return 0;
@@ -2288,8 +2277,8 @@ cifs_put_smb_ses(struct cifs_ses *ses)
 
 #ifdef CONFIG_KEYS
 
-/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */
-#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1)
+/* strlen("cifs:a:") + CIFS_MAX_DOMAINNAME_LEN + 1 */
+#define CIFSCREDS_DESC_SIZE (7 + CIFS_MAX_DOMAINNAME_LEN + 1)
 
 /* Populate username and pw fields from keyring if possible */
 static int
@@ -2493,7 +2482,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2493 ses->cred_uid = volume_info->cred_uid; 2482 ses->cred_uid = volume_info->cred_uid;
2494 ses->linux_uid = volume_info->linux_uid; 2483 ses->linux_uid = volume_info->linux_uid;
2495 2484
2496 ses->overrideSecFlg = volume_info->secFlg; 2485 ses->sectype = volume_info->sectype;
2486 ses->sign = volume_info->sign;
2497 2487
2498 mutex_lock(&ses->session_mutex); 2488 mutex_lock(&ses->session_mutex);
2499 rc = cifs_negotiate_protocol(xid, ses); 2489 rc = cifs_negotiate_protocol(xid, ses);
@@ -3656,7 +3646,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
3656 NTLMv2 password here) */ 3646 NTLMv2 password here) */
3657#ifdef CONFIG_CIFS_WEAK_PW_HASH 3647#ifdef CONFIG_CIFS_WEAK_PW_HASH
3658 if ((global_secflags & CIFSSEC_MAY_LANMAN) && 3648 if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
3659 (ses->server->secType == LANMAN)) 3649 (ses->sectype == LANMAN))
3660 calc_lanman_hash(tcon->password, ses->server->cryptkey, 3650 calc_lanman_hash(tcon->password, ses->server->cryptkey,
3661 ses->server->sec_mode & 3651 ses->server->sec_mode &
3662 SECMODE_PW_ENCRYPT ? true : false, 3652 SECMODE_PW_ENCRYPT ? true : false,
@@ -3674,8 +3664,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
3674 } 3664 }
3675 } 3665 }
3676 3666
3677 if (ses->server->sec_mode & 3667 if (ses->server->sign)
3678 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
3679 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 3668 smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
3680 3669
3681 if (ses->capabilities & CAP_STATUS32) { 3670 if (ses->capabilities & CAP_STATUS32) {
@@ -3738,7 +3727,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
3738 } 3727 }
3739 bcc_ptr += length + 1; 3728 bcc_ptr += length + 1;
3740 bytes_left -= (length + 1); 3729 bytes_left -= (length + 1);
3741 strncpy(tcon->treeName, tree, MAX_TREE_SIZE); 3730 strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
3742 3731
3743 /* mostly informational -- no need to fail on error here */ 3732 /* mostly informational -- no need to fail on error here */
3744 kfree(tcon->nativeFileSystem); 3733 kfree(tcon->nativeFileSystem);
@@ -3827,7 +3816,6 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
3827 int rc = -ENOSYS; 3816 int rc = -ENOSYS;
3828 struct TCP_Server_Info *server = ses->server; 3817 struct TCP_Server_Info *server = ses->server;
3829 3818
3830 ses->flags = 0;
3831 ses->capabilities = server->capabilities; 3819 ses->capabilities = server->capabilities;
3832 if (linuxExtEnabled == 0) 3820 if (linuxExtEnabled == 0)
3833 ses->capabilities &= (~server->vals->cap_unix); 3821 ses->capabilities &= (~server->vals->cap_unix);
@@ -3848,6 +3836,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
3848 server->sequence_number = 0x2; 3836 server->sequence_number = 0x2;
3849 server->session_estab = true; 3837 server->session_estab = true;
3850 ses->auth_key.response = NULL; 3838 ses->auth_key.response = NULL;
3839 if (server->ops->generate_signingkey)
3840 server->ops->generate_signingkey(server);
3851 } 3841 }
3852 mutex_unlock(&server->srv_mutex); 3842 mutex_unlock(&server->srv_mutex);
3853 3843
@@ -3870,23 +3860,11 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
3870static int 3860static int
3871cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses) 3861cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
3872{ 3862{
3873 switch (ses->server->secType) { 3863 vol->sectype = ses->sectype;
3874 case Kerberos: 3864
3875 vol->secFlg = CIFSSEC_MUST_KRB5; 3865 /* krb5 is special, since we don't need username or pw */
3866 if (vol->sectype == Kerberos)
3876 return 0; 3867 return 0;
3877 case NTLMv2:
3878 vol->secFlg = CIFSSEC_MUST_NTLMV2;
3879 break;
3880 case NTLM:
3881 vol->secFlg = CIFSSEC_MUST_NTLM;
3882 break;
3883 case RawNTLMSSP:
3884 vol->secFlg = CIFSSEC_MUST_NTLMSSP;
3885 break;
3886 case LANMAN:
3887 vol->secFlg = CIFSSEC_MUST_LANMAN;
3888 break;
3889 }
3890 3868
3891 return cifs_set_cifscreds(vol, ses); 3869 return cifs_set_cifscreds(vol, ses);
3892} 3870}
@@ -3912,6 +3890,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
3912 vol_info->nocase = master_tcon->nocase; 3890 vol_info->nocase = master_tcon->nocase;
3913 vol_info->local_lease = master_tcon->local_lease; 3891 vol_info->local_lease = master_tcon->local_lease;
3914 vol_info->no_linux_ext = !master_tcon->unix_ext; 3892 vol_info->no_linux_ext = !master_tcon->unix_ext;
3893 vol_info->sectype = master_tcon->ses->sectype;
3894 vol_info->sign = master_tcon->ses->sign;
3915 3895
3916 rc = cifs_set_vol_auth(vol_info, master_tcon->ses); 3896 rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
3917 if (rc) { 3897 if (rc) {
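The connect.c rework above replaces the old secFlags bitmask walk with two checks: can select_sectype() produce a usable auth type for this server, and is the requested signing state compatible with what the server negotiated. A minimal standalone sketch of that shape follows; the enum values, struct layouts and helper names here are illustrative stand-ins, not the kernel's:

	/* sketch of the two-step security match; build with: cc -o m m.c */
	#include <stdbool.h>
	#include <stdio.h>

	enum sectype { UNSPECIFIED, NTLM, NTLMV2, KRB5 };

	struct server { enum sectype negotiated; bool sign; };
	struct vol    { enum sectype sectype;    bool sign; };

	/* stand-in for select_sectype(): does the server offer what was asked? */
	static enum sectype pick(const struct server *srv, enum sectype req)
	{
		if (req == UNSPECIFIED)
			return srv->negotiated;	/* fall back to server's choice */
		return req == srv->negotiated ? req : UNSPECIFIED;
	}

	static bool match_security(const struct server *srv, const struct vol *vol)
	{
		if (pick(srv, vol->sectype) == UNSPECIFIED)
			return false;		/* no usable auth type */
		if (vol->sign && !srv->sign)
			return false;		/* mount wants signing, server lacks it */
		return true;
	}

	int main(void)
	{
		struct server srv = { NTLMV2, false };
		struct vol v1 = { UNSPECIFIED, false }, v2 = { NTLMV2, true };
		printf("%d %d\n", match_security(&srv, &v1),
				  match_security(&srv, &v2));	/* prints "1 0" */
		return 0;
	}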
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5699b5036ed8..d62ce0d48141 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -204,6 +204,7 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 	struct inode *newinode = NULL;
 	int disposition;
 	struct TCP_Server_Info *server = tcon->ses->server;
+	struct cifs_open_parms oparms;

 	*oplock = 0;
 	if (tcon->ses->server->oplocks)
@@ -319,9 +320,16 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 	if (backup_cred(cifs_sb))
 		create_options |= CREATE_OPEN_BACKUP_INTENT;

-	rc = server->ops->open(xid, tcon, full_path, disposition,
-			       desired_access, create_options, fid, oplock,
-			       buf, cifs_sb);
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = desired_access;
+	oparms.create_options = create_options;
+	oparms.disposition = disposition;
+	oparms.path = full_path;
+	oparms.fid = fid;
+	oparms.reconnect = false;
+
+	rc = server->ops->open(xid, &oparms, oplock, buf);
 	if (rc) {
 		cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
 		goto out;
@@ -822,8 +830,7 @@ const struct dentry_operations cifs_dentry_ops = {
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };

-static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *q)
+static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
 {
 	struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
 	unsigned long hash;
@@ -838,12 +845,10 @@ static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
 	return 0;
 }

-static int cifs_ci_compare(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
-	struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
+	struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls;

 	if ((name->len == len) &&
 	    (nls_strnicmp(codepage, name->name, str, len) == 0))
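The cifs_open_parms conversion in dir.c (and below in file.c) replaces a nine-argument ->open() call with a single parameter struct, so new members such as reconnect can be added without touching every caller. A hedged sketch of the same refactor outside the kernel, with invented names:

	#include <stdbool.h>
	#include <stdio.h>

	struct open_parms {
		const char *path;
		int desired_access;
		int create_options;
		int disposition;
		bool reconnect;	/* a later field costs nothing at old call sites */
	};

	static int do_open(const struct open_parms *p)
	{
		printf("open %s access=%d reconnect=%d\n",
		       p->path, p->desired_access, p->reconnect);
		return 0;
	}

	int main(void)
	{
		struct open_parms oparms = {
			.path = "/share/file.txt",	/* hypothetical path */
			.desired_access = 2,
			.create_options = 0,
			.disposition = 1,
			.reconnect = false,
		};
		return do_open(&oparms);
	}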
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 48b29d24c9f4..7e36ae34e947 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -183,6 +183,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
 	int create_options = CREATE_NOT_DIR;
 	FILE_ALL_INFO *buf;
 	struct TCP_Server_Info *server = tcon->ses->server;
+	struct cifs_open_parms oparms;

 	if (!server->ops->open)
 		return -ENOSYS;
@@ -224,9 +225,16 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
 	if (backup_cred(cifs_sb))
 		create_options |= CREATE_OPEN_BACKUP_INTENT;

-	rc = server->ops->open(xid, tcon, full_path, disposition,
-			       desired_access, create_options, fid, oplock, buf,
-			       cifs_sb);
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = desired_access;
+	oparms.create_options = create_options;
+	oparms.disposition = disposition;
+	oparms.path = full_path;
+	oparms.fid = fid;
+	oparms.reconnect = false;
+
+	rc = server->ops->open(xid, &oparms, oplock, buf);

 	if (rc)
 		goto out;
@@ -553,11 +561,10 @@ cifs_relock_file(struct cifsFileInfo *cfile)
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	int rc = 0;

-	/* we are going to update can_cache_brlcks here - need a write access */
-	down_write(&cinode->lock_sem);
+	down_read(&cinode->lock_sem);
 	if (cinode->can_cache_brlcks) {
-		/* can cache locks - no need to push them */
-		up_write(&cinode->lock_sem);
+		/* can cache locks - no need to relock */
+		up_read(&cinode->lock_sem);
 		return rc;
 	}

@@ -568,7 +575,7 @@ cifs_relock_file(struct cifsFileInfo *cfile)
 	else
 		rc = tcon->ses->server->ops->push_mand_locks(cfile);

-	up_write(&cinode->lock_sem);
+	up_read(&cinode->lock_sem);
 	return rc;
 }

@@ -587,7 +594,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 	int desired_access;
 	int disposition = FILE_OPEN;
 	int create_options = CREATE_NOT_DIR;
-	struct cifs_fid fid;
+	struct cifs_open_parms oparms;

 	xid = get_xid();
 	mutex_lock(&cfile->fh_mutex);
@@ -637,9 +644,10 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)

 		rc = cifs_posix_open(full_path, NULL, inode->i_sb,
 				     cifs_sb->mnt_file_mode /* ignored */,
-				     oflags, &oplock, &fid.netfid, xid);
+				     oflags, &oplock, &cfile->fid.netfid, xid);
 		if (rc == 0) {
 			cifs_dbg(FYI, "posix reopen succeeded\n");
+			oparms.reconnect = true;
 			goto reopen_success;
 		}
 		/*
@@ -654,7 +662,16 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 		create_options |= CREATE_OPEN_BACKUP_INTENT;

 	if (server->ops->get_lease_key)
-		server->ops->get_lease_key(inode, &fid);
+		server->ops->get_lease_key(inode, &cfile->fid);
+
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = desired_access;
+	oparms.create_options = create_options;
+	oparms.disposition = disposition;
+	oparms.path = full_path;
+	oparms.fid = &cfile->fid;
+	oparms.reconnect = true;

 	/*
 	 * Can not refresh inode by passing in file_info buf to be returned by
@@ -663,9 +680,14 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 	 * version of file size can be stale. If we knew for sure that inode was
 	 * not dirty locally we could do this.
 	 */
-	rc = server->ops->open(xid, tcon, full_path, disposition,
-			       desired_access, create_options, &fid, &oplock,
-			       NULL, cifs_sb);
+	rc = server->ops->open(xid, &oparms, &oplock, NULL);
+	if (rc == -ENOENT && oparms.reconnect == false) {
+		/* durable handle timeout is expired - open the file again */
+		rc = server->ops->open(xid, &oparms, &oplock, NULL);
+		/* indicate that we need to relock the file */
+		oparms.reconnect = true;
+	}
+
 	if (rc) {
 		mutex_unlock(&cfile->fh_mutex);
 		cifs_dbg(FYI, "cifs_reopen returned 0x%x\n", rc);
@@ -696,8 +718,9 @@ reopen_success:
 	 * to the server to get the new inode info.
 	 */

-	server->ops->set_fid(cfile, &fid, oplock);
-	cifs_relock_file(cfile);
+	server->ops->set_fid(cfile, &cfile->fid, oplock);
+	if (oparms.reconnect)
+		cifs_relock_file(cfile);

 reopen_error_exit:
 	kfree(full_path);
@@ -999,7 +1022,7 @@ try_again:
 		rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
 		if (!rc)
 			goto try_again;
-		locks_delete_block(flock);
+		posix_unblock_lock(flock);
 	}
 	return rc;
 }
@@ -1092,6 +1115,7 @@ struct lock_to_push {
 static int
 cifs_push_posix_locks(struct cifsFileInfo *cfile)
 {
+	struct inode *inode = cfile->dentry->d_inode;
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	struct file_lock *flock, **before;
 	unsigned int count = 0, i = 0;
@@ -1102,12 +1126,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)

 	xid = get_xid();

-	lock_flocks();
-	cifs_for_each_lock(cfile->dentry->d_inode, before) {
+	spin_lock(&inode->i_lock);
+	cifs_for_each_lock(inode, before) {
 		if ((*before)->fl_flags & FL_POSIX)
 			count++;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);

 	INIT_LIST_HEAD(&locks_to_send);

@@ -1126,8 +1150,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 	}

 	el = locks_to_send.next;
-	lock_flocks();
-	cifs_for_each_lock(cfile->dentry->d_inode, before) {
+	spin_lock(&inode->i_lock);
+	cifs_for_each_lock(inode, before) {
 		flock = *before;
 		if ((flock->fl_flags & FL_POSIX) == 0)
 			continue;
@@ -1152,7 +1176,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 		lck->offset = flock->fl_start;
 		el = el->next;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);

 	list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
 		int stored_rc;
@@ -3546,11 +3570,12 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
 	return cifs_fscache_release_page(page, gfp);
 }

-static void cifs_invalidate_page(struct page *page, unsigned long offset)
+static void cifs_invalidate_page(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);

-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
 }

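The reopen path above retries the open once when it fails with -ENOENT while oparms.reconnect is false, then records that byte-range locks must be re-sent. A small sketch of that retry-once-then-relock shape; this models the control flow only, not the kernel's exact durable-handle state machine, and all names are invented:

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct open_parms { bool reconnect; };

	/* pretend the first attempt finds the durable handle expired */
	static int server_open(struct open_parms *p)
	{
		static int calls;
		return ++calls == 1 ? -ENOENT : 0;
	}

	static int reopen(struct open_parms *p)
	{
		int rc = server_open(p);
		if (rc == -ENOENT && !p->reconnect) {
			/* handle timed out on the server - open the file again */
			rc = server_open(p);
			p->reconnect = true;	/* locks must be re-sent now */
		}
		return rc;
	}

	int main(void)
	{
		struct open_parms oparms = { .reconnect = false };
		int rc = reopen(&oparms);
		if (rc == 0 && oparms.reconnect)
			puts("reopened; relocking file");
		return rc;
	}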
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 20efd81266c6..449b6cf09b09 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -558,6 +558,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 			fattr->cf_mode &= ~(S_IWUGO);

 		fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+		if (fattr->cf_nlink < 1) {
+			cifs_dbg(1, "replacing bogus file nlink value %u\n",
+				   fattr->cf_nlink);
+			fattr->cf_nlink = 1;
+		}
 	}

 	fattr->cf_uid = cifs_sb->mnt_uid;
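The inode.c hunk guards against servers that report NumberOfLinks as zero: a live file always has at least one link, so the count is clamped before the VFS sees it. A self-contained sketch of that sanitization, with invented names:

	#include <stdio.h>

	/* clamp a server-supplied link count to something the VFS can trust */
	static unsigned int sanitize_nlink(unsigned int nlink)
	{
		if (nlink < 1) {
			fprintf(stderr, "replacing bogus nlink value %u\n", nlink);
			nlink = 1;	/* every live file has at least one link */
		}
		return nlink;
	}

	int main(void)
	{
		printf("%u %u\n", sanitize_nlink(0), sanitize_nlink(3)); /* 1 3 */
		return 0;
	}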
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index b83c3f5646bd..562044f700e5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -305,67 +305,89 @@ CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
 }

 int
-CIFSCheckMFSymlink(struct cifs_fattr *fattr,
-		   const unsigned char *path,
-		   struct cifs_sb_info *cifs_sb, unsigned int xid)
+open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
+			unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
+			unsigned int xid)
 {
 	int rc;
 	int oplock = 0;
 	__u16 netfid = 0;
 	struct tcon_link *tlink;
-	struct cifs_tcon *pTcon;
+	struct cifs_tcon *ptcon;
 	struct cifs_io_parms io_parms;
-	u8 *buf;
-	char *pbuf;
-	unsigned int bytes_read = 0;
 	int buf_type = CIFS_NO_BUFFER;
-	unsigned int link_len = 0;
 	FILE_ALL_INFO file_info;

-	if (!CIFSCouldBeMFSymlink(fattr))
-		/* it's not a symlink */
-		return 0;
-
 	tlink = cifs_sb_tlink(cifs_sb);
 	if (IS_ERR(tlink))
 		return PTR_ERR(tlink);
-	pTcon = tlink_tcon(tlink);
+	ptcon = tlink_tcon(tlink);

-	rc = CIFSSMBOpen(xid, pTcon, path, FILE_OPEN, GENERIC_READ,
+	rc = CIFSSMBOpen(xid, ptcon, path, FILE_OPEN, GENERIC_READ,
 			 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
 			 cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags &
 			 CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if (rc != 0)
-		goto out;
+	if (rc != 0) {
+		cifs_put_tlink(tlink);
+		return rc;
+	}

 	if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
-		CIFSSMBClose(xid, pTcon, netfid);
+		CIFSSMBClose(xid, ptcon, netfid);
+		cifs_put_tlink(tlink);
 		/* it's not a symlink */
-		goto out;
+		return rc;
 	}

-	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
-	if (!buf) {
-		rc = -ENOMEM;
-		goto out;
-	}
-	pbuf = buf;
 	io_parms.netfid = netfid;
 	io_parms.pid = current->tgid;
-	io_parms.tcon = pTcon;
+	io_parms.tcon = ptcon;
 	io_parms.offset = 0;
 	io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;

-	rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
-	CIFSSMBClose(xid, pTcon, netfid);
-	if (rc != 0) {
-		kfree(buf);
+	rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
+	CIFSSMBClose(xid, ptcon, netfid);
+	cifs_put_tlink(tlink);
+	return rc;
+}
+
+
+int
+CIFSCheckMFSymlink(struct cifs_fattr *fattr,
+		   const unsigned char *path,
+		   struct cifs_sb_info *cifs_sb, unsigned int xid)
+{
+	int rc = 0;
+	u8 *buf = NULL;
+	unsigned int link_len = 0;
+	unsigned int bytes_read = 0;
+	struct cifs_tcon *ptcon;
+
+	if (!CIFSCouldBeMFSymlink(fattr))
+		/* it's not a symlink */
+		return 0;
+
+	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
+	if (!buf) {
+		rc = -ENOMEM;
 		goto out;
 	}

+	ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb));
+	if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink))
+		rc = ptcon->ses->server->ops->query_mf_symlink(path, buf,
+						&bytes_read, cifs_sb, xid);
+	else
+		goto out;
+
+	if (rc != 0)
+		goto out;
+
+	if (bytes_read == 0) /* not a symlink */
+		goto out;
+
 	rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
-	kfree(buf);
 	if (rc == -EINVAL) {
 		/* it's not a symlink */
 		rc = 0;
@@ -381,7 +403,7 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
 	fattr->cf_mode |= S_IFLNK | S_IRWXU | S_IRWXG | S_IRWXO;
 	fattr->cf_dtype = DT_LNK;
 out:
-	cifs_put_tlink(tlink);
+	kfree(buf);
 	return rc;
 }

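The link.c split moves the protocol-specific open/read/close into a helper that fills a caller-supplied buffer and reports bytes read through an out-parameter, leaving the generic parsing in CIFSCheckMFSymlink. A hedged sketch of that division of labor; the "XSym" magic matches the Minshall-French symlink format, but every function name here is invented:

	#include <stdio.h>
	#include <string.h>

	/* protocol half: fill caller's buffer, report bytes read */
	static int query_symlink(const char *path, char *buf, size_t buflen,
				 unsigned int *bytes_read)
	{
		const char *fake = "XSym\n0010\n";	/* stand-in file contents */
		*bytes_read = (unsigned int)strlen(fake);
		if (*bytes_read > buflen)
			return -1;
		memcpy(buf, fake, *bytes_read);
		return 0;
	}

	/* generic half: decide whether the blob really is a symlink */
	static int check_symlink(const char *path)
	{
		char buf[64];
		unsigned int n = 0;
		if (query_symlink(path, buf, sizeof(buf), &n) != 0 || n == 0)
			return 0;			/* not a symlink */
		return strncmp(buf, "XSym", 4) == 0;	/* parse without re-reading */
	}

	int main(void)
	{
		printf("symlink? %d\n", check_symlink("/share/link"));
		return 0;
	}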
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1bec014779fd..f7d4b2285efe 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -267,8 +267,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 		if (treeCon->nocase)
 			buffer->Flags |= SMBFLG_CASELESS;
 		if ((treeCon->ses) && (treeCon->ses->server))
-			if (treeCon->ses->server->sec_mode &
-			  (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+			if (treeCon->ses->server->sign)
 				buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
 	}

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 770d5a9781c1..69d2c826a23b 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -111,6 +111,14 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
 		return;
 	}

+	/*
+	 * If we know that the inode will need to be revalidated immediately,
+	 * then don't create a new dentry for it. We'll end up doing an on
+	 * the wire call either way and this spares us an invalidation.
+	 */
+	if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
+		return;
+
 	dentry = d_alloc(parent, name);
 	if (!dentry)
 		return;
@@ -126,6 +134,22 @@ out:
 	dput(dentry);
 }

+/*
+ * Is it possible that this directory might turn out to be a DFS referral
+ * once we go to try and use it?
+ */
+static bool
+cifs_dfs_is_possible(struct cifs_sb_info *cifs_sb)
+{
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+	if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+		return true;
+#endif
+	return false;
+}
+
 static void
 cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 {
@@ -135,6 +159,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
 		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
 		fattr->cf_dtype = DT_DIR;
+		/*
+		 * Windows CIFS servers generally make DFS referrals look
+		 * like directories in FIND_* responses with the reparse
+		 * attribute flag also set (since DFS junctions are
+		 * reparse points). We must revalidate at least these
+		 * directory inodes before trying to use them (if
+		 * they are DFS we will get PATH_NOT_COVERED back
+		 * when queried directly and can then try to connect
+		 * to the DFS target)
+		 */
+		if (cifs_dfs_is_possible(cifs_sb) &&
+		    (fattr->cf_cifsattrs & ATTR_REPARSE))
+			fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
 	} else {
 		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
 		fattr->cf_dtype = DT_REG;
@@ -537,14 +574,14 @@ static int cifs_save_resume_key(const char *current_entry,
  * every entry (do not increment for . or .. entry).
  */
 static int
-find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
+find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
 		struct file *file, char **current_entry, int *num_to_ret)
 {
 	__u16 search_flags;
 	int rc = 0;
 	int pos_in_buf = 0;
 	loff_t first_entry_in_buffer;
-	loff_t index_to_find = file->f_pos;
+	loff_t index_to_find = pos;
 	struct cifsFileInfo *cfile = file->private_data;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	struct TCP_Server_Info *server = tcon->ses->server;
@@ -659,8 +696,9 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon,
 	return rc;
 }

-static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
-		void *dirent, char *scratch_buf, unsigned int max_len)
+static int cifs_filldir(char *find_entry, struct file *file,
+		struct dir_context *ctx,
+		char *scratch_buf, unsigned int max_len)
 {
 	struct cifsFileInfo *file_info = file->private_data;
 	struct super_block *sb = file->f_path.dentry->d_sb;
@@ -740,13 +778,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
 	cifs_prime_dcache(file->f_dentry, &name, &fattr);

 	ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
-	rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
-		     fattr.cf_dtype);
-	return rc;
+	return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype);
 }


-int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
+int cifs_readdir(struct file *file, struct dir_context *ctx)
 {
 	int rc = 0;
 	unsigned int xid;
@@ -772,103 +808,86 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		goto rddir2_exit;
 	}

-	switch ((int) file->f_pos) {
-	case 0:
-		if (filldir(direntry, ".", 1, file->f_pos,
-		     file_inode(file)->i_ino, DT_DIR) < 0) {
-			cifs_dbg(VFS, "Filldir for current dir failed\n");
-			rc = -ENOMEM;
-			break;
-		}
-		file->f_pos++;
-	case 1:
-		if (filldir(direntry, "..", 2, file->f_pos,
-		     parent_ino(file->f_path.dentry), DT_DIR) < 0) {
-			cifs_dbg(VFS, "Filldir for parent dir failed\n");
-			rc = -ENOMEM;
-			break;
-		}
-		file->f_pos++;
-	default:
-		/* 1) If search is active,
-			is in current search buffer?
-			if it before then restart search
-			if after then keep searching till find it */
-
-		if (file->private_data == NULL) {
-			rc = -EINVAL;
-			free_xid(xid);
-			return rc;
-		}
-		cifsFile = file->private_data;
-		if (cifsFile->srch_inf.endOfSearch) {
-			if (cifsFile->srch_inf.emptyDir) {
-				cifs_dbg(FYI, "End of search, empty dir\n");
-				rc = 0;
-				break;
-			}
-		} /* else {
-			cifsFile->invalidHandle = true;
-			tcon->ses->server->close(xid, tcon, &cifsFile->fid);
-		} */
+	if (!dir_emit_dots(file, ctx))
+		goto rddir2_exit;

-		tcon = tlink_tcon(cifsFile->tlink);
-		rc = find_cifs_entry(xid, tcon, file, &current_entry,
-				     &num_to_fill);
-		if (rc) {
-			cifs_dbg(FYI, "fce error %d\n", rc);
-			goto rddir2_exit;
-		} else if (current_entry != NULL) {
-			cifs_dbg(FYI, "entry %lld found\n", file->f_pos);
-		} else {
-			cifs_dbg(FYI, "could not find entry\n");
+	/* 1) If search is active,
+		is in current search buffer?
+		if it before then restart search
+		if after then keep searching till find it */
+
+	if (file->private_data == NULL) {
+		rc = -EINVAL;
+		goto rddir2_exit;
+	}
+	cifsFile = file->private_data;
+	if (cifsFile->srch_inf.endOfSearch) {
+		if (cifsFile->srch_inf.emptyDir) {
+			cifs_dbg(FYI, "End of search, empty dir\n");
+			rc = 0;
 			goto rddir2_exit;
 		}
-		cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
-			 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
-		max_len = tcon->ses->server->ops->calc_smb_size(
-				cifsFile->srch_inf.ntwrk_buf_start);
-		end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
-
-		tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
-		if (tmp_buf == NULL) {
-			rc = -ENOMEM;
+	} /* else {
+		cifsFile->invalidHandle = true;
+		tcon->ses->server->close(xid, tcon, &cifsFile->fid);
+	} */
+
+	tcon = tlink_tcon(cifsFile->tlink);
+	rc = find_cifs_entry(xid, tcon, ctx->pos, file, &current_entry,
+			     &num_to_fill);
+	if (rc) {
+		cifs_dbg(FYI, "fce error %d\n", rc);
+		goto rddir2_exit;
+	} else if (current_entry != NULL) {
+		cifs_dbg(FYI, "entry %lld found\n", ctx->pos);
+	} else {
+		cifs_dbg(FYI, "could not find entry\n");
+		goto rddir2_exit;
+	}
+	cifs_dbg(FYI, "loop through %d times filling dir for net buf %p\n",
+		 num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
+	max_len = tcon->ses->server->ops->calc_smb_size(
+			cifsFile->srch_inf.ntwrk_buf_start);
+	end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
+
+	tmp_buf = kmalloc(UNICODE_NAME_MAX, GFP_KERNEL);
+	if (tmp_buf == NULL) {
+		rc = -ENOMEM;
+		goto rddir2_exit;
+	}
+
+	for (i = 0; i < num_to_fill; i++) {
+		if (current_entry == NULL) {
+			/* evaluate whether this case is an error */
+			cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n",
+				 num_to_fill, i);
 			break;
 		}
-
-		for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
-			if (current_entry == NULL) {
-				/* evaluate whether this case is an error */
-				cifs_dbg(VFS, "past SMB end, num to fill %d i %d\n",
-					 num_to_fill, i);
-				break;
-			}
-			/*
-			 * if buggy server returns . and .. late do we want to
-			 * check for that here?
-			 */
-			rc = cifs_filldir(current_entry, file, filldir,
-					  direntry, tmp_buf, max_len);
-			if (rc == -EOVERFLOW) {
+		/*
+		 * if buggy server returns . and .. late do we want to
+		 * check for that here?
+		 */
+		rc = cifs_filldir(current_entry, file, ctx,
+				  tmp_buf, max_len);
+		if (rc) {
+			if (rc > 0)
 				rc = 0;
-				break;
-			}
-
-			file->f_pos++;
-			if (file->f_pos ==
-				cifsFile->srch_inf.index_of_last_entry) {
-				cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
-					 file->f_pos, tmp_buf);
-				cifs_save_resume_key(current_entry, cifsFile);
-				break;
-			} else
-				current_entry =
-					nxt_dir_entry(current_entry, end_of_smb,
-						cifsFile->srch_inf.info_level);
+			break;
 		}
-		kfree(tmp_buf);
-		break;
-	} /* end switch */
+
+		ctx->pos++;
+		if (ctx->pos ==
+			cifsFile->srch_inf.index_of_last_entry) {
+			cifs_dbg(FYI, "last entry in buf at pos %lld %s\n",
+				 ctx->pos, tmp_buf);
+			cifs_save_resume_key(current_entry, cifsFile);
+			break;
+		} else
+			current_entry =
+				nxt_dir_entry(current_entry, end_of_smb,
+					cifsFile->srch_inf.info_level);
+	}
+	kfree(tmp_buf);

 rddir2_exit:
 	free_xid(xid);
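The readdir conversion above moves from the old filldir callback to the dir_context model: dir_emit() returns false when the caller's buffer is full, and the resume position lives in ctx->pos rather than file->f_pos. A self-contained sketch of an iterator in that style, with invented names:

	#include <stdbool.h>
	#include <stdio.h>

	struct dir_ctx {
		long long pos;
		int room;		/* emulate a finite user buffer */
	};

	static bool emit(struct dir_ctx *ctx, const char *name)
	{
		if (ctx->room-- <= 0)
			return false;	/* buffer full: stop, keep ctx->pos */
		printf("%lld: %s\n", ctx->pos, name);
		return true;
	}

	static void iterate(struct dir_ctx *ctx)
	{
		static const char *names[] = { "a.txt", "b.txt", "c.txt" };
		while (ctx->pos < 3) {
			if (!emit(ctx, names[ctx->pos]))
				return;	/* caller resumes from ctx->pos later */
			ctx->pos++;
		}
	}

	int main(void)
	{
		struct dir_ctx ctx = { .pos = 0, .room = 2 };
		iterate(&ctx);		/* emits a.txt, b.txt */
		ctx.room = 2;
		iterate(&ctx);		/* resumes at c.txt */
		return 0;
	}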
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index f230571a7ab3..08dd37bb23aa 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -138,8 +138,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB)
 	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
 			CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;

-	if (ses->server->sec_mode &
-	    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+	if (ses->server->sign)
 		pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;

 	if (ses->capabilities & CAP_UNICODE) {
@@ -198,7 +197,7 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
 		bytes_ret = 0;
 	} else
 		bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName,
-					    256, nls_cp);
+					    CIFS_MAX_DOMAINNAME_LEN, nls_cp);
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2;  /* account for null terminator */

@@ -256,8 +255,8 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,

 	/* copy domain */
 	if (ses->domainName != NULL) {
-		strncpy(bcc_ptr, ses->domainName, 256);
-		bcc_ptr += strnlen(ses->domainName, 256);
+		strncpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+		bcc_ptr += strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
 	} /* else we will send a null domain name
 	     so the server will default to its own domain */
 	*bcc_ptr = 0;
@@ -310,11 +309,10 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 	return;
 }

-static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
+static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
 			       struct cifs_ses *ses,
 			       const struct nls_table *nls_cp)
 {
-	int rc = 0;
 	int len;
 	char *bcc_ptr = *pbcc_area;

@@ -322,24 +320,22 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,

 	len = strnlen(bcc_ptr, bleft);
 	if (len >= bleft)
-		return rc;
+		return;

 	kfree(ses->serverOS);

 	ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
 	if (ses->serverOS)
 		strncpy(ses->serverOS, bcc_ptr, len);
-	if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
+	if (strncmp(ses->serverOS, "OS/2", 4) == 0)
 		cifs_dbg(FYI, "OS/2 server\n");
-		ses->flags |= CIFS_SES_OS2;
-	}

 	bcc_ptr += len + 1;
 	bleft -= len + 1;

 	len = strnlen(bcc_ptr, bleft);
 	if (len >= bleft)
-		return rc;
+		return;

 	kfree(ses->serverNOS);

@@ -352,7 +348,7 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,

 	len = strnlen(bcc_ptr, bleft);
 	if (len > bleft)
-		return rc;
+		return;

 	/* No domain field in LANMAN case. Domain is
 	   returned by old servers in the SMB negprot response */
@@ -360,8 +356,6 @@ static int decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
 	   but thus do return domain here we could add parsing
 	   for it later, but it is not very important */
 	cifs_dbg(FYI, "ascii: bytes left %d\n", bleft);
-
-	return rc;
 }

 int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
@@ -432,8 +426,7 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 	flags = NTLMSSP_NEGOTIATE_56 |	NTLMSSP_REQUEST_TARGET |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
 		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
-	if (ses->server->sec_mode &
-	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+	if (ses->server->sign) {
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
 		if (!ses->server->session_estab)
 			flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
@@ -471,8 +464,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
 		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
-	if (ses->server->sec_mode &
-	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+	if (ses->server->sign) {
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
 		if (!ses->server->session_estab)
 			flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
@@ -558,6 +550,56 @@ setup_ntlmv2_ret:
 	return rc;
 }

+enum securityEnum
+select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
+{
+	switch (server->negflavor) {
+	case CIFS_NEGFLAVOR_EXTENDED:
+		switch (requested) {
+		case Kerberos:
+		case RawNTLMSSP:
+			return requested;
+		case Unspecified:
+			if (server->sec_ntlmssp &&
+			    (global_secflags & CIFSSEC_MAY_NTLMSSP))
+				return RawNTLMSSP;
+			if ((server->sec_kerberos || server->sec_mskerberos) &&
+			    (global_secflags & CIFSSEC_MAY_KRB5))
+				return Kerberos;
+			/* Fallthrough */
+		default:
+			return Unspecified;
+		}
+	case CIFS_NEGFLAVOR_UNENCAP:
+		switch (requested) {
+		case NTLM:
+		case NTLMv2:
+			return requested;
+		case Unspecified:
+			if (global_secflags & CIFSSEC_MAY_NTLMV2)
+				return NTLMv2;
+			if (global_secflags & CIFSSEC_MAY_NTLM)
+				return NTLM;
+			/* Fallthrough */
+		default:
+			return Unspecified;
+		}
+	case CIFS_NEGFLAVOR_LANMAN:
+		switch (requested) {
+		case LANMAN:
+			return requested;
+		case Unspecified:
+			if (global_secflags & CIFSSEC_MAY_LANMAN)
+				return LANMAN;
+			/* Fallthrough */
+		default:
+			return Unspecified;
+		}
+	default:
+		return Unspecified;
+	}
+}
+
 int
 CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 	       const struct nls_table *nls_cp)
@@ -579,11 +621,18 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 	u16 blob_len;
 	char *ntlmsspblob = NULL;

-	if (ses == NULL)
+	if (ses == NULL) {
+		WARN(1, "%s: ses == NULL!", __func__);
 		return -EINVAL;
+	}

-	type = ses->server->secType;
+	type = select_sectype(ses->server, ses->sectype);
 	cifs_dbg(FYI, "sess setup type %d\n", type);
+	if (type == Unspecified) {
+		cifs_dbg(VFS, "Unable to select appropriate authentication method!");
+		return -EINVAL;
+	}
+
 	if (type == RawNTLMSSP) {
 		/* if memory allocation is successful, caller of this function
 		 * frees it.
@@ -643,8 +692,6 @@ ssetup_ntlmssp_authenticate:
 	}
 	bcc_ptr = str_area;

-	ses->flags &= ~CIFS_SES_LANMAN;
-
 	iov[1].iov_base = NULL;
 	iov[1].iov_len = 0;

@@ -668,7 +715,6 @@ ssetup_ntlmssp_authenticate:
 				      ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
 				      true : false, lnm_session_key);

-		ses->flags |= CIFS_SES_LANMAN;
 		memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
 		bcc_ptr += CIFS_AUTH_RESP_SIZE;

@@ -938,8 +984,7 @@ ssetup_ntlmssp_authenticate:
 		}
 		decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
 	} else {
-		rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining,
-					 ses, nls_cp);
+		decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp);
 	}

 ssetup_exit:
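The new select_sectype() honors an explicit request when the negotiated flavor supports it, otherwise falls back through the allowed mechanisms in order of preference. A compact standalone sketch of that selection shape (one flavor only; enum names and flags are illustrative, not the kernel's):

	#include <stdio.h>

	enum sec { UNSPEC, NTLM, NTLMV2 };

	/* honor an explicit request if supported, else pick the strongest allowed */
	static enum sec select_sec(int may_ntlmv2, int may_ntlm, enum sec requested)
	{
		switch (requested) {
		case NTLM:
		case NTLMV2:
			return requested;	/* caller knows what it wants */
		case UNSPEC:
			if (may_ntlmv2)
				return NTLMV2;	/* prefer the stronger hash */
			if (may_ntlm)
				return NTLM;
			/* fallthrough */
		default:
			return UNSPEC;		/* nothing acceptable */
		}
	}

	int main(void)
	{
		printf("%d\n", select_sec(1, 1, UNSPEC));	/* 2 (NTLMV2) */
		printf("%d\n", select_sec(0, 0, UNSPEC));	/* 0 (UNSPEC) */
		return 0;
	}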
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 3efdb9d5c0b8..60943978aec3 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -449,8 +449,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
 	 * WRITEX header, not including the 4 byte RFC1001 length.
 	 */
 	if (!(server->capabilities & CAP_LARGE_WRITE_X) ||
-	    (!(server->capabilities & CAP_UNIX) &&
-	     (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED))))
+	    (!(server->capabilities & CAP_UNIX) && server->sign))
 		wsize = min_t(unsigned int, wsize,
 				server->maxBuf - sizeof(WRITE_REQ) + 4);

@@ -675,20 +674,23 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
 }

 static int
-cifs_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
-	       int disposition, int desired_access, int create_options,
-	       struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf,
-	       struct cifs_sb_info *cifs_sb)
-{
-	if (!(tcon->ses->capabilities & CAP_NT_SMBS))
-		return SMBLegacyOpen(xid, tcon, path, disposition,
-				     desired_access, create_options,
-				     &fid->netfid, oplock, buf,
-				     cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
+cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
+	       __u32 *oplock, FILE_ALL_INFO *buf)
+{
+	if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS))
+		return SMBLegacyOpen(xid, oparms->tcon, oparms->path,
+				     oparms->disposition,
+				     oparms->desired_access,
+				     oparms->create_options,
+				     &oparms->fid->netfid, oplock, buf,
+				     oparms->cifs_sb->local_nls,
+				     oparms->cifs_sb->mnt_cifs_flags
 				     & CIFS_MOUNT_MAP_SPECIAL_CHR);
-	return CIFSSMBOpen(xid, tcon, path, disposition, desired_access,
-			   create_options, &fid->netfid, oplock, buf,
-			   cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+	return CIFSSMBOpen(xid, oparms->tcon, oparms->path,
+			   oparms->disposition, oparms->desired_access,
+			   oparms->create_options, &oparms->fid->netfid, oplock,
+			   buf, oparms->cifs_sb->local_nls,
+			   oparms->cifs_sb->mnt_cifs_flags &
 			   CIFS_MOUNT_MAP_SPECIAL_CHR);
 }

@@ -765,20 +767,14 @@ smb_set_file_info(struct inode *inode, const char *full_path,
 	}
 	tcon = tlink_tcon(tlink);

-	/*
-	 * NT4 apparently returns success on this call, but it doesn't really
-	 * work.
-	 */
-	if (!(tcon->ses->flags & CIFS_SES_NT4)) {
-		rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf,
-					cifs_sb->local_nls,
+	rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc == 0) {
 		cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
 		goto out;
-	} else if (rc != -EOPNOTSUPP && rc != -EINVAL)
+	} else if (rc != -EOPNOTSUPP && rc != -EINVAL) {
 		goto out;
 	}

 	cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
@@ -948,6 +944,7 @@ struct smb_version_operations smb1_operations = {
 	.mand_lock = cifs_mand_lock,
 	.mand_unlock_range = cifs_unlock_range,
 	.push_mand_locks = cifs_push_mandatory_locks,
+	.query_mf_symlink = open_query_close_cifs_symlink,
 };

 struct smb_version_values smb1_values = {
@@ -964,4 +961,6 @@ struct smb_version_values smb1_values = {
 	.cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
 	.cap_large_files = CAP_LARGE_FILES,
 	.oplock_read = OPLOCK_READ,
+	.signing_enabled = SECMODE_SIGN_ENABLED,
+	.signing_required = SECMODE_SIGN_REQUIRED,
 };
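smb1_operations gains a .query_mf_symlink hook here while other dialects leave it unset, so callers (as in the link.c hunk above) test the pointer before dispatching. A minimal sketch of that optional-vtable-member pattern, with invented names:

	#include <stdio.h>

	struct ops {
		int (*query_mf_symlink)(const char *path);	/* optional hook */
	};

	static int smb1_query(const char *path)
	{
		printf("SMB1 query of %s\n", path);
		return 0;
	}

	static const struct ops smb1_ops = { .query_mf_symlink = smb1_query };
	static const struct ops smb2_ops = { 0 };	/* hook not implemented */

	static int maybe_query(const struct ops *o, const char *path)
	{
		if (!o->query_mf_symlink)
			return -1;	/* dialect can't do it: skip quietly */
		return o->query_mf_symlink(path);
	}

	int main(void)
	{
		maybe_query(&smb1_ops, "/share/link");
		maybe_query(&smb2_ops, "/share/link");
		return 0;
	}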
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 5da1b55a2258..04a81a4142c3 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -40,7 +40,8 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
 	oplock &= 0xFF;
 	if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
 		return;
-	if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
+	if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE ||
+	    oplock == SMB2_OPLOCK_LEVEL_BATCH) {
 		cinode->clientCanCacheAll = true;
 		cinode->clientCanCacheRead = true;
 		cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
@@ -57,17 +58,16 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
 }

 int
-smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
-	       int disposition, int desired_access, int create_options,
-	       struct cifs_fid *fid, __u32 *oplock, FILE_ALL_INFO *buf,
-	       struct cifs_sb_info *cifs_sb)
+smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
+	       __u32 *oplock, FILE_ALL_INFO *buf)
 {
 	int rc;
 	__le16 *smb2_path;
 	struct smb2_file_all_info *smb2_data = NULL;
 	__u8 smb2_oplock[17];
+	struct cifs_fid *fid = oparms->fid;

-	smb2_path = cifs_convert_path_to_utf16(path, cifs_sb);
+	smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
 	if (smb2_path == NULL) {
 		rc = -ENOMEM;
 		goto out;
 	}
@@ -80,21 +80,19 @@ smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon, const char *path,
 		goto out;
 	}

-	desired_access |= FILE_READ_ATTRIBUTES;
-	*smb2_oplock = SMB2_OPLOCK_LEVEL_EXCLUSIVE;
+	oparms->desired_access |= FILE_READ_ATTRIBUTES;
+	*smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;

-	if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
+	if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
 		memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE);

-	rc = SMB2_open(xid, tcon, smb2_path, &fid->persistent_fid,
-		       &fid->volatile_fid, desired_access, disposition,
-		       0, 0, smb2_oplock, smb2_data);
+	rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data);
 	if (rc)
 		goto out;

 	if (buf) {
 		/* open response does not have IndexNumber field - get it */
-		rc = SMB2_get_srv_num(xid, tcon, fid->persistent_fid,
+		rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
 				      fid->volatile_fid,
 				      &smb2_data->IndexNumber);
 		if (rc) {
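smb2_set_oplock_level now treats a batch oplock like an exclusive one for caching purposes, and the client requests BATCH by default. A standalone sketch of that level-to-cache-state mapping; the level values match MS-SMB2 (none 0x00, level II 0x01, exclusive 0x08, batch 0x09), everything else here is illustrative:

	#include <stdbool.h>
	#include <stdio.h>

	#define LVL_NONE	0x00
	#define LVL_II		0x01
	#define LVL_EXCLUSIVE	0x08
	#define LVL_BATCH	0x09

	struct cache { bool all, read; };

	static void set_oplock(struct cache *c, unsigned int oplock)
	{
		oplock &= 0xFF;
		if (oplock == LVL_EXCLUSIVE || oplock == LVL_BATCH) {
			c->all = c->read = true;	/* batch caches like exclusive */
		} else if (oplock == LVL_II) {
			c->all = false;
			c->read = true;			/* read caching only */
		} else {
			c->all = c->read = false;	/* no caching */
		}
	}

	int main(void)
	{
		struct cache c;
		set_oplock(&c, LVL_BATCH);
		printf("all=%d read=%d\n", c.all, c.read);	/* all=1 read=1 */
		return 0;
	}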
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 7c0e2143e775..c38350851b08 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -54,5 +54,7 @@
 #define SMB2_SIGNATURE_SIZE (16)
 #define SMB2_NTLMV2_SESSKEY_SIZE (16)
 #define SMB2_HMACSHA256_SIZE (32)
+#define SMB2_CMACAES_SIZE (16)
+#define SMB3_SIGNKEY_SIZE (16)

 #endif	/* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index fff6dfba6204..c6ec1633309a 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -41,21 +41,26 @@ static int
41smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, 41smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
42 struct cifs_sb_info *cifs_sb, const char *full_path, 42 struct cifs_sb_info *cifs_sb, const char *full_path,
43 __u32 desired_access, __u32 create_disposition, 43 __u32 desired_access, __u32 create_disposition,
44 __u32 file_attributes, __u32 create_options, 44 __u32 create_options, void *data, int command)
45 void *data, int command)
46{ 45{
47 int rc, tmprc = 0; 46 int rc, tmprc = 0;
48 u64 persistent_fid, volatile_fid;
49 __le16 *utf16_path; 47 __le16 *utf16_path;
50 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; 48 __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
49 struct cifs_open_parms oparms;
50 struct cifs_fid fid;
51 51
52 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); 52 utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
53 if (!utf16_path) 53 if (!utf16_path)
54 return -ENOMEM; 54 return -ENOMEM;
55 55
56 rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid, 56 oparms.tcon = tcon;
57 desired_access, create_disposition, file_attributes, 57 oparms.desired_access = desired_access;
58 create_options, &oplock, NULL); 58 oparms.disposition = create_disposition;
59 oparms.create_options = create_options;
60 oparms.fid = &fid;
61 oparms.reconnect = false;
62
63 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
59 if (rc) { 64 if (rc) {
60 kfree(utf16_path); 65 kfree(utf16_path);
61 return rc; 66 return rc;
@@ -65,8 +70,8 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
65 case SMB2_OP_DELETE: 70 case SMB2_OP_DELETE:
66 break; 71 break;
67 case SMB2_OP_QUERY_INFO: 72 case SMB2_OP_QUERY_INFO:
68 tmprc = SMB2_query_info(xid, tcon, persistent_fid, 73 tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid,
69 volatile_fid, 74 fid.volatile_fid,
70 (struct smb2_file_all_info *)data); 75 (struct smb2_file_all_info *)data);
71 break; 76 break;
72 case SMB2_OP_MKDIR: 77 case SMB2_OP_MKDIR:
@@ -76,19 +81,21 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
 		 */
 		break;
 	case SMB2_OP_RENAME:
-		tmprc = SMB2_rename(xid, tcon, persistent_fid, volatile_fid,
-				    (__le16 *)data);
+		tmprc = SMB2_rename(xid, tcon, fid.persistent_fid,
+				    fid.volatile_fid, (__le16 *)data);
 		break;
 	case SMB2_OP_HARDLINK:
-		tmprc = SMB2_set_hardlink(xid, tcon, persistent_fid,
-					  volatile_fid, (__le16 *)data);
+		tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid,
+					  fid.volatile_fid, (__le16 *)data);
 		break;
 	case SMB2_OP_SET_EOF:
-		tmprc = SMB2_set_eof(xid, tcon, persistent_fid, volatile_fid,
-				     current->tgid, (__le64 *)data);
+		tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
+				     fid.volatile_fid, current->tgid,
+				     (__le64 *)data);
 		break;
 	case SMB2_OP_SET_INFO:
-		tmprc = SMB2_set_info(xid, tcon, persistent_fid, volatile_fid,
+		tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
+				      fid.volatile_fid,
 				      (FILE_BASIC_INFO *)data);
 		break;
 	default:
@@ -96,7 +103,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
 		break;
 	}
 
-	rc = SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+	rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
 	if (tmprc)
 		rc = tmprc;
 	kfree(utf16_path);
@@ -129,8 +136,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
 		return -ENOMEM;
 
 	rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
-				FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0,
-				smb2_data, SMB2_OP_QUERY_INFO);
+				FILE_READ_ATTRIBUTES, FILE_OPEN, 0, smb2_data,
+				SMB2_OP_QUERY_INFO);
 	if (rc)
 		goto out;
 
@@ -145,7 +152,7 @@ smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	   struct cifs_sb_info *cifs_sb)
 {
 	return smb2_open_op_close(xid, tcon, cifs_sb, name,
-				  FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0,
+				  FILE_WRITE_ATTRIBUTES, FILE_CREATE,
 				  CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR);
 }
 
@@ -164,7 +171,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
 	dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
 	data.Attributes = cpu_to_le32(dosattrs);
 	tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name,
-				   FILE_WRITE_ATTRIBUTES, FILE_CREATE, 0,
+				   FILE_WRITE_ATTRIBUTES, FILE_CREATE,
 				   CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO);
 	if (tmprc == 0)
 		cifs_i->cifsAttrs = dosattrs;
@@ -175,7 +182,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	  struct cifs_sb_info *cifs_sb)
 {
 	return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
-				  0, CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
+				  CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE,
 				  NULL, SMB2_OP_DELETE);
 }
 
@@ -184,7 +191,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	    struct cifs_sb_info *cifs_sb)
 {
 	return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
-				  0, CREATE_DELETE_ON_CLOSE, NULL,
+				  CREATE_DELETE_ON_CLOSE, NULL,
 				  SMB2_OP_DELETE);
 }
 
@@ -203,7 +210,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
 	}
 
 	rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access,
-				FILE_OPEN, 0, 0, smb2_to_name, command);
+				FILE_OPEN, 0, smb2_to_name, command);
 smb2_rename_path:
 	kfree(smb2_to_name);
 	return rc;
@@ -234,7 +241,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
 {
 	__le64 eof = cpu_to_le64(size);
 	return smb2_open_op_close(xid, tcon, cifs_sb, full_path,
-				  FILE_WRITE_DATA, FILE_OPEN, 0, 0, &eof,
+				  FILE_WRITE_DATA, FILE_OPEN, 0, &eof,
 				  SMB2_OP_SET_EOF);
 }
 
@@ -250,7 +257,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
 	if (IS_ERR(tlink))
 		return PTR_ERR(tlink);
 	rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path,
-				FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, 0, buf,
+				FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf,
 				SMB2_OP_SET_INFO);
 	cifs_put_tlink(tlink);
 	return rc;
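
The smb2inode.c changes above are mechanical fallout from the new SMB2_open() calling convention: per-open state now travels in struct cifs_open_parms instead of a long argument list, and the file ids come back through oparms.fid. A minimal sketch of the pattern every caller now follows (condensed from smb2_open_op_close() above; error handling trimmed):

	struct cifs_open_parms oparms;
	struct cifs_fid fid;
	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;

	oparms.tcon = tcon;			/* tree connection for the request */
	oparms.desired_access = FILE_READ_ATTRIBUTES;
	oparms.disposition = FILE_OPEN;		/* open existing, do not create */
	oparms.create_options = 0;
	oparms.fid = &fid;			/* receives persistent/volatile ids */
	oparms.reconnect = false;		/* true only when reclaiming a durable handle */

	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
	if (rc == 0)
		SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
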
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 10383d8c015b..b0c43345cd98 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -266,6 +266,10 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
 		    ((struct smb2_query_directory_rsp *)hdr)->OutputBufferLength);
 		break;
 	case SMB2_IOCTL:
+		*off = le32_to_cpu(
+		    ((struct smb2_ioctl_rsp *)hdr)->OutputOffset);
+		*len = le32_to_cpu(((struct smb2_ioctl_rsp *)hdr)->OutputCount);
+		break;
 	case SMB2_CHANGE_NOTIFY:
 	default:
 		/* BB FIXME for unimplemented cases above */
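
With this hunk, smb2_get_data_area_len() learns where the variable-length payload of an SMB2_IOCTL response lives: OutputOffset is relative to the start of the SMB2 header and OutputCount is the payload length, matching the memcpy in the new SMB2_ioctl() worker further down. Roughly:

	/* sketch: locating the ioctl output buffer in a response */
	char *payload = rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset);
	u32 payload_len = le32_to_cpu(rsp->OutputCount);
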
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f2e76f3b0c61..f259e6cc8357 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -213,22 +213,29 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
 			struct cifs_sb_info *cifs_sb, const char *full_path)
 {
 	int rc;
-	__u64 persistent_fid, volatile_fid;
 	__le16 *utf16_path;
 	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
 
 	utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
 	if (!utf16_path)
 		return -ENOMEM;
 
-	rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
-		       FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL);
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
 	if (rc) {
 		kfree(utf16_path);
 		return rc;
 	}
 
-	rc = SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+	rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
 	kfree(utf16_path);
 	return rc;
 }
@@ -281,6 +288,25 @@ smb2_clear_stats(struct cifs_tcon *tcon)
 }
 
 static void
+smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
+{
+	seq_puts(m, "\n\tShare Capabilities:");
+	if (tcon->capabilities & SMB2_SHARE_CAP_DFS)
+		seq_puts(m, " DFS,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
+		seq_puts(m, " CONTINUOUS AVAILABILITY,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT)
+		seq_puts(m, " SCALEOUT,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER)
+		seq_puts(m, " CLUSTER,");
+	if (tcon->capabilities & SMB2_SHARE_CAP_ASYMMETRIC)
+		seq_puts(m, " ASYMMETRIC,");
+	if (tcon->capabilities == 0)
+		seq_puts(m, " None");
+	seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+}
+
+static void
 smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
 {
 #ifdef CONFIG_CIFS_STATS
@@ -292,7 +318,6 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
 	seq_printf(m, "\nSessionSetups: %d sent %d failed",
 		   atomic_read(&sent[SMB2_SESSION_SETUP_HE]),
 		   atomic_read(&failed[SMB2_SESSION_SETUP_HE]));
-#define SMB2LOGOFF		0x0002 /* trivial request/resp */
 	seq_printf(m, "\nLogoffs: %d sent %d failed",
 		   atomic_read(&sent[SMB2_LOGOFF_HE]),
 		   atomic_read(&failed[SMB2_LOGOFF_HE]));
@@ -425,15 +450,20 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 	__le16 *utf16_path;
 	int rc;
 	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
-	__u64 persistent_fid, volatile_fid;
+	struct cifs_open_parms oparms;
 
 	utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
 	if (!utf16_path)
 		return -ENOMEM;
 
-	rc = SMB2_open(xid, tcon, utf16_path, &persistent_fid, &volatile_fid,
-		       FILE_READ_ATTRIBUTES | FILE_READ_DATA, FILE_OPEN, 0, 0,
-		       &oplock, NULL);
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
 	kfree(utf16_path);
 	if (rc) {
 		cifs_dbg(VFS, "open dir failed\n");
@@ -442,14 +472,12 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 
 	srch_inf->entries_in_buffer = 0;
 	srch_inf->index_of_last_entry = 0;
-	fid->persistent_fid = persistent_fid;
-	fid->volatile_fid = volatile_fid;
 
-	rc = SMB2_query_directory(xid, tcon, persistent_fid, volatile_fid, 0,
-				  srch_inf);
+	rc = SMB2_query_directory(xid, tcon, fid->persistent_fid,
+				  fid->volatile_fid, 0, srch_inf);
 	if (rc) {
 		cifs_dbg(VFS, "query directory failed\n");
-		SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+		SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
 	}
 	return rc;
 }
@@ -510,17 +538,25 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
 	      struct kstatfs *buf)
 {
 	int rc;
-	u64 persistent_fid, volatile_fid;
 	__le16 srch_path = 0; /* Null - open root of share */
 	u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
 
-	rc = SMB2_open(xid, tcon, &srch_path, &persistent_fid, &volatile_fid,
-		       FILE_READ_ATTRIBUTES, FILE_OPEN, 0, 0, &oplock, NULL);
+	rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL);
 	if (rc)
 		return rc;
 	buf->f_type = SMB2_MAGIC_NUMBER;
-	rc = SMB2_QFS_info(xid, tcon, persistent_fid, volatile_fid, buf);
-	SMB2_close(xid, tcon, persistent_fid, volatile_fid);
+	rc = SMB2_QFS_info(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+			   buf);
+	SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
 	return rc;
 }
 
@@ -645,6 +681,7 @@ struct smb_version_operations smb30_operations = {
 	.dump_detail = smb2_dump_detail,
 	.clear_stats = smb2_clear_stats,
 	.print_stats = smb2_print_stats,
+	.dump_share_caps = smb2_dump_share_caps,
 	.is_oplock_break = smb2_is_valid_oplock_break,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
@@ -690,6 +727,7 @@ struct smb_version_operations smb30_operations = {
 	.get_lease_key = smb2_get_lease_key,
 	.set_lease_key = smb2_set_lease_key,
 	.new_lease_key = smb2_new_lease_key,
+	.generate_signingkey = generate_smb3signingkey,
 	.calc_signature = smb3_calc_signature,
 };
 
@@ -709,6 +747,8 @@ struct smb_version_values smb20_values = {
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
 	.oplock_read = SMB2_OPLOCK_LEVEL_II,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
 };
 
 struct smb_version_values smb21_values = {
@@ -727,6 +767,8 @@ struct smb_version_values smb21_values = {
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
 	.oplock_read = SMB2_OPLOCK_LEVEL_II,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
 };
 
 struct smb_version_values smb30_values = {
@@ -745,4 +787,26 @@ struct smb_version_values smb30_values = {
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
 	.oplock_read = SMB2_OPLOCK_LEVEL_II,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+};
+
+struct smb_version_values smb302_values = {
+	.version_string = SMB302_VERSION_STRING,
+	.protocol_id = SMB302_PROT_ID,
+	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+	.large_lock_type = 0,
+	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+	.unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+	.header_size = sizeof(struct smb2_hdr),
+	.max_header_size = MAX_SMB2_HDR_SIZE,
+	.read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+	.lock_cmd = SMB2_LOCK,
+	.cap_unix = 0,
+	.cap_nt_find = SMB2_NT_FIND,
+	.cap_large_files = SMB2_LARGE_FILES,
+	.oplock_read = SMB2_OPLOCK_LEVEL_II,
+	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
 };
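
The signing_enabled/signing_required masks added to each smb_version_values table describe, per dialect, which SecurityMode bits mean "signing enabled" and "signing required" on the wire, so the negotiate path can compare the server's response against dialect-appropriate flags (routed through cifs_enable_signing() in the smb2pdu.c changes below) instead of open-coding them. An illustrative, hypothetical helper showing the intended use of the new fields (the real logic lives in cifs_enable_signing()):

	static bool server_needs_signing(struct TCP_Server_Info *server)
	{
		/* sec_mode holds the server's negotiated SecurityMode bits */
		return server->sec_mode & server->vals->signing_required;
	}
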
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 2b95ce2b54e8..abc9c2809b51 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/smb2pdu.c
  *
- *   Copyright (C) International Business Machines  Corp., 2009, 2012
+ *   Copyright (C) International Business Machines  Corp., 2009, 2013
  *                 Etersoft, 2012
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *              Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -108,19 +108,33 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
 	if (!tcon)
 		goto out;
 
+	/* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */
+	/* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */
+	/* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
+	if ((tcon->ses) &&
+	    (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
+		hdr->CreditCharge = cpu_to_le16(1);
+	/* else CreditCharge MBZ */
+
 	hdr->TreeId = tcon->tid;
 	/* Uid is not converted */
 	if (tcon->ses)
 		hdr->SessionId = tcon->ses->Suid;
-	/* BB check following DFS flags BB */
-	/* BB do we have to add check for SHI1005_FLAGS_DFS_ROOT too? */
-	if (tcon->share_flags & SHI1005_FLAGS_DFS)
-		hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS;
-	/* BB how does SMB2 do case sensitive? */
-	/* if (tcon->nocase)
-		hdr->Flags |= SMBFLG_CASELESS; */
-	if (tcon->ses && tcon->ses->server &&
-	    (tcon->ses->server->sec_mode & SECMODE_SIGN_REQUIRED))
+
+	/*
+	 * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have
+	 * to pass the path on the Open SMB prefixed by \\server\share.
+	 * Not sure when we would need to do the augmented path (if ever) and
+	 * setting this flag breaks the SMB2 open operation since it is
+	 * illegal to send an empty path name (without \\server\share prefix)
+	 * when the DFS flag is set in the SMB open header. We could
+	 * consider setting the flag on all operations other than open
+	 * but it is safer to not set it for now.
+	 */
+/*	if (tcon->share_flags & SHI1005_FLAGS_DFS)
+		hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
+
+	if (tcon->ses && tcon->ses->server && tcon->ses->server->sign)
 		hdr->Flags |= SMB2_FLAGS_SIGNED;
 out:
 	pdu->StructureSize2 = cpu_to_le16(parmsize);
@@ -328,34 +342,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	struct kvec iov[1];
 	int rc = 0;
 	int resp_buftype;
-	struct TCP_Server_Info *server;
-	unsigned int sec_flags;
-	u16 temp = 0;
+	struct TCP_Server_Info *server = ses->server;
 	int blob_offset, blob_length;
 	char *security_blob;
 	int flags = CIFS_NEG_OP;
 
 	cifs_dbg(FYI, "Negotiate protocol\n");
 
-	if (ses->server)
-		server = ses->server;
-	else {
-		rc = -EIO;
-		return rc;
+	if (!server) {
+		WARN(1, "%s: server is NULL!\n", __func__);
+		return -EIO;
 	}
 
 	rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req);
 	if (rc)
 		return rc;
 
-	/* if any of auth flags (ie not sign or seal) are overriden use them */
-	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
-		sec_flags = ses->overrideSecFlg;  /* BB FIXME fix sign flags?*/
-	else /* if override flags set only sign/seal OR them with global auth */
-		sec_flags = global_secflags | ses->overrideSecFlg;
-
-	cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags);
-
 	req->hdr.SessionId = 0;
 
 	req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
@@ -364,12 +366,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	inc_rfc1001_len(req, 2);
 
 	/* only one of SMB2 signing flags may be set in SMB2 request */
-	if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN)
-		temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
-	else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */
-		temp = SMB2_NEGOTIATE_SIGNING_ENABLED;
-
-	req->SecurityMode = cpu_to_le16(temp);
+	if (ses->sign)
+		req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
+	else if (global_secflags & CIFSSEC_MAY_SIGN)
+		req->SecurityMode = cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
+	else
+		req->SecurityMode = 0;
 
 	req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities);
 
@@ -399,6 +401,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 		cifs_dbg(FYI, "negotiated smb2.1 dialect\n");
 	else if (rsp->DialectRevision == cpu_to_le16(SMB30_PROT_ID))
 		cifs_dbg(FYI, "negotiated smb3.0 dialect\n");
+	else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID))
+		cifs_dbg(FYI, "negotiated smb3.02 dialect\n");
 	else {
 		cifs_dbg(VFS, "Illegal dialect returned by server %d\n",
 			 le16_to_cpu(rsp->DialectRevision));
@@ -407,6 +411,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	}
 	server->dialect = le16_to_cpu(rsp->DialectRevision);
 
+	/* SMB2 only has an extended negflavor */
+	server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
 	server->maxBuf = le32_to_cpu(rsp->MaxTransactSize);
 	server->max_read = le32_to_cpu(rsp->MaxReadSize);
 	server->max_write = le32_to_cpu(rsp->MaxWriteSize);
@@ -418,44 +424,22 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 
 	security_blob = smb2_get_data_area_len(&blob_offset, &blob_length,
 					       &rsp->hdr);
-	if (blob_length == 0) {
-		cifs_dbg(VFS, "missing security blob on negprot\n");
-		rc = -EIO;
-		goto neg_exit;
-	}
-
-	cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags);
-	if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
-		cifs_dbg(FYI, "Signing required\n");
-		if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED |
-		      SMB2_NEGOTIATE_SIGNING_ENABLED))) {
-			cifs_dbg(VFS, "signing required but server lacks support\n");
-			rc = -EOPNOTSUPP;
-			goto neg_exit;
-		}
-		server->sec_mode |= SECMODE_SIGN_REQUIRED;
-	} else if (sec_flags & CIFSSEC_MAY_SIGN) {
-		cifs_dbg(FYI, "Signing optional\n");
-		if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
-			cifs_dbg(FYI, "Server requires signing\n");
-			server->sec_mode |= SECMODE_SIGN_REQUIRED;
-		} else {
-			server->sec_mode &=
-				~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
-		}
-	} else {
-		cifs_dbg(FYI, "Signing disabled\n");
-		if (server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED) {
-			cifs_dbg(VFS, "Server requires packet signing to be enabled in /proc/fs/cifs/SecurityFlags\n");
-			rc = -EOPNOTSUPP;
-			goto neg_exit;
-		}
-		server->sec_mode &=
-			~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
-	}
+	/*
+	 * See MS-SMB2 section 2.2.4: if no blob, client picks default which
+	 * for us will be
+	 *	ses->sectype = RawNTLMSSP;
+	 * but for time being this is our only auth choice so doesn't matter.
+	 * We just found a server which sets blob length to zero expecting raw.
+	 */
+	if (blob_length == 0)
+		cifs_dbg(FYI, "missing security blob on negprot\n");
 
+	rc = cifs_enable_signing(server, ses->sign);
 #ifdef CONFIG_SMB2_ASN1  /* BB REMOVEME when updated asn1.c ready */
-	rc = decode_neg_token_init(security_blob, blob_length,
+	if (rc)
+		goto neg_exit;
+	if (blob_length)
+		rc = decode_neg_token_init(security_blob, blob_length,
 				   &server->sec_type);
 	if (rc == 1)
 		rc = 0;
@@ -480,9 +464,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
 	int rc = 0;
 	int resp_buftype;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
-	struct TCP_Server_Info *server;
-	unsigned int sec_flags;
-	u8 temp = 0;
+	struct TCP_Server_Info *server = ses->server;
 	u16 blob_length = 0;
 	char *security_blob;
 	char *ntlmssp_blob = NULL;
@@ -490,11 +472,9 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
 
 	cifs_dbg(FYI, "Session Setup\n");
 
-	if (ses->server)
-		server = ses->server;
-	else {
-		rc = -EIO;
-		return rc;
+	if (!server) {
+		WARN(1, "%s: server is NULL!\n", __func__);
+		return -EIO;
 	}
 
 	/*
@@ -505,7 +485,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
 	if (!ses->ntlmssp)
 		return -ENOMEM;
 
-	ses->server->secType = RawNTLMSSP;
+	/* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
+	ses->sectype = RawNTLMSSP;
 
 ssetup_ntlmssp_authenticate:
 	if (phase == NtLmChallenge)
@@ -515,28 +496,19 @@ ssetup_ntlmssp_authenticate:
 	if (rc)
 		return rc;
 
-	/* if any of auth flags (ie not sign or seal) are overriden use them */
-	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
-		sec_flags = ses->overrideSecFlg;  /* BB FIXME fix sign flags?*/
-	else /* if override flags set only sign/seal OR them with global auth */
-		sec_flags = global_secflags | ses->overrideSecFlg;
-
-	cifs_dbg(FYI, "sec_flags 0x%x\n", sec_flags);
-
 	req->hdr.SessionId = 0; /* First session, not a reauthenticate */
 	req->VcNumber = 0; /* MBZ */
 	/* to enable echos and oplocks */
 	req->hdr.CreditRequest = cpu_to_le16(3);
 
 	/* only one of SMB2 signing flags may be set in SMB2 request */
-	if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN)
-		temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
-	else if (ses->server->sec_mode & SMB2_NEGOTIATE_SIGNING_REQUIRED)
-		temp = SMB2_NEGOTIATE_SIGNING_REQUIRED;
-	else if (sec_flags & CIFSSEC_MAY_SIGN) /* MAY_SIGN is a single flag */
-		temp = SMB2_NEGOTIATE_SIGNING_ENABLED;
+	if (server->sign)
+		req->SecurityMode = SMB2_NEGOTIATE_SIGNING_REQUIRED;
+	else if (global_secflags & CIFSSEC_MAY_SIGN) /* one flag unlike MUST_ */
+		req->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED;
+	else
+		req->SecurityMode = 0;
 
-	req->SecurityMode = temp;
 	req->Capabilities = 0;
 	req->Channel = 0; /* MBZ */
 
@@ -679,7 +651,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
 
 	/* since no tcon, smb2_init can not do this, so do here */
 	req->hdr.SessionId = ses->Suid;
-	if (server->sec_mode & SECMODE_SIGN_REQUIRED)
+	if (server->sign)
 		req->hdr.Flags |= SMB2_FLAGS_SIGNED;
 
 	rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0);
@@ -788,11 +760,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	}
 
 	tcon->share_flags = le32_to_cpu(rsp->ShareFlags);
+	tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */
 	tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
 	tcon->tidStatus = CifsGood;
 	tcon->need_reconnect = false;
 	tcon->tid = rsp->hdr.TreeId;
-	strncpy(tcon->treeName, tree, MAX_TREE_SIZE);
+	strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
 
 	if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
 	    ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
@@ -874,29 +847,76 @@ create_lease_buf(u8 *lease_key, u8 oplock)
 	return buf;
 }
 
+static struct create_durable *
+create_durable_buf(void)
+{
+	struct create_durable *buf;
+
+	buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->ccontext.DataOffset = cpu_to_le16(offsetof
+					(struct create_durable, Data));
+	buf->ccontext.DataLength = cpu_to_le32(16);
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof
+				(struct create_durable, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+	buf->Name[0] = 'D';
+	buf->Name[1] = 'H';
+	buf->Name[2] = 'n';
+	buf->Name[3] = 'Q';
+	return buf;
+}
+
+static struct create_durable *
+create_reconnect_durable_buf(struct cifs_fid *fid)
+{
+	struct create_durable *buf;
+
+	buf = kzalloc(sizeof(struct create_durable), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->ccontext.DataOffset = cpu_to_le16(offsetof
+					(struct create_durable, Data));
+	buf->ccontext.DataLength = cpu_to_le32(16);
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof
+				(struct create_durable, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+	buf->Data.Fid.PersistentFileId = fid->persistent_fid;
+	buf->Data.Fid.VolatileFileId = fid->volatile_fid;
+	buf->Name[0] = 'D';
+	buf->Name[1] = 'H';
+	buf->Name[2] = 'n';
+	buf->Name[3] = 'C';
+	return buf;
+}
+
 static __u8
 parse_lease_state(struct smb2_create_rsp *rsp)
 {
 	char *data_offset;
 	struct create_lease *lc;
 	bool found = false;
+	unsigned int next = 0;
+	char *name;
 
-	data_offset = (char *)rsp;
-	data_offset += 4 + le32_to_cpu(rsp->CreateContextsOffset);
+	data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
 	lc = (struct create_lease *)data_offset;
 	do {
-		char *name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc;
+		lc = (struct create_lease *)((char *)lc + next);
+		name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc;
 		if (le16_to_cpu(lc->ccontext.NameLength) != 4 ||
 		    strncmp(name, "RqLs", 4)) {
-			lc = (struct create_lease *)((char *)lc
-					+ le32_to_cpu(lc->ccontext.Next));
+			next = le32_to_cpu(lc->ccontext.Next);
 			continue;
 		}
 		if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
 			return SMB2_OPLOCK_LEVEL_NOCHANGE;
 		found = true;
 		break;
-	} while (le32_to_cpu(lc->ccontext.Next) != 0);
+	} while (next != 0);
 
 	if (!found)
 		return 0;
@@ -904,23 +924,74 @@ parse_lease_state(struct smb2_create_rsp *rsp)
 	return smb2_map_lease_to_oplock(lc->lcontext.LeaseState);
 }
 
+static int
+add_lease_context(struct kvec *iov, unsigned int *num_iovec, __u8 *oplock)
+{
+	struct smb2_create_req *req = iov[0].iov_base;
+	unsigned int num = *num_iovec;
+
+	iov[num].iov_base = create_lease_buf(oplock+1, *oplock);
+	if (iov[num].iov_base == NULL)
+		return -ENOMEM;
+	iov[num].iov_len = sizeof(struct create_lease);
+	req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
+	if (!req->CreateContextsOffset)
+		req->CreateContextsOffset = cpu_to_le32(
+				sizeof(struct smb2_create_req) - 4 +
+				iov[num - 1].iov_len);
+	req->CreateContextsLength = cpu_to_le32(
+				le32_to_cpu(req->CreateContextsLength) +
+				sizeof(struct create_lease));
+	inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
+	*num_iovec = num + 1;
+	return 0;
+}
+
+static int
+add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+		    struct cifs_open_parms *oparms)
+{
+	struct smb2_create_req *req = iov[0].iov_base;
+	unsigned int num = *num_iovec;
+
+	if (oparms->reconnect) {
+		iov[num].iov_base = create_reconnect_durable_buf(oparms->fid);
+		/* indicate that we don't need to relock the file */
+		oparms->reconnect = false;
+	} else
+		iov[num].iov_base = create_durable_buf();
+	if (iov[num].iov_base == NULL)
+		return -ENOMEM;
+	iov[num].iov_len = sizeof(struct create_durable);
+	if (!req->CreateContextsOffset)
+		req->CreateContextsOffset =
+			cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+								iov[1].iov_len);
+	req->CreateContextsLength =
+			cpu_to_le32(le32_to_cpu(req->CreateContextsLength) +
+						sizeof(struct create_durable));
+	inc_rfc1001_len(&req->hdr, sizeof(struct create_durable));
+	*num_iovec = num + 1;
+	return 0;
+}
+
 int
-SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
-	  u64 *persistent_fid, u64 *volatile_fid, __u32 desired_access,
-	  __u32 create_disposition, __u32 file_attributes, __u32 create_options,
+SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	  __u8 *oplock, struct smb2_file_all_info *buf)
 {
 	struct smb2_create_req *req;
 	struct smb2_create_rsp *rsp;
 	struct TCP_Server_Info *server;
+	struct cifs_tcon *tcon = oparms->tcon;
 	struct cifs_ses *ses = tcon->ses;
-	struct kvec iov[3];
+	struct kvec iov[4];
 	int resp_buftype;
 	int uni_path_len;
 	__le16 *copy_path = NULL;
 	int copy_size;
 	int rc = 0;
-	int num_iovecs = 2;
+	unsigned int num_iovecs = 2;
+	__u32 file_attributes = 0;
 
 	cifs_dbg(FYI, "create/open\n");
 
@@ -933,55 +1004,47 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
 	if (rc)
 		return rc;
 
+	if (oparms->create_options & CREATE_OPTION_READONLY)
+		file_attributes |= ATTR_READONLY;
+
 	req->ImpersonationLevel = IL_IMPERSONATION;
-	req->DesiredAccess = cpu_to_le32(desired_access);
+	req->DesiredAccess = cpu_to_le32(oparms->desired_access);
 	/* File attributes ignored on open (used in create though) */
 	req->FileAttributes = cpu_to_le32(file_attributes);
 	req->ShareAccess = FILE_SHARE_ALL_LE;
-	req->CreateDisposition = cpu_to_le32(create_disposition);
-	req->CreateOptions = cpu_to_le32(create_options);
+	req->CreateDisposition = cpu_to_le32(oparms->disposition);
+	req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK);
 	uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
-	req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)
-			- 8 /* pad */ - 4 /* do not count rfc1001 len field */);
+	/* do not count rfc1001 len field */
+	req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4);
 
 	iov[0].iov_base = (char *)req;
 	/* 4 for rfc1002 length field */
 	iov[0].iov_len = get_rfc1002_length(req) + 4;
 
 	/* MUST set path len (NameLength) to 0 opening root of share */
-	if (uni_path_len >= 4) {
-		req->NameLength = cpu_to_le16(uni_path_len - 2);
-		/* -1 since last byte is buf[0] which is sent below (path) */
-		iov[0].iov_len--;
-		if (uni_path_len % 8 != 0) {
-			copy_size = uni_path_len / 8 * 8;
-			if (copy_size < uni_path_len)
-				copy_size += 8;
-
-			copy_path = kzalloc(copy_size, GFP_KERNEL);
-			if (!copy_path)
-				return -ENOMEM;
-			memcpy((char *)copy_path, (const char *)path,
-			       uni_path_len);
-			uni_path_len = copy_size;
-			path = copy_path;
-		}
-
-		iov[1].iov_len = uni_path_len;
-		iov[1].iov_base = path;
-		/*
-		 * -1 since last byte is buf[0] which was counted in
-		 * smb2_buf_len.
-		 */
-		inc_rfc1001_len(req, uni_path_len - 1);
-	} else {
-		iov[0].iov_len += 7;
-		req->hdr.smb2_buf_length = cpu_to_be32(be32_to_cpu(
-				req->hdr.smb2_buf_length) + 8 - 1);
-		num_iovecs = 1;
-		req->NameLength = 0;
+	req->NameLength = cpu_to_le16(uni_path_len - 2);
+	/* -1 since last byte is buf[0] which is sent below (path) */
+	iov[0].iov_len--;
+	if (uni_path_len % 8 != 0) {
+		copy_size = uni_path_len / 8 * 8;
+		if (copy_size < uni_path_len)
+			copy_size += 8;
+
+		copy_path = kzalloc(copy_size, GFP_KERNEL);
+		if (!copy_path)
+			return -ENOMEM;
+		memcpy((char *)copy_path, (const char *)path,
+		       uni_path_len);
+		uni_path_len = copy_size;
+		path = copy_path;
 	}
 
+	iov[1].iov_len = uni_path_len;
+	iov[1].iov_base = path;
+	/* -1 since last byte is buf[0] which was counted in smb2_buf_len */
+	inc_rfc1001_len(req, uni_path_len - 1);
+
 	if (!server->oplocks)
 		*oplock = SMB2_OPLOCK_LEVEL_NONE;
 
@@ -989,21 +1052,29 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
 	    *oplock == SMB2_OPLOCK_LEVEL_NONE)
 		req->RequestedOplockLevel = *oplock;
 	else {
-		iov[num_iovecs].iov_base = create_lease_buf(oplock+1, *oplock);
-		if (iov[num_iovecs].iov_base == NULL) {
+		rc = add_lease_context(iov, &num_iovecs, oplock);
+		if (rc) {
 			cifs_small_buf_release(req);
 			kfree(copy_path);
-			return -ENOMEM;
+			return rc;
+		}
+	}
+
+	if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
+		/* need to set Next field of lease context if we request it */
+		if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
+			struct create_context *ccontext =
+			    (struct create_context *)iov[num_iovecs-1].iov_base;
+			ccontext->Next =
+				cpu_to_le32(sizeof(struct create_lease));
+		}
+		rc = add_durable_context(iov, &num_iovecs, oparms);
+		if (rc) {
+			cifs_small_buf_release(req);
+			kfree(copy_path);
+			kfree(iov[num_iovecs-1].iov_base);
+			return rc;
 		}
-		iov[num_iovecs].iov_len = sizeof(struct create_lease);
-		req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
-		req->CreateContextsOffset = cpu_to_le32(
-				sizeof(struct smb2_create_req) - 4 - 8 +
-				iov[num_iovecs-1].iov_len);
-		req->CreateContextsLength = cpu_to_le32(
-				sizeof(struct create_lease));
-		inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
-		num_iovecs++;
 	}
 
 	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
@@ -1014,8 +1085,8 @@ SMB2_open(const unsigned int xid, struct cifs_tcon *tcon, __le16 *path,
 		goto creat_exit;
 	}
 
-	*persistent_fid = rsp->PersistentFileId;
-	*volatile_fid = rsp->VolatileFileId;
+	oparms->fid->persistent_fid = rsp->PersistentFileId;
+	oparms->fid->volatile_fid = rsp->VolatileFileId;
 
 	if (buf) {
 		memcpy(buf, &rsp->CreationTime, 32);
@@ -1036,6 +1107,122 @@ creat_exit:
 	return rc;
 }
 
+/*
+ *	SMB2 IOCTL is used for both IOCTLs and FSCTLs
+ */
+int
+SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
+	   u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data,
+	   u32 indatalen, char **out_data, u32 *plen /* returned data len */)
+{
+	struct smb2_ioctl_req *req;
+	struct smb2_ioctl_rsp *rsp;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses = tcon->ses;
+	struct kvec iov[2];
+	int resp_buftype;
+	int num_iovecs;
+	int rc = 0;
+
+	cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+	/* zero out returned data len, in case of error */
+	if (plen)
+		*plen = 0;
+
+	if (ses && (ses->server))
+		server = ses->server;
+	else
+		return -EIO;
+
+	rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req);
+	if (rc)
+		return rc;
+
+	req->CtlCode = cpu_to_le32(opcode);
+	req->PersistentFileId = persistent_fid;
+	req->VolatileFileId = volatile_fid;
+
+	if (indatalen) {
+		req->InputCount = cpu_to_le32(indatalen);
+		/* do not set InputOffset if no input data */
+		req->InputOffset =
+		       cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4);
+		iov[1].iov_base = in_data;
+		iov[1].iov_len = indatalen;
+		num_iovecs = 2;
+	} else
+		num_iovecs = 1;
+
+	req->OutputOffset = 0;
+	req->OutputCount = 0; /* MBZ */
+
+	/*
+	 * Could increase MaxOutputResponse, but that would require more
+	 * than one credit. Windows typically sets this smaller, but for some
+	 * ioctls it may be useful to allow server to send more. No point
+	 * limiting what the server can send as long as fits in one credit
+	 */
+	req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */
+
+	if (is_fsctl)
+		req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
+	else
+		req->Flags = 0;
+
+	iov[0].iov_base = (char *)req;
+	/* 4 for rfc1002 length field */
+	iov[0].iov_len = get_rfc1002_length(req) + 4;
+
+	if (indatalen)
+		inc_rfc1001_len(req, indatalen);
+
+	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
+	rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
+
+	if (rc != 0) {
+		if (tcon)
+			cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+		goto ioctl_exit;
+	}
+
+	/* check if caller wants to look at return data or just return rc */
+	if ((plen == NULL) || (out_data == NULL))
+		goto ioctl_exit;
+
+	*plen = le32_to_cpu(rsp->OutputCount);
+
+	/* We check for obvious errors in the output buffer length and offset */
+	if (*plen == 0)
+		goto ioctl_exit; /* server returned no data */
+	else if (*plen > 0xFF00) {
+		cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen);
+		*plen = 0;
+		rc = -EIO;
+		goto ioctl_exit;
+	}
+
+	if (get_rfc1002_length(rsp) < le32_to_cpu(rsp->OutputOffset) + *plen) {
+		cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
+			 le32_to_cpu(rsp->OutputOffset));
+		*plen = 0;
+		rc = -EIO;
+		goto ioctl_exit;
+	}
+
+	*out_data = kmalloc(*plen, GFP_KERNEL);
+	if (*out_data == NULL) {
+		rc = -ENOMEM;
+		goto ioctl_exit;
+	}
+
+	memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
+	       *plen);
+ioctl_exit:
+	free_rsp_buf(resp_buftype, rsp);
+	return rc;
+}
+
 int
 SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	   u64 persistent_fid, u64 volatile_fid)
@@ -1384,8 +1571,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
 	case MID_RESPONSE_RECEIVED:
 		credits_received = le16_to_cpu(buf->CreditRequest);
 		/* result already set, check signature */
-		if (server->sec_mode &
-		    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+		if (server->sign) {
 			int rc;
 
 			rc = smb2_verify_signature(&rqst, server);
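
The new SMB2_ioctl() worker above is deliberately generic: the caller supplies the FSCTL/IOCTL code and optional input buffer, and on success gets back a kmalloc'd copy of the output, capped at 0xFF00 bytes so the request stays within a single credit. A hedged sketch of a caller (FSCTL_QUERY_NETWORK_INTERFACE_INFO is assumed to be the code defined in smbfsctl.h; any pass-through fsctl against an open file follows the same shape):

	char *out = NULL;
	u32 out_len = 0;
	int rc;

	rc = SMB2_ioctl(xid, tcon, fid.persistent_fid, fid.volatile_fid,
			FSCTL_QUERY_NETWORK_INTERFACE_INFO,
			true /* is_fsctl */, NULL, 0, &out, &out_len);
	if (rc == 0) {
		/* out/out_len now describe the server's output buffer */
		kfree(out);
	}
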
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 4cb4ced258cb..36b0d37ea69b 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/smb2pdu.h
  *
- *   Copyright (c) International Business Machines  Corp., 2009, 2010
+ *   Copyright (c) International Business Machines  Corp., 2009, 2013
  *                 Etersoft, 2012
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *              Pavel Shilovsky (pshilovsky@samba.org) 2012
@@ -170,6 +170,7 @@ struct smb2_negotiate_req {
 #define SMB20_PROT_ID 0x0202
 #define SMB21_PROT_ID 0x0210
 #define SMB30_PROT_ID 0x0300
+#define SMB302_PROT_ID 0x0302
 #define BAD_PROT_ID 0xFFFF
 
 /* SecurityMode flags */
@@ -283,10 +284,17 @@ struct smb2_tree_connect_rsp {
 #define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING		0x00000400
 #define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM	0x00000800
 #define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK		0x00001000
-#define SHI1005_FLAGS_ENABLE_HASH			0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V1			0x00002000
+#define SHI1005_FLAGS_ENABLE_HASH_V2			0x00004000
+#define SHI1005_FLAGS_ENCRYPT_DATA			0x00008000
+#define SHI1005_FLAGS_ALL				0x0000FF33
 
 /* Possible share capabilities */
-#define SMB2_SHARE_CAP_DFS	cpu_to_le32(0x00000008)
+#define SMB2_SHARE_CAP_DFS	cpu_to_le32(0x00000008) /* all dialects */
+#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */
+#define SMB2_SHARE_CAP_SCALEOUT	cpu_to_le32(0x00000020) /* 3.0 */
+#define SMB2_SHARE_CAP_CLUSTER	cpu_to_le32(0x00000040) /* 3.0 */
+#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
 
 struct smb2_tree_disconnect_req {
 	struct smb2_hdr hdr;
@@ -420,7 +428,7 @@ struct smb2_create_req {
 	__le16 NameLength;
 	__le32 CreateContextsOffset;
 	__le32 CreateContextsLength;
-	__u8   Buffer[8];
+	__u8   Buffer[0];
 } __packed;
 
 struct smb2_create_rsp {
@@ -477,6 +485,87 @@ struct create_lease {
 	struct lease_context lcontext;
 } __packed;
 
+struct create_durable {
+	struct create_context ccontext;
+	__u8   Name[8];
+	union {
+		__u8  Reserved[16];
+		struct {
+			__u64 PersistentFileId;
+			__u64 VolatileFileId;
+		} Fid;
+	} Data;
+} __packed;
+
+/* this goes in the ioctl buffer when doing a copychunk request */
+struct copychunk_ioctl {
+	char SourceKey[24];
+	__le32 ChunkCount; /* we are only sending 1 */
+	__le32 Reserved;
+	/* array will only be one chunk long for us */
+	__le64 SourceOffset;
+	__le64 TargetOffset;
+	__le32 Length; /* how many bytes to copy */
+	__u32 Reserved2;
+} __packed;
+
+/* Response and Request are the same format */
+struct validate_negotiate_info {
+	__le32 Capabilities;
+	__u8   Guid[SMB2_CLIENT_GUID_SIZE];
+	__le16 SecurityMode;
+	__le16 DialectCount;
+	__le16 Dialect[1];
+} __packed;
+
+#define RSS_CAPABLE	0x00000001
+#define RDMA_CAPABLE	0x00000002
+
+struct network_interface_info_ioctl_rsp {
+	__le32 Next; /* next interface. zero if this is last one */
+	__le32 IfIndex;
+	__le32 Capability; /* RSS or RDMA Capable */
+	__le32 Reserved;
+	__le64 LinkSpeed;
+	char   SockAddr_Storage[128];
+} __packed;
+
+#define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */
+
+struct smb2_ioctl_req {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 57 */
+	__u16 Reserved;
+	__le32 CtlCode;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__le32 InputOffset;
+	__le32 InputCount;
+	__le32 MaxInputResponse;
+	__le32 OutputOffset;
+	__le32 OutputCount;
+	__le32 MaxOutputResponse;
+	__le32 Flags;
+	__u32  Reserved2;
+	char   Buffer[0];
+} __packed;
+
+struct smb2_ioctl_rsp {
+	struct smb2_hdr hdr;
+	__le16 StructureSize;	/* Must be 57 */
+	__u16 Reserved;
+	__le32 CtlCode;
+	__u64  PersistentFileId; /* opaque endianness */
+	__u64  VolatileFileId; /* opaque endianness */
+	__le32 InputOffset;
+	__le32 InputCount;
+	__le32 OutputOffset;
+	__le32 OutputCount;
+	__le32 Flags;
+	__u32  Reserved2;
+	/* char * buffer[] */
+} __packed;
+
 /* Currently defined values for close flags */
 #define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB	cpu_to_le16(0x0001)
 struct smb2_close_req {
@@ -517,17 +606,25 @@ struct smb2_flush_rsp {
 	__le16 Reserved;
 } __packed;
 
+/* For read request Flags field below, following flag is defined for SMB3.02 */
+#define SMB2_READFLAG_READ_UNBUFFERED	0x01
+
+/* Channel field for read and write: exactly one of following flags can be set*/
+#define SMB2_CHANNEL_NONE		0x00000000
+#define SMB2_CHANNEL_RDMA_V1		0x00000001 /* SMB3 or later */
+#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000001 /* SMB3.02 or later */
+
 struct smb2_read_req {
 	struct smb2_hdr hdr;
 	__le16 StructureSize; /* Must be 49 */
 	__u8   Padding; /* offset from start of SMB2 header to place read */
-	__u8   Reserved;
+	__u8   Flags; /* MBZ unless SMB3.02 or later */
 	__le32 Length;
 	__le64 Offset;
 	__u64  PersistentFileId; /* opaque endianness */
 	__u64  VolatileFileId; /* opaque endianness */
 	__le32 MinimumCount;
-	__le32 Channel; /* Reserved MBZ */
+	__le32 Channel; /* MBZ except for SMB3 or later */
 	__le32 RemainingBytes;
 	__le16 ReadChannelInfoOffset; /* Reserved MBZ */
 	__le16 ReadChannelInfoLength; /* Reserved MBZ */
@@ -545,8 +642,9 @@ struct smb2_read_rsp {
 	__u8   Buffer[1];
 } __packed;
 
-/* For write request Flags field below the following flag is defined: */
-#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001
+/* For write request Flags field below the following flags are defined: */
+#define SMB2_WRITEFLAG_WRITE_THROUGH	0x00000001	/* SMB2.1 or later */
+#define SMB2_WRITEFLAG_WRITE_UNBUFFERED	0x00000002	/* SMB3.02 or later */
 
 struct smb2_write_req {
 	struct smb2_hdr hdr;
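
On the wire, the create_durable context added above is a fixed 40-byte blob: the 16-byte create_context header, an 8-byte Name field of which only the first four bytes are used ("DHnQ" to request a durable handle, "DHnC" to reconnect one; see create_durable_buf()/create_reconnect_durable_buf() in smb2pdu.c above), then 16 bytes of Data (zeroed for a request; the persistent/volatile file id pair when reconnecting). A compile-time check of that size, assuming create_context is the usual 16-byte header already defined in this file:

	/* sketch: sizeof(create_context) + Name[8] + Data[16] == 40 */
	BUILD_BUG_ON(sizeof(struct create_durable) != 40);
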
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 2aa3535e38ce..1a5ecbed40ed 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -84,11 +84,9 @@ extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
 				       const char *from_name, const char *to_name,
 				       struct cifs_sb_info *cifs_sb);
 
-extern int smb2_open_file(const unsigned int xid, struct cifs_tcon *tcon,
-			  const char *full_path, int disposition,
-			  int desired_access, int create_options,
-			  struct cifs_fid *fid, __u32 *oplock,
-			  FILE_ALL_INFO *buf, struct cifs_sb_info *cifs_sb);
+extern int smb2_open_file(const unsigned int xid,
+			  struct cifs_open_parms *oparms,
+			  __u32 *oplock, FILE_ALL_INFO *buf);
 extern void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
 extern int smb2_unlock_range(struct cifsFileInfo *cfile,
 			     struct file_lock *flock, const unsigned int xid);
@@ -106,11 +104,13 @@ extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses,
 		     const char *tree, struct cifs_tcon *tcon,
 		     const struct nls_table *);
 extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
-extern int SMB2_open(const unsigned int xid, struct cifs_tcon *tcon,
-		     __le16 *path, u64 *persistent_fid, u64 *volatile_fid,
-		     __u32 desired_access, __u32 create_disposition,
-		     __u32 file_attributes, __u32 create_options,
-		     __u8 *oplock, struct smb2_file_all_info *buf);
+extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
+		     __le16 *path, __u8 *oplock,
+		     struct smb2_file_all_info *buf);
+extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
+		     u64 persistent_fid, u64 volatile_fid, u32 opcode,
+		     bool is_fsctl, char *in_data, u32 indatalen,
+		     char **out_data, u32 *plen /* returned data len */);
 extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 		      u64 persistent_file_id, u64 volatile_file_id);
 extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 01f0ac800780..4f2300d020c7 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -39,6 +39,82 @@
39#include "smb2status.h" 39#include "smb2status.h"
40#include "smb2glob.h" 40#include "smb2glob.h"
41 41
42static int
43smb2_crypto_shash_allocate(struct TCP_Server_Info *server)
44{
45 int rc;
46 unsigned int size;
47
48 if (server->secmech.sdeschmacsha256 != NULL)
49 return 0; /* already allocated */
50
51 server->secmech.hmacsha256 = crypto_alloc_shash("hmac(sha256)", 0, 0);
52 if (IS_ERR(server->secmech.hmacsha256)) {
53 cifs_dbg(VFS, "could not allocate crypto hmacsha256\n");
54 rc = PTR_ERR(server->secmech.hmacsha256);
55 server->secmech.hmacsha256 = NULL;
56 return rc;
57 }
58
59 size = sizeof(struct shash_desc) +
60 crypto_shash_descsize(server->secmech.hmacsha256);
61 server->secmech.sdeschmacsha256 = kmalloc(size, GFP_KERNEL);
62 if (!server->secmech.sdeschmacsha256) {
63 crypto_free_shash(server->secmech.hmacsha256);
64 server->secmech.hmacsha256 = NULL;
65 return -ENOMEM;
66 }
67 server->secmech.sdeschmacsha256->shash.tfm = server->secmech.hmacsha256;
68 server->secmech.sdeschmacsha256->shash.flags = 0x0;
69
70 return 0;
71}
72
73static int
74smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
75{
76 unsigned int size;
77 int rc;
78
79 if (server->secmech.sdesccmacaes != NULL)
80 return 0; /* already allocated */
81
82 rc = smb2_crypto_shash_allocate(server);
83 if (rc)
84 return rc;
85
86 server->secmech.cmacaes = crypto_alloc_shash("cmac(aes)", 0, 0);
87 if (IS_ERR(server->secmech.cmacaes)) {
88 cifs_dbg(VFS, "could not allocate crypto cmac-aes");
89 kfree(server->secmech.sdeschmacsha256);
90 server->secmech.sdeschmacsha256 = NULL;
91 crypto_free_shash(server->secmech.hmacsha256);
92 server->secmech.hmacsha256 = NULL;
93 rc = PTR_ERR(server->secmech.cmacaes);
94 server->secmech.cmacaes = NULL;
95 return rc;
96 }
97
98 size = sizeof(struct shash_desc) +
99 crypto_shash_descsize(server->secmech.cmacaes);
100 server->secmech.sdesccmacaes = kmalloc(size, GFP_KERNEL);
101 if (!server->secmech.sdesccmacaes) {
102 cifs_dbg(VFS, "%s: Can't alloc cmacaes\n", __func__);
103 kfree(server->secmech.sdeschmacsha256);
104 server->secmech.sdeschmacsha256 = NULL;
105 crypto_free_shash(server->secmech.hmacsha256);
106 crypto_free_shash(server->secmech.cmacaes);
107 server->secmech.hmacsha256 = NULL;
108 server->secmech.cmacaes = NULL;
109 return -ENOMEM;
110 }
111 server->secmech.sdesccmacaes->shash.tfm = server->secmech.cmacaes;
112 server->secmech.sdesccmacaes->shash.flags = 0x0;
113
114 return 0;
115}
116
117
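
[Note: the two allocators above follow the usual kernel shash pattern: allocate the
transform with crypto_alloc_shash(), then kmalloc a shash_desc sized by
crypto_shash_descsize(). A minimal sketch of driving the allocated descriptor for a
one-shot HMAC-SHA256 digest; the helper name is illustrative, not part of the patch.]

    static int hmac_sha256_once(struct TCP_Server_Info *server,
                                const u8 *data, unsigned int len, u8 *out)
    {
        int rc;

        rc = crypto_shash_setkey(server->secmech.hmacsha256,
                                 server->session_key.response,
                                 SMB2_NTLMV2_SESSKEY_SIZE);
        if (rc)
            return rc;
        rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
        if (rc)
            return rc;
        rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
                                 data, len);
        if (rc)
            return rc;
        return crypto_shash_final(&server->secmech.sdeschmacsha256->shash, out);
    }
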
42int 118int
43smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) 119smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
44{ 120{
@@ -52,6 +128,12 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
52 memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); 128 memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
53 memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); 129 memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
54 130
131 rc = smb2_crypto_shash_allocate(server);
132 if (rc) {
 133		cifs_dbg(VFS, "%s: sha256 alloc failed\n", __func__);
134 return rc;
135 }
136
55 rc = crypto_shash_setkey(server->secmech.hmacsha256, 137 rc = crypto_shash_setkey(server->secmech.hmacsha256,
56 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); 138 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
57 if (rc) { 139 if (rc) {
@@ -61,7 +143,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
61 143
62 rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); 144 rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
63 if (rc) { 145 if (rc) {
 64 		cifs_dbg(VFS, "%s: Could not init md5\n", __func__); 146		cifs_dbg(VFS, "%s: Could not init sha256\n", __func__);
65 return rc; 147 return rc;
66 } 148 }
67 149
@@ -116,11 +198,166 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
116 return rc; 198 return rc;
117} 199}
118 200
201void
202generate_smb3signingkey(struct TCP_Server_Info *server)
203{
204 unsigned char zero = 0x0;
205 __u8 i[4] = {0, 0, 0, 1};
206 __u8 L[4] = {0, 0, 0, 128};
207 int rc = 0;
208 unsigned char prfhash[SMB2_HMACSHA256_SIZE];
209 unsigned char *hashptr = prfhash;
210
211 memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
212 memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
213
214 rc = smb3_crypto_shash_allocate(server);
215 if (rc) {
216 cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
217 goto smb3signkey_ret;
218 }
219
220 rc = crypto_shash_setkey(server->secmech.hmacsha256,
221 server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
222 if (rc) {
223 cifs_dbg(VFS, "%s: Could not set with session key\n", __func__);
224 goto smb3signkey_ret;
225 }
226
227 rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
228 if (rc) {
229 cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
230 goto smb3signkey_ret;
231 }
232
233 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
234 i, 4);
235 if (rc) {
236 cifs_dbg(VFS, "%s: Could not update with n\n", __func__);
237 goto smb3signkey_ret;
238 }
239
240 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
241 "SMB2AESCMAC", 12);
242 if (rc) {
243 cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
244 goto smb3signkey_ret;
245 }
246
247 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
248 &zero, 1);
249 if (rc) {
250 cifs_dbg(VFS, "%s: Could not update with zero\n", __func__);
251 goto smb3signkey_ret;
252 }
253
254 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
255 "SmbSign", 8);
256 if (rc) {
257 cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
258 goto smb3signkey_ret;
259 }
260
261 rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
262 L, 4);
263 if (rc) {
264 cifs_dbg(VFS, "%s: Could not update with L\n", __func__);
265 goto smb3signkey_ret;
266 }
267
268 rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
269 hashptr);
270 if (rc) {
271 cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
272 goto smb3signkey_ret;
273 }
274
275 memcpy(server->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
276
277smb3signkey_ret:
278 return;
279}
280
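
[Note: generate_smb3signingkey() above is an SP800-108 counter-mode KDF with a single
iteration: SigningKey = HMAC-SHA256(SessionKey, i || "SMB2AESCMAC\0" || 0x00 ||
"SmbSign\0" || L), truncated to 16 bytes, where i = 1 and L = 128 bits, matching the
update calls. A hedged userspace sketch of the same derivation using OpenSSL's HMAC();
all names here are illustrative.]

    #include <openssl/evp.h>
    #include <openssl/hmac.h>
    #include <string.h>

    static void smb3_signing_key_sketch(const unsigned char session_key[16],
                                        unsigned char signing_key[16])
    {
        unsigned char msg[4 + 12 + 1 + 8 + 4];
        unsigned char prf[32];
        unsigned int prf_len = sizeof(prf);
        unsigned char *p = msg;

        memcpy(p, "\x00\x00\x00\x01", 4); p += 4;   /* i = 1, big endian */
        memcpy(p, "SMB2AESCMAC", 12);     p += 12;  /* label incl. NUL */
        *p++ = 0x00;                                /* separator */
        memcpy(p, "SmbSign", 8);          p += 8;   /* context incl. NUL */
        memcpy(p, "\x00\x00\x00\x80", 4);           /* L = 128 bits */

        HMAC(EVP_sha256(), session_key, 16, msg, sizeof(msg), prf, &prf_len);
        memcpy(signing_key, prf, 16);               /* truncate to 128 bits */
    }
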
119int 281int
120smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) 282smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
121{ 283{
122 cifs_dbg(FYI, "smb3 signatures not supported yet\n"); 284 int i, rc;
123 return -EOPNOTSUPP; 285 unsigned char smb3_signature[SMB2_CMACAES_SIZE];
286 unsigned char *sigptr = smb3_signature;
287 struct kvec *iov = rqst->rq_iov;
288 int n_vec = rqst->rq_nvec;
289 struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
290
291 memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
292 memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
293
294 rc = crypto_shash_setkey(server->secmech.cmacaes,
295 server->smb3signingkey, SMB2_CMACAES_SIZE);
296 if (rc) {
297 cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
298 return rc;
299 }
300
301 /*
 302	 * We already allocated sdesccmacaes when we initialized the smb3
 303	 * signing key, so unlike the smb2 case we do not have to check
 304	 * here that the secmech is initialized.
305 */
306 rc = crypto_shash_init(&server->secmech.sdesccmacaes->shash);
307 if (rc) {
308 cifs_dbg(VFS, "%s: Could not init cmac aes\n", __func__);
309 return rc;
310 }
311
312 for (i = 0; i < n_vec; i++) {
313 if (iov[i].iov_len == 0)
314 continue;
315 if (iov[i].iov_base == NULL) {
 316			cifs_dbg(VFS, "null iovec entry\n");
317 return -EIO;
318 }
319 /*
 320		 * The first entry includes a length field (which does not get
 321		 * signed) that occupies the first 4 bytes before the header.
322 */
323 if (i == 0) {
324 if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
325 break; /* nothing to sign or corrupt header */
326 rc =
327 crypto_shash_update(
328 &server->secmech.sdesccmacaes->shash,
329 iov[i].iov_base + 4, iov[i].iov_len - 4);
330 } else {
331 rc =
332 crypto_shash_update(
333 &server->secmech.sdesccmacaes->shash,
334 iov[i].iov_base, iov[i].iov_len);
335 }
336 if (rc) {
337 cifs_dbg(VFS, "%s: Couldn't update cmac aes with payload\n",
338 __func__);
339 return rc;
340 }
341 }
342
343 /* now hash over the rq_pages array */
344 for (i = 0; i < rqst->rq_npages; i++) {
345 struct kvec p_iov;
346
347 cifs_rqst_page_to_kvec(rqst, i, &p_iov);
348 crypto_shash_update(&server->secmech.sdesccmacaes->shash,
349 p_iov.iov_base, p_iov.iov_len);
350 kunmap(rqst->rq_pages[i]);
351 }
352
353 rc = crypto_shash_final(&server->secmech.sdesccmacaes->shash,
354 sigptr);
355 if (rc)
356 cifs_dbg(VFS, "%s: Could not generate cmac aes\n", __func__);
357
358 memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
359
360 return rc;
124} 361}
125 362
126/* must be called with server->srv_mutex held */ 363/* must be called with server->srv_mutex held */
@@ -275,8 +512,7 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
275 512
276 dump_smb(mid->resp_buf, min_t(u32, 80, len)); 513 dump_smb(mid->resp_buf, min_t(u32, 80, len));
277 /* convert the length into a more usable form */ 514 /* convert the length into a more usable form */
278 if ((len > 24) && 515 if (len > 24 && server->sign) {
279 (server->sec_mode & (SECMODE_SIGN_REQUIRED|SECMODE_SIGN_ENABLED))) {
280 int rc; 516 int rc;
281 517
282 rc = smb2_verify_signature(&rqst, server); 518 rc = smb2_verify_signature(&rqst, server);
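
[Note: smb2_verify_signature(), called above, is the mirror image of the calc
routines: save the signature from the wire, zero the field, recompute, compare. A
sketch of that shape; the calc_signature server op and the -EACCES return are
assumptions, not quoted from the patch.]

    static int smb2_verify_sketch(struct smb_rqst *rqst,
                                  struct TCP_Server_Info *server)
    {
        struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
        u8 wire_sig[SMB2_SIGNATURE_SIZE];
        int rc;

        memcpy(wire_sig, hdr->Signature, SMB2_SIGNATURE_SIZE);
        memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE);
        rc = server->ops->calc_signature(rqst, server); /* smb2 or smb3 */
        if (rc)
            return rc;
        return memcmp(wire_sig, hdr->Signature, SMB2_SIGNATURE_SIZE) ?
            -EACCES : 0;
    }
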
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 7056b891e087..d952ee48f4dc 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions 2 * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
3 * 3 *
4 * Copyright (c) International Business Machines Corp., 2002,2009 4 * Copyright (c) International Business Machines Corp., 2002,2013
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -22,7 +22,7 @@
22/* IOCTL information */ 22/* IOCTL information */
23/* 23/*
24 * List of ioctl/fsctl function codes that are or could be useful in the 24 * List of ioctl/fsctl function codes that are or could be useful in the
25 * future to remote clients like cifs or SMB2 client. There is probably 25 * future to remote clients like cifs or SMB2/SMB3 client. This is probably
26 * a slightly larger set of fsctls that NTFS local filesystem could handle, 26 * a slightly larger set of fsctls that NTFS local filesystem could handle,
27 * including the seven below that we do not have struct definitions for. 27 * including the seven below that we do not have struct definitions for.
28 * Even with protocol definitions for most of these now available, we still 28 * Even with protocol definitions for most of these now available, we still
@@ -30,7 +30,13 @@
30 * remotely. Some of the following, such as the encryption/compression ones 30 * remotely. Some of the following, such as the encryption/compression ones
31 * could be invoked from tools via a specialized hook into the VFS rather 31 * could be invoked from tools via a specialized hook into the VFS rather
32 * than via the standard vfs entry points 32 * than via the standard vfs entry points
33 *
 34 * See MS-SMB2 Section 2.2.31 (last checked June 2013, all of that list is
35 * below). Additional detail on less common ones can be found in MS-FSCC
36 * section 2.3.
33 */ 37 */
38#define FSCTL_DFS_GET_REFERRALS 0x00060194
39#define FSCTL_DFS_GET_REFERRALS_EX 0x000601B0
34#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000 40#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
35#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004 41#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
36#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008 42#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008
@@ -71,14 +77,31 @@
71#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ 77#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
72#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ 78#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
73#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ 79#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */
80#define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */
74#define FSCTL_SIS_LINK_FILES 0x0009C104 81#define FSCTL_SIS_LINK_FILES 0x0009C104
75#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ 82#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */
76#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ 83#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */
77/* strange that the number for this op is not sequential with previous op */ 84/* strange that the number for this op is not sequential with previous op */
78#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */ 85#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */
86/* Enumerate previous versions of a file */
87#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
 89/* Retrieve an opaque file reference for server-side data movement, i.e. copy */
89#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
90#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
79#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ 91#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
80#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ 92#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
93#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */
94/* Perform server-side data movement */
95#define FSCTL_SRV_COPYCHUNK 0x001440F2
96#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
97#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */
98#define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */
81 99
82#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003 100#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
83#define IO_REPARSE_TAG_HSM 0xC0000004 101#define IO_REPARSE_TAG_HSM 0xC0000004
84#define IO_REPARSE_TAG_SIS 0x80000007 102#define IO_REPARSE_TAG_SIS 0x80000007
103
104/* fsctl flags */
105/* If Flags is set to this value, the request is an FSCTL not ioctl request */
106#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
107
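
[Note: these codes pair with the reworked SMB2_ioctl() prototype earlier in this
diff; is_fsctl selects the SMB2_0_IOCTL_IS_FSCTL flag. A hypothetical caller sketch;
the fid variables and the kfree of the returned buffer are assumptions.]

    char *res = NULL;
    u32 res_len = 0;
    int rc;

    rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
                    FSCTL_SRV_REQUEST_RESUME_KEY, true /* FSCTL, not ioctl */,
                    NULL, 0 /* no input payload */, &res, &res_len);
    if (rc == 0) {
        /* use the resume key in res[0..res_len) for a copychunk ... */
        kfree(res);
    }
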
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index bfbf4700d160..6fdcb1b4a106 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -447,7 +447,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
447{ 447{
448 int error; 448 int error;
449 449
450 error = wait_event_freezekillable(server->response_q, 450 error = wait_event_freezekillable_unsafe(server->response_q,
451 midQ->mid_state != MID_REQUEST_SUBMITTED); 451 midQ->mid_state != MID_REQUEST_SUBMITTED);
452 if (error < 0) 452 if (error < 0)
453 return -ERESTARTSYS; 453 return -ERESTARTSYS;
@@ -463,7 +463,7 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
463 struct mid_q_entry *mid; 463 struct mid_q_entry *mid;
464 464
465 /* enable signing if server requires it */ 465 /* enable signing if server requires it */
466 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 466 if (server->sign)
467 hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; 467 hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
468 468
469 mid = AllocMidQEntry(hdr, server); 469 mid = AllocMidQEntry(hdr, server);
@@ -612,7 +612,7 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
612 dump_smb(mid->resp_buf, min_t(u32, 92, len)); 612 dump_smb(mid->resp_buf, min_t(u32, 92, len));
613 613
614 /* convert the length into a more usable form */ 614 /* convert the length into a more usable form */
615 if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { 615 if (server->sign) {
616 struct kvec iov; 616 struct kvec iov;
617 int rc = 0; 617 int rc = 0;
618 struct smb_rqst rqst = { .rq_iov = &iov, 618 struct smb_rqst rqst = { .rq_iov = &iov,
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index b7d3a05c062c..190effc6a6fa 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -43,15 +43,14 @@ static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
43 struct inode *new_inode, struct dentry *new_dentry); 43 struct inode *new_inode, struct dentry *new_dentry);
44 44
45/* dir file-ops */ 45/* dir file-ops */
46static int coda_readdir(struct file *file, void *buf, filldir_t filldir); 46static int coda_readdir(struct file *file, struct dir_context *ctx);
47 47
48/* dentry ops */ 48/* dentry ops */
49static int coda_dentry_revalidate(struct dentry *de, unsigned int flags); 49static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
50static int coda_dentry_delete(const struct dentry *); 50static int coda_dentry_delete(const struct dentry *);
51 51
52/* support routines */ 52/* support routines */
53static int coda_venus_readdir(struct file *coda_file, void *buf, 53static int coda_venus_readdir(struct file *, struct dir_context *);
54 filldir_t filldir);
55 54
56/* same as fs/bad_inode.c */ 55/* same as fs/bad_inode.c */
57static int coda_return_EIO(void) 56static int coda_return_EIO(void)
@@ -85,7 +84,7 @@ const struct inode_operations coda_dir_inode_operations =
85const struct file_operations coda_dir_operations = { 84const struct file_operations coda_dir_operations = {
86 .llseek = generic_file_llseek, 85 .llseek = generic_file_llseek,
87 .read = generic_read_dir, 86 .read = generic_read_dir,
88 .readdir = coda_readdir, 87 .iterate = coda_readdir,
89 .open = coda_open, 88 .open = coda_open,
90 .release = coda_release, 89 .release = coda_release,
91 .fsync = coda_fsync, 90 .fsync = coda_fsync,
@@ -378,7 +377,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
378 377
379 378
380/* file operations for directories */ 379/* file operations for directories */
381static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) 380static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
382{ 381{
383 struct coda_file_info *cfi; 382 struct coda_file_info *cfi;
384 struct file *host_file; 383 struct file *host_file;
@@ -391,30 +390,19 @@ static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir)
391 if (!host_file->f_op) 390 if (!host_file->f_op)
392 return -ENOTDIR; 391 return -ENOTDIR;
393 392
394 if (host_file->f_op->readdir) 393 if (host_file->f_op->iterate) {
395 {
396 /* potemkin case: we were handed a directory inode.
397 * We can't use vfs_readdir because we have to keep the file
398 * position in sync between the coda_file and the host_file.
399 * and as such we need grab the inode mutex. */
400 struct inode *host_inode = file_inode(host_file); 394 struct inode *host_inode = file_inode(host_file);
401
402 mutex_lock(&host_inode->i_mutex); 395 mutex_lock(&host_inode->i_mutex);
403 host_file->f_pos = coda_file->f_pos;
404
405 ret = -ENOENT; 396 ret = -ENOENT;
406 if (!IS_DEADDIR(host_inode)) { 397 if (!IS_DEADDIR(host_inode)) {
407 ret = host_file->f_op->readdir(host_file, buf, filldir); 398 ret = host_file->f_op->iterate(host_file, ctx);
408 file_accessed(host_file); 399 file_accessed(host_file);
409 } 400 }
410
411 coda_file->f_pos = host_file->f_pos;
412 mutex_unlock(&host_inode->i_mutex); 401 mutex_unlock(&host_inode->i_mutex);
402 return ret;
413 } 403 }
414 else /* Venus: we must read Venus dirents from a file */ 404 /* Venus: we must read Venus dirents from a file */
415 ret = coda_venus_readdir(coda_file, buf, filldir); 405 return coda_venus_readdir(coda_file, ctx);
416
417 return ret;
418} 406}
419 407
420static inline unsigned int CDT2DT(unsigned char cdt) 408static inline unsigned int CDT2DT(unsigned char cdt)
@@ -437,10 +425,8 @@ static inline unsigned int CDT2DT(unsigned char cdt)
437} 425}
438 426
439/* support routines */ 427/* support routines */
440static int coda_venus_readdir(struct file *coda_file, void *buf, 428static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
441 filldir_t filldir)
442{ 429{
443 int result = 0; /* # of entries returned */
444 struct coda_file_info *cfi; 430 struct coda_file_info *cfi;
445 struct coda_inode_info *cii; 431 struct coda_inode_info *cii;
446 struct file *host_file; 432 struct file *host_file;
@@ -462,23 +448,12 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
462 vdir = kmalloc(sizeof(*vdir), GFP_KERNEL); 448 vdir = kmalloc(sizeof(*vdir), GFP_KERNEL);
463 if (!vdir) return -ENOMEM; 449 if (!vdir) return -ENOMEM;
464 450
465 if (coda_file->f_pos == 0) { 451 if (!dir_emit_dots(coda_file, ctx))
466 ret = filldir(buf, ".", 1, 0, de->d_inode->i_ino, DT_DIR); 452 goto out;
467 if (ret < 0) 453
468 goto out;
469 result++;
470 coda_file->f_pos++;
471 }
472 if (coda_file->f_pos == 1) {
473 ret = filldir(buf, "..", 2, 1, parent_ino(de), DT_DIR);
474 if (ret < 0)
475 goto out;
476 result++;
477 coda_file->f_pos++;
478 }
479 while (1) { 454 while (1) {
480 /* read entries from the directory file */ 455 /* read entries from the directory file */
481 ret = kernel_read(host_file, coda_file->f_pos - 2, (char *)vdir, 456 ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir,
482 sizeof(*vdir)); 457 sizeof(*vdir));
483 if (ret < 0) { 458 if (ret < 0) {
484 printk(KERN_ERR "coda readdir: read dir %s failed %d\n", 459 printk(KERN_ERR "coda readdir: read dir %s failed %d\n",
@@ -507,32 +482,23 @@ static int coda_venus_readdir(struct file *coda_file, void *buf,
507 482
508 /* Make sure we skip '.' and '..', we already got those */ 483 /* Make sure we skip '.' and '..', we already got those */
509 if (name.name[0] == '.' && (name.len == 1 || 484 if (name.name[0] == '.' && (name.len == 1 ||
510 (vdir->d_name[1] == '.' && name.len == 2))) 485 (name.name[1] == '.' && name.len == 2)))
511 vdir->d_fileno = name.len = 0; 486 vdir->d_fileno = name.len = 0;
512 487
513 /* skip null entries */ 488 /* skip null entries */
514 if (vdir->d_fileno && name.len) { 489 if (vdir->d_fileno && name.len) {
515 /* try to look up this entry in the dcache, that way 490 ino = vdir->d_fileno;
516 * userspace doesn't have to worry about breaking
517 * getcwd by having mismatched inode numbers for
518 * internal volume mountpoints. */
519 ino = find_inode_number(de, &name);
520 if (!ino) ino = vdir->d_fileno;
521
522 type = CDT2DT(vdir->d_type); 491 type = CDT2DT(vdir->d_type);
523 ret = filldir(buf, name.name, name.len, 492 if (!dir_emit(ctx, name.name, name.len, ino, type))
524 coda_file->f_pos, ino, type); 493 break;
525 /* failure means no space for filling in this round */
526 if (ret < 0) break;
527 result++;
528 } 494 }
529 /* we'll always have progress because d_reclen is unsigned and 495 /* we'll always have progress because d_reclen is unsigned and
530 * we've already established it is non-zero. */ 496 * we've already established it is non-zero. */
531 coda_file->f_pos += vdir->d_reclen; 497 ctx->pos += vdir->d_reclen;
532 } 498 }
533out: 499out:
534 kfree(vdir); 500 kfree(vdir);
535 return result ? result : ret; 501 return 0;
536} 502}
537 503
538/* called when a cache lookup succeeds */ 504/* called when a cache lookup succeeds */
@@ -560,7 +526,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
560 if (cii->c_flags & C_FLUSH) 526 if (cii->c_flags & C_FLUSH)
561 coda_flag_inode_children(inode, C_FLUSH); 527 coda_flag_inode_children(inode, C_FLUSH);
562 528
563 if (de->d_count > 1) 529 if (d_count(de) > 1)
564 /* pretend it's valid, but don't change the flags */ 530 /* pretend it's valid, but don't change the flags */
565 goto out; 531 goto out;
566 532
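
[Note: the coda changes above are one instance of the tree-wide readdir -> iterate
conversion: instead of calling a filldir callback and juggling f_pos, an .iterate
method emits entries through dir_emit()/dir_emit_dots() and advances ctx->pos itself.
A minimal sketch of the pattern; "myfs" and its entry helpers are hypothetical.]

    static int myfs_iterate(struct file *file, struct dir_context *ctx)
    {
        struct myfs_entry *e;

        if (!dir_emit_dots(file, ctx))  /* emits "." and ".." as needed */
            return 0;

        for (e = myfs_entry_at(file, ctx->pos); e; e = e->next) {
            if (!dir_emit(ctx, e->name, e->namelen, e->ino, e->dtype))
                return 0;               /* caller's buffer is full */
            ctx->pos++;                 /* fs-defined position, not f_pos */
        }
        return 0;
    }
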
diff --git a/fs/compat.c b/fs/compat.c
index fc3b55dce184..6af20de2c1a3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -832,6 +832,7 @@ struct compat_old_linux_dirent {
832}; 832};
833 833
834struct compat_readdir_callback { 834struct compat_readdir_callback {
835 struct dir_context ctx;
835 struct compat_old_linux_dirent __user *dirent; 836 struct compat_old_linux_dirent __user *dirent;
836 int result; 837 int result;
837}; 838};
@@ -873,15 +874,15 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
873{ 874{
874 int error; 875 int error;
875 struct fd f = fdget(fd); 876 struct fd f = fdget(fd);
876 struct compat_readdir_callback buf; 877 struct compat_readdir_callback buf = {
878 .ctx.actor = compat_fillonedir,
879 .dirent = dirent
880 };
877 881
878 if (!f.file) 882 if (!f.file)
879 return -EBADF; 883 return -EBADF;
880 884
881 buf.result = 0; 885 error = iterate_dir(f.file, &buf.ctx);
882 buf.dirent = dirent;
883
884 error = vfs_readdir(f.file, compat_fillonedir, &buf);
885 if (buf.result) 886 if (buf.result)
886 error = buf.result; 887 error = buf.result;
887 888
@@ -897,6 +898,7 @@ struct compat_linux_dirent {
897}; 898};
898 899
899struct compat_getdents_callback { 900struct compat_getdents_callback {
901 struct dir_context ctx;
900 struct compat_linux_dirent __user *current_dir; 902 struct compat_linux_dirent __user *current_dir;
901 struct compat_linux_dirent __user *previous; 903 struct compat_linux_dirent __user *previous;
902 int count; 904 int count;
@@ -951,7 +953,11 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
951{ 953{
952 struct fd f; 954 struct fd f;
953 struct compat_linux_dirent __user * lastdirent; 955 struct compat_linux_dirent __user * lastdirent;
954 struct compat_getdents_callback buf; 956 struct compat_getdents_callback buf = {
957 .ctx.actor = compat_filldir,
958 .current_dir = dirent,
959 .count = count
960 };
955 int error; 961 int error;
956 962
957 if (!access_ok(VERIFY_WRITE, dirent, count)) 963 if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -961,17 +967,12 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
961 if (!f.file) 967 if (!f.file)
962 return -EBADF; 968 return -EBADF;
963 969
964 buf.current_dir = dirent; 970 error = iterate_dir(f.file, &buf.ctx);
965 buf.previous = NULL;
966 buf.count = count;
967 buf.error = 0;
968
969 error = vfs_readdir(f.file, compat_filldir, &buf);
970 if (error >= 0) 971 if (error >= 0)
971 error = buf.error; 972 error = buf.error;
972 lastdirent = buf.previous; 973 lastdirent = buf.previous;
973 if (lastdirent) { 974 if (lastdirent) {
974 if (put_user(f.file->f_pos, &lastdirent->d_off)) 975 if (put_user(buf.ctx.pos, &lastdirent->d_off))
975 error = -EFAULT; 976 error = -EFAULT;
976 else 977 else
977 error = count - buf.count; 978 error = count - buf.count;
@@ -983,6 +984,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
983#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64 984#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
984 985
985struct compat_getdents_callback64 { 986struct compat_getdents_callback64 {
987 struct dir_context ctx;
986 struct linux_dirent64 __user *current_dir; 988 struct linux_dirent64 __user *current_dir;
987 struct linux_dirent64 __user *previous; 989 struct linux_dirent64 __user *previous;
988 int count; 990 int count;
@@ -1036,7 +1038,11 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
1036{ 1038{
1037 struct fd f; 1039 struct fd f;
1038 struct linux_dirent64 __user * lastdirent; 1040 struct linux_dirent64 __user * lastdirent;
1039 struct compat_getdents_callback64 buf; 1041 struct compat_getdents_callback64 buf = {
1042 .ctx.actor = compat_filldir64,
1043 .current_dir = dirent,
1044 .count = count
1045 };
1040 int error; 1046 int error;
1041 1047
1042 if (!access_ok(VERIFY_WRITE, dirent, count)) 1048 if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -1046,17 +1052,12 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
1046 if (!f.file) 1052 if (!f.file)
1047 return -EBADF; 1053 return -EBADF;
1048 1054
1049 buf.current_dir = dirent; 1055 error = iterate_dir(f.file, &buf.ctx);
1050 buf.previous = NULL;
1051 buf.count = count;
1052 buf.error = 0;
1053
1054 error = vfs_readdir(f.file, compat_filldir64, &buf);
1055 if (error >= 0) 1056 if (error >= 0)
1056 error = buf.error; 1057 error = buf.error;
1057 lastdirent = buf.previous; 1058 lastdirent = buf.previous;
1058 if (lastdirent) { 1059 if (lastdirent) {
1059 typeof(lastdirent->d_off) d_off = f.file->f_pos; 1060 typeof(lastdirent->d_off) d_off = buf.ctx.pos;
1060 if (__put_user_unaligned(d_off, &lastdirent->d_off)) 1061 if (__put_user_unaligned(d_off, &lastdirent->d_off))
1061 error = -EFAULT; 1062 error = -EFAULT;
1062 else 1063 else
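
[Note: the compat getdents conversions above all rely on the same container pattern:
the dir_context must be the first member of the private callback struct so the actor
can cast back from the context pointer. An illustrative sketch; struct and helper
names are invented.]

    struct my_readdir_callback {
        struct dir_context ctx;          /* must come first */
        struct my_dirent __user *dirent; /* hypothetical user buffer */
        int count;
    };

    static int my_filldir(void *__buf, const char *name, int namlen,
                          loff_t offset, u64 ino, unsigned int d_type)
    {
        struct my_readdir_callback *buf = __buf; /* safe: ctx is first */

        /* ... copy one entry to buf->dirent, decrement buf->count ... */
        return 0;
    }

    /* caller side, mirroring compat_sys_getdents() above: */
    struct my_readdir_callback buf = {
        .ctx.actor = my_filldir,
        .dirent = dirent,
        .count = count
    };
    error = iterate_dir(f.file, &buf.ctx);
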
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 996cdc5abb85..5d19acfa7c6c 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -66,7 +66,6 @@
66#include <linux/gigaset_dev.h> 66#include <linux/gigaset_dev.h>
67 67
68#ifdef CONFIG_BLOCK 68#ifdef CONFIG_BLOCK
69#include <linux/loop.h>
70#include <linux/cdrom.h> 69#include <linux/cdrom.h>
71#include <linux/fd.h> 70#include <linux/fd.h>
72#include <scsi/scsi.h> 71#include <scsi/scsi.h>
@@ -954,8 +953,6 @@ COMPATIBLE_IOCTL(MTIOCTOP)
954/* Socket level stuff */ 953/* Socket level stuff */
955COMPATIBLE_IOCTL(FIOQSIZE) 954COMPATIBLE_IOCTL(FIOQSIZE)
956#ifdef CONFIG_BLOCK 955#ifdef CONFIG_BLOCK
957/* loop */
958IGNORE_IOCTL(LOOP_CLR_FD)
959/* md calls this on random blockdevs */ 956/* md calls this on random blockdevs */
960IGNORE_IOCTL(RAID_VERSION) 957IGNORE_IOCTL(RAID_VERSION)
961/* qemu/qemu-img might call these two on plain files for probing */ 958/* qemu/qemu-img might call these two on plain files for probing */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7aabc6ad4e9b..277bd1be21fd 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -387,7 +387,7 @@ static void remove_dir(struct dentry * d)
387 if (d->d_inode) 387 if (d->d_inode)
388 simple_rmdir(parent->d_inode,d); 388 simple_rmdir(parent->d_inode,d);
389 389
390 pr_debug(" o %s removing done (%d)\n",d->d_name.name, d->d_count); 390 pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d));
391 391
392 dput(parent); 392 dput(parent);
393} 393}
@@ -660,19 +660,15 @@ static int create_default_group(struct config_group *parent_group,
660 struct config_group *group) 660 struct config_group *group)
661{ 661{
662 int ret; 662 int ret;
663 struct qstr name;
664 struct configfs_dirent *sd; 663 struct configfs_dirent *sd;
665 /* We trust the caller holds a reference to parent */ 664 /* We trust the caller holds a reference to parent */
666 struct dentry *child, *parent = parent_group->cg_item.ci_dentry; 665 struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
667 666
668 if (!group->cg_item.ci_name) 667 if (!group->cg_item.ci_name)
669 group->cg_item.ci_name = group->cg_item.ci_namebuf; 668 group->cg_item.ci_name = group->cg_item.ci_namebuf;
670 name.name = group->cg_item.ci_name;
671 name.len = strlen(name.name);
672 name.hash = full_name_hash(name.name, name.len);
673 669
674 ret = -ENOMEM; 670 ret = -ENOMEM;
675 child = d_alloc(parent, &name); 671 child = d_alloc_name(parent, group->cg_item.ci_name);
676 if (child) { 672 if (child) {
677 d_add(child, NULL); 673 d_add(child, NULL);
678 674
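
[Note: d_alloc_name(), used above, is shorthand for the removed qstr setup; a sketch
of the equivalence, not the library's actual body.]

    struct dentry *d_alloc_name_sketch(struct dentry *parent, const char *name)
    {
        struct qstr q = QSTR_INIT(name, strlen(name));

        q.hash = full_name_hash(q.name, q.len);
        return d_alloc(parent, &q);
    }
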
@@ -1532,84 +1528,66 @@ static inline unsigned char dt_type(struct configfs_dirent *sd)
1532 return (sd->s_mode >> 12) & 15; 1528 return (sd->s_mode >> 12) & 15;
1533} 1529}
1534 1530
1535static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) 1531static int configfs_readdir(struct file *file, struct dir_context *ctx)
1536{ 1532{
1537 struct dentry *dentry = filp->f_path.dentry; 1533 struct dentry *dentry = file->f_path.dentry;
1538 struct super_block *sb = dentry->d_sb; 1534 struct super_block *sb = dentry->d_sb;
1539 struct configfs_dirent * parent_sd = dentry->d_fsdata; 1535 struct configfs_dirent * parent_sd = dentry->d_fsdata;
1540 struct configfs_dirent *cursor = filp->private_data; 1536 struct configfs_dirent *cursor = file->private_data;
1541 struct list_head *p, *q = &cursor->s_sibling; 1537 struct list_head *p, *q = &cursor->s_sibling;
1542 ino_t ino = 0; 1538 ino_t ino = 0;
1543 int i = filp->f_pos;
1544 1539
1545 switch (i) { 1540 if (!dir_emit_dots(file, ctx))
1546 case 0: 1541 return 0;
1547 ino = dentry->d_inode->i_ino; 1542 if (ctx->pos == 2) {
1548 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) 1543 spin_lock(&configfs_dirent_lock);
1549 break; 1544 list_move(q, &parent_sd->s_children);
1550 filp->f_pos++; 1545 spin_unlock(&configfs_dirent_lock);
1551 i++; 1546 }
1552 /* fallthrough */ 1547 for (p = q->next; p != &parent_sd->s_children; p = p->next) {
1553 case 1: 1548 struct configfs_dirent *next;
1554 ino = parent_ino(dentry); 1549 const char *name;
1555 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) 1550 int len;
1556 break; 1551 struct inode *inode = NULL;
1557 filp->f_pos++; 1552
1558 i++; 1553 next = list_entry(p, struct configfs_dirent, s_sibling);
1559 /* fallthrough */ 1554 if (!next->s_element)
1560 default: 1555 continue;
1561 if (filp->f_pos == 2) {
1562 spin_lock(&configfs_dirent_lock);
1563 list_move(q, &parent_sd->s_children);
1564 spin_unlock(&configfs_dirent_lock);
1565 }
1566 for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
1567 struct configfs_dirent *next;
1568 const char * name;
1569 int len;
1570 struct inode *inode = NULL;
1571 1556
1572 next = list_entry(p, struct configfs_dirent, 1557 name = configfs_get_name(next);
1573 s_sibling); 1558 len = strlen(name);
1574 if (!next->s_element)
1575 continue;
1576
1577 name = configfs_get_name(next);
1578 len = strlen(name);
1579
1580 /*
1581 * We'll have a dentry and an inode for
1582 * PINNED items and for open attribute
1583 * files. We lock here to prevent a race
1584 * with configfs_d_iput() clearing
1585 * s_dentry before calling iput().
1586 *
1587 * Why do we go to the trouble? If
1588 * someone has an attribute file open,
1589 * the inode number should match until
1590 * they close it. Beyond that, we don't
1591 * care.
1592 */
1593 spin_lock(&configfs_dirent_lock);
1594 dentry = next->s_dentry;
1595 if (dentry)
1596 inode = dentry->d_inode;
1597 if (inode)
1598 ino = inode->i_ino;
1599 spin_unlock(&configfs_dirent_lock);
1600 if (!inode)
1601 ino = iunique(sb, 2);
1602 1559
1603 if (filldir(dirent, name, len, filp->f_pos, ino, 1560 /*
1604 dt_type(next)) < 0) 1561 * We'll have a dentry and an inode for
1605 return 0; 1562 * PINNED items and for open attribute
1563 * files. We lock here to prevent a race
1564 * with configfs_d_iput() clearing
1565 * s_dentry before calling iput().
1566 *
1567 * Why do we go to the trouble? If
1568 * someone has an attribute file open,
1569 * the inode number should match until
1570 * they close it. Beyond that, we don't
1571 * care.
1572 */
1573 spin_lock(&configfs_dirent_lock);
1574 dentry = next->s_dentry;
1575 if (dentry)
1576 inode = dentry->d_inode;
1577 if (inode)
1578 ino = inode->i_ino;
1579 spin_unlock(&configfs_dirent_lock);
1580 if (!inode)
1581 ino = iunique(sb, 2);
1606 1582
1607 spin_lock(&configfs_dirent_lock); 1583 if (!dir_emit(ctx, name, len, ino, dt_type(next)))
1608 list_move(q, p); 1584 return 0;
1609 spin_unlock(&configfs_dirent_lock); 1585
1610 p = q; 1586 spin_lock(&configfs_dirent_lock);
1611 filp->f_pos++; 1587 list_move(q, p);
1612 } 1588 spin_unlock(&configfs_dirent_lock);
1589 p = q;
1590 ctx->pos++;
1613 } 1591 }
1614 return 0; 1592 return 0;
1615} 1593}
@@ -1661,14 +1639,13 @@ const struct file_operations configfs_dir_operations = {
1661 .release = configfs_dir_close, 1639 .release = configfs_dir_close,
1662 .llseek = configfs_dir_lseek, 1640 .llseek = configfs_dir_lseek,
1663 .read = generic_read_dir, 1641 .read = generic_read_dir,
1664 .readdir = configfs_readdir, 1642 .iterate = configfs_readdir,
1665}; 1643};
1666 1644
1667int configfs_register_subsystem(struct configfs_subsystem *subsys) 1645int configfs_register_subsystem(struct configfs_subsystem *subsys)
1668{ 1646{
1669 int err; 1647 int err;
1670 struct config_group *group = &subsys->su_group; 1648 struct config_group *group = &subsys->su_group;
1671 struct qstr name;
1672 struct dentry *dentry; 1649 struct dentry *dentry;
1673 struct dentry *root; 1650 struct dentry *root;
1674 struct configfs_dirent *sd; 1651 struct configfs_dirent *sd;
@@ -1685,12 +1662,8 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
1685 1662
1686 mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT); 1663 mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT);
1687 1664
1688 name.name = group->cg_item.ci_name;
1689 name.len = strlen(name.name);
1690 name.hash = full_name_hash(name.name, name.len);
1691
1692 err = -ENOMEM; 1665 err = -ENOMEM;
1693 dentry = d_alloc(root, &name); 1666 dentry = d_alloc_name(root, group->cg_item.ci_name);
1694 if (dentry) { 1667 if (dentry) {
1695 d_add(dentry, NULL); 1668 d_add(dentry, NULL);
1696 1669
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 2b6cb23dd14e..1d1c41f1014d 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
203 mutex_lock(&buffer->mutex); 203 mutex_lock(&buffer->mutex);
204 len = fill_write_buffer(buffer, buf, count); 204 len = fill_write_buffer(buffer, buf, count);
205 if (len > 0) 205 if (len > 0)
206 len = flush_write_buffer(file->f_path.dentry, buffer, count); 206 len = flush_write_buffer(file->f_path.dentry, buffer, len);
207 if (len > 0) 207 if (len > 0)
208 *ppos += len; 208 *ppos += len;
209 mutex_unlock(&buffer->mutex); 209 mutex_unlock(&buffer->mutex);
diff --git a/fs/coredump.c b/fs/coredump.c
index dafafbafa731..72f816d6cad9 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -45,69 +45,79 @@
45#include <trace/events/sched.h> 45#include <trace/events/sched.h>
46 46
47int core_uses_pid; 47int core_uses_pid;
48char core_pattern[CORENAME_MAX_SIZE] = "core";
49unsigned int core_pipe_limit; 48unsigned int core_pipe_limit;
49char core_pattern[CORENAME_MAX_SIZE] = "core";
50static int core_name_size = CORENAME_MAX_SIZE;
50 51
51struct core_name { 52struct core_name {
52 char *corename; 53 char *corename;
53 int used, size; 54 int used, size;
54}; 55};
55static atomic_t call_count = ATOMIC_INIT(1);
56 56
57/* The maximal length of core_pattern is also specified in sysctl.c */ 57/* The maximal length of core_pattern is also specified in sysctl.c */
58 58
59static int expand_corename(struct core_name *cn) 59static int expand_corename(struct core_name *cn, int size)
60{ 60{
61 char *old_corename = cn->corename; 61 char *corename = krealloc(cn->corename, size, GFP_KERNEL);
62
63 cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
64 cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
65 62
66 if (!cn->corename) { 63 if (!corename)
67 kfree(old_corename);
68 return -ENOMEM; 64 return -ENOMEM;
69 }
70 65
66 if (size > core_name_size) /* racy but harmless */
67 core_name_size = size;
68
69 cn->size = ksize(corename);
70 cn->corename = corename;
71 return 0; 71 return 0;
72} 72}
73 73
74static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
75{
76 int free, need;
77
78again:
79 free = cn->size - cn->used;
80 need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
81 if (need < free) {
82 cn->used += need;
83 return 0;
84 }
85
86 if (!expand_corename(cn, cn->size + need - free + 1))
87 goto again;
88
89 return -ENOMEM;
90}
91
74static int cn_printf(struct core_name *cn, const char *fmt, ...) 92static int cn_printf(struct core_name *cn, const char *fmt, ...)
75{ 93{
76 char *cur;
77 int need;
78 int ret;
79 va_list arg; 94 va_list arg;
95 int ret;
80 96
81 va_start(arg, fmt); 97 va_start(arg, fmt);
82 need = vsnprintf(NULL, 0, fmt, arg); 98 ret = cn_vprintf(cn, fmt, arg);
83 va_end(arg); 99 va_end(arg);
84 100
85 if (likely(need < cn->size - cn->used - 1)) 101 return ret;
86 goto out_printf; 102}
87 103
88 ret = expand_corename(cn); 104static int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
89 if (ret) 105{
90 goto expand_fail; 106 int cur = cn->used;
107 va_list arg;
108 int ret;
91 109
92out_printf:
93 cur = cn->corename + cn->used;
94 va_start(arg, fmt); 110 va_start(arg, fmt);
95 vsnprintf(cur, need + 1, fmt, arg); 111 ret = cn_vprintf(cn, fmt, arg);
96 va_end(arg); 112 va_end(arg);
97 cn->used += need;
98 return 0;
99 113
100expand_fail: 114 for (; cur < cn->used; ++cur) {
115 if (cn->corename[cur] == '/')
116 cn->corename[cur] = '!';
117 }
101 return ret; 118 return ret;
102} 119}
103 120
104static void cn_escape(char *str)
105{
106 for (; *str; str++)
107 if (*str == '/')
108 *str = '!';
109}
110
111static int cn_print_exe_file(struct core_name *cn) 121static int cn_print_exe_file(struct core_name *cn)
112{ 122{
113 struct file *exe_file; 123 struct file *exe_file;
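
[Note: cn_vprintf() above is the classic measure-grow-retry vsnprintf pattern: try
with the space remaining, and if the return value says the output would not fit,
expand the buffer and format again. A hedged userspace equivalent; names are
illustrative.]

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int buf_printf(char **bufp, size_t *sizep, size_t *usedp,
                          const char *fmt, ...)
    {
        size_t free_space = *sizep - *usedp;
        va_list ap;
        int need;

        va_start(ap, fmt);
        need = vsnprintf(*bufp + *usedp, free_space, fmt, ap);
        va_end(ap);
        if (need < 0)
            return -1;
        if ((size_t)need >= free_space) {   /* truncated: grow and retry */
            char *tmp = realloc(*bufp, *usedp + need + 1);
            if (!tmp)
                return -1;
            *bufp = tmp;
            *sizep = *usedp + need + 1;
            va_start(ap, fmt);
            vsnprintf(*bufp + *usedp, need + 1, fmt, ap);
            va_end(ap);
        }
        *usedp += need;
        return 0;
    }
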
@@ -115,12 +125,8 @@ static int cn_print_exe_file(struct core_name *cn)
115 int ret; 125 int ret;
116 126
117 exe_file = get_mm_exe_file(current->mm); 127 exe_file = get_mm_exe_file(current->mm);
118 if (!exe_file) { 128 if (!exe_file)
119 char *commstart = cn->corename + cn->used; 129 return cn_esc_printf(cn, "%s (path unknown)", current->comm);
120 ret = cn_printf(cn, "%s (path unknown)", current->comm);
121 cn_escape(commstart);
122 return ret;
123 }
124 130
125 pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); 131 pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
126 if (!pathbuf) { 132 if (!pathbuf) {
@@ -134,9 +140,7 @@ static int cn_print_exe_file(struct core_name *cn)
134 goto free_buf; 140 goto free_buf;
135 } 141 }
136 142
137 cn_escape(path); 143 ret = cn_esc_printf(cn, "%s", path);
138
139 ret = cn_printf(cn, "%s", path);
140 144
141free_buf: 145free_buf:
142 kfree(pathbuf); 146 kfree(pathbuf);
@@ -157,19 +161,19 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
157 int pid_in_pattern = 0; 161 int pid_in_pattern = 0;
158 int err = 0; 162 int err = 0;
159 163
160 cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
161 cn->corename = kmalloc(cn->size, GFP_KERNEL);
162 cn->used = 0; 164 cn->used = 0;
163 165 cn->corename = NULL;
164 if (!cn->corename) 166 if (expand_corename(cn, core_name_size))
165 return -ENOMEM; 167 return -ENOMEM;
168 cn->corename[0] = '\0';
169
170 if (ispipe)
171 ++pat_ptr;
166 172
167 /* Repeat as long as we have more pattern to process and more output 173 /* Repeat as long as we have more pattern to process and more output
168 space */ 174 space */
169 while (*pat_ptr) { 175 while (*pat_ptr) {
170 if (*pat_ptr != '%') { 176 if (*pat_ptr != '%') {
171 if (*pat_ptr == 0)
172 goto out;
173 err = cn_printf(cn, "%c", *pat_ptr++); 177 err = cn_printf(cn, "%c", *pat_ptr++);
174 } else { 178 } else {
175 switch (*++pat_ptr) { 179 switch (*++pat_ptr) {
@@ -210,22 +214,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
210 break; 214 break;
211 } 215 }
212 /* hostname */ 216 /* hostname */
213 case 'h': { 217 case 'h':
214 char *namestart = cn->corename + cn->used;
215 down_read(&uts_sem); 218 down_read(&uts_sem);
216 err = cn_printf(cn, "%s", 219 err = cn_esc_printf(cn, "%s",
217 utsname()->nodename); 220 utsname()->nodename);
218 up_read(&uts_sem); 221 up_read(&uts_sem);
219 cn_escape(namestart);
220 break; 222 break;
221 }
222 /* executable */ 223 /* executable */
223 case 'e': { 224 case 'e':
224 char *commstart = cn->corename + cn->used; 225 err = cn_esc_printf(cn, "%s", current->comm);
225 err = cn_printf(cn, "%s", current->comm);
226 cn_escape(commstart);
227 break; 226 break;
228 }
229 case 'E': 227 case 'E':
230 err = cn_print_exe_file(cn); 228 err = cn_print_exe_file(cn);
231 break; 229 break;
@@ -244,6 +242,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
244 return err; 242 return err;
245 } 243 }
246 244
245out:
247 /* Backward compatibility with core_uses_pid: 246 /* Backward compatibility with core_uses_pid:
248 * 247 *
249 * If core_pattern does not include a %p (as is the default) 248 * If core_pattern does not include a %p (as is the default)
@@ -254,7 +253,6 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
254 if (err) 253 if (err)
255 return err; 254 return err;
256 } 255 }
257out:
258 return ispipe; 256 return ispipe;
259} 257}
260 258
@@ -549,7 +547,7 @@ void do_coredump(siginfo_t *siginfo)
549 if (ispipe < 0) { 547 if (ispipe < 0) {
550 printk(KERN_WARNING "format_corename failed\n"); 548 printk(KERN_WARNING "format_corename failed\n");
551 printk(KERN_WARNING "Aborting core\n"); 549 printk(KERN_WARNING "Aborting core\n");
552 goto fail_corename; 550 goto fail_unlock;
553 } 551 }
554 552
555 if (cprm.limit == 1) { 553 if (cprm.limit == 1) {
@@ -584,7 +582,7 @@ void do_coredump(siginfo_t *siginfo)
584 goto fail_dropcount; 582 goto fail_dropcount;
585 } 583 }
586 584
587 helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL); 585 helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL);
588 if (!helper_argv) { 586 if (!helper_argv) {
589 printk(KERN_WARNING "%s failed to allocate memory\n", 587 printk(KERN_WARNING "%s failed to allocate memory\n",
590 __func__); 588 __func__);
@@ -601,7 +599,7 @@ void do_coredump(siginfo_t *siginfo)
601 599
602 argv_free(helper_argv); 600 argv_free(helper_argv);
603 if (retval) { 601 if (retval) {
604 printk(KERN_INFO "Core dump to %s pipe failed\n", 602 printk(KERN_INFO "Core dump to |%s pipe failed\n",
605 cn.corename); 603 cn.corename);
606 goto close_fail; 604 goto close_fail;
607 } 605 }
@@ -669,7 +667,6 @@ fail_dropcount:
669 atomic_dec(&core_dump_count); 667 atomic_dec(&core_dump_count);
670fail_unlock: 668fail_unlock:
671 kfree(cn.corename); 669 kfree(cn.corename);
672fail_corename:
673 coredump_finish(mm, core_dumped); 670 coredump_finish(mm, core_dumped);
674 revert_creds(old_cred); 671 revert_creds(old_cred);
675fail_creds: 672fail_creds:
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 35b1c7bd18b7..e501ac3a49ff 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -349,18 +349,17 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
349/* 349/*
350 * Read a cramfs directory entry. 350 * Read a cramfs directory entry.
351 */ 351 */
352static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 352static int cramfs_readdir(struct file *file, struct dir_context *ctx)
353{ 353{
354 struct inode *inode = file_inode(filp); 354 struct inode *inode = file_inode(file);
355 struct super_block *sb = inode->i_sb; 355 struct super_block *sb = inode->i_sb;
356 char *buf; 356 char *buf;
357 unsigned int offset; 357 unsigned int offset;
358 int copied;
359 358
360 /* Offset within the thing. */ 359 /* Offset within the thing. */
361 offset = filp->f_pos; 360 if (ctx->pos >= inode->i_size)
362 if (offset >= inode->i_size)
363 return 0; 361 return 0;
362 offset = ctx->pos;
364 /* Directory entries are always 4-byte aligned */ 363 /* Directory entries are always 4-byte aligned */
365 if (offset & 3) 364 if (offset & 3)
366 return -EINVAL; 365 return -EINVAL;
@@ -369,14 +368,13 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
369 if (!buf) 368 if (!buf)
370 return -ENOMEM; 369 return -ENOMEM;
371 370
372 copied = 0;
373 while (offset < inode->i_size) { 371 while (offset < inode->i_size) {
374 struct cramfs_inode *de; 372 struct cramfs_inode *de;
375 unsigned long nextoffset; 373 unsigned long nextoffset;
376 char *name; 374 char *name;
377 ino_t ino; 375 ino_t ino;
378 umode_t mode; 376 umode_t mode;
379 int namelen, error; 377 int namelen;
380 378
381 mutex_lock(&read_mutex); 379 mutex_lock(&read_mutex);
382 de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); 380 de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN);
@@ -402,13 +400,10 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
402 break; 400 break;
403 namelen--; 401 namelen--;
404 } 402 }
405 error = filldir(dirent, buf, namelen, offset, ino, mode >> 12); 403 if (!dir_emit(ctx, buf, namelen, ino, mode >> 12))
406 if (error)
407 break; 404 break;
408 405
409 offset = nextoffset; 406 ctx->pos = offset = nextoffset;
410 filp->f_pos = offset;
411 copied++;
412 } 407 }
413 kfree(buf); 408 kfree(buf);
414 return 0; 409 return 0;
@@ -547,7 +542,7 @@ static const struct address_space_operations cramfs_aops = {
547static const struct file_operations cramfs_directory_operations = { 542static const struct file_operations cramfs_directory_operations = {
548 .llseek = generic_file_llseek, 543 .llseek = generic_file_llseek,
549 .read = generic_read_dir, 544 .read = generic_read_dir,
550 .readdir = cramfs_readdir, 545 .iterate = cramfs_readdir,
551}; 546};
552 547
553static const struct inode_operations cramfs_dir_inode_operations = { 548static const struct inode_operations cramfs_dir_inode_operations = {
diff --git a/fs/dcache.c b/fs/dcache.c
index f09b9085f7d8..87bdb5329c3c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1612,6 +1612,10 @@ EXPORT_SYMBOL(d_obtain_alias);
1612 * If a dentry was found and moved, then it is returned. Otherwise NULL 1612 * If a dentry was found and moved, then it is returned. Otherwise NULL
1613 * is returned. This matches the expected return value of ->lookup. 1613 * is returned. This matches the expected return value of ->lookup.
1614 * 1614 *
1615 * Cluster filesystems may call this function with a negative, hashed dentry.
1616 * In that case, we know that the inode will be a regular file, and also this
1617 * will only occur during atomic_open. So we need to check for the dentry
1618 * being already hashed only in the final case.
1615 */ 1619 */
1616struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) 1620struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1617{ 1621{
@@ -1636,8 +1640,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
1636 security_d_instantiate(dentry, inode); 1640 security_d_instantiate(dentry, inode);
1637 d_rehash(dentry); 1641 d_rehash(dentry);
1638 } 1642 }
1639 } else 1643 } else {
1640 d_add(dentry, inode); 1644 d_instantiate(dentry, inode);
1645 if (d_unhashed(dentry))
1646 d_rehash(dentry);
1647 }
1641 return new; 1648 return new;
1642} 1649}
1643EXPORT_SYMBOL(d_splice_alias); 1650EXPORT_SYMBOL(d_splice_alias);
@@ -1723,7 +1730,7 @@ EXPORT_SYMBOL(d_add_ci);
1723 * Do the slow-case of the dentry name compare. 1730 * Do the slow-case of the dentry name compare.
1724 * 1731 *
1725 * Unlike the dentry_cmp() function, we need to atomically 1732 * Unlike the dentry_cmp() function, we need to atomically
1726 * load the name, length and inode information, so that the 1733 * load the name and length information, so that the
1727 * filesystem can rely on them, and can use the 'name' and 1734 * filesystem can rely on them, and can use the 'name' and
1728 * 'len' information without worrying about walking off the 1735 * 'len' information without worrying about walking off the
1729 * end of memory etc. 1736 * end of memory etc.
@@ -1741,22 +1748,18 @@ enum slow_d_compare {
1741 1748
1742static noinline enum slow_d_compare slow_dentry_cmp( 1749static noinline enum slow_d_compare slow_dentry_cmp(
1743 const struct dentry *parent, 1750 const struct dentry *parent,
1744 struct inode *inode,
1745 struct dentry *dentry, 1751 struct dentry *dentry,
1746 unsigned int seq, 1752 unsigned int seq,
1747 const struct qstr *name) 1753 const struct qstr *name)
1748{ 1754{
1749 int tlen = dentry->d_name.len; 1755 int tlen = dentry->d_name.len;
1750 const char *tname = dentry->d_name.name; 1756 const char *tname = dentry->d_name.name;
1751 struct inode *i = dentry->d_inode;
1752 1757
1753 if (read_seqcount_retry(&dentry->d_seq, seq)) { 1758 if (read_seqcount_retry(&dentry->d_seq, seq)) {
1754 cpu_relax(); 1759 cpu_relax();
1755 return D_COMP_SEQRETRY; 1760 return D_COMP_SEQRETRY;
1756 } 1761 }
1757 if (parent->d_op->d_compare(parent, inode, 1762 if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
1758 dentry, i,
1759 tlen, tname, name))
1760 return D_COMP_NOMATCH; 1763 return D_COMP_NOMATCH;
1761 return D_COMP_OK; 1764 return D_COMP_OK;
1762} 1765}
@@ -1766,7 +1769,6 @@ static noinline enum slow_d_compare slow_dentry_cmp(
1766 * @parent: parent dentry 1769 * @parent: parent dentry
1767 * @name: qstr of name we wish to find 1770 * @name: qstr of name we wish to find
1768 * @seqp: returns d_seq value at the point where the dentry was found 1771 * @seqp: returns d_seq value at the point where the dentry was found
1769 * @inode: returns dentry->d_inode when the inode was found valid.
1770 * Returns: dentry, or NULL 1772 * Returns: dentry, or NULL
1771 * 1773 *
1772 * __d_lookup_rcu is the dcache lookup function for rcu-walk name 1774 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
@@ -1793,7 +1795,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
1793 */ 1795 */
1794struct dentry *__d_lookup_rcu(const struct dentry *parent, 1796struct dentry *__d_lookup_rcu(const struct dentry *parent,
1795 const struct qstr *name, 1797 const struct qstr *name,
1796 unsigned *seqp, struct inode *inode) 1798 unsigned *seqp)
1797{ 1799{
1798 u64 hashlen = name->hash_len; 1800 u64 hashlen = name->hash_len;
1799 const unsigned char *str = name->name; 1801 const unsigned char *str = name->name;
@@ -1827,11 +1829,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
1827seqretry: 1829seqretry:
1828 /* 1830 /*
1829 * The dentry sequence count protects us from concurrent 1831 * The dentry sequence count protects us from concurrent
1830 * renames, and thus protects inode, parent and name fields. 1832 * renames, and thus protects parent and name fields.
1831 * 1833 *
1832 * The caller must perform a seqcount check in order 1834 * The caller must perform a seqcount check in order
1833 * to do anything useful with the returned dentry, 1835 * to do anything useful with the returned dentry.
1834 * including using the 'd_inode' pointer.
1835 * 1836 *
1836 * NOTE! We do a "raw" seqcount_begin here. That means that 1837 * NOTE! We do a "raw" seqcount_begin here. That means that
1837 * we don't wait for the sequence count to stabilize if it 1838 * we don't wait for the sequence count to stabilize if it
@@ -1845,12 +1846,12 @@ seqretry:
1845 continue; 1846 continue;
1846 if (d_unhashed(dentry)) 1847 if (d_unhashed(dentry))
1847 continue; 1848 continue;
1848 *seqp = seq;
1849 1849
1850 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { 1850 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
1851 if (dentry->d_name.hash != hashlen_hash(hashlen)) 1851 if (dentry->d_name.hash != hashlen_hash(hashlen))
1852 continue; 1852 continue;
1853 switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) { 1853 *seqp = seq;
1854 switch (slow_dentry_cmp(parent, dentry, seq, name)) {
1854 case D_COMP_OK: 1855 case D_COMP_OK:
1855 return dentry; 1856 return dentry;
1856 case D_COMP_NOMATCH: 1857 case D_COMP_NOMATCH:
@@ -1862,6 +1863,7 @@ seqretry:
1862 1863
1863 if (dentry->d_name.hash_len != hashlen) 1864 if (dentry->d_name.hash_len != hashlen)
1864 continue; 1865 continue;
1866 *seqp = seq;
1865 if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) 1867 if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
1866 return dentry; 1868 return dentry;
1867 } 1869 }
@@ -1959,9 +1961,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
1959 if (parent->d_flags & DCACHE_OP_COMPARE) { 1961 if (parent->d_flags & DCACHE_OP_COMPARE) {
1960 int tlen = dentry->d_name.len; 1962 int tlen = dentry->d_name.len;
1961 const char *tname = dentry->d_name.name; 1963 const char *tname = dentry->d_name.name;
1962 if (parent->d_op->d_compare(parent, parent->d_inode, 1964 if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
1963 dentry, dentry->d_inode,
1964 tlen, tname, name))
1965 goto next; 1965 goto next;
1966 } else { 1966 } else {
1967 if (dentry->d_name.len != len) 1967 if (dentry->d_name.len != len)
@@ -1998,7 +1998,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
1998 */ 1998 */
1999 name->hash = full_name_hash(name->name, name->len); 1999 name->hash = full_name_hash(name->name, name->len);
2000 if (dir->d_flags & DCACHE_OP_HASH) { 2000 if (dir->d_flags & DCACHE_OP_HASH) {
2001 int err = dir->d_op->d_hash(dir, dir->d_inode, name); 2001 int err = dir->d_op->d_hash(dir, name);
2002 if (unlikely(err < 0)) 2002 if (unlikely(err < 0))
2003 return ERR_PTR(err); 2003 return ERR_PTR(err);
2004 } 2004 }
@@ -2968,34 +2968,21 @@ rename_retry:
2968 goto again; 2968 goto again;
2969} 2969}
2970 2970
2971/** 2971void d_tmpfile(struct dentry *dentry, struct inode *inode)
2972 * find_inode_number - check for dentry with name
2973 * @dir: directory to check
2974 * @name: Name to find.
2975 *
2976 * Check whether a dentry already exists for the given name,
2977 * and return the inode number if it has an inode. Otherwise
2978 * 0 is returned.
2979 *
2980 * This routine is used to post-process directory listings for
2981 * filesystems using synthetic inode numbers, and is necessary
2982 * to keep getcwd() working.
2983 */
2984
2985ino_t find_inode_number(struct dentry *dir, struct qstr *name)
2986{ 2972{
2987 struct dentry * dentry; 2973 inode_dec_link_count(inode);
2988 ino_t ino = 0; 2974 BUG_ON(dentry->d_name.name != dentry->d_iname ||
2989 2975 !hlist_unhashed(&dentry->d_alias) ||
2990 dentry = d_hash_and_lookup(dir, name); 2976 !d_unlinked(dentry));
2991 if (!IS_ERR_OR_NULL(dentry)) { 2977 spin_lock(&dentry->d_parent->d_lock);
2992 if (dentry->d_inode) 2978 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
2993 ino = dentry->d_inode->i_ino; 2979 dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
2994 dput(dentry); 2980 (unsigned long long)inode->i_ino);
2995 } 2981 spin_unlock(&dentry->d_lock);
2996 return ino; 2982 spin_unlock(&dentry->d_parent->d_lock);
2983 d_instantiate(dentry, inode);
2997} 2984}
2998EXPORT_SYMBOL(find_inode_number); 2985EXPORT_SYMBOL(d_tmpfile);
2999 2986
3000static __initdata unsigned long dhash_entries; 2987static __initdata unsigned long dhash_entries;
3001static int __init set_dhash_entries(char *str) 2988static int __init set_dhash_entries(char *str)
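
[Note: d_tmpfile(), added above, gives an O_TMPFILE inode a synthetic "#<ino>"
dentry name, drops the link count and instantiates the dentry. A hypothetical
->tmpfile implementation showing where it fits; myfs_new_inode() is assumed.]

    static int myfs_tmpfile(struct inode *dir, struct dentry *dentry,
                            umode_t mode)
    {
        struct inode *inode = myfs_new_inode(dir, mode); /* assumed helper */

        if (IS_ERR(inode))
            return PTR_ERR(inode);
        /* d_tmpfile() decrements i_nlink and names the dentry "#<ino>" */
        d_tmpfile(dentry, inode);
        return 0;
    }
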
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index c5ca6ae5a30c..63146295153b 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -21,6 +21,7 @@
21#include <linux/debugfs.h> 21#include <linux/debugfs.h>
22#include <linux/io.h> 22#include <linux/io.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/atomic.h>
24 25
25static ssize_t default_read_file(struct file *file, char __user *buf, 26static ssize_t default_read_file(struct file *file, char __user *buf,
26 size_t count, loff_t *ppos) 27 size_t count, loff_t *ppos)
@@ -403,6 +404,47 @@ struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
403} 404}
404EXPORT_SYMBOL_GPL(debugfs_create_size_t); 405EXPORT_SYMBOL_GPL(debugfs_create_size_t);
405 406
407static int debugfs_atomic_t_set(void *data, u64 val)
408{
409 atomic_set((atomic_t *)data, val);
410 return 0;
411}
412static int debugfs_atomic_t_get(void *data, u64 *val)
413{
414 *val = atomic_read((atomic_t *)data);
415 return 0;
416}
417DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
418 debugfs_atomic_t_set, "%lld\n");
419DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n");
420DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
421
422/**
423 * debugfs_create_atomic_t - create a debugfs file that is used to read and
424 * write an atomic_t value
425 * @name: a pointer to a string containing the name of the file to create.
426 * @mode: the permission that the file should have
427 * @parent: a pointer to the parent dentry for this file. This should be a
428 * directory dentry if set. If this parameter is %NULL, then the
429 * file will be created in the root of the debugfs filesystem.
 430 * @value: a pointer to the variable that the file should read from and
 431 * write to.
432 */
433struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
434 struct dentry *parent, atomic_t *value)
435{
436 /* if there are no write bits set, make read only */
437 if (!(mode & S_IWUGO))
438 return debugfs_create_file(name, mode, parent, value,
439 &fops_atomic_t_ro);
440 /* if there are no read bits set, make write only */
441 if (!(mode & S_IRUGO))
442 return debugfs_create_file(name, mode, parent, value,
443 &fops_atomic_t_wo);
444
445 return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
446}
447EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
406 448
407static ssize_t read_file_bool(struct file *file, char __user *user_buf, 449static ssize_t read_file_bool(struct file *file, char __user *user_buf,
408 size_t count, loff_t *ppos) 450 size_t count, loff_t *ppos)
@@ -431,6 +473,7 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
431 if (copy_from_user(buf, user_buf, buf_size)) 473 if (copy_from_user(buf, user_buf, buf_size))
432 return -EFAULT; 474 return -EFAULT;
433 475
476 buf[buf_size] = '\0';
434 if (strtobool(buf, &bv) == 0) 477 if (strtobool(buf, &bv) == 0)
435 *val = bv; 478 *val = bv;
436 479
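
fs/debugfs/file.c picks up two fixes: an atomic_t file type whose read-write, read-only, or write-only variant is chosen from the permission bits, and a one-byte NUL terminator in write_file_bool() so strtobool() never reads past the stack buffer. A hedged usage sketch for the new helper; the "retries" counter and init function are invented for illustration:

    #include <linux/atomic.h>
    #include <linux/debugfs.h>

    static atomic_t myfs_retries = ATOMIC_INIT(0);  /* invented counter */

    static int __init myfs_debugfs_init(void)
    {
            /* 0644 has both read and write bits, so debugfs picks the
             * read/write fops_atomic_t; 0444 would get the _ro variant,
             * 0200 the _wo one. */
            if (!debugfs_create_atomic_t("retries", 0644, NULL,
                                         &myfs_retries))
                    return -ENOMEM;
            return 0;
    }
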
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 4888cb3fdef7..c7c83ff0f752 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -533,8 +533,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
533 */ 533 */
534void debugfs_remove_recursive(struct dentry *dentry) 534void debugfs_remove_recursive(struct dentry *dentry)
535{ 535{
536 struct dentry *child; 536 struct dentry *child, *next, *parent;
537 struct dentry *parent;
538 537
539 if (IS_ERR_OR_NULL(dentry)) 538 if (IS_ERR_OR_NULL(dentry))
540 return; 539 return;
@@ -544,61 +543,37 @@ void debugfs_remove_recursive(struct dentry *dentry)
544 return; 543 return;
545 544
546 parent = dentry; 545 parent = dentry;
546 down:
547 mutex_lock(&parent->d_inode->i_mutex); 547 mutex_lock(&parent->d_inode->i_mutex);
548 list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) {
549 if (!debugfs_positive(child))
550 continue;
548 551
549 while (1) { 552 /* perhaps simple_empty(child) makes more sense */
550 /*
551 * When all dentries under "parent" has been removed,
552 * walk up the tree until we reach our starting point.
553 */
554 if (list_empty(&parent->d_subdirs)) {
555 mutex_unlock(&parent->d_inode->i_mutex);
556 if (parent == dentry)
557 break;
558 parent = parent->d_parent;
559 mutex_lock(&parent->d_inode->i_mutex);
560 }
561 child = list_entry(parent->d_subdirs.next, struct dentry,
562 d_u.d_child);
563 next_sibling:
564
565 /*
566 * If "child" isn't empty, walk down the tree and
567 * remove all its descendants first.
568 */
569 if (!list_empty(&child->d_subdirs)) { 553 if (!list_empty(&child->d_subdirs)) {
570 mutex_unlock(&parent->d_inode->i_mutex); 554 mutex_unlock(&parent->d_inode->i_mutex);
571 parent = child; 555 parent = child;
572 mutex_lock(&parent->d_inode->i_mutex); 556 goto down;
573 continue;
574 } 557 }
575 __debugfs_remove(child, parent); 558 up:
576 if (parent->d_subdirs.next == &child->d_u.d_child) { 559 if (!__debugfs_remove(child, parent))
577 /* 560 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
578 * Try the next sibling.
579 */
580 if (child->d_u.d_child.next != &parent->d_subdirs) {
581 child = list_entry(child->d_u.d_child.next,
582 struct dentry,
583 d_u.d_child);
584 goto next_sibling;
585 }
586
587 /*
588 * Avoid infinite loop if we fail to remove
589 * one dentry.
590 */
591 mutex_unlock(&parent->d_inode->i_mutex);
592 break;
593 }
594 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
595 } 561 }
596 562
597 parent = dentry->d_parent; 563 mutex_unlock(&parent->d_inode->i_mutex);
564 child = parent;
565 parent = parent->d_parent;
598 mutex_lock(&parent->d_inode->i_mutex); 566 mutex_lock(&parent->d_inode->i_mutex);
599 __debugfs_remove(dentry, parent); 567
568 if (child != dentry) {
569 next = list_entry(child->d_u.d_child.next, struct dentry,
570 d_u.d_child);
571 goto up;
572 }
573
574 if (!__debugfs_remove(child, parent))
575 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
600 mutex_unlock(&parent->d_inode->i_mutex); 576 mutex_unlock(&parent->d_inode->i_mutex);
601 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
602} 577}
603EXPORT_SYMBOL_GPL(debugfs_remove_recursive); 578EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
604 579
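
debugfs_remove_recursive() trades the old hand-rolled sibling bookkeeping for a plain iterative depth-first walk: descend into the first non-empty child (the down: label), delete leaves, then climb back to the parent and resume after the deleted subtree (up:), dropping one filesystem reference per dentry actually removed. The control flow is easier to see without the dentry locking; a userspace restatement over an assumed child-pointer tree, not kernel code:

    #include <stdlib.h>

    struct node {                           /* assumed tree node, not a dentry */
            struct node *parent;
            struct node *first_child;       /* NULL for a leaf */
            struct node *next_sibling;
    };

    static void remove_recursive(struct node *top)
    {
            struct node *n = top;

            while (n) {
                    if (n->first_child) {   /* "down": find a leaf first */
                            n = n->first_child;
                            continue;
                    }
                    /* n is a leaf: unlink it, then resume at its
                     * successor -- next sibling, else the parent ("up") */
                    struct node *parent = n->parent;
                    struct node *next = n->next_sibling;
                    int done = (n == top);

                    if (parent)
                            parent->first_child = next;
                    free(n);
                    if (done)
                            break;
                    n = next ? next : parent;
            }
    }
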
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 7d58d5b112b5..76feb4b60fa6 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -138,8 +138,9 @@ static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
138static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl, 138static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
139 const char *buf, size_t len) 139 const char *buf, size_t len)
140{ 140{
141 strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN); 141 strlcpy(dlm_config.ci_cluster_name, buf,
142 strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN); 142 sizeof(dlm_config.ci_cluster_name));
143 strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
143 return len; 144 return len;
144} 145}
145 146
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 1b1146670c4b..e223a911a834 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2038,8 +2038,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
2038 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; 2038 b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
2039 if (b == 1) { 2039 if (b == 1) {
2040 int len = receive_extralen(ms); 2040 int len = receive_extralen(ms);
2041 if (len > DLM_RESNAME_MAXLEN) 2041 if (len > r->res_ls->ls_lvblen)
2042 len = DLM_RESNAME_MAXLEN; 2042 len = r->res_ls->ls_lvblen;
2043 memcpy(lkb->lkb_lvbptr, ms->m_extra, len); 2043 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2044 lkb->lkb_lvbseq = ms->m_lvbseq; 2044 lkb->lkb_lvbseq = ms->m_lvbseq;
2045 } 2045 }
@@ -3893,8 +3893,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
3893 if (!lkb->lkb_lvbptr) 3893 if (!lkb->lkb_lvbptr)
3894 return -ENOMEM; 3894 return -ENOMEM;
3895 len = receive_extralen(ms); 3895 len = receive_extralen(ms);
3896 if (len > DLM_RESNAME_MAXLEN) 3896 if (len > ls->ls_lvblen)
3897 len = DLM_RESNAME_MAXLEN; 3897 len = ls->ls_lvblen;
3898 memcpy(lkb->lkb_lvbptr, ms->m_extra, len); 3898 memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
3899 } 3899 }
3900 return 0; 3900 return 0;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 3ca79d3253b9..88556dc0458e 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -883,17 +883,24 @@ int dlm_release_lockspace(void *lockspace, int force)
883void dlm_stop_lockspaces(void) 883void dlm_stop_lockspaces(void)
884{ 884{
885 struct dlm_ls *ls; 885 struct dlm_ls *ls;
886 int count;
886 887
887 restart: 888 restart:
889 count = 0;
888 spin_lock(&lslist_lock); 890 spin_lock(&lslist_lock);
889 list_for_each_entry(ls, &lslist, ls_list) { 891 list_for_each_entry(ls, &lslist, ls_list) {
890 if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) 892 if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) {
893 count++;
891 continue; 894 continue;
895 }
892 spin_unlock(&lslist_lock); 896 spin_unlock(&lslist_lock);
893 log_error(ls, "no userland control daemon, stopping lockspace"); 897 log_error(ls, "no userland control daemon, stopping lockspace");
894 dlm_ls_stop(ls); 898 dlm_ls_stop(ls);
895 goto restart; 899 goto restart;
896 } 900 }
897 spin_unlock(&lslist_lock); 901 spin_unlock(&lslist_lock);
902
903 if (count)
904 log_print("dlm user daemon left %d lockspaces", count);
898} 905}
899 906
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d0ccd2fd79eb..d90909ec6aa6 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -52,7 +52,6 @@
52#include <linux/mutex.h> 52#include <linux/mutex.h>
53#include <linux/sctp.h> 53#include <linux/sctp.h>
54#include <linux/slab.h> 54#include <linux/slab.h>
55#include <linux/sctp.h>
56#include <net/sctp/sctp.h> 55#include <net/sctp/sctp.h>
57#include <net/ipv6.h> 56#include <net/ipv6.h>
58 57
@@ -126,6 +125,7 @@ struct connection {
126 struct connection *othercon; 125 struct connection *othercon;
127 struct work_struct rwork; /* Receive workqueue */ 126 struct work_struct rwork; /* Receive workqueue */
128 struct work_struct swork; /* Send workqueue */ 127 struct work_struct swork; /* Send workqueue */
128 bool try_new_addr;
129}; 129};
130#define sock2con(x) ((struct connection *)(x)->sk_user_data) 130#define sock2con(x) ((struct connection *)(x)->sk_user_data)
131 131
@@ -144,6 +144,7 @@ struct dlm_node_addr {
144 struct list_head list; 144 struct list_head list;
145 int nodeid; 145 int nodeid;
146 int addr_count; 146 int addr_count;
147 int curr_addr_index;
147 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; 148 struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
148}; 149};
149 150
@@ -310,7 +311,7 @@ static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
310} 311}
311 312
312static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, 313static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
313 struct sockaddr *sa_out) 314 struct sockaddr *sa_out, bool try_new_addr)
314{ 315{
315 struct sockaddr_storage sas; 316 struct sockaddr_storage sas;
316 struct dlm_node_addr *na; 317 struct dlm_node_addr *na;
@@ -320,8 +321,16 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out,
320 321
321 spin_lock(&dlm_node_addrs_spin); 322 spin_lock(&dlm_node_addrs_spin);
322 na = find_node_addr(nodeid); 323 na = find_node_addr(nodeid);
323 if (na && na->addr_count) 324 if (na && na->addr_count) {
324 memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage)); 325 if (try_new_addr) {
326 na->curr_addr_index++;
327 if (na->curr_addr_index == na->addr_count)
328 na->curr_addr_index = 0;
329 }
330
 331 memcpy(&sas, na->addr[na->curr_addr_index],
332 sizeof(struct sockaddr_storage));
333 }
325 spin_unlock(&dlm_node_addrs_spin); 334 spin_unlock(&dlm_node_addrs_spin);
326 335
327 if (!na) 336 if (!na)
@@ -353,19 +362,22 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
353{ 362{
354 struct dlm_node_addr *na; 363 struct dlm_node_addr *na;
355 int rv = -EEXIST; 364 int rv = -EEXIST;
365 int addr_i;
356 366
357 spin_lock(&dlm_node_addrs_spin); 367 spin_lock(&dlm_node_addrs_spin);
358 list_for_each_entry(na, &dlm_node_addrs, list) { 368 list_for_each_entry(na, &dlm_node_addrs, list) {
359 if (!na->addr_count) 369 if (!na->addr_count)
360 continue; 370 continue;
361 371
362 if (!addr_compare(na->addr[0], addr)) 372 for (addr_i = 0; addr_i < na->addr_count; addr_i++) {
363 continue; 373 if (addr_compare(na->addr[addr_i], addr)) {
364 374 *nodeid = na->nodeid;
365 *nodeid = na->nodeid; 375 rv = 0;
366 rv = 0; 376 goto unlock;
367 break; 377 }
378 }
368 } 379 }
380unlock:
369 spin_unlock(&dlm_node_addrs_spin); 381 spin_unlock(&dlm_node_addrs_spin);
370 return rv; 382 return rv;
371} 383}
@@ -561,8 +573,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd)
561 573
562static void sctp_init_failed_foreach(struct connection *con) 574static void sctp_init_failed_foreach(struct connection *con)
563{ 575{
576
577 /*
 577	 * Don't try to recover the base con, and handle the race where the
 578	 * other node's assoc init creates an assoc and we get that
 579	 * notification, then we get a notification that our own attempt
 580	 * failed. This happens when we are still trying the primary
582 * address, but the other node has already tried secondary addrs
583 * and found one that worked.
584 */
585 if (!con->nodeid || con->sctp_assoc)
586 return;
587
588 log_print("Retrying SCTP association init for node %d\n", con->nodeid);
589
590 con->try_new_addr = true;
564 con->sctp_assoc = 0; 591 con->sctp_assoc = 0;
565 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { 592 if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) {
566 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) 593 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
567 queue_work(send_workqueue, &con->swork); 594 queue_work(send_workqueue, &con->swork);
568 } 595 }
@@ -579,15 +606,56 @@ static void sctp_init_failed(void)
579 mutex_unlock(&connections_lock); 606 mutex_unlock(&connections_lock);
580} 607}
581 608
609static void retry_failed_sctp_send(struct connection *recv_con,
610 struct sctp_send_failed *sn_send_failed,
611 char *buf)
612{
613 int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed);
614 struct dlm_mhandle *mh;
615 struct connection *con;
616 char *retry_buf;
617 int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
618
619 log_print("Retry sending %d bytes to node id %d", len, nodeid);
620
621 con = nodeid2con(nodeid, 0);
622 if (!con) {
623 log_print("Could not look up con for nodeid %d\n",
624 nodeid);
625 return;
626 }
627
628 mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf);
629 if (!mh) {
630 log_print("Could not allocate buf for retry.");
631 return;
632 }
633 memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len);
634 dlm_lowcomms_commit_buffer(mh);
635
636 /*
 637	 * If we got an assoc changed event before the send failed event, then
 638	 * we only need to retry the send.
639 */
640 if (con->sctp_assoc) {
641 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
642 queue_work(send_workqueue, &con->swork);
643 } else
644 sctp_init_failed_foreach(con);
645}
646
582/* Something happened to an association */ 647/* Something happened to an association */
583static void process_sctp_notification(struct connection *con, 648static void process_sctp_notification(struct connection *con,
584 struct msghdr *msg, char *buf) 649 struct msghdr *msg, char *buf)
585{ 650{
586 union sctp_notification *sn = (union sctp_notification *)buf; 651 union sctp_notification *sn = (union sctp_notification *)buf;
587 652
588 if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) { 653 switch (sn->sn_header.sn_type) {
654 case SCTP_SEND_FAILED:
655 retry_failed_sctp_send(con, &sn->sn_send_failed, buf);
656 break;
657 case SCTP_ASSOC_CHANGE:
589 switch (sn->sn_assoc_change.sac_state) { 658 switch (sn->sn_assoc_change.sac_state) {
590
591 case SCTP_COMM_UP: 659 case SCTP_COMM_UP:
592 case SCTP_RESTART: 660 case SCTP_RESTART:
593 { 661 {
@@ -662,9 +730,11 @@ static void process_sctp_notification(struct connection *con,
662 log_print("connecting to %d sctp association %d", 730 log_print("connecting to %d sctp association %d",
663 nodeid, (int)sn->sn_assoc_change.sac_assoc_id); 731 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
664 732
733 new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id;
734 new_con->try_new_addr = false;
665 /* Send any pending writes */ 735 /* Send any pending writes */
666 clear_bit(CF_CONNECT_PENDING, &new_con->flags); 736 clear_bit(CF_CONNECT_PENDING, &new_con->flags);
667 clear_bit(CF_INIT_PENDING, &con->flags); 737 clear_bit(CF_INIT_PENDING, &new_con->flags);
668 if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { 738 if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) {
669 queue_work(send_workqueue, &new_con->swork); 739 queue_work(send_workqueue, &new_con->swork);
670 } 740 }
@@ -683,14 +753,10 @@ static void process_sctp_notification(struct connection *con,
683 } 753 }
684 break; 754 break;
685 755
686 /* We don't know which INIT failed, so clear the PENDING flags
687 * on them all. if assoc_id is zero then it will then try
688 * again */
689
690 case SCTP_CANT_STR_ASSOC: 756 case SCTP_CANT_STR_ASSOC:
691 { 757 {
758 /* Will retry init when we get the send failed notification */
692 log_print("Can't start SCTP association - retrying"); 759 log_print("Can't start SCTP association - retrying");
693 sctp_init_failed();
694 } 760 }
695 break; 761 break;
696 762
@@ -699,6 +765,8 @@ static void process_sctp_notification(struct connection *con,
699 (int)sn->sn_assoc_change.sac_assoc_id, 765 (int)sn->sn_assoc_change.sac_assoc_id,
700 sn->sn_assoc_change.sac_state); 766 sn->sn_assoc_change.sac_state);
701 } 767 }
768 default:
769 ; /* fall through */
702 } 770 }
703} 771}
704 772
@@ -958,6 +1026,24 @@ static void free_entry(struct writequeue_entry *e)
958 kfree(e); 1026 kfree(e);
959} 1027}
960 1028
1029/*
1030 * writequeue_entry_complete - try to delete and free write queue entry
1031 * @e: write queue entry to try to delete
1032 * @completed: bytes completed
1033 *
1034 * writequeue_lock must be held.
1035 */
1036static void writequeue_entry_complete(struct writequeue_entry *e, int completed)
1037{
1038 e->offset += completed;
1039 e->len -= completed;
1040
1041 if (e->len == 0 && e->users == 0) {
1042 list_del(&e->list);
1043 free_entry(e);
1044 }
1045}
1046
961/* Initiate an SCTP association. 1047/* Initiate an SCTP association.
962 This is a special case of send_to_sock() in that we don't yet have a 1048 This is a special case of send_to_sock() in that we don't yet have a
963 peeled-off socket for this association, so we use the listening socket 1049 peeled-off socket for this association, so we use the listening socket
@@ -977,15 +1063,14 @@ static void sctp_init_assoc(struct connection *con)
977 int addrlen; 1063 int addrlen;
978 struct kvec iov[1]; 1064 struct kvec iov[1];
979 1065
1066 mutex_lock(&con->sock_mutex);
980 if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) 1067 if (test_and_set_bit(CF_INIT_PENDING, &con->flags))
981 return; 1068 goto unlock;
982
983 if (con->retries++ > MAX_CONNECT_RETRIES)
984 return;
985 1069
986 if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) { 1070 if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr,
1071 con->try_new_addr)) {
987 log_print("no address for nodeid %d", con->nodeid); 1072 log_print("no address for nodeid %d", con->nodeid);
988 return; 1073 goto unlock;
989 } 1074 }
990 base_con = nodeid2con(0, 0); 1075 base_con = nodeid2con(0, 0);
991 BUG_ON(base_con == NULL); 1076 BUG_ON(base_con == NULL);
@@ -1003,17 +1088,25 @@ static void sctp_init_assoc(struct connection *con)
1003 if (list_empty(&con->writequeue)) { 1088 if (list_empty(&con->writequeue)) {
1004 spin_unlock(&con->writequeue_lock); 1089 spin_unlock(&con->writequeue_lock);
1005 log_print("writequeue empty for nodeid %d", con->nodeid); 1090 log_print("writequeue empty for nodeid %d", con->nodeid);
1006 return; 1091 goto unlock;
1007 } 1092 }
1008 1093
1009 e = list_first_entry(&con->writequeue, struct writequeue_entry, list); 1094 e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
1010 len = e->len; 1095 len = e->len;
1011 offset = e->offset; 1096 offset = e->offset;
1012 spin_unlock(&con->writequeue_lock);
1013 1097
1014 /* Send the first block off the write queue */ 1098 /* Send the first block off the write queue */
1015 iov[0].iov_base = page_address(e->page)+offset; 1099 iov[0].iov_base = page_address(e->page)+offset;
1016 iov[0].iov_len = len; 1100 iov[0].iov_len = len;
1101 spin_unlock(&con->writequeue_lock);
1102
1103 if (rem_addr.ss_family == AF_INET) {
1104 struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr;
1105 log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr);
1106 } else {
1107 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr;
1108 log_print("Trying to connect to %pI6", &sin6->sin6_addr);
1109 }
1017 1110
1018 cmsg = CMSG_FIRSTHDR(&outmessage); 1111 cmsg = CMSG_FIRSTHDR(&outmessage);
1019 cmsg->cmsg_level = IPPROTO_SCTP; 1112 cmsg->cmsg_level = IPPROTO_SCTP;
@@ -1021,8 +1114,9 @@ static void sctp_init_assoc(struct connection *con)
1021 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); 1114 cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
1022 sinfo = CMSG_DATA(cmsg); 1115 sinfo = CMSG_DATA(cmsg);
1023 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); 1116 memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
1024 sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid()); 1117 sinfo->sinfo_ppid = cpu_to_le32(con->nodeid);
1025 outmessage.msg_controllen = cmsg->cmsg_len; 1118 outmessage.msg_controllen = cmsg->cmsg_len;
1119 sinfo->sinfo_flags |= SCTP_ADDR_OVER;
1026 1120
1027 ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); 1121 ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len);
1028 if (ret < 0) { 1122 if (ret < 0) {
@@ -1035,15 +1129,12 @@ static void sctp_init_assoc(struct connection *con)
1035 } 1129 }
1036 else { 1130 else {
1037 spin_lock(&con->writequeue_lock); 1131 spin_lock(&con->writequeue_lock);
1038 e->offset += ret; 1132 writequeue_entry_complete(e, ret);
1039 e->len -= ret;
1040
1041 if (e->len == 0 && e->users == 0) {
1042 list_del(&e->list);
1043 free_entry(e);
1044 }
1045 spin_unlock(&con->writequeue_lock); 1133 spin_unlock(&con->writequeue_lock);
1046 } 1134 }
1135
1136unlock:
1137 mutex_unlock(&con->sock_mutex);
1047} 1138}
1048 1139
1049/* Connect a new socket to its peer */ 1140/* Connect a new socket to its peer */
@@ -1075,7 +1166,7 @@ static void tcp_connect_to_sock(struct connection *con)
1075 goto out_err; 1166 goto out_err;
1076 1167
1077 memset(&saddr, 0, sizeof(saddr)); 1168 memset(&saddr, 0, sizeof(saddr));
1078 result = nodeid_to_addr(con->nodeid, &saddr, NULL); 1169 result = nodeid_to_addr(con->nodeid, &saddr, NULL, false);
1079 if (result < 0) { 1170 if (result < 0) {
1080 log_print("no address for nodeid %d", con->nodeid); 1171 log_print("no address for nodeid %d", con->nodeid);
1081 goto out_err; 1172 goto out_err;
@@ -1254,6 +1345,7 @@ static int sctp_listen_for_all(void)
1254 int result = -EINVAL, num = 1, i, addr_len; 1345 int result = -EINVAL, num = 1, i, addr_len;
1255 struct connection *con = nodeid2con(0, GFP_NOFS); 1346 struct connection *con = nodeid2con(0, GFP_NOFS);
1256 int bufsize = NEEDED_RMEM; 1347 int bufsize = NEEDED_RMEM;
1348 int one = 1;
1257 1349
1258 if (!con) 1350 if (!con)
1259 return -ENOMEM; 1351 return -ENOMEM;
@@ -1288,6 +1380,11 @@ static int sctp_listen_for_all(void)
1288 goto create_delsock; 1380 goto create_delsock;
1289 } 1381 }
1290 1382
1383 result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one,
1384 sizeof(one));
1385 if (result < 0)
1386 log_print("Could not set SCTP NODELAY error %d\n", result);
1387
1291 /* Init con struct */ 1388 /* Init con struct */
1292 sock->sk->sk_user_data = con; 1389 sock->sk->sk_user_data = con;
1293 con->sock = sock; 1390 con->sock = sock;
@@ -1493,13 +1590,7 @@ static void send_to_sock(struct connection *con)
1493 } 1590 }
1494 1591
1495 spin_lock(&con->writequeue_lock); 1592 spin_lock(&con->writequeue_lock);
1496 e->offset += ret; 1593 writequeue_entry_complete(e, ret);
1497 e->len -= ret;
1498
1499 if (e->len == 0 && e->users == 0) {
1500 list_del(&e->list);
1501 free_entry(e);
1502 }
1503 } 1594 }
1504 spin_unlock(&con->writequeue_lock); 1595 spin_unlock(&con->writequeue_lock);
1505out: 1596out:
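
The bulk of the lowcomms diff teaches the SCTP transport to fail over between all of a node's configured addresses instead of pinning the first one: nodeid_to_addr() grows a per-node curr_addr_index that is advanced round-robin when the caller sets try_new_addr (after a failed init), addr_to_nodeid() now matches against every address, and sinfo_flags gets SCTP_ADDR_OVER so the chosen address is honoured. The index advance reduces to a few lines; a standalone restatement, with the dlm_node_addrs_spin locking from the patch omitted:

    #include <stdbool.h>
    #include <stddef.h>
    #include <sys/socket.h>

    #define MAX_ADDR_COUNT 16       /* the patch uses DLM_MAX_ADDR_COUNT */

    struct node_addrs {
            int addr_count;
            int curr_addr_index;
            struct sockaddr_storage *addr[MAX_ADDR_COUNT];
    };

    static struct sockaddr_storage *pick_addr(struct node_addrs *na,
                                              bool try_new_addr)
    {
            if (!na->addr_count)
                    return NULL;
            /* Only rotate when asked to; repeated sends to a healthy
             * peer keep using the address that last worked. */
            if (try_new_addr && ++na->curr_addr_index == na->addr_count)
                    na->curr_addr_index = 0;
            return na->addr[na->curr_addr_index];
    }
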
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 911649a47dd5..812149119fa3 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -686,7 +686,6 @@ static int device_close(struct inode *inode, struct file *file)
686 device_remove_lockspace() */ 686 device_remove_lockspace() */
687 687
688 sigprocmask(SIG_SETMASK, &tmpsig, NULL); 688 sigprocmask(SIG_SETMASK, &tmpsig, NULL);
689 recalc_sigpending();
690 689
691 return 0; 690 return 0;
692} 691}
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index f71ec125290d..d10757635b9c 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -37,16 +37,8 @@
37#include <asm/unaligned.h> 37#include <asm/unaligned.h>
38#include "ecryptfs_kernel.h" 38#include "ecryptfs_kernel.h"
39 39
40static int 40#define DECRYPT 0
41ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, 41#define ENCRYPT 1
42 struct page *dst_page, int dst_offset,
43 struct page *src_page, int src_offset, int size,
44 unsigned char *iv);
45static int
46ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
47 struct page *dst_page, int dst_offset,
48 struct page *src_page, int src_offset, int size,
49 unsigned char *iv);
50 42
51/** 43/**
52 * ecryptfs_to_hex 44 * ecryptfs_to_hex
@@ -336,19 +328,20 @@ static void extent_crypt_complete(struct crypto_async_request *req, int rc)
336} 328}
337 329
338/** 330/**
339 * encrypt_scatterlist 331 * crypt_scatterlist
340 * @crypt_stat: Pointer to the crypt_stat struct to initialize. 332 * @crypt_stat: Pointer to the crypt_stat struct to initialize.
341 * @dest_sg: Destination of encrypted data 333 * @dst_sg: Destination of the data after performing the crypto operation
342 * @src_sg: Data to be encrypted 334 * @src_sg: Data to be encrypted or decrypted
343 * @size: Length of data to be encrypted 335 * @size: Length of data
344 * @iv: iv to use during encryption 336 * @iv: IV to use
337 * @op: ENCRYPT or DECRYPT to indicate the desired operation
345 * 338 *
346 * Returns the number of bytes encrypted; negative value on error 339 * Returns the number of bytes encrypted or decrypted; negative value on error
347 */ 340 */
348static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, 341static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
349 struct scatterlist *dest_sg, 342 struct scatterlist *dst_sg,
350 struct scatterlist *src_sg, int size, 343 struct scatterlist *src_sg, int size,
351 unsigned char *iv) 344 unsigned char *iv, int op)
352{ 345{
353 struct ablkcipher_request *req = NULL; 346 struct ablkcipher_request *req = NULL;
354 struct extent_crypt_result ecr; 347 struct extent_crypt_result ecr;
@@ -391,9 +384,9 @@ static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
391 crypt_stat->flags |= ECRYPTFS_KEY_SET; 384 crypt_stat->flags |= ECRYPTFS_KEY_SET;
392 } 385 }
393 mutex_unlock(&crypt_stat->cs_tfm_mutex); 386 mutex_unlock(&crypt_stat->cs_tfm_mutex);
394 ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size); 387 ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
395 ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv); 388 rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) :
396 rc = crypto_ablkcipher_encrypt(req); 389 crypto_ablkcipher_decrypt(req);
397 if (rc == -EINPROGRESS || rc == -EBUSY) { 390 if (rc == -EINPROGRESS || rc == -EBUSY) {
398 struct extent_crypt_result *ecr = req->base.data; 391 struct extent_crypt_result *ecr = req->base.data;
399 392
@@ -407,41 +400,43 @@ out:
407} 400}
408 401
409/** 402/**
410 * ecryptfs_lower_offset_for_extent 403 * lower_offset_for_page
411 * 404 *
412 * Convert an eCryptfs page index into a lower byte offset 405 * Convert an eCryptfs page index into a lower byte offset
413 */ 406 */
414static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, 407static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
415 struct ecryptfs_crypt_stat *crypt_stat) 408 struct page *page)
416{ 409{
417 (*offset) = ecryptfs_lower_header_size(crypt_stat) 410 return ecryptfs_lower_header_size(crypt_stat) +
418 + (crypt_stat->extent_size * extent_num); 411 (page->index << PAGE_CACHE_SHIFT);
419} 412}
420 413
421/** 414/**
422 * ecryptfs_encrypt_extent 415 * crypt_extent
423 * @enc_extent_page: Allocated page into which to encrypt the data in
424 * @page
425 * @crypt_stat: crypt_stat containing cryptographic context for the 416 * @crypt_stat: crypt_stat containing cryptographic context for the
426 * encryption operation 417 * encryption operation
427 * @page: Page containing plaintext data extent to encrypt 418 * @dst_page: The page to write the result into
419 * @src_page: The page to read from
428 * @extent_offset: Page extent offset for use in generating IV 420 * @extent_offset: Page extent offset for use in generating IV
421 * @op: ENCRYPT or DECRYPT to indicate the desired operation
429 * 422 *
430 * Encrypts one extent of data. 423 * Encrypts or decrypts one extent of data.
431 * 424 *
432 * Return zero on success; non-zero otherwise 425 * Return zero on success; non-zero otherwise
433 */ 426 */
434static int ecryptfs_encrypt_extent(struct page *enc_extent_page, 427static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
435 struct ecryptfs_crypt_stat *crypt_stat, 428 struct page *dst_page,
436 struct page *page, 429 struct page *src_page,
437 unsigned long extent_offset) 430 unsigned long extent_offset, int op)
438{ 431{
432 pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index;
439 loff_t extent_base; 433 loff_t extent_base;
440 char extent_iv[ECRYPTFS_MAX_IV_BYTES]; 434 char extent_iv[ECRYPTFS_MAX_IV_BYTES];
435 struct scatterlist src_sg, dst_sg;
436 size_t extent_size = crypt_stat->extent_size;
441 int rc; 437 int rc;
442 438
443 extent_base = (((loff_t)page->index) 439 extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size));
444 * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
445 rc = ecryptfs_derive_iv(extent_iv, crypt_stat, 440 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
446 (extent_base + extent_offset)); 441 (extent_base + extent_offset));
447 if (rc) { 442 if (rc) {
@@ -450,15 +445,21 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
450 (unsigned long long)(extent_base + extent_offset), rc); 445 (unsigned long long)(extent_base + extent_offset), rc);
451 goto out; 446 goto out;
452 } 447 }
453 rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0, 448
454 page, (extent_offset 449 sg_init_table(&src_sg, 1);
455 * crypt_stat->extent_size), 450 sg_init_table(&dst_sg, 1);
456 crypt_stat->extent_size, extent_iv); 451
452 sg_set_page(&src_sg, src_page, extent_size,
453 extent_offset * extent_size);
454 sg_set_page(&dst_sg, dst_page, extent_size,
455 extent_offset * extent_size);
456
457 rc = crypt_scatterlist(crypt_stat, &dst_sg, &src_sg, extent_size,
458 extent_iv, op);
457 if (rc < 0) { 459 if (rc < 0) {
458 printk(KERN_ERR "%s: Error attempting to encrypt page with " 460 printk(KERN_ERR "%s: Error attempting to crypt page with "
459 "page->index = [%ld], extent_offset = [%ld]; " 461 "page_index = [%ld], extent_offset = [%ld]; "
460 "rc = [%d]\n", __func__, page->index, extent_offset, 462 "rc = [%d]\n", __func__, page_index, extent_offset, rc);
461 rc);
462 goto out; 463 goto out;
463 } 464 }
464 rc = 0; 465 rc = 0;
@@ -489,6 +490,7 @@ int ecryptfs_encrypt_page(struct page *page)
489 char *enc_extent_virt; 490 char *enc_extent_virt;
490 struct page *enc_extent_page = NULL; 491 struct page *enc_extent_page = NULL;
491 loff_t extent_offset; 492 loff_t extent_offset;
493 loff_t lower_offset;
492 int rc = 0; 494 int rc = 0;
493 495
494 ecryptfs_inode = page->mapping->host; 496 ecryptfs_inode = page->mapping->host;
@@ -502,75 +504,35 @@ int ecryptfs_encrypt_page(struct page *page)
502 "encrypted extent\n"); 504 "encrypted extent\n");
503 goto out; 505 goto out;
504 } 506 }
505 enc_extent_virt = kmap(enc_extent_page); 507
506 for (extent_offset = 0; 508 for (extent_offset = 0;
507 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); 509 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
508 extent_offset++) { 510 extent_offset++) {
509 loff_t offset; 511 rc = crypt_extent(crypt_stat, enc_extent_page, page,
510 512 extent_offset, ENCRYPT);
511 rc = ecryptfs_encrypt_extent(enc_extent_page, crypt_stat, page,
512 extent_offset);
513 if (rc) { 513 if (rc) {
514 printk(KERN_ERR "%s: Error encrypting extent; " 514 printk(KERN_ERR "%s: Error encrypting extent; "
515 "rc = [%d]\n", __func__, rc); 515 "rc = [%d]\n", __func__, rc);
516 goto out; 516 goto out;
517 } 517 }
518 ecryptfs_lower_offset_for_extent(
519 &offset, ((((loff_t)page->index)
520 * (PAGE_CACHE_SIZE
521 / crypt_stat->extent_size))
522 + extent_offset), crypt_stat);
523 rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt,
524 offset, crypt_stat->extent_size);
525 if (rc < 0) {
526 ecryptfs_printk(KERN_ERR, "Error attempting "
527 "to write lower page; rc = [%d]"
528 "\n", rc);
529 goto out;
530 }
531 }
532 rc = 0;
533out:
534 if (enc_extent_page) {
535 kunmap(enc_extent_page);
536 __free_page(enc_extent_page);
537 } 518 }
538 return rc;
539}
540 519
541static int ecryptfs_decrypt_extent(struct page *page, 520 lower_offset = lower_offset_for_page(crypt_stat, page);
542 struct ecryptfs_crypt_stat *crypt_stat, 521 enc_extent_virt = kmap(enc_extent_page);
543 struct page *enc_extent_page, 522 rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
544 unsigned long extent_offset) 523 PAGE_CACHE_SIZE);
545{ 524 kunmap(enc_extent_page);
546 loff_t extent_base;
547 char extent_iv[ECRYPTFS_MAX_IV_BYTES];
548 int rc;
549
550 extent_base = (((loff_t)page->index)
551 * (PAGE_CACHE_SIZE / crypt_stat->extent_size));
552 rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
553 (extent_base + extent_offset));
554 if (rc) {
555 ecryptfs_printk(KERN_ERR, "Error attempting to derive IV for "
556 "extent [0x%.16llx]; rc = [%d]\n",
557 (unsigned long long)(extent_base + extent_offset), rc);
558 goto out;
559 }
560 rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
561 (extent_offset
562 * crypt_stat->extent_size),
563 enc_extent_page, 0,
564 crypt_stat->extent_size, extent_iv);
565 if (rc < 0) { 525 if (rc < 0) {
566 printk(KERN_ERR "%s: Error attempting to decrypt to page with " 526 ecryptfs_printk(KERN_ERR,
567 "page->index = [%ld], extent_offset = [%ld]; " 527 "Error attempting to write lower page; rc = [%d]\n",
568 "rc = [%d]\n", __func__, page->index, extent_offset, 528 rc);
569 rc);
570 goto out; 529 goto out;
571 } 530 }
572 rc = 0; 531 rc = 0;
573out: 532out:
533 if (enc_extent_page) {
534 __free_page(enc_extent_page);
535 }
574 return rc; 536 return rc;
575} 537}
576 538
@@ -594,43 +556,33 @@ int ecryptfs_decrypt_page(struct page *page)
594{ 556{
595 struct inode *ecryptfs_inode; 557 struct inode *ecryptfs_inode;
596 struct ecryptfs_crypt_stat *crypt_stat; 558 struct ecryptfs_crypt_stat *crypt_stat;
597 char *enc_extent_virt; 559 char *page_virt;
598 struct page *enc_extent_page = NULL;
599 unsigned long extent_offset; 560 unsigned long extent_offset;
561 loff_t lower_offset;
600 int rc = 0; 562 int rc = 0;
601 563
602 ecryptfs_inode = page->mapping->host; 564 ecryptfs_inode = page->mapping->host;
603 crypt_stat = 565 crypt_stat =
604 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 566 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
605 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); 567 BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
606 enc_extent_page = alloc_page(GFP_USER); 568
607 if (!enc_extent_page) { 569 lower_offset = lower_offset_for_page(crypt_stat, page);
608 rc = -ENOMEM; 570 page_virt = kmap(page);
609 ecryptfs_printk(KERN_ERR, "Error allocating memory for " 571 rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE,
610 "encrypted extent\n"); 572 ecryptfs_inode);
573 kunmap(page);
574 if (rc < 0) {
575 ecryptfs_printk(KERN_ERR,
576 "Error attempting to read lower page; rc = [%d]\n",
577 rc);
611 goto out; 578 goto out;
612 } 579 }
613 enc_extent_virt = kmap(enc_extent_page); 580
614 for (extent_offset = 0; 581 for (extent_offset = 0;
615 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size); 582 extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
616 extent_offset++) { 583 extent_offset++) {
617 loff_t offset; 584 rc = crypt_extent(crypt_stat, page, page,
618 585 extent_offset, DECRYPT);
619 ecryptfs_lower_offset_for_extent(
620 &offset, ((page->index * (PAGE_CACHE_SIZE
621 / crypt_stat->extent_size))
622 + extent_offset), crypt_stat);
623 rc = ecryptfs_read_lower(enc_extent_virt, offset,
624 crypt_stat->extent_size,
625 ecryptfs_inode);
626 if (rc < 0) {
627 ecryptfs_printk(KERN_ERR, "Error attempting "
628 "to read lower page; rc = [%d]"
629 "\n", rc);
630 goto out;
631 }
632 rc = ecryptfs_decrypt_extent(page, crypt_stat, enc_extent_page,
633 extent_offset);
634 if (rc) { 586 if (rc) {
 635 printk(KERN_ERR "%s: Error decrypting extent; " 587 printk(KERN_ERR "%s: Error decrypting extent; "
636 "rc = [%d]\n", __func__, rc); 588 "rc = [%d]\n", __func__, rc);
@@ -638,142 +590,9 @@ int ecryptfs_decrypt_page(struct page *page)
638 } 590 }
639 } 591 }
640out: 592out:
641 if (enc_extent_page) {
642 kunmap(enc_extent_page);
643 __free_page(enc_extent_page);
644 }
645 return rc; 593 return rc;
646} 594}
647 595
648/**
649 * decrypt_scatterlist
650 * @crypt_stat: Cryptographic context
651 * @dest_sg: The destination scatterlist to decrypt into
652 * @src_sg: The source scatterlist to decrypt from
653 * @size: The number of bytes to decrypt
654 * @iv: The initialization vector to use for the decryption
655 *
656 * Returns the number of bytes decrypted; negative value on error
657 */
658static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
659 struct scatterlist *dest_sg,
660 struct scatterlist *src_sg, int size,
661 unsigned char *iv)
662{
663 struct ablkcipher_request *req = NULL;
664 struct extent_crypt_result ecr;
665 int rc = 0;
666
667 BUG_ON(!crypt_stat || !crypt_stat->tfm
668 || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
669 if (unlikely(ecryptfs_verbosity > 0)) {
670 ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
671 crypt_stat->key_size);
672 ecryptfs_dump_hex(crypt_stat->key,
673 crypt_stat->key_size);
674 }
675
676 init_completion(&ecr.completion);
677
678 mutex_lock(&crypt_stat->cs_tfm_mutex);
679 req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
680 if (!req) {
681 mutex_unlock(&crypt_stat->cs_tfm_mutex);
682 rc = -ENOMEM;
683 goto out;
684 }
685
686 ablkcipher_request_set_callback(req,
687 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
688 extent_crypt_complete, &ecr);
689 /* Consider doing this once, when the file is opened */
690 if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
691 rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
692 crypt_stat->key_size);
693 if (rc) {
694 ecryptfs_printk(KERN_ERR,
695 "Error setting key; rc = [%d]\n",
696 rc);
697 mutex_unlock(&crypt_stat->cs_tfm_mutex);
698 rc = -EINVAL;
699 goto out;
700 }
701 crypt_stat->flags |= ECRYPTFS_KEY_SET;
702 }
703 mutex_unlock(&crypt_stat->cs_tfm_mutex);
704 ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
705 ablkcipher_request_set_crypt(req, src_sg, dest_sg, size, iv);
706 rc = crypto_ablkcipher_decrypt(req);
707 if (rc == -EINPROGRESS || rc == -EBUSY) {
708 struct extent_crypt_result *ecr = req->base.data;
709
710 wait_for_completion(&ecr->completion);
711 rc = ecr->rc;
712 INIT_COMPLETION(ecr->completion);
713 }
714out:
715 ablkcipher_request_free(req);
716 return rc;
717
718}
719
720/**
721 * ecryptfs_encrypt_page_offset
722 * @crypt_stat: The cryptographic context
723 * @dst_page: The page to encrypt into
724 * @dst_offset: The offset in the page to encrypt into
725 * @src_page: The page to encrypt from
726 * @src_offset: The offset in the page to encrypt from
727 * @size: The number of bytes to encrypt
728 * @iv: The initialization vector to use for the encryption
729 *
730 * Returns the number of bytes encrypted
731 */
732static int
733ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
734 struct page *dst_page, int dst_offset,
735 struct page *src_page, int src_offset, int size,
736 unsigned char *iv)
737{
738 struct scatterlist src_sg, dst_sg;
739
740 sg_init_table(&src_sg, 1);
741 sg_init_table(&dst_sg, 1);
742
743 sg_set_page(&src_sg, src_page, size, src_offset);
744 sg_set_page(&dst_sg, dst_page, size, dst_offset);
745 return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
746}
747
748/**
749 * ecryptfs_decrypt_page_offset
750 * @crypt_stat: The cryptographic context
751 * @dst_page: The page to decrypt into
752 * @dst_offset: The offset in the page to decrypt into
753 * @src_page: The page to decrypt from
754 * @src_offset: The offset in the page to decrypt from
755 * @size: The number of bytes to decrypt
756 * @iv: The initialization vector to use for the decryption
757 *
758 * Returns the number of bytes decrypted
759 */
760static int
761ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
762 struct page *dst_page, int dst_offset,
763 struct page *src_page, int src_offset, int size,
764 unsigned char *iv)
765{
766 struct scatterlist src_sg, dst_sg;
767
768 sg_init_table(&src_sg, 1);
769 sg_set_page(&src_sg, src_page, size, src_offset);
770
771 sg_init_table(&dst_sg, 1);
772 sg_set_page(&dst_sg, dst_page, size, dst_offset);
773
774 return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
775}
776
777#define ECRYPTFS_MAX_SCATTERLIST_LEN 4 596#define ECRYPTFS_MAX_SCATTERLIST_LEN 4
778 597
779/** 598/**
@@ -2243,12 +2062,11 @@ out:
2243 */ 2062 */
2244int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, 2063int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
2245 size_t *plaintext_name_size, 2064 size_t *plaintext_name_size,
2246 struct dentry *ecryptfs_dir_dentry, 2065 struct super_block *sb,
2247 const char *name, size_t name_size) 2066 const char *name, size_t name_size)
2248{ 2067{
2249 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 2068 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2250 &ecryptfs_superblock_to_private( 2069 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
2251 ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
2252 char *decoded_name; 2070 char *decoded_name;
2253 size_t decoded_name_size; 2071 size_t decoded_name_size;
2254 size_t packet_size; 2072 size_t packet_size;
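
The eCryptfs rewrite removes roughly 180 lines by collapsing encrypt_scatterlist()/decrypt_scatterlist() and the two *_page_offset() wrappers into a single crypt_scatterlist()/crypt_extent() pair parameterized by an ENCRYPT/DECRYPT flag, and by reading and writing the lower file a whole page at a time instead of one extent at a time. The direction-flag pattern is the reusable idea; reduced to its core, with the request allocation, key install, and IV derivation shared by both paths elided:

    #define DECRYPT 0
    #define ENCRYPT 1

    /* One helper, one flag: the setup that used to be duplicated is
     * written once, and only the final call differs. A sketch of the
     * shape, not the full kernel function. */
    static int crypt_scatterlist(struct ablkcipher_request *req,
                                 struct scatterlist *dst_sg,
                                 struct scatterlist *src_sg,
                                 int size, unsigned char *iv, int op)
    {
            ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
            return op == ENCRYPT ? crypto_ablkcipher_encrypt(req) :
                                   crypto_ablkcipher_decrypt(req);
    }
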
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index f622a733f7ad..df19d34a033b 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -575,7 +575,7 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
575 struct inode *ecryptfs_inode); 575 struct inode *ecryptfs_inode);
576int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 576int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
577 size_t *decrypted_name_size, 577 size_t *decrypted_name_size,
578 struct dentry *ecryptfs_dentry, 578 struct super_block *sb,
579 const char *name, size_t name_size); 579 const char *name, size_t name_size);
580int ecryptfs_fill_zeros(struct file *file, loff_t new_length); 580int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
581int ecryptfs_encrypt_and_encode_filename( 581int ecryptfs_encrypt_and_encode_filename(
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index a7abbea2c096..992cf95830b5 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -49,7 +49,7 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
49 unsigned long nr_segs, loff_t pos) 49 unsigned long nr_segs, loff_t pos)
50{ 50{
51 ssize_t rc; 51 ssize_t rc;
52 struct path lower; 52 struct path *path;
53 struct file *file = iocb->ki_filp; 53 struct file *file = iocb->ki_filp;
54 54
55 rc = generic_file_aio_read(iocb, iov, nr_segs, pos); 55 rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
@@ -60,17 +60,16 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
60 if (-EIOCBQUEUED == rc) 60 if (-EIOCBQUEUED == rc)
61 rc = wait_on_sync_kiocb(iocb); 61 rc = wait_on_sync_kiocb(iocb);
62 if (rc >= 0) { 62 if (rc >= 0) {
63 lower.dentry = ecryptfs_dentry_to_lower(file->f_path.dentry); 63 path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
64 lower.mnt = ecryptfs_dentry_to_lower_mnt(file->f_path.dentry); 64 touch_atime(path);
65 touch_atime(&lower);
66 } 65 }
67 return rc; 66 return rc;
68} 67}
69 68
70struct ecryptfs_getdents_callback { 69struct ecryptfs_getdents_callback {
71 void *dirent; 70 struct dir_context ctx;
72 struct dentry *dentry; 71 struct dir_context *caller;
73 filldir_t filldir; 72 struct super_block *sb;
74 int filldir_called; 73 int filldir_called;
75 int entries_written; 74 int entries_written;
76}; 75};
@@ -88,7 +87,7 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
88 87
89 buf->filldir_called++; 88 buf->filldir_called++;
90 rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, 89 rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
91 buf->dentry, lower_name, 90 buf->sb, lower_name,
92 lower_namelen); 91 lower_namelen);
93 if (rc) { 92 if (rc) {
94 printk(KERN_ERR "%s: Error attempting to decode and decrypt " 93 printk(KERN_ERR "%s: Error attempting to decode and decrypt "
@@ -96,9 +95,10 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
96 rc); 95 rc);
97 goto out; 96 goto out;
98 } 97 }
99 rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); 98 buf->caller->pos = buf->ctx.pos;
99 rc = !dir_emit(buf->caller, name, name_size, ino, d_type);
100 kfree(name); 100 kfree(name);
101 if (rc >= 0) 101 if (!rc)
102 buf->entries_written++; 102 buf->entries_written++;
103out: 103out:
104 return rc; 104 return rc;
@@ -107,27 +107,22 @@ out:
107/** 107/**
108 * ecryptfs_readdir 108 * ecryptfs_readdir
109 * @file: The eCryptfs directory file 109 * @file: The eCryptfs directory file
110 * @dirent: Directory entry handle 110 * @ctx: The actor to feed the entries to
111 * @filldir: The filldir callback function
112 */ 111 */
113static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) 112static int ecryptfs_readdir(struct file *file, struct dir_context *ctx)
114{ 113{
115 int rc; 114 int rc;
116 struct file *lower_file; 115 struct file *lower_file;
117 struct inode *inode; 116 struct inode *inode = file_inode(file);
118 struct ecryptfs_getdents_callback buf; 117 struct ecryptfs_getdents_callback buf = {
119 118 .ctx.actor = ecryptfs_filldir,
119 .caller = ctx,
120 .sb = inode->i_sb,
121 };
120 lower_file = ecryptfs_file_to_lower(file); 122 lower_file = ecryptfs_file_to_lower(file);
121 lower_file->f_pos = file->f_pos; 123 lower_file->f_pos = ctx->pos;
122 inode = file_inode(file); 124 rc = iterate_dir(lower_file, &buf.ctx);
123 memset(&buf, 0, sizeof(buf)); 125 ctx->pos = buf.ctx.pos;
124 buf.dirent = dirent;
125 buf.dentry = file->f_path.dentry;
126 buf.filldir = filldir;
127 buf.filldir_called = 0;
128 buf.entries_written = 0;
129 rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
130 file->f_pos = lower_file->f_pos;
131 if (rc < 0) 126 if (rc < 0)
132 goto out; 127 goto out;
133 if (buf.filldir_called && !buf.entries_written) 128 if (buf.filldir_called && !buf.entries_written)
@@ -344,7 +339,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
344#endif 339#endif
345 340
346const struct file_operations ecryptfs_dir_fops = { 341const struct file_operations ecryptfs_dir_fops = {
347 .readdir = ecryptfs_readdir, 342 .iterate = ecryptfs_readdir,
348 .read = generic_read_dir, 343 .read = generic_read_dir,
349 .unlocked_ioctl = ecryptfs_unlocked_ioctl, 344 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
350#ifdef CONFIG_COMPAT 345#ifdef CONFIG_COMPAT
@@ -365,7 +360,7 @@ const struct file_operations ecryptfs_main_fops = {
365 .aio_read = ecryptfs_read_update_atime, 360 .aio_read = ecryptfs_read_update_atime,
366 .write = do_sync_write, 361 .write = do_sync_write,
367 .aio_write = generic_file_aio_write, 362 .aio_write = generic_file_aio_write,
368 .readdir = ecryptfs_readdir, 363 .iterate = ecryptfs_readdir,
369 .unlocked_ioctl = ecryptfs_unlocked_ioctl, 364 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
370#ifdef CONFIG_COMPAT 365#ifdef CONFIG_COMPAT
371 .compat_ioctl = ecryptfs_compat_ioctl, 366 .compat_ioctl = ecryptfs_compat_ioctl,
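
ecryptfs_readdir() moves from ->readdir()/filldir to the new ->iterate()/dir_context API: the callback buffer embeds its own struct dir_context as its first member, hands that to iterate_dir() on the lower file, and its actor forwards each decrypted name to the caller's context via dir_emit(). The embedding trick is the part worth lifting; a stripped-down stacking readdir under the same 3.11-era API, with the name translation left out and get_lower_file() assumed:

    struct wrap_ctx {
            struct dir_context ctx;   /* first member: what iterate_dir() gets */
            struct dir_context *caller;
    };

    static int wrap_actor(void *data, const char *name, int len,
                          loff_t pos, u64 ino, unsigned int d_type)
    {
            struct wrap_ctx *buf = data;  /* &buf->ctx == data, ctx is first */

            buf->caller->pos = buf->ctx.pos;
            /* dir_emit() returns false once the user buffer is full;
             * a nonzero return here makes iterate_dir() stop. */
            return !dir_emit(buf->caller, name, len, ino, d_type);
    }

    static int wrap_readdir(struct file *file, struct dir_context *ctx)
    {
            struct file *lower_file = get_lower_file(file);  /* assumed */
            struct wrap_ctx buf = { .ctx.actor = wrap_actor, .caller = ctx };
            int rc;

            lower_file->f_pos = ctx->pos;
            rc = iterate_dir(lower_file, &buf.ctx);
            ctx->pos = buf.ctx.pos;
            return rc;
    }
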
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5eab400e2590..67e9b6339691 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -358,7 +358,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
358 358
359 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); 359 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
360 fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); 360 fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
361 BUG_ON(!lower_dentry->d_count); 361 BUG_ON(!d_count(lower_dentry));
362 362
363 ecryptfs_set_dentry_private(dentry, dentry_info); 363 ecryptfs_set_dentry_private(dentry, dentry_info);
364 ecryptfs_set_dentry_lower(dentry, lower_dentry); 364 ecryptfs_set_dentry_lower(dentry, lower_dentry);
@@ -679,7 +679,7 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
679 set_fs(old_fs); 679 set_fs(old_fs);
680 if (rc < 0) 680 if (rc < 0)
681 goto out; 681 goto out;
682 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry, 682 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb,
683 lower_buf, rc); 683 lower_buf, rc);
684out: 684out:
685 kfree(lower_buf); 685 kfree(lower_buf);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index e924cf45aad9..eb1c5979ecaf 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -120,16 +120,15 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
120 struct file **lower_file) 120 struct file **lower_file)
121{ 121{
122 const struct cred *cred = current_cred(); 122 const struct cred *cred = current_cred();
123 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 123 struct path *path = ecryptfs_dentry_to_lower_path(dentry);
124 struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
125 int rc; 124 int rc;
126 125
127 rc = ecryptfs_privileged_open(lower_file, lower_dentry, lower_mnt, 126 rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt,
128 cred); 127 cred);
129 if (rc) { 128 if (rc) {
130 printk(KERN_ERR "Error opening lower file " 129 printk(KERN_ERR "Error opening lower file "
131 "for lower_dentry [0x%p] and lower_mnt [0x%p]; " 130 "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
132 "rc = [%d]\n", lower_dentry, lower_mnt, rc); 131 "rc = [%d]\n", path->dentry, path->mnt, rc);
133 (*lower_file) = NULL; 132 (*lower_file) = NULL;
134 } 133 }
135 return rc; 134 return rc;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 49ff8ea08f1c..e57380e5f6bd 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -247,14 +247,13 @@ int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
247 goto unlock; 247 goto unlock;
248 } 248 }
249 msg_size = (sizeof(*msg) + msg->data_len); 249 msg_size = (sizeof(*msg) + msg->data_len);
250 msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); 250 msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL);
251 if (!msg_ctx->msg) { 251 if (!msg_ctx->msg) {
252 rc = -ENOMEM; 252 rc = -ENOMEM;
253 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " 253 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
254 "GFP_KERNEL memory\n", __func__, msg_size); 254 "GFP_KERNEL memory\n", __func__, msg_size);
255 goto unlock; 255 goto unlock;
256 } 256 }
257 memcpy(msg_ctx->msg, msg, msg_size);
258 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; 257 msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE;
259 wake_up_process(msg_ctx->task); 258 wake_up_process(msg_ctx->task);
260 rc = 0; 259 rc = 0;
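
The messaging change is a stock cleanup: kmemdup(src, len, gfp) allocates and copies in one call, removing any chance of the kmalloc()/memcpy() pair drifting apart. Side by side, as a sketch:

    #include <linux/slab.h>
    #include <linux/string.h>

    static void *dup_before(const void *src, size_t len)
    {
            void *dst = kmalloc(len, GFP_KERNEL);

            if (dst)
                    memcpy(dst, src, len);
            return dst;
    }

    static void *dup_after(const void *src, size_t len)
    {
            /* Identical semantics: NULL on allocation failure,
             * otherwise a private copy of src. */
            return kmemdup(src, len, GFP_KERNEL);
    }
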
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 7e787fb90293..07ab49745e31 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -155,20 +155,8 @@ static int efivarfs_unlink(struct inode *dir, struct dentry *dentry)
155 return 0; 155 return 0;
156}; 156};
157 157
158/*
159 * Handle negative dentry.
160 */
161static struct dentry *efivarfs_lookup(struct inode *dir, struct dentry *dentry,
162 unsigned int flags)
163{
164 if (dentry->d_name.len > NAME_MAX)
165 return ERR_PTR(-ENAMETOOLONG);
166 d_add(dentry, NULL);
167 return NULL;
168}
169
170const struct inode_operations efivarfs_dir_inode_operations = { 158const struct inode_operations efivarfs_dir_inode_operations = {
171 .lookup = efivarfs_lookup, 159 .lookup = simple_lookup,
172 .unlink = efivarfs_unlink, 160 .unlink = efivarfs_unlink,
173 .create = efivarfs_create, 161 .create = efivarfs_create,
174}; 162};
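
efivarfs drops its private lookup in favour of simple_lookup() from fs/libfs.c, which provides exactly the negative-dentry behaviour every simple in-memory filesystem wants: names not backed by an inode hash a negative dentry so later lookups resolve to ENOENT from the cache. The generic helper is, in essence, what the deleted code did (a paraphrase, not the verbatim libfs body):

    static struct dentry *simple_lookup_sketch(struct inode *dir,
                                               struct dentry *dentry,
                                               unsigned int flags)
    {
            if (dentry->d_name.len > NAME_MAX)
                    return ERR_PTR(-ENAMETOOLONG);
            d_add(dentry, NULL);    /* hash a negative dentry */
            return NULL;
    }
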
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 141aee31884f..a8766b880c07 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -45,8 +45,8 @@ static struct super_block *efivarfs_sb;
45 * So we need to perform a case-sensitive match on part 1 and a 45 * So we need to perform a case-sensitive match on part 1 and a
46 * case-insensitive match on part 2. 46 * case-insensitive match on part 2.
47 */ 47 */
48static int efivarfs_d_compare(const struct dentry *parent, const struct inode *pinode, 48static int efivarfs_d_compare(const struct dentry *parent,
49 const struct dentry *dentry, const struct inode *inode, 49 const struct dentry *dentry,
50 unsigned int len, const char *str, 50 unsigned int len, const char *str,
51 const struct qstr *name) 51 const struct qstr *name)
52{ 52{
@@ -63,8 +63,7 @@ static int efivarfs_d_compare(const struct dentry *parent, const struct inode *p
63 return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN); 63 return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN);
64} 64}
65 65
66static int efivarfs_d_hash(const struct dentry *dentry, 66static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
67 const struct inode *inode, struct qstr *qstr)
68{ 67{
69 unsigned long hash = init_name_hash(); 68 unsigned long hash = init_name_hash();
70 const unsigned char *s = qstr->name; 69 const unsigned char *s = qstr->name;
@@ -108,7 +107,7 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
108 q.name = name; 107 q.name = name;
109 q.len = strlen(name); 108 q.len = strlen(name);
110 109
111 err = efivarfs_d_hash(NULL, NULL, &q); 110 err = efivarfs_d_hash(NULL, &q);
112 if (err) 111 if (err)
113 return ERR_PTR(err); 112 return ERR_PTR(err);
114 113
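
(The d_compare()/d_hash() signature change here is the VFS-wide drop of the inode arguments seen throughout this merge; efivarfs simply follows suit, as fs/dcache.c above shows for the callers.)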
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index 055a9e9ca747..b72307ccdf7a 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -7,40 +7,38 @@
 #include <linux/buffer_head.h>
 #include "efs.h"
 
-static int efs_readdir(struct file *, void *, filldir_t);
+static int efs_readdir(struct file *, struct dir_context *);
 
 const struct file_operations efs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= efs_readdir,
+	.iterate	= efs_readdir,
 };
 
 const struct inode_operations efs_dir_inode_operations = {
 	.lookup = efs_lookup,
 };
 
-static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
-	struct inode *inode = file_inode(filp);
-	struct buffer_head *bh;
-
-	struct efs_dir *dirblock;
-	struct efs_dentry *dirslot;
-	efs_ino_t inodenum;
+static int efs_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct inode *inode = file_inode(file);
 	efs_block_t block;
-	int slot, namelen;
-	char *nameptr;
+	int slot;
 
 	if (inode->i_size & (EFS_DIRBSIZE-1))
 		printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n");
 
 	/* work out where this entry can be found */
-	block = filp->f_pos >> EFS_DIRBSIZE_BITS;
+	block = ctx->pos >> EFS_DIRBSIZE_BITS;
 
 	/* each block contains at most 256 slots */
-	slot = filp->f_pos & 0xff;
+	slot = ctx->pos & 0xff;
 
 	/* look at all blocks */
 	while (block < inode->i_blocks) {
+		struct efs_dir *dirblock;
+		struct buffer_head *bh;
+
 		/* read the dir block */
 		bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
 
@@ -57,11 +55,14 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
 			break;
 		}
 
-		while (slot < dirblock->slots) {
-			if (dirblock->space[slot] == 0) {
-				slot++;
+		for (; slot < dirblock->slots; slot++) {
+			struct efs_dentry *dirslot;
+			efs_ino_t inodenum;
+			const char *nameptr;
+			int namelen;
+
+			if (dirblock->space[slot] == 0)
 				continue;
-			}
 
 			dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot));
 
@@ -72,39 +73,29 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) {
 #ifdef DEBUG
 			printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen);
 #endif
-			if (namelen > 0) {
-				/* found the next entry */
-				filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-
-				/* copy filename and data in dirslot */
-				filldir(dirent, nameptr, namelen, filp->f_pos, inodenum, DT_UNKNOWN);
-
-				/* sanity check */
-				if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
-					printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
-					slot++;
-					continue;
-				}
-
-				/* store position of next slot */
-				if (++slot == dirblock->slots) {
-					slot = 0;
-					block++;
-				}
+			if (!namelen)
+				continue;
+			/* found the next entry */
+			ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
+
+			/* sanity check */
+			if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
+				printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
+				continue;
+			}
+
+			/* copy filename and data in dirslot */
+			if (!dir_emit(ctx, nameptr, namelen, inodenum, DT_UNKNOWN)) {
 				brelse(bh);
-				filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-				goto out;
+				return 0;
 			}
-			slot++;
 		}
 		brelse(bh);
 
 		slot = 0;
 		block++;
 	}
-
-	filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot;
-out:
+	ctx->pos = (block << EFS_DIRBSIZE_BITS) | slot;
 	return 0;
 }
 
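
The efs conversion above is the template repeated for every filesystem in this series: ->readdir(file, dirent, filldir) becomes ->iterate(file, ctx), the position moves from file->f_pos into ctx->pos, and entries are handed to dir_emit(), which returns false once the user buffer is full. A hedged sketch of the minimal shape, for a hypothetical "examplefs" with one hard-coded entry:

/* Sketch only: examplefs and its lone entry (inode 42) are made up. */
#include <linux/fs.h>

static int examplefs_iterate(struct file *file, struct dir_context *ctx)
{
	/* positions 0 and 1 are "." and ".." */
	if (!dir_emit_dots(file, ctx))
		return 0;
	if (ctx->pos == 2) {
		if (!dir_emit(ctx, "hello", 5, 42, DT_REG))
			return 0;	/* buffer full; getdents() resumes here */
		ctx->pos++;
	}
	return 0;
}

static const struct file_operations examplefs_dir_ops = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.iterate	= examplefs_iterate,
};

Because ctx->pos is advanced before each emit, a short getdents() buffer simply restarts at the saved position, which is what lets the converted efs code drop its goto-based bookkeeping.
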
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index deecc7294a67..9ad17b15b454 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,6 +34,7 @@
 #include <linux/mutex.h>
 #include <linux/anon_inodes.h>
 #include <linux/device.h>
+#include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/mman.h>
@@ -1602,7 +1603,8 @@ fetch_events:
 		}
 
 		spin_unlock_irqrestore(&ep->lock, flags);
-		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+		if (!freezable_schedule_hrtimeout_range(to, slack,
+							HRTIMER_MODE_ABS))
 			timed_out = 1;
 
 		spin_lock_irqsave(&ep->lock, flags);
@@ -1975,8 +1977,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 			return -EINVAL;
 		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
 			return -EFAULT;
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+		sigsaved = current->blocked;
+		set_current_blocked(&ksigmask);
 	}
 
 	error = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -1993,7 +1995,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 				sizeof(sigsaved));
 			set_restore_sigmask();
 		} else
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+			set_current_blocked(&sigsaved);
 	}
 
 	return error;
@@ -2020,8 +2022,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
 			return -EFAULT;
 		sigset_from_compat(&ksigmask, &csigmask);
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+		sigsaved = current->blocked;
+		set_current_blocked(&ksigmask);
 	}
 
 	err = sys_epoll_wait(epfd, events, maxevents, timeout);
@@ -2038,7 +2040,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 				sizeof(sigsaved));
 			set_restore_sigmask();
 		} else
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+			set_current_blocked(&sigsaved);
 	}
 
 	return err;
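
Two independent fixes share these eventpoll hunks: the timeout sleep becomes freezable so a pending epoll_wait() no longer blocks suspend, and the sigmask juggling in epoll_pwait() switches from raw sigprocmask() to set_current_blocked(), which already refuses to block SIGKILL/SIGSTOP internally, making the explicit sigdelsetmask() redundant. A hedged sketch of the save/swap/restore idiom as it now reads (kernel context; the example_* names are illustrative):

/* Sketch only: example_* names are made up. */
#include <linux/signal.h>
#include <linux/sched.h>

static void example_install_sigmask(sigset_t *newset, sigset_t *saved)
{
	*saved = current->blocked;	/* stash the caller's mask */
	set_current_blocked(newset);	/* filters SIGKILL/SIGSTOP itself */
}

static void example_restore_sigmask(sigset_t *saved)
{
	set_current_blocked(saved);
}
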
diff --git a/fs/exec.c b/fs/exec.c
index ffd7a813ad3d..9c73def87642 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -110,13 +110,14 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 	static const struct open_flags uselib_flags = {
 		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 		.acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
-		.intent = LOOKUP_OPEN
+		.intent = LOOKUP_OPEN,
+		.lookup_flags = LOOKUP_FOLLOW,
 	};
 
 	if (IS_ERR(tmp))
 		goto out;
 
-	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
+	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
 	putname(tmp);
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
@@ -756,10 +757,11 @@ struct file *open_exec(const char *name)
 	static const struct open_flags open_exec_flags = {
 		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 		.acc_mode = MAY_EXEC | MAY_OPEN,
-		.intent = LOOKUP_OPEN
+		.intent = LOOKUP_OPEN,
+		.lookup_flags = LOOKUP_FOLLOW,
 	};
 
-	file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags, LOOKUP_FOLLOW);
+	file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags);
 	if (IS_ERR(file))
 		goto out;
 
@@ -930,6 +932,7 @@ static int de_thread(struct task_struct *tsk)
 	 * also take its birthdate (always earlier than our own).
 	 */
 	tsk->start_time = leader->start_time;
+	tsk->real_start_time = leader->real_start_time;
 
 	BUG_ON(!same_thread_group(leader, tsk));
 	BUG_ON(has_group_leader_pid(tsk));
@@ -945,9 +948,8 @@ static int de_thread(struct task_struct *tsk)
 	 * Note: The old leader also uses this pid until release_task
 	 *   is called.  Odd but simple and correct.
 	 */
-	detach_pid(tsk, PIDTYPE_PID);
 	tsk->pid = leader->pid;
-	attach_pid(tsk, PIDTYPE_PID, task_pid(leader));
+	change_pid(tsk, PIDTYPE_PID, task_pid(leader));
 	transfer_pid(leader, tsk, PIDTYPE_PGID);
 	transfer_pid(leader, tsk, PIDTYPE_SID);
 
@@ -1463,7 +1465,6 @@ static int do_execve_common(const char *filename,
 	struct files_struct *displaced;
 	bool clear_in_exec;
 	int retval;
-	const struct cred *cred = current_cred();
 
 	/*
 	 * We move the actual failure in case of RLIMIT_NPROC excess from
@@ -1472,7 +1473,7 @@ static int do_execve_common(const char *filename,
 	 * whether NPROC limit is still exceeded.
 	 */
 	if ((current->flags & PF_NPROC_EXCEEDED) &&
-	    atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
+	    atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
 		retval = -EAGAIN;
 		goto out_ret;
 	}
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 46375896cfc0..49f51ab4caac 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -239,22 +239,19 @@ void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
 }
 
 static int
-exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+exofs_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
-	unsigned char *types = NULL;
-	int need_revalidate = (filp->f_version != inode->i_version);
+	int need_revalidate = (file->f_version != inode->i_version);
 
 	if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
 		return 0;
 
-	types = exofs_filetype_table;
-
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
 		struct exofs_dir_entry *de;
@@ -263,7 +260,7 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (IS_ERR(page)) {
 			EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
 				  inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
 			return PTR_ERR(page);
 		}
 		kaddr = page_address(page);
@@ -271,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			if (offset) {
 				offset = exofs_validate_entry(kaddr, offset,
 							      chunk_mask);
-				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
 			}
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (struct exofs_dir_entry *)(kaddr + offset);
@@ -288,27 +285,24 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				return -EIO;
 			}
 			if (de->inode_no) {
-				int over;
-				unsigned char d_type = DT_UNKNOWN;
+				unsigned char t;
 
-				if (types && de->file_type < EXOFS_FT_MAX)
-					d_type = types[de->file_type];
+				if (de->file_type < EXOFS_FT_MAX)
+					t = exofs_filetype_table[de->file_type];
+				else
+					t = DT_UNKNOWN;
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-						(n<<PAGE_CACHE_SHIFT) | offset,
+				if (!dir_emit(ctx, de->name, de->name_len,
 						le64_to_cpu(de->inode_no),
-						d_type);
-				if (over) {
+						t)) {
 					exofs_put_page(page);
 					return 0;
 				}
 			}
-			filp->f_pos += le16_to_cpu(de->rec_len);
+			ctx->pos += le16_to_cpu(de->rec_len);
 		}
 		exofs_put_page(page);
 	}
-
 	return 0;
 }
 
@@ -669,5 +663,5 @@ not_empty:
 const struct file_operations exofs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= exofs_readdir,
+	.iterate	= exofs_readdir,
 };
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d1f80abd8828..2ec8eb1ab269 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -953,9 +953,11 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
 	return 0;
 }
 
-static void exofs_invalidatepage(struct page *page, unsigned long offset)
+static void exofs_invalidatepage(struct page *page, unsigned int offset,
+				 unsigned int length)
 {
-	EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
+	EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n",
+		     page->index, offset, length);
 	WARN_ON(1);
 }
 
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 262fc9940982..293bc2e47a73 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -212,6 +212,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
 }
 
 struct getdents_callback {
+	struct dir_context ctx;
 	char *name;		/* name that was found. It already points to a
 				   buffer NAME_MAX+1 is size */
 	unsigned long ino;	/* the inum we are looking for */
@@ -254,7 +255,11 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
 	struct inode *dir = path->dentry->d_inode;
 	int error;
 	struct file *file;
-	struct getdents_callback buffer;
+	struct getdents_callback buffer = {
+		.ctx.actor = filldir_one,
+		.name = name,
+		.ino = child->d_inode->i_ino
+	};
 
 	error = -ENOTDIR;
 	if (!dir || !S_ISDIR(dir->i_mode))
@@ -271,17 +276,14 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
 		goto out;
 
 	error = -EINVAL;
-	if (!file->f_op->readdir)
+	if (!file->f_op->iterate)
 		goto out_close;
 
-	buffer.name = name;
-	buffer.ino = child->d_inode->i_ino;
-	buffer.found = 0;
 	buffer.sequence = 0;
 	while (1) {
 		int old_seq = buffer.sequence;
 
-		error = vfs_readdir(file, filldir_one, &buffer);
+		error = iterate_dir(file, &buffer.ctx);
 		if (buffer.found) {
 			error = 0;
 			break;
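
The exportfs change shows the caller side of the iterate API: code that used to pass a filldir callback plus an opaque pointer to vfs_readdir() now embeds a struct dir_context at the start of its private state and hands that to iterate_dir(). The actor receives the context pointer back, so keeping it as the first member lets the callback recover its state with a plain cast. A hedged sketch of the pattern (the actor below is illustrative, not the kernel's filldir_one):

/* Sketch only: example_cb/example_actor are made-up names. */
#include <linux/fs.h>
#include <linux/types.h>

struct example_cb {
	struct dir_context ctx;	/* must be first: iterate_dir() passes &ctx to the actor */
	int count;
};

static int example_actor(void *__buf, const char *name, int len,
			 loff_t pos, u64 ino, unsigned int d_type)
{
	struct example_cb *cb = __buf;	/* valid because ctx is the first member */

	cb->count++;
	return 0;			/* nonzero stops the walk */
}

static int example_count_entries(struct file *dir, int *out)
{
	struct example_cb cb = { .ctx.actor = example_actor };
	int err = iterate_dir(dir, &cb.ctx);

	*out = cb.count;
	return err;
}
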
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 4237722bfd27..6e1d4ab09d72 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -287,17 +287,17 @@ static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode)
 }
 
 static int
-ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
+ext2_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
 	unsigned char *types = NULL;
-	int need_revalidate = filp->f_version != inode->i_version;
+	int need_revalidate = file->f_version != inode->i_version;
 
 	if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
 		return 0;
@@ -314,16 +314,16 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 			ext2_error(sb, __func__,
 				   "bad page in #%lu",
 				   inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
 			return PTR_ERR(page);
 		}
 		kaddr = page_address(page);
 		if (unlikely(need_revalidate)) {
 			if (offset) {
 				offset = ext2_validate_entry(kaddr, offset, chunk_mask);
-				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
 			}
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 			need_revalidate = 0;
 		}
 		de = (ext2_dirent *)(kaddr+offset);
@@ -336,22 +336,19 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir)
 				return -EIO;
 			}
 			if (de->inode) {
-				int over;
 				unsigned char d_type = DT_UNKNOWN;
 
 				if (types && de->file_type < EXT2_FT_MAX)
 					d_type = types[de->file_type];
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-					(n<<PAGE_CACHE_SHIFT) | offset,
-					le32_to_cpu(de->inode), d_type);
-				if (over) {
+				if (!dir_emit(ctx, de->name, de->name_len,
+						le32_to_cpu(de->inode),
+						d_type)) {
 					ext2_put_page(page);
 					return 0;
 				}
 			}
-			filp->f_pos += ext2_rec_len_from_disk(de->rec_len);
+			ctx->pos += ext2_rec_len_from_disk(de->rec_len);
 		}
 		ext2_put_page(page);
 	}
@@ -724,7 +721,7 @@ not_empty:
 const struct file_operations ext2_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext2_readdir,
+	.iterate	= ext2_readdir,
 	.unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = ext2_compat_ioctl,
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 73b0d9519836..256dd5f4c1c4 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -119,6 +119,29 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
 	return ext2_add_nondir(dentry, inode);
 }
 
+static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode = ext2_new_inode(dir, mode, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &ext2_file_inode_operations;
+	if (ext2_use_xip(inode->i_sb)) {
+		inode->i_mapping->a_ops = &ext2_aops_xip;
+		inode->i_fop = &ext2_xip_file_operations;
+	} else if (test_opt(inode->i_sb, NOBH)) {
+		inode->i_mapping->a_ops = &ext2_nobh_aops;
+		inode->i_fop = &ext2_file_operations;
+	} else {
+		inode->i_mapping->a_ops = &ext2_aops;
+		inode->i_fop = &ext2_file_operations;
+	}
+	mark_inode_dirty(inode);
+	d_tmpfile(dentry, inode);
+	unlock_new_inode(inode);
+	return 0;
+}
+
 static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode * inode;
@@ -398,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = {
 #endif
 	.setattr	= ext2_setattr,
 	.get_acl	= ext2_get_acl,
+	.tmpfile	= ext2_tmpfile,
 };
 
 const struct inode_operations ext2_special_inode_operations = {
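
ext2_tmpfile() above is the filesystem backend for the then-new O_TMPFILE open flag: create an inode with no directory entry, which can later be given a name with linkat() or simply vanish on close. A hedged userspace sketch of what this enables (requires a 3.11+ kernel and a libc that defines O_TMPFILE; error handling trimmed):

/* Sketch only: the path and payload are arbitrary. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0)
		return 1;
	write(fd, "scratch", 7);	/* the file exists but has no name */

	/* optionally materialize it; without this it disappears on close */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	linkat(AT_FDCWD, path, AT_FDCWD, "now-visible", AT_SYMLINK_FOLLOW);
	close(fd);
	return 0;
}

The ext3 variant later in this series additionally parks the new inode on the orphan list, so an unlinked-but-open tmpfile is reclaimed after a crash; the matching ext3_orphan_del() in the link path undoes that once the file gains a name.
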
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 87eccbbca255..f522425aaa24 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -28,8 +28,7 @@ static unsigned char ext3_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int ext3_dx_readdir(struct file * filp,
-			   void * dirent, filldir_t filldir);
+static int ext3_dx_readdir(struct file *, struct dir_context *);
 
 static unsigned char get_dtype(struct super_block *sb, int filetype)
 {
@@ -91,36 +90,30 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
 	return error_msg == NULL ? 1 : 0;
 }
 
-static int ext3_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext3_readdir(struct file *file, struct dir_context *ctx)
 {
-	int error = 0;
 	unsigned long offset;
-	int i, stored;
+	int i;
 	struct ext3_dir_entry_2 *de;
 	int err;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	int ret = 0;
 	int dir_has_error = 0;
 
 	if (is_dx_dir(inode)) {
-		err = ext3_dx_readdir(filp, dirent, filldir);
-		if (err != ERR_BAD_DX_DIR) {
-			ret = err;
-			goto out;
-		}
+		err = ext3_dx_readdir(file, ctx);
+		if (err != ERR_BAD_DX_DIR)
+			return err;
 		/*
 		 * We don't set the inode dirty flag since it's not
 		 * critical that it get flushed back to the disk.
 		 */
-		EXT3_I(file_inode(filp))->i_flags &= ~EXT3_INDEX_FL;
+		EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
 	}
-	stored = 0;
-	offset = filp->f_pos & (sb->s_blocksize - 1);
+	offset = ctx->pos & (sb->s_blocksize - 1);
 
-	while (!error && !stored && filp->f_pos < inode->i_size) {
-		unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb);
+	while (ctx->pos < inode->i_size) {
+		unsigned long blk = ctx->pos >> EXT3_BLOCK_SIZE_BITS(sb);
 		struct buffer_head map_bh;
 		struct buffer_head *bh = NULL;
 
@@ -129,12 +122,12 @@ static int ext3_readdir(struct file * filp,
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
-			if (!ra_has_index(&filp->f_ra, index))
+			if (!ra_has_index(&file->f_ra, index))
 				page_cache_sync_readahead(
 					sb->s_bdev->bd_inode->i_mapping,
-					&filp->f_ra, filp,
+					&file->f_ra, file,
 					index, 1);
-			filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 			bh = ext3_bread(NULL, inode, blk, 0, &err);
 		}
 
@@ -146,22 +139,21 @@ static int ext3_readdir(struct file * filp,
 			if (!dir_has_error) {
 				ext3_error(sb, __func__, "directory #%lu "
 					"contains a hole at offset %lld",
-					inode->i_ino, filp->f_pos);
+					inode->i_ino, ctx->pos);
 				dir_has_error = 1;
 			}
 			/* corrupt size?  Maybe no more blocks to read */
-			if (filp->f_pos > inode->i_blocks << 9)
+			if (ctx->pos > inode->i_blocks << 9)
 				break;
-			filp->f_pos += sb->s_blocksize - offset;
+			ctx->pos += sb->s_blocksize - offset;
 			continue;
 		}
 
-revalidate:
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
 		 * to make sure. */
-		if (filp->f_version != inode->i_version) {
+		if (offset && file->f_version != inode->i_version) {
 			for (i = 0; i < sb->s_blocksize && i < offset; ) {
 				de = (struct ext3_dir_entry_2 *)
 					(bh->b_data + i);
@@ -177,53 +169,40 @@ revalidate:
 				i += ext3_rec_len_from_disk(de->rec_len);
 			}
 			offset = i;
-			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
 				| offset;
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 		}
 
-		while (!error && filp->f_pos < inode->i_size
+		while (ctx->pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
 			if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
 						bh, offset)) {
-				/* On error, skip the f_pos to the
+				/* On error, skip the to the
 				   next block. */
-				filp->f_pos = (filp->f_pos |
+				ctx->pos = (ctx->pos |
 						(sb->s_blocksize - 1)) + 1;
-				brelse (bh);
-				ret = stored;
-				goto out;
+				break;
 			}
 			offset += ext3_rec_len_from_disk(de->rec_len);
 			if (le32_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				u64 version = filp->f_version;
-
-				error = filldir(dirent, de->name,
-						de->name_len,
-						filp->f_pos,
-						le32_to_cpu(de->inode),
-						get_dtype(sb, de->file_type));
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored ++;
+				if (!dir_emit(ctx, de->name, de->name_len,
+					      le32_to_cpu(de->inode),
+					      get_dtype(sb, de->file_type))) {
+					brelse(bh);
+					return 0;
+				}
 			}
-			filp->f_pos += ext3_rec_len_from_disk(de->rec_len);
+			ctx->pos += ext3_rec_len_from_disk(de->rec_len);
 		}
 		offset = 0;
 		brelse (bh);
+		if (ctx->pos < inode->i_size)
+			if (!dir_relax(inode))
+				return 0;
 	}
-out:
-	return ret;
+	return 0;
 }
 
 static inline int is_32bit_api(void)
@@ -452,62 +431,54 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
  * for all entres on the fname linked list.  (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file * filp, void * dirent,
-			filldir_t filldir, struct fname *fname)
+static bool call_filldir(struct file *file, struct dir_context *ctx,
+			struct fname *fname)
 {
-	struct dir_private_info *info = filp->private_data;
-	loff_t	curr_pos;
-	struct inode *inode = file_inode(filp);
-	struct super_block * sb;
-	int error;
-
-	sb = inode->i_sb;
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
 
 	if (!fname) {
 		printk("call_filldir: called with null fname?!?\n");
-		return 0;
+		return true;
 	}
-	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
 	while (fname) {
-		error = filldir(dirent, fname->name,
-				fname->name_len, curr_pos,
+		if (!dir_emit(ctx, fname->name, fname->name_len,
 			fname->inode,
-			get_dtype(sb, fname->file_type));
-		if (error) {
-			filp->f_pos = curr_pos;
+			get_dtype(sb, fname->file_type))) {
 			info->extra_fname = fname;
-			return error;
+			return false;
 		}
 		fname = fname->next;
 	}
-	return 0;
+	return true;
 }
 
-static int ext3_dx_readdir(struct file * filp,
-			 void * dirent, filldir_t filldir)
+static int ext3_dx_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dir_private_info *info = filp->private_data;
-	struct inode *inode = file_inode(filp);
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct fname *fname;
 	int	ret;
 
 	if (!info) {
-		info = ext3_htree_create_dir_info(filp, filp->f_pos);
+		info = ext3_htree_create_dir_info(file, ctx->pos);
 		if (!info)
 			return -ENOMEM;
-		filp->private_data = info;
+		file->private_data = info;
 	}
 
-	if (filp->f_pos == ext3_get_htree_eof(filp))
+	if (ctx->pos == ext3_get_htree_eof(file))
 		return 0;	/* EOF */
 
 	/* Some one has messed with f_pos; reset the world */
-	if (info->last_pos != filp->f_pos) {
+	if (info->last_pos != ctx->pos) {
 		free_rb_tree_fname(&info->root);
 		info->curr_node = NULL;
 		info->extra_fname = NULL;
-		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
-		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+		info->curr_hash = pos2maj_hash(file, ctx->pos);
+		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
 	}
 
 	/*
@@ -515,7 +486,7 @@ static int ext3_dx_readdir(struct file * filp,
 	 * chain, return them first.
 	 */
 	if (info->extra_fname) {
-		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+		if (!call_filldir(file, ctx, info->extra_fname))
 			goto finished;
 		info->extra_fname = NULL;
 		goto next_node;
@@ -529,17 +500,17 @@ static int ext3_dx_readdir(struct file * filp,
 		 * cached entries.
 		 */
 		if ((!info->curr_node) ||
-		    (filp->f_version != inode->i_version)) {
+		    (file->f_version != inode->i_version)) {
 			info->curr_node = NULL;
 			free_rb_tree_fname(&info->root);
-			filp->f_version = inode->i_version;
-			ret = ext3_htree_fill_tree(filp, info->curr_hash,
+			file->f_version = inode->i_version;
+			ret = ext3_htree_fill_tree(file, info->curr_hash,
 						   info->curr_minor_hash,
 						   &info->next_hash);
 			if (ret < 0)
 				return ret;
 			if (ret == 0) {
-				filp->f_pos = ext3_get_htree_eof(filp);
+				ctx->pos = ext3_get_htree_eof(file);
 				break;
 			}
 			info->curr_node = rb_first(&info->root);
@@ -548,7 +519,7 @@ static int ext3_dx_readdir(struct file * filp,
 		fname = rb_entry(info->curr_node, struct fname, rb_hash);
 		info->curr_hash = fname->hash;
 		info->curr_minor_hash = fname->minor_hash;
-		if (call_filldir(filp, dirent, filldir, fname))
+		if (!call_filldir(file, ctx, fname))
 			break;
 	next_node:
 		info->curr_node = rb_next(info->curr_node);
@@ -559,7 +530,7 @@ static int ext3_dx_readdir(struct file * filp,
 			info->curr_minor_hash = fname->minor_hash;
 		} else {
 			if (info->next_hash == ~0) {
-				filp->f_pos = ext3_get_htree_eof(filp);
+				ctx->pos = ext3_get_htree_eof(file);
 				break;
 			}
 			info->curr_hash = info->next_hash;
@@ -567,7 +538,7 @@ static int ext3_dx_readdir(struct file * filp,
 		}
 	}
 finished:
-	info->last_pos = filp->f_pos;
+	info->last_pos = ctx->pos;
 	return 0;
 }
 
@@ -582,7 +553,7 @@ static int ext3_release_dir (struct inode * inode, struct file * filp)
 const struct file_operations ext3_dir_operations = {
 	.llseek		= ext3_dir_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext3_readdir,
+	.iterate	= ext3_readdir,
 	.unlocked_ioctl = ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = ext3_compat_ioctl,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index b31dbd4c46ad..1cb9c7e10c6f 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -48,9 +48,13 @@ int ext3_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	trace_ext3_sync_file_enter(file, datasync);
 
-	if (inode->i_sb->s_flags & MS_RDONLY)
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated state */
+		smp_rmb();
+		if (EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS)
+			return -EROFS;
 		return 0;
-
+	}
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (ret)
 		goto out;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 23c712825640..2bd85486b879 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1825,19 +1825,20 @@ ext3_readpages(struct file *file, struct address_space *mapping,
 	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
 }
 
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
 {
 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 
-	trace_ext3_invalidatepage(page, offset);
+	trace_ext3_invalidatepage(page, offset, length);
 
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
-	journal_invalidatepage(journal, page, offset);
+	journal_invalidatepage(journal, page, offset, length);
 }
 
 static int ext3_releasepage(struct page *page, gfp_t wait)
@@ -1984,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = {
 	.direct_IO		= ext3_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
 
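
Both invalidatepage conversions in this series (exofs earlier, ext3 here) follow the same mm API change: ->invalidatepage() gains a length argument so callers can invalidate a sub-range of a page instead of always "offset to end of page", and a full-page invalidation is now the pair (0, PAGE_CACHE_SIZE). A hedged sketch of the new shape, with a hypothetical examplefs:

/* Sketch only: examplefs is made up; the signature matches the hunks above. */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static void examplefs_invalidatepage(struct page *page, unsigned int offset,
				     unsigned int length)
{
	/* the old "offset == 0" full-truncate test needs the length check too */
	if (offset == 0 && length == PAGE_CACHE_SIZE)
		ClearPageChecked(page);
}
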
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 692de13e3596..1194b1f0f839 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -576,11 +576,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
 		if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
 					(block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
 						+((char *)de - bh->b_data))) {
-			/* On error, skip the f_pos to the next block. */
-			dir_file->f_pos = (dir_file->f_pos |
-					(dir->i_sb->s_blocksize - 1)) + 1;
-			brelse (bh);
-			return count;
+			/* silently ignore the rest of the block */
+			break;
 		}
 		ext3fs_dirhash(de->name, de->name_len, hinfo);
 		if ((hinfo->hash < start_hash) ||
@@ -1762,6 +1759,45 @@ retry:
 	return err;
 }
 
+static int ext3_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, retries = 0;
+
+	dquot_initialize(dir);
+
+retry:
+	handle = ext3_journal_start(dir, EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
+			  4 + EXT3_XATTR_TRANS_BLOCKS);
+
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	inode = ext3_new_inode (handle, dir, NULL, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &ext3_file_inode_operations;
+		inode->i_fop = &ext3_file_operations;
+		ext3_set_aops(inode);
+		d_tmpfile(dentry, inode);
+		err = ext3_orphan_add(handle, inode);
+		if (err)
+			goto err_drop_inode;
+		mark_inode_dirty(inode);
+		unlock_new_inode(inode);
+	}
+	ext3_journal_stop(handle);
+	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+err_drop_inode:
+	ext3_journal_stop(handle);
+	unlock_new_inode(inode);
+	iput(inode);
+	return err;
+}
+
 static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 {
 	handle_t *handle;
@@ -2303,7 +2339,7 @@ static int ext3_link (struct dentry * old_dentry,
 
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-					EXT3_INDEX_EXTRA_TRANS_BLOCKS);
+					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2317,6 +2353,11 @@ retry:
 	err = ext3_add_entry(handle, dentry, inode);
 	if (!err) {
 		ext3_mark_inode_dirty(handle, inode);
+		/* this can happen only for tmpfile being
+		 * linked the first time
+		 */
+		if (inode->i_nlink == 1)
+			ext3_orphan_del(handle, inode);
 		d_instantiate(dentry, inode);
 	} else {
 		drop_nlink(inode);
@@ -2519,6 +2560,7 @@ const struct inode_operations ext3_dir_inode_operations = {
 	.mkdir		= ext3_mkdir,
 	.rmdir		= ext3_rmdir,
 	.mknod		= ext3_mknod,
+	.tmpfile	= ext3_tmpfile,
 	.rename		= ext3_rename,
 	.setattr	= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6356665a74bb..c47f14750722 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -174,6 +174,11 @@ static void ext3_handle_error(struct super_block *sb)
 	if (test_opt (sb, ERRORS_RO)) {
 		ext3_msg(sb, KERN_CRIT,
 			"error: remounting filesystem read-only");
+		/*
+		 * Make sure updated value of ->s_mount_state will be visible
+		 * before ->s_flags update.
+		 */
+		smp_wmb();
 		sb->s_flags |= MS_RDONLY;
 	}
 	ext3_commit_super(sb, es, 1);
@@ -291,8 +296,14 @@ void ext3_abort(struct super_block *sb, const char *function,
 	ext3_msg(sb, KERN_CRIT,
 		"error: remounting filesystem read-only");
 	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
-	sb->s_flags |= MS_RDONLY;
 	set_opt(EXT3_SB(sb)->s_mount_opt, ABORT);
+	/*
+	 * Make sure updated value of ->s_mount_state will be visible
+	 * before ->s_flags update.
+	 */
+	smp_wmb();
+	sb->s_flags |= MS_RDONLY;
+
 	if (EXT3_SB(sb)->s_journal)
 		journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
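
The fsync.c and super.c hunks are a matched pair of barriers: the error paths publish EXT3_ERROR_FS in ->s_mount_state before setting MS_RDONLY (smp_wmb), and ext3_sync_file() reads the two in the opposite order (smp_rmb), so any fsync that observes the read-only flag is guaranteed to also observe the error state and return -EROFS. A hedged, self-contained rendering of the idiom:

/* Sketch only: example_* stand in for s_mount_state and s_flags. */
#include <asm/barrier.h>

static int example_state;	/* plays s_mount_state */
static int example_flag;	/* plays s_flags */

static void example_publish_error(void)
{
	example_state = 1;
	smp_wmb();		/* state must be visible before the flag */
	example_flag = 1;
}

static int example_saw_error(void)
{
	if (example_flag) {
		smp_rmb();	/* pairs with the smp_wmb() above */
		return example_state;	/* guaranteed to read 1 here */
	}
	return 0;
}
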
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d0f13eada0ed..ddd715e42a5c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -38,8 +38,8 @@ ext4_group_t ext4_get_group_number(struct super_block *sb,
 	ext4_group_t group;
 
 	if (test_opt2(sb, STD_GROUP_SIZE))
-		group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
-			 block) >>
+		group = (block -
+			 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
 			(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
 	else
 		ext4_get_group_no_and_offset(sb, block, &group, NULL);
@@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 
 static inline int test_root(ext4_group_t a, int b)
 {
-	int num = b;
-
-	while (a > num)
-		num *= b;
-	return num == a;
+	while (1) {
+		if (a < b)
+			return 0;
+		if (a == b)
+			return 1;
+		if ((a % b) != 0)
+			return 0;
+		a = a / b;
+	}
 }
 
 static int ext4_group_sparse(ext4_group_t group)
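
test_root() decides whether group number a is a power of b (sparse backup superblocks live in groups 1, b, b^2, ... for b in {3, 5, 7}). The old version multiplied num upward, which can overflow int for large group numbers; the rewrite divides downward and cannot. A hedged userspace rendering for experimentation:

/* Sketch only: standalone copy of the rewritten logic. */
#include <stdio.h>

static int is_power_of(unsigned int a, unsigned int b)
{
	while (1) {
		if (a < b)
			return 0;
		if (a == b)
			return 1;
		if (a % b)
			return 0;
		a /= b;	/* strictly decreasing, so the loop terminates */
	}
}

int main(void)
{
	printf("%d %d %d\n", is_power_of(49, 7), is_power_of(343, 7),
	       is_power_of(50, 7));	/* prints: 1 1 0 */
	return 0;
}

(The a == 1 case is handled by the caller, ext4_group_sparse(), which treats groups 0 and 1 specially.)
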
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f8d56e4254e0..3c7d288ae94c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -29,8 +29,7 @@
29#include "ext4.h" 29#include "ext4.h"
30#include "xattr.h" 30#include "xattr.h"
31 31
32static int ext4_dx_readdir(struct file *filp, 32static int ext4_dx_readdir(struct file *, struct dir_context *);
33 void *dirent, filldir_t filldir);
34 33
35/** 34/**
36 * Check if the given dir-inode refers to an htree-indexed directory 35 * Check if the given dir-inode refers to an htree-indexed directory
@@ -103,60 +102,56 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
103 return 1; 102 return 1;
104} 103}
105 104
106static int ext4_readdir(struct file *filp, 105static int ext4_readdir(struct file *file, struct dir_context *ctx)
107 void *dirent, filldir_t filldir)
108{ 106{
109 int error = 0;
110 unsigned int offset; 107 unsigned int offset;
111 int i, stored; 108 int i, stored;
112 struct ext4_dir_entry_2 *de; 109 struct ext4_dir_entry_2 *de;
113 int err; 110 int err;
114 struct inode *inode = file_inode(filp); 111 struct inode *inode = file_inode(file);
115 struct super_block *sb = inode->i_sb; 112 struct super_block *sb = inode->i_sb;
116 int ret = 0;
117 int dir_has_error = 0; 113 int dir_has_error = 0;
118 114
119 if (is_dx_dir(inode)) { 115 if (is_dx_dir(inode)) {
120 err = ext4_dx_readdir(filp, dirent, filldir); 116 err = ext4_dx_readdir(file, ctx);
121 if (err != ERR_BAD_DX_DIR) { 117 if (err != ERR_BAD_DX_DIR) {
122 ret = err; 118 return err;
123 goto out;
124 } 119 }
125 /* 120 /*
126 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
127 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
128 */ 123 */
129 ext4_clear_inode_flag(file_inode(filp), 124 ext4_clear_inode_flag(file_inode(file),
130 EXT4_INODE_INDEX); 125 EXT4_INODE_INDEX);
131 } 126 }
132 127
133 if (ext4_has_inline_data(inode)) { 128 if (ext4_has_inline_data(inode)) {
134 int has_inline_data = 1; 129 int has_inline_data = 1;
135 ret = ext4_read_inline_dir(filp, dirent, filldir, 130 int ret = ext4_read_inline_dir(file, ctx,
136 &has_inline_data); 131 &has_inline_data);
137 if (has_inline_data) 132 if (has_inline_data)
138 return ret; 133 return ret;
139 } 134 }
140 135
141 stored = 0; 136 stored = 0;
142 offset = filp->f_pos & (sb->s_blocksize - 1); 137 offset = ctx->pos & (sb->s_blocksize - 1);
143 138
144 while (!error && !stored && filp->f_pos < inode->i_size) { 139 while (ctx->pos < inode->i_size) {
145 struct ext4_map_blocks map; 140 struct ext4_map_blocks map;
146 struct buffer_head *bh = NULL; 141 struct buffer_head *bh = NULL;
147 142
148 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 143 map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
149 map.m_len = 1; 144 map.m_len = 1;
150 err = ext4_map_blocks(NULL, inode, &map, 0); 145 err = ext4_map_blocks(NULL, inode, &map, 0);
151 if (err > 0) { 146 if (err > 0) {
152 pgoff_t index = map.m_pblk >> 147 pgoff_t index = map.m_pblk >>
153 (PAGE_CACHE_SHIFT - inode->i_blkbits); 148 (PAGE_CACHE_SHIFT - inode->i_blkbits);
154 if (!ra_has_index(&filp->f_ra, index)) 149 if (!ra_has_index(&file->f_ra, index))
155 page_cache_sync_readahead( 150 page_cache_sync_readahead(
156 sb->s_bdev->bd_inode->i_mapping, 151 sb->s_bdev->bd_inode->i_mapping,
157 &filp->f_ra, filp, 152 &file->f_ra, file,
158 index, 1); 153 index, 1);
159 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 154 file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
160 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); 155 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
161 } 156 }
162 157
@@ -166,16 +161,16 @@ static int ext4_readdir(struct file *filp,
166 */ 161 */
167 if (!bh) { 162 if (!bh) {
168 if (!dir_has_error) { 163 if (!dir_has_error) {
169 EXT4_ERROR_FILE(filp, 0, 164 EXT4_ERROR_FILE(file, 0,
170 "directory contains a " 165 "directory contains a "
171 "hole at offset %llu", 166 "hole at offset %llu",
172 (unsigned long long) filp->f_pos); 167 (unsigned long long) ctx->pos);
173 dir_has_error = 1; 168 dir_has_error = 1;
174 } 169 }
175 /* corrupt size? Maybe no more blocks to read */ 170 /* corrupt size? Maybe no more blocks to read */
176 if (filp->f_pos > inode->i_blocks << 9) 171 if (ctx->pos > inode->i_blocks << 9)
177 break; 172 break;
178 filp->f_pos += sb->s_blocksize - offset; 173 ctx->pos += sb->s_blocksize - offset;
179 continue; 174 continue;
180 } 175 }
181 176
@@ -183,21 +178,20 @@ static int ext4_readdir(struct file *filp,
183 if (!buffer_verified(bh) && 178 if (!buffer_verified(bh) &&
184 !ext4_dirent_csum_verify(inode, 179 !ext4_dirent_csum_verify(inode,
185 (struct ext4_dir_entry *)bh->b_data)) { 180 (struct ext4_dir_entry *)bh->b_data)) {
186 EXT4_ERROR_FILE(filp, 0, "directory fails checksum " 181 EXT4_ERROR_FILE(file, 0, "directory fails checksum "
187 "at offset %llu", 182 "at offset %llu",
188 (unsigned long long)filp->f_pos); 183 (unsigned long long)ctx->pos);
189 filp->f_pos += sb->s_blocksize - offset; 184 ctx->pos += sb->s_blocksize - offset;
190 brelse(bh); 185 brelse(bh);
191 continue; 186 continue;
192 } 187 }
193 set_buffer_verified(bh); 188 set_buffer_verified(bh);
194 189
195revalidate:
196 /* If the dir block has changed since the last call to 190 /* If the dir block has changed since the last call to
197 * readdir(2), then we might be pointing to an invalid 191 * readdir(2), then we might be pointing to an invalid
198 * dirent right now. Scan from the start of the block 192 * dirent right now. Scan from the start of the block
199 * to make sure. */ 193 * to make sure. */
200 if (filp->f_version != inode->i_version) { 194 if (file->f_version != inode->i_version) {
201 for (i = 0; i < sb->s_blocksize && i < offset; ) { 195 for (i = 0; i < sb->s_blocksize && i < offset; ) {
202 de = (struct ext4_dir_entry_2 *) 196 de = (struct ext4_dir_entry_2 *)
203 (bh->b_data + i); 197 (bh->b_data + i);
@@ -214,57 +208,46 @@ revalidate:
214 sb->s_blocksize); 208 sb->s_blocksize);
215 } 209 }
216 offset = i; 210 offset = i;
217 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) 211 ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
218 | offset; 212 | offset;
219 filp->f_version = inode->i_version; 213 file->f_version = inode->i_version;
220 } 214 }
221 215
222 while (!error && filp->f_pos < inode->i_size 216 while (ctx->pos < inode->i_size
223 && offset < sb->s_blocksize) { 217 && offset < sb->s_blocksize) {
224 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 218 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
225 if (ext4_check_dir_entry(inode, filp, de, bh, 219 if (ext4_check_dir_entry(inode, file, de, bh,
226 bh->b_data, bh->b_size, 220 bh->b_data, bh->b_size,
227 offset)) { 221 offset)) {
228 /* 222 /*
229 * On error, skip the f_pos to the next block 223 * On error, skip to the next block
230 */ 224 */
231 filp->f_pos = (filp->f_pos | 225 ctx->pos = (ctx->pos |
232 (sb->s_blocksize - 1)) + 1; 226 (sb->s_blocksize - 1)) + 1;
233 brelse(bh); 227 break;
234 ret = stored;
235 goto out;
236 } 228 }
237 offset += ext4_rec_len_from_disk(de->rec_len, 229 offset += ext4_rec_len_from_disk(de->rec_len,
238 sb->s_blocksize); 230 sb->s_blocksize);
239 if (le32_to_cpu(de->inode)) { 231 if (le32_to_cpu(de->inode)) {
240 /* We might block in the next section 232 if (!dir_emit(ctx, de->name,
241 * if the data destination is
242 * currently swapped out. So, use a
243 * version stamp to detect whether or
244 * not the directory has been modified
245 * during the copy operation.
246 */
247 u64 version = filp->f_version;
248
249 error = filldir(dirent, de->name,
250 de->name_len, 233 de->name_len,
251 filp->f_pos,
252 le32_to_cpu(de->inode), 234 le32_to_cpu(de->inode),
253 get_dtype(sb, de->file_type)); 235 get_dtype(sb, de->file_type))) {
254 if (error) 236 brelse(bh);
255 break; 237 return 0;
256 if (version != filp->f_version) 238 }
257 goto revalidate;
258 stored++;
259 } 239 }
260 filp->f_pos += ext4_rec_len_from_disk(de->rec_len, 240 ctx->pos += ext4_rec_len_from_disk(de->rec_len,
261 sb->s_blocksize); 241 sb->s_blocksize);
262 } 242 }
263 offset = 0; 243 offset = 0;
264 brelse(bh); 244 brelse(bh);
245 if (ctx->pos < inode->i_size) {
246 if (!dir_relax(inode))
247 return 0;
248 }
265 } 249 }
266out: 250 return 0;
267 return ret;
268} 251}
269 252
 static inline int is_32bit_api(void)
@@ -492,16 +475,12 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
  * for all entries on the fname linked list. (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file *filp, void *dirent,
-			filldir_t filldir, struct fname *fname)
+static int call_filldir(struct file *file, struct dir_context *ctx,
+			struct fname *fname)
 {
-	struct dir_private_info *info = filp->private_data;
-	loff_t curr_pos;
-	struct inode *inode = file_inode(filp);
-	struct super_block *sb;
-	int error;
-
-	sb = inode->i_sb;
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
 
 	if (!fname) {
 		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
@@ -509,47 +488,44 @@ static int call_filldir(struct file *filp, void *dirent,
 			 inode->i_ino, current->comm);
 		return 0;
 	}
-	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
 	while (fname) {
-		error = filldir(dirent, fname->name,
-				fname->name_len, curr_pos,
-				fname->inode,
-				get_dtype(sb, fname->file_type));
-		if (error) {
-			filp->f_pos = curr_pos;
+		if (!dir_emit(ctx, fname->name,
+				fname->name_len,
+				fname->inode,
+				get_dtype(sb, fname->file_type))) {
 			info->extra_fname = fname;
-			return error;
+			return 1;
 		}
 		fname = fname->next;
 	}
 	return 0;
 }
 
-static int ext4_dx_readdir(struct file *filp,
-			 void *dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dir_private_info *info = filp->private_data;
-	struct inode *inode = file_inode(filp);
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct fname *fname;
 	int ret;
 
 	if (!info) {
-		info = ext4_htree_create_dir_info(filp, filp->f_pos);
+		info = ext4_htree_create_dir_info(file, ctx->pos);
 		if (!info)
 			return -ENOMEM;
-		filp->private_data = info;
+		file->private_data = info;
 	}
 
-	if (filp->f_pos == ext4_get_htree_eof(filp))
+	if (ctx->pos == ext4_get_htree_eof(file))
 		return 0;	/* EOF */
 
 	/* Some one has messed with f_pos; reset the world */
-	if (info->last_pos != filp->f_pos) {
+	if (info->last_pos != ctx->pos) {
 		free_rb_tree_fname(&info->root);
 		info->curr_node = NULL;
 		info->extra_fname = NULL;
-		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
-		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+		info->curr_hash = pos2maj_hash(file, ctx->pos);
+		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
 	}
 
 	/*
@@ -557,7 +533,7 @@ static int ext4_dx_readdir(struct file *filp,
 	 * chain, return them first.
 	 */
 	if (info->extra_fname) {
-		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+		if (call_filldir(file, ctx, info->extra_fname))
 			goto finished;
 		info->extra_fname = NULL;
 		goto next_node;
@@ -571,17 +547,17 @@ static int ext4_dx_readdir(struct file *filp,
 	 * cached entries.
 	 */
 	if ((!info->curr_node) ||
-	    (filp->f_version != inode->i_version)) {
+	    (file->f_version != inode->i_version)) {
 		info->curr_node = NULL;
 		free_rb_tree_fname(&info->root);
-		filp->f_version = inode->i_version;
-		ret = ext4_htree_fill_tree(filp, info->curr_hash,
+		file->f_version = inode->i_version;
+		ret = ext4_htree_fill_tree(file, info->curr_hash,
 					   info->curr_minor_hash,
 					   &info->next_hash);
 		if (ret < 0)
 			return ret;
 		if (ret == 0) {
-			filp->f_pos = ext4_get_htree_eof(filp);
+			ctx->pos = ext4_get_htree_eof(file);
 			break;
 		}
 		info->curr_node = rb_first(&info->root);
@@ -590,7 +566,7 @@ static int ext4_dx_readdir(struct file *filp,
 		fname = rb_entry(info->curr_node, struct fname, rb_hash);
 		info->curr_hash = fname->hash;
 		info->curr_minor_hash = fname->minor_hash;
-		if (call_filldir(filp, dirent, filldir, fname))
+		if (call_filldir(file, ctx, fname))
 			break;
 	next_node:
 		info->curr_node = rb_next(info->curr_node);
@@ -601,7 +577,7 @@ static int ext4_dx_readdir(struct file *filp,
 			info->curr_minor_hash = fname->minor_hash;
 		} else {
 			if (info->next_hash == ~0) {
-				filp->f_pos = ext4_get_htree_eof(filp);
+				ctx->pos = ext4_get_htree_eof(file);
 				break;
 			}
 			info->curr_hash = info->next_hash;
@@ -609,7 +585,7 @@ static int ext4_dx_readdir(struct file *filp,
 		}
 	}
 finished:
-	info->last_pos = filp->f_pos;
+	info->last_pos = ctx->pos;
 	return 0;
 }
 
@@ -624,7 +600,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
 const struct file_operations ext4_dir_operations = {
 	.llseek		= ext4_dir_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext4_readdir,
+	.iterate	= ext4_readdir,
 	.unlocked_ioctl = ext4_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext4_compat_ioctl,
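
The dir.c hunks above are ext4's part of the VFS-wide conversion from the old ->readdir()/filldir_t callback pair to the new ->iterate()/struct dir_context interface: the directory position moves from file->f_pos into ctx->pos, and dir_emit() replaces the raw filldir() call. A rough, self-contained sketch of the contract an iterate method now follows (the examplefs_* names are hypothetical helpers, not ext4 code):

	/* Hedged sketch of the ->iterate() contract. dir_emit() returns
	 * false once the caller's buffer is full; the filesystem simply
	 * stops, and the VFS resumes later from ctx->pos. */
	static int examplefs_iterate(struct file *file, struct dir_context *ctx)
	{
		struct inode *inode = file_inode(file);

		for (; ctx->pos < examplefs_nr_entries(inode); ctx->pos++) {
			struct examplefs_dirent *de;

			de = examplefs_entry(inode, ctx->pos); /* hypothetical */
			if (!dir_emit(ctx, de->name, de->name_len,
				      de->ino, de->file_type))
				return 0; /* buffer full; pos is preserved */
		}
		return 0;
	}

This is also why call_filldir() now returns 1 instead of an error value: under dir_emit(), a full buffer is not an error, just a signal to stop emitting.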
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5aae3d12d400..b577e45425b0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,38 +177,28 @@ struct ext4_map_blocks {
 };
 
 /*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
-	struct inode *inode;
-	sector_t b_blocknr;		/* start block number of extent */
-	size_t b_size;			/* size of extent */
-	unsigned long b_state;		/* state of the extent */
-	unsigned long first_page, next_page;	/* extent of pages */
-	struct writeback_control *wbc;
-	int io_done;
-	int pages_written;
-	int retval;
-};
-
-/*
  * Flags for ext4_io_end->flags
  */
 #define	EXT4_IO_END_UNWRITTEN	0x0001
-#define EXT4_IO_END_ERROR	0x0002
-#define EXT4_IO_END_DIRECT	0x0004
+#define EXT4_IO_END_DIRECT	0x0002
 
 /*
- * For converting uninitialized extents on a work queue.
+ * For converting uninitialized extents on a work queue. 'handle' is used for
+ * buffered writeback.
  */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
+	handle_t		*handle;	/* handle reserved for extent
+						 * conversion */
 	struct inode		*inode;		/* file being written to */
+	struct bio		*bio;		/* Linked list of completed
+						 * bios covering the extent */
 	unsigned int		flag;		/* unwritten or not */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
+	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -581,11 +571,6 @@ enum {
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
 
 /*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED	0x0001
-
-/*
  * ioctl commands
  */
 #define	EXT4_IOC_GETFLAGS		FS_IOC_GETFLAGS
@@ -879,6 +864,7 @@ struct ext4_inode_info {
 	rwlock_t i_es_lock;
 	struct list_head i_es_lru;
 	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
+	unsigned long i_touch_when;	/* jiffies of last accessing */
 
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
@@ -903,12 +889,22 @@ struct ext4_inode_info {
 	qsize_t i_reserved_quota;
 #endif
 
-	/* completed IOs that might need unwritten extents handling */
-	struct list_head i_completed_io_list;
+	/* Lock protecting lists below */
 	spinlock_t i_completed_io_lock;
+	/*
+	 * Completed IOs that need unwritten extents handling and have
+	 * transaction reserved
+	 */
+	struct list_head i_rsv_conversion_list;
+	/*
+	 * Completed IOs that need unwritten extents handling and don't have
+	 * transaction reserved
+	 */
+	struct list_head i_unrsv_conversion_list;
 	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	atomic_t i_unwritten; /* Nr. of inflight conversions pending */
-	struct work_struct i_unwritten_work;	/* deferred extent conversion */
+	struct work_struct i_rsv_conversion_work;
+	struct work_struct i_unrsv_conversion_work;
 
 	spinlock_t i_block_reservation_lock;
 
@@ -1245,7 +1241,6 @@ struct ext4_sb_info {
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
-	unsigned int s_max_writeback_mb_bump;
 	unsigned int s_max_dir_size_kb;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
@@ -1281,8 +1276,10 @@ struct ext4_sb_info {
 	struct flex_groups *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;
 
-	/* workqueue for dio unwritten */
-	struct workqueue_struct *dio_unwritten_wq;
+	/* workqueue for unreserved extent conversions (dio) */
+	struct workqueue_struct *unrsv_conversion_wq;
+	/* workqueue for reserved extent conversions (buffered io) */
+	struct workqueue_struct *rsv_conversion_wq;
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
@@ -1307,6 +1304,7 @@ struct ext4_sb_info {
 	/* Reclaim extents from extent status tree */
 	struct shrinker s_es_shrinker;
 	struct list_head s_es_lru;
+	unsigned long s_es_last_sorted;
 	struct percpu_counter s_extent_cache_cnt;
 	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
 };
@@ -1342,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
 					      struct ext4_io_end *io_end)
 {
 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		/* Writeback has to have conversion transaction reserved */
+		WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
+			!(io_end->flag & EXT4_IO_END_DIRECT));
 		io_end->flag |= EXT4_IO_END_UNWRITTEN;
 		atomic_inc(&EXT4_I(inode)->i_unwritten);
 	}
@@ -1999,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2088,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2096,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
-		struct address_space *mapping, loff_t from,
-		loff_t length, int flags);
+extern int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2111,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 				  const struct iovec *iov, loff_t offset,
 				  unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
 extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
 				 ext4_lblk_t first, ext4_lblk_t stop);
@@ -2166,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
 				    ext4_group_t ngroup);
 extern const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16]);
+
 extern __printf(4, 5)
 void __ext4_error(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_error(sb, message...)	__ext4_error(sb, __func__, \
-						     __LINE__, ## message)
 extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
 		      const char *, ...);
 extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
 		     const char *, ...);
 extern void __ext4_std_error(struct super_block *, const char *,
 			     unsigned int, int);
 extern __printf(4, 5)
 void __ext4_abort(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_abort(sb, message...)	__ext4_abort(sb, __func__, \
-						       __LINE__, ## message)
 extern __printf(4, 5)
 void __ext4_warning(struct super_block *, const char *, unsigned int,
 		    const char *, ...);
-#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, \
-						       __LINE__, ## message)
 extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
 extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
 			   const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
-						       __LINE__, msg)
 extern __printf(7, 8)
 void __ext4_grp_locked_error(const char *, unsigned int,
 			     struct super_block *, ext4_group_t,
 			     unsigned long, ext4_fsblk_t,
 			     const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
-	__ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+	__ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+	__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...)					\
+	__ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...)					\
+	__ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...)					\
+	__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...)					\
+	__ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+	__ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+				fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_inode(inode, "", 0, block, " ");			\
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_file(file, "", 0, block, " ");			\
+} while (0)
+#define ext4_error(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error(sb, "", 0, " ");					\
+} while (0)
+#define ext4_abort(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_abort(sb, "", 0, " ");					\
+} while (0)
+#define ext4_warning(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_warning(sb, "", 0, " ");					\
+} while (0)
+#define ext4_msg(sb, level, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_msg(sb, "", " ");					\
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");	\
+} while (0)
+
+#endif
+
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 				      __u32 compat);
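
The CONFIG_PRINTK split above is worth a note: in the =n branch the macros route the format string through no_printk(), which generates no code but preserves printf-style argument checking, and they pass empty strings and 0 in place of __func__/__LINE__ so those literals are not kept in the image. A generic sketch of the same idiom, with my_error()/my_report() as hypothetical stand-ins for the __ext4_* helpers:

	#ifdef CONFIG_PRINTK
	#define my_error(sb, fmt, ...) \
		my_report(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
	#else
	#define my_error(sb, fmt, ...)					\
	do {								\
		no_printk(fmt, ##__VA_ARGS__); /* type-check, emit nothing */ \
		my_report(sb, "", 0, " ");   /* still flag the error */	\
	} while (0)
	#endif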
@@ -2312,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 {
 	 struct ext4_group_info ***grp_info;
 	 long indexv, indexh;
+	 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
 	 grp_info = EXT4_SB(sb)->s_group_info;
 	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
 	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2515,7 +2573,7 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
 				      struct inode *parent,
 				      struct inode *inode);
 extern int ext4_read_inline_dir(struct file *filp,
-				void *dirent, filldir_t filldir,
+				struct dir_context *ctx,
 				int *has_inline_data);
 extern int htree_inlinedir_to_tree(struct file *dir_file,
 				   struct inode *dir, ext4_lblk_t block,
@@ -2598,8 +2656,7 @@ struct ext4_extent;
 
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
-				       int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(handle_t *, struct inode *);
@@ -2609,8 +2666,8 @@ extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
 			  loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-			  ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+					  loff_t offset, ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map, int flags);
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2650,12 +2707,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
-extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern void ext4_end_io_work(struct work_struct *work);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+				struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_end_io_unrsv_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       struct page *page,
@@ -2668,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
 extern int ext4_mmp_csum_verify(struct super_block *sb,
 				struct mmp_struct *mmp);
 
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
 	 = BH_JBDPrivateStart,
 	BH_AllocFromCluster,	/* allocated blocks were part of already
-				 * allocated cluster. Note that this flag will
-				 * never, ever appear in a buffer_head's state
-				 * flag. See EXT4_MAP_FROM_CLUSTER to see where
-				 * this is used. */
+				 * allocated cluster. */
 };
 
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
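
Taken together, the ext4.h hunks replace the explicit free/shutdown API for ext4_io_end_t with reference counting (the new atomic_t count field plus ext4_get_io_end()/ext4_put_io_end()/ext4_put_io_end_defer()), and split the completed-IO lists and conversion workqueues into reserved and unreserved variants. A hedged sketch of the lifetime these declarations imply (submit_one_bio() is a hypothetical placeholder, not the actual ext4 submission path):

	ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); /* count = 1 */

	/* each bio in flight pins the io_end */
	submit_one_bio(bio, ext4_get_io_end(io_end));

	/* bio completion (atomic context): drop a reference via workqueue */
	ext4_put_io_end_defer(io_end);

	/* submitter drops its own reference; the last put either frees the
	 * io_end or queues the unwritten-extent conversion */
	ext4_put_io_end(io_end);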
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 451eb4045330..72a3600aedbd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)
 /*
  * Wrappers for jbd2_journal_start/end.
  */
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
 {
 	journal_t *journal;
 
 	might_sleep();
-
-	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
-		return ERR_PTR(-EROFS);
-
+		return -EROFS;
 	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
 	journal = EXT4_SB(sb)->s_journal;
-	if (!journal)
-		return ext4_get_nojournal();
 	/*
 	 * Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly.
 	 */
-	if (is_journal_aborted(journal)) {
+	if (journal && is_journal_aborted(journal)) {
 		ext4_abort(sb, "Detected aborted journal");
-		return ERR_PTR(-EROFS);
+		return -EROFS;
 	}
-	return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+	return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+				  int type, int blocks, int rsv_blocks)
+{
+	journal_t *journal;
+	int err;
+
+	trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	journal = EXT4_SB(sb)->s_journal;
+	if (!journal)
+		return ext4_get_nojournal();
+	return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+				   type, line);
 }
 
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
 	return err;
 }
 
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type)
+{
+	struct super_block *sb;
+	int err;
+
+	if (!ext4_handle_valid(handle))
+		return ext4_get_nojournal();
+
+	sb = handle->h_journal->j_private;
+	trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+					  _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0) {
+		jbd2_journal_free_reserved(handle);
+		return ERR_PTR(err);
+	}
+
+	err = jbd2_journal_start_reserved(handle, type, line);
+	if (err < 0)
+		return ERR_PTR(err);
+	return handle;
+}
+
 void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn, struct buffer_head *bh,
 			       handle_t *handle, int err)
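
The refactor above pulls the read-only and aborted-journal checks into ext4_journal_check_start() so they are shared by the normal start path and the new __ext4_journal_start_reserved(). The latter is the attach side of jbd2's reserved handles: credits were set aside earlier, and jbd2_journal_start_reserved() binds the pre-reserved handle to a running transaction from a context (the conversion workqueue) that must not fail for lack of credits. A hedged sketch of a consumer, mirroring what ext4_convert_unwritten_extents() does in the extents.c diff further down:

	/* bind a pre-reserved handle to the running transaction; on
	 * failure the reservation is freed inside the helper */
	handle = ext4_journal_start_reserved(rsv_handle, EXT4_HT_EXT_CONVERT);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	/* ... do the journalled work within the reserved credits ... */
	err = ext4_journal_stop(handle);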
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index c8c6885406db..2877258d9497 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
 #define EXT4_HT_MIGRATE          8
 #define EXT4_HT_MOVE_EXTENTS     9
 #define EXT4_HT_XATTR           10
-#define EXT4_HT_MAX             11
+#define EXT4_HT_EXT_CONVERT     11
+#define EXT4_HT_MAX             12
 
 /**
  * struct ext4_journal_cb_entry - Base structure for callback information.
@@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
 	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 
 handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks);
+				  int type, int blocks, int rsv_blocks);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
 }
 
 #define ext4_journal_start_sb(sb, type, nblocks)			\
-	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
 
 #define ext4_journal_start(inode, type, nblocks)			\
-	__ext4_journal_start((inode), __LINE__, (type), (nblocks))
+	__ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+	__ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
 
 static inline handle_t *__ext4_journal_start(struct inode *inode,
 					     unsigned int line, int type,
-					     int nblocks)
+					     int blocks, int rsv_blocks)
 {
-	return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+	return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+				       rsv_blocks);
 }
 
 #define ext4_journal_stop(handle) \
 	__ext4_journal_stop(__func__, __LINE__, (handle))
 
+#define ext4_journal_start_reserved(handle, type) \
+	__ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_free_reserved(handle);
+}
+
 static inline handle_t *ext4_journal_current_handle(void)
 {
 	return journal_current_handle();
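
On the caller side, the reservation is made with the new ext4_journal_start_with_reserve() macro: 'blocks' credits for the immediate work plus 'rsv_blocks' set aside for a later, must-succeed continuation. An illustrative call shape (the handle type and credit counts here are examples, not taken from a specific call site; how the reserved handle is handed off to the io_end is elided):

	handle_t *handle;

	handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE,
						 needed_blocks, rsv_blocks);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	/* ... journalled writeback work under 'handle' ... */
	ext4_journal_stop(handle);

The matching second phase, attaching the reserved credits to a transaction later, is sketched after the ext4_jbd2.c diff above.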
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bc0f1910b9cf..72ba4705d4fa 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2125,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 		next_del = ext4_find_delayed_extent(inode, &es);
 		if (!exists && next_del) {
 			exists = 1;
-			flags |= FIEMAP_EXTENT_DELALLOC;
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
 		}
 		up_read(&EXT4_I(inode)->i_data_sem);
 
@@ -2328,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 }
 
 /*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
  *
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worst case, each tree level
+ * index/leaf needs to be changed in case of the tree split.
  *
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
  */
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
 {
 	int index;
 	int depth;
@@ -2349,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 
 	depth = ext_depth(inode);
 
-	if (chunk)
+	if (extents <= 1)
 		index = depth * 2;
 	else
 		index = depth * 3;
@@ -2357,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 	return index;
 }
 
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+	else if (ext4_should_journal_data(inode))
+		return EXT4_FREE_BLOCKS_FORGET;
+	return 0;
+}
+
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			      struct ext4_extent *ex,
-			      ext4_fsblk_t *partial_cluster,
+			      long long *partial_cluster,
 			      ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len = ext4_ext_get_actual_len(ex);
 	ext4_fsblk_t pblk;
-	int flags = 0;
-
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
-	else if (ext4_should_journal_data(inode))
-		flags |= EXT4_FREE_BLOCKS_FORGET;
+	int flags = get_default_free_blocks_flags(inode);
 
 	/*
 	 * For bigalloc file systems, we never free a partial cluster
@@ -2388,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	 * partial cluster here.
 	 */
 	pblk = ext4_ext_pblock(ex) + ee_len - 1;
-	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+	if ((*partial_cluster > 0) &&
+	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
 				 sbi->s_cluster_ratio, flags);
@@ -2414,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		ext4_lblk_t num;
+		unsigned int unaligned;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		pblk = ext4_ext_pblock(ex) + ee_len - num;
-		ext_debug("free last %u blocks starting %llu\n", num, pblk);
+		/*
+		 * Usually we want to free partial cluster at the end of the
+		 * extent, except for the situation when the cluster is still
+		 * used by any other extent (partial_cluster is negative).
+		 */
+		if (*partial_cluster < 0 &&
+		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+		ext_debug("free last %u blocks starting %llu partial %lld\n",
+			  num, pblk, *partial_cluster);
 		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
 		/*
 		 * If the block range to be freed didn't start at the
 		 * beginning of a cluster, and we removed the entire
-		 * extent, save the partial cluster here, since we
-		 * might need to delete if we determine that the
-		 * truncate operation has removed all of the blocks in
-		 * the cluster.
+		 * extent and the cluster is not used by any other extent,
+		 * save the partial cluster here, since we might need to
+		 * delete if we determine that the truncate operation has
+		 * removed all of the blocks in the cluster.
+		 *
+		 * On the other hand, if we did not manage to free the whole
+		 * extent, we have to mark the cluster as used (store negative
+		 * cluster number in partial_cluster).
 		 */
-		if (pblk & (sbi->s_cluster_ratio - 1) &&
-		    (ee_len == num))
+		unaligned = pblk & (sbi->s_cluster_ratio - 1);
+		if (unaligned && (ee_len == num) &&
+		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 			*partial_cluster = EXT4_B2C(sbi, pblk);
-		else
+		else if (unaligned)
+			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+		else if (*partial_cluster > 0)
 			*partial_cluster = 0;
-	} else if (from == le32_to_cpu(ex->ee_block)
-		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		/* head removal */
-		ext4_lblk_t num;
-		ext4_fsblk_t start;
-
-		num = to - from;
-		start = ext4_ext_pblock(ex);
-
-		ext_debug("free first %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
-	} else {
-		printk(KERN_INFO "strange request: removal(2) "
-		       "%u-%u from %u:%u\n",
-		       from, to, le32_to_cpu(ex->ee_block), ee_len);
-	}
+	} else
+		ext4_error(sbi->s_sb, "strange request: removal(2) "
+			   "%u-%u from %u:%u\n",
+			   from, to, le32_to_cpu(ex->ee_block), ee_len);
 	return 0;
 }
 
@@ -2461,12 +2470,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
  * @handle: The journal handle
  * @inode:  The files inode
  * @path:   The path to the leaf
+ * @partial_cluster: The cluster which we'll have to free if all extents
+ *                   have been released from it. It gets negative in case
+ *                   that the cluster is still used.
  * @start:  The first block to remove
  * @end:    The last block to remove
  */
 static int
 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
-		 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster,
+		 struct ext4_ext_path *path,
+		 long long *partial_cluster,
 		 ext4_lblk_t start, ext4_lblk_t end)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2479,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	ext4_fsblk_t pblk;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2490,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		return -EIO;
 	}
 	/* find where to start removing */
-	ex = EXT_LAST_EXTENT(eh);
+	ex = path[depth].p_ext;
+	if (!ex)
+		ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2517,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 
 		/* If this extent is beyond the end of the hole, skip it */
 		if (end < ex_ee_block) {
+			/*
+			 * We're going to skip this extent and move to another,
+			 * so if this extent is not cluster aligned we have
+			 * to mark the current cluster as used to avoid
+			 * accidentally freeing it later on
+			 */
+			pblk = ext4_ext_pblock(ex);
+			if (pblk & (sbi->s_cluster_ratio - 1))
+				*partial_cluster =
+					-((long long)EXT4_B2C(sbi, pblk));
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2592,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 					sizeof(struct ext4_extent));
 			}
 			le16_add_cpu(&eh->eh_entries, -1);
-		} else
+		} else if (*partial_cluster > 0)
 			*partial_cluster = 0;
 
 		err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2610,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		err = ext4_ext_correct_indexes(handle, inode, path);
 
 	/*
-	 * If there is still a entry in the leaf node, check to see if
-	 * it references the partial cluster.  This is the only place
-	 * where it could; if it doesn't, we can free the cluster.
+	 * Free the partial cluster only if the current extent does not
+	 * reference it. Otherwise we might free a used cluster.
 	 */
-	if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) &&
+	if (*partial_cluster > 0 &&
 	    (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
 	     *partial_cluster)) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
@@ -2664,7 +2686,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path = NULL;
-	ext4_fsblk_t partial_cluster = 0;
+	long long partial_cluster = 0;
 	handle_t *handle;
 	int i = 0, err = 0;
 
@@ -2676,7 +2698,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 		return PTR_ERR(handle);
 
 again:
-	trace_ext4_ext_remove_space(inode, start, depth);
+	trace_ext4_ext_remove_space(inode, start, end, depth);
 
 	/*
 	 * Check if we are removing extents inside the extent tree. If that
@@ -2813,6 +2835,9 @@ again:
 				err = -EIO;
 				break;
 			}
+			/* Yield here to deal with large extent trees.
+			 * Should be a no-op if we did IO above. */
+			cond_resched();
 			if (WARN_ON(i + 1 > depth)) {
 				err = -EIO;
 				break;
@@ -2844,17 +2869,14 @@ again:
 		}
 	}
 
-	trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster,
-			path->p_hdr->eh_entries);
+	trace_ext4_ext_remove_space_done(inode, start, end, depth,
+			partial_cluster, path->p_hdr->eh_entries);
 
 	/* If we still have something in the partial cluster and we have removed
 	 * even the first extent, then we should free the blocks in the partial
 	 * cluster as well. */
-	if (partial_cluster && path->p_hdr->eh_entries == 0) {
-		int flags = EXT4_FREE_BLOCKS_FORGET;
-
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			flags |= EXT4_FREE_BLOCKS_METADATA;
+	if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
+		int flags = get_default_free_blocks_flags(inode);
 
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -4242,8 +4264,8 @@ got_allocated_blocks:
 		/* not a good idea to call discard here directly,
 		 * but otherwise we'd need to call it every free() */
 		ext4_discard_preallocations(inode);
-		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
-				 ext4_ext_get_actual_len(&newex), fb_flags);
+		ext4_free_blocks(handle, inode, NULL, newblock,
+				 EXT4_C2B(sbi, allocated_clusters), fb_flags);
 		goto out2;
 	}
 
@@ -4363,8 +4385,9 @@ out2:
 	}
 
 out3:
-	trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
-
+	trace_ext4_ext_map_blocks_exit(inode, flags, map,
+				       err ? err : allocated);
+	ext4_es_lru_add(inode);
 	return err ? err : allocated;
 }
 
@@ -4386,9 +4409,20 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)
 
 	last_block = (inode->i_size + sb->s_blocksize - 1)
 			>> EXT4_BLOCK_SIZE_BITS(sb);
+retry:
 	err = ext4_es_remove_extent(inode, last_block,
 				    EXT_MAX_BLOCKS - last_block);
+	if (err == -ENOMEM) {
+		cond_resched();
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+		goto retry;
+	}
+	if (err) {
+		ext4_std_error(inode->i_sb, err);
+		return;
+	}
 	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+	ext4_std_error(inode->i_sb, err);
 }
 
 static void ext4_falloc_update_inode(struct inode *inode,
@@ -4446,7 +4480,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		return -EOPNOTSUPP;
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
-		return ext4_punch_hole(file, offset, len);
+		return ext4_punch_hole(inode, offset, len);
 
 	ret = ext4_convert_inline_data(inode);
 	if (ret)
@@ -4548,10 +4582,9 @@ retry:
  * function, to convert the fallocated extents after IO is completed.
  * Returns 0 on success.
  */
-int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-				    ssize_t len)
+int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+				   loff_t offset, ssize_t len)
 {
-	handle_t *handle;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
@@ -4566,16 +4599,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
 		      map.m_lblk);
 	/*
-	 * credits to insert 1 extent into extent tree
+	 * This is somewhat ugly but the idea is clear: When transaction is
+	 * reserved, everything goes into it. Otherwise we rather start several
+	 * smaller transactions for conversion of each extent separately.
 	 */
-	credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	if (handle) {
+		handle = ext4_journal_start_reserved(handle,
+						     EXT4_HT_EXT_CONVERT);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		credits = 0;
+	} else {
+		/*
+		 * credits to insert 1 extent into extent tree
+		 */
+		credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	}
 	while (ret >= 0 && ret < max_blocks) {
 		map.m_lblk += ret;
 		map.m_len = (max_blocks -= ret);
-		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			break;
+		if (credits) {
+			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+						    credits);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				break;
+			}
 		}
 		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
@@ -4586,10 +4635,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 				     inode->i_ino, map.m_lblk,
 				     map.m_len, ret);
 		ext4_mark_inode_dirty(handle, inode);
-		ret2 = ext4_journal_stop(handle);
-		if (ret <= 0 || ret2 )
+		if (credits)
+			ret2 = ext4_journal_stop(handle);
+		if (ret <= 0 || ret2)
 			break;
 	}
+	if (!credits)
+		ret2 = ext4_journal_stop(handle);
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -4659,7 +4711,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 		error = ext4_get_inode_loc(inode, &iloc);
 		if (error)
 			return error;
-		physical = iloc.bh->b_blocknr << blockbits;
+		physical = (__u64)iloc.bh->b_blocknr << blockbits;
 		offset = EXT4_GOOD_OLD_INODE_SIZE +
 				EXT4_I(inode)->i_extra_isize;
 		physical += offset;
@@ -4667,7 +4719,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
 		flags |= FIEMAP_EXTENT_DATA_INLINE;
 		brelse(iloc.bh);
 	} else { /* external block */
-		physical = EXT4_I(inode)->i_file_acl << blockbits;
+		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
 		length = inode->i_sb->s_blocksize;
 	}
 
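
Several of the extents.c hunks share one theme: partial_cluster changes from an unsigned ext4_fsblk_t to a signed long long so it can carry a third state. A positive value still names a bigalloc cluster that may need freeing once the removal finishes, zero means nothing is pending, and a negative value now records "this cluster is shared with an extent we are keeping, never free it". A standalone toy illustration of the encoding (plain C, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		long long partial_cluster = 0;	/* 0: nothing pending */
		long long cluster = 42;		/* example cluster number */

		partial_cluster = cluster;	/* > 0: candidate for freeing */
		partial_cluster = -cluster;	/* < 0: still in use, keep it */

		if (partial_cluster > 0)
			printf("may free cluster %lld\n", partial_cluster);
		else if (partial_cluster < 0)
			printf("cluster %lld is still referenced\n",
			       -partial_cluster);
		return 0;
	}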
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e6941e622d31..91cb110da1b4 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -10,6 +10,7 @@
  *  Ext4 extents status tree core functions.
  */
 #include <linux/rbtree.h>
+#include <linux/list_sort.h>
 #include "ext4.h"
 #include "extents_status.h"
 #include "ext4_extents.h"
@@ -147,6 +148,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 			      ext4_lblk_t end);
 static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
 				       int nr_to_scan);
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+			    struct ext4_inode_info *locked_ei);
 
 int __init ext4_init_es(void)
 {
@@ -291,7 +294,6 @@ out:
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
 }
 
@@ -439,7 +441,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
439 */ 441 */
440 if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { 442 if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
441 if (in_range(es->es_lblk, ee_block, ee_len)) { 443 if (in_range(es->es_lblk, ee_block, ee_len)) {
442 pr_warn("ES insert assertation failed for " 444 pr_warn("ES insert assertion failed for "
443 "inode: %lu we can find an extent " 445 "inode: %lu we can find an extent "
444 "at block [%d/%d/%llu/%c], but we " 446 "at block [%d/%d/%llu/%c], but we "
445 "want to add an delayed/hole extent " 447 "want to add an delayed/hole extent "
@@ -458,7 +460,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
458 */ 460 */
459 if (es->es_lblk < ee_block || 461 if (es->es_lblk < ee_block ||
460 ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { 462 ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
461 pr_warn("ES insert assertation failed for inode: %lu " 463 pr_warn("ES insert assertion failed for inode: %lu "
462 "ex_status [%d/%d/%llu/%c] != " 464 "ex_status [%d/%d/%llu/%c] != "
463 "es_status [%d/%d/%llu/%c]\n", inode->i_ino, 465 "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
464 ee_block, ee_len, ee_start, 466 ee_block, ee_len, ee_start,
@@ -468,7 +470,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
468 } 470 }
469 471
470 if (ee_status ^ es_status) { 472 if (ee_status ^ es_status) {
471 pr_warn("ES insert assertation failed for inode: %lu " 473 pr_warn("ES insert assertion failed for inode: %lu "
472 "ex_status [%d/%d/%llu/%c] != " 474 "ex_status [%d/%d/%llu/%c] != "
473 "es_status [%d/%d/%llu/%c]\n", inode->i_ino, 475 "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
474 ee_block, ee_len, ee_start, 476 ee_block, ee_len, ee_start,
@@ -481,7 +483,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
 	 * that we don't want to add an written/unwritten extent.
 	 */
 	if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
-		pr_warn("ES insert assertation failed for inode: %lu "
+		pr_warn("ES insert assertion failed for inode: %lu "
 			"can't find an extent at block %d but we want "
 			"to add an written/unwritten extent "
 			"[%d/%d/%llu/%llx]\n", inode->i_ino,
@@ -519,7 +521,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
 		 * We want to add a delayed/hole extent but this
 		 * block has been allocated.
 		 */
-		pr_warn("ES insert assertation failed for inode: %lu "
+		pr_warn("ES insert assertion failed for inode: %lu "
 			"We can find blocks but we want to add a "
 			"delayed/hole extent [%d/%d/%llu/%llx]\n",
 			inode->i_ino, es->es_lblk, es->es_len,
@@ -527,13 +529,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
 		return;
 	} else if (ext4_es_is_written(es)) {
 		if (retval != es->es_len) {
-			pr_warn("ES insert assertation failed for "
+			pr_warn("ES insert assertion failed for "
 				"inode: %lu retval %d != es_len %d\n",
 				inode->i_ino, retval, es->es_len);
 			return;
 		}
 		if (map.m_pblk != ext4_es_pblock(es)) {
-			pr_warn("ES insert assertation failed for "
+			pr_warn("ES insert assertion failed for "
 				"inode: %lu m_pblk %llu != "
 				"es_pblk %llu\n",
 				inode->i_ino, map.m_pblk,
@@ -549,7 +551,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
 		}
 	} else if (retval == 0) {
 		if (ext4_es_is_written(es)) {
-			pr_warn("ES insert assertation failed for inode: %lu "
+			pr_warn("ES insert assertion failed for inode: %lu "
 				"We can't find the block but we want to add "
 				"an written extent [%d/%d/%llu/%llx]\n",
 				inode->i_ino, es->es_lblk, es->es_len,
@@ -632,10 +634,8 @@ out:
 }
 
 /*
- * ext4_es_insert_extent() adds a space to a extent status tree.
- *
- * ext4_es_insert_extent is called by ext4_da_write_begin and
- * ext4_es_remove_extent.
+ * ext4_es_insert_extent() adds information to an inode's extent
+ * status tree.
  *
  * Return 0 on success, error code on failure.
  */
@@ -667,12 +667,17 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 	err = __es_remove_extent(inode, lblk, end);
 	if (err != 0)
 		goto error;
+retry:
 	err = __es_insert_extent(inode, &newes);
+	if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+					       EXT4_I(inode)))
+		goto retry;
+	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
+		err = 0;
 
 error:
 	write_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	ext4_es_print_tree(inode);
 
 	return err;
@@ -734,7 +739,6 @@ out:
 
 	read_unlock(&EXT4_I(inode)->i_es_lock);
 
-	ext4_es_lru_add(inode);
 	trace_ext4_es_lookup_extent_exit(inode, es, found);
 	return found;
 }
@@ -748,8 +752,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 	struct extent_status orig_es;
 	ext4_lblk_t len1, len2;
 	ext4_fsblk_t block;
-	int err = 0;
+	int err;
 
+retry:
+	err = 0;
 	es = __es_tree_search(&tree->root, lblk);
 	if (!es)
 		goto out;
@@ -784,6 +790,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 		if (err) {
 			es->es_lblk = orig_es.es_lblk;
 			es->es_len = orig_es.es_len;
+			if ((err == -ENOMEM) &&
+			    __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
+					     EXT4_I(inode)))
+				goto retry;
 			goto out;
 		}
 	} else {
@@ -878,38 +888,64 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
 			      EXTENT_STATUS_WRITTEN);
 }
 
-static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
+				     struct list_head *b)
+{
+	struct ext4_inode_info *eia, *eib;
+	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
+	eib = list_entry(b, struct ext4_inode_info, i_es_lru);
+
+	if (eia->i_touch_when == eib->i_touch_when)
+		return 0;
+	if (time_after(eia->i_touch_when, eib->i_touch_when))
+		return 1;
+	else
+		return -1;
+}
+
+static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
+			    struct ext4_inode_info *locked_ei)
 {
-	struct ext4_sb_info *sbi = container_of(shrink,
-					struct ext4_sb_info, s_es_shrinker);
 	struct ext4_inode_info *ei;
-	struct list_head *cur, *tmp, scanned;
-	int nr_to_scan = sc->nr_to_scan;
+	struct list_head *cur, *tmp;
+	LIST_HEAD(skiped);
 	int ret, nr_shrunk = 0;
 
-	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
-	trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
-
-	if (!nr_to_scan)
-		return ret;
+	spin_lock(&sbi->s_es_lru_lock);
 
-	INIT_LIST_HEAD(&scanned);
+	/*
+	 * If the inode that is at the head of LRU list is newer than
+	 * last_sorted time, that means that we need to sort this list.
+	 */
+	ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
+	if (sbi->s_es_last_sorted < ei->i_touch_when) {
+		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
+		sbi->s_es_last_sorted = jiffies;
+	}
 
-	spin_lock(&sbi->s_es_lru_lock);
 	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
-		list_move_tail(cur, &scanned);
+		/*
+		 * If we have already reclaimed all extents from extent
+		 * status tree, just stop the loop immediately.
+		 */
+		if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
+			break;
 
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 
-		read_lock(&ei->i_es_lock);
-		if (ei->i_es_lru_nr == 0) {
-			read_unlock(&ei->i_es_lock);
+		/* Skip the inode that is newer than the last_sorted time */
+		if (sbi->s_es_last_sorted < ei->i_touch_when) {
+			list_move_tail(cur, &skiped);
 			continue;
 		}
-		read_unlock(&ei->i_es_lock);
+
+		if (ei->i_es_lru_nr == 0 || ei == locked_ei)
+			continue;
 
 		write_lock(&ei->i_es_lock);
 		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+		if (ei->i_es_lru_nr == 0)
+			list_del_init(&ei->i_es_lru);
 		write_unlock(&ei->i_es_lock);
 
 		nr_shrunk += ret;
@@ -917,29 +953,50 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 		if (nr_to_scan == 0)
 			break;
 	}
-	list_splice_tail(&scanned, &sbi->s_es_lru);
+
+	/* Move the newer inodes into the tail of the LRU list. */
+	list_splice_tail(&skiped, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 
+	if (locked_ei && nr_shrunk == 0)
+		nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
+
+	return nr_shrunk;
+}
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct ext4_sb_info *sbi = container_of(shrink,
+					struct ext4_sb_info, s_es_shrinker);
+	int nr_to_scan = sc->nr_to_scan;
+	int ret, nr_shrunk;
+
+	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+	trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
+
+	if (!nr_to_scan)
+		return ret;
+
+	nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
+
 	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
 	trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
 	return ret;
 }
 
-void ext4_es_register_shrinker(struct super_block *sb)
+void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 {
-	struct ext4_sb_info *sbi;
-
-	sbi = EXT4_SB(sb);
 	INIT_LIST_HEAD(&sbi->s_es_lru);
 	spin_lock_init(&sbi->s_es_lru_lock);
+	sbi->s_es_last_sorted = 0;
 	sbi->s_es_shrinker.shrink = ext4_es_shrink;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
 	register_shrinker(&sbi->s_es_shrinker);
 }
 
-void ext4_es_unregister_shrinker(struct super_block *sb)
+void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
 {
-	unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+	unregister_shrinker(&sbi->s_es_shrinker);
 }
 
 void ext4_es_lru_add(struct inode *inode)
@@ -947,11 +1004,14 @@ void ext4_es_lru_add(struct inode *inode)
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
+	ei->i_touch_when = jiffies;
+
+	if (!list_empty(&ei->i_es_lru))
+		return;
+
 	spin_lock(&sbi->s_es_lru_lock);
 	if (list_empty(&ei->i_es_lru))
 		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
-	else
-		list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
 	spin_unlock(&sbi->s_es_lru_lock);
 }
 
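The reworked shrinker above keeps the per-superblock LRU ordered by i_touch_when and only re-sorts when the list head is newer than s_es_last_sorted. A user-space analogue of the comparator contract (assumptions: qsort() stands in for the kernel's list_sort(), and a plain comparison for the jiffies-wraparound-safe time_after()):

#include <stdio.h>
#include <stdlib.h>

struct fake_inode {
	unsigned long i_touch_when;	/* time of last extent-status touch */
};

/* sort least-recently-touched first, so reclaim scans cold inodes early */
static int touch_time_cmp(const void *a, const void *b)
{
	const struct fake_inode *eia = a, *eib = b;

	if (eia->i_touch_when == eib->i_touch_when)
		return 0;
	return eia->i_touch_when > eib->i_touch_when ? 1 : -1;
}

int main(void)
{
	struct fake_inode lru[] = { {300}, {100}, {200} };
	size_t i;

	qsort(lru, 3, sizeof(lru[0]), touch_time_cmp);
	for (i = 0; i < 3; i++)
		printf("%lu\n", lru[i].i_touch_when);	/* 100 200 300 */
	return 0;
}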
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f740eb03b707..e936730cc5b0 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -39,6 +39,7 @@
 				 EXTENT_STATUS_DELAYED | \
 				 EXTENT_STATUS_HOLE)
 
+struct ext4_sb_info;
 struct ext4_extent;
 
 struct extent_status {
@@ -119,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es,
 	es->es_pblk = block;
 }
 
-extern void ext4_es_register_shrinker(struct super_block *sb);
-extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
+extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_lru_add(struct inode *inode);
 extern void ext4_es_lru_del(struct inode *inode);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b1b4d51b5d86..6f4cc567c382 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 	blkbits = inode->i_sb->s_blocksize_bits;
 	startoff = *offset;
 	lastoff = startoff;
-	endoff = (map->m_lblk + map->m_len) << blkbits;
+	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
 
 	index = startoff >> PAGE_CACHE_SHIFT;
 	end = endoff >> PAGE_CACHE_SHIFT;
@@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
@@ -468,7 +468,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		ext4_es_find_delayed_extent_range(inode, last, last, &es);
 		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			if (last != start)
-				dataoff = last << blkbits;
+				dataoff = (loff_t)last << blkbits;
 			break;
 		}
 
@@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		}
 
 		last++;
-		dataoff = last << blkbits;
+		dataoff = (loff_t)last << blkbits;
 	} while (last <= end);
 
 	mutex_unlock(&inode->i_mutex);
@@ -494,17 +494,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 	if (dataoff > isize)
 		return -ENXIO;
 
-	if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		return -EINVAL;
-	if (dataoff > maxsize)
-		return -EINVAL;
-
-	if (dataoff != file->f_pos) {
-		file->f_pos = dataoff;
-		file->f_version = 0;
-	}
-
-	return dataoff;
+	return vfs_setpos(file, dataoff, maxsize);
 }
 
 /*
@@ -540,7 +530,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 		ret = ext4_map_blocks(NULL, inode, &map, 0);
 		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
 			last += ret;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
@@ -551,7 +541,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 		ext4_es_find_delayed_extent_range(inode, last, last, &es);
 		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
 			last = es.es_lblk + es.es_len;
-			holeoff = last << blkbits;
+			holeoff = (loff_t)last << blkbits;
 			continue;
 		}
 
@@ -566,7 +556,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 							  &map, &holeoff);
 			if (!unwritten) {
 				last += ret;
-				holeoff = last << blkbits;
+				holeoff = (loff_t)last << blkbits;
 				continue;
 			}
 		}
@@ -580,17 +570,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 	if (holeoff > isize)
 		holeoff = isize;
 
-	if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		return -EINVAL;
-	if (holeoff > maxsize)
-		return -EINVAL;
-
-	if (holeoff != file->f_pos) {
-		file->f_pos = holeoff;
-		file->f_version = 0;
-	}
-
-	return holeoff;
+	return vfs_setpos(file, holeoff, maxsize);
 }
 
 /*
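Both seek helpers above now delegate the f_pos update to vfs_setpos(). A hedged, self-contained sketch of the bounds-check-and-update semantics the deleted open-coded blocks implemented (simplified: the real VFS helper also honors FMODE_UNSIGNED_OFFSET, and struct file here is a minimal stand-in):

#include <stdio.h>

typedef long long loff_t;

struct file {				/* minimal stand-in, not the kernel struct */
	loff_t f_pos;
	unsigned long long f_version;
};

static loff_t setpos_sketch(struct file *file, loff_t offset, loff_t maxsize)
{
	if (offset < 0 || offset > maxsize)
		return -1;		/* -EINVAL in the kernel */
	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;	/* force readdir-style revalidation */
	}
	return offset;
}

int main(void)
{
	struct file f = { .f_pos = 0, .f_version = 42 };

	printf("pos = %lld\n", setpos_sketch(&f, 4096, 1 << 20));	/* 4096 */
	printf("version = %llu\n", f.f_version);			/* 0 */
	return 0;
}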
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e0ba8a408def..a8bc47f75fa0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode)
 	return ret;
 }
 
-/**
- * __sync_file - generic_file_fsync without the locking and filemap_write
- * @inode: inode to sync
- * @datasync: only sync essential metadata if true
- *
- * This is just generic_file_fsync without the locking. This is needed for
- * nojournal mode to make sure this inodes data/metadata makes it to disk
- * properly. The i_mutex should be held already.
- */
-static int __sync_inode(struct inode *inode, int datasync)
-{
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = sync_inode_metadata(inode, 1);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
-
 /*
  * akpm: A new design for ext4_sync_file().
  *
@@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
-	int ret, err;
+	int ret = 0, err;
 	tid_t commit_tid;
 	bool needs_barrier = false;
 
@@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	trace_ext4_sync_file_enter(file, datasync);
 
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
-		return ret;
-	mutex_lock(&inode->i_mutex);
-
-	if (inode->i_sb->s_flags & MS_RDONLY)
-		goto out;
-
-	ret = ext4_flush_unwritten_io(inode);
-	if (ret < 0)
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			ret = -EROFS;
 		goto out;
+	}
 
 	if (!journal) {
-		ret = __sync_inode(inode, datasync);
+		ret = generic_file_fsync(file, start, end, datasync);
 		if (!ret && !hlist_empty(&inode->i_dentry))
 			ret = ext4_sync_parent(inode);
 		goto out;
 	}
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
 	/*
 	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
@@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		if (!ret)
 			ret = err;
 	}
- out:
-	mutex_unlock(&inode->i_mutex);
+out:
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
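A hedged control-flow sketch (user-space stubs, not kernel code) of the reordered ext4_sync_file() fast paths above: a read-only filesystem whose journal has aborted now fails immediately with -EROFS, and the no-journal case reuses generic_file_fsync() instead of the private __sync_inode() copy that was deleted:

#include <stdio.h>

enum { EROFS = 30 };

static int fs_readonly = 1, fs_aborted = 1, has_journal = 0;

/* assumption: stands in for the VFS helper of the same name */
static int generic_file_fsync_stub(void) { return 0; }

static int sync_file_sketch(void)
{
	if (fs_readonly)
		return fs_aborted ? -EROFS : 0;	/* nothing left to flush */
	if (!has_journal)
		return generic_file_fsync_stub();
	/* ... journalled commit paths elided ... */
	return 0;
}

int main(void)
{
	printf("ret = %d\n", sync_file_sketch());	/* -30, i.e. -EROFS */
	return 0;
}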
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00a818d67b54..8bf5999875ee 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -734,11 +734,8 @@ repeat_in_this_group:
 		ino = ext4_find_next_zero_bit((unsigned long *)
 					      inode_bitmap_bh->b_data,
 					      EXT4_INODES_PER_GROUP(sb), ino);
-		if (ino >= EXT4_INODES_PER_GROUP(sb)) {
-			if (++group == ngroups)
-				group = 0;
-			continue;
-		}
+		if (ino >= EXT4_INODES_PER_GROUP(sb))
+			goto next_group;
 		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
 			ext4_error(sb, "reserved inode found cleared - "
 				   "inode=%lu", ino + 1);
@@ -747,7 +744,8 @@ repeat_in_this_group:
 		if (!handle) {
 			BUG_ON(nblocks <= 0);
 			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
-							 handle_type, nblocks);
+							 handle_type, nblocks,
+							 0);
 			if (IS_ERR(handle)) {
 				err = PTR_ERR(handle);
 				ext4_std_error(sb, err);
@@ -768,6 +766,9 @@ repeat_in_this_group:
 			goto got; /* we grabbed the inode! */
 		if (ino < EXT4_INODES_PER_GROUP(sb))
 			goto repeat_in_this_group;
+next_group:
+		if (++group == ngroups)
+			group = 0;
 	}
 	err = -ENOSPC;
 	goto out;
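A minimal sketch (user-space, hypothetical group sizes) of the control flow introduced above: exhausting one group's inode bitmap now jumps to a shared next_group label that wraps the group counter, instead of duplicating the wrap logic inside the scan:

#include <stdio.h>

#define NGROUPS		 4
#define INODES_PER_GROUP 8

/* pretend groups 0..2 are full and group 3 has slot 5 free */
static int find_next_zero_bit_stub(int group, int from)
{
	return (group == 3 && from <= 5) ? 5 : INODES_PER_GROUP;
}

int main(void)
{
	int group = 1, scanned, ino;

	for (scanned = 0; scanned < NGROUPS; scanned++) {
		ino = find_next_zero_bit_stub(group, 0);
		if (ino >= INODES_PER_GROUP)
			goto next_group;
		printf("allocated inode %d in group %d\n", ino, group);
		return 0;
next_group:
		if (++group == NGROUPS)
			group = 0;
	}
	printf("no space\n");
	return 1;
}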
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index b8d5d351e24f..87b30cd357e7 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -624,7 +624,7 @@ cleanup:
 		partial--;
 	}
 out:
-	trace_ext4_ind_map_blocks_exit(inode, map, err);
+	trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
 	return err;
 }
 
@@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 
 retry:
 	if (rw == READ && ext4_should_dioread_nolock(inode)) {
-		if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
-			mutex_lock(&inode->i_mutex);
-			ext4_flush_unwritten_io(inode);
-			mutex_unlock(&inode->i_mutex);
-		}
 		/*
 		 * Nolock dioread optimization may be dynamically disabled
 		 * via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
-int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+/*
+ * Calculate number of indirect blocks touched by mapping @nrblocks logically
+ * contiguous blocks
+ */
+int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
 {
-	int indirects;
-
-	/* if nrblocks are contiguous */
-	if (chunk) {
-		/*
-		 * With N contiguous data blocks, we need at most
-		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
-		 * 2 dindirect blocks, and 1 tindirect block
-		 */
-		return DIV_ROUND_UP(nrblocks,
-				    EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
-	}
 	/*
-	 * if nrblocks are not contiguous, worse case, each block touch
-	 * a indirect block, and each indirect block touch a double indirect
-	 * block, plus a triple indirect block
+	 * With N contiguous data blocks, we need at most
+	 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+	 * 2 dindirect blocks, and 1 tindirect block
 	 */
-	indirects = nrblocks * 2 + 1;
-	return indirects;
+	return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 }
 
 /*
@@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 			     __le32 *last)
 {
 	__le32 *p;
-	int	flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+	int	flags = EXT4_FREE_BLOCKS_VALIDATED;
 	int	err;
 
 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA;
+		flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
+	else if (ext4_should_journal_data(inode))
+		flags |= EXT4_FREE_BLOCKS_FORGET;
 
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
 				   count)) {
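A worked example of the surviving ext4_ind_trans_blocks() formula above: with 4 KiB blocks an indirect block holds 1024 block pointers, so mapping N contiguous blocks touches at most N/1024 + 1 indirect blocks, 2 double-indirect blocks and 1 triple-indirect block (the "+ 4"):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int addr_per_block = 4096 / 4;	/* 1024 pointers per 4 KiB block */
	int nrblocks = 3000;		/* contiguous data blocks to map */

	/* DIV_ROUND_UP(3000, 1024) = 3, plus 4 for d/t-indirect reserve */
	printf("metadata blocks = %d\n",
	       DIV_ROUND_UP(nrblocks, addr_per_block) + 4);	/* 7 */
	return 0;
}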
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3e2bf873e8a8..d9ecbf1113a7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
 		entry = (struct ext4_xattr_entry *)
 			((void *)raw_inode + EXT4_I(inode)->i_inline_off);
 
-		free += le32_to_cpu(entry->e_value_size);
+		free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
 		goto out;
 	}
 
@@ -1404,16 +1404,15 @@ out:
  * offset as if '.' and '..' really take place.
  *
  */
-int ext4_read_inline_dir(struct file *filp,
-			 void *dirent, filldir_t filldir,
+int ext4_read_inline_dir(struct file *file,
+			 struct dir_context *ctx,
 			 int *has_inline_data)
 {
-	int error = 0;
 	unsigned int offset, parent_ino;
-	int i, stored;
+	int i;
 	struct ext4_dir_entry_2 *de;
 	struct super_block *sb;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	int ret, inline_size = 0;
 	struct ext4_iloc iloc;
 	void *dir_buf = NULL;
@@ -1444,9 +1443,8 @@ int ext4_read_inline_dir(struct file *filp,
 		goto out;
 
 	sb = inode->i_sb;
-	stored = 0;
 	parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
-	offset = filp->f_pos;
+	offset = ctx->pos;
 
 	/*
 	 * dotdot_offset and dotdot_size is the real offset and
@@ -1460,104 +1458,74 @@ int ext4_read_inline_dir(struct file *filp,
 	extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
 	extra_size = extra_offset + inline_size;
 
-	while (!error && !stored && filp->f_pos < extra_size) {
-revalidate:
-		/*
-		 * If the version has changed since the last call to
-		 * readdir(2), then we might be pointing to an invalid
-		 * dirent right now. Scan from the start of the inline
-		 * dir to make sure.
-		 */
-		if (filp->f_version != inode->i_version) {
-			for (i = 0; i < extra_size && i < offset;) {
-				/*
-				 * "." is with offset 0 and
-				 * ".." is dotdot_offset.
-				 */
-				if (!i) {
-					i = dotdot_offset;
-					continue;
-				} else if (i == dotdot_offset) {
-					i = dotdot_size;
-					continue;
-				}
-				/* for other entry, the real offset in
-				 * the buf has to be tuned accordingly.
-				 */
-				de = (struct ext4_dir_entry_2 *)
-					(dir_buf + i - extra_offset);
-				/* It's too expensive to do a full
-				 * dirent test each time round this
-				 * loop, but we do have to test at
-				 * least that it is non-zero. A
-				 * failure will be detected in the
-				 * dirent test below. */
-				if (ext4_rec_len_from_disk(de->rec_len,
-					extra_size) < EXT4_DIR_REC_LEN(1))
-					break;
-				i += ext4_rec_len_from_disk(de->rec_len,
-							    extra_size);
-			}
-			offset = i;
-			filp->f_pos = offset;
-			filp->f_version = inode->i_version;
-		}
-
-		while (!error && filp->f_pos < extra_size) {
-			if (filp->f_pos == 0) {
-				error = filldir(dirent, ".", 1, 0, inode->i_ino,
-						DT_DIR);
-				if (error)
-					break;
-				stored++;
-				filp->f_pos = dotdot_offset;
+	/*
+	 * If the version has changed since the last call to
+	 * readdir(2), then we might be pointing to an invalid
+	 * dirent right now. Scan from the start of the inline
+	 * dir to make sure.
+	 */
+	if (file->f_version != inode->i_version) {
+		for (i = 0; i < extra_size && i < offset;) {
+			/*
+			 * "." is with offset 0 and
+			 * ".." is dotdot_offset.
+			 */
+			if (!i) {
+				i = dotdot_offset;
+				continue;
+			} else if (i == dotdot_offset) {
+				i = dotdot_size;
 				continue;
 			}
+			/* for other entry, the real offset in
+			 * the buf has to be tuned accordingly.
+			 */
+			de = (struct ext4_dir_entry_2 *)
+				(dir_buf + i - extra_offset);
+			/* It's too expensive to do a full
+			 * dirent test each time round this
+			 * loop, but we do have to test at
+			 * least that it is non-zero. A
+			 * failure will be detected in the
+			 * dirent test below. */
+			if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+				< EXT4_DIR_REC_LEN(1))
+				break;
+			i += ext4_rec_len_from_disk(de->rec_len,
+						    extra_size);
+		}
+		offset = i;
+		ctx->pos = offset;
+		file->f_version = inode->i_version;
+	}
 
-		if (filp->f_pos == dotdot_offset) {
-			error = filldir(dirent, "..", 2,
-					dotdot_offset,
-					parent_ino, DT_DIR);
-			if (error)
-				break;
-			stored++;
+	while (ctx->pos < extra_size) {
+		if (ctx->pos == 0) {
+			if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
+				goto out;
+			ctx->pos = dotdot_offset;
+			continue;
+		}
 
-			filp->f_pos = dotdot_size;
-			continue;
-		}
+		if (ctx->pos == dotdot_offset) {
+			if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
+				goto out;
+			ctx->pos = dotdot_size;
+			continue;
+		}
 
-			de = (struct ext4_dir_entry_2 *)
-				(dir_buf + filp->f_pos - extra_offset);
-			if (ext4_check_dir_entry(inode, filp, de,
-						 iloc.bh, dir_buf,
-						 extra_size, filp->f_pos)) {
-				ret = stored;
+		de = (struct ext4_dir_entry_2 *)
+			(dir_buf + ctx->pos - extra_offset);
+		if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
+					 extra_size, ctx->pos))
+			goto out;
+		if (le32_to_cpu(de->inode)) {
+			if (!dir_emit(ctx, de->name, de->name_len,
+				      le32_to_cpu(de->inode),
+				      get_dtype(sb, de->file_type)))
 				goto out;
-			}
-			if (le32_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out. So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				u64 version = filp->f_version;
-
-				error = filldir(dirent, de->name,
-						de->name_len,
-						filp->f_pos,
-						le32_to_cpu(de->inode),
-						get_dtype(sb, de->file_type));
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored++;
-			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
-							      extra_size);
 		}
+		ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
 	}
 out:
 	kfree(dir_buf);
@@ -1842,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode,
 	if (error)
 		goto out;
 
-	physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+	physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
 	physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
 	physical += offsetof(struct ext4_inode, i_block);
 	length = i_size_read(inode);
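The readdir conversion above replaces filldir callbacks plus error/stored bookkeeping with dir_emit(), which returns false when the user buffer is full. A hedged user-space sketch of the resulting loop shape (dir_emit_stub and the fixed name table are illustrative assumptions, not kernel API):

#include <stdio.h>
#include <stdbool.h>

struct dir_context { long pos; };

static bool dir_emit_stub(struct dir_context *ctx, const char *name)
{
	printf("pos %ld: %s\n", ctx->pos, name);
	return true;			/* false would mean "buffer full, stop" */
}

int main(void)
{
	struct dir_context ctx = { .pos = 0 };
	const char *names[] = { ".", "..", "foo" };

	while (ctx.pos < 3) {
		if (!dir_emit_stub(&ctx, names[ctx.pos]))
			return 0;	/* a later call resumes from ctx.pos */
		ctx.pos++;
	}
	return 0;
}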
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6382b89ecbd..dd32a2eacd0d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 						   new_size);
 }
 
-static void ext4_invalidatepage(struct page *page, unsigned long offset);
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length);
 static int __ext4_journalled_writepage(struct page *page, unsigned int len);
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
-static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
-		struct inode *inode, struct page *page, loff_t from,
-		loff_t length, int flags);
+static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+				  int pextents);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode)
 			filemap_write_and_wait(&inode->i_data);
 		}
 		truncate_inode_pages(&inode->i_data, 0);
-		ext4_ioend_shutdown(inode);
+
+		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 		goto no_delete;
 	}
 
@@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode)
 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
 	truncate_inode_pages(&inode->i_data, 0);
-	ext4_ioend_shutdown(inode);
 
+	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 	if (is_bad_inode(inode))
 		goto no_delete;
 
@@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func,
 #define check_block_validity(inode, map)	\
 	__check_block_validity((inode), __func__, __LINE__, (map))
 
-/*
- * Return the number of contiguous dirty pages in a given inode
- * starting at page frame idx.
- */
-static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
-				    unsigned int max_pages)
-{
-	struct address_space *mapping = inode->i_mapping;
-	pgoff_t index;
-	struct pagevec pvec;
-	pgoff_t num = 0;
-	int i, nr_pages, done = 0;
-
-	if (max_pages == 0)
-		return 0;
-	pagevec_init(&pvec, 0);
-	while (!done) {
-		index = idx;
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      (pgoff_t)PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-			struct buffer_head *bh, *head;
-
-			lock_page(page);
-			if (unlikely(page->mapping != mapping) ||
-			    !PageDirty(page) ||
-			    PageWriteback(page) ||
-			    page->index != idx) {
-				done = 1;
-				unlock_page(page);
-				break;
-			}
-			if (page_has_buffers(page)) {
-				bh = head = page_buffers(page);
-				do {
-					if (!buffer_delay(bh) &&
-					    !buffer_unwritten(bh))
-						done = 1;
-					bh = bh->b_this_page;
-				} while (!done && (bh != head));
-			}
-			unlock_page(page);
-			if (done)
-				break;
-			idx++;
-			num++;
-			if (num >= max_pages) {
-				done = 1;
-				break;
-			}
-		}
-		pagevec_release(&pvec);
-	}
-	return num;
-}
-
 #ifdef ES_AGGRESSIVE_TEST
 static void ext4_map_blocks_es_recheck(handle_t *handle,
 				       struct inode *inode,
@@ -524,7 +465,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
 	if (es_map->m_lblk != map->m_lblk ||
 	    es_map->m_flags != map->m_flags ||
 	    es_map->m_pblk != map->m_pblk) {
-		printk("ES cache assertation failed for inode: %lu "
+		printk("ES cache assertion failed for inode: %lu "
 		       "es_cached ex [%d/%d/%llu/%x] != "
 		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
 		       inode->i_ino, es_map->m_lblk, es_map->m_len,
@@ -575,6 +516,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
575 516
576 /* Lookup extent status tree firstly */ 517 /* Lookup extent status tree firstly */
577 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 518 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
519 ext4_es_lru_add(inode);
578 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 520 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
579 map->m_pblk = ext4_es_pblock(&es) + 521 map->m_pblk = ext4_es_pblock(&es) +
580 map->m_lblk - es.es_lblk; 522 map->m_lblk - es.es_lblk;
@@ -613,14 +555,13 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		int ret;
 		unsigned long long status;
 
-#ifdef ES_AGGRESSIVE_TEST
-		if (retval != map->m_len) {
-			printk("ES len assertation failed for inode: %lu "
-			       "retval %d != map->m_len %d "
-			       "in %s (lookup)\n", inode->i_ino, retval,
-			       map->m_len, __func__);
+		if (unlikely(retval != map->m_len)) {
+			ext4_warning(inode->i_sb,
+				     "ES len assertion failed for inode "
+				     "%lu: retval %d != map->m_len %d",
+				     inode->i_ino, retval, map->m_len);
+			WARN_ON(1);
 		}
-#endif
 
 		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -714,14 +655,13 @@ found:
 		int ret;
 		unsigned long long status;
 
-#ifdef ES_AGGRESSIVE_TEST
-		if (retval != map->m_len) {
-			printk("ES len assertation failed for inode: %lu "
-			       "retval %d != map->m_len %d "
-			       "in %s (allocation)\n", inode->i_ino, retval,
-			       map->m_len, __func__);
+		if (unlikely(retval != map->m_len)) {
+			ext4_warning(inode->i_sb,
+				     "ES len assertion failed for inode "
+				     "%lu: retval %d != map->m_len %d",
+				     inode->i_ino, retval, map->m_len);
+			WARN_ON(1);
 		}
-#endif
 
 		/*
 		 * If the extent has been zeroed out, we don't need to update
@@ -1118,10 +1058,13 @@ static int ext4_write_end(struct file *file,
 		}
 	}
 
-	if (ext4_has_inline_data(inode))
-		copied = ext4_write_inline_data_end(inode, pos, len,
-						    copied, page);
-	else
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_write_inline_data_end(inode, pos, len,
+						 copied, page);
+		if (ret < 0)
+			goto errout;
+		copied = ret;
+	} else
 		copied = block_write_end(file, mapping, pos,
 					 len, copied, page, fsdata);
 
@@ -1157,8 +1100,6 @@ static int ext4_write_end(struct file *file,
 	if (i_size_changed)
 		ext4_mark_inode_dirty(handle, inode);
 
-	if (copied < 0)
-		ret = copied;
 	if (pos + len > inode->i_size && ext4_can_truncate(inode))
 		/* if we have allocated more blocks and copied
 		 * less. We will have blocks allocated outside
@@ -1415,21 +1356,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
-					     unsigned long offset)
+					     unsigned int offset,
+					     unsigned int length)
 {
 	int to_release = 0;
 	struct buffer_head *head, *bh;
 	unsigned int curr_off = 0;
 	struct inode *inode = page->mapping->host;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned int stop = offset + length;
 	int num_clusters;
 	ext4_fsblk_t lblk;
 
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	head = page_buffers(page);
 	bh = head;
 	do {
 		unsigned int next_off = curr_off + bh->b_size;
 
+		if (next_off > stop)
+			break;
+
 		if ((offset <= curr_off) && (buffer_delay(bh))) {
 			to_release++;
 			clear_buffer_delay(bh);
@@ -1460,140 +1408,43 @@ static void ext4_da_page_release_reservation(struct page *page,
  * Delayed allocation stuff
  */
 
-/*
- * mpage_da_submit_io - walks through extent of pages and try to write
- * them with writepage() call back
- *
- * @mpd->inode: inode
- * @mpd->first_page: first page of the extent
- * @mpd->next_page: page after the last page of the extent
- *
- * By the time mpage_da_submit_io() is called we expect all blocks
- * to be allocated. this may be wrong if allocation failed.
- *
- * As pages are already locked by write_cache_pages(), we can't use it
- */
-static int mpage_da_submit_io(struct mpage_da_data *mpd,
-			      struct ext4_map_blocks *map)
-{
-	struct pagevec pvec;
-	unsigned long index, end;
-	int ret = 0, err, nr_pages, i;
-	struct inode *inode = mpd->inode;
-	struct address_space *mapping = inode->i_mapping;
-	loff_t size = i_size_read(inode);
-	unsigned int len, block_start;
-	struct buffer_head *bh, *page_bufs = NULL;
-	sector_t pblock = 0, cur_logical = 0;
-	struct ext4_io_submit io_submit;
+struct mpage_da_data {
+	struct inode *inode;
+	struct writeback_control *wbc;
 
-	BUG_ON(mpd->next_page <= mpd->first_page);
-	memset(&io_submit, 0, sizeof(io_submit));
+	pgoff_t first_page;	/* The first page to write */
+	pgoff_t next_page;	/* Current page to examine */
+	pgoff_t last_page;	/* Last page to examine */
 	/*
-	 * We need to start from the first_page to the next_page - 1
-	 * to make sure we also write the mapped dirty buffer_heads.
-	 * If we look at mpd->b_blocknr we would only be looking
-	 * at the currently mapped buffer_heads.
+	 * Extent to map - this can be after first_page because that can be
+	 * fully mapped. We somewhat abuse m_flags to store whether the extent
+	 * is delalloc or unwritten.
 	 */
-	index = mpd->first_page;
-	end = mpd->next_page - 1;
-
-	pagevec_init(&pvec, 0);
-	while (index <= end) {
-		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
-		if (nr_pages == 0)
-			break;
-		for (i = 0; i < nr_pages; i++) {
-			int skip_page = 0;
-			struct page *page = pvec.pages[i];
-
-			index = page->index;
-			if (index > end)
-				break;
-
-			if (index == size >> PAGE_CACHE_SHIFT)
-				len = size & ~PAGE_CACHE_MASK;
-			else
-				len = PAGE_CACHE_SIZE;
-			if (map) {
-				cur_logical = index << (PAGE_CACHE_SHIFT -
-							inode->i_blkbits);
-				pblock = map->m_pblk + (cur_logical -
-							map->m_lblk);
-			}
-			index++;
-
-			BUG_ON(!PageLocked(page));
-			BUG_ON(PageWriteback(page));
-
-			bh = page_bufs = page_buffers(page);
-			block_start = 0;
-			do {
-				if (map && (cur_logical >= map->m_lblk) &&
-				    (cur_logical <= (map->m_lblk +
-						     (map->m_len - 1)))) {
-					if (buffer_delay(bh)) {
-						clear_buffer_delay(bh);
-						bh->b_blocknr = pblock;
-					}
-					if (buffer_unwritten(bh) ||
-					    buffer_mapped(bh))
-						BUG_ON(bh->b_blocknr != pblock);
-					if (map->m_flags & EXT4_MAP_UNINIT)
-						set_buffer_uninit(bh);
-					clear_buffer_unwritten(bh);
-				}
-
-				/*
-				 * skip page if block allocation undone and
-				 * block is dirty
-				 */
-				if (ext4_bh_delay_or_unwritten(NULL, bh))
-					skip_page = 1;
-				bh = bh->b_this_page;
-				block_start += bh->b_size;
-				cur_logical++;
-				pblock++;
-			} while (bh != page_bufs);
-
-			if (skip_page) {
-				unlock_page(page);
-				continue;
-			}
-
-			clear_page_dirty_for_io(page);
-			err = ext4_bio_write_page(&io_submit, page, len,
-						  mpd->wbc);
-			if (!err)
-				mpd->pages_written++;
-			/*
-			 * In error case, we have to continue because
-			 * remaining pages are still locked
-			 */
-			if (ret == 0)
-				ret = err;
-		}
-		pagevec_release(&pvec);
-	}
-	ext4_io_submit(&io_submit);
-	return ret;
-}
+	struct ext4_map_blocks map;
+	struct ext4_io_submit io_submit;	/* IO submission data */
+};
 
-static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
+static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+				       bool invalidate)
 {
 	int nr_pages, i;
 	pgoff_t index, end;
 	struct pagevec pvec;
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
-	ext4_lblk_t start, last;
+
+	/* This is necessary when next_page == 0. */
+	if (mpd->first_page >= mpd->next_page)
+		return;
 
 	index = mpd->first_page;
 	end = mpd->next_page - 1;
-
-	start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	ext4_es_remove_extent(inode, start, last - start + 1);
+	if (invalidate) {
+		ext4_lblk_t start, last;
+		start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		ext4_es_remove_extent(inode, start, last - start + 1);
+	}
 
 	pagevec_init(&pvec, 0);
 	while (index <= end) {
@@ -1606,14 +1457,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
 				break;
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
-			block_invalidatepage(page, 0);
-			ClearPageUptodate(page);
+			if (invalidate) {
+				block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+				ClearPageUptodate(page);
+			}
 			unlock_page(page);
 		}
 		index = pvec.pages[nr_pages - 1]->index + 1;
 		pagevec_release(&pvec);
 	}
-	return;
 }
 
 static void ext4_print_free_blocks(struct inode *inode)
@@ -1642,215 +1494,6 @@ static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
-/*
- * mpage_da_map_and_submit - go through given space, map them
- * if necessary, and then submit them for I/O
- *
- * @mpd - bh describing space
- *
- * The function skips space we know is already mapped to disk blocks.
- *
- */
-static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
-{
-	int err, blks, get_blocks_flags;
-	struct ext4_map_blocks map, *mapp = NULL;
-	sector_t next = mpd->b_blocknr;
-	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
-	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
-	handle_t *handle = NULL;
-
-	/*
-	 * If the blocks are mapped already, or we couldn't accumulate
-	 * any blocks, then proceed immediately to the submission stage.
-	 */
-	if ((mpd->b_size == 0) ||
-	    ((mpd->b_state & (1 << BH_Mapped)) &&
-	     !(mpd->b_state & (1 << BH_Delay)) &&
-	     !(mpd->b_state & (1 << BH_Unwritten))))
-		goto submit_io;
-
-	handle = ext4_journal_current_handle();
-	BUG_ON(!handle);
-
-	/*
-	 * Call ext4_map_blocks() to allocate any delayed allocation
-	 * blocks, or to convert an uninitialized extent to be
-	 * initialized (in the case where we have written into
-	 * one or more preallocated blocks).
-	 *
-	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
-	 * indicate that we are on the delayed allocation path. This
-	 * affects functions in many different parts of the allocation
-	 * call path. This flag exists primarily because we don't
-	 * want to change *many* call functions, so ext4_map_blocks()
-	 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
-	 * inode's allocation semaphore is taken.
-	 *
-	 * If the blocks in questions were delalloc blocks, set
-	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
-	 * variables are updated after the blocks have been allocated.
-	 */
-	map.m_lblk = next;
-	map.m_len = max_blocks;
-	/*
-	 * We're in delalloc path and it is possible that we're going to
-	 * need more metadata blocks than previously reserved. However
-	 * we must not fail because we're in writeback and there is
-	 * nothing we can do about it so it might result in data loss.
-	 * So use reserved blocks to allocate metadata if possible.
-	 */
-	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
-			   EXT4_GET_BLOCKS_METADATA_NOFAIL;
-	if (ext4_should_dioread_nolock(mpd->inode))
-		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
-	if (mpd->b_state & (1 << BH_Delay))
-		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-
-
-	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
-	if (blks < 0) {
-		struct super_block *sb = mpd->inode->i_sb;
-
-		err = blks;
-		/*
-		 * If get block returns EAGAIN or ENOSPC and there
-		 * appears to be free blocks we will just let
-		 * mpage_da_submit_io() unlock all of the pages.
-		 */
-		if (err == -EAGAIN)
-			goto submit_io;
-
-		if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
-			mpd->retval = err;
-			goto submit_io;
-		}
-
-		/*
-		 * get block failure will cause us to loop in
-		 * writepages, because a_ops->writepage won't be able
-		 * to make progress. The page will be redirtied by
-		 * writepage and writepages will again try to write
-		 * the same.
-		 */
-		if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
-			ext4_msg(sb, KERN_CRIT,
-				 "delayed block allocation failed for inode %lu "
-				 "at logical offset %llu with max blocks %zd "
-				 "with error %d", mpd->inode->i_ino,
-				 (unsigned long long) next,
-				 mpd->b_size >> mpd->inode->i_blkbits, err);
-			ext4_msg(sb, KERN_CRIT,
-				 "This should not happen!! Data will be lost");
-			if (err == -ENOSPC)
-				ext4_print_free_blocks(mpd->inode);
-		}
-		/* invalidate all the pages */
-		ext4_da_block_invalidatepages(mpd);
-
-		/* Mark this page range as having been completed */
-		mpd->io_done = 1;
-		return;
-	}
-	BUG_ON(blks == 0);
-
-	mapp = &map;
-	if (map.m_flags & EXT4_MAP_NEW) {
-		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
-		int i;
-
-		for (i = 0; i < map.m_len; i++)
-			unmap_underlying_metadata(bdev, map.m_pblk + i);
-	}
-
-	/*
-	 * Update on-disk size along with block allocation.
-	 */
-	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
-	if (disksize > i_size_read(mpd->inode))
-		disksize = i_size_read(mpd->inode);
-	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
-		ext4_update_i_disksize(mpd->inode, disksize);
-		err = ext4_mark_inode_dirty(handle, mpd->inode);
-		if (err)
-			ext4_error(mpd->inode->i_sb,
-				   "Failed to mark inode %lu dirty",
-				   mpd->inode->i_ino);
-	}
-
-submit_io:
-	mpage_da_submit_io(mpd, mapp);
-	mpd->io_done = 1;
-}
-
-#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
-		  (1 << BH_Delay) | (1 << BH_Unwritten))
-
-/*
- * mpage_add_bh_to_extent - try to add one more block to extent of blocks
- *
- * @mpd->lbh - extent of blocks
- * @logical - logical number of the block in the file
- * @b_state - b_state of the buffer head added
- *
- * the function is used to collect contig. blocks in same state
- */
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
-				   unsigned long b_state)
-{
-	sector_t next;
-	int blkbits = mpd->inode->i_blkbits;
-	int nrblocks = mpd->b_size >> blkbits;
-
-	/*
-	 * XXX Don't go larger than mballoc is willing to allocate
-	 * This is a stopgap solution. We eventually need to fold
-	 * mpage_da_submit_io() into this function and then call
-	 * ext4_map_blocks() multiple times in a loop
-	 */
-	if (nrblocks >= (8*1024*1024 >> blkbits))
-		goto flush_it;
-
-	/* check if the reserved journal credits might overflow */
-	if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
-		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
-			/*
-			 * With non-extent format we are limited by the journal
-			 * credit available. Total credit needed to insert
-			 * nrblocks contiguous blocks is dependent on the
-			 * nrblocks. So limit nrblocks.
-			 */
-			goto flush_it;
-		}
-	}
-	/*
-	 * First block in the extent
-	 */
-	if (mpd->b_size == 0) {
-		mpd->b_blocknr = logical;
-		mpd->b_size = 1 << blkbits;
-		mpd->b_state = b_state & BH_FLAGS;
-		return;
-	}
-
-	next = mpd->b_blocknr + nrblocks;
-	/*
-	 * Can we merge the block to our big extent?
-	 */
-	if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
-		mpd->b_size += 1 << blkbits;
-		return;
-	}
-
-flush_it:
-	/*
-	 * We couldn't merge the block to our extent, so we
-	 * need to flush current extent and start new one
-	 */
-	mpage_da_map_and_submit(mpd);
-	return;
-}
-
 static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
 {
 	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
@@ -1885,7 +1528,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
 
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, iblock, &es)) {
-
+		ext4_es_lru_add(inode);
 		if (ext4_es_is_hole(&es)) {
 			retval = 0;
 			down_read((&EXT4_I(inode)->i_data_sem));
@@ -1992,14 +1635,13 @@ add_delayed:
 		int ret;
 		unsigned long long status;
 
-#ifdef ES_AGGRESSIVE_TEST
-		if (retval != map->m_len) {
-			printk("ES len assertation failed for inode: %lu "
-			       "retval %d != map->m_len %d "
-			       "in %s (lookup)\n", inode->i_ino, retval,
-			       map->m_len, __func__);
+		if (unlikely(retval != map->m_len)) {
+			ext4_warning(inode->i_sb,
+				     "ES len assertion failed for inode "
+				     "%lu: retval %d != map->m_len %d",
+				     inode->i_ino, retval, map->m_len);
+			WARN_ON(1);
 		}
-#endif
 
 		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -2156,7 +1798,7 @@ out:
2156 * lock so we have to do some magic. 1798 * lock so we have to do some magic.
2157 * 1799 *
2158 * This function can get called via... 1800 * This function can get called via...
2159 * - ext4_da_writepages after taking page lock (have journal handle) 1801 * - ext4_writepages after taking page lock (have journal handle)
2160 * - journal_submit_inode_data_buffers (no journal handle) 1802 * - journal_submit_inode_data_buffers (no journal handle)
2161 * - shrink_page_list via the kswapd/direct reclaim (no journal handle) 1803 * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
2162 * - grab_page_cache when doing write_begin (have journal handle) 1804 * - grab_page_cache when doing write_begin (have journal handle)
@@ -2234,76 +1876,405 @@ static int ext4_writepage(struct page *page,
2234 */ 1876 */
2235 return __ext4_journalled_writepage(page, len); 1877 return __ext4_journalled_writepage(page, len);
2236 1878
2237 memset(&io_submit, 0, sizeof(io_submit)); 1879 ext4_io_submit_init(&io_submit, wbc);
1880 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
1881 if (!io_submit.io_end) {
1882 redirty_page_for_writepage(wbc, page);
1883 unlock_page(page);
1884 return -ENOMEM;
1885 }
2238 ret = ext4_bio_write_page(&io_submit, page, len, wbc); 1886 ret = ext4_bio_write_page(&io_submit, page, len, wbc);
2239 ext4_io_submit(&io_submit); 1887 ext4_io_submit(&io_submit);
1888 /* Drop io_end reference we got from init */
1889 ext4_put_io_end_defer(io_submit.io_end);
2240 return ret; 1890 return ret;
2241} 1891}
2242 1892
1893#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
1894
2243/* 1895/*
2244 * This is called via ext4_da_writepages() to 1896 * mballoc gives us at most this number of blocks...
2245 * calculate the total number of credits to reserve to fit 1897 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
2246 * a single extent allocation into a single transaction, 1898 * The rest of mballoc seems to handle chunks up to full group size.
2247 * ext4_da_writepages() will loop calling this before
2248 * the block allocation.
2249 */ 1899 */
1900#define MAX_WRITEPAGES_EXTENT_LEN 2048
2250 1901
2251static int ext4_da_writepages_trans_blocks(struct inode *inode) 1902/*
1903 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
1904 *
1905 * @mpd - extent of blocks
1906 * @lblk - logical number of the block in the file
1907 * @b_state - b_state of the buffer head added
1908 *
1909 * the function is used to collect contiguous blocks in the same state
1910 */
1911static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
1912 unsigned long b_state)
2252{ 1913{
2253 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1914 struct ext4_map_blocks *map = &mpd->map;
1915
1916 /* Don't go larger than mballoc is willing to allocate */
1917 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
1918 return 0;
1919
1920 /* First block in the extent? */
1921 if (map->m_len == 0) {
1922 map->m_lblk = lblk;
1923 map->m_len = 1;
1924 map->m_flags = b_state & BH_FLAGS;
1925 return 1;
1926 }
1927
1928 /* Can we merge the block to our big extent? */
1929 if (lblk == map->m_lblk + map->m_len &&
1930 (b_state & BH_FLAGS) == map->m_flags) {
1931 map->m_len++;
1932 return 1;
1933 }
1934 return 0;
1935}
2254 1936
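The merge rule above is worth pinning down: a buffer joins the pending extent only when it is logically adjacent and carries the same state bits; otherwise the caller must flush what it has. A minimal userspace sketch of that rule, with hypothetical names standing in for struct ext4_map_blocks:

    /* Hypothetical stand-in for the mapping state kept in mpd->map. */
    struct pending_extent {
        unsigned long lblk;   /* logical block of the first member */
        unsigned int  len;    /* number of blocks; 0 means empty */
        unsigned long state;  /* shared buffer-state bits (delay/unwritten) */
    };

    /* Returns 1 if the block was absorbed, 0 if the caller must map/flush. */
    static int try_add_block(struct pending_extent *pe,
                             unsigned long lblk, unsigned long state)
    {
        if (pe->len == 0) {               /* first block starts the extent */
            pe->lblk = lblk;
            pe->len = 1;
            pe->state = state;
            return 1;
        }
        if (lblk == pe->lblk + pe->len && state == pe->state) {
            pe->len++;                    /* adjacent, same state: merge */
            return 1;
        }
        return 0;                         /* gap or state change: flush */
    }

The kernel version additionally refuses to grow past MAX_WRITEPAGES_EXTENT_LEN, so the extent stays within what mballoc will allocate in one request.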
1937static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
1938 struct buffer_head *head,
1939 struct buffer_head *bh,
1940 ext4_lblk_t lblk)
1941{
1942 struct inode *inode = mpd->inode;
1943 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
1944 >> inode->i_blkbits;
1945
1946 do {
1947 BUG_ON(buffer_locked(bh));
1948
1949 if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
1950 (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
1951 lblk >= blocks) {
1952 /* Found extent to map? */
1953 if (mpd->map.m_len)
1954 return false;
1955 if (lblk >= blocks)
1956 return true;
1957 continue;
1958 }
1959 if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
1960 return false;
1961 } while (lblk++, (bh = bh->b_this_page) != head);
1962 return true;
1963}
1964
1965static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
1966{
1967 int len;
1968 loff_t size = i_size_read(mpd->inode);
1969 int err;
1970
1971 BUG_ON(page->index != mpd->first_page);
1972 if (page->index == size >> PAGE_CACHE_SHIFT)
1973 len = size & ~PAGE_CACHE_MASK;
1974 else
1975 len = PAGE_CACHE_SIZE;
1976 clear_page_dirty_for_io(page);
1977 err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
1978 if (!err)
1979 mpd->wbc->nr_to_write--;
1980 mpd->first_page++;
1981
1982 return err;
1983}
1984
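The len computation in mpage_submit_page() only differs from a full page for the page straddling i_size. A small sketch of that arithmetic, assuming 4 KiB pages (the macro values here are illustrative, not the kernel's):

    #include <stdio.h>

    #define PG_SHIFT 12
    #define PG_SIZE  (1UL << PG_SHIFT)

    /* Bytes of page 'index' that lie below 'size' and need writing. */
    static unsigned long writeback_len(unsigned long index,
                                       unsigned long long size)
    {
        if (index == size >> PG_SHIFT)
            return size & (PG_SIZE - 1);  /* last, partially used page */
        return PG_SIZE;                   /* interior page: write it all */
    }

    int main(void)
    {
        /* A 10000-byte file: pages 0-1 are full, page 2 holds 1808 bytes. */
        printf("%lu %lu %lu\n", writeback_len(0, 10000),
               writeback_len(1, 10000), writeback_len(2, 10000));
        return 0;
    }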
1985/*
1986 * mpage_map_buffers - update buffers corresponding to changed extent and
1987 * submit fully mapped pages for IO
1988 *
1989 * @mpd - description of extent to map, on return next extent to map
1990 *
1991 * Scan buffers corresponding to changed extent (we expect corresponding pages
1992 * to be already locked) and update buffer state according to new extent state.
1993 * We map delalloc buffers to their physical location, clear unwritten bits,
1994 * and mark buffers as uninit when we perform writes to uninitialized extents
1995 * and do extent conversion after IO is finished. If the last page is not fully
1996 * mapped, we update @map to the next extent in the last page that needs
1997 * mapping. Otherwise we submit the page for IO.
1998 */
1999static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2000{
2001 struct pagevec pvec;
2002 int nr_pages, i;
2003 struct inode *inode = mpd->inode;
2004 struct buffer_head *head, *bh;
2005 int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
2006 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
2007 >> inode->i_blkbits;
2008 pgoff_t start, end;
2009 ext4_lblk_t lblk;
2010 sector_t pblock;
2011 int err;
2012
2013 start = mpd->map.m_lblk >> bpp_bits;
2014 end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
2015 lblk = start << bpp_bits;
2016 pblock = mpd->map.m_pblk;
2017
2018 pagevec_init(&pvec, 0);
2019 while (start <= end) {
2020 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
2021 PAGEVEC_SIZE);
2022 if (nr_pages == 0)
2023 break;
2024 for (i = 0; i < nr_pages; i++) {
2025 struct page *page = pvec.pages[i];
2026
2027 if (page->index > end)
2028 break;
2029 /* Up to 'end' pages must be contiguous */
2030 BUG_ON(page->index != start);
2031 bh = head = page_buffers(page);
2032 do {
2033 if (lblk < mpd->map.m_lblk)
2034 continue;
2035 if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
2036 /*
2037 * Buffer after end of mapped extent.
2038 * Find next buffer in the page to map.
2039 */
2040 mpd->map.m_len = 0;
2041 mpd->map.m_flags = 0;
2042 add_page_bufs_to_extent(mpd, head, bh,
2043 lblk);
2044 pagevec_release(&pvec);
2045 return 0;
2046 }
2047 if (buffer_delay(bh)) {
2048 clear_buffer_delay(bh);
2049 bh->b_blocknr = pblock++;
2050 }
2051 clear_buffer_unwritten(bh);
2052 } while (++lblk < blocks &&
2053 (bh = bh->b_this_page) != head);
2054
2055 /*
2056 * FIXME: This is going to break if dioread_nolock
2057 * supports blocksize < pagesize as we will try to
2058 * convert potentially unmapped parts of inode.
2059 */
2060 mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
2061 /* Page fully mapped - let IO run! */
2062 err = mpage_submit_page(mpd, page);
2063 if (err < 0) {
2064 pagevec_release(&pvec);
2065 return err;
2066 }
2067 start++;
2068 }
2069 pagevec_release(&pvec);
2070 }
2071 /* Extent fully mapped and matches with page boundary. We are done. */
2072 mpd->map.m_len = 0;
2073 mpd->map.m_flags = 0;
2074 return 0;
2075}
2076
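The start/end page indices above come from plain shifts between block and page units. A sketch of the conversion, assuming blocksize <= pagesize with power-of-two sizes (names illustrative; bpp_bits is log2 of blocks per page):

    /* Map a block extent [m_lblk, m_lblk + m_len) to the pages covering it. */
    static void extent_to_pages(unsigned long m_lblk, unsigned int m_len,
                                int bpp_bits,
                                unsigned long *first_pg, unsigned long *last_pg)
    {
        *first_pg = m_lblk >> bpp_bits;               /* page of first block */
        *last_pg  = (m_lblk + m_len - 1) >> bpp_bits; /* page of last block */
    }

For 4 KiB pages over 1 KiB blocks (bpp_bits = 2), an extent of blocks 5..9 spans pages 1..2, which is exactly the range the pagevec loop walks.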
2077static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
2078{
2079 struct inode *inode = mpd->inode;
2080 struct ext4_map_blocks *map = &mpd->map;
2081 int get_blocks_flags;
2082 int err;
2083
2084 trace_ext4_da_write_pages_extent(inode, map);
2255 /* 2085 /*
2256 * With non-extent format the journal credit needed to 2086 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
2257 * insert nrblocks contiguous blocks is dependent on 2087 * to convert an uninitialized extent to be initialized (in the case
2258 * the number of contiguous blocks. So we will limit 2088 * where we have written into one or more preallocated blocks). It is
2259 * the number of contiguous blocks to a sane value 2089 * possible that we're going to need more metadata blocks than
2090 * previously reserved. However we must not fail because we're in
2091 * writeback and there is nothing we can do about it so it might result
2092 * in data loss. So use reserved blocks to allocate metadata if
2093 * possible.
2094 *
2095 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
2096 * in question are delalloc blocks. This affects functions in many
2097 * different parts of the allocation call path. This flag exists
2098 * primarily because we don't want to change *many* call functions, so
2099 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
2100 * once the inode's allocation semaphore is taken.
2260 */ 2101 */
2261 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && 2102 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
2262 (max_blocks > EXT4_MAX_TRANS_DATA)) 2103 EXT4_GET_BLOCKS_METADATA_NOFAIL;
2263 max_blocks = EXT4_MAX_TRANS_DATA; 2104 if (ext4_should_dioread_nolock(inode))
2105 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2106 if (map->m_flags & (1 << BH_Delay))
2107 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2264 2108
2265 return ext4_chunk_trans_blocks(inode, max_blocks); 2109 err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
2110 if (err < 0)
2111 return err;
2112 if (map->m_flags & EXT4_MAP_UNINIT) {
2113 if (!mpd->io_submit.io_end->handle &&
2114 ext4_handle_valid(handle)) {
2115 mpd->io_submit.io_end->handle = handle->h_rsv_handle;
2116 handle->h_rsv_handle = NULL;
2117 }
2118 ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
2119 }
2120
2121 BUG_ON(map->m_len == 0);
2122 if (map->m_flags & EXT4_MAP_NEW) {
2123 struct block_device *bdev = inode->i_sb->s_bdev;
2124 int i;
2125
2126 for (i = 0; i < map->m_len; i++)
2127 unmap_underlying_metadata(bdev, map->m_pblk + i);
2128 }
2129 return 0;
2266} 2130}
2267 2131
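The flag word handed to ext4_map_blocks() above is built by OR-ing independent policy bits. A hedged sketch of the same composition with made-up values (the real EXT4_GET_BLOCKS_* constants live in fs/ext4/ext4.h and may differ):

    /* Illustrative flag values only, not the kernel's. */
    #define GB_CREATE            0x01  /* allocate, do not just look up */
    #define GB_METADATA_NOFAIL   0x02  /* may use reserved metadata blocks */
    #define GB_IO_CREATE_EXT     0x04  /* unwritten extents for dioread_nolock */
    #define GB_DELALLOC_RESERVE  0x08  /* blocks were reserved at write time */

    static int build_get_blocks_flags(int dioread_nolock, int delalloc)
    {
        /* Writeback must not fail, so these two bits are unconditional. */
        int flags = GB_CREATE | GB_METADATA_NOFAIL;

        if (dioread_nolock)
            flags |= GB_IO_CREATE_EXT;
        if (delalloc)
            flags |= GB_DELALLOC_RESERVE;
        return flags;
    }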
2268/* 2132/*
2269 * write_cache_pages_da - walk the list of dirty pages of the given 2133 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
2270 * address space and accumulate pages that need writing, and call 2134 * mpd->len and submit pages underlying it for IO
2271 * mpage_da_map_and_submit to map a single contiguous memory region 2135 *
2272 * and then write them. 2136 * @handle - handle for journal operations
2137 * @mpd - extent to map
2138 *
2139 * The function maps extent starting at mpd->lblk of length mpd->len. If it is
2140 * delayed, blocks are allocated, if it is unwritten, we may need to convert
2141 * them to initialized or split the described range from larger unwritten
2142 * extent. Note that we need not map all the described range since allocation
2143 * can return fewer blocks or the range is covered by more unwritten extents. We
2144 * cannot map more because we are limited by reserved transaction credits. On
2145 * the other hand we always make sure that the last touched page is fully
2146 * mapped so that it can be written out (and thus forward progress is
2147 * guaranteed). After mapping we submit all mapped pages for IO.
2273 */ 2148 */
2274static int write_cache_pages_da(handle_t *handle, 2149static int mpage_map_and_submit_extent(handle_t *handle,
2275 struct address_space *mapping, 2150 struct mpage_da_data *mpd,
2276 struct writeback_control *wbc, 2151 bool *give_up_on_write)
2277 struct mpage_da_data *mpd,
2278 pgoff_t *done_index)
2279{ 2152{
2280 struct buffer_head *bh, *head; 2153 struct inode *inode = mpd->inode;
2281 struct inode *inode = mapping->host; 2154 struct ext4_map_blocks *map = &mpd->map;
2282 struct pagevec pvec; 2155 int err;
2283 unsigned int nr_pages; 2156 loff_t disksize;
2284 sector_t logical;
2285 pgoff_t index, end;
2286 long nr_to_write = wbc->nr_to_write;
2287 int i, tag, ret = 0;
2288
2289 memset(mpd, 0, sizeof(struct mpage_da_data));
2290 mpd->wbc = wbc;
2291 mpd->inode = inode;
2292 pagevec_init(&pvec, 0);
2293 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2294 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2295 2157
2296 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2158 mpd->io_submit.io_end->offset =
2159 ((loff_t)map->m_lblk) << inode->i_blkbits;
2160 do {
2161 err = mpage_map_one_extent(handle, mpd);
2162 if (err < 0) {
2163 struct super_block *sb = inode->i_sb;
2164
2165 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
2166 goto invalidate_dirty_pages;
2167 /*
2168 * Let the upper layers retry transient errors.
2169 * In the case of ENOSPC, if ext4_count_free_clusters()
2170 * is non-zero, a commit should free up blocks.
2171 */
2172 if ((err == -ENOMEM) ||
2173 (err == -ENOSPC && ext4_count_free_clusters(sb)))
2174 return err;
2175 ext4_msg(sb, KERN_CRIT,
2176 "Delayed block allocation failed for "
2177 "inode %lu at logical offset %llu with"
2178 " max blocks %u with error %d",
2179 inode->i_ino,
2180 (unsigned long long)map->m_lblk,
2181 (unsigned)map->m_len, -err);
2182 ext4_msg(sb, KERN_CRIT,
2183 "This should not happen!! Data will "
2184 "be lost\n");
2185 if (err == -ENOSPC)
2186 ext4_print_free_blocks(inode);
2187 invalidate_dirty_pages:
2188 *give_up_on_write = true;
2189 return err;
2190 }
2191 /*
2192 * Update buffer state, submit mapped pages, and get us new
2193 * extent to map
2194 */
2195 err = mpage_map_and_submit_buffers(mpd);
2196 if (err < 0)
2197 return err;
2198 } while (map->m_len);
2199
2200 /* Update on-disk size after IO is submitted */
2201 disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
2202 if (disksize > i_size_read(inode))
2203 disksize = i_size_read(inode);
2204 if (disksize > EXT4_I(inode)->i_disksize) {
2205 int err2;
2206
2207 ext4_update_i_disksize(inode, disksize);
2208 err2 = ext4_mark_inode_dirty(handle, inode);
2209 if (err2)
2210 ext4_error(inode->i_sb,
2211 "Failed to mark inode %lu dirty",
2212 inode->i_ino);
2213 if (!err)
2214 err = err2;
2215 }
2216 return err;
2217}
2218
2219/*
2220 * Calculate the total number of credits to reserve for one writepages
2221 * iteration. This is called from ext4_writepages(). We map an extent of
2222 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
2223 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
2224 * bpp - 1 blocks in bpp different extents.
2225 */
2226static int ext4_da_writepages_trans_blocks(struct inode *inode)
2227{
2228 int bpp = ext4_journal_blocks_per_page(inode);
2229
2230 return ext4_meta_trans_blocks(inode,
2231 MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
2232}
2233
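A worked instance of that reservation: with 4 KiB blocks on 4 KiB pages, ext4_journal_blocks_per_page() yields bpp = 1 and one iteration covers 2048 + 1 - 1 = 2048 blocks in a single extent; with 1 KiB blocks, bpp = 4 and it covers 2051 blocks spread over at most 4 extents. In sketch form (the constant value is assumed from the #define above):

    /* Worst-case blocks mapped per ext4_writepages() iteration. */
    static int writepages_map_len(int bpp /* journal blocks per page */)
    {
        enum { MAX_EXTENT_LEN = 2048 };   /* MAX_WRITEPAGES_EXTENT_LEN */

        return MAX_EXTENT_LEN + bpp - 1;  /* full extent + last partial page */
    }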
2234/*
2235 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
2236 * and underlying extent to map
2237 *
2238 * @mpd - where to look for pages
2239 *
2240 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
2241 * IO immediately. When we find a page which isn't mapped we start accumulating
2242 * extent of buffers underlying these pages that needs mapping (formed by
2243 * either delayed or unwritten buffers). We also lock the pages containing
2244 * these buffers. The extent found is returned in @mpd structure (starting at
2245 * mpd->lblk with length mpd->len blocks).
2246 *
2247 * Note that this function can attach bios to one io_end structure which are
2248 * neither logically nor physically contiguous. Although it may seem like an
2249 * unnecessary complication, it is actually inevitable in blocksize < pagesize
2250 * case as we need to track IO to all buffers underlying a page in one io_end.
2251 */
2252static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2253{
2254 struct address_space *mapping = mpd->inode->i_mapping;
2255 struct pagevec pvec;
2256 unsigned int nr_pages;
2257 pgoff_t index = mpd->first_page;
2258 pgoff_t end = mpd->last_page;
2259 int tag;
2260 int i, err = 0;
2261 int blkbits = mpd->inode->i_blkbits;
2262 ext4_lblk_t lblk;
2263 struct buffer_head *head;
2264
2265 if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
2297 tag = PAGECACHE_TAG_TOWRITE; 2266 tag = PAGECACHE_TAG_TOWRITE;
2298 else 2267 else
2299 tag = PAGECACHE_TAG_DIRTY; 2268 tag = PAGECACHE_TAG_DIRTY;
2300 2269
2301 *done_index = index; 2270 pagevec_init(&pvec, 0);
2271 mpd->map.m_len = 0;
2272 mpd->next_page = index;
2302 while (index <= end) { 2273 while (index <= end) {
2303 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2274 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2304 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2275 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2305 if (nr_pages == 0) 2276 if (nr_pages == 0)
2306 return 0; 2277 goto out;
2307 2278
2308 for (i = 0; i < nr_pages; i++) { 2279 for (i = 0; i < nr_pages; i++) {
2309 struct page *page = pvec.pages[i]; 2280 struct page *page = pvec.pages[i];
@@ -2318,31 +2289,21 @@ static int write_cache_pages_da(handle_t *handle,
2318 if (page->index > end) 2289 if (page->index > end)
2319 goto out; 2290 goto out;
2320 2291
2321 *done_index = page->index + 1; 2292 /* If we can't merge this page, we are done. */
2322 2293 if (mpd->map.m_len > 0 && mpd->next_page != page->index)
2323 /* 2294 goto out;
2324 * If we can't merge this page, and we have
2325 * accumulated a contiguous region, write it
2326 */
2327 if ((mpd->next_page != page->index) &&
2328 (mpd->next_page != mpd->first_page)) {
2329 mpage_da_map_and_submit(mpd);
2330 goto ret_extent_tail;
2331 }
2332 2295
2333 lock_page(page); 2296 lock_page(page);
2334
2335 /* 2297 /*
2336 * If the page is no longer dirty, or its 2298 * If the page is no longer dirty, or its mapping no
2337 * mapping no longer corresponds to inode we 2299 * longer corresponds to inode we are writing (which
2338 * are writing (which means it has been 2300 * means it has been truncated or invalidated), or the
2339 * truncated or invalidated), or the page is 2301 * page is already under writeback and we are not doing
2340 * already under writeback and we are not 2302 * a data integrity writeback, skip the page
2341 * doing a data integrity writeback, skip the page
2342 */ 2303 */
2343 if (!PageDirty(page) || 2304 if (!PageDirty(page) ||
2344 (PageWriteback(page) && 2305 (PageWriteback(page) &&
2345 (wbc->sync_mode == WB_SYNC_NONE)) || 2306 (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
2346 unlikely(page->mapping != mapping)) { 2307 unlikely(page->mapping != mapping)) {
2347 unlock_page(page); 2308 unlock_page(page);
2348 continue; 2309 continue;
@@ -2351,106 +2312,70 @@ static int write_cache_pages_da(handle_t *handle,
2351 wait_on_page_writeback(page); 2312 wait_on_page_writeback(page);
2352 BUG_ON(PageWriteback(page)); 2313 BUG_ON(PageWriteback(page));
2353 2314
2354 /* 2315 if (mpd->map.m_len == 0)
2355 * If we have inline data and arrive here, it means that
2356 * we will soon create the block for the 1st page, so
2357 * we'd better clear the inline data here.
2358 */
2359 if (ext4_has_inline_data(inode)) {
2360 BUG_ON(ext4_test_inode_state(inode,
2361 EXT4_STATE_MAY_INLINE_DATA));
2362 ext4_destroy_inline_data(handle, inode);
2363 }
2364
2365 if (mpd->next_page != page->index)
2366 mpd->first_page = page->index; 2316 mpd->first_page = page->index;
2367 mpd->next_page = page->index + 1; 2317 mpd->next_page = page->index + 1;
2368 logical = (sector_t) page->index <<
2369 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2370
2371 /* Add all dirty buffers to mpd */ 2318 /* Add all dirty buffers to mpd */
2319 lblk = ((ext4_lblk_t)page->index) <<
2320 (PAGE_CACHE_SHIFT - blkbits);
2372 head = page_buffers(page); 2321 head = page_buffers(page);
2373 bh = head; 2322 if (!add_page_bufs_to_extent(mpd, head, head, lblk))
2374 do { 2323 goto out;
2375 BUG_ON(buffer_locked(bh)); 2324 /* So far everything mapped? Submit the page for IO. */
2376 /* 2325 if (mpd->map.m_len == 0) {
2377 * We need to try to allocate unmapped blocks 2326 err = mpage_submit_page(mpd, page);
2378 * in the same page. Otherwise we won't make 2327 if (err < 0)
2379 * progress with the page in ext4_writepage
2380 */
2381 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2382 mpage_add_bh_to_extent(mpd, logical,
2383 bh->b_state);
2384 if (mpd->io_done)
2385 goto ret_extent_tail;
2386 } else if (buffer_dirty(bh) &&
2387 buffer_mapped(bh)) {
2388 /*
2389 * mapped dirty buffer. We need to
2390 * update the b_state because we look
2391 * at b_state in mpage_da_map_blocks.
2392 * We don't update b_size because if we
2393 * find an unmapped buffer_head later
2394 * we need to use the b_state flag of
2395 * that buffer_head.
2396 */
2397 if (mpd->b_size == 0)
2398 mpd->b_state =
2399 bh->b_state & BH_FLAGS;
2400 }
2401 logical++;
2402 } while ((bh = bh->b_this_page) != head);
2403
2404 if (nr_to_write > 0) {
2405 nr_to_write--;
2406 if (nr_to_write == 0 &&
2407 wbc->sync_mode == WB_SYNC_NONE)
2408 /*
2409 * We stop writing back only if we are
2410 * not doing integrity sync. In case of
2411 * integrity sync we have to keep going
2412 * because someone may be concurrently
2413 * dirtying pages, and we might have
2414 * synced a lot of newly appeared dirty
2415 * pages, but have not synced all of the
2416 * old dirty pages.
2417 */
2418 goto out; 2328 goto out;
2419 } 2329 }
2330
2331 /*
2332 * Accumulated enough dirty pages? This doesn't apply
2333 * to WB_SYNC_ALL mode. For integrity sync we have to
2334 * keep going because someone may be concurrently
2335 * dirtying pages, and we might have synced a lot of
2336 * newly appeared dirty pages, but have not synced all
2337 * of the old dirty pages.
2338 */
2339 if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
2340 mpd->next_page - mpd->first_page >=
2341 mpd->wbc->nr_to_write)
2342 goto out;
2420 } 2343 }
2421 pagevec_release(&pvec); 2344 pagevec_release(&pvec);
2422 cond_resched(); 2345 cond_resched();
2423 } 2346 }
2424 return 0; 2347 return 0;
2425ret_extent_tail:
2426 ret = MPAGE_DA_EXTENT_TAIL;
2427out: 2348out:
2428 pagevec_release(&pvec); 2349 pagevec_release(&pvec);
2429 cond_resched(); 2350 return err;
2430 return ret;
2431} 2351}
2432 2352
2353static int __writepage(struct page *page, struct writeback_control *wbc,
2354 void *data)
2355{
2356 struct address_space *mapping = data;
2357 int ret = ext4_writepage(page, wbc);
2358 mapping_set_error(mapping, ret);
2359 return ret;
2360}
2433 2361
2434static int ext4_da_writepages(struct address_space *mapping, 2362static int ext4_writepages(struct address_space *mapping,
2435 struct writeback_control *wbc) 2363 struct writeback_control *wbc)
2436{ 2364{
2437 pgoff_t index; 2365 pgoff_t writeback_index = 0;
2366 long nr_to_write = wbc->nr_to_write;
2438 int range_whole = 0; 2367 int range_whole = 0;
2368 int cycled = 1;
2439 handle_t *handle = NULL; 2369 handle_t *handle = NULL;
2440 struct mpage_da_data mpd; 2370 struct mpage_da_data mpd;
2441 struct inode *inode = mapping->host; 2371 struct inode *inode = mapping->host;
2442 int pages_written = 0; 2372 int needed_blocks, rsv_blocks = 0, ret = 0;
2443 unsigned int max_pages;
2444 int range_cyclic, cycled = 1, io_done = 0;
2445 int needed_blocks, ret = 0;
2446 long desired_nr_to_write, nr_to_writebump = 0;
2447 loff_t range_start = wbc->range_start;
2448 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2373 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2449 pgoff_t done_index = 0; 2374 bool done;
2450 pgoff_t end;
2451 struct blk_plug plug; 2375 struct blk_plug plug;
2376 bool give_up_on_write = false;
2452 2377
2453 trace_ext4_da_writepages(inode, wbc); 2378 trace_ext4_writepages(inode, wbc);
2454 2379
2455 /* 2380 /*
2456 * No pages to write? This is mainly a kludge to avoid starting 2381 * No pages to write? This is mainly a kludge to avoid starting
@@ -2460,164 +2385,165 @@ static int ext4_da_writepages(struct address_space *mapping,
2460 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2385 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2461 return 0; 2386 return 0;
2462 2387
2388 if (ext4_should_journal_data(inode)) {
2389 struct blk_plug plug;
2390 int ret;
2391
2392 blk_start_plug(&plug);
2393 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
2394 blk_finish_plug(&plug);
2395 return ret;
2396 }
2397
2463 /* 2398 /*
2464 * If the filesystem has aborted, it is read-only, so return 2399 * If the filesystem has aborted, it is read-only, so return
2465 * right away instead of dumping stack traces later on that 2400 * right away instead of dumping stack traces later on that
2466 * will obscure the real source of the problem. We test 2401 * will obscure the real source of the problem. We test
2467 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because 2402 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2468 * the latter could be true if the filesystem is mounted 2403 * the latter could be true if the filesystem is mounted
2469 * read-only, and in that case, ext4_da_writepages should 2404 * read-only, and in that case, ext4_writepages should
2470 * *never* be called, so if that ever happens, we would want 2405 * *never* be called, so if that ever happens, we would want
2471 * the stack trace. 2406 * the stack trace.
2472 */ 2407 */
2473 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2408 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2474 return -EROFS; 2409 return -EROFS;
2475 2410
2411 if (ext4_should_dioread_nolock(inode)) {
2412 /*
2413 * We may need to convert up to one extent per block in
2414 * the page and we may dirty the inode.
2415 */
2416 rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
2417 }
2418
2419 /*
2420 * If we have inline data and arrive here, it means that
2421 * we will soon create the block for the 1st page, so
2422 * we'd better clear the inline data here.
2423 */
2424 if (ext4_has_inline_data(inode)) {
2425 /* Just inode will be modified... */
2426 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
2427 if (IS_ERR(handle)) {
2428 ret = PTR_ERR(handle);
2429 goto out_writepages;
2430 }
2431 BUG_ON(ext4_test_inode_state(inode,
2432 EXT4_STATE_MAY_INLINE_DATA));
2433 ext4_destroy_inline_data(handle, inode);
2434 ext4_journal_stop(handle);
2435 }
2436
2476 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2437 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2477 range_whole = 1; 2438 range_whole = 1;
2478 2439
2479 range_cyclic = wbc->range_cyclic;
2480 if (wbc->range_cyclic) { 2440 if (wbc->range_cyclic) {
2481 index = mapping->writeback_index; 2441 writeback_index = mapping->writeback_index;
2482 if (index) 2442 if (writeback_index)
2483 cycled = 0; 2443 cycled = 0;
2484 wbc->range_start = index << PAGE_CACHE_SHIFT; 2444 mpd.first_page = writeback_index;
2485 wbc->range_end = LLONG_MAX; 2445 mpd.last_page = -1;
2486 wbc->range_cyclic = 0;
2487 end = -1;
2488 } else { 2446 } else {
2489 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2447 mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
2490 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2448 mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
2491 }
2492
2493 /*
2494 * This works around two forms of stupidity. The first is in
2495 * the writeback code, which caps the maximum number of pages
2496 * written to be 1024 pages. This is wrong on multiple
2497 * levels; different architectures have a different page size,
2498 * which changes the maximum amount of data which gets
2499 * written. Secondly, 4 megabytes is way too small. XFS
2500 * forces this value to be 16 megabytes by multiplying
2501 * nr_to_write parameter by four, and then relies on its
2502 * allocator to allocate larger extents to make them
2503 * contiguous. Unfortunately this brings us to the second
2504 * stupidity, which is that ext4's mballoc code only allocates
2505 * at most 2048 blocks. So we force contiguous writes up to
2506 * the number of dirty blocks in the inode, or
2507 * sbi->max_writeback_mb_bump whichever is smaller.
2508 */
2509 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2510 if (!range_cyclic && range_whole) {
2511 if (wbc->nr_to_write == LONG_MAX)
2512 desired_nr_to_write = wbc->nr_to_write;
2513 else
2514 desired_nr_to_write = wbc->nr_to_write * 8;
2515 } else
2516 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2517 max_pages);
2518 if (desired_nr_to_write > max_pages)
2519 desired_nr_to_write = max_pages;
2520
2521 if (wbc->nr_to_write < desired_nr_to_write) {
2522 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2523 wbc->nr_to_write = desired_nr_to_write;
2524 } 2449 }
2525 2450
2451 mpd.inode = inode;
2452 mpd.wbc = wbc;
2453 ext4_io_submit_init(&mpd.io_submit, wbc);
2526retry: 2454retry:
2527 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2455 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2528 tag_pages_for_writeback(mapping, index, end); 2456 tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
2529 2457 done = false;
2530 blk_start_plug(&plug); 2458 blk_start_plug(&plug);
2531 while (!ret && wbc->nr_to_write > 0) { 2459 while (!done && mpd.first_page <= mpd.last_page) {
2460 /* For each extent of pages we use new io_end */
2461 mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
2462 if (!mpd.io_submit.io_end) {
2463 ret = -ENOMEM;
2464 break;
2465 }
2532 2466
2533 /* 2467 /*
2534 * we insert one extent at a time. So we need 2468 * We have two constraints: We find one extent to map and we
2535 * credits needed for a single extent allocation. 2469 * must always write out the whole page (makes a difference when
2536 * journalled mode is currently not supported 2470 * blocksize < pagesize) so that we don't block on IO when we
2537 * by delalloc 2471 * try to write out the rest of the page. Journalled mode is
2472 * not supported by delalloc.
2538 */ 2473 */
2539 BUG_ON(ext4_should_journal_data(inode)); 2474 BUG_ON(ext4_should_journal_data(inode));
2540 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2475 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2541 2476
2542 /* start a new transaction*/ 2477 /* start a new transaction */
2543 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2478 handle = ext4_journal_start_with_reserve(inode,
2544 needed_blocks); 2479 EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
2545 if (IS_ERR(handle)) { 2480 if (IS_ERR(handle)) {
2546 ret = PTR_ERR(handle); 2481 ret = PTR_ERR(handle);
2547 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2482 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2548 "%ld pages, ino %lu; err %d", __func__, 2483 "%ld pages, ino %lu; err %d", __func__,
2549 wbc->nr_to_write, inode->i_ino, ret); 2484 wbc->nr_to_write, inode->i_ino, ret);
2550 blk_finish_plug(&plug); 2485 /* Release allocated io_end */
2551 goto out_writepages; 2486 ext4_put_io_end(mpd.io_submit.io_end);
2487 break;
2552 } 2488 }
2553 2489
2554 /* 2490 trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
2555 * Now call write_cache_pages_da() to find the next 2491 ret = mpage_prepare_extent_to_map(&mpd);
2556 * contiguous region of logical blocks that need 2492 if (!ret) {
2557 * blocks to be allocated by ext4 and submit them. 2493 if (mpd.map.m_len)
2558 */ 2494 ret = mpage_map_and_submit_extent(handle, &mpd,
2559 ret = write_cache_pages_da(handle, mapping, 2495 &give_up_on_write);
2560 wbc, &mpd, &done_index); 2496 else {
2561 /* 2497 /*
2562 * If we have a contiguous extent of pages and we 2498 * We scanned the whole range (or exhausted
2563 * haven't done the I/O yet, map the blocks and submit 2499 * nr_to_write), submitted what was mapped and
2564 * them for I/O. 2500 * didn't find anything needing mapping. We are
2565 */ 2501 * done.
2566 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 2502 */
2567 mpage_da_map_and_submit(&mpd); 2503 done = true;
2568 ret = MPAGE_DA_EXTENT_TAIL; 2504 }
2569 } 2505 }
2570 trace_ext4_da_write_pages(inode, &mpd);
2571 wbc->nr_to_write -= mpd.pages_written;
2572
2573 ext4_journal_stop(handle); 2506 ext4_journal_stop(handle);
2574 2507 /* Submit prepared bio */
2575 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 2508 ext4_io_submit(&mpd.io_submit);
2576 /* commit the transaction which would 2509 /* Unlock pages we didn't use */
2510 mpage_release_unused_pages(&mpd, give_up_on_write);
2511 /* Drop our io_end reference we got from init */
2512 ext4_put_io_end(mpd.io_submit.io_end);
2513
2514 if (ret == -ENOSPC && sbi->s_journal) {
2515 /*
2516 * Commit the transaction which would
2577 * free blocks released in the transaction 2517 * free blocks released in the transaction
2578 * and try again 2518 * and try again
2579 */ 2519 */
2580 jbd2_journal_force_commit_nested(sbi->s_journal); 2520 jbd2_journal_force_commit_nested(sbi->s_journal);
2581 ret = 0; 2521 ret = 0;
2582 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2522 continue;
2583 /* 2523 }
2584 * Got one extent now try with rest of the pages. 2524 /* Fatal error - ENOMEM, EIO... */
2585 * If mpd.retval is set -EIO, journal is aborted. 2525 if (ret)
2586 * So we don't need to write any more.
2587 */
2588 pages_written += mpd.pages_written;
2589 ret = mpd.retval;
2590 io_done = 1;
2591 } else if (wbc->nr_to_write)
2592 /*
2593 * There is no more writeout needed
2594 * or we requested a nonblocking writeout
2595 * and we found the device congested
2596 */
2597 break; 2526 break;
2598 } 2527 }
2599 blk_finish_plug(&plug); 2528 blk_finish_plug(&plug);
2600 if (!io_done && !cycled) { 2529 if (!ret && !cycled) {
2601 cycled = 1; 2530 cycled = 1;
2602 index = 0; 2531 mpd.last_page = writeback_index - 1;
2603 wbc->range_start = index << PAGE_CACHE_SHIFT; 2532 mpd.first_page = 0;
2604 wbc->range_end = mapping->writeback_index - 1;
2605 goto retry; 2533 goto retry;
2606 } 2534 }
2607 2535
2608 /* Update index */ 2536 /* Update index */
2609 wbc->range_cyclic = range_cyclic;
2610 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2537 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2611 /* 2538 /*
2612 * set the writeback_index so that range_cyclic 2539 * Set the writeback_index so that range_cyclic
2613 * mode will write it back later 2540 * mode will write it back later
2614 */ 2541 */
2615 mapping->writeback_index = done_index; 2542 mapping->writeback_index = mpd.first_page;
2616 2543
2617out_writepages: 2544out_writepages:
2618 wbc->nr_to_write -= nr_to_writebump; 2545 trace_ext4_writepages_result(inode, wbc, ret,
2619 wbc->range_start = range_start; 2546 nr_to_write - wbc->nr_to_write);
2620 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2621 return ret; 2547 return ret;
2622} 2548}
2623 2549
@@ -2829,7 +2755,8 @@ static int ext4_da_write_end(struct file *file,
2829 return ret ? ret : copied; 2755 return ret ? ret : copied;
2830} 2756}
2831 2757
2832static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 2758static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
2759 unsigned int length)
2833{ 2760{
2834 /* 2761 /*
2835 * Drop reserved blocks 2762 * Drop reserved blocks
@@ -2838,10 +2765,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
2838 if (!page_has_buffers(page)) 2765 if (!page_has_buffers(page))
2839 goto out; 2766 goto out;
2840 2767
2841 ext4_da_page_release_reservation(page, offset); 2768 ext4_da_page_release_reservation(page, offset, length);
2842 2769
2843out: 2770out:
2844 ext4_invalidatepage(page, offset); 2771 ext4_invalidatepage(page, offset, length);
2845 2772
2846 return; 2773 return;
2847} 2774}
@@ -2864,7 +2791,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
2864 * laptop_mode, not even desirable). However, to do otherwise 2791 * laptop_mode, not even desirable). However, to do otherwise
2865 * would require replicating code paths in: 2792 * would require replicating code paths in:
2866 * 2793 *
2867 * ext4_da_writepages() -> 2794 * ext4_writepages() ->
2868 * write_cache_pages() ---> (via passed in callback function) 2795 * write_cache_pages() ---> (via passed in callback function)
2869 * __mpage_da_writepage() --> 2796 * __mpage_da_writepage() -->
2870 * mpage_add_bh_to_extent() 2797 * mpage_add_bh_to_extent()
@@ -2989,37 +2916,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,
2989 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2916 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2990} 2917}
2991 2918
2992static void ext4_invalidatepage(struct page *page, unsigned long offset) 2919static void ext4_invalidatepage(struct page *page, unsigned int offset,
2920 unsigned int length)
2993{ 2921{
2994 trace_ext4_invalidatepage(page, offset); 2922 trace_ext4_invalidatepage(page, offset, length);
2995 2923
2996 /* No journalling happens on data buffers when this function is used */ 2924 /* No journalling happens on data buffers when this function is used */
2997 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); 2925 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
2998 2926
2999 block_invalidatepage(page, offset); 2927 block_invalidatepage(page, offset, length);
3000} 2928}
3001 2929
3002static int __ext4_journalled_invalidatepage(struct page *page, 2930static int __ext4_journalled_invalidatepage(struct page *page,
3003 unsigned long offset) 2931 unsigned int offset,
2932 unsigned int length)
3004{ 2933{
3005 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 2934 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3006 2935
3007 trace_ext4_journalled_invalidatepage(page, offset); 2936 trace_ext4_journalled_invalidatepage(page, offset, length);
3008 2937
3009 /* 2938 /*
3010 * If it's a full truncate we just forget about the pending dirtying 2939 * If it's a full truncate we just forget about the pending dirtying
3011 */ 2940 */
3012 if (offset == 0) 2941 if (offset == 0 && length == PAGE_CACHE_SIZE)
3013 ClearPageChecked(page); 2942 ClearPageChecked(page);
3014 2943
3015 return jbd2_journal_invalidatepage(journal, page, offset); 2944 return jbd2_journal_invalidatepage(journal, page, offset, length);
3016} 2945}
3017 2946
3018/* Wrapper for aops... */ 2947/* Wrapper for aops... */
3019static void ext4_journalled_invalidatepage(struct page *page, 2948static void ext4_journalled_invalidatepage(struct page *page,
3020 unsigned long offset) 2949 unsigned int offset,
2950 unsigned int length)
3021{ 2951{
3022 WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); 2952 WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
3023} 2953}
3024 2954
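The new offset/length pair lets the journalled path distinguish a whole-page invalidation, which may forget pending dirtying, from a partial one, which must not. A one-line sketch of that test (page size assumed 4 KiB for illustration):

    #define PG_SIZE 4096u

    /* Only a full-page invalidation may drop the page's checked state. */
    static int invalidates_whole_page(unsigned int offset, unsigned int length)
    {
        return offset == 0 && length == PG_SIZE;
    }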
3025static int ext4_releasepage(struct page *page, gfp_t wait) 2955static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3067,9 +2997,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3067 struct inode *inode = file_inode(iocb->ki_filp); 2997 struct inode *inode = file_inode(iocb->ki_filp);
3068 ext4_io_end_t *io_end = iocb->private; 2998 ext4_io_end_t *io_end = iocb->private;
3069 2999
3070 /* if not async direct IO or dio with 0 bytes write, just return */ 3000 /* if not async direct IO just return */
3071 if (!io_end || !size) 3001 if (!io_end) {
3072 goto out; 3002 inode_dio_done(inode);
3003 if (is_async)
3004 aio_complete(iocb, ret, 0);
3005 return;
3006 }
3073 3007
3074 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3008 ext_debug("ext4_end_io_dio(): io_end 0x%p "
3075 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", 3009 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3077,25 +3011,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3077 size); 3011 size);
3078 3012
3079 iocb->private = NULL; 3013 iocb->private = NULL;
3080
3081 /* if not aio dio with unwritten extents, just free io and return */
3082 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3083 ext4_free_io_end(io_end);
3084out:
3085 inode_dio_done(inode);
3086 if (is_async)
3087 aio_complete(iocb, ret, 0);
3088 return;
3089 }
3090
3091 io_end->offset = offset; 3014 io_end->offset = offset;
3092 io_end->size = size; 3015 io_end->size = size;
3093 if (is_async) { 3016 if (is_async) {
3094 io_end->iocb = iocb; 3017 io_end->iocb = iocb;
3095 io_end->result = ret; 3018 io_end->result = ret;
3096 } 3019 }
3097 3020 ext4_put_io_end_defer(io_end);
3098 ext4_add_complete_io(io_end);
3099} 3021}
3100 3022
3101/* 3023/*
@@ -3129,6 +3051,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3129 get_block_t *get_block_func = NULL; 3051 get_block_t *get_block_func = NULL;
3130 int dio_flags = 0; 3052 int dio_flags = 0;
3131 loff_t final_size = offset + count; 3053 loff_t final_size = offset + count;
3054 ext4_io_end_t *io_end = NULL;
3132 3055
3133 /* Use the old path for reads and writes beyond i_size. */ 3056 /* Use the old path for reads and writes beyond i_size. */
3134 if (rw != WRITE || final_size > inode->i_size) 3057 if (rw != WRITE || final_size > inode->i_size)
@@ -3136,11 +3059,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3136 3059
3137 BUG_ON(iocb->private == NULL); 3060 BUG_ON(iocb->private == NULL);
3138 3061
3062 /*
3063 * Make all waiters for direct IO properly wait also for extent
3064 * conversion. This also disallows race between truncate() and
3065 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
3066 */
3067 if (rw == WRITE)
3068 atomic_inc(&inode->i_dio_count);
3069
3139 /* If we do a overwrite dio, i_mutex locking can be released */ 3070 /* If we do a overwrite dio, i_mutex locking can be released */
3140 overwrite = *((int *)iocb->private); 3071 overwrite = *((int *)iocb->private);
3141 3072
3142 if (overwrite) { 3073 if (overwrite) {
3143 atomic_inc(&inode->i_dio_count);
3144 down_read(&EXT4_I(inode)->i_data_sem); 3074 down_read(&EXT4_I(inode)->i_data_sem);
3145 mutex_unlock(&inode->i_mutex); 3075 mutex_unlock(&inode->i_mutex);
3146 } 3076 }
@@ -3167,13 +3097,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3167 iocb->private = NULL; 3097 iocb->private = NULL;
3168 ext4_inode_aio_set(inode, NULL); 3098 ext4_inode_aio_set(inode, NULL);
3169 if (!is_sync_kiocb(iocb)) { 3099 if (!is_sync_kiocb(iocb)) {
3170 ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); 3100 io_end = ext4_init_io_end(inode, GFP_NOFS);
3171 if (!io_end) { 3101 if (!io_end) {
3172 ret = -ENOMEM; 3102 ret = -ENOMEM;
3173 goto retake_lock; 3103 goto retake_lock;
3174 } 3104 }
3175 io_end->flag |= EXT4_IO_END_DIRECT; 3105 io_end->flag |= EXT4_IO_END_DIRECT;
3176 iocb->private = io_end; 3106 /*
3107 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
3108 */
3109 iocb->private = ext4_get_io_end(io_end);
3177 /* 3110 /*
3178 * we save the io structure for current async direct 3111 * we save the io structure for current async direct
3179 * IO, so that later ext4_map_blocks() could flag the 3112 * IO, so that later ext4_map_blocks() could flag the
@@ -3197,33 +3130,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3197 NULL, 3130 NULL,
3198 dio_flags); 3131 dio_flags);
3199 3132
3200 if (iocb->private)
3201 ext4_inode_aio_set(inode, NULL);
3202 /* 3133 /*
3203 * The io_end structure takes a reference to the inode, that 3134 * Put our reference to io_end. This can free the io_end structure e.g.
3204 * structure needs to be destroyed and the reference to the 3135 * in sync IO case or in case of error. It can even perform extent
3205 * inode need to be dropped, when IO is complete, even with 0 3136 * conversion if all bios we submitted finished before we got here.
3206 * byte write, or failed. 3137 * Note that in that case iocb->private can be already set to NULL
3207 * 3138 * here.
3208 * In the successful AIO DIO case, the io_end structure will
3209 * be destroyed and the reference to the inode will be dropped
3210 * after the end_io call back function is called.
3211 *
3212 * In the case there is 0 byte write, or error case, since VFS
3213 * direct IO won't invoke the end_io call back function, we
3214 * need to free the end_io structure here.
3215 */ 3139 */
3216 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3140 if (io_end) {
3217 ext4_free_io_end(iocb->private); 3141 ext4_inode_aio_set(inode, NULL);
3218 iocb->private = NULL; 3142 ext4_put_io_end(io_end);
3219 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3143 /*
3144 * When no IO was submitted ext4_end_io_dio() was not
3145 * called so we have to put iocb's reference.
3146 */
3147 if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
3148 WARN_ON(iocb->private != io_end);
3149 WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
3150 WARN_ON(io_end->iocb);
3151 /*
3152 * Generic code already did inode_dio_done() so we
3153 * have to clear EXT4_IO_END_DIRECT to not do it for
3154 * the second time.
3155 */
3156 io_end->flag = 0;
3157 ext4_put_io_end(io_end);
3158 iocb->private = NULL;
3159 }
3160 }
3161 if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3220 EXT4_STATE_DIO_UNWRITTEN)) { 3162 EXT4_STATE_DIO_UNWRITTEN)) {
3221 int err; 3163 int err;
3222 /* 3164 /*
3223 * for the non-AIO case, since the IO is already 3165 * for the non-AIO case, since the IO is already
3224 * completed, we could do the conversion right here 3166 * completed, we could do the conversion right here
3225 */ 3167 */
3226 err = ext4_convert_unwritten_extents(inode, 3168 err = ext4_convert_unwritten_extents(NULL, inode,
3227 offset, ret); 3169 offset, ret);
3228 if (err < 0) 3170 if (err < 0)
3229 ret = err; 3171 ret = err;
@@ -3231,9 +3173,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3231 } 3173 }
3232 3174
3233retake_lock: 3175retake_lock:
3176 if (rw == WRITE)
3177 inode_dio_done(inode);
3234 /* take i_mutex locking again if we do an overwrite dio */ 3178 /* take i_mutex locking again if we do an overwrite dio */
3235 if (overwrite) { 3179 if (overwrite) {
3236 inode_dio_done(inode);
3237 up_read(&EXT4_I(inode)->i_data_sem); 3180 up_read(&EXT4_I(inode)->i_data_sem);
3238 mutex_lock(&inode->i_mutex); 3181 mutex_lock(&inode->i_mutex);
3239 } 3182 }
@@ -3292,6 +3235,7 @@ static const struct address_space_operations ext4_aops = {
3292 .readpage = ext4_readpage, 3235 .readpage = ext4_readpage,
3293 .readpages = ext4_readpages, 3236 .readpages = ext4_readpages,
3294 .writepage = ext4_writepage, 3237 .writepage = ext4_writepage,
3238 .writepages = ext4_writepages,
3295 .write_begin = ext4_write_begin, 3239 .write_begin = ext4_write_begin,
3296 .write_end = ext4_write_end, 3240 .write_end = ext4_write_end,
3297 .bmap = ext4_bmap, 3241 .bmap = ext4_bmap,
@@ -3307,6 +3251,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3307 .readpage = ext4_readpage, 3251 .readpage = ext4_readpage,
3308 .readpages = ext4_readpages, 3252 .readpages = ext4_readpages,
3309 .writepage = ext4_writepage, 3253 .writepage = ext4_writepage,
3254 .writepages = ext4_writepages,
3310 .write_begin = ext4_write_begin, 3255 .write_begin = ext4_write_begin,
3311 .write_end = ext4_journalled_write_end, 3256 .write_end = ext4_journalled_write_end,
3312 .set_page_dirty = ext4_journalled_set_page_dirty, 3257 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3322,7 +3267,7 @@ static const struct address_space_operations ext4_da_aops = {
3322 .readpage = ext4_readpage, 3267 .readpage = ext4_readpage,
3323 .readpages = ext4_readpages, 3268 .readpages = ext4_readpages,
3324 .writepage = ext4_writepage, 3269 .writepage = ext4_writepage,
3325 .writepages = ext4_da_writepages, 3270 .writepages = ext4_writepages,
3326 .write_begin = ext4_da_write_begin, 3271 .write_begin = ext4_da_write_begin,
3327 .write_end = ext4_da_write_end, 3272 .write_end = ext4_da_write_end,
3328 .bmap = ext4_bmap, 3273 .bmap = ext4_bmap,
@@ -3355,89 +3300,56 @@ void ext4_set_aops(struct inode *inode)
3355 inode->i_mapping->a_ops = &ext4_aops; 3300 inode->i_mapping->a_ops = &ext4_aops;
3356} 3301}
3357 3302
3358
3359/* 3303/*
3360 * ext4_discard_partial_page_buffers() 3304 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3361 * Wrapper function for ext4_discard_partial_page_buffers_no_lock. 3305 * up to the end of the block which corresponds to `from'.
3362 * This function finds and locks the page containing the offset 3306 * This required during truncate. We need to physically zero the tail end
3363 * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. 3307 * of that block so it doesn't yield old data if the file is later grown.
3364 * Calling functions that already have the page locked should call
3365 * ext4_discard_partial_page_buffers_no_lock directly.
3366 */ 3308 */
3367int ext4_discard_partial_page_buffers(handle_t *handle, 3309int ext4_block_truncate_page(handle_t *handle,
3368 struct address_space *mapping, loff_t from, 3310 struct address_space *mapping, loff_t from)
3369 loff_t length, int flags)
3370{ 3311{
3312 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3313 unsigned length;
3314 unsigned blocksize;
3371 struct inode *inode = mapping->host; 3315 struct inode *inode = mapping->host;
3372 struct page *page;
3373 int err = 0;
3374 3316
3375 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3317 blocksize = inode->i_sb->s_blocksize;
3376 mapping_gfp_mask(mapping) & ~__GFP_FS); 3318 length = blocksize - (offset & (blocksize - 1));
3377 if (!page)
3378 return -ENOMEM;
3379
3380 err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
3381 from, length, flags);
3382 3319
3383 unlock_page(page); 3320 return ext4_block_zero_page_range(handle, mapping, from, length);
3384 page_cache_release(page);
3385 return err;
3386} 3321}
3387 3322
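ext4_block_truncate_page() now reduces to computing how many bytes remain in the block containing 'from' and delegating to the range helper. A sketch of the length math, assuming a power-of-two block size (names illustrative):

    /* Bytes from 'from' through the end of its block. */
    static unsigned tail_zero_len(unsigned long long from, unsigned blocksize)
    {
        unsigned offset = from & (blocksize - 1);  /* position within block */

        return blocksize - offset;                 /* zero up to block end */
    }

Truncating at byte 1,000,000 with 4 KiB blocks, for instance, gives offset 576 and zeroes the remaining 3,520 bytes of that block.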
3388/* 3323/*
3389 * ext4_discard_partial_page_buffers_no_lock() 3324 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3390 * Zeros a page range of length 'length' starting from offset 'from'. 3325 * starting from file offset 'from'. The range to be zeroed must
3391 * Buffer heads that correspond to the block aligned regions of the 3326 * be contained within one block. If the specified range exceeds
3392 * zeroed range will be unmapped. Non-block-aligned regions 3327 * the end of the block it will be shortened to the end of the block
3393 * will have the corresponding buffer head mapped if needed so that 3328 * that corresponds to 'from'
3394 * the region of the page can be updated with the partial zero out.
3395 *
3396 * This function assumes that the page has already been locked.
3397 * The range to be discarded must be contained within the given page.
3398 * If the specified range exceeds the end of the page it will be shortened
3399 * to the end of the page that corresponds to 'from'. This function is
3400 * appropriate for updating a page and its buffer heads to be unmapped and
3401 * zeroed for blocks that have been either released, or are going to be
3402 * released.
3403 *
3404 * handle: The journal handle
3405 * inode: The file's inode
3406 * page: A locked page that contains the offset "from"
3407 * from: The starting byte offset (from the beginning of the file)
3408 * to begin discarding
3409 * len: The length of bytes to discard
3410 * flags: Optional flags that may be used:
3411 *
3412 * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
3413 * Only zero the regions of the page whose buffer heads
3414 * have already been unmapped. This flag is appropriate
3415 * for updating the contents of a page whose blocks may
3416 * have already been released, and we only want to zero
3417 * out the regions that correspond to those released blocks.
3418 *
3419 * Returns zero on success or negative on failure.
3420 */ 3329 */
3421static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, 3330int ext4_block_zero_page_range(handle_t *handle,
3422 struct inode *inode, struct page *page, loff_t from, 3331 struct address_space *mapping, loff_t from, loff_t length)
3423 loff_t length, int flags)
3424{ 3332{
3425 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3333 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3426 unsigned int offset = from & (PAGE_CACHE_SIZE-1); 3334 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3427 unsigned int blocksize, max, pos; 3335 unsigned blocksize, max, pos;
3428 ext4_lblk_t iblock; 3336 ext4_lblk_t iblock;
3337 struct inode *inode = mapping->host;
3429 struct buffer_head *bh; 3338 struct buffer_head *bh;
3339 struct page *page;
3430 int err = 0; 3340 int err = 0;
3431 3341
3432 blocksize = inode->i_sb->s_blocksize; 3342 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3433 max = PAGE_CACHE_SIZE - offset; 3343 mapping_gfp_mask(mapping) & ~__GFP_FS);
3344 if (!page)
3345 return -ENOMEM;
3434 3346
3435 if (index != page->index) 3347 blocksize = inode->i_sb->s_blocksize;
3436 return -EINVAL; 3348 max = blocksize - (offset & (blocksize - 1));
3437 3349
3438 /* 3350 /*
3439 * correct length if it does not fall between 3351 * correct length if it does not fall between
3440 * 'from' and the end of the page 3352 * 'from' and the end of the block
3441 */ 3353 */
3442 if (length > max || length < 0) 3354 if (length > max || length < 0)
3443 length = max; 3355 length = max;
@@ -3455,106 +3367,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3455 iblock++; 3367 iblock++;
3456 pos += blocksize; 3368 pos += blocksize;
3457 } 3369 }
3458 3370 if (buffer_freed(bh)) {
3459 pos = offset; 3371 BUFFER_TRACE(bh, "freed: skip");
3460 while (pos < offset + length) { 3372 goto unlock;
3461 unsigned int end_of_block, range_to_discard; 3373 }
3462 3374 if (!buffer_mapped(bh)) {
3463 err = 0; 3375 BUFFER_TRACE(bh, "unmapped");
3464 3376 ext4_get_block(inode, iblock, bh, 0);
3465 /* The length of space left to zero and unmap */ 3377 /* unmapped? It's a hole - nothing to do */
3466 range_to_discard = offset + length - pos;
3467
3468 /* The length of space until the end of the block */
3469 end_of_block = blocksize - (pos & (blocksize-1));
3470
3471 /*
3472 * Do not unmap or zero past end of block
3473 * for this buffer head
3474 */
3475 if (range_to_discard > end_of_block)
3476 range_to_discard = end_of_block;
3477
3478
3479 /*
3480 * Skip this buffer head if we are only zeroing unmapped
3481 * regions of the page
3482 */
3483 if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
3484 buffer_mapped(bh))
3485 goto next;
3486
3487 /* If the range is block aligned, unmap */
3488 if (range_to_discard == blocksize) {
3489 clear_buffer_dirty(bh);
3490 bh->b_bdev = NULL;
3491 clear_buffer_mapped(bh);
3492 clear_buffer_req(bh);
3493 clear_buffer_new(bh);
3494 clear_buffer_delay(bh);
3495 clear_buffer_unwritten(bh);
3496 clear_buffer_uptodate(bh);
3497 zero_user(page, pos, range_to_discard);
3498 BUFFER_TRACE(bh, "Buffer discarded");
3499 goto next;
3500 }
3501
3502 /*
3503 * If this block is not completely contained in the range
3504 * to be discarded, then it is not going to be released. Because
3505 * we need to keep this block, we need to make sure this part
3506 * of the page is uptodate before we modify it by writing
3507 * partial zeros on it.
3508 */
3509 if (!buffer_mapped(bh)) { 3378 if (!buffer_mapped(bh)) {
3510 /* 3379 BUFFER_TRACE(bh, "still unmapped");
3511 * Buffer head must be mapped before we can read 3380 goto unlock;
3512 * from the block
3513 */
3514 BUFFER_TRACE(bh, "unmapped");
3515 ext4_get_block(inode, iblock, bh, 0);
3516 /* unmapped? It's a hole - nothing to do */
3517 if (!buffer_mapped(bh)) {
3518 BUFFER_TRACE(bh, "still unmapped");
3519 goto next;
3520 }
3521 } 3381 }
3382 }
3522 3383
3523 /* Ok, it's mapped. Make sure it's up-to-date */ 3384 /* Ok, it's mapped. Make sure it's up-to-date */
3524 if (PageUptodate(page)) 3385 if (PageUptodate(page))
3525 set_buffer_uptodate(bh); 3386 set_buffer_uptodate(bh);
3526 3387
3527 if (!buffer_uptodate(bh)) { 3388 if (!buffer_uptodate(bh)) {
3528 err = -EIO; 3389 err = -EIO;
3529 ll_rw_block(READ, 1, &bh); 3390 ll_rw_block(READ, 1, &bh);
3530 wait_on_buffer(bh); 3391 wait_on_buffer(bh);
3531 /* Uhhuh. Read error. Complain and punt.*/ 3392 /* Uhhuh. Read error. Complain and punt. */
3532 if (!buffer_uptodate(bh)) 3393 if (!buffer_uptodate(bh))
3533 goto next; 3394 goto unlock;
3534 } 3395 }
3396 if (ext4_should_journal_data(inode)) {
3397 BUFFER_TRACE(bh, "get write access");
3398 err = ext4_journal_get_write_access(handle, bh);
3399 if (err)
3400 goto unlock;
3401 }
3402 zero_user(page, offset, length);
3403 BUFFER_TRACE(bh, "zeroed end of block");
3535 3404
3536 if (ext4_should_journal_data(inode)) { 3405 if (ext4_should_journal_data(inode)) {
3537 BUFFER_TRACE(bh, "get write access"); 3406 err = ext4_handle_dirty_metadata(handle, inode, bh);
3538 err = ext4_journal_get_write_access(handle, bh); 3407 } else {
3539 if (err) 3408 err = 0;
3540 goto next; 3409 mark_buffer_dirty(bh);
3541 } 3410 if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
3411 err = ext4_jbd2_file_inode(handle, inode);
3412 }
3542 3413
3543 zero_user(page, pos, range_to_discard); 3414unlock:
3415 unlock_page(page);
3416 page_cache_release(page);
3417 return err;
3418}
3544 3419
3545 err = 0; 3420int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
3546 if (ext4_should_journal_data(inode)) { 3421 loff_t lstart, loff_t length)
3547 err = ext4_handle_dirty_metadata(handle, inode, bh); 3422{
3548 } else 3423 struct super_block *sb = inode->i_sb;
3549 mark_buffer_dirty(bh); 3424 struct address_space *mapping = inode->i_mapping;
3425 unsigned partial_start, partial_end;
3426 ext4_fsblk_t start, end;
3427 loff_t byte_end = (lstart + length - 1);
3428 int err = 0;
3550 3429
3551 BUFFER_TRACE(bh, "Partial buffer zeroed"); 3430 partial_start = lstart & (sb->s_blocksize - 1);
3552next: 3431 partial_end = byte_end & (sb->s_blocksize - 1);
3553 bh = bh->b_this_page;
3554 iblock++;
3555 pos += range_to_discard;
3556 }
3557 3432
3433 start = lstart >> sb->s_blocksize_bits;
3434 end = byte_end >> sb->s_blocksize_bits;
3435
3436 /* Handle partial zero within the single block */
3437 if (start == end &&
3438 (partial_start || (partial_end != sb->s_blocksize - 1))) {
3439 err = ext4_block_zero_page_range(handle, mapping,
3440 lstart, length);
3441 return err;
3442 }
3443 /* Handle partial zero out on the start of the range */
3444 if (partial_start) {
3445 err = ext4_block_zero_page_range(handle, mapping,
3446 lstart, sb->s_blocksize);
3447 if (err)
3448 return err;
3449 }
3450 /* Handle partial zero out on the end of the range */
3451 if (partial_end != sb->s_blocksize - 1)
3452 err = ext4_block_zero_page_range(handle, mapping,
3453 byte_end - partial_end,
3454 partial_end + 1);
3558 return err; 3455 return err;
3559} 3456}
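
The helper above reduces an arbitrary byte range to at most two calls into ext4_block_zero_page_range(): one for the unaligned head and one for the unaligned tail, with the fully-inside-one-block case handled first. A stand-alone sketch of that split (assuming a power-of-two block size; split_partial_blocks() is an illustrative name, not ext4 API — note the head call in the patch passes a full blocksize as length and relies on the callee clamping it to the end of the block):

#include <stdio.h>

/* Illustrative re-statement of ext4_zero_partial_blocks()'s range split. */
static void split_partial_blocks(long long lstart, long long length,
				 unsigned int blocksize)
{
	long long byte_end = lstart + length - 1;
	unsigned int partial_start = lstart & (blocksize - 1);
	unsigned int partial_end = byte_end & (blocksize - 1);
	long long start = lstart / blocksize;	/* >> s_blocksize_bits */
	long long end = byte_end / blocksize;

	if (start == end &&
	    (partial_start || partial_end != blocksize - 1)) {
		/* range confined to one block: a single partial zero-out */
		printf("zero %lld..%lld\n", lstart, byte_end);
		return;
	}
	if (partial_start)	/* unaligned head, clamped by the callee */
		printf("zero %lld..%lld\n", lstart,
		       lstart + (blocksize - partial_start) - 1);
	if (partial_end != blocksize - 1)	/* unaligned tail */
		printf("zero %lld..%lld\n", byte_end - partial_end, byte_end);
}

int main(void)
{
	split_partial_blocks(5000, 10000, 4096);	/* head and tail */
	return 0;
}
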
3560 3457
@@ -3580,14 +3477,12 @@ int ext4_can_truncate(struct inode *inode)
3580 * Returns: 0 on success or negative on failure 3477 * Returns: 0 on success or negative on failure
3581 */ 3478 */
3582 3479
3583int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 3480int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3584{ 3481{
3585 struct inode *inode = file_inode(file);
3586 struct super_block *sb = inode->i_sb; 3482 struct super_block *sb = inode->i_sb;
3587 ext4_lblk_t first_block, stop_block; 3483 ext4_lblk_t first_block, stop_block;
3588 struct address_space *mapping = inode->i_mapping; 3484 struct address_space *mapping = inode->i_mapping;
3589 loff_t first_page, last_page, page_len; 3485 loff_t first_block_offset, last_block_offset;
3590 loff_t first_page_offset, last_page_offset;
3591 handle_t *handle; 3486 handle_t *handle;
3592 unsigned int credits; 3487 unsigned int credits;
3593 int ret = 0; 3488 int ret = 0;
@@ -3638,23 +3533,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3638 offset; 3533 offset;
3639 } 3534 }
3640 3535
3641 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 3536 first_block_offset = round_up(offset, sb->s_blocksize);
3642 last_page = (offset + length) >> PAGE_CACHE_SHIFT; 3537 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
3643 3538
3644 first_page_offset = first_page << PAGE_CACHE_SHIFT; 3539 /* Now release the pages and zero the block-aligned part of pages */
3645 last_page_offset = last_page << PAGE_CACHE_SHIFT; 3540 if (last_block_offset > first_block_offset)
3646 3541 truncate_pagecache_range(inode, first_block_offset,
3647 /* Now release the pages */ 3542 last_block_offset);
3648 if (last_page_offset > first_page_offset) {
3649 truncate_pagecache_range(inode, first_page_offset,
3650 last_page_offset - 1);
3651 }
3652 3543
3653 /* Wait all existing dio workers, newcomers will block on i_mutex */ 3544 /* Wait all existing dio workers, newcomers will block on i_mutex */
3654 ext4_inode_block_unlocked_dio(inode); 3545 ext4_inode_block_unlocked_dio(inode);
3655 ret = ext4_flush_unwritten_io(inode);
3656 if (ret)
3657 goto out_dio;
3658 inode_dio_wait(inode); 3546 inode_dio_wait(inode);
3659 3547
3660 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3548 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3668,66 +3556,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3668 goto out_dio; 3556 goto out_dio;
3669 } 3557 }
3670 3558
3671 /* 3559 ret = ext4_zero_partial_blocks(handle, inode, offset,
3672 * Now we need to zero out the non-page-aligned data in the 3560 length);
3673 * pages at the start and tail of the hole, and unmap the 3561 if (ret)
3674 * buffer heads for the block aligned regions of the page that 3562 goto out_stop;
3675 * were completely zeroed.
3676 */
3677 if (first_page > last_page) {
3678 /*
3679 * If the file space being truncated is contained
3680 * within a page just zero out and unmap the middle of
3681 * that page
3682 */
3683 ret = ext4_discard_partial_page_buffers(handle,
3684 mapping, offset, length, 0);
3685
3686 if (ret)
3687 goto out_stop;
3688 } else {
3689 /*
3690 * zero out and unmap the partial page that contains
3691 * the start of the hole
3692 */
3693 page_len = first_page_offset - offset;
3694 if (page_len > 0) {
3695 ret = ext4_discard_partial_page_buffers(handle, mapping,
3696 offset, page_len, 0);
3697 if (ret)
3698 goto out_stop;
3699 }
3700
3701 /*
3702 * zero out and unmap the partial page that contains
3703 * the end of the hole
3704 */
3705 page_len = offset + length - last_page_offset;
3706 if (page_len > 0) {
3707 ret = ext4_discard_partial_page_buffers(handle, mapping,
3708 last_page_offset, page_len, 0);
3709 if (ret)
3710 goto out_stop;
3711 }
3712 }
3713
3714 /*
3715 * If i_size is contained in the last page, we need to
3716 * unmap and zero the partial page after i_size
3717 */
3718 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
3719 inode->i_size % PAGE_CACHE_SIZE != 0) {
3720 page_len = PAGE_CACHE_SIZE -
3721 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3722
3723 if (page_len > 0) {
3724 ret = ext4_discard_partial_page_buffers(handle,
3725 mapping, inode->i_size, page_len, 0);
3726
3727 if (ret)
3728 goto out_stop;
3729 }
3730 }
3731 3563
3732 first_block = (offset + sb->s_blocksize - 1) >> 3564 first_block = (offset + sb->s_blocksize - 1) >>
3733 EXT4_BLOCK_SIZE_BITS(sb); 3565 EXT4_BLOCK_SIZE_BITS(sb);
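
The page-granular trimming above is replaced by block-granular bounds: only the fully block-aligned middle of the hole is dropped from the page cache, and the unaligned edges are zeroed through ext4_zero_partial_blocks() instead. A sketch of the two alignment helpers as used here (power-of-two alignment assumed, which is what the kernel's round_up()/round_down() reduce to; the my_ prefixes mark them as illustrative):

static inline unsigned long long my_round_up(unsigned long long x,
					     unsigned long long align)
{
	return (x + align - 1) & ~(align - 1);	/* next multiple of align */
}

static inline unsigned long long my_round_down(unsigned long long x,
					       unsigned long long align)
{
	return x & ~(align - 1);		/* previous multiple of align */
}

/* offset = 5000, length = 10000, blocksize = 4096:
 *   first_block_offset = my_round_up(5000, 4096)        = 8192
 *   last_block_offset  = my_round_down(15000, 4096) - 1 = 12287
 * so only bytes 8192..12287 are truncated from the page cache, while
 * 5000..8191 and 12288..14999 are zeroed in place, block by block. */
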
@@ -3803,7 +3635,6 @@ void ext4_truncate(struct inode *inode)
3803 unsigned int credits; 3635 unsigned int credits;
3804 handle_t *handle; 3636 handle_t *handle;
3805 struct address_space *mapping = inode->i_mapping; 3637 struct address_space *mapping = inode->i_mapping;
3806 loff_t page_len;
3807 3638
3808 /* 3639 /*
3809 * There is a possibility that we're either freeing the inode 3640 * There is a possibility that we're either freeing the inode
@@ -3830,12 +3661,6 @@ void ext4_truncate(struct inode *inode)
3830 return; 3661 return;
3831 } 3662 }
3832 3663
3833 /*
3834 * finish any pending end_io work so we won't run the risk of
3835 * converting any truncated blocks to initialized later
3836 */
3837 ext4_flush_unwritten_io(inode);
3838
3839 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3664 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3840 credits = ext4_writepage_trans_blocks(inode); 3665 credits = ext4_writepage_trans_blocks(inode);
3841 else 3666 else
@@ -3847,14 +3672,8 @@ void ext4_truncate(struct inode *inode)
3847 return; 3672 return;
3848 } 3673 }
3849 3674
3850 if (inode->i_size % PAGE_CACHE_SIZE != 0) { 3675 if (inode->i_size & (inode->i_sb->s_blocksize - 1))
3851 page_len = PAGE_CACHE_SIZE - 3676 ext4_block_truncate_page(handle, mapping, inode->i_size);
3852 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3853
3854 if (ext4_discard_partial_page_buffers(handle,
3855 mapping, inode->i_size, page_len, 0))
3856 goto out_stop;
3857 }
3858 3677
3859 /* 3678 /*
3860 * We add the inode to the orphan list, so that if this 3679 * We add the inode to the orphan list, so that if this
@@ -4623,7 +4442,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
4623 inode->i_size >> PAGE_CACHE_SHIFT); 4442 inode->i_size >> PAGE_CACHE_SHIFT);
4624 if (!page) 4443 if (!page)
4625 return; 4444 return;
4626 ret = __ext4_journalled_invalidatepage(page, offset); 4445 ret = __ext4_journalled_invalidatepage(page, offset,
4446 PAGE_CACHE_SIZE - offset);
4627 unlock_page(page); 4447 unlock_page(page);
4628 page_cache_release(page); 4448 page_cache_release(page);
4629 if (ret != -EBUSY) 4449 if (ret != -EBUSY)
@@ -4805,7 +4625,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4805 struct kstat *stat) 4625 struct kstat *stat)
4806{ 4626{
4807 struct inode *inode; 4627 struct inode *inode;
4808 unsigned long delalloc_blocks; 4628 unsigned long long delalloc_blocks;
4809 4629
4810 inode = dentry->d_inode; 4630 inode = dentry->d_inode;
4811 generic_fillattr(inode, stat); 4631 generic_fillattr(inode, stat);
@@ -4823,15 +4643,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4823 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), 4643 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
4824 EXT4_I(inode)->i_reserved_data_blocks); 4644 EXT4_I(inode)->i_reserved_data_blocks);
4825 4645
4826 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 4646 stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
4827 return 0; 4647 return 0;
4828} 4648}
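
Both halves of this hunk attack the same 32-bit truncation: delalloc_blocks is widened to unsigned long long, and the two-step shift (left by blocksize_bits, then right by 9) becomes a single left shift by (blocksize_bits - 9), so no intermediate value ever exceeds the final one. A stand-alone demonstration with assumed example values:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t delalloc_blocks32 = 2000000;	/* ~7.6 GiB of dirty 4K blocks */
	uint64_t delalloc_blocks64 = delalloc_blocks32;
	unsigned blocksize_bits = 12;		/* 4096-byte blocks */

	/* old way: the intermediate left shift wraps in 32 bits */
	uint32_t old_way = (uint32_t)(delalloc_blocks32 << blocksize_bits) >> 9;
	/* new way: one shift on a 64-bit value, no intermediate overflow */
	uint64_t new_way = delalloc_blocks64 << (blocksize_bits - 9);

	assert(new_way == 16000000ULL);	/* correct count of 512-byte sectors */
	assert(old_way != new_way);	/* the 32-bit path silently lost bits */
	return 0;
}
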
4829 4649
4830static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4650static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
4651 int pextents)
4831{ 4652{
4832 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4653 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4833 return ext4_ind_trans_blocks(inode, nrblocks, chunk); 4654 return ext4_ind_trans_blocks(inode, lblocks);
4834 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 4655 return ext4_ext_index_trans_blocks(inode, pextents);
4835} 4656}
4836 4657
4837/* 4658/*
@@ -4845,7 +4666,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4845 * 4666 *
4846 * Also account for superblock, inode, quota and xattr blocks 4667 * Also account for superblock, inode, quota and xattr blocks
4847 */ 4668 */
4848static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4669static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
4670 int pextents)
4849{ 4671{
4850 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 4672 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
4851 int gdpblocks; 4673 int gdpblocks;
@@ -4853,14 +4675,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4853 int ret = 0; 4675 int ret = 0;
4854 4676
4855 /* 4677 /*
4856 * How many index blocks need to touch to modify nrblocks? 4678 * How many index blocks do we need to touch to map @lblocks logical blocks
4857 * The "Chunk" flag indicating whether the nrblocks is 4679 * to @pextents physical extents?
4858 * physically contiguous on disk
4859 *
4860 * For Direct IO and fallocate, they calls get_block to allocate
4861 * one single extent at a time, so they could set the "Chunk" flag
4862 */ 4680 */
4863 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 4681 idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
4864 4682
4865 ret = idxblocks; 4683 ret = idxblocks;
4866 4684
@@ -4868,12 +4686,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4868 * Now let's see how many group bitmaps and group descriptors need 4686 * Now let's see how many group bitmaps and group descriptors need
4869 * to account 4687 * to account
4870 */ 4688 */
4871 groups = idxblocks; 4689 groups = idxblocks + pextents;
4872 if (chunk)
4873 groups += 1;
4874 else
4875 groups += nrblocks;
4876
4877 gdpblocks = groups; 4690 gdpblocks = groups;
4878 if (groups > ngroups) 4691 if (groups > ngroups)
4879 groups = ngroups; 4692 groups = ngroups;
@@ -4904,7 +4717,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
4904 int bpp = ext4_journal_blocks_per_page(inode); 4717 int bpp = ext4_journal_blocks_per_page(inode);
4905 int ret; 4718 int ret;
4906 4719
4907 ret = ext4_meta_trans_blocks(inode, bpp, 0); 4720 ret = ext4_meta_trans_blocks(inode, bpp, bpp);
4908 4721
4909 /* Account for data blocks for journalled mode */ 4722 /* Account for data blocks for journalled mode */
4910 if (ext4_should_journal_data(inode)) 4723 if (ext4_should_journal_data(inode))
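
With the chunk flag gone, the transaction-credit estimate charges every physical extent as potentially touching its own block group. A compressed sketch of the accounting after this change (illustrative helper, not ext4 API):

static int meta_groups_touched(int idxblocks, int pextents, int ngroups)
{
	/* was: groups = idxblocks + (chunk ? 1 : nrblocks) */
	int groups = idxblocks + pextents;

	return groups > ngroups ? ngroups : groups;	/* capped at group count */
}

ext4_writepage_trans_blocks() accordingly now passes (bpp, bpp): a page of bpp blocks may, in the worst case, map to bpp single-block extents.
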
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 9491ac0590f7..c0427e2f6648 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -77,8 +77,10 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
77 memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); 77 memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
78 memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); 78 memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
79 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); 79 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
80 memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); 80 ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
81 memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); 81 ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
82 ext4_es_lru_del(inode1);
83 ext4_es_lru_del(inode2);
82 84
83 isize = i_size_read(inode1); 85 isize = i_size_read(inode1);
84 i_size_write(inode1, i_size_read(inode2)); 86 i_size_write(inode1, i_size_read(inode2));
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index def84082a9a9..4bbbf13bd743 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2105,6 +2105,7 @@ repeat:
2105 group = ac->ac_g_ex.fe_group; 2105 group = ac->ac_g_ex.fe_group;
2106 2106
2107 for (i = 0; i < ngroups; group++, i++) { 2107 for (i = 0; i < ngroups; group++, i++) {
2108 cond_resched();
2108 /* 2109 /*
2109 * Artificially restricted ngroups for non-extent 2110 * Artificially restricted ngroups for non-extent
2110 * files makes group > ngroups possible on first loop. 2111 * files makes group > ngroups possible on first loop.
@@ -4405,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4405repeat: 4406repeat:
4406 /* allocate space in core */ 4407 /* allocate space in core */
4407 *errp = ext4_mb_regular_allocator(ac); 4408 *errp = ext4_mb_regular_allocator(ac);
4408 if (*errp) { 4409 if (*errp)
4409 ext4_discard_allocated_blocks(ac); 4410 goto discard_and_exit;
4410 goto errout;
4411 }
4412 4411
4413 /* as we've just preallocated more space than 4412 /* as we've just preallocated more space than
4414 * user requested orinally, we store allocated 4413 * user requested originally, we store allocated
4415 * space in a special descriptor */ 4414 * space in a special descriptor */
4416 if (ac->ac_status == AC_STATUS_FOUND && 4415 if (ac->ac_status == AC_STATUS_FOUND &&
4417 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4416 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4418 ext4_mb_new_preallocation(ac); 4417 *errp = ext4_mb_new_preallocation(ac);
4418 if (*errp) {
4419 discard_and_exit:
4420 ext4_discard_allocated_blocks(ac);
4421 goto errout;
4422 }
4419 } 4423 }
4420 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4424 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4421 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); 4425 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4612,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4612 BUG_ON(bh && (count > 1)); 4616 BUG_ON(bh && (count > 1));
4613 4617
4614 for (i = 0; i < count; i++) { 4618 for (i = 0; i < count; i++) {
4619 cond_resched();
4615 if (!bh) 4620 if (!bh)
4616 tbh = sb_find_get_block(inode->i_sb, 4621 tbh = sb_find_get_block(inode->i_sb,
4617 block + i); 4622 block + i);
4618 if (unlikely(!tbh)) 4623 if (!tbh)
4619 continue; 4624 continue;
4620 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4625 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4621 inode, tbh, block + i); 4626 inode, tbh, block + i);
@@ -4735,11 +4740,16 @@ do_more:
4735 * blocks being freed are metadata. these blocks shouldn't 4740 * blocks being freed are metadata. these blocks shouldn't
4736 * be used until this transaction is committed 4741 * be used until this transaction is committed
4737 */ 4742 */
4743 retry:
4738 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); 4744 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
4739 if (!new_entry) { 4745 if (!new_entry) {
4740 ext4_mb_unload_buddy(&e4b); 4746 /*
4741 err = -ENOMEM; 4747 * We use a retry loop because
4742 goto error_return; 4748 * ext4_free_blocks() is not allowed to fail.
4749 */
4750 cond_resched();
4751 congestion_wait(BLK_RW_ASYNC, HZ/50);
4752 goto retry;
4743 } 4753 }
4744 new_entry->efd_start_cluster = bit; 4754 new_entry->efd_start_cluster = bit;
4745 new_entry->efd_group = block_group; 4755 new_entry->efd_group = block_group;
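
The ENOMEM path in ext4_free_blocks() is gone: freeing must not fail, so the allocation now spins with scheduling and writeback back-offs until the slab yields an object. The generic shape of that pattern, pulled out as a sketch (kernel context assumed; alloc_nofail() is an illustrative name):

#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>

static void *alloc_nofail(struct kmem_cache *cachep)
{
	void *p;

	while (!(p = kmem_cache_alloc(cachep, GFP_NOFS))) {
		cond_resched();				/* yield the CPU */
		congestion_wait(BLK_RW_ASYNC, HZ/50);	/* let writeback drain */
	}
	return p;
}

Passing __GFP_NOFAIL would express the same "may not fail" requirement to the allocator; the open-coded loop in the patch keeps the congestion back-off explicit.
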
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3dcbf364022f..e86dddbd8296 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
912 struct page *pagep[2] = {NULL, NULL}; 912 struct page *pagep[2] = {NULL, NULL};
913 handle_t *handle; 913 handle_t *handle;
914 ext4_lblk_t orig_blk_offset; 914 ext4_lblk_t orig_blk_offset;
915 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
916 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 915 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
917 unsigned int w_flags = 0; 916 unsigned int w_flags = 0;
918 unsigned int tmp_data_size, data_size, replaced_size; 917 unsigned int tmp_data_size, data_size, replaced_size;
@@ -940,8 +939,6 @@ again:
940 orig_blk_offset = orig_page_offset * blocks_per_page + 939 orig_blk_offset = orig_page_offset * blocks_per_page +
941 data_offset_in_page; 940 data_offset_in_page;
942 941
943 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
944
945 /* Calculate data_size */ 942 /* Calculate data_size */
946 if ((orig_blk_offset + block_len_in_page - 1) == 943 if ((orig_blk_offset + block_len_in_page - 1) ==
947 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 944 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6653fc35ecb7..35f55a0dbc4b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
918 bh->b_data, bh->b_size, 918 bh->b_data, bh->b_size,
919 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 919 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
920 + ((char *)de - bh->b_data))) { 920 + ((char *)de - bh->b_data))) {
921 /* On error, skip the f_pos to the next block. */ 921 /* silently ignore the rest of the block */
922 dir_file->f_pos = (dir_file->f_pos | 922 break;
923 (dir->i_sb->s_blocksize - 1)) + 1;
924 brelse(bh);
925 return count;
926 } 923 }
927 ext4fs_dirhash(de->name, de->name_len, hinfo); 924 ext4fs_dirhash(de->name, de->name_len, hinfo);
928 if ((hinfo->hash < start_hash) || 925 if ((hinfo->hash < start_hash) ||
@@ -2299,6 +2296,45 @@ retry:
2299 return err; 2296 return err;
2300} 2297}
2301 2298
2299static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
2300{
2301 handle_t *handle;
2302 struct inode *inode;
2303 int err, retries = 0;
2304
2305 dquot_initialize(dir);
2306
2307retry:
2308 inode = ext4_new_inode_start_handle(dir, mode,
2309 NULL, 0, NULL,
2310 EXT4_HT_DIR,
2311 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2312 4 + EXT4_XATTR_TRANS_BLOCKS);
2313 handle = ext4_journal_current_handle();
2314 err = PTR_ERR(inode);
2315 if (!IS_ERR(inode)) {
2316 inode->i_op = &ext4_file_inode_operations;
2317 inode->i_fop = &ext4_file_operations;
2318 ext4_set_aops(inode);
2319 d_tmpfile(dentry, inode);
2320 err = ext4_orphan_add(handle, inode);
2321 if (err)
2322 goto err_drop_inode;
2323 mark_inode_dirty(inode);
2324 unlock_new_inode(inode);
2325 }
2326 if (handle)
2327 ext4_journal_stop(handle);
2328 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2329 goto retry;
2330 return err;
2331err_drop_inode:
2332 ext4_journal_stop(handle);
2333 unlock_new_inode(inode);
2334 iput(inode);
2335 return err;
2336}
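
ext4_tmpfile() above creates the inode already on the orphan list, so an unlinked temporary file is reclaimed automatically after a crash; only a later link (see the ext4_link() hunk below) takes it off the list. From user space this is the O_TMPFILE pattern — a sketch, assuming a kernel and libc new enough to expose O_TMPFILE:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	/* unnamed file on the fs backing /tmp; already on the orphan list */
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);
	if (fd < 0) {
		perror("O_TMPFILE");
		return 1;
	}
	if (write(fd, "scratch\n", 8) != 8)
		perror("write");

	/* atomically give it a name; ext4_link() drops the orphan entry */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/now-visible",
		   AT_SYMLINK_FOLLOW) < 0)
		perror("linkat");
	close(fd);
	return 0;
}
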
2337
2302struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, 2338struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
2303 struct ext4_dir_entry_2 *de, 2339 struct ext4_dir_entry_2 *de,
2304 int blocksize, int csum_size, 2340 int blocksize, int csum_size,
@@ -2906,7 +2942,7 @@ static int ext4_link(struct dentry *old_dentry,
2906retry: 2942retry:
2907 handle = ext4_journal_start(dir, EXT4_HT_DIR, 2943 handle = ext4_journal_start(dir, EXT4_HT_DIR,
2908 (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2944 (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2909 EXT4_INDEX_EXTRA_TRANS_BLOCKS)); 2945 EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
2910 if (IS_ERR(handle)) 2946 if (IS_ERR(handle))
2911 return PTR_ERR(handle); 2947 return PTR_ERR(handle);
2912 2948
@@ -2920,6 +2956,11 @@ retry:
2920 err = ext4_add_entry(handle, dentry, inode); 2956 err = ext4_add_entry(handle, dentry, inode);
2921 if (!err) { 2957 if (!err) {
2922 ext4_mark_inode_dirty(handle, inode); 2958 ext4_mark_inode_dirty(handle, inode);
2959 /* this can happen only for a tmpfile being
2960 * linked for the first time
2961 */
2962 if (inode->i_nlink == 1)
2963 ext4_orphan_del(handle, inode);
2923 d_instantiate(dentry, inode); 2964 d_instantiate(dentry, inode);
2924 } else { 2965 } else {
2925 drop_nlink(inode); 2966 drop_nlink(inode);
@@ -3172,6 +3213,7 @@ const struct inode_operations ext4_dir_inode_operations = {
3172 .mkdir = ext4_mkdir, 3213 .mkdir = ext4_mkdir,
3173 .rmdir = ext4_rmdir, 3214 .rmdir = ext4_rmdir,
3174 .mknod = ext4_mknod, 3215 .mknod = ext4_mknod,
3216 .tmpfile = ext4_tmpfile,
3175 .rename = ext4_rename, 3217 .rename = ext4_rename,
3176 .setattr = ext4_setattr, 3218 .setattr = ext4_setattr,
3177 .setxattr = generic_setxattr, 3219 .setxattr = generic_setxattr,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4acf1f78881b..6625d210fb45 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -25,6 +25,7 @@
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/ratelimit.h>
28 29
29#include "ext4_jbd2.h" 30#include "ext4_jbd2.h"
30#include "xattr.h" 31#include "xattr.h"
@@ -46,46 +47,121 @@ void ext4_exit_pageio(void)
46} 47}
47 48
48/* 49/*
49 * This function is called by ext4_evict_inode() to make sure there is 50 * Print a buffer I/O error compatible with fs/buffer.c. This
50 * no more pending I/O completion work left to do. 51 * provides compatibility with dmesg scrapers that look for a specific
52 * buffer I/O error message. We really need a unified error reporting
53 * structure to userspace ala Digital Unix's uerf system, but it's
54 * probably not going to happen in my lifetime, due to LKML politics...
51 */ 55 */
52void ext4_ioend_shutdown(struct inode *inode) 56static void buffer_io_error(struct buffer_head *bh)
57{
58 char b[BDEVNAME_SIZE];
59 printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
60 bdevname(bh->b_bdev, b),
61 (unsigned long long)bh->b_blocknr);
62}
63
64static void ext4_finish_bio(struct bio *bio)
53{ 65{
54 wait_queue_head_t *wq = ext4_ioend_wq(inode); 66 int i;
67 int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
55 68
56 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 69 for (i = 0; i < bio->bi_vcnt; i++) {
57 /* 70 struct bio_vec *bvec = &bio->bi_io_vec[i];
58 * We need to make sure the work structure is finished being 71 struct page *page = bvec->bv_page;
59 * used before we let the inode get destroyed. 72 struct buffer_head *bh, *head;
60 */ 73 unsigned bio_start = bvec->bv_offset;
61 if (work_pending(&EXT4_I(inode)->i_unwritten_work)) 74 unsigned bio_end = bio_start + bvec->bv_len;
62 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); 75 unsigned under_io = 0;
76 unsigned long flags;
77
78 if (!page)
79 continue;
80
81 if (error) {
82 SetPageError(page);
83 set_bit(AS_EIO, &page->mapping->flags);
84 }
85 bh = head = page_buffers(page);
86 /*
87 * We check all buffers in the page under BH_Uptodate_Lock
88 * to avoid races with other end io clearing async_write flags
89 */
90 local_irq_save(flags);
91 bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
92 do {
93 if (bh_offset(bh) < bio_start ||
94 bh_offset(bh) + bh->b_size > bio_end) {
95 if (buffer_async_write(bh))
96 under_io++;
97 continue;
98 }
99 clear_buffer_async_write(bh);
100 if (error)
101 buffer_io_error(bh);
102 } while ((bh = bh->b_this_page) != head);
103 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
104 local_irq_restore(flags);
105 if (!under_io)
106 end_page_writeback(page);
107 }
108}
109
110static void ext4_release_io_end(ext4_io_end_t *io_end)
111{
112 struct bio *bio, *next_bio;
113
114 BUG_ON(!list_empty(&io_end->list));
115 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
116 WARN_ON(io_end->handle);
117
118 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
119 wake_up_all(ext4_ioend_wq(io_end->inode));
120
121 for (bio = io_end->bio; bio; bio = next_bio) {
122 next_bio = bio->bi_private;
123 ext4_finish_bio(bio);
124 bio_put(bio);
125 }
126 if (io_end->flag & EXT4_IO_END_DIRECT)
127 inode_dio_done(io_end->inode);
128 if (io_end->iocb)
129 aio_complete(io_end->iocb, io_end->result, 0);
130 kmem_cache_free(io_end_cachep, io_end);
63} 131}
64 132
65void ext4_free_io_end(ext4_io_end_t *io) 133static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
66{ 134{
67 BUG_ON(!io); 135 struct inode *inode = io_end->inode;
68 BUG_ON(!list_empty(&io->list));
69 BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
70 136
71 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) 137 io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
72 wake_up_all(ext4_ioend_wq(io->inode)); 138 /* Wake up anyone waiting on unwritten extent conversion */
73 kmem_cache_free(io_end_cachep, io); 139 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
140 wake_up_all(ext4_ioend_wq(inode));
74} 141}
75 142
76/* check a range of space and convert unwritten extents to written. */ 143/*
144 * Check a range of space and convert unwritten extents to written. Note that
145 * we are protected from truncate touching the same part of the extent tree by the
146 * fact that truncate code waits for all DIO to finish (thus exclusion from
147 * direct IO is achieved) and also waits for PageWriteback bits. Thus we
148 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
149 * completed (happens from ext4_free_ioend()).
150 */
77static int ext4_end_io(ext4_io_end_t *io) 151static int ext4_end_io(ext4_io_end_t *io)
78{ 152{
79 struct inode *inode = io->inode; 153 struct inode *inode = io->inode;
80 loff_t offset = io->offset; 154 loff_t offset = io->offset;
81 ssize_t size = io->size; 155 ssize_t size = io->size;
156 handle_t *handle = io->handle;
82 int ret = 0; 157 int ret = 0;
83 158
84 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 159 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
85 "list->prev 0x%p\n", 160 "list->prev 0x%p\n",
86 io, inode->i_ino, io->list.next, io->list.prev); 161 io, inode->i_ino, io->list.next, io->list.prev);
87 162
88 ret = ext4_convert_unwritten_extents(inode, offset, size); 163 io->handle = NULL; /* Following call will use up the handle */
164 ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
89 if (ret < 0) { 165 if (ret < 0) {
90 ext4_msg(inode->i_sb, KERN_EMERG, 166 ext4_msg(inode->i_sb, KERN_EMERG,
91 "failed to convert unwritten extents to written " 167 "failed to convert unwritten extents to written "
@@ -93,30 +169,22 @@ static int ext4_end_io(ext4_io_end_t *io)
93 "(inode %lu, offset %llu, size %zd, error %d)", 169 "(inode %lu, offset %llu, size %zd, error %d)",
94 inode->i_ino, offset, size, ret); 170 inode->i_ino, offset, size, ret);
95 } 171 }
96 /* Wake up anyone waiting on unwritten extent conversion */ 172 ext4_clear_io_unwritten_flag(io);
97 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 173 ext4_release_io_end(io);
98 wake_up_all(ext4_ioend_wq(inode));
99 if (io->flag & EXT4_IO_END_DIRECT)
100 inode_dio_done(inode);
101 if (io->iocb)
102 aio_complete(io->iocb, io->result, 0);
103 return ret; 174 return ret;
104} 175}
105 176
106static void dump_completed_IO(struct inode *inode) 177static void dump_completed_IO(struct inode *inode, struct list_head *head)
107{ 178{
108#ifdef EXT4FS_DEBUG 179#ifdef EXT4FS_DEBUG
109 struct list_head *cur, *before, *after; 180 struct list_head *cur, *before, *after;
110 ext4_io_end_t *io, *io0, *io1; 181 ext4_io_end_t *io, *io0, *io1;
111 182
112 if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { 183 if (list_empty(head))
113 ext4_debug("inode %lu completed_io list is empty\n",
114 inode->i_ino);
115 return; 184 return;
116 }
117 185
118 ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); 186 ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
119 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { 187 list_for_each_entry(io, head, list) {
120 cur = &io->list; 188 cur = &io->list;
121 before = cur->prev; 189 before = cur->prev;
122 io0 = container_of(before, ext4_io_end_t, list); 190 io0 = container_of(before, ext4_io_end_t, list);
@@ -130,23 +198,30 @@ static void dump_completed_IO(struct inode *inode)
130} 198}
131 199
132/* Add the io_end to per-inode completed end_io list. */ 200/* Add the io_end to per-inode completed end_io list. */
133void ext4_add_complete_io(ext4_io_end_t *io_end) 201static void ext4_add_complete_io(ext4_io_end_t *io_end)
134{ 202{
135 struct ext4_inode_info *ei = EXT4_I(io_end->inode); 203 struct ext4_inode_info *ei = EXT4_I(io_end->inode);
136 struct workqueue_struct *wq; 204 struct workqueue_struct *wq;
137 unsigned long flags; 205 unsigned long flags;
138 206
139 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 207 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
140 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
141
142 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 208 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
143 if (list_empty(&ei->i_completed_io_list)) 209 if (io_end->handle) {
144 queue_work(wq, &ei->i_unwritten_work); 210 wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
145 list_add_tail(&io_end->list, &ei->i_completed_io_list); 211 if (list_empty(&ei->i_rsv_conversion_list))
212 queue_work(wq, &ei->i_rsv_conversion_work);
213 list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
214 } else {
215 wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
216 if (list_empty(&ei->i_unrsv_conversion_list))
217 queue_work(wq, &ei->i_unrsv_conversion_work);
218 list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
219 }
146 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 220 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
147} 221}
148 222
149static int ext4_do_flush_completed_IO(struct inode *inode) 223static int ext4_do_flush_completed_IO(struct inode *inode,
224 struct list_head *head)
150{ 225{
151 ext4_io_end_t *io; 226 ext4_io_end_t *io;
152 struct list_head unwritten; 227 struct list_head unwritten;
@@ -155,8 +230,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
155 int err, ret = 0; 230 int err, ret = 0;
156 231
157 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 232 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
158 dump_completed_IO(inode); 233 dump_completed_IO(inode, head);
159 list_replace_init(&ei->i_completed_io_list, &unwritten); 234 list_replace_init(head, &unwritten);
160 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 235 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
161 236
162 while (!list_empty(&unwritten)) { 237 while (!list_empty(&unwritten)) {
@@ -167,30 +242,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
167 err = ext4_end_io(io); 242 err = ext4_end_io(io);
168 if (unlikely(!ret && err)) 243 if (unlikely(!ret && err))
169 ret = err; 244 ret = err;
170 io->flag &= ~EXT4_IO_END_UNWRITTEN;
171 ext4_free_io_end(io);
172 } 245 }
173 return ret; 246 return ret;
174} 247}
175 248
176/* 249/*
177 * work on completed aio dio IO, to convert unwritten extents to extents 250 * work on completed IO, to convert unwritten extents to written extents
178 */ 251 */
179void ext4_end_io_work(struct work_struct *work) 252void ext4_end_io_rsv_work(struct work_struct *work)
180{ 253{
181 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, 254 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
182 i_unwritten_work); 255 i_rsv_conversion_work);
183 ext4_do_flush_completed_IO(&ei->vfs_inode); 256 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
184} 257}
185 258
186int ext4_flush_unwritten_io(struct inode *inode) 259void ext4_end_io_unrsv_work(struct work_struct *work)
187{ 260{
188 int ret; 261 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
189 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && 262 i_unrsv_conversion_work);
190 !(inode->i_state & I_FREEING)); 263 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
191 ret = ext4_do_flush_completed_IO(inode);
192 ext4_unwritten_wait(inode);
193 return ret;
194} 264}
195 265
196ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 266ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -200,83 +270,59 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
200 atomic_inc(&EXT4_I(inode)->i_ioend_count); 270 atomic_inc(&EXT4_I(inode)->i_ioend_count);
201 io->inode = inode; 271 io->inode = inode;
202 INIT_LIST_HEAD(&io->list); 272 INIT_LIST_HEAD(&io->list);
273 atomic_set(&io->count, 1);
203 } 274 }
204 return io; 275 return io;
205} 276}
206 277
207/* 278void ext4_put_io_end_defer(ext4_io_end_t *io_end)
208 * Print a buffer I/O error compatible with fs/buffer.c. This
209 * provides compatibility with dmesg scrapers that look for a specific
210 * buffer I/O error message. We really need a unified error reporting
211 * structure to userspace ala Digital Unix's uerf system, but it's
212 * probably not going to happen in my lifetime, due to LKML politics...
213 */
214static void buffer_io_error(struct buffer_head *bh)
215{ 279{
216 char b[BDEVNAME_SIZE]; 280 if (atomic_dec_and_test(&io_end->count)) {
217 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", 281 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
218 bdevname(bh->b_bdev, b), 282 ext4_release_io_end(io_end);
219 (unsigned long long)bh->b_blocknr); 283 return;
284 }
285 ext4_add_complete_io(io_end);
286 }
220} 287}
221 288
289int ext4_put_io_end(ext4_io_end_t *io_end)
290{
291 int err = 0;
292
293 if (atomic_dec_and_test(&io_end->count)) {
294 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
295 err = ext4_convert_unwritten_extents(io_end->handle,
296 io_end->inode, io_end->offset,
297 io_end->size);
298 io_end->handle = NULL;
299 ext4_clear_io_unwritten_flag(io_end);
300 }
301 ext4_release_io_end(io_end);
302 }
303 return err;
304}
305
306ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
307{
308 atomic_inc(&io_end->count);
309 return io_end;
310}
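
Taken together, these helpers replace the old single-owner io_end with reference counting. The lifetime, sketched (counts refer to io_end->count initialized in ext4_init_io_end() above):

/*
 * ext4_init_io_end()           count = 1    submitter's reference
 * ext4_get_io_end()            count++      one per bio pointing at the io_end
 * ext4_end_bio()
 *   -> ext4_put_io_end_defer() count--      irq context: any needed extent
 *                                           conversion is queued to a workqueue
 * submitter finishes
 *   -> ext4_put_io_end()       count--      process context: may convert
 *                                           unwritten extents inline
 *
 * Whoever drops the last reference calls ext4_release_io_end(), so a
 * completing bio can no longer free an io_end that the submitter is
 * still attaching pages to.
 */
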
311
312/* BIO completion function for page writeback */
222static void ext4_end_bio(struct bio *bio, int error) 313static void ext4_end_bio(struct bio *bio, int error)
223{ 314{
224 ext4_io_end_t *io_end = bio->bi_private; 315 ext4_io_end_t *io_end = bio->bi_private;
225 struct inode *inode;
226 int i;
227 int blocksize;
228 sector_t bi_sector = bio->bi_sector; 316 sector_t bi_sector = bio->bi_sector;
229 317
230 BUG_ON(!io_end); 318 BUG_ON(!io_end);
231 inode = io_end->inode;
232 blocksize = 1 << inode->i_blkbits;
233 bio->bi_private = NULL;
234 bio->bi_end_io = NULL; 319 bio->bi_end_io = NULL;
235 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 320 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
236 error = 0; 321 error = 0;
237 for (i = 0; i < bio->bi_vcnt; i++) {
238 struct bio_vec *bvec = &bio->bi_io_vec[i];
239 struct page *page = bvec->bv_page;
240 struct buffer_head *bh, *head;
241 unsigned bio_start = bvec->bv_offset;
242 unsigned bio_end = bio_start + bvec->bv_len;
243 unsigned under_io = 0;
244 unsigned long flags;
245
246 if (!page)
247 continue;
248
249 if (error) {
250 SetPageError(page);
251 set_bit(AS_EIO, &page->mapping->flags);
252 }
253 bh = head = page_buffers(page);
254 /*
255 * We check all buffers in the page under BH_Uptodate_Lock
256 * to avoid races with other end io clearing async_write flags
257 */
258 local_irq_save(flags);
259 bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
260 do {
261 if (bh_offset(bh) < bio_start ||
262 bh_offset(bh) + blocksize > bio_end) {
263 if (buffer_async_write(bh))
264 under_io++;
265 continue;
266 }
267 clear_buffer_async_write(bh);
268 if (error)
269 buffer_io_error(bh);
270 } while ((bh = bh->b_this_page) != head);
271 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
272 local_irq_restore(flags);
273 if (!under_io)
274 end_page_writeback(page);
275 }
276 bio_put(bio);
277 322
278 if (error) { 323 if (error) {
279 io_end->flag |= EXT4_IO_END_ERROR; 324 struct inode *inode = io_end->inode;
325
280 ext4_warning(inode->i_sb, "I/O error writing to inode %lu " 326 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
281 "(offset %llu size %ld starting block %llu)", 327 "(offset %llu size %ld starting block %llu)",
282 inode->i_ino, 328 inode->i_ino,
@@ -286,12 +332,23 @@ static void ext4_end_bio(struct bio *bio, int error)
286 bi_sector >> (inode->i_blkbits - 9)); 332 bi_sector >> (inode->i_blkbits - 9));
287 } 333 }
288 334
289 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 335 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
290 ext4_free_io_end(io_end); 336 /*
291 return; 337 * Link bio into list hanging from io_end. We have to do it
338 * atomically as bio completions can be racing against each
339 * other.
340 */
341 bio->bi_private = xchg(&io_end->bio, bio);
342 ext4_put_io_end_defer(io_end);
343 } else {
344 /*
345 * Drop io_end reference early. Inode can get freed once
346 * we finish the bio.
347 */
348 ext4_put_io_end_defer(io_end);
349 ext4_finish_bio(bio);
350 bio_put(bio);
292 } 351 }
293
294 ext4_add_complete_io(io_end);
295} 352}
296 353
297void ext4_io_submit(struct ext4_io_submit *io) 354void ext4_io_submit(struct ext4_io_submit *io)
@@ -305,43 +362,38 @@ void ext4_io_submit(struct ext4_io_submit *io)
305 bio_put(io->io_bio); 362 bio_put(io->io_bio);
306 } 363 }
307 io->io_bio = NULL; 364 io->io_bio = NULL;
308 io->io_op = 0; 365}
366
367void ext4_io_submit_init(struct ext4_io_submit *io,
368 struct writeback_control *wbc)
369{
370 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
371 io->io_bio = NULL;
309 io->io_end = NULL; 372 io->io_end = NULL;
310} 373}
311 374
312static int io_submit_init(struct ext4_io_submit *io, 375static int io_submit_init_bio(struct ext4_io_submit *io,
313 struct inode *inode, 376 struct buffer_head *bh)
314 struct writeback_control *wbc,
315 struct buffer_head *bh)
316{ 377{
317 ext4_io_end_t *io_end;
318 struct page *page = bh->b_page;
319 int nvecs = bio_get_nr_vecs(bh->b_bdev); 378 int nvecs = bio_get_nr_vecs(bh->b_bdev);
320 struct bio *bio; 379 struct bio *bio;
321 380
322 io_end = ext4_init_io_end(inode, GFP_NOFS);
323 if (!io_end)
324 return -ENOMEM;
325 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); 381 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
382 if (!bio)
383 return -ENOMEM;
326 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 384 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
327 bio->bi_bdev = bh->b_bdev; 385 bio->bi_bdev = bh->b_bdev;
328 bio->bi_private = io->io_end = io_end;
329 bio->bi_end_io = ext4_end_bio; 386 bio->bi_end_io = ext4_end_bio;
330 387 bio->bi_private = ext4_get_io_end(io->io_end);
331 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
332
333 io->io_bio = bio; 388 io->io_bio = bio;
334 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
335 io->io_next_block = bh->b_blocknr; 389 io->io_next_block = bh->b_blocknr;
336 return 0; 390 return 0;
337} 391}
338 392
339static int io_submit_add_bh(struct ext4_io_submit *io, 393static int io_submit_add_bh(struct ext4_io_submit *io,
340 struct inode *inode, 394 struct inode *inode,
341 struct writeback_control *wbc,
342 struct buffer_head *bh) 395 struct buffer_head *bh)
343{ 396{
344 ext4_io_end_t *io_end;
345 int ret; 397 int ret;
346 398
347 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 399 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
@@ -349,18 +401,14 @@ submit_and_retry:
349 ext4_io_submit(io); 401 ext4_io_submit(io);
350 } 402 }
351 if (io->io_bio == NULL) { 403 if (io->io_bio == NULL) {
352 ret = io_submit_init(io, inode, wbc, bh); 404 ret = io_submit_init_bio(io, bh);
353 if (ret) 405 if (ret)
354 return ret; 406 return ret;
355 } 407 }
356 io_end = io->io_end;
357 if (test_clear_buffer_uninit(bh))
358 ext4_set_io_unwritten_flag(inode, io_end);
359 io->io_end->size += bh->b_size;
360 io->io_next_block++;
361 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 408 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
362 if (ret != bh->b_size) 409 if (ret != bh->b_size)
363 goto submit_and_retry; 410 goto submit_and_retry;
411 io->io_next_block++;
364 return 0; 412 return 0;
365} 413}
366 414
@@ -432,7 +480,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
432 do { 480 do {
433 if (!buffer_async_write(bh)) 481 if (!buffer_async_write(bh))
434 continue; 482 continue;
435 ret = io_submit_add_bh(io, inode, wbc, bh); 483 ret = io_submit_add_bh(io, inode, bh);
436 if (ret) { 484 if (ret) {
437 /* 485 /*
438 * We only get here on ENOMEM. Not much else 486 * We only get here on ENOMEM. Not much else
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b27c96d01965..c5adbb318a90 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb,
79 ext4_fsblk_t end = start + input->blocks_count; 79 ext4_fsblk_t end = start + input->blocks_count;
80 ext4_group_t group = input->group; 80 ext4_group_t group = input->group;
81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; 81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
82 unsigned overhead = ext4_group_overhead_blocks(sb, group); 82 unsigned overhead;
83 ext4_fsblk_t metaend = start + overhead; 83 ext4_fsblk_t metaend;
84 struct buffer_head *bh = NULL; 84 struct buffer_head *bh = NULL;
85 ext4_grpblk_t free_blocks_count, offset; 85 ext4_grpblk_t free_blocks_count, offset;
86 int err = -EINVAL; 86 int err = -EINVAL;
87 87
88 if (group != sbi->s_groups_count) {
89 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
90 input->group, sbi->s_groups_count);
91 return -EINVAL;
92 }
93
94 overhead = ext4_group_overhead_blocks(sb, group);
95 metaend = start + overhead;
88 input->free_blocks_count = free_blocks_count = 96 input->free_blocks_count = free_blocks_count =
89 input->blocks_count - 2 - overhead - sbi->s_itb_per_group; 97 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
90 98
@@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb,
96 free_blocks_count, input->reserved_blocks); 104 free_blocks_count, input->reserved_blocks);
97 105
98 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 106 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
99 if (group != sbi->s_groups_count) 107 if (offset != 0)
100 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
101 input->group, sbi->s_groups_count);
102 else if (offset != 0)
103 ext4_warning(sb, "Last group not full"); 108 ext4_warning(sb, "Last group not full");
104 else if (input->reserved_blocks > input->blocks_count / 5) 109 else if (input->reserved_blocks > input->blocks_count / 5)
105 ext4_warning(sb, "Reserved blocks too high (%u)", 110 ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
1551 int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 1556 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
1552 le16_to_cpu(es->s_reserved_gdt_blocks) : 0; 1557 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
1553 struct inode *inode = NULL; 1558 struct inode *inode = NULL;
1554 int gdb_off, gdb_num; 1559 int gdb_off;
1555 int err; 1560 int err;
1556 __u16 bg_flags = 0; 1561 __u16 bg_flags = 0;
1557 1562
1558 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
1559 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); 1563 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
1560 1564
1561 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, 1565 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -1656,12 +1660,10 @@ errout:
1656 err = err2; 1660 err = err2;
1657 1661
1658 if (!err) { 1662 if (!err) {
1659 ext4_fsblk_t first_block;
1660 first_block = ext4_group_first_block_no(sb, 0);
1661 if (test_opt(sb, DEBUG)) 1663 if (test_opt(sb, DEBUG))
1662 printk(KERN_DEBUG "EXT4-fs: extended group to %llu " 1664 printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
1663 "blocks\n", ext4_blocks_count(es)); 1665 "blocks\n", ext4_blocks_count(es));
1664 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, 1666 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
1665 (char *)es, sizeof(struct ext4_super_block), 0); 1667 (char *)es, sizeof(struct ext4_super_block), 0);
1666 } 1668 }
1667 return err; 1669 return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 94cc84db7c9a..b59373b625e9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
69static void ext4_clear_journal_err(struct super_block *sb, 69static void ext4_clear_journal_err(struct super_block *sb,
70 struct ext4_super_block *es); 70 struct ext4_super_block *es);
71static int ext4_sync_fs(struct super_block *sb, int wait); 71static int ext4_sync_fs(struct super_block *sb, int wait);
72static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
72static int ext4_remount(struct super_block *sb, int *flags, char *data); 73static int ext4_remount(struct super_block *sb, int *flags, char *data);
73static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 74static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
74static int ext4_unfreeze(struct super_block *sb); 75static int ext4_unfreeze(struct super_block *sb);
@@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb)
398 } 399 }
399 if (test_opt(sb, ERRORS_RO)) { 400 if (test_opt(sb, ERRORS_RO)) {
400 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 401 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
402 /*
403 * Make sure updated value of ->s_mount_flags will be visible
404 * before ->s_flags update
405 */
406 smp_wmb();
401 sb->s_flags |= MS_RDONLY; 407 sb->s_flags |= MS_RDONLY;
402 } 408 }
403 if (test_opt(sb, ERRORS_PANIC)) 409 if (test_opt(sb, ERRORS_PANIC))
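
The same barrier is added in __ext4_abort() below. The contract: any earlier ->s_mount_flags update (EXT4_MF_FS_ABORTED in the abort path) must be visible before MS_RDONLY is. A reader relying on this ordering would pair it with a read barrier, roughly (illustrative pairing; the real checks are scattered through ext4):

/*
 * writer (this patch):                reader (illustrative pairing):
 *
 *   s_mount_flags |= ABORTED;           if (sb->s_flags & MS_RDONLY) {
 *   smp_wmb();                                  smp_rmb();
 *   sb->s_flags |= MS_RDONLY;                   if (s_mount_flags & ABORTED)
 *                                                       ... see the abort ...
 *                                       }
 */
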
@@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function,
422 ext4_handle_error(sb); 428 ext4_handle_error(sb);
423} 429}
424 430
425void ext4_error_inode(struct inode *inode, const char *function, 431void __ext4_error_inode(struct inode *inode, const char *function,
426 unsigned int line, ext4_fsblk_t block, 432 unsigned int line, ext4_fsblk_t block,
427 const char *fmt, ...) 433 const char *fmt, ...)
428{ 434{
429 va_list args; 435 va_list args;
430 struct va_format vaf; 436 struct va_format vaf;
@@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function,
451 ext4_handle_error(inode->i_sb); 457 ext4_handle_error(inode->i_sb);
452} 458}
453 459
454void ext4_error_file(struct file *file, const char *function, 460void __ext4_error_file(struct file *file, const char *function,
455 unsigned int line, ext4_fsblk_t block, 461 unsigned int line, ext4_fsblk_t block,
456 const char *fmt, ...) 462 const char *fmt, ...)
457{ 463{
458 va_list args; 464 va_list args;
459 struct va_format vaf; 465 struct va_format vaf;
@@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function,
570 576
571 if ((sb->s_flags & MS_RDONLY) == 0) { 577 if ((sb->s_flags & MS_RDONLY) == 0) {
572 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 578 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
573 sb->s_flags |= MS_RDONLY;
574 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; 579 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
580 /*
581 * Make sure updated value of ->s_mount_flags will be visible
582 * before ->s_flags update
583 */
584 smp_wmb();
585 sb->s_flags |= MS_RDONLY;
575 if (EXT4_SB(sb)->s_journal) 586 if (EXT4_SB(sb)->s_journal)
576 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); 587 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
577 save_error_info(sb, function, line); 588 save_error_info(sb, function, line);
@@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
580 panic("EXT4-fs panic from previous error\n"); 591 panic("EXT4-fs panic from previous error\n");
581} 592}
582 593
583void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) 594void __ext4_msg(struct super_block *sb,
595 const char *prefix, const char *fmt, ...)
584{ 596{
585 struct va_format vaf; 597 struct va_format vaf;
586 va_list args; 598 va_list args;
@@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb)
750 ext4_unregister_li_request(sb); 762 ext4_unregister_li_request(sb);
751 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 763 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
752 764
753 flush_workqueue(sbi->dio_unwritten_wq); 765 flush_workqueue(sbi->unrsv_conversion_wq);
754 destroy_workqueue(sbi->dio_unwritten_wq); 766 flush_workqueue(sbi->rsv_conversion_wq);
767 destroy_workqueue(sbi->unrsv_conversion_wq);
768 destroy_workqueue(sbi->rsv_conversion_wq);
755 769
756 if (sbi->s_journal) { 770 if (sbi->s_journal) {
757 err = jbd2_journal_destroy(sbi->s_journal); 771 err = jbd2_journal_destroy(sbi->s_journal);
@@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb)
760 ext4_abort(sb, "Couldn't clean up the journal"); 774 ext4_abort(sb, "Couldn't clean up the journal");
761 } 775 }
762 776
763 ext4_es_unregister_shrinker(sb); 777 ext4_es_unregister_shrinker(sbi);
764 del_timer(&sbi->s_err_report); 778 del_timer(&sbi->s_err_report);
765 ext4_release_system_zone(sb); 779 ext4_release_system_zone(sb);
766 ext4_mb_release(sb); 780 ext4_mb_release(sb);
@@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
849 rwlock_init(&ei->i_es_lock); 863 rwlock_init(&ei->i_es_lock);
850 INIT_LIST_HEAD(&ei->i_es_lru); 864 INIT_LIST_HEAD(&ei->i_es_lru);
851 ei->i_es_lru_nr = 0; 865 ei->i_es_lru_nr = 0;
866 ei->i_touch_when = 0;
852 ei->i_reserved_data_blocks = 0; 867 ei->i_reserved_data_blocks = 0;
853 ei->i_reserved_meta_blocks = 0; 868 ei->i_reserved_meta_blocks = 0;
854 ei->i_allocated_meta_blocks = 0; 869 ei->i_allocated_meta_blocks = 0;
@@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
859 ei->i_reserved_quota = 0; 874 ei->i_reserved_quota = 0;
860#endif 875#endif
861 ei->jinode = NULL; 876 ei->jinode = NULL;
862 INIT_LIST_HEAD(&ei->i_completed_io_list); 877 INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
878 INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
863 spin_lock_init(&ei->i_completed_io_lock); 879 spin_lock_init(&ei->i_completed_io_lock);
864 ei->i_sync_tid = 0; 880 ei->i_sync_tid = 0;
865 ei->i_datasync_tid = 0; 881 ei->i_datasync_tid = 0;
866 atomic_set(&ei->i_ioend_count, 0); 882 atomic_set(&ei->i_ioend_count, 0);
867 atomic_set(&ei->i_unwritten, 0); 883 atomic_set(&ei->i_unwritten, 0);
868 INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); 884 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
885 INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
869 886
870 return &ei->vfs_inode; 887 return &ei->vfs_inode;
871} 888}
@@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = {
1093 .dirty_inode = ext4_dirty_inode, 1110 .dirty_inode = ext4_dirty_inode,
1094 .drop_inode = ext4_drop_inode, 1111 .drop_inode = ext4_drop_inode,
1095 .evict_inode = ext4_evict_inode, 1112 .evict_inode = ext4_evict_inode,
1113 .sync_fs = ext4_sync_fs_nojournal,
1096 .put_super = ext4_put_super, 1114 .put_super = ext4_put_super,
1097 .statfs = ext4_statfs, 1115 .statfs = ext4_statfs,
1098 .remount_fs = ext4_remount, 1116 .remount_fs = ext4_remount,
@@ -1341,7 +1359,7 @@ static const struct mount_opts {
1341 {Opt_delalloc, EXT4_MOUNT_DELALLOC, 1359 {Opt_delalloc, EXT4_MOUNT_DELALLOC,
1342 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, 1360 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1343 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, 1361 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
1344 MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT}, 1362 MOPT_EXT4_ONLY | MOPT_CLEAR},
1345 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, 1363 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1346 MOPT_EXT4_ONLY | MOPT_SET}, 1364 MOPT_EXT4_ONLY | MOPT_SET},
1347 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | 1365 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
@@ -1684,12 +1702,6 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
1684 1702
1685 if (sbi->s_qf_names[GRPQUOTA]) 1703 if (sbi->s_qf_names[GRPQUOTA])
1686 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 1704 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
1687
1688 if (test_opt(sb, USRQUOTA))
1689 seq_puts(seq, ",usrquota");
1690
1691 if (test_opt(sb, GRPQUOTA))
1692 seq_puts(seq, ",grpquota");
1693#endif 1705#endif
1694} 1706}
1695 1707
@@ -1908,7 +1920,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1908 struct ext4_sb_info *sbi = EXT4_SB(sb); 1920 struct ext4_sb_info *sbi = EXT4_SB(sb);
1909 struct ext4_group_desc *gdp = NULL; 1921 struct ext4_group_desc *gdp = NULL;
1910 ext4_group_t flex_group; 1922 ext4_group_t flex_group;
1911 unsigned int groups_per_flex = 0;
1912 int i, err; 1923 int i, err;
1913 1924
1914 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1925 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -1916,7 +1927,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1916 sbi->s_log_groups_per_flex = 0; 1927 sbi->s_log_groups_per_flex = 0;
1917 return 1; 1928 return 1;
1918 } 1929 }
1919 groups_per_flex = 1U << sbi->s_log_groups_per_flex;
1920 1930
1921 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); 1931 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
1922 if (err) 1932 if (err)
@@ -2164,19 +2174,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2164 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2174 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2165 dquot_initialize(inode); 2175 dquot_initialize(inode);
2166 if (inode->i_nlink) { 2176 if (inode->i_nlink) {
2167 ext4_msg(sb, KERN_DEBUG, 2177 if (test_opt(sb, DEBUG))
2168 "%s: truncating inode %lu to %lld bytes", 2178 ext4_msg(sb, KERN_DEBUG,
2169 __func__, inode->i_ino, inode->i_size); 2179 "%s: truncating inode %lu to %lld bytes",
2180 __func__, inode->i_ino, inode->i_size);
2170 jbd_debug(2, "truncating inode %lu to %lld bytes\n", 2181 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
2171 inode->i_ino, inode->i_size); 2182 inode->i_ino, inode->i_size);
2172 mutex_lock(&inode->i_mutex); 2183 mutex_lock(&inode->i_mutex);
2184 truncate_inode_pages(inode->i_mapping, inode->i_size);
2173 ext4_truncate(inode); 2185 ext4_truncate(inode);
2174 mutex_unlock(&inode->i_mutex); 2186 mutex_unlock(&inode->i_mutex);
2175 nr_truncates++; 2187 nr_truncates++;
2176 } else { 2188 } else {
2177 ext4_msg(sb, KERN_DEBUG, 2189 if (test_opt(sb, DEBUG))
2178 "%s: deleting unreferenced inode %lu", 2190 ext4_msg(sb, KERN_DEBUG,
2179 __func__, inode->i_ino); 2191 "%s: deleting unreferenced inode %lu",
2192 __func__, inode->i_ino);
2180 jbd_debug(2, "deleting unreferenced inode %lu\n", 2193 jbd_debug(2, "deleting unreferenced inode %lu\n",
2181 inode->i_ino); 2194 inode->i_ino);
2182 nr_orphans++; 2195 nr_orphans++;
@@ -2377,7 +2390,10 @@ struct ext4_attr {
2377 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); 2390 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2378 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 2391 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2379 const char *, size_t); 2392 const char *, size_t);
2380 int offset; 2393 union {
2394 int offset;
2395 int deprecated_val;
2396 } u;
2381}; 2397};
2382 2398
2383static int parse_strtoull(const char *buf, 2399static int parse_strtoull(const char *buf,
@@ -2446,7 +2462,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2446static ssize_t sbi_ui_show(struct ext4_attr *a, 2462static ssize_t sbi_ui_show(struct ext4_attr *a,
2447 struct ext4_sb_info *sbi, char *buf) 2463 struct ext4_sb_info *sbi, char *buf)
2448{ 2464{
2449 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2465 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2450 2466
2451 return snprintf(buf, PAGE_SIZE, "%u\n", *ui); 2467 return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2452} 2468}
@@ -2455,7 +2471,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
2455 struct ext4_sb_info *sbi, 2471 struct ext4_sb_info *sbi,
2456 const char *buf, size_t count) 2472 const char *buf, size_t count)
2457{ 2473{
2458 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2474 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2459 unsigned long t; 2475 unsigned long t;
2460 int ret; 2476 int ret;
2461 2477
@@ -2504,12 +2520,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
2504 return count; 2520 return count;
2505} 2521}
2506 2522
2523static ssize_t sbi_deprecated_show(struct ext4_attr *a,
2524 struct ext4_sb_info *sbi, char *buf)
2525{
2526 return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
2527}
2528
2507#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ 2529#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2508static struct ext4_attr ext4_attr_##_name = { \ 2530static struct ext4_attr ext4_attr_##_name = { \
2509 .attr = {.name = __stringify(_name), .mode = _mode }, \ 2531 .attr = {.name = __stringify(_name), .mode = _mode }, \
2510 .show = _show, \ 2532 .show = _show, \
2511 .store = _store, \ 2533 .store = _store, \
2512 .offset = offsetof(struct ext4_sb_info, _elname), \ 2534 .u = { \
2535 .offset = offsetof(struct ext4_sb_info, _elname),\
2536 }, \
2513} 2537}
2514#define EXT4_ATTR(name, mode, show, store) \ 2538#define EXT4_ATTR(name, mode, show, store) \
2515static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 2539static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2520,6 +2544,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2520#define EXT4_RW_ATTR_SBI_UI(name, elname) \ 2544#define EXT4_RW_ATTR_SBI_UI(name, elname) \
2521 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) 2545 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2522#define ATTR_LIST(name) &ext4_attr_##name.attr 2546#define ATTR_LIST(name) &ext4_attr_##name.attr
2547#define EXT4_DEPRECATED_ATTR(_name, _val) \
2548static struct ext4_attr ext4_attr_##_name = { \
2549 .attr = {.name = __stringify(_name), .mode = 0444 }, \
2550 .show = sbi_deprecated_show, \
2551 .u = { \
2552 .deprecated_val = _val, \
2553 }, \
2554}
2523 2555
2524EXT4_RO_ATTR(delayed_allocation_blocks); 2556EXT4_RO_ATTR(delayed_allocation_blocks);
2525EXT4_RO_ATTR(session_write_kbytes); 2557EXT4_RO_ATTR(session_write_kbytes);
@@ -2534,7 +2566,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2534EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2566EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2535EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2567EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2536EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2568EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2537EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); 2569EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
2538EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 2570EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
2539EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); 2571EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
2540 2572
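The union lets a single ext4_attr type serve two kinds of sysfs attribute: live ones that read a field at a byte offset inside ext4_sb_info, and deprecated ones that merely echo a fixed value (max_writeback_mb_bump above). A userspace sketch of the same dispatch, with made-up field names:

#include <stdio.h>
#include <stddef.h>

struct sb_info {                       /* stand-in for ext4_sb_info */
    unsigned int extent_max_zeroout_kb;
};

struct attr {
    const char *name;
    void (*show)(const struct attr *, const struct sb_info *);
    union {
        size_t offset;                 /* byte offset into sb_info */
        int deprecated_val;            /* fixed value for dead knobs */
    } u;
};

static void ui_show(const struct attr *a, const struct sb_info *sbi)
{
    const unsigned int *ui =
        (const unsigned int *)((const char *)sbi + a->u.offset);
    printf("%s: %u\n", a->name, *ui);
}

static void deprecated_show(const struct attr *a, const struct sb_info *sbi)
{
    (void)sbi;                         /* value no longer lives in sb_info */
    printf("%s: %d\n", a->name, a->u.deprecated_val);
}

static const struct attr attrs[] = {
    { "extent_max_zeroout_kb", ui_show,
      { .offset = offsetof(struct sb_info, extent_max_zeroout_kb) } },
    { "max_writeback_mb_bump", deprecated_show, { .deprecated_val = 128 } },
};

int main(void)
{
    struct sb_info sbi = { .extent_max_zeroout_kb = 32 };
    size_t i;

    for (i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++)
        attrs[i].show(&attrs[i], &sbi);
    return 0;
}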
@@ -3451,7 +3483,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3451 } 3483 }
3452 if (test_opt(sb, DIOREAD_NOLOCK)) { 3484 if (test_opt(sb, DIOREAD_NOLOCK)) {
3453 ext4_msg(sb, KERN_ERR, "can't mount with " 3485 ext4_msg(sb, KERN_ERR, "can't mount with "
3454 "both data=journal and delalloc"); 3486 "both data=journal and dioread_nolock");
3455 goto failed_mount; 3487 goto failed_mount;
3456 } 3488 }
3457 if (test_opt(sb, DELALLOC)) 3489 if (test_opt(sb, DELALLOC))
@@ -3586,10 +3618,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3586 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 3618 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
3587 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 3619 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
3588 3620
3589 /* Do we have standard group size of blocksize * 8 blocks ? */
3590 if (sbi->s_blocks_per_group == blocksize << 3)
3591 set_opt2(sb, STD_GROUP_SIZE);
3592
3593 for (i = 0; i < 4; i++) 3621 for (i = 0; i < 4; i++)
3594 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 3622 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
3595 sbi->s_def_hash_version = es->s_def_hash_version; 3623 sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3659,6 +3687,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3659 goto failed_mount; 3687 goto failed_mount;
3660 } 3688 }
3661 3689
 3690 /* Do we have a standard group size of clustersize * 8 blocks? */
3691 if (sbi->s_blocks_per_group == clustersize << 3)
3692 set_opt2(sb, STD_GROUP_SIZE);
3693
3662 /* 3694 /*
3663 * Test whether we have more sectors than will fit in sector_t, 3695 * Test whether we have more sectors than will fit in sector_t,
3664 * and whether the max offset is addressable by the page cache. 3696 * and whether the max offset is addressable by the page cache.
@@ -3763,7 +3795,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3763 sbi->s_err_report.data = (unsigned long) sb; 3795 sbi->s_err_report.data = (unsigned long) sb;
3764 3796
3765 /* Register extent status tree shrinker */ 3797 /* Register extent status tree shrinker */
3766 ext4_es_register_shrinker(sb); 3798 ext4_es_register_shrinker(sbi);
3767 3799
3768 err = percpu_counter_init(&sbi->s_freeclusters_counter, 3800 err = percpu_counter_init(&sbi->s_freeclusters_counter,
3769 ext4_count_free_clusters(sb)); 3801 ext4_count_free_clusters(sb));
@@ -3787,7 +3819,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3787 } 3819 }
3788 3820
3789 sbi->s_stripe = ext4_get_stripe_size(sbi); 3821 sbi->s_stripe = ext4_get_stripe_size(sbi);
3790 sbi->s_max_writeback_mb_bump = 128;
3791 sbi->s_extent_max_zeroout_kb = 32; 3822 sbi->s_extent_max_zeroout_kb = 32;
3792 3823
3793 /* 3824 /*
@@ -3915,12 +3946,20 @@ no_journal:
3915 * The maximum number of concurrent works can be high and 3946 * The maximum number of concurrent works can be high and
3916 * concurrency isn't really necessary. Limit it to 1. 3947 * concurrency isn't really necessary. Limit it to 1.
3917 */ 3948 */
3918 EXT4_SB(sb)->dio_unwritten_wq = 3949 EXT4_SB(sb)->rsv_conversion_wq =
3919 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); 3950 alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3920 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3951 if (!EXT4_SB(sb)->rsv_conversion_wq) {
3921 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3952 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
3922 ret = -ENOMEM; 3953 ret = -ENOMEM;
3923 goto failed_mount_wq; 3954 goto failed_mount4;
3955 }
3956
3957 EXT4_SB(sb)->unrsv_conversion_wq =
3958 alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3959 if (!EXT4_SB(sb)->unrsv_conversion_wq) {
3960 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
3961 ret = -ENOMEM;
3962 goto failed_mount4;
3924 } 3963 }
3925 3964
3926 /* 3965 /*
@@ -4074,14 +4113,17 @@ failed_mount4a:
4074 sb->s_root = NULL; 4113 sb->s_root = NULL;
4075failed_mount4: 4114failed_mount4:
4076 ext4_msg(sb, KERN_ERR, "mount failed"); 4115 ext4_msg(sb, KERN_ERR, "mount failed");
4077 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 4116 if (EXT4_SB(sb)->rsv_conversion_wq)
4117 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4118 if (EXT4_SB(sb)->unrsv_conversion_wq)
4119 destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
4078failed_mount_wq: 4120failed_mount_wq:
4079 if (sbi->s_journal) { 4121 if (sbi->s_journal) {
4080 jbd2_journal_destroy(sbi->s_journal); 4122 jbd2_journal_destroy(sbi->s_journal);
4081 sbi->s_journal = NULL; 4123 sbi->s_journal = NULL;
4082 } 4124 }
4083failed_mount3: 4125failed_mount3:
4084 ext4_es_unregister_shrinker(sb); 4126 ext4_es_unregister_shrinker(sbi);
4085 del_timer(&sbi->s_err_report); 4127 del_timer(&sbi->s_err_report);
4086 if (sbi->s_flex_groups) 4128 if (sbi->s_flex_groups)
4087 ext4_kvfree(sbi->s_flex_groups); 4129 ext4_kvfree(sbi->s_flex_groups);
@@ -4517,19 +4559,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4517{ 4559{
4518 int ret = 0; 4560 int ret = 0;
4519 tid_t target; 4561 tid_t target;
4562 bool needs_barrier = false;
4520 struct ext4_sb_info *sbi = EXT4_SB(sb); 4563 struct ext4_sb_info *sbi = EXT4_SB(sb);
4521 4564
4522 trace_ext4_sync_fs(sb, wait); 4565 trace_ext4_sync_fs(sb, wait);
4523 flush_workqueue(sbi->dio_unwritten_wq); 4566 flush_workqueue(sbi->rsv_conversion_wq);
4567 flush_workqueue(sbi->unrsv_conversion_wq);
4524 /* 4568 /*
4525 * Writeback quota in non-journalled quota case - journalled quota has 4569 * Writeback quota in non-journalled quota case - journalled quota has
4526 * no dirty dquots 4570 * no dirty dquots
4527 */ 4571 */
4528 dquot_writeback_dquots(sb, -1); 4572 dquot_writeback_dquots(sb, -1);
4573 /*
 4574 * Data writeback is possible w/o a journal transaction, so the barrier
 4575 * must be sent at the end of the function. But we can skip it if
4576 * transaction_commit will do it for us.
4577 */
4578 target = jbd2_get_latest_transaction(sbi->s_journal);
4579 if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
4580 !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
4581 needs_barrier = true;
4582
4529 if (jbd2_journal_start_commit(sbi->s_journal, &target)) { 4583 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
4530 if (wait) 4584 if (wait)
4531 jbd2_log_wait_commit(sbi->s_journal, target); 4585 ret = jbd2_log_wait_commit(sbi->s_journal, target);
4532 } 4586 }
4587 if (needs_barrier) {
4588 int err;
4589 err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4590 if (!ret)
4591 ret = err;
4592 }
4593
4594 return ret;
4595}
4596
4597static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
4598{
4599 int ret = 0;
4600
4601 trace_ext4_sync_fs(sb, wait);
4602 flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4603 flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
4604 dquot_writeback_dquots(sb, -1);
4605 if (wait && test_opt(sb, BARRIER))
4606 ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4607
4533 return ret; 4608 return ret;
4534} 4609}
4535 4610
@@ -4652,6 +4727,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4652 goto restore_opts; 4727 goto restore_opts;
4653 } 4728 }
4654 4729
4730 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4731 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4732 ext4_msg(sb, KERN_ERR, "can't mount with "
4733 "both data=journal and delalloc");
4734 err = -EINVAL;
4735 goto restore_opts;
4736 }
4737 if (test_opt(sb, DIOREAD_NOLOCK)) {
4738 ext4_msg(sb, KERN_ERR, "can't mount with "
4739 "both data=journal and dioread_nolock");
4740 err = -EINVAL;
4741 goto restore_opts;
4742 }
4743 }
4744
4655 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) 4745 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
4656 ext4_abort(sb, "Abort forced by user"); 4746 ext4_abort(sb, "Abort forced by user");
4657 4747
@@ -5406,6 +5496,7 @@ static void __exit ext4_exit_fs(void)
5406 kset_unregister(ext4_kset); 5496 kset_unregister(ext4_kset);
5407 ext4_exit_system_zone(); 5497 ext4_exit_system_zone();
5408 ext4_exit_pageio(); 5498 ext4_exit_pageio();
5499 ext4_exit_es();
5409} 5500}
5410 5501
5411MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 5502MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index fd27e7e6326e..e06e0995e00f 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -51,3 +51,15 @@ config F2FS_FS_POSIX_ACL
51 Linux website <http://acl.bestbits.at/>. 51 Linux website <http://acl.bestbits.at/>.
52 52
53 If you don't know what Access Control Lists are, say N 53 If you don't know what Access Control Lists are, say N
54
55config F2FS_FS_SECURITY
56 bool "F2FS Security Labels"
57 depends on F2FS_FS_XATTR
58 help
59 Security labels provide an access control facility to support Linux
60 Security Models (LSMs) such as AppArmor, SELinux, Smack and TOMOYO
61 Linux. This option enables an extended attribute handler for file
62 security labels in the f2fs filesystem, and therefore requires that
63 extended attribute support be enabled first.
64
65 If you are not using a security module, say N.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 44abc2f286e0..b7826ec1b470 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -250,7 +250,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
250 } 250 }
251 } 251 }
252 252
253 error = f2fs_setxattr(inode, name_index, "", value, size); 253 error = f2fs_setxattr(inode, name_index, "", value, size, NULL);
254 254
255 kfree(value); 255 kfree(value);
256 if (!error) 256 if (!error)
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b1de01da1a40..66a6b85a51d8 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -357,8 +357,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
357 unsigned long blk_size = sbi->blocksize; 357 unsigned long blk_size = sbi->blocksize;
358 struct f2fs_checkpoint *cp_block; 358 struct f2fs_checkpoint *cp_block;
359 unsigned long long cur_version = 0, pre_version = 0; 359 unsigned long long cur_version = 0, pre_version = 0;
360 unsigned int crc = 0;
361 size_t crc_offset; 360 size_t crc_offset;
361 __u32 crc = 0;
362 362
363 /* Read the 1st cp block in this CP pack */ 363 /* Read the 1st cp block in this CP pack */
364 cp_page_1 = get_meta_page(sbi, cp_addr); 364 cp_page_1 = get_meta_page(sbi, cp_addr);
@@ -369,7 +369,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
369 if (crc_offset >= blk_size) 369 if (crc_offset >= blk_size)
370 goto invalid_cp1; 370 goto invalid_cp1;
371 371
372 crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset); 372 crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
373 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 373 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
374 goto invalid_cp1; 374 goto invalid_cp1;
375 375
@@ -384,7 +384,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
384 if (crc_offset >= blk_size) 384 if (crc_offset >= blk_size)
385 goto invalid_cp2; 385 goto invalid_cp2;
386 386
387 crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset); 387 crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
388 if (!f2fs_crc_valid(crc, cp_block, crc_offset)) 388 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
389 goto invalid_cp2; 389 goto invalid_cp2;
390 390
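The fix above loads the on-disk CRC through le32_to_cpu instead of a raw host-endian dereference, so the check also passes on big-endian machines. A portable userspace equivalent assembles the little-endian field byte by byte:

#include <stdint.h>
#include <stdio.h>

/* Read a little-endian u32 from a byte buffer, on any host endianness. */
static uint32_t get_le32(const unsigned char *p)
{
    return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
           ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

int main(void)
{
    /* 0x20f5f210 stored on disk in little-endian byte order */
    unsigned char blk[] = { 0x10, 0xf2, 0xf5, 0x20 };

    printf("crc=0x%08x\n", get_le32(blk));  /* 0x20f5f210 everywhere */
    return 0;
}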
@@ -450,13 +450,30 @@ fail_no_cp:
450 return -EINVAL; 450 return -EINVAL;
451} 451}
452 452
453void set_dirty_dir_page(struct inode *inode, struct page *page) 453static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
454{ 454{
455 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 455 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
456 struct list_head *head = &sbi->dir_inode_list; 456 struct list_head *head = &sbi->dir_inode_list;
457 struct dir_inode_entry *new;
458 struct list_head *this; 457 struct list_head *this;
459 458
459 list_for_each(this, head) {
460 struct dir_inode_entry *entry;
461 entry = list_entry(this, struct dir_inode_entry, list);
462 if (entry->inode == inode)
463 return -EEXIST;
464 }
465 list_add_tail(&new->list, head);
466#ifdef CONFIG_F2FS_STAT_FS
467 sbi->n_dirty_dirs++;
468#endif
469 return 0;
470}
471
472void set_dirty_dir_page(struct inode *inode, struct page *page)
473{
474 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
475 struct dir_inode_entry *new;
476
460 if (!S_ISDIR(inode->i_mode)) 477 if (!S_ISDIR(inode->i_mode))
461 return; 478 return;
462retry: 479retry:
@@ -469,23 +486,31 @@ retry:
469 INIT_LIST_HEAD(&new->list); 486 INIT_LIST_HEAD(&new->list);
470 487
471 spin_lock(&sbi->dir_inode_lock); 488 spin_lock(&sbi->dir_inode_lock);
472 list_for_each(this, head) { 489 if (__add_dirty_inode(inode, new))
473 struct dir_inode_entry *entry; 490 kmem_cache_free(inode_entry_slab, new);
474 entry = list_entry(this, struct dir_inode_entry, list);
475 if (entry->inode == inode) {
476 kmem_cache_free(inode_entry_slab, new);
477 goto out;
478 }
479 }
480 list_add_tail(&new->list, head);
481 sbi->n_dirty_dirs++;
482 491
483 BUG_ON(!S_ISDIR(inode->i_mode));
484out:
485 inc_page_count(sbi, F2FS_DIRTY_DENTS); 492 inc_page_count(sbi, F2FS_DIRTY_DENTS);
486 inode_inc_dirty_dents(inode); 493 inode_inc_dirty_dents(inode);
487 SetPagePrivate(page); 494 SetPagePrivate(page);
495 spin_unlock(&sbi->dir_inode_lock);
496}
488 497
498void add_dirty_dir_inode(struct inode *inode)
499{
500 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
501 struct dir_inode_entry *new;
502retry:
503 new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
504 if (!new) {
505 cond_resched();
506 goto retry;
507 }
508 new->inode = inode;
509 INIT_LIST_HEAD(&new->list);
510
511 spin_lock(&sbi->dir_inode_lock);
512 if (__add_dirty_inode(inode, new))
513 kmem_cache_free(inode_entry_slab, new);
489 spin_unlock(&sbi->dir_inode_lock); 514 spin_unlock(&sbi->dir_inode_lock);
490} 515}
491 516
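Both callers follow the same discipline: allocate the entry with GFP_NOFS outside the spinlock (retrying on failure), insert under the lock, and free the allocation when __add_dirty_inode reports a duplicate. A pthread analogue of that pattern, with a plain linked list standing in for the dirty-dir list:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
    unsigned long ino;
    struct entry *next;
};

static struct entry *head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Insert 'new' unless ino is already present; caller holds list_lock. */
static int __add_entry(unsigned long ino, struct entry *new)
{
    struct entry *e;

    for (e = head; e; e = e->next)
        if (e->ino == ino)
            return -1;                 /* -EEXIST analogue */
    new->next = head;
    head = new;
    return 0;
}

static void add_dirty(unsigned long ino)
{
    /* allocate outside the lock, as the kernel code does with GFP_NOFS */
    struct entry *new = malloc(sizeof(*new));

    if (!new)
        return;
    new->ino = ino;

    pthread_mutex_lock(&list_lock);
    if (__add_entry(ino, new))
        free(new);                     /* lost the race: duplicate */
    pthread_mutex_unlock(&list_lock);
}

int main(void)
{
    add_dirty(7);
    add_dirty(7);                      /* second insert is dropped */
    printf("head ino=%lu, next=%p\n", head->ino, (void *)head->next);
    return 0;
}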
@@ -499,8 +524,10 @@ void remove_dirty_dir_inode(struct inode *inode)
499 return; 524 return;
500 525
501 spin_lock(&sbi->dir_inode_lock); 526 spin_lock(&sbi->dir_inode_lock);
502 if (atomic_read(&F2FS_I(inode)->dirty_dents)) 527 if (atomic_read(&F2FS_I(inode)->dirty_dents)) {
503 goto out; 528 spin_unlock(&sbi->dir_inode_lock);
529 return;
530 }
504 531
505 list_for_each(this, head) { 532 list_for_each(this, head) {
506 struct dir_inode_entry *entry; 533 struct dir_inode_entry *entry;
@@ -508,12 +535,38 @@ void remove_dirty_dir_inode(struct inode *inode)
508 if (entry->inode == inode) { 535 if (entry->inode == inode) {
509 list_del(&entry->list); 536 list_del(&entry->list);
510 kmem_cache_free(inode_entry_slab, entry); 537 kmem_cache_free(inode_entry_slab, entry);
538#ifdef CONFIG_F2FS_STAT_FS
511 sbi->n_dirty_dirs--; 539 sbi->n_dirty_dirs--;
540#endif
541 break;
542 }
543 }
544 spin_unlock(&sbi->dir_inode_lock);
545
546 /* Only from the recovery routine */
547 if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
548 clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
549 iput(inode);
550 }
551}
552
553struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
554{
555 struct list_head *head = &sbi->dir_inode_list;
556 struct list_head *this;
557 struct inode *inode = NULL;
558
559 spin_lock(&sbi->dir_inode_lock);
560 list_for_each(this, head) {
561 struct dir_inode_entry *entry;
562 entry = list_entry(this, struct dir_inode_entry, list);
563 if (entry->inode->i_ino == ino) {
564 inode = entry->inode;
512 break; 565 break;
513 } 566 }
514 } 567 }
515out:
516 spin_unlock(&sbi->dir_inode_lock); 568 spin_unlock(&sbi->dir_inode_lock);
569 return inode;
517} 570}
518 571
519void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) 572void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
@@ -595,7 +648,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
595 block_t start_blk; 648 block_t start_blk;
596 struct page *cp_page; 649 struct page *cp_page;
597 unsigned int data_sum_blocks, orphan_blocks; 650 unsigned int data_sum_blocks, orphan_blocks;
598 unsigned int crc32 = 0; 651 __u32 crc32 = 0;
599 void *kaddr; 652 void *kaddr;
600 int i; 653 int i;
601 654
@@ -664,8 +717,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
664 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); 717 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
665 718
666 crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); 719 crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
667 *(__le32 *)((unsigned char *)ckpt + 720 *((__le32 *)((unsigned char *)ckpt +
668 le32_to_cpu(ckpt->checksum_offset)) 721 le32_to_cpu(ckpt->checksum_offset)))
669 = cpu_to_le32(crc32); 722 = cpu_to_le32(crc32);
670 723
671 start_blk = __start_cp_addr(sbi); 724 start_blk = __start_cp_addr(sbi);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 91ff93b0b0f4..035f9a345cdf 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -68,7 +68,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
68 struct buffer_head *bh_result) 68 struct buffer_head *bh_result)
69{ 69{
70 struct f2fs_inode_info *fi = F2FS_I(inode); 70 struct f2fs_inode_info *fi = F2FS_I(inode);
71#ifdef CONFIG_F2FS_STAT_FS
71 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 72 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
73#endif
72 pgoff_t start_fofs, end_fofs; 74 pgoff_t start_fofs, end_fofs;
73 block_t start_blkaddr; 75 block_t start_blkaddr;
74 76
@@ -78,7 +80,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
78 return 0; 80 return 0;
79 } 81 }
80 82
83#ifdef CONFIG_F2FS_STAT_FS
81 sbi->total_hit_ext++; 84 sbi->total_hit_ext++;
85#endif
82 start_fofs = fi->ext.fofs; 86 start_fofs = fi->ext.fofs;
83 end_fofs = fi->ext.fofs + fi->ext.len - 1; 87 end_fofs = fi->ext.fofs + fi->ext.len - 1;
84 start_blkaddr = fi->ext.blk_addr; 88 start_blkaddr = fi->ext.blk_addr;
@@ -96,7 +100,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
96 else 100 else
97 bh_result->b_size = UINT_MAX; 101 bh_result->b_size = UINT_MAX;
98 102
103#ifdef CONFIG_F2FS_STAT_FS
99 sbi->read_hit_ext++; 104 sbi->read_hit_ext++;
105#endif
100 read_unlock(&fi->ext.ext_lock); 106 read_unlock(&fi->ext.ext_lock);
101 return 1; 107 return 1;
102 } 108 }
@@ -199,7 +205,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
199 if (dn.data_blkaddr == NEW_ADDR) 205 if (dn.data_blkaddr == NEW_ADDR)
200 return ERR_PTR(-EINVAL); 206 return ERR_PTR(-EINVAL);
201 207
202 page = grab_cache_page(mapping, index); 208 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
203 if (!page) 209 if (!page)
204 return ERR_PTR(-ENOMEM); 210 return ERR_PTR(-ENOMEM);
205 211
@@ -233,18 +239,23 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
233 struct page *page; 239 struct page *page;
234 int err; 240 int err;
235 241
242repeat:
243 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
244 if (!page)
245 return ERR_PTR(-ENOMEM);
246
236 set_new_dnode(&dn, inode, NULL, NULL, 0); 247 set_new_dnode(&dn, inode, NULL, NULL, 0);
237 err = get_dnode_of_data(&dn, index, LOOKUP_NODE); 248 err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
238 if (err) 249 if (err) {
250 f2fs_put_page(page, 1);
239 return ERR_PTR(err); 251 return ERR_PTR(err);
252 }
240 f2fs_put_dnode(&dn); 253 f2fs_put_dnode(&dn);
241 254
242 if (dn.data_blkaddr == NULL_ADDR) 255 if (dn.data_blkaddr == NULL_ADDR) {
256 f2fs_put_page(page, 1);
243 return ERR_PTR(-ENOENT); 257 return ERR_PTR(-ENOENT);
244repeat: 258 }
245 page = grab_cache_page(mapping, index);
246 if (!page)
247 return ERR_PTR(-ENOMEM);
248 259
249 if (PageUptodate(page)) 260 if (PageUptodate(page))
250 return page; 261 return page;
@@ -274,9 +285,10 @@ repeat:
274 * 285 *
275 * Also, caller should grab and release a mutex by calling mutex_lock_op() and 286 * Also, caller should grab and release a mutex by calling mutex_lock_op() and
276 * mutex_unlock_op(). 287 * mutex_unlock_op().
 288 * Note that npage is set only by make_empty_dir.
277 */ 289 */
278struct page *get_new_data_page(struct inode *inode, pgoff_t index, 290struct page *get_new_data_page(struct inode *inode,
279 bool new_i_size) 291 struct page *npage, pgoff_t index, bool new_i_size)
280{ 292{
281 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 293 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
282 struct address_space *mapping = inode->i_mapping; 294 struct address_space *mapping = inode->i_mapping;
@@ -284,18 +296,20 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index,
284 struct dnode_of_data dn; 296 struct dnode_of_data dn;
285 int err; 297 int err;
286 298
287 set_new_dnode(&dn, inode, NULL, NULL, 0); 299 set_new_dnode(&dn, inode, npage, npage, 0);
288 err = get_dnode_of_data(&dn, index, ALLOC_NODE); 300 err = get_dnode_of_data(&dn, index, ALLOC_NODE);
289 if (err) 301 if (err)
290 return ERR_PTR(err); 302 return ERR_PTR(err);
291 303
292 if (dn.data_blkaddr == NULL_ADDR) { 304 if (dn.data_blkaddr == NULL_ADDR) {
293 if (reserve_new_block(&dn)) { 305 if (reserve_new_block(&dn)) {
294 f2fs_put_dnode(&dn); 306 if (!npage)
307 f2fs_put_dnode(&dn);
295 return ERR_PTR(-ENOSPC); 308 return ERR_PTR(-ENOSPC);
296 } 309 }
297 } 310 }
298 f2fs_put_dnode(&dn); 311 if (!npage)
312 f2fs_put_dnode(&dn);
299repeat: 313repeat:
300 page = grab_cache_page(mapping, index); 314 page = grab_cache_page(mapping, index);
301 if (!page) 315 if (!page)
@@ -325,6 +339,8 @@ repeat:
325 if (new_i_size && 339 if (new_i_size &&
326 i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { 340 i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
327 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); 341 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
342 /* Only the directory inode sets new_i_size */
343 set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
328 mark_inode_dirty_sync(inode); 344 mark_inode_dirty_sync(inode);
329 } 345 }
330 return page; 346 return page;
@@ -481,8 +497,9 @@ int do_write_data_page(struct page *page)
481 * If current allocation needs SSR, 497 * If current allocation needs SSR,
482 * it had better in-place writes for updated data. 498 * it had better in-place writes for updated data.
483 */ 499 */
484 if (old_blk_addr != NEW_ADDR && !is_cold_data(page) && 500 if (unlikely(old_blk_addr != NEW_ADDR &&
485 need_inplace_update(inode)) { 501 !is_cold_data(page) &&
502 need_inplace_update(inode))) {
486 rewrite_data_page(F2FS_SB(inode->i_sb), page, 503 rewrite_data_page(F2FS_SB(inode->i_sb), page,
487 old_blk_addr); 504 old_blk_addr);
488 } else { 505 } else {
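unlikely() is the kernel's wrapper around __builtin_expect: it tells the compiler that in-place rewrites are the cold path, so the common out-of-place branch is laid out fall-through. A minimal demo defining the macros the way the kernel does:

#include <stdio.h>

#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

static const char *classify(int needs_inplace)
{
    if (unlikely(needs_inplace))
        return "in-place rewrite";     /* rare SSR path */
    return "out-of-place write";       /* common LFS path */
}

int main(void)
{
    printf("%s\n", classify(0));
    printf("%s\n", classify(1));
    return 0;
}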
@@ -684,6 +701,27 @@ err:
684 return err; 701 return err;
685} 702}
686 703
704static int f2fs_write_end(struct file *file,
705 struct address_space *mapping,
706 loff_t pos, unsigned len, unsigned copied,
707 struct page *page, void *fsdata)
708{
709 struct inode *inode = page->mapping->host;
710
711 SetPageUptodate(page);
712 set_page_dirty(page);
713
714 if (pos + copied > i_size_read(inode)) {
715 i_size_write(inode, pos + copied);
716 mark_inode_dirty(inode);
717 update_inode_page(inode);
718 }
719
720 unlock_page(page);
721 page_cache_release(page);
722 return copied;
723}
724
687static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, 725static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
688 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 726 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
689{ 727{
@@ -698,7 +736,8 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
698 get_data_block_ro); 736 get_data_block_ro);
699} 737}
700 738
701static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) 739static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
740 unsigned int length)
702{ 741{
703 struct inode *inode = page->mapping->host; 742 struct inode *inode = page->mapping->host;
704 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 743 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -740,7 +779,7 @@ const struct address_space_operations f2fs_dblock_aops = {
740 .writepage = f2fs_write_data_page, 779 .writepage = f2fs_write_data_page,
741 .writepages = f2fs_write_data_pages, 780 .writepages = f2fs_write_data_pages,
742 .write_begin = f2fs_write_begin, 781 .write_begin = f2fs_write_begin,
743 .write_end = nobh_write_end, 782 .write_end = f2fs_write_end,
744 .set_page_dirty = f2fs_set_data_page_dirty, 783 .set_page_dirty = f2fs_set_data_page_dirty,
745 .invalidatepage = f2fs_invalidate_data_page, 784 .invalidatepage = f2fs_invalidate_data_page,
746 .releasepage = f2fs_release_data_page, 785 .releasepage = f2fs_release_data_page,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8d9943786c31..0d6c6aafb235 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -175,12 +175,12 @@ get_cache:
175 175
176static int stat_show(struct seq_file *s, void *v) 176static int stat_show(struct seq_file *s, void *v)
177{ 177{
178 struct f2fs_stat_info *si, *next; 178 struct f2fs_stat_info *si;
179 int i = 0; 179 int i = 0;
180 int j; 180 int j;
181 181
182 mutex_lock(&f2fs_stat_mutex); 182 mutex_lock(&f2fs_stat_mutex);
183 list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { 183 list_for_each_entry(si, &f2fs_stat_list, stat_list) {
184 char devname[BDEVNAME_SIZE]; 184 char devname[BDEVNAME_SIZE];
185 185
186 update_general_status(si->sbi); 186 update_general_status(si->sbi);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 1ac6b93036b7..62f0d5977c64 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -13,6 +13,7 @@
13#include "f2fs.h" 13#include "f2fs.h"
14#include "node.h" 14#include "node.h"
15#include "acl.h" 15#include "acl.h"
16#include "xattr.h"
16 17
17static unsigned long dir_blocks(struct inode *inode) 18static unsigned long dir_blocks(struct inode *inode)
18{ 19{
@@ -215,9 +216,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
215 216
216struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) 217struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
217{ 218{
218 struct page *page = NULL; 219 struct page *page;
219 struct f2fs_dir_entry *de = NULL; 220 struct f2fs_dir_entry *de;
220 struct f2fs_dentry_block *dentry_blk = NULL; 221 struct f2fs_dentry_block *dentry_blk;
221 222
222 page = get_lock_data_page(dir, 0); 223 page = get_lock_data_page(dir, 0);
223 if (IS_ERR(page)) 224 if (IS_ERR(page))
@@ -264,15 +265,10 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
264 f2fs_put_page(page, 1); 265 f2fs_put_page(page, 1);
265} 266}
266 267
267void init_dent_inode(const struct qstr *name, struct page *ipage) 268static void init_dent_inode(const struct qstr *name, struct page *ipage)
268{ 269{
269 struct f2fs_node *rn; 270 struct f2fs_node *rn;
270 271
271 if (IS_ERR(ipage))
272 return;
273
274 wait_on_page_writeback(ipage);
275
276 /* copy name info. to this inode page */ 272 /* copy name info. to this inode page */
277 rn = (struct f2fs_node *)page_address(ipage); 273 rn = (struct f2fs_node *)page_address(ipage);
278 rn->i.i_namelen = cpu_to_le32(name->len); 274 rn->i.i_namelen = cpu_to_le32(name->len);
@@ -280,14 +276,15 @@ void init_dent_inode(const struct qstr *name, struct page *ipage)
280 set_page_dirty(ipage); 276 set_page_dirty(ipage);
281} 277}
282 278
283static int make_empty_dir(struct inode *inode, struct inode *parent) 279static int make_empty_dir(struct inode *inode,
280 struct inode *parent, struct page *page)
284{ 281{
285 struct page *dentry_page; 282 struct page *dentry_page;
286 struct f2fs_dentry_block *dentry_blk; 283 struct f2fs_dentry_block *dentry_blk;
287 struct f2fs_dir_entry *de; 284 struct f2fs_dir_entry *de;
288 void *kaddr; 285 void *kaddr;
289 286
290 dentry_page = get_new_data_page(inode, 0, true); 287 dentry_page = get_new_data_page(inode, page, 0, true);
291 if (IS_ERR(dentry_page)) 288 if (IS_ERR(dentry_page))
292 return PTR_ERR(dentry_page); 289 return PTR_ERR(dentry_page);
293 290
@@ -317,63 +314,76 @@ static int make_empty_dir(struct inode *inode, struct inode *parent)
317 return 0; 314 return 0;
318} 315}
319 316
320static int init_inode_metadata(struct inode *inode, 317static struct page *init_inode_metadata(struct inode *inode,
321 struct inode *dir, const struct qstr *name) 318 struct inode *dir, const struct qstr *name)
322{ 319{
320 struct page *page;
321 int err;
322
323 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { 323 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
324 int err; 324 page = new_inode_page(inode, name);
325 err = new_inode_page(inode, name); 325 if (IS_ERR(page))
326 if (err) 326 return page;
327 return err;
328 327
329 if (S_ISDIR(inode->i_mode)) { 328 if (S_ISDIR(inode->i_mode)) {
330 err = make_empty_dir(inode, dir); 329 err = make_empty_dir(inode, dir, page);
331 if (err) { 330 if (err)
332 remove_inode_page(inode); 331 goto error;
333 return err;
334 }
335 } 332 }
336 333
337 err = f2fs_init_acl(inode, dir); 334 err = f2fs_init_acl(inode, dir);
338 if (err) { 335 if (err)
339 remove_inode_page(inode); 336 goto error;
340 return err; 337
341 } 338 err = f2fs_init_security(inode, dir, name, page);
339 if (err)
340 goto error;
341
342 wait_on_page_writeback(page);
342 } else { 343 } else {
343 struct page *ipage; 344 page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
344 ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); 345 if (IS_ERR(page))
345 if (IS_ERR(ipage)) 346 return page;
346 return PTR_ERR(ipage); 347
347 set_cold_node(inode, ipage); 348 wait_on_page_writeback(page);
348 init_dent_inode(name, ipage); 349 set_cold_node(inode, page);
349 f2fs_put_page(ipage, 1);
350 } 350 }
351
352 init_dent_inode(name, page);
353
354 /*
355 * This file should be checkpointed during fsync.
 356 * i_pino is lost from here on.
357 */
351 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { 358 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
359 file_lost_pino(inode);
352 inc_nlink(inode); 360 inc_nlink(inode);
353 update_inode_page(inode);
354 } 361 }
355 return 0; 362 return page;
363
364error:
365 f2fs_put_page(page, 1);
366 remove_inode_page(inode);
367 return ERR_PTR(err);
356} 368}
357 369
358static void update_parent_metadata(struct inode *dir, struct inode *inode, 370static void update_parent_metadata(struct inode *dir, struct inode *inode,
359 unsigned int current_depth) 371 unsigned int current_depth)
360{ 372{
361 bool need_dir_update = false;
362
363 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { 373 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
364 if (S_ISDIR(inode->i_mode)) { 374 if (S_ISDIR(inode->i_mode)) {
365 inc_nlink(dir); 375 inc_nlink(dir);
366 need_dir_update = true; 376 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
367 } 377 }
368 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); 378 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
369 } 379 }
370 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 380 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
371 if (F2FS_I(dir)->i_current_depth != current_depth) { 381 if (F2FS_I(dir)->i_current_depth != current_depth) {
372 F2FS_I(dir)->i_current_depth = current_depth; 382 F2FS_I(dir)->i_current_depth = current_depth;
373 need_dir_update = true; 383 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
374 } 384 }
375 385
376 if (need_dir_update) 386 if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
377 update_inode_page(dir); 387 update_inode_page(dir);
378 else 388 else
379 mark_inode_dirty(dir); 389 mark_inode_dirty(dir);
@@ -423,6 +433,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
423 struct page *dentry_page = NULL; 433 struct page *dentry_page = NULL;
424 struct f2fs_dentry_block *dentry_blk = NULL; 434 struct f2fs_dentry_block *dentry_blk = NULL;
425 int slots = GET_DENTRY_SLOTS(namelen); 435 int slots = GET_DENTRY_SLOTS(namelen);
436 struct page *page;
426 int err = 0; 437 int err = 0;
427 int i; 438 int i;
428 439
@@ -448,7 +459,7 @@ start:
448 bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); 459 bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
449 460
450 for (block = bidx; block <= (bidx + nblock - 1); block++) { 461 for (block = bidx; block <= (bidx + nblock - 1); block++) {
451 dentry_page = get_new_data_page(dir, block, true); 462 dentry_page = get_new_data_page(dir, NULL, block, true);
452 if (IS_ERR(dentry_page)) 463 if (IS_ERR(dentry_page))
453 return PTR_ERR(dentry_page); 464 return PTR_ERR(dentry_page);
454 465
@@ -465,12 +476,13 @@ start:
465 ++level; 476 ++level;
466 goto start; 477 goto start;
467add_dentry: 478add_dentry:
468 err = init_inode_metadata(inode, dir, name);
469 if (err)
470 goto fail;
471
472 wait_on_page_writeback(dentry_page); 479 wait_on_page_writeback(dentry_page);
473 480
481 page = init_inode_metadata(inode, dir, name);
482 if (IS_ERR(page)) {
483 err = PTR_ERR(page);
484 goto fail;
485 }
474 de = &dentry_blk->dentry[bit_pos]; 486 de = &dentry_blk->dentry[bit_pos];
475 de->hash_code = dentry_hash; 487 de->hash_code = dentry_hash;
476 de->name_len = cpu_to_le16(namelen); 488 de->name_len = cpu_to_le16(namelen);
@@ -481,11 +493,14 @@ add_dentry:
481 test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); 493 test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
482 set_page_dirty(dentry_page); 494 set_page_dirty(dentry_page);
483 495
484 update_parent_metadata(dir, inode, current_depth); 496 /* we don't need to mark_inode_dirty now */
485
486 /* update parent inode number before releasing dentry page */
487 F2FS_I(inode)->i_pino = dir->i_ino; 497 F2FS_I(inode)->i_pino = dir->i_ino;
498 update_inode(inode, page);
499 f2fs_put_page(page, 1);
500
501 update_parent_metadata(dir, inode, current_depth);
488fail: 502fail:
503 clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
489 kunmap(dentry_page); 504 kunmap(dentry_page);
490 f2fs_put_page(dentry_page, 1); 505 f2fs_put_page(dentry_page, 1);
491 return err; 506 return err;
@@ -591,34 +606,26 @@ bool f2fs_empty_dir(struct inode *dir)
591 return true; 606 return true;
592} 607}
593 608
594static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) 609static int f2fs_readdir(struct file *file, struct dir_context *ctx)
595{ 610{
596 unsigned long pos = file->f_pos;
597 struct inode *inode = file_inode(file); 611 struct inode *inode = file_inode(file);
598 unsigned long npages = dir_blocks(inode); 612 unsigned long npages = dir_blocks(inode);
599 unsigned char *types = NULL; 613 unsigned int bit_pos = 0;
600 unsigned int bit_pos = 0, start_bit_pos = 0;
601 int over = 0;
602 struct f2fs_dentry_block *dentry_blk = NULL; 614 struct f2fs_dentry_block *dentry_blk = NULL;
603 struct f2fs_dir_entry *de = NULL; 615 struct f2fs_dir_entry *de = NULL;
604 struct page *dentry_page = NULL; 616 struct page *dentry_page = NULL;
605 unsigned int n = 0; 617 unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
606 unsigned char d_type = DT_UNKNOWN; 618 unsigned char d_type = DT_UNKNOWN;
607 int slots;
608 619
609 types = f2fs_filetype_table; 620 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
610 bit_pos = (pos % NR_DENTRY_IN_BLOCK);
611 n = (pos / NR_DENTRY_IN_BLOCK);
612 621
613 for ( ; n < npages; n++) { 622 for ( ; n < npages; n++) {
614 dentry_page = get_lock_data_page(inode, n); 623 dentry_page = get_lock_data_page(inode, n);
615 if (IS_ERR(dentry_page)) 624 if (IS_ERR(dentry_page))
616 continue; 625 continue;
617 626
618 start_bit_pos = bit_pos;
619 dentry_blk = kmap(dentry_page); 627 dentry_blk = kmap(dentry_page);
620 while (bit_pos < NR_DENTRY_IN_BLOCK) { 628 while (bit_pos < NR_DENTRY_IN_BLOCK) {
621 d_type = DT_UNKNOWN;
622 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, 629 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
623 NR_DENTRY_IN_BLOCK, 630 NR_DENTRY_IN_BLOCK,
624 bit_pos); 631 bit_pos);
@@ -626,28 +633,26 @@ static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
626 break; 633 break;
627 634
628 de = &dentry_blk->dentry[bit_pos]; 635 de = &dentry_blk->dentry[bit_pos];
629 if (types && de->file_type < F2FS_FT_MAX) 636 if (de->file_type < F2FS_FT_MAX)
630 d_type = types[de->file_type]; 637 d_type = f2fs_filetype_table[de->file_type];
631 638 else
632 over = filldir(dirent, 639 d_type = DT_UNKNOWN;
640 if (!dir_emit(ctx,
633 dentry_blk->filename[bit_pos], 641 dentry_blk->filename[bit_pos],
634 le16_to_cpu(de->name_len), 642 le16_to_cpu(de->name_len),
635 (n * NR_DENTRY_IN_BLOCK) + bit_pos, 643 le32_to_cpu(de->ino), d_type))
636 le32_to_cpu(de->ino), d_type); 644 goto stop;
637 if (over) { 645
638 file->f_pos += bit_pos - start_bit_pos; 646 bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
639 goto success; 647 ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos;
640 }
641 slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
642 bit_pos += slots;
643 } 648 }
644 bit_pos = 0; 649 bit_pos = 0;
645 file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK; 650 ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
646 kunmap(dentry_page); 651 kunmap(dentry_page);
647 f2fs_put_page(dentry_page, 1); 652 f2fs_put_page(dentry_page, 1);
648 dentry_page = NULL; 653 dentry_page = NULL;
649 } 654 }
650success: 655stop:
651 if (dentry_page && !IS_ERR(dentry_page)) { 656 if (dentry_page && !IS_ERR(dentry_page)) {
652 kunmap(dentry_page); 657 kunmap(dentry_page);
653 f2fs_put_page(dentry_page, 1); 658 f2fs_put_page(dentry_page, 1);
@@ -659,7 +664,7 @@ success:
659const struct file_operations f2fs_dir_operations = { 664const struct file_operations f2fs_dir_operations = {
660 .llseek = generic_file_llseek, 665 .llseek = generic_file_llseek,
661 .read = generic_read_dir, 666 .read = generic_read_dir,
662 .readdir = f2fs_readdir, 667 .iterate = f2fs_readdir,
663 .fsync = f2fs_sync_file, 668 .fsync = f2fs_sync_file,
664 .unlocked_ioctl = f2fs_ioctl, 669 .unlocked_ioctl = f2fs_ioctl,
665}; 670};
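Under the old interface the filesystem called filldir and patched file->f_pos itself; with ->iterate it calls dir_emit and advances ctx->pos as it goes, so the VFS can resume at any slot. A stripped-down model of the contract — dir_context here is a simplification of the real VFS struct, not its actual layout:

#include <stdbool.h>
#include <stdio.h>

#define SLOTS_PER_BLOCK 4                  /* NR_DENTRY_IN_BLOCK stand-in */

struct dir_context {
    long long pos;                         /* slot-granular position */
    int budget;                            /* emit returns false when spent */
};

/* Mirrors dir_emit(): false means "buffer full, stop and resume later". */
static bool emit(struct dir_context *ctx, const char *name)
{
    if (ctx->budget-- <= 0)
        return false;
    printf("pos=%lld name=%s\n", ctx->pos, name);
    return true;
}

static void iterate(struct dir_context *ctx, const char *names[], int nblocks)
{
    long long n = ctx->pos / SLOTS_PER_BLOCK;
    int slot = ctx->pos % SLOTS_PER_BLOCK;

    for (; n < nblocks; n++, slot = 0) {
        for (; slot < SLOTS_PER_BLOCK; slot++) {
            const char *name = names[n * SLOTS_PER_BLOCK + slot];

            if (!name)
                continue;                  /* unused slot */
            if (!emit(ctx, name))
                return;                    /* resume here next call */
            ctx->pos = n * SLOTS_PER_BLOCK + slot + 1;
        }
        ctx->pos = (n + 1) * SLOTS_PER_BLOCK;
    }
}

int main(void)
{
    const char *names[8] = { ".", "..", NULL, "a", "b", NULL, NULL, "c" };
    struct dir_context ctx = { 0, 3 };

    iterate(&ctx, names, 2);               /* emits ".", "..", "a"; stops */
    ctx.budget = 8;
    iterate(&ctx, names, 2);               /* resumes at "b", then "c" */
    return 0;
}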
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 20aab02f2a42..467d42d65c48 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -37,21 +37,35 @@
37 typecheck(unsigned long long, b) && \ 37 typecheck(unsigned long long, b) && \
38 ((long long)((a) - (b)) > 0)) 38 ((long long)((a) - (b)) > 0))
39 39
40typedef u64 block_t; 40typedef u32 block_t; /*
41 * must not be changed from u32, since it is the on-disk block
42 * address format, __le32.
43 */
41typedef u32 nid_t; 44typedef u32 nid_t;
42 45
43struct f2fs_mount_info { 46struct f2fs_mount_info {
44 unsigned int opt; 47 unsigned int opt;
45}; 48};
46 49
47static inline __u32 f2fs_crc32(void *buff, size_t len) 50#define CRCPOLY_LE 0xedb88320
51
52static inline __u32 f2fs_crc32(void *buf, size_t len)
48{ 53{
49 return crc32_le(F2FS_SUPER_MAGIC, buff, len); 54 unsigned char *p = (unsigned char *)buf;
55 __u32 crc = F2FS_SUPER_MAGIC;
56 int i;
57
58 while (len--) {
59 crc ^= *p++;
60 for (i = 0; i < 8; i++)
61 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
62 }
63 return crc;
50} 64}
51 65
52static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size) 66static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
53{ 67{
54 return f2fs_crc32(buff, buff_size) == blk_crc; 68 return f2fs_crc32(buf, buf_size) == blk_crc;
55} 69}
56 70
57/* 71/*
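The new helper computes CRC32 with the standard little-endian polynomial but a nonstandard seed (F2FS_SUPER_MAGIC) and no final inversion, so it will not match textbook CRC32 output. A userspace self-test pitting the same bitwise loop against a table-driven variant of the identical computation — the table code is an assumption for cross-checking, not kernel code:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CRCPOLY_LE 0xedb88320u
#define SEED       0xF2F52010u             /* F2FS_SUPER_MAGIC */

static uint32_t crc32_bitwise(const void *buf, size_t len)
{
    const unsigned char *p = buf;
    uint32_t crc = SEED;
    int i;

    while (len--) {
        crc ^= *p++;
        for (i = 0; i < 8; i++)
            crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
    }
    return crc;
}

static uint32_t crc32_table(const void *buf, size_t len)
{
    static uint32_t tbl[256];
    const unsigned char *p = buf;
    uint32_t crc = SEED;
    int i, j;

    if (!tbl[1]) {                         /* build the table once */
        for (i = 0; i < 256; i++) {
            uint32_t c = i;
            for (j = 0; j < 8; j++)
                c = (c >> 1) ^ ((c & 1) ? CRCPOLY_LE : 0);
            tbl[i] = c;
        }
    }
    while (len--)
        crc = (crc >> 8) ^ tbl[(crc ^ *p++) & 0xff];
    return crc;
}

int main(void)
{
    const char msg[] = "f2fs checkpoint block";
    uint32_t a = crc32_bitwise(msg, strlen(msg));
    uint32_t b = crc32_table(msg, strlen(msg));

    printf("bitwise=0x%08x table=0x%08x %s\n",
           a, b, a == b ? "MATCH" : "MISMATCH");
    return 0;
}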
@@ -148,7 +162,7 @@ struct extent_info {
148 * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. 162 * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
149 */ 163 */
150#define FADVISE_COLD_BIT 0x01 164#define FADVISE_COLD_BIT 0x01
151#define FADVISE_CP_BIT 0x02 165#define FADVISE_LOST_PINO_BIT 0x02
152 166
153struct f2fs_inode_info { 167struct f2fs_inode_info {
154 struct inode vfs_inode; /* serve a vfs inode */ 168 struct inode vfs_inode; /* serve a vfs inode */
@@ -369,7 +383,6 @@ struct f2fs_sb_info {
369 /* for directory inode management */ 383 /* for directory inode management */
370 struct list_head dir_inode_list; /* dir inode list */ 384 struct list_head dir_inode_list; /* dir inode list */
371 spinlock_t dir_inode_lock; /* for dir inode list lock */ 385 spinlock_t dir_inode_lock; /* for dir inode list lock */
372 unsigned int n_dirty_dirs; /* # of dir inodes */
373 386
374 /* basic file system units */ 387 /* basic file system units */
375 unsigned int log_sectors_per_block; /* log2 sectors per block */ 388 unsigned int log_sectors_per_block; /* log2 sectors per block */
@@ -406,12 +419,15 @@ struct f2fs_sb_info {
406 * for stat information. 419 * for stat information.
407 * one is for the LFS mode, and the other is for the SSR mode. 420 * one is for the LFS mode, and the other is for the SSR mode.
408 */ 421 */
422#ifdef CONFIG_F2FS_STAT_FS
409 struct f2fs_stat_info *stat_info; /* FS status information */ 423 struct f2fs_stat_info *stat_info; /* FS status information */
410 unsigned int segment_count[2]; /* # of allocated segments */ 424 unsigned int segment_count[2]; /* # of allocated segments */
411 unsigned int block_count[2]; /* # of allocated blocks */ 425 unsigned int block_count[2]; /* # of allocated blocks */
412 unsigned int last_victim[2]; /* last victim segment # */
413 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ 426 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
414 int bg_gc; /* background gc calls */ 427 int bg_gc; /* background gc calls */
428 unsigned int n_dirty_dirs; /* # of dir inodes */
429#endif
430 unsigned int last_victim[2]; /* last victim segment # */
415 spinlock_t stat_lock; /* lock for stat operations */ 431 spinlock_t stat_lock; /* lock for stat operations */
416}; 432};
417 433
@@ -495,9 +511,17 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
495 511
496static inline void mutex_lock_all(struct f2fs_sb_info *sbi) 512static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
497{ 513{
498 int i = 0; 514 int i;
499 for (; i < NR_GLOBAL_LOCKS; i++) 515
500 mutex_lock(&sbi->fs_lock[i]); 516 for (i = 0; i < NR_GLOBAL_LOCKS; i++) {
517 /*
518 * This is the only time we take multiple fs_lock[]
519 * instances; the order is immaterial since we
520 * always hold cp_mutex, which serializes multiple
521 * such operations.
522 */
523 mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex);
524 }
501} 525}
502 526
503static inline void mutex_unlock_all(struct f2fs_sb_info *sbi) 527static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
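mutex_lock_nest_lock() tells lockdep that cp_mutex already serializes anyone taking the whole fs_lock[] array, so acquiring N same-class mutexes in a row is not flagged as deadlock-prone. Userspace has no lockdep, but the locking pattern itself looks like this pthread analogue (here the outer lock is taken inside lock_all for self-containment):

#include <pthread.h>
#include <stdio.h>

#define NR_LOCKS 4

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* cp_mutex role */
static pthread_mutex_t locks[NR_LOCKS] = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Take every lock; safe because 'outer' serializes all mass-lockers,
 * so two threads can never interleave partial acquisitions. */
static void lock_all(void)
{
    int i;

    pthread_mutex_lock(&outer);
    for (i = 0; i < NR_LOCKS; i++)
        pthread_mutex_lock(&locks[i]);
}

static void unlock_all(void)
{
    int i;

    for (i = 0; i < NR_LOCKS; i++)
        pthread_mutex_unlock(&locks[i]);
    pthread_mutex_unlock(&outer);
}

int main(void)
{
    lock_all();
    puts("exclusive section: all fs locks held");
    unlock_all();
    return 0;
}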
@@ -843,9 +867,12 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr)
843/* used for f2fs_inode_info->flags */ 867/* used for f2fs_inode_info->flags */
844enum { 868enum {
845 FI_NEW_INODE, /* indicate newly allocated inode */ 869 FI_NEW_INODE, /* indicate newly allocated inode */
870 FI_DIRTY_INODE, /* indicate inode is dirty or not */
846 FI_INC_LINK, /* need to increment i_nlink */ 871 FI_INC_LINK, /* need to increment i_nlink */
847 FI_ACL_MODE, /* indicate acl mode */ 872 FI_ACL_MODE, /* indicate acl mode */
848 FI_NO_ALLOC, /* should not allocate any blocks */ 873 FI_NO_ALLOC, /* should not allocate any blocks */
874 FI_UPDATE_DIR, /* should update inode block for consistency */
875 FI_DELAY_IPUT, /* used for the recovery */
849}; 876};
850 877
851static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) 878static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -878,14 +905,21 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
878 return 0; 905 return 0;
879} 906}
880 907
908static inline int f2fs_readonly(struct super_block *sb)
909{
910 return sb->s_flags & MS_RDONLY;
911}
912
881/* 913/*
882 * file.c 914 * file.c
883 */ 915 */
884int f2fs_sync_file(struct file *, loff_t, loff_t, int); 916int f2fs_sync_file(struct file *, loff_t, loff_t, int);
885void truncate_data_blocks(struct dnode_of_data *); 917void truncate_data_blocks(struct dnode_of_data *);
886void f2fs_truncate(struct inode *); 918void f2fs_truncate(struct inode *);
919int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
887int f2fs_setattr(struct dentry *, struct iattr *); 920int f2fs_setattr(struct dentry *, struct iattr *);
888int truncate_hole(struct inode *, pgoff_t, pgoff_t); 921int truncate_hole(struct inode *, pgoff_t, pgoff_t);
922int truncate_data_blocks_range(struct dnode_of_data *, int);
889long f2fs_ioctl(struct file *, unsigned int, unsigned long); 923long f2fs_ioctl(struct file *, unsigned int, unsigned long);
890long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); 924long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
891 925
@@ -913,7 +947,6 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
913ino_t f2fs_inode_by_name(struct inode *, struct qstr *); 947ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
914void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, 948void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
915 struct page *, struct inode *); 949 struct page *, struct inode *);
916void init_dent_inode(const struct qstr *, struct page *);
917int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); 950int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
918void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); 951void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
919int f2fs_make_empty(struct inode *, struct inode *); 952int f2fs_make_empty(struct inode *, struct inode *);
@@ -948,8 +981,8 @@ void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
948int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); 981int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
949int truncate_inode_blocks(struct inode *, pgoff_t); 982int truncate_inode_blocks(struct inode *, pgoff_t);
950int remove_inode_page(struct inode *); 983int remove_inode_page(struct inode *);
951int new_inode_page(struct inode *, const struct qstr *); 984struct page *new_inode_page(struct inode *, const struct qstr *);
952struct page *new_node_page(struct dnode_of_data *, unsigned int); 985struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
953void ra_node_page(struct f2fs_sb_info *, nid_t); 986void ra_node_page(struct f2fs_sb_info *, nid_t);
954struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); 987struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
955struct page *get_node_page_ra(struct page *, int); 988struct page *get_node_page_ra(struct page *, int);
@@ -974,7 +1007,6 @@ void destroy_node_manager_caches(void);
974 */ 1007 */
975void f2fs_balance_fs(struct f2fs_sb_info *); 1008void f2fs_balance_fs(struct f2fs_sb_info *);
976void invalidate_blocks(struct f2fs_sb_info *, block_t); 1009void invalidate_blocks(struct f2fs_sb_info *, block_t);
977void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
978void clear_prefree_segments(struct f2fs_sb_info *); 1010void clear_prefree_segments(struct f2fs_sb_info *);
979int npages_for_summary_flush(struct f2fs_sb_info *); 1011int npages_for_summary_flush(struct f2fs_sb_info *);
980void allocate_new_segments(struct f2fs_sb_info *); 1012void allocate_new_segments(struct f2fs_sb_info *);
@@ -1011,7 +1043,9 @@ void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
1011int recover_orphan_inodes(struct f2fs_sb_info *); 1043int recover_orphan_inodes(struct f2fs_sb_info *);
1012int get_valid_checkpoint(struct f2fs_sb_info *); 1044int get_valid_checkpoint(struct f2fs_sb_info *);
1013void set_dirty_dir_page(struct inode *, struct page *); 1045void set_dirty_dir_page(struct inode *, struct page *);
1046void add_dirty_dir_inode(struct inode *);
1014void remove_dirty_dir_inode(struct inode *); 1047void remove_dirty_dir_inode(struct inode *);
1048struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t);
1015void sync_dirty_dir_inodes(struct f2fs_sb_info *); 1049void sync_dirty_dir_inodes(struct f2fs_sb_info *);
1016void write_checkpoint(struct f2fs_sb_info *, bool); 1050void write_checkpoint(struct f2fs_sb_info *, bool);
1017void init_orphan_info(struct f2fs_sb_info *); 1051void init_orphan_info(struct f2fs_sb_info *);
@@ -1025,7 +1059,7 @@ int reserve_new_block(struct dnode_of_data *);
1025void update_extent_cache(block_t, struct dnode_of_data *); 1059void update_extent_cache(block_t, struct dnode_of_data *);
1026struct page *find_data_page(struct inode *, pgoff_t, bool); 1060struct page *find_data_page(struct inode *, pgoff_t, bool);
1027struct page *get_lock_data_page(struct inode *, pgoff_t); 1061struct page *get_lock_data_page(struct inode *, pgoff_t);
1028struct page *get_new_data_page(struct inode *, pgoff_t, bool); 1062struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
1029int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); 1063int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
1030int do_write_data_page(struct page *); 1064int do_write_data_page(struct page *);
1031 1065
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1cae864f8dfc..d2d2b7dbdcc1 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -63,9 +63,10 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
63 f2fs_put_dnode(&dn); 63 f2fs_put_dnode(&dn);
64 mutex_unlock_op(sbi, ilock); 64 mutex_unlock_op(sbi, ilock);
65 65
66 file_update_time(vma->vm_file);
66 lock_page(page); 67 lock_page(page);
67 if (page->mapping != inode->i_mapping || 68 if (page->mapping != inode->i_mapping ||
68 page_offset(page) >= i_size_read(inode) || 69 page_offset(page) > i_size_read(inode) ||
69 !PageUptodate(page)) { 70 !PageUptodate(page)) {
70 unlock_page(page); 71 unlock_page(page);
71 err = -EFAULT; 72 err = -EFAULT;
@@ -76,10 +77,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
76 * check to see if the page is mapped already (no holes) 77 * check to see if the page is mapped already (no holes)
77 */ 78 */
78 if (PageMappedToDisk(page)) 79 if (PageMappedToDisk(page))
79 goto out; 80 goto mapped;
80
81 /* fill the page */
82 wait_on_page_writeback(page);
83 81
84 /* page is wholly or partially inside EOF */ 82 /* page is wholly or partially inside EOF */
85 if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { 83 if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
@@ -90,7 +88,9 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
90 set_page_dirty(page); 88 set_page_dirty(page);
91 SetPageUptodate(page); 89 SetPageUptodate(page);
92 90
93 file_update_time(vma->vm_file); 91mapped:
92 /* fill the page */
93 wait_on_page_writeback(page);
94out: 94out:
95 sb_end_pagefault(inode->i_sb); 95 sb_end_pagefault(inode->i_sb);
96 return block_page_mkwrite_return(err); 96 return block_page_mkwrite_return(err);
@@ -102,6 +102,24 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
102 .remap_pages = generic_file_remap_pages, 102 .remap_pages = generic_file_remap_pages,
103}; 103};
104 104
105static int get_parent_ino(struct inode *inode, nid_t *pino)
106{
107 struct dentry *dentry;
108
109 inode = igrab(inode);
110 dentry = d_find_any_alias(inode);
111 iput(inode);
112 if (!dentry)
113 return 0;
114
115 inode = igrab(dentry->d_parent->d_inode);
116 dput(dentry);
117
118 *pino = inode->i_ino;
119 iput(inode);
120 return 1;
121}
122
105int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 123int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
106{ 124{
107 struct inode *inode = file->f_mapping->host; 125 struct inode *inode = file->f_mapping->host;
@@ -114,7 +132,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
114 .for_reclaim = 0, 132 .for_reclaim = 0,
115 }; 133 };
116 134
117 if (inode->i_sb->s_flags & MS_RDONLY) 135 if (f2fs_readonly(inode->i_sb))
118 return 0; 136 return 0;
119 137
120 trace_f2fs_sync_file_enter(inode); 138 trace_f2fs_sync_file_enter(inode);
@@ -134,7 +152,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
134 152
135 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) 153 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
136 need_cp = true; 154 need_cp = true;
137 else if (is_cp_file(inode)) 155 else if (file_wrong_pino(inode))
138 need_cp = true; 156 need_cp = true;
139 else if (!space_for_roll_forward(sbi)) 157 else if (!space_for_roll_forward(sbi))
140 need_cp = true; 158 need_cp = true;
@@ -142,11 +160,23 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
142 need_cp = true; 160 need_cp = true;
143 161
144 if (need_cp) { 162 if (need_cp) {
163 nid_t pino;
164
145 /* all the dirty node pages should be flushed for POR */ 165 /* all the dirty node pages should be flushed for POR */
146 ret = f2fs_sync_fs(inode->i_sb, 1); 166 ret = f2fs_sync_fs(inode->i_sb, 1);
167 if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
168 get_parent_ino(inode, &pino)) {
169 F2FS_I(inode)->i_pino = pino;
170 file_got_pino(inode);
171 mark_inode_dirty_sync(inode);
172 ret = f2fs_write_inode(inode, NULL);
173 if (ret)
174 goto out;
175 }
147 } else { 176 } else {
148 /* if there is no written node page, write its inode page */ 177 /* if there is no written node page, write its inode page */
149 while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { 178 while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
179 mark_inode_dirty_sync(inode);
150 ret = f2fs_write_inode(inode, NULL); 180 ret = f2fs_write_inode(inode, NULL);
151 if (ret) 181 if (ret)
152 goto out; 182 goto out;
@@ -168,7 +198,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
168 return 0; 198 return 0;
169} 199}
170 200
171static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) 201int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
172{ 202{
173 int nr_free = 0, ofs = dn->ofs_in_node; 203 int nr_free = 0, ofs = dn->ofs_in_node;
174 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 204 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
@@ -185,10 +215,10 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
185 215
186 update_extent_cache(NULL_ADDR, dn); 216 update_extent_cache(NULL_ADDR, dn);
187 invalidate_blocks(sbi, blkaddr); 217 invalidate_blocks(sbi, blkaddr);
188 dec_valid_block_count(sbi, dn->inode, 1);
189 nr_free++; 218 nr_free++;
190 } 219 }
191 if (nr_free) { 220 if (nr_free) {
221 dec_valid_block_count(sbi, dn->inode, nr_free);
192 set_page_dirty(dn->node_page); 222 set_page_dirty(dn->node_page);
193 sync_inode_page(dn); 223 sync_inode_page(dn);
194 } 224 }
@@ -291,7 +321,7 @@ void f2fs_truncate(struct inode *inode)
291 } 321 }
292} 322}
293 323
294static int f2fs_getattr(struct vfsmount *mnt, 324int f2fs_getattr(struct vfsmount *mnt,
295 struct dentry *dentry, struct kstat *stat) 325 struct dentry *dentry, struct kstat *stat)
296{ 326{
297 struct inode *inode = dentry->d_inode; 327 struct inode *inode = dentry->d_inode;
@@ -387,7 +417,7 @@ static void fill_zero(struct inode *inode, pgoff_t index,
387 f2fs_balance_fs(sbi); 417 f2fs_balance_fs(sbi);
388 418
389 ilock = mutex_lock_op(sbi); 419 ilock = mutex_lock_op(sbi);
390 page = get_new_data_page(inode, index, false); 420 page = get_new_data_page(inode, NULL, index, false);
391 mutex_unlock_op(sbi, ilock); 421 mutex_unlock_op(sbi, ilock);
392 422
393 if (!IS_ERR(page)) { 423 if (!IS_ERR(page)) {
@@ -575,10 +605,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
575 int ret; 605 int ret;
576 606
577 switch (cmd) { 607 switch (cmd) {
578 case FS_IOC_GETFLAGS: 608 case F2FS_IOC_GETFLAGS:
579 flags = fi->i_flags & FS_FL_USER_VISIBLE; 609 flags = fi->i_flags & FS_FL_USER_VISIBLE;
580 return put_user(flags, (int __user *) arg); 610 return put_user(flags, (int __user *) arg);
581 case FS_IOC_SETFLAGS: 611 case F2FS_IOC_SETFLAGS:
582 { 612 {
583 unsigned int oldflags; 613 unsigned int oldflags;
584 614
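
The fsync path above repairs a stale parent inode number before deciding on a checkpoint: file_wrong_pino() tests the FADVISE_LOST_PINO_BIT, get_parent_ino() resolves a parent through any cached dentry alias, and file_got_pino() clears the bit once i_pino is fixed. Reduced to a helper (fix_lost_pino() is an illustrative name; the logic follows the hunk above):

static int fix_lost_pino(struct inode *inode)
{
        nid_t pino;

        if (!file_wrong_pino(inode) || inode->i_nlink != 1)
                return 0;
        if (!get_parent_ino(inode, &pino))
                return 0;       /* no cached alias; leave it for later */

        F2FS_I(inode)->i_pino = pino;
        file_got_pino(inode);
        mark_inode_dirty_sync(inode);
        return f2fs_write_inode(inode, NULL);
}
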
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 14961593e93c..35f9b1a196aa 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -76,7 +76,9 @@ static int gc_thread_func(void *data)
76 else 76 else
77 wait_ms = increase_sleep_time(wait_ms); 77 wait_ms = increase_sleep_time(wait_ms);
78 78
79#ifdef CONFIG_F2FS_STAT_FS
79 sbi->bg_gc++; 80 sbi->bg_gc++;
81#endif
80 82
81 /* if return value is not zero, no victim was selected */ 83 /* if return value is not zero, no victim was selected */
82 if (f2fs_gc(sbi)) 84 if (f2fs_gc(sbi))
@@ -89,23 +91,28 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
89{ 91{
90 struct f2fs_gc_kthread *gc_th; 92 struct f2fs_gc_kthread *gc_th;
91 dev_t dev = sbi->sb->s_bdev->bd_dev; 93 dev_t dev = sbi->sb->s_bdev->bd_dev;
94 int err = 0;
92 95
93 if (!test_opt(sbi, BG_GC)) 96 if (!test_opt(sbi, BG_GC))
94 return 0; 97 goto out;
95 gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); 98 gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
96 if (!gc_th) 99 if (!gc_th) {
97 return -ENOMEM; 100 err = -ENOMEM;
101 goto out;
102 }
98 103
99 sbi->gc_thread = gc_th; 104 sbi->gc_thread = gc_th;
100 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); 105 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
101 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, 106 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
102 "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); 107 "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
103 if (IS_ERR(gc_th->f2fs_gc_task)) { 108 if (IS_ERR(gc_th->f2fs_gc_task)) {
109 err = PTR_ERR(gc_th->f2fs_gc_task);
104 kfree(gc_th); 110 kfree(gc_th);
105 sbi->gc_thread = NULL; 111 sbi->gc_thread = NULL;
106 return -ENOMEM;
107 } 112 }
108 return 0; 113
114out:
115 return err;
109} 116}
110 117
111void stop_gc_thread(struct f2fs_sb_info *sbi) 118void stop_gc_thread(struct f2fs_sb_info *sbi)
@@ -234,14 +241,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
234{ 241{
235 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 242 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
236 struct victim_sel_policy p; 243 struct victim_sel_policy p;
237 unsigned int secno; 244 unsigned int secno, max_cost;
238 int nsearched = 0; 245 int nsearched = 0;
239 246
240 p.alloc_mode = alloc_mode; 247 p.alloc_mode = alloc_mode;
241 select_policy(sbi, gc_type, type, &p); 248 select_policy(sbi, gc_type, type, &p);
242 249
243 p.min_segno = NULL_SEGNO; 250 p.min_segno = NULL_SEGNO;
244 p.min_cost = get_max_cost(sbi, &p); 251 p.min_cost = max_cost = get_max_cost(sbi, &p);
245 252
246 mutex_lock(&dirty_i->seglist_lock); 253 mutex_lock(&dirty_i->seglist_lock);
247 254
@@ -280,7 +287,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
280 p.min_cost = cost; 287 p.min_cost = cost;
281 } 288 }
282 289
283 if (cost == get_max_cost(sbi, &p)) 290 if (cost == max_cost)
284 continue; 291 continue;
285 292
286 if (nsearched++ >= MAX_VICTIM_SEARCH) { 293 if (nsearched++ >= MAX_VICTIM_SEARCH) {
@@ -288,8 +295,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
288 break; 295 break;
289 } 296 }
290 } 297 }
291got_it:
292 if (p.min_segno != NULL_SEGNO) { 298 if (p.min_segno != NULL_SEGNO) {
299got_it:
293 if (p.alloc_mode == LFS) { 300 if (p.alloc_mode == LFS) {
294 secno = GET_SECNO(sbi, p.min_segno); 301 secno = GET_SECNO(sbi, p.min_segno);
295 if (gc_type == FG_GC) 302 if (gc_type == FG_GC)
@@ -314,28 +321,21 @@ static const struct victim_selection default_v_ops = {
314 321
315static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) 322static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
316{ 323{
317 struct list_head *this;
318 struct inode_entry *ie; 324 struct inode_entry *ie;
319 325
320 list_for_each(this, ilist) { 326 list_for_each_entry(ie, ilist, list)
321 ie = list_entry(this, struct inode_entry, list);
322 if (ie->inode->i_ino == ino) 327 if (ie->inode->i_ino == ino)
323 return ie->inode; 328 return ie->inode;
324 }
325 return NULL; 329 return NULL;
326} 330}
327 331
328static void add_gc_inode(struct inode *inode, struct list_head *ilist) 332static void add_gc_inode(struct inode *inode, struct list_head *ilist)
329{ 333{
330 struct list_head *this; 334 struct inode_entry *new_ie;
331 struct inode_entry *new_ie, *ie;
332 335
333 list_for_each(this, ilist) { 336 if (inode == find_gc_inode(inode->i_ino, ilist)) {
334 ie = list_entry(this, struct inode_entry, list); 337 iput(inode);
335 if (ie->inode == inode) { 338 return;
336 iput(inode);
337 return;
338 }
339 } 339 }
340repeat: 340repeat:
341 new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); 341 new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
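
Two of the gc.c hunks wrap statistics counters in CONFIG_F2FS_STAT_FS so that builds without the stats code do not touch (or even need) the counter fields. A common way to keep call sites free of #ifdef clutter is a pair of inline helpers, sketched here under the assumption that sbi->bg_gc only exists when the option is enabled (stat_inc_bggc() is an illustrative name, not from the patch):

#ifdef CONFIG_F2FS_STAT_FS
static inline void stat_inc_bggc(struct f2fs_sb_info *sbi)
{
        sbi->bg_gc++;
}
#else
static inline void stat_inc_bggc(struct f2fs_sb_info *sbi) { }
#endif

With such a helper, gc_thread_func() could bump the counter unconditionally and the #ifdef would live in one place.
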
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 91ac7f9d88ee..2b2d45d19e3e 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -109,12 +109,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
109 ret = do_read_inode(inode); 109 ret = do_read_inode(inode);
110 if (ret) 110 if (ret)
111 goto bad_inode; 111 goto bad_inode;
112
113 if (!sbi->por_doing && inode->i_nlink == 0) {
114 ret = -ENOENT;
115 goto bad_inode;
116 }
117
118make_now: 112make_now:
119 if (ino == F2FS_NODE_INO(sbi)) { 113 if (ino == F2FS_NODE_INO(sbi)) {
120 inode->i_mapping->a_ops = &f2fs_node_aops; 114 inode->i_mapping->a_ops = &f2fs_node_aops;
@@ -130,8 +124,7 @@ make_now:
130 inode->i_op = &f2fs_dir_inode_operations; 124 inode->i_op = &f2fs_dir_inode_operations;
131 inode->i_fop = &f2fs_dir_operations; 125 inode->i_fop = &f2fs_dir_operations;
132 inode->i_mapping->a_ops = &f2fs_dblock_aops; 126 inode->i_mapping->a_ops = &f2fs_dblock_aops;
133 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE | 127 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
134 __GFP_ZERO);
135 } else if (S_ISLNK(inode->i_mode)) { 128 } else if (S_ISLNK(inode->i_mode)) {
136 inode->i_op = &f2fs_symlink_inode_operations; 129 inode->i_op = &f2fs_symlink_inode_operations;
137 inode->i_mapping->a_ops = &f2fs_dblock_aops; 130 inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -199,6 +192,7 @@ void update_inode(struct inode *inode, struct page *node_page)
199 192
200 set_cold_node(inode, node_page); 193 set_cold_node(inode, node_page);
201 set_page_dirty(node_page); 194 set_page_dirty(node_page);
195 clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
202} 196}
203 197
204int update_inode_page(struct inode *inode) 198int update_inode_page(struct inode *inode)
@@ -224,6 +218,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
224 inode->i_ino == F2FS_META_INO(sbi)) 218 inode->i_ino == F2FS_META_INO(sbi))
225 return 0; 219 return 0;
226 220
221 if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
222 return 0;
223
227 if (wbc) 224 if (wbc)
228 f2fs_balance_fs(sbi); 225 f2fs_balance_fs(sbi);
229 226
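
The inode.c changes make write-back conditional on a new FI_DIRTY_INODE flag: ->dirty_inode (added in super.c below) sets it, update_inode() clears it after copying fields into the node page, and f2fs_write_inode() becomes a no-op while it is clear. A condensed sketch of the resulting contract (flush_inode_if_dirty() is illustrative):

static int flush_inode_if_dirty(struct inode *inode)
{
        /* set by f2fs_dirty_inode(), cleared by update_inode() */
        if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE))
                return 0;
        return f2fs_write_inode(inode, NULL);
}
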
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 47abc9722b17..64c07169df05 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -112,7 +112,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
112 int count = le32_to_cpu(sbi->raw_super->extension_count); 112 int count = le32_to_cpu(sbi->raw_super->extension_count);
113 for (i = 0; i < count; i++) { 113 for (i = 0; i < count; i++) {
114 if (is_multimedia_file(name, extlist[i])) { 114 if (is_multimedia_file(name, extlist[i])) {
115 set_cold_file(inode); 115 file_set_cold(inode);
116 break; 116 break;
117 } 117 }
118 } 118 }
@@ -149,8 +149,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
149 149
150 alloc_nid_done(sbi, ino); 150 alloc_nid_done(sbi, ino);
151 151
152 if (!sbi->por_doing) 152 d_instantiate(dentry, inode);
153 d_instantiate(dentry, inode);
154 unlock_new_inode(inode); 153 unlock_new_inode(inode);
155 return 0; 154 return 0;
156out: 155out:
@@ -173,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
173 f2fs_balance_fs(sbi); 172 f2fs_balance_fs(sbi);
174 173
175 inode->i_ctime = CURRENT_TIME; 174 inode->i_ctime = CURRENT_TIME;
176 atomic_inc(&inode->i_count); 175 ihold(inode);
177 176
178 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 177 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
179 ilock = mutex_lock_op(sbi); 178 ilock = mutex_lock_op(sbi);
@@ -182,17 +181,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
182 if (err) 181 if (err)
183 goto out; 182 goto out;
184 183
185 /*
186 * This file should be checkpointed during fsync.
187 * We lost i_pino from now on.
188 */
189 set_cp_file(inode);
190
191 d_instantiate(dentry, inode); 184 d_instantiate(dentry, inode);
192 return 0; 185 return 0;
193out: 186out:
194 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 187 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
195 make_bad_inode(inode);
196 iput(inode); 188 iput(inode);
197 return err; 189 return err;
198} 190}
@@ -498,6 +490,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
498 .rmdir = f2fs_rmdir, 490 .rmdir = f2fs_rmdir,
499 .mknod = f2fs_mknod, 491 .mknod = f2fs_mknod,
500 .rename = f2fs_rename, 492 .rename = f2fs_rename,
493 .getattr = f2fs_getattr,
501 .setattr = f2fs_setattr, 494 .setattr = f2fs_setattr,
502 .get_acl = f2fs_get_acl, 495 .get_acl = f2fs_get_acl,
503#ifdef CONFIG_F2FS_FS_XATTR 496#ifdef CONFIG_F2FS_FS_XATTR
@@ -512,6 +505,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
512 .readlink = generic_readlink, 505 .readlink = generic_readlink,
513 .follow_link = page_follow_link_light, 506 .follow_link = page_follow_link_light,
514 .put_link = page_put_link, 507 .put_link = page_put_link,
508 .getattr = f2fs_getattr,
515 .setattr = f2fs_setattr, 509 .setattr = f2fs_setattr,
516#ifdef CONFIG_F2FS_FS_XATTR 510#ifdef CONFIG_F2FS_FS_XATTR
517 .setxattr = generic_setxattr, 511 .setxattr = generic_setxattr,
@@ -522,6 +516,7 @@ const struct inode_operations f2fs_symlink_inode_operations = {
522}; 516};
523 517
524const struct inode_operations f2fs_special_inode_operations = { 518const struct inode_operations f2fs_special_inode_operations = {
519 .getattr = f2fs_getattr,
525 .setattr = f2fs_setattr, 520 .setattr = f2fs_setattr,
526 .get_acl = f2fs_get_acl, 521 .get_acl = f2fs_get_acl,
527#ifdef CONFIG_F2FS_FS_XATTR 522#ifdef CONFIG_F2FS_FS_XATTR
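
f2fs_link() now takes its extra reference with ihold() rather than incrementing i_count by hand; ihold() is the VFS helper for bumping a reference on an inode the caller already holds, and it pairs with iput() on the failure path. The shape of the pattern, with add_name() standing in for the directory-entry insertion (illustrative, not from the patch):

static int link_pattern(struct inode *dir, struct dentry *dentry,
                        struct inode *inode)
{
        int err;

        ihold(inode);                   /* reference for the new name */
        err = add_name(dir, dentry, inode);
        if (err) {
                iput(inode);            /* balance ihold() on failure */
                return err;
        }
        d_instantiate(dentry, inode);
        return 0;
}
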
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3df43b4efd89..b418aee09573 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -408,10 +408,13 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
408 level = get_node_path(index, offset, noffset); 408 level = get_node_path(index, offset, noffset);
409 409
410 nids[0] = dn->inode->i_ino; 410 nids[0] = dn->inode->i_ino;
411 npage[0] = get_node_page(sbi, nids[0]); 411 npage[0] = dn->inode_page;
412 if (IS_ERR(npage[0]))
413 return PTR_ERR(npage[0]);
414 412
413 if (!npage[0]) {
414 npage[0] = get_node_page(sbi, nids[0]);
415 if (IS_ERR(npage[0]))
416 return PTR_ERR(npage[0]);
417 }
415 parent = npage[0]; 418 parent = npage[0];
416 if (level != 0) 419 if (level != 0)
417 nids[1] = get_nid(parent, offset[0], true); 420 nids[1] = get_nid(parent, offset[0], true);
@@ -430,7 +433,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
430 } 433 }
431 434
432 dn->nid = nids[i]; 435 dn->nid = nids[i];
433 npage[i] = new_node_page(dn, noffset[i]); 436 npage[i] = new_node_page(dn, noffset[i], NULL);
434 if (IS_ERR(npage[i])) { 437 if (IS_ERR(npage[i])) {
435 alloc_nid_failed(sbi, nids[i]); 438 alloc_nid_failed(sbi, nids[i]);
436 err = PTR_ERR(npage[i]); 439 err = PTR_ERR(npage[i]);
@@ -803,22 +806,19 @@ int remove_inode_page(struct inode *inode)
803 return 0; 806 return 0;
804} 807}
805 808
806int new_inode_page(struct inode *inode, const struct qstr *name) 809struct page *new_inode_page(struct inode *inode, const struct qstr *name)
807{ 810{
808 struct page *page;
809 struct dnode_of_data dn; 811 struct dnode_of_data dn;
810 812
811 /* allocate inode page for new inode */ 813 /* allocate inode page for new inode */
812 set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); 814 set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
813 page = new_node_page(&dn, 0); 815
814 init_dent_inode(name, page); 816 /* caller should f2fs_put_page(page, 1); */
815 if (IS_ERR(page)) 817 return new_node_page(&dn, 0, NULL);
816 return PTR_ERR(page);
817 f2fs_put_page(page, 1);
818 return 0;
819} 818}
820 819
821struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) 820struct page *new_node_page(struct dnode_of_data *dn,
821 unsigned int ofs, struct page *ipage)
822{ 822{
823 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 823 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
824 struct address_space *mapping = sbi->node_inode->i_mapping; 824 struct address_space *mapping = sbi->node_inode->i_mapping;
@@ -851,7 +851,10 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
851 set_cold_node(dn->inode, page); 851 set_cold_node(dn->inode, page);
852 852
853 dn->node_page = page; 853 dn->node_page = page;
854 sync_inode_page(dn); 854 if (ipage)
855 update_inode(dn->inode, ipage);
856 else
857 sync_inode_page(dn);
855 set_page_dirty(page); 858 set_page_dirty(page);
856 if (ofs == 0) 859 if (ofs == 0)
857 inc_valid_inode_count(sbi); 860 inc_valid_inode_count(sbi);
@@ -1205,7 +1208,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
1205 return 0; 1208 return 0;
1206} 1209}
1207 1210
1208static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) 1211static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
1212 unsigned int length)
1209{ 1213{
1210 struct inode *inode = page->mapping->host; 1214 struct inode *inode = page->mapping->host;
1211 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1215 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -1492,9 +1496,10 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1492 new_ni = old_ni; 1496 new_ni = old_ni;
1493 new_ni.ino = ino; 1497 new_ni.ino = ino;
1494 1498
1499 if (!inc_valid_node_count(sbi, NULL, 1))
1500 WARN_ON(1);
1495 set_node_addr(sbi, &new_ni, NEW_ADDR); 1501 set_node_addr(sbi, &new_ni, NEW_ADDR);
1496 inc_valid_inode_count(sbi); 1502 inc_valid_inode_count(sbi);
1497
1498 f2fs_put_page(ipage, 1); 1503 f2fs_put_page(ipage, 1);
1499 return 0; 1504 return 0;
1500} 1505}
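
new_inode_page() now hands the locked inode page back to the caller instead of initializing and releasing it internally, per the "caller should f2fs_put_page(page, 1);" comment. A sketch of the calling convention that implies (make_inode_page() is an illustrative wrapper):

static int make_inode_page(struct inode *inode, const struct qstr *name)
{
        struct page *page;

        page = new_inode_page(inode, name);
        if (IS_ERR(page))
                return PTR_ERR(page);

        init_dent_inode(name, page);    /* caller now does the init */
        f2fs_put_page(page, 1);         /* unlock and release */
        return 0;
}
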
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 0a2d72f0024d..c65fb4f4230f 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -275,25 +275,27 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
275 * - Mark cold node blocks in their node footer 275 * - Mark cold node blocks in their node footer
276 * - Mark cold data pages in page cache 276 * - Mark cold data pages in page cache
277 */ 277 */
278static inline int is_cold_file(struct inode *inode) 278static inline int is_file(struct inode *inode, int type)
279{ 279{
280 return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT; 280 return F2FS_I(inode)->i_advise & type;
281} 281}
282 282
283static inline void set_cold_file(struct inode *inode) 283static inline void set_file(struct inode *inode, int type)
284{ 284{
285 F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT; 285 F2FS_I(inode)->i_advise |= type;
286} 286}
287 287
288static inline int is_cp_file(struct inode *inode) 288static inline void clear_file(struct inode *inode, int type)
289{ 289{
290 return F2FS_I(inode)->i_advise & FADVISE_CP_BIT; 290 F2FS_I(inode)->i_advise &= ~type;
291} 291}
292 292
293static inline void set_cp_file(struct inode *inode) 293#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT)
294{ 294#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT)
295 F2FS_I(inode)->i_advise |= FADVISE_CP_BIT; 295#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT)
296} 296#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT)
297#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT)
298#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT)
297 299
298static inline int is_cold_data(struct page *page) 300static inline int is_cold_data(struct page *page)
299{ 301{
@@ -310,29 +312,16 @@ static inline void clear_cold_data(struct page *page)
310 ClearPageChecked(page); 312 ClearPageChecked(page);
311} 313}
312 314
313static inline int is_cold_node(struct page *page) 315static inline int is_node(struct page *page, int type)
314{ 316{
315 void *kaddr = page_address(page); 317 void *kaddr = page_address(page);
316 struct f2fs_node *rn = (struct f2fs_node *)kaddr; 318 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
317 unsigned int flag = le32_to_cpu(rn->footer.flag); 319 return le32_to_cpu(rn->footer.flag) & (1 << type);
318 return flag & (0x1 << COLD_BIT_SHIFT);
319} 320}
320 321
321static inline unsigned char is_fsync_dnode(struct page *page) 322#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT)
322{ 323#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
323 void *kaddr = page_address(page); 324#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
324 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
325 unsigned int flag = le32_to_cpu(rn->footer.flag);
326 return flag & (0x1 << FSYNC_BIT_SHIFT);
327}
328
329static inline unsigned char is_dent_dnode(struct page *page)
330{
331 void *kaddr = page_address(page);
332 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
333 unsigned int flag = le32_to_cpu(rn->footer.flag);
334 return flag & (0x1 << DENT_BIT_SHIFT);
335}
336 325
337static inline void set_cold_node(struct inode *inode, struct page *page) 326static inline void set_cold_node(struct inode *inode, struct page *page)
338{ 327{
@@ -346,26 +335,15 @@ static inline void set_cold_node(struct inode *inode, struct page *page)
346 rn->footer.flag = cpu_to_le32(flag); 335 rn->footer.flag = cpu_to_le32(flag);
347} 336}
348 337
349static inline void set_fsync_mark(struct page *page, int mark) 338static inline void set_mark(struct page *page, int mark, int type)
350{ 339{
351 void *kaddr = page_address(page); 340 struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
352 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
353 unsigned int flag = le32_to_cpu(rn->footer.flag);
354 if (mark)
355 flag |= (0x1 << FSYNC_BIT_SHIFT);
356 else
357 flag &= ~(0x1 << FSYNC_BIT_SHIFT);
358 rn->footer.flag = cpu_to_le32(flag);
359}
360
361static inline void set_dentry_mark(struct page *page, int mark)
362{
363 void *kaddr = page_address(page);
364 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
365 unsigned int flag = le32_to_cpu(rn->footer.flag); 341 unsigned int flag = le32_to_cpu(rn->footer.flag);
366 if (mark) 342 if (mark)
367 flag |= (0x1 << DENT_BIT_SHIFT); 343 flag |= (0x1 << type);
368 else 344 else
369 flag &= ~(0x1 << DENT_BIT_SHIFT); 345 flag &= ~(0x1 << type);
370 rn->footer.flag = cpu_to_le32(flag); 346 rn->footer.flag = cpu_to_le32(flag);
371} 347}
348#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT)
349#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT)
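
The node.h refactor folds three nearly identical footer-flag accessors into one parameterized helper plus #define aliases, so old call sites compile unchanged. The same technique in miniature, on a plain flag word (all names here are illustrative):

#include <linux/types.h>

static inline int flag_test(u32 flags, int shift)
{
        return flags & (1 << shift);
}

static inline u32 flag_mark(u32 flags, int mark, int shift)
{
        return mark ? (flags | (1 << shift)) : (flags & ~(1 << shift));
}

#define flag_is_cold(f)         flag_test(f, 0)         /* like COLD_BIT_SHIFT */
#define flag_set_cold(f)        flag_mark(f, 1, 0)
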
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 60c8a5097058..d56d951c2253 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,36 +40,54 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
40 40
41static int recover_dentry(struct page *ipage, struct inode *inode) 41static int recover_dentry(struct page *ipage, struct inode *inode)
42{ 42{
43 struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage); 43 void *kaddr = page_address(ipage);
44 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
44 struct f2fs_inode *raw_inode = &(raw_node->i); 45 struct f2fs_inode *raw_inode = &(raw_node->i);
45 struct qstr name; 46 nid_t pino = le32_to_cpu(raw_inode->i_pino);
46 struct f2fs_dir_entry *de; 47 struct f2fs_dir_entry *de;
48 struct qstr name;
47 struct page *page; 49 struct page *page;
48 struct inode *dir; 50 struct inode *dir, *einode;
49 int err = 0; 51 int err = 0;
50 52
51 if (!is_dent_dnode(ipage)) 53 dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino);
52 goto out; 54 if (!dir) {
53 55 dir = f2fs_iget(inode->i_sb, pino);
54 dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino)); 56 if (IS_ERR(dir)) {
55 if (IS_ERR(dir)) { 57 err = PTR_ERR(dir);
56 err = PTR_ERR(dir); 58 goto out;
57 goto out; 59 }
60 set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
61 add_dirty_dir_inode(dir);
58 } 62 }
59 63
60 name.len = le32_to_cpu(raw_inode->i_namelen); 64 name.len = le32_to_cpu(raw_inode->i_namelen);
61 name.name = raw_inode->i_name; 65 name.name = raw_inode->i_name;
62 66retry:
63 de = f2fs_find_entry(dir, &name, &page); 67 de = f2fs_find_entry(dir, &name, &page);
64 if (de) { 68 if (de && inode->i_ino == le32_to_cpu(de->ino)) {
65 kunmap(page); 69 kunmap(page);
66 f2fs_put_page(page, 0); 70 f2fs_put_page(page, 0);
67 } else { 71 goto out;
68 err = __f2fs_add_link(dir, &name, inode); 72 }
73 if (de) {
74 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
75 if (IS_ERR(einode)) {
76 WARN_ON(1);
77 if (PTR_ERR(einode) == -ENOENT)
78 err = -EEXIST;
79 goto out;
80 }
81 f2fs_delete_entry(de, page, einode);
82 iput(einode);
83 goto retry;
69 } 84 }
70 iput(dir); 85 err = __f2fs_add_link(dir, &name, inode);
71out: 86out:
72 kunmap(ipage); 87 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: "
88 "ino = %x, name = %s, dir = %lx, err = %d",
89 ino_of_node(ipage), raw_inode->i_name,
90 IS_ERR(dir) ? 0 : dir->i_ino, err);
73 return err; 91 return err;
74} 92}
75 93
@@ -79,6 +97,9 @@ static int recover_inode(struct inode *inode, struct page *node_page)
79 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; 97 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
80 struct f2fs_inode *raw_inode = &(raw_node->i); 98 struct f2fs_inode *raw_inode = &(raw_node->i);
81 99
100 if (!IS_INODE(node_page))
101 return 0;
102
82 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 103 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
83 i_size_write(inode, le64_to_cpu(raw_inode->i_size)); 104 i_size_write(inode, le64_to_cpu(raw_inode->i_size));
84 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 105 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
@@ -88,7 +109,12 @@ static int recover_inode(struct inode *inode, struct page *node_page)
88 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); 109 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
89 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 110 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
90 111
91 return recover_dentry(node_page, inode); 112 if (is_dent_dnode(node_page))
113 return recover_dentry(node_page, inode);
114
115 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
116 ino_of_node(node_page), raw_inode->i_name);
117 return 0;
92} 118}
93 119
94static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) 120static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
@@ -119,14 +145,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
119 lock_page(page); 145 lock_page(page);
120 146
121 if (cp_ver != cpver_of_node(page)) 147 if (cp_ver != cpver_of_node(page))
122 goto unlock_out; 148 break;
123 149
124 if (!is_fsync_dnode(page)) 150 if (!is_fsync_dnode(page))
125 goto next; 151 goto next;
126 152
127 entry = get_fsync_inode(head, ino_of_node(page)); 153 entry = get_fsync_inode(head, ino_of_node(page));
128 if (entry) { 154 if (entry) {
129 entry->blkaddr = blkaddr;
130 if (IS_INODE(page) && is_dent_dnode(page)) 155 if (IS_INODE(page) && is_dent_dnode(page))
131 set_inode_flag(F2FS_I(entry->inode), 156 set_inode_flag(F2FS_I(entry->inode),
132 FI_INC_LINK); 157 FI_INC_LINK);
@@ -134,48 +159,40 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
134 if (IS_INODE(page) && is_dent_dnode(page)) { 159 if (IS_INODE(page) && is_dent_dnode(page)) {
135 err = recover_inode_page(sbi, page); 160 err = recover_inode_page(sbi, page);
136 if (err) 161 if (err)
137 goto unlock_out; 162 break;
138 } 163 }
139 164
140 /* add this fsync inode to the list */ 165 /* add this fsync inode to the list */
141 entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); 166 entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
142 if (!entry) { 167 if (!entry) {
143 err = -ENOMEM; 168 err = -ENOMEM;
144 goto unlock_out; 169 break;
145 } 170 }
146 171
147 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); 172 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
148 if (IS_ERR(entry->inode)) { 173 if (IS_ERR(entry->inode)) {
149 err = PTR_ERR(entry->inode); 174 err = PTR_ERR(entry->inode);
150 kmem_cache_free(fsync_entry_slab, entry); 175 kmem_cache_free(fsync_entry_slab, entry);
151 goto unlock_out; 176 break;
152 } 177 }
153
154 list_add_tail(&entry->list, head); 178 list_add_tail(&entry->list, head);
155 entry->blkaddr = blkaddr;
156 }
157 if (IS_INODE(page)) {
158 err = recover_inode(entry->inode, page);
159 if (err == -ENOENT) {
160 goto next;
161 } else if (err) {
162 err = -EINVAL;
163 goto unlock_out;
164 }
165 } 179 }
180 entry->blkaddr = blkaddr;
181
182 err = recover_inode(entry->inode, page);
183 if (err && err != -ENOENT)
184 break;
166next: 185next:
167 /* check next segment */ 186 /* check next segment */
168 blkaddr = next_blkaddr_of_node(page); 187 blkaddr = next_blkaddr_of_node(page);
169 } 188 }
170unlock_out:
171 unlock_page(page); 189 unlock_page(page);
172out: 190out:
173 __free_pages(page, 0); 191 __free_pages(page, 0);
174 return err; 192 return err;
175} 193}
176 194
177static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, 195static void destroy_fsync_dnodes(struct list_head *head)
178 struct list_head *head)
179{ 196{
180 struct fsync_inode_entry *entry, *tmp; 197 struct fsync_inode_entry *entry, *tmp;
181 198
@@ -186,15 +203,15 @@ static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
186 } 203 }
187} 204}
188 205
189static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, 206static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
190 block_t blkaddr) 207 block_t blkaddr, struct dnode_of_data *dn)
191{ 208{
192 struct seg_entry *sentry; 209 struct seg_entry *sentry;
193 unsigned int segno = GET_SEGNO(sbi, blkaddr); 210 unsigned int segno = GET_SEGNO(sbi, blkaddr);
194 unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & 211 unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
195 (sbi->blocks_per_seg - 1); 212 (sbi->blocks_per_seg - 1);
196 struct f2fs_summary sum; 213 struct f2fs_summary sum;
197 nid_t ino; 214 nid_t ino, nid;
198 void *kaddr; 215 void *kaddr;
199 struct inode *inode; 216 struct inode *inode;
200 struct page *node_page; 217 struct page *node_page;
@@ -203,7 +220,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
203 220
204 sentry = get_seg_entry(sbi, segno); 221 sentry = get_seg_entry(sbi, segno);
205 if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) 222 if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
206 return; 223 return 0;
207 224
208 /* Get the previous summary */ 225 /* Get the previous summary */
209 for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { 226 for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
@@ -222,20 +239,39 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
222 f2fs_put_page(sum_page, 1); 239 f2fs_put_page(sum_page, 1);
223 } 240 }
224 241
242 /* Use the locked dnode page and inode */
243 nid = le32_to_cpu(sum.nid);
244 if (dn->inode->i_ino == nid) {
245 struct dnode_of_data tdn = *dn;
246 tdn.nid = nid;
247 tdn.node_page = dn->inode_page;
248 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
249 truncate_data_blocks_range(&tdn, 1);
250 return 0;
251 } else if (dn->nid == nid) {
252 struct dnode_of_data tdn = *dn;
253 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
254 truncate_data_blocks_range(&tdn, 1);
255 return 0;
256 }
257
225 /* Get the node page */ 258 /* Get the node page */
226 node_page = get_node_page(sbi, le32_to_cpu(sum.nid)); 259 node_page = get_node_page(sbi, nid);
260 if (IS_ERR(node_page))
261 return PTR_ERR(node_page);
227 bidx = start_bidx_of_node(ofs_of_node(node_page)) + 262 bidx = start_bidx_of_node(ofs_of_node(node_page)) +
228 le16_to_cpu(sum.ofs_in_node); 263 le16_to_cpu(sum.ofs_in_node);
229 ino = ino_of_node(node_page); 264 ino = ino_of_node(node_page);
230 f2fs_put_page(node_page, 1); 265 f2fs_put_page(node_page, 1);
231 266
232 /* Deallocate previous index in the node page */ 267 /* Deallocate previous index in the node page */
233 inode = f2fs_iget(sbi->sb, ino); 268 inode = f2fs_iget(sbi->sb, ino);
234 if (IS_ERR(inode)) 269 if (IS_ERR(inode))
235 return; 270 return PTR_ERR(inode);
236 271
237 truncate_hole(inode, bidx, bidx + 1); 272 truncate_hole(inode, bidx, bidx + 1);
238 iput(inode); 273 iput(inode);
274 return 0;
239} 275}
240 276
241static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, 277static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
@@ -245,7 +281,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
245 struct dnode_of_data dn; 281 struct dnode_of_data dn;
246 struct f2fs_summary sum; 282 struct f2fs_summary sum;
247 struct node_info ni; 283 struct node_info ni;
248 int err = 0; 284 int err = 0, recovered = 0;
249 int ilock; 285 int ilock;
250 286
251 start = start_bidx_of_node(ofs_of_node(page)); 287 start = start_bidx_of_node(ofs_of_node(page));
@@ -283,13 +319,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
283 } 319 }
284 320
285 /* Check the previous node page having this index */ 321 /* Check the previous node page having this index */
286 check_index_in_prev_nodes(sbi, dest); 322 err = check_index_in_prev_nodes(sbi, dest, &dn);
323 if (err)
324 goto err;
287 325
288 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); 326 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
289 327
290 /* write dummy data page */ 328 /* write dummy data page */
291 recover_data_page(sbi, NULL, &sum, src, dest); 329 recover_data_page(sbi, NULL, &sum, src, dest);
292 update_extent_cache(dest, &dn); 330 update_extent_cache(dest, &dn);
331 recovered++;
293 } 332 }
294 dn.ofs_in_node++; 333 dn.ofs_in_node++;
295 } 334 }
@@ -305,9 +344,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
305 set_page_dirty(dn.node_page); 344 set_page_dirty(dn.node_page);
306 345
307 recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); 346 recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
347err:
308 f2fs_put_dnode(&dn); 348 f2fs_put_dnode(&dn);
309 mutex_unlock_op(sbi, ilock); 349 mutex_unlock_op(sbi, ilock);
310 return 0; 350
351 f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, "
352 "recovered_data = %d blocks, err = %d",
353 inode->i_ino, recovered, err);
354 return err;
311} 355}
312 356
313static int recover_data(struct f2fs_sb_info *sbi, 357static int recover_data(struct f2fs_sb_info *sbi,
@@ -340,7 +384,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
340 lock_page(page); 384 lock_page(page);
341 385
342 if (cp_ver != cpver_of_node(page)) 386 if (cp_ver != cpver_of_node(page))
343 goto unlock_out; 387 break;
344 388
345 entry = get_fsync_inode(head, ino_of_node(page)); 389 entry = get_fsync_inode(head, ino_of_node(page));
346 if (!entry) 390 if (!entry)
@@ -348,7 +392,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
348 392
349 err = do_recover_data(sbi, entry->inode, page, blkaddr); 393 err = do_recover_data(sbi, entry->inode, page, blkaddr);
350 if (err) 394 if (err)
351 goto out; 395 break;
352 396
353 if (entry->blkaddr == blkaddr) { 397 if (entry->blkaddr == blkaddr) {
354 iput(entry->inode); 398 iput(entry->inode);
@@ -359,7 +403,6 @@ next:
359 /* check next segment */ 403 /* check next segment */
360 blkaddr = next_blkaddr_of_node(page); 404 blkaddr = next_blkaddr_of_node(page);
361 } 405 }
362unlock_out:
363 unlock_page(page); 406 unlock_page(page);
364out: 407out:
365 __free_pages(page, 0); 408 __free_pages(page, 0);
@@ -382,6 +425,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
382 INIT_LIST_HEAD(&inode_list); 425 INIT_LIST_HEAD(&inode_list);
383 426
384 /* step #1: find fsynced inode numbers */ 427 /* step #1: find fsynced inode numbers */
428 sbi->por_doing = 1;
385 err = find_fsync_dnodes(sbi, &inode_list); 429 err = find_fsync_dnodes(sbi, &inode_list);
386 if (err) 430 if (err)
387 goto out; 431 goto out;
@@ -390,13 +434,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
390 goto out; 434 goto out;
391 435
392 /* step #2: recover data */ 436 /* step #2: recover data */
393 sbi->por_doing = 1;
394 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); 437 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
395 sbi->por_doing = 0;
396 BUG_ON(!list_empty(&inode_list)); 438 BUG_ON(!list_empty(&inode_list));
397out: 439out:
398 destroy_fsync_dnodes(sbi, &inode_list); 440 destroy_fsync_dnodes(&inode_list);
399 kmem_cache_destroy(fsync_entry_slab); 441 kmem_cache_destroy(fsync_entry_slab);
400 write_checkpoint(sbi, false); 442 sbi->por_doing = 0;
443 if (!err)
444 write_checkpoint(sbi, false);
401 return err; 445 return err;
402} 446}
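
recover_dentry() above gains a retry loop: if the recovered name already points at a different inode, the stale entry is deleted and the lookup repeated before the new link is added. Distilled control flow (lookup_entry(), remove_entry() and add_link() stand in for f2fs_find_entry(), f2fs_delete_entry() and __f2fs_add_link(); the real code also resolves and iputs the conflicting inode and releases the found page):

static int relink_recovered_name(struct inode *dir, struct qstr *name,
                                 struct inode *inode)
{
        struct f2fs_dir_entry *de;
        struct page *page;

        for (;;) {
                de = lookup_entry(dir, name, &page);
                if (!de)
                        break;                  /* name is free now */
                if (le32_to_cpu(de->ino) == inode->i_ino)
                        return 0;               /* already linked */
                remove_entry(de, page);         /* evict the stale entry */
        }
        return add_link(dir, name, inode);
}
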
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d8e84e49a5c3..a86d125a9885 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -94,7 +94,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
94 * Adding dirty entry into seglist is not critical operation. 94 * Adding dirty entry into seglist is not critical operation.
95 * If a given segment is one of current working segments, it won't be added. 95 * If a given segment is one of current working segments, it won't be added.
96 */ 96 */
97void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) 97static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
98{ 98{
99 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 99 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
100 unsigned short valid_blocks; 100 unsigned short valid_blocks;
@@ -126,17 +126,16 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
126static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) 126static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
127{ 127{
128 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 128 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
129 unsigned int segno, offset = 0; 129 unsigned int segno = -1;
130 unsigned int total_segs = TOTAL_SEGS(sbi); 130 unsigned int total_segs = TOTAL_SEGS(sbi);
131 131
132 mutex_lock(&dirty_i->seglist_lock); 132 mutex_lock(&dirty_i->seglist_lock);
133 while (1) { 133 while (1) {
134 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, 134 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
135 offset); 135 segno + 1);
136 if (segno >= total_segs) 136 if (segno >= total_segs)
137 break; 137 break;
138 __set_test_and_free(sbi, segno); 138 __set_test_and_free(sbi, segno);
139 offset = segno + 1;
140 } 139 }
141 mutex_unlock(&dirty_i->seglist_lock); 140 mutex_unlock(&dirty_i->seglist_lock);
142} 141}
@@ -144,17 +143,16 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
144void clear_prefree_segments(struct f2fs_sb_info *sbi) 143void clear_prefree_segments(struct f2fs_sb_info *sbi)
145{ 144{
146 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 145 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
147 unsigned int segno, offset = 0; 146 unsigned int segno = -1;
148 unsigned int total_segs = TOTAL_SEGS(sbi); 147 unsigned int total_segs = TOTAL_SEGS(sbi);
149 148
150 mutex_lock(&dirty_i->seglist_lock); 149 mutex_lock(&dirty_i->seglist_lock);
151 while (1) { 150 while (1) {
152 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, 151 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
153 offset); 152 segno + 1);
154 if (segno >= total_segs) 153 if (segno >= total_segs)
155 break; 154 break;
156 155
157 offset = segno + 1;
158 if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) 156 if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
159 dirty_i->nr_dirty[PRE]--; 157 dirty_i->nr_dirty[PRE]--;
160 158
@@ -257,11 +255,11 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
257 * This function should be resided under the curseg_mutex lock 255 * This function should be resided under the curseg_mutex lock
258 */ 256 */
259static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, 257static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
260 struct f2fs_summary *sum, unsigned short offset) 258 struct f2fs_summary *sum)
261{ 259{
262 struct curseg_info *curseg = CURSEG_I(sbi, type); 260 struct curseg_info *curseg = CURSEG_I(sbi, type);
263 void *addr = curseg->sum_blk; 261 void *addr = curseg->sum_blk;
264 addr += offset * sizeof(struct f2fs_summary); 262 addr += curseg->next_blkoff * sizeof(struct f2fs_summary);
265 memcpy(addr, sum, sizeof(struct f2fs_summary)); 263 memcpy(addr, sum, sizeof(struct f2fs_summary));
266 return; 264 return;
267} 265}
@@ -311,64 +309,14 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
311 f2fs_put_page(page, 1); 309 f2fs_put_page(page, 1);
312} 310}
313 311
314static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, int type)
315{
316 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
317 unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
318 unsigned int segno;
319 unsigned int ofs = 0;
320
321 /*
322 * If there is not enough reserved sections,
323 * we should not reuse prefree segments.
324 */
325 if (has_not_enough_free_secs(sbi, 0))
326 return NULL_SEGNO;
327
328 /*
329 * NODE page should not reuse prefree segment,
330 * since those information is used for SPOR.
331 */
332 if (IS_NODESEG(type))
333 return NULL_SEGNO;
334next:
335 segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs);
336 ofs += sbi->segs_per_sec;
337
338 if (segno < TOTAL_SEGS(sbi)) {
339 int i;
340
341 /* skip intermediate segments in a section */
342 if (segno % sbi->segs_per_sec)
343 goto next;
344
345 /* skip if the section is currently used */
346 if (sec_usage_check(sbi, GET_SECNO(sbi, segno)))
347 goto next;
348
349 /* skip if whole section is not prefree */
350 for (i = 1; i < sbi->segs_per_sec; i++)
351 if (!test_bit(segno + i, prefree_segmap))
352 goto next;
353
354 /* skip if whole section was not free at the last checkpoint */
355 for (i = 0; i < sbi->segs_per_sec; i++)
356 if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks)
357 goto next;
358
359 return segno;
360 }
361 return NULL_SEGNO;
362}
363
364static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) 312static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
365{ 313{
366 struct curseg_info *curseg = CURSEG_I(sbi, type); 314 struct curseg_info *curseg = CURSEG_I(sbi, type);
367 unsigned int segno = curseg->segno; 315 unsigned int segno = curseg->segno + 1;
368 struct free_segmap_info *free_i = FREE_I(sbi); 316 struct free_segmap_info *free_i = FREE_I(sbi);
369 317
370 if (segno + 1 < TOTAL_SEGS(sbi) && (segno + 1) % sbi->segs_per_sec) 318 if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec)
371 return !test_bit(segno + 1, free_i->free_segmap); 319 return !test_bit(segno, free_i->free_segmap);
372 return 0; 320 return 0;
373} 321}
374 322
@@ -495,7 +443,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
495 int dir = ALLOC_LEFT; 443 int dir = ALLOC_LEFT;
496 444
497 write_sum_page(sbi, curseg->sum_blk, 445 write_sum_page(sbi, curseg->sum_blk,
498 GET_SUM_BLOCK(sbi, curseg->segno)); 446 GET_SUM_BLOCK(sbi, segno));
499 if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) 447 if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
500 dir = ALLOC_RIGHT; 448 dir = ALLOC_RIGHT;
501 449
@@ -599,11 +547,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
599 goto out; 547 goto out;
600 } 548 }
601 549
602 curseg->next_segno = check_prefree_segments(sbi, type); 550 if (type == CURSEG_WARM_NODE)
603
604 if (curseg->next_segno != NULL_SEGNO)
605 change_curseg(sbi, type, false);
606 else if (type == CURSEG_WARM_NODE)
607 new_curseg(sbi, type, false); 551 new_curseg(sbi, type, false);
608 else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) 552 else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
609 new_curseg(sbi, type, false); 553 new_curseg(sbi, type, false);
@@ -612,7 +556,10 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
612 else 556 else
613 new_curseg(sbi, type, false); 557 new_curseg(sbi, type, false);
614out: 558out:
559#ifdef CONFIG_F2FS_STAT_FS
615 sbi->segment_count[curseg->alloc_type]++; 560 sbi->segment_count[curseg->alloc_type]++;
561#endif
562 return;
616} 563}
617 564
618void allocate_new_segments(struct f2fs_sb_info *sbi) 565void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -795,7 +742,7 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
795 742
796 if (S_ISDIR(inode->i_mode)) 743 if (S_ISDIR(inode->i_mode))
797 return CURSEG_HOT_DATA; 744 return CURSEG_HOT_DATA;
798 else if (is_cold_data(page) || is_cold_file(inode)) 745 else if (is_cold_data(page) || file_is_cold(inode))
799 return CURSEG_COLD_DATA; 746 return CURSEG_COLD_DATA;
800 else 747 else
801 return CURSEG_WARM_DATA; 748 return CURSEG_WARM_DATA;
@@ -844,11 +791,13 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
844 * because, this function updates a summary entry in the 791 * because, this function updates a summary entry in the
845 * current summary block. 792 * current summary block.
846 */ 793 */
847 __add_sum_entry(sbi, type, sum, curseg->next_blkoff); 794 __add_sum_entry(sbi, type, sum);
848 795
849 mutex_lock(&sit_i->sentry_lock); 796 mutex_lock(&sit_i->sentry_lock);
850 __refresh_next_blkoff(sbi, curseg); 797 __refresh_next_blkoff(sbi, curseg);
798#ifdef CONFIG_F2FS_STAT_FS
851 sbi->block_count[curseg->alloc_type]++; 799 sbi->block_count[curseg->alloc_type]++;
800#endif
852 801
853 /* 802 /*
854 * SIT information should be updated before segment allocation, 803 * SIT information should be updated before segment allocation,
@@ -943,7 +892,7 @@ void recover_data_page(struct f2fs_sb_info *sbi,
943 892
944 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 893 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
945 (sbi->blocks_per_seg - 1); 894 (sbi->blocks_per_seg - 1);
946 __add_sum_entry(sbi, type, sum, curseg->next_blkoff); 895 __add_sum_entry(sbi, type, sum);
947 896
948 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 897 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
949 898
@@ -980,7 +929,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
980 } 929 }
981 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 930 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
982 (sbi->blocks_per_seg - 1); 931 (sbi->blocks_per_seg - 1);
983 __add_sum_entry(sbi, type, sum, curseg->next_blkoff); 932 __add_sum_entry(sbi, type, sum);
984 933
985 /* change the current log to the next block addr in advance */ 934 /* change the current log to the next block addr in advance */
986 if (next_segno != segno) { 935 if (next_segno != segno) {
@@ -1579,13 +1528,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1579{ 1528{
1580 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 1529 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1581 struct free_segmap_info *free_i = FREE_I(sbi); 1530 struct free_segmap_info *free_i = FREE_I(sbi);
1582 unsigned int segno = 0, offset = 0; 1531 unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi);
1583 unsigned short valid_blocks; 1532 unsigned short valid_blocks;
1584 1533
1585 while (segno < TOTAL_SEGS(sbi)) { 1534 while (1) {
1586 /* find dirty segment based on free segmap */ 1535 /* find dirty segment based on free segmap */
1587 segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset); 1536 segno = find_next_inuse(free_i, total_segs, offset);
1588 if (segno >= TOTAL_SEGS(sbi)) 1537 if (segno >= total_segs)
1589 break; 1538 break;
1590 offset = segno + 1; 1539 offset = segno + 1;
1591 valid_blocks = get_valid_blocks(sbi, segno, 0); 1540 valid_blocks = get_valid_blocks(sbi, segno, 0);
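
Both prefree-segment walks in segment.c drop their separate offset variable: segno starts at (unsigned int)-1, so segno + 1 wraps to 0 on the first find_next_bit() call and thereafter resumes just past the previous hit. The idiom in isolation:

static void walk_set_bits(unsigned long *bitmap, unsigned int total_segs)
{
        unsigned int segno = -1;        /* (unsigned)-1 + 1 wraps to 0 */

        while (1) {
                segno = find_next_bit(bitmap, total_segs, segno + 1);
                if (segno >= total_segs)
                        break;
                /* ... act on 'segno' ... */
        }
}
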
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8555f7df82c7..75c7dc363e92 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -34,7 +34,7 @@
34static struct kmem_cache *f2fs_inode_cachep; 34static struct kmem_cache *f2fs_inode_cachep;
35 35
36enum { 36enum {
37 Opt_gc_background_off, 37 Opt_gc_background,
38 Opt_disable_roll_forward, 38 Opt_disable_roll_forward,
39 Opt_discard, 39 Opt_discard,
40 Opt_noheap, 40 Opt_noheap,
@@ -46,7 +46,7 @@ enum {
46}; 46};
47 47
48static match_table_t f2fs_tokens = { 48static match_table_t f2fs_tokens = {
49 {Opt_gc_background_off, "background_gc_off"}, 49 {Opt_gc_background, "background_gc=%s"},
50 {Opt_disable_roll_forward, "disable_roll_forward"}, 50 {Opt_disable_roll_forward, "disable_roll_forward"},
51 {Opt_discard, "discard"}, 51 {Opt_discard, "discard"},
52 {Opt_noheap, "no_heap"}, 52 {Opt_noheap, "no_heap"},
@@ -76,6 +76,91 @@ static void init_once(void *foo)
76 inode_init_once(&fi->vfs_inode); 76 inode_init_once(&fi->vfs_inode);
77} 77}
78 78
79static int parse_options(struct super_block *sb, char *options)
80{
81 struct f2fs_sb_info *sbi = F2FS_SB(sb);
82 substring_t args[MAX_OPT_ARGS];
83 char *p, *name;
84 int arg = 0;
85
86 if (!options)
87 return 0;
88
89 while ((p = strsep(&options, ",")) != NULL) {
90 int token;
91 if (!*p)
92 continue;
93 /*
94 * Initialize args struct so we know whether arg was
95 * found; some options take optional arguments.
96 */
97 args[0].to = args[0].from = NULL;
98 token = match_token(p, f2fs_tokens, args);
99
100 switch (token) {
101 case Opt_gc_background:
102 name = match_strdup(&args[0]);
103
104 if (!name)
105 return -ENOMEM;
106 if (!strncmp(name, "on", 2))
107 set_opt(sbi, BG_GC);
108 else if (!strncmp(name, "off", 3))
109 clear_opt(sbi, BG_GC);
110 else {
111 kfree(name);
112 return -EINVAL;
113 }
114 kfree(name);
115 break;
116 case Opt_disable_roll_forward:
117 set_opt(sbi, DISABLE_ROLL_FORWARD);
118 break;
119 case Opt_discard:
120 set_opt(sbi, DISCARD);
121 break;
122 case Opt_noheap:
123 set_opt(sbi, NOHEAP);
124 break;
125#ifdef CONFIG_F2FS_FS_XATTR
126 case Opt_nouser_xattr:
127 clear_opt(sbi, XATTR_USER);
128 break;
129#else
130 case Opt_nouser_xattr:
131 f2fs_msg(sb, KERN_INFO,
132 "nouser_xattr options not supported");
133 break;
134#endif
135#ifdef CONFIG_F2FS_FS_POSIX_ACL
136 case Opt_noacl:
137 clear_opt(sbi, POSIX_ACL);
138 break;
139#else
140 case Opt_noacl:
141 f2fs_msg(sb, KERN_INFO, "noacl options not supported");
142 break;
143#endif
144 case Opt_active_logs:
145 if (args->from && match_int(args, &arg))
146 return -EINVAL;
147 if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
148 return -EINVAL;
149 sbi->active_logs = arg;
150 break;
151 case Opt_disable_ext_identify:
152 set_opt(sbi, DISABLE_EXT_IDENTIFY);
153 break;
154 default:
155 f2fs_msg(sb, KERN_ERR,
156 "Unrecognized mount option \"%s\" or missing value",
157 p);
158 return -EINVAL;
159 }
160 }
161 return 0;
162}
163
79static struct inode *f2fs_alloc_inode(struct super_block *sb) 164static struct inode *f2fs_alloc_inode(struct super_block *sb)
80{ 165{
81 struct f2fs_inode_info *fi; 166 struct f2fs_inode_info *fi;
@@ -112,6 +197,17 @@ static int f2fs_drop_inode(struct inode *inode)
112 return generic_drop_inode(inode); 197 return generic_drop_inode(inode);
113} 198}
114 199
200/*
201 * f2fs_dirty_inode() is called from __mark_inode_dirty()
202 *
203 * We should set the FI_DIRTY_INODE flag so the inode gets written back through write_inode.
204 */
205static void f2fs_dirty_inode(struct inode *inode, int flags)
206{
207 set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
208 return;
209}
210
115static void f2fs_i_callback(struct rcu_head *head) 211static void f2fs_i_callback(struct rcu_head *head)
116{ 212{
117 struct inode *inode = container_of(head, struct inode, i_rcu); 213 struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -170,7 +266,7 @@ static int f2fs_freeze(struct super_block *sb)
170{ 266{
171 int err; 267 int err;
172 268
173 if (sb->s_flags & MS_RDONLY) 269 if (f2fs_readonly(sb))
174 return 0; 270 return 0;
175 271
176 err = f2fs_sync_fs(sb, 1); 272 err = f2fs_sync_fs(sb, 1);
@@ -214,10 +310,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
214{ 310{
215 struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); 311 struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
216 312
217 if (test_opt(sbi, BG_GC)) 313 if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC))
218 seq_puts(seq, ",background_gc_on"); 314 seq_printf(seq, ",background_gc=%s", "on");
219 else 315 else
220 seq_puts(seq, ",background_gc_off"); 316 seq_printf(seq, ",background_gc=%s", "off");
221 if (test_opt(sbi, DISABLE_ROLL_FORWARD)) 317 if (test_opt(sbi, DISABLE_ROLL_FORWARD))
222 seq_puts(seq, ",disable_roll_forward"); 318 seq_puts(seq, ",disable_roll_forward");
223 if (test_opt(sbi, DISCARD)) 319 if (test_opt(sbi, DISCARD))
@@ -244,11 +340,64 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
244 return 0; 340 return 0;
245} 341}
246 342
343static int f2fs_remount(struct super_block *sb, int *flags, char *data)
344{
345 struct f2fs_sb_info *sbi = F2FS_SB(sb);
346 struct f2fs_mount_info org_mount_opt;
347 int err, active_logs;
348
349 /*
350 * Save the old mount options in case we
351 * need to restore them.
352 */
353 org_mount_opt = sbi->mount_opt;
354 active_logs = sbi->active_logs;
355
356 /* parse mount options */
357 err = parse_options(sb, data);
358 if (err)
359 goto restore_opts;
360
361 /*
362 * Previous and new state of filesystem is RO,
363 * so no point in checking GC conditions.
364 */
365 if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
366 goto skip;
367
368 /*
369 * We stop the GC thread if FS is mounted as RO
370 * or if background_gc = off is passed in mount
371 * option. Also sync the filesystem.
372 */
373 if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) {
374 if (sbi->gc_thread) {
375 stop_gc_thread(sbi);
376 f2fs_sync_fs(sb, 1);
377 }
378 } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) {
379 err = start_gc_thread(sbi);
380 if (err)
381 goto restore_opts;
382 }
383skip:
384 /* Update the POSIXACL Flag */
385 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
386 (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
387 return 0;
388
389restore_opts:
390 sbi->mount_opt = org_mount_opt;
391 sbi->active_logs = active_logs;
392 return err;
393}
394
247static struct super_operations f2fs_sops = { 395static struct super_operations f2fs_sops = {
248 .alloc_inode = f2fs_alloc_inode, 396 .alloc_inode = f2fs_alloc_inode,
249 .drop_inode = f2fs_drop_inode, 397 .drop_inode = f2fs_drop_inode,
250 .destroy_inode = f2fs_destroy_inode, 398 .destroy_inode = f2fs_destroy_inode,
251 .write_inode = f2fs_write_inode, 399 .write_inode = f2fs_write_inode,
400 .dirty_inode = f2fs_dirty_inode,
252 .show_options = f2fs_show_options, 401 .show_options = f2fs_show_options,
253 .evict_inode = f2fs_evict_inode, 402 .evict_inode = f2fs_evict_inode,
254 .put_super = f2fs_put_super, 403 .put_super = f2fs_put_super,
@@ -256,6 +405,7 @@ static struct super_operations f2fs_sops = {
256 .freeze_fs = f2fs_freeze, 405 .freeze_fs = f2fs_freeze,
257 .unfreeze_fs = f2fs_unfreeze, 406 .unfreeze_fs = f2fs_unfreeze,
258 .statfs = f2fs_statfs, 407 .statfs = f2fs_statfs,
408 .remount_fs = f2fs_remount,
259}; 409};
260 410
261static struct inode *f2fs_nfs_get_inode(struct super_block *sb, 411static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
@@ -303,79 +453,6 @@ static const struct export_operations f2fs_export_ops = {
303 .get_parent = f2fs_get_parent, 453 .get_parent = f2fs_get_parent,
304}; 454};
305 455
306static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi,
307 char *options)
308{
309 substring_t args[MAX_OPT_ARGS];
310 char *p;
311 int arg = 0;
312
313 if (!options)
314 return 0;
315
316 while ((p = strsep(&options, ",")) != NULL) {
317 int token;
318 if (!*p)
319 continue;
320 /*
321 * Initialize args struct so we know whether arg was
322 * found; some options take optional arguments.
323 */
324 args[0].to = args[0].from = NULL;
325 token = match_token(p, f2fs_tokens, args);
326
327 switch (token) {
328 case Opt_gc_background_off:
329 clear_opt(sbi, BG_GC);
330 break;
331 case Opt_disable_roll_forward:
332 set_opt(sbi, DISABLE_ROLL_FORWARD);
333 break;
334 case Opt_discard:
335 set_opt(sbi, DISCARD);
336 break;
337 case Opt_noheap:
338 set_opt(sbi, NOHEAP);
339 break;
340#ifdef CONFIG_F2FS_FS_XATTR
341 case Opt_nouser_xattr:
342 clear_opt(sbi, XATTR_USER);
343 break;
344#else
345 case Opt_nouser_xattr:
346 f2fs_msg(sb, KERN_INFO,
347 "nouser_xattr options not supported");
348 break;
349#endif
350#ifdef CONFIG_F2FS_FS_POSIX_ACL
351 case Opt_noacl:
352 clear_opt(sbi, POSIX_ACL);
353 break;
354#else
355 case Opt_noacl:
356 f2fs_msg(sb, KERN_INFO, "noacl options not supported");
357 break;
358#endif
359 case Opt_active_logs:
360 if (args->from && match_int(args, &arg))
361 return -EINVAL;
362 if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
363 return -EINVAL;
364 sbi->active_logs = arg;
365 break;
366 case Opt_disable_ext_identify:
367 set_opt(sbi, DISABLE_EXT_IDENTIFY);
368 break;
369 default:
370 f2fs_msg(sb, KERN_ERR,
371 "Unrecognized mount option \"%s\" or missing value",
372 p);
373 return -EINVAL;
374 }
375 }
376 return 0;
377}
378
379static loff_t max_file_size(unsigned bits) 456static loff_t max_file_size(unsigned bits)
380{ 457{
381 loff_t result = ADDRS_PER_INODE; 458 loff_t result = ADDRS_PER_INODE;
@@ -541,6 +618,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
541 if (err) 618 if (err)
542 goto free_sb_buf; 619 goto free_sb_buf;
543 } 620 }
621 sb->s_fs_info = sbi;
544 /* init some FS parameters */ 622 /* init some FS parameters */
545 sbi->active_logs = NR_CURSEG_TYPE; 623 sbi->active_logs = NR_CURSEG_TYPE;
546 624
@@ -553,7 +631,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
553 set_opt(sbi, POSIX_ACL); 631 set_opt(sbi, POSIX_ACL);
554#endif 632#endif
555 /* parse mount options */ 633 /* parse mount options */
556 err = parse_options(sb, sbi, (char *)data); 634 err = parse_options(sb, (char *)data);
557 if (err) 635 if (err)
558 goto free_sb_buf; 636 goto free_sb_buf;
559 637
@@ -565,7 +643,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
565 sb->s_xattr = f2fs_xattr_handlers; 643 sb->s_xattr = f2fs_xattr_handlers;
566 sb->s_export_op = &f2fs_export_ops; 644 sb->s_export_op = &f2fs_export_ops;
567 sb->s_magic = F2FS_SUPER_MAGIC; 645 sb->s_magic = F2FS_SUPER_MAGIC;
568 sb->s_fs_info = sbi;
569 sb->s_time_gran = 1; 646 sb->s_time_gran = 1;
570 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 647 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
571 (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); 648 (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -674,10 +751,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
674 "Cannot recover all fsync data errno=%ld", err); 751 "Cannot recover all fsync data errno=%ld", err);
675 } 752 }
676 753
677 /* After POR, we can run background GC thread */ 754 /*
678 err = start_gc_thread(sbi); 755 * Start the GC thread only if the filesystem is
679 if (err) 756 * not mounted read-only.
680 goto fail; 757 */
758 if (!(sb->s_flags & MS_RDONLY)) {
759 /* After POR, we can run the background GC thread. */
760 err = start_gc_thread(sbi);
761 if (err)
762 goto fail;
763 }
681 764
682 err = f2fs_build_stats(sbi); 765 err = f2fs_build_stats(sbi);
683 if (err) 766 if (err)
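
The remount path added above follows the usual save-parse-restore idiom: snapshot the current options, try the new ones, and roll everything back if parsing or the GC-thread start fails. A minimal sketch of that idiom for a hypothetical filesystem (the demo_* names are assumptions, not f2fs code):

static int demo_remount(struct super_block *sb, int *flags, char *data)
{
        struct demo_sb_info *sbi = sb->s_fs_info;
        struct demo_options saved = sbi->opts;  /* snapshot for rollback */
        int err;

        err = demo_parse_options(sb, data);
        if (err)
                goto restore;
        /* apply side effects (threads, feature flags) based on new opts */
        return 0;

restore:
        sbi->opts = saved;                      /* undo on any failure */
        return err;
}
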
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 0b02dce31356..3ab07ecd86ca 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/f2fs_fs.h> 22#include <linux/f2fs_fs.h>
23#include <linux/security.h>
23#include "f2fs.h" 24#include "f2fs.h"
24#include "xattr.h" 25#include "xattr.h"
25 26
@@ -43,6 +44,10 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
43 prefix = XATTR_TRUSTED_PREFIX; 44 prefix = XATTR_TRUSTED_PREFIX;
44 prefix_len = XATTR_TRUSTED_PREFIX_LEN; 45 prefix_len = XATTR_TRUSTED_PREFIX_LEN;
45 break; 46 break;
47 case F2FS_XATTR_INDEX_SECURITY:
48 prefix = XATTR_SECURITY_PREFIX;
49 prefix_len = XATTR_SECURITY_PREFIX_LEN;
50 break;
46 default: 51 default:
47 return -EINVAL; 52 return -EINVAL;
48 } 53 }
@@ -50,7 +55,7 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
50 total_len = prefix_len + name_len + 1; 55 total_len = prefix_len + name_len + 1;
51 if (list && total_len <= list_size) { 56 if (list && total_len <= list_size) {
52 memcpy(list, prefix, prefix_len); 57 memcpy(list, prefix, prefix_len);
53 memcpy(list+prefix_len, name, name_len); 58 memcpy(list + prefix_len, name, name_len);
54 list[prefix_len + name_len] = '\0'; 59 list[prefix_len + name_len] = '\0';
55 } 60 }
56 return total_len; 61 return total_len;
@@ -70,13 +75,14 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
70 if (!capable(CAP_SYS_ADMIN)) 75 if (!capable(CAP_SYS_ADMIN))
71 return -EPERM; 76 return -EPERM;
72 break; 77 break;
78 case F2FS_XATTR_INDEX_SECURITY:
79 break;
73 default: 80 default:
74 return -EINVAL; 81 return -EINVAL;
75 } 82 }
76 if (strcmp(name, "") == 0) 83 if (strcmp(name, "") == 0)
77 return -EINVAL; 84 return -EINVAL;
78 return f2fs_getxattr(dentry->d_inode, type, name, 85 return f2fs_getxattr(dentry->d_inode, type, name, buffer, size);
79 buffer, size);
80} 86}
81 87
82static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, 88static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
@@ -93,13 +99,15 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
93 if (!capable(CAP_SYS_ADMIN)) 99 if (!capable(CAP_SYS_ADMIN))
94 return -EPERM; 100 return -EPERM;
95 break; 101 break;
102 case F2FS_XATTR_INDEX_SECURITY:
103 break;
96 default: 104 default:
97 return -EINVAL; 105 return -EINVAL;
98 } 106 }
99 if (strcmp(name, "") == 0) 107 if (strcmp(name, "") == 0)
100 return -EINVAL; 108 return -EINVAL;
101 109
102 return f2fs_setxattr(dentry->d_inode, type, name, value, size); 110 return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL);
103} 111}
104 112
105static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, 113static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
@@ -145,6 +153,31 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
145 return 0; 153 return 0;
146} 154}
147 155
156#ifdef CONFIG_F2FS_FS_SECURITY
157static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
158 void *page)
159{
160 const struct xattr *xattr;
161 int err = 0;
162
163 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
164 err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
165 xattr->name, xattr->value,
166 xattr->value_len, (struct page *)page);
167 if (err < 0)
168 break;
169 }
170 return err;
171}
172
173int f2fs_init_security(struct inode *inode, struct inode *dir,
174 const struct qstr *qstr, struct page *ipage)
175{
176 return security_inode_init_security(inode, dir, qstr,
177 &f2fs_initxattrs, ipage);
178}
179#endif
180
148const struct xattr_handler f2fs_xattr_user_handler = { 181const struct xattr_handler f2fs_xattr_user_handler = {
149 .prefix = XATTR_USER_PREFIX, 182 .prefix = XATTR_USER_PREFIX,
150 .flags = F2FS_XATTR_INDEX_USER, 183 .flags = F2FS_XATTR_INDEX_USER,
@@ -169,6 +202,14 @@ const struct xattr_handler f2fs_xattr_advise_handler = {
169 .set = f2fs_xattr_advise_set, 202 .set = f2fs_xattr_advise_set,
170}; 203};
171 204
205const struct xattr_handler f2fs_xattr_security_handler = {
206 .prefix = XATTR_SECURITY_PREFIX,
207 .flags = F2FS_XATTR_INDEX_SECURITY,
208 .list = f2fs_xattr_generic_list,
209 .get = f2fs_xattr_generic_get,
210 .set = f2fs_xattr_generic_set,
211};
212
172static const struct xattr_handler *f2fs_xattr_handler_map[] = { 213static const struct xattr_handler *f2fs_xattr_handler_map[] = {
173 [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, 214 [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
174#ifdef CONFIG_F2FS_FS_POSIX_ACL 215#ifdef CONFIG_F2FS_FS_POSIX_ACL
@@ -176,6 +217,9 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
176 [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, 217 [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
177#endif 218#endif
178 [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, 219 [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
220#ifdef CONFIG_F2FS_FS_SECURITY
221 [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler,
222#endif
179 [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, 223 [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
180}; 224};
181 225
@@ -186,6 +230,9 @@ const struct xattr_handler *f2fs_xattr_handlers[] = {
186 &f2fs_xattr_acl_default_handler, 230 &f2fs_xattr_acl_default_handler,
187#endif 231#endif
188 &f2fs_xattr_trusted_handler, 232 &f2fs_xattr_trusted_handler,
233#ifdef CONFIG_F2FS_FS_SECURITY
234 &f2fs_xattr_security_handler,
235#endif
189 &f2fs_xattr_advise_handler, 236 &f2fs_xattr_advise_handler,
190 NULL, 237 NULL,
191}; 238};
@@ -218,6 +265,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
218 return -ENODATA; 265 return -ENODATA;
219 266
220 page = get_node_page(sbi, fi->i_xattr_nid); 267 page = get_node_page(sbi, fi->i_xattr_nid);
268 if (IS_ERR(page))
269 return PTR_ERR(page);
221 base_addr = page_address(page); 270 base_addr = page_address(page);
222 271
223 list_for_each_xattr(entry, base_addr) { 272 list_for_each_xattr(entry, base_addr) {
@@ -268,6 +317,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
268 return 0; 317 return 0;
269 318
270 page = get_node_page(sbi, fi->i_xattr_nid); 319 page = get_node_page(sbi, fi->i_xattr_nid);
320 if (IS_ERR(page))
321 return PTR_ERR(page);
271 base_addr = page_address(page); 322 base_addr = page_address(page);
272 323
273 list_for_each_xattr(entry, base_addr) { 324 list_for_each_xattr(entry, base_addr) {
@@ -296,7 +347,7 @@ cleanup:
296} 347}
297 348
298int f2fs_setxattr(struct inode *inode, int name_index, const char *name, 349int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
299 const void *value, size_t value_len) 350 const void *value, size_t value_len, struct page *ipage)
300{ 351{
301 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 352 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
302 struct f2fs_inode_info *fi = F2FS_I(inode); 353 struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -335,7 +386,7 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
335 set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); 386 set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
336 mark_inode_dirty(inode); 387 mark_inode_dirty(inode);
337 388
338 page = new_node_page(&dn, XATTR_NODE_OFFSET); 389 page = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
339 if (IS_ERR(page)) { 390 if (IS_ERR(page)) {
340 alloc_nid_failed(sbi, fi->i_xattr_nid); 391 alloc_nid_failed(sbi, fi->i_xattr_nid);
341 fi->i_xattr_nid = 0; 392 fi->i_xattr_nid = 0;
@@ -435,7 +486,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
435 inode->i_ctime = CURRENT_TIME; 486 inode->i_ctime = CURRENT_TIME;
436 clear_inode_flag(fi, FI_ACL_MODE); 487 clear_inode_flag(fi, FI_ACL_MODE);
437 } 488 }
438 update_inode_page(inode); 489 if (ipage)
490 update_inode(inode, ipage);
491 else
492 update_inode_page(inode);
439 mutex_unlock_op(sbi, ilock); 493 mutex_unlock_op(sbi, ilock);
440 494
441 return 0; 495 return 0;
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 49c9558305e3..3c0817bef25d 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -112,21 +112,19 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler;
112extern const struct xattr_handler f2fs_xattr_acl_access_handler; 112extern const struct xattr_handler f2fs_xattr_acl_access_handler;
113extern const struct xattr_handler f2fs_xattr_acl_default_handler; 113extern const struct xattr_handler f2fs_xattr_acl_default_handler;
114extern const struct xattr_handler f2fs_xattr_advise_handler; 114extern const struct xattr_handler f2fs_xattr_advise_handler;
115extern const struct xattr_handler f2fs_xattr_security_handler;
115 116
116extern const struct xattr_handler *f2fs_xattr_handlers[]; 117extern const struct xattr_handler *f2fs_xattr_handlers[];
117 118
118extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name, 119extern int f2fs_setxattr(struct inode *, int, const char *,
119 const void *value, size_t value_len); 120 const void *, size_t, struct page *);
120extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name, 121extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t);
121 void *buffer, size_t buffer_size); 122extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
122extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
123 size_t buffer_size);
124
125#else 123#else
126 124
127#define f2fs_xattr_handlers NULL 125#define f2fs_xattr_handlers NULL
128static inline int f2fs_setxattr(struct inode *inode, int name_index, 126static inline int f2fs_setxattr(struct inode *inode, int name_index,
129 const char *name, const void *value, size_t value_len) 127 const char *name, const void *value, size_t value_len)
130{ 128{
131 return -EOPNOTSUPP; 129 return -EOPNOTSUPP;
132} 130}
@@ -142,4 +140,14 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
142} 140}
143#endif 141#endif
144 142
143#ifdef CONFIG_F2FS_FS_SECURITY
144extern int f2fs_init_security(struct inode *, struct inode *,
145 const struct qstr *, struct page *);
146#else
147static inline int f2fs_init_security(struct inode *inode, struct inode *dir,
148 const struct qstr *qstr, struct page *ipage)
149{
150 return 0;
151}
152#endif
145#endif /* __F2FS_XATTR_H__ */ 153#endif /* __F2FS_XATTR_H__ */
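
For context on how the pieces above fit together: security_inode_init_security() asks the active LSM for the security.* xattrs of a brand-new inode and hands them to the filesystem's initxattrs callback (f2fs_initxattrs() above) for persistence. A hedged sketch of the caller side, with demo_create() and its error handling assumed rather than taken from this patch:

static int demo_create(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        struct inode *inode = new_inode(dir->i_sb);
        int err;

        if (!inode)
                return -ENOMEM;
        inode->i_mode = mode;

        /* The LSM computes the label; f2fs_initxattrs() stores it via
         * f2fs_setxattr(). A NULL ipage takes the update_inode_page() path. */
        err = f2fs_init_security(inode, dir, &dentry->d_name, NULL);
        if (err) {
                iput(inode);
                return err;
        }
        d_instantiate(dentry, inode);
        return 0;
}
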
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 7a6f02caf286..3963ede84eb0 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -543,6 +543,7 @@ end_of_dir:
543EXPORT_SYMBOL_GPL(fat_search_long); 543EXPORT_SYMBOL_GPL(fat_search_long);
544 544
545struct fat_ioctl_filldir_callback { 545struct fat_ioctl_filldir_callback {
546 struct dir_context ctx;
546 void __user *dirent; 547 void __user *dirent;
547 int result; 548 int result;
548 /* for dir ioctl */ 549 /* for dir ioctl */
@@ -552,8 +553,9 @@ struct fat_ioctl_filldir_callback {
552 int short_len; 553 int short_len;
553}; 554};
554 555
555static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent, 556static int __fat_readdir(struct inode *inode, struct file *file,
556 filldir_t filldir, int short_only, int both) 557 struct dir_context *ctx, int short_only,
558 struct fat_ioctl_filldir_callback *both)
557{ 559{
558 struct super_block *sb = inode->i_sb; 560 struct super_block *sb = inode->i_sb;
559 struct msdos_sb_info *sbi = MSDOS_SB(sb); 561 struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -564,27 +566,20 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
564 unsigned char bufname[FAT_MAX_SHORT_SIZE]; 566 unsigned char bufname[FAT_MAX_SHORT_SIZE];
565 int isvfat = sbi->options.isvfat; 567 int isvfat = sbi->options.isvfat;
566 const char *fill_name = NULL; 568 const char *fill_name = NULL;
567 unsigned long inum; 569 int fake_offset = 0;
568 unsigned long lpos, dummy, *furrfu = &lpos;
569 loff_t cpos; 570 loff_t cpos;
570 int short_len = 0, fill_len = 0; 571 int short_len = 0, fill_len = 0;
571 int ret = 0; 572 int ret = 0;
572 573
573 mutex_lock(&sbi->s_lock); 574 mutex_lock(&sbi->s_lock);
574 575
575 cpos = filp->f_pos; 576 cpos = ctx->pos;
576 /* Fake . and .. for the root directory. */ 577 /* Fake . and .. for the root directory. */
577 if (inode->i_ino == MSDOS_ROOT_INO) { 578 if (inode->i_ino == MSDOS_ROOT_INO) {
578 while (cpos < 2) { 579 if (!dir_emit_dots(file, ctx))
579 if (filldir(dirent, "..", cpos+1, cpos, 580 goto out;
580 MSDOS_ROOT_INO, DT_DIR) < 0) 581 if (ctx->pos == 2) {
581 goto out; 582 fake_offset = 1;
582 cpos++;
583 filp->f_pos++;
584 }
585 if (cpos == 2) {
586 dummy = 2;
587 furrfu = &dummy;
588 cpos = 0; 583 cpos = 0;
589 } 584 }
590 } 585 }
@@ -619,7 +614,7 @@ parse_record:
619 int status = fat_parse_long(inode, &cpos, &bh, &de, 614 int status = fat_parse_long(inode, &cpos, &bh, &de,
620 &unicode, &nr_slots); 615 &unicode, &nr_slots);
621 if (status < 0) { 616 if (status < 0) {
622 filp->f_pos = cpos; 617 ctx->pos = cpos;
623 ret = status; 618 ret = status;
624 goto out; 619 goto out;
625 } else if (status == PARSE_INVALID) 620 } else if (status == PARSE_INVALID)
@@ -639,6 +634,19 @@ parse_record:
639 /* !both && !short_only, so we don't need shortname. */ 634 /* !both && !short_only, so we don't need shortname. */
640 if (!both) 635 if (!both)
641 goto start_filldir; 636 goto start_filldir;
637
638 short_len = fat_parse_short(sb, de, bufname,
639 sbi->options.dotsOK);
640 if (short_len == 0)
641 goto record_end;
642 /* hack for fat_ioctl_filldir() */
643 both->longname = fill_name;
644 both->long_len = fill_len;
645 both->shortname = bufname;
646 both->short_len = short_len;
647 fill_name = NULL;
648 fill_len = 0;
649 goto start_filldir;
642 } 650 }
643 } 651 }
644 652
@@ -646,28 +654,21 @@ parse_record:
646 if (short_len == 0) 654 if (short_len == 0)
647 goto record_end; 655 goto record_end;
648 656
649 if (nr_slots) { 657 fill_name = bufname;
650 /* hack for fat_ioctl_filldir() */ 658 fill_len = short_len;
651 struct fat_ioctl_filldir_callback *p = dirent;
652
653 p->longname = fill_name;
654 p->long_len = fill_len;
655 p->shortname = bufname;
656 p->short_len = short_len;
657 fill_name = NULL;
658 fill_len = 0;
659 } else {
660 fill_name = bufname;
661 fill_len = short_len;
662 }
663 659
664start_filldir: 660start_filldir:
665 lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); 661 if (!fake_offset)
666 if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) 662 ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
667 inum = inode->i_ino; 663
668 else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { 664 if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
669 inum = parent_ino(filp->f_path.dentry); 665 if (!dir_emit_dot(file, ctx))
666 goto fill_failed;
667 } else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
668 if (!dir_emit_dotdot(file, ctx))
669 goto fill_failed;
670 } else { 670 } else {
671 unsigned long inum;
671 loff_t i_pos = fat_make_i_pos(sb, bh, de); 672 loff_t i_pos = fat_make_i_pos(sb, bh, de);
672 struct inode *tmp = fat_iget(sb, i_pos); 673 struct inode *tmp = fat_iget(sb, i_pos);
673 if (tmp) { 674 if (tmp) {
@@ -675,18 +676,17 @@ start_filldir:
675 iput(tmp); 676 iput(tmp);
676 } else 677 } else
677 inum = iunique(sb, MSDOS_ROOT_INO); 678 inum = iunique(sb, MSDOS_ROOT_INO);
679 if (!dir_emit(ctx, fill_name, fill_len, inum,
680 (de->attr & ATTR_DIR) ? DT_DIR : DT_REG))
681 goto fill_failed;
678 } 682 }
679 683
680 if (filldir(dirent, fill_name, fill_len, *furrfu, inum,
681 (de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0)
682 goto fill_failed;
683
684record_end: 684record_end:
685 furrfu = &lpos; 685 fake_offset = 0;
686 filp->f_pos = cpos; 686 ctx->pos = cpos;
687 goto get_new; 687 goto get_new;
688end_of_dir: 688end_of_dir:
689 filp->f_pos = cpos; 689 ctx->pos = cpos;
690fill_failed: 690fill_failed:
691 brelse(bh); 691 brelse(bh);
692 if (unicode) 692 if (unicode)
@@ -696,10 +696,9 @@ out:
696 return ret; 696 return ret;
697} 697}
698 698
699static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir) 699static int fat_readdir(struct file *file, struct dir_context *ctx)
700{ 700{
701 struct inode *inode = file_inode(filp); 701 return __fat_readdir(file_inode(file), file, ctx, 0, NULL);
702 return __fat_readdir(inode, filp, dirent, filldir, 0, 0);
703} 702}
704 703
705#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ 704#define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \
@@ -755,20 +754,25 @@ efault: \
755 754
756FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) 755FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
757 756
758static int fat_ioctl_readdir(struct inode *inode, struct file *filp, 757static int fat_ioctl_readdir(struct inode *inode, struct file *file,
759 void __user *dirent, filldir_t filldir, 758 void __user *dirent, filldir_t filldir,
760 int short_only, int both) 759 int short_only, int both)
761{ 760{
762 struct fat_ioctl_filldir_callback buf; 761 struct fat_ioctl_filldir_callback buf = {
762 .ctx.actor = filldir,
763 .dirent = dirent
764 };
763 int ret; 765 int ret;
764 766
765 buf.dirent = dirent; 767 buf.dirent = dirent;
766 buf.result = 0; 768 buf.result = 0;
767 mutex_lock(&inode->i_mutex); 769 mutex_lock(&inode->i_mutex);
770 buf.ctx.pos = file->f_pos;
768 ret = -ENOENT; 771 ret = -ENOENT;
769 if (!IS_DEADDIR(inode)) { 772 if (!IS_DEADDIR(inode)) {
770 ret = __fat_readdir(inode, filp, &buf, filldir, 773 ret = __fat_readdir(inode, file, &buf.ctx,
771 short_only, both); 774 short_only, both ? &buf : NULL);
775 file->f_pos = buf.ctx.pos;
772 } 776 }
773 mutex_unlock(&inode->i_mutex); 777 mutex_unlock(&inode->i_mutex);
774 if (ret >= 0) 778 if (ret >= 0)
@@ -854,7 +858,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
854const struct file_operations fat_dir_operations = { 858const struct file_operations fat_dir_operations = {
855 .llseek = generic_file_llseek, 859 .llseek = generic_file_llseek,
856 .read = generic_read_dir, 860 .read = generic_read_dir,
857 .readdir = fat_readdir, 861 .iterate = fat_readdir,
858 .unlocked_ioctl = fat_dir_ioctl, 862 .unlocked_ioctl = fat_dir_ioctl,
859#ifdef CONFIG_COMPAT 863#ifdef CONFIG_COMPAT
860 .compat_ioctl = fat_compat_dir_ioctl, 864 .compat_ioctl = fat_compat_dir_ioctl,
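
The conversion above replaces the old ->readdir/filldir calling convention with ->iterate and struct dir_context: the position now lives in ctx->pos and entries are emitted through the dir_emit*() helpers. A minimal sketch of the new contract for a hypothetical directory with one fixed entry (the demo_* names and the inode number are assumptions):

static int demo_iterate(struct file *file, struct dir_context *ctx)
{
        /* emits "." and ".." as needed and advances ctx->pos to 2 */
        if (!dir_emit_dots(file, ctx))
                return 0;

        if (ctx->pos == 2) {
                /* a false return means the user buffer is full; we get
                 * called again later with ctx->pos unchanged */
                if (!dir_emit(ctx, "hello", 5, 100 /* ino */, DT_REG))
                        return 0;
                ctx->pos++;
        }
        return 0;
}
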
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 21664fcf3616..4241e6f39e86 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -86,6 +86,7 @@ struct msdos_sb_info {
86 const void *dir_ops; /* Opaque; default directory operations */ 86 const void *dir_ops; /* Opaque; default directory operations */
87 int dir_per_block; /* dir entries per block */ 87 int dir_per_block; /* dir entries per block */
88 int dir_per_block_bits; /* log2(dir_per_block) */ 88 int dir_per_block_bits; /* log2(dir_per_block) */
89 unsigned int vol_id; /* volume ID */
89 90
90 int fatent_shift; 91 int fatent_shift;
91 struct fatent_operations *fatent_ops; 92 struct fatent_operations *fatent_ops;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index b0b632e50ddb..9b104f543056 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -114,6 +114,12 @@ out:
114 return err; 114 return err;
115} 115}
116 116
117static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr)
118{
119 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
120 return put_user(sbi->vol_id, user_attr);
121}
122
117long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 123long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
118{ 124{
119 struct inode *inode = file_inode(filp); 125 struct inode *inode = file_inode(filp);
@@ -124,6 +130,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
124 return fat_ioctl_get_attributes(inode, user_attr); 130 return fat_ioctl_get_attributes(inode, user_attr);
125 case FAT_IOCTL_SET_ATTRIBUTES: 131 case FAT_IOCTL_SET_ATTRIBUTES:
126 return fat_ioctl_set_attributes(filp, user_attr); 132 return fat_ioctl_set_attributes(filp, user_attr);
133 case FAT_IOCTL_GET_VOLUME_ID:
134 return fat_ioctl_get_volume_id(inode, user_attr);
127 default: 135 default:
128 return -ENOTTY; /* Inappropriate ioctl for device */ 136 return -ENOTTY; /* Inappropriate ioctl for device */
129 } 137 }
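
From userspace the new ioctl takes only a few lines; this sketch assumes a kernel whose uapi linux/msdos_fs.h exports FAT_IOCTL_GET_VOLUME_ID:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/msdos_fs.h>     /* FAT_IOCTL_GET_VOLUME_ID */

int main(int argc, char **argv)
{
        uint32_t id;
        int fd;

        if (argc < 2)
                return 1;
        fd = open(argv[1], O_RDONLY);   /* any file on the FAT volume */
        if (fd < 0 || ioctl(fd, FAT_IOCTL_GET_VOLUME_ID, &id) < 0) {
                perror(argv[1]);
                return 1;
        }
        printf("%04x-%04x\n", id >> 16, id & 0xffff);   /* DOS-style XXXX-XXXX */
        return 0;
}

Run it against any file or directory on the mounted volume; the printf formats the ID the way DOS displays it.
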
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5d4513cb1b3c..11b51bb55b42 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1415,6 +1415,18 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
1415 brelse(fsinfo_bh); 1415 brelse(fsinfo_bh);
1416 } 1416 }
1417 1417
1418 /* interpret the volume ID as a little-endian 32-bit integer */
1419 if (sbi->fat_bits == 32)
1420 sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
1421 ((u32)b->fat32.vol_id[1] << 8) |
1422 ((u32)b->fat32.vol_id[2] << 16) |
1423 ((u32)b->fat32.vol_id[3] << 24));
1424 else /* fat 16 or 12 */
1425 sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
1426 ((u32)b->fat16.vol_id[1] << 8) |
1427 ((u32)b->fat16.vol_id[2] << 16) |
1428 ((u32)b->fat16.vol_id[3] << 24));
1429
1418 sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry); 1430 sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
1419 sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; 1431 sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
1420 1432
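
Assembling the ID byte by byte keeps the read correct on both little- and big-endian hosts. The same pattern in standalone C (a sketch, not kernel code):

#include <stdint.h>

static uint32_t read_le32(const uint8_t b[4])
{
        /* b[0] is the least significant byte on disk */
        return (uint32_t)b[0] | ((uint32_t)b[1] << 8) |
               ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}
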
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 359d307b5507..628e22a5a543 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
30 va_start(args, fmt); 30 va_start(args, fmt);
31 vaf.fmt = fmt; 31 vaf.fmt = fmt;
32 vaf.va = &args; 32 vaf.va = &args;
33 printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf); 33 fat_msg(sb, KERN_ERR, "error, %pV", &vaf);
34 va_end(args); 34 va_end(args);
35 } 35 }
36 36
@@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
38 panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); 38 panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
39 else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { 39 else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
40 sb->s_flags |= MS_RDONLY; 40 sb->s_flags |= MS_RDONLY;
41 printk(KERN_ERR "FAT-fs (%s): Filesystem has been " 41 fat_msg(sb, KERN_ERR, "Filesystem has been set read-only");
42 "set read-only\n", sb->s_id);
43 } 42 }
44} 43}
45EXPORT_SYMBOL_GPL(__fat_fs_error); 44EXPORT_SYMBOL_GPL(__fat_fs_error);
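
Both hunks switch raw printk() calls to fat_msg(), which centralizes the "FAT-fs (%s):" prefix and uses the %pV extension to forward a caller's format string and va_list in a single printk(). A condensed sketch of such a helper (the real fat_msg() lives in fs/fat/misc.c):

static void demo_fat_msg(struct super_block *sb, const char *level,
                         const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
        va_end(args);
}
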
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 081b759cff83..a783b0e1272a 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,8 +148,7 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
148 * that the existing dentry can be used. The msdos fs routines will 148 * that the existing dentry can be used. The msdos fs routines will
149 * return ENOENT or EINVAL as appropriate. 149 * return ENOENT or EINVAL as appropriate.
150 */ 150 */
151static int msdos_hash(const struct dentry *dentry, const struct inode *inode, 151static int msdos_hash(const struct dentry *dentry, struct qstr *qstr)
152 struct qstr *qstr)
153{ 152{
154 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options; 153 struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
155 unsigned char msdos_name[MSDOS_NAME]; 154 unsigned char msdos_name[MSDOS_NAME];
@@ -165,8 +164,7 @@ static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
165 * Compare two msdos names. If either of the names are invalid, 164 * Compare two msdos names. If either of the names are invalid,
166 * we fall back to doing the standard name comparison. 165 * we fall back to doing the standard name comparison.
167 */ 166 */
168static int msdos_cmp(const struct dentry *parent, const struct inode *pinode, 167static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry,
169 const struct dentry *dentry, const struct inode *inode,
170 unsigned int len, const char *str, const struct qstr *name) 168 unsigned int len, const char *str, const struct qstr *name)
171{ 169{
172 struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options; 170 struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 2da952036a3d..6df8d3d885e5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -107,8 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr)
107 * that the existing dentry can be used. The vfat fs routines will 107 * that the existing dentry can be used. The vfat fs routines will
108 * return ENOENT or EINVAL as appropriate. 108 * return ENOENT or EINVAL as appropriate.
109 */ 109 */
110static int vfat_hash(const struct dentry *dentry, const struct inode *inode, 110static int vfat_hash(const struct dentry *dentry, struct qstr *qstr)
111 struct qstr *qstr)
112{ 111{
113 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); 112 qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
114 return 0; 113 return 0;
@@ -120,8 +119,7 @@ static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
120 * that the existing dentry can be used. The vfat fs routines will 119 * that the existing dentry can be used. The vfat fs routines will
121 * return ENOENT or EINVAL as appropriate. 120 * return ENOENT or EINVAL as appropriate.
122 */ 121 */
123static int vfat_hashi(const struct dentry *dentry, const struct inode *inode, 122static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr)
124 struct qstr *qstr)
125{ 123{
126 struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io; 124 struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
127 const unsigned char *name; 125 const unsigned char *name;
@@ -142,8 +140,7 @@ static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
142/* 140/*
143 * Case insensitive compare of two vfat names. 141 * Case insensitive compare of two vfat names.
144 */ 142 */
145static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode, 143static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry,
146 const struct dentry *dentry, const struct inode *inode,
147 unsigned int len, const char *str, const struct qstr *name) 144 unsigned int len, const char *str, const struct qstr *name)
148{ 145{
149 struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io; 146 struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
@@ -162,8 +159,7 @@ static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
162/* 159/*
163 * Case sensitive compare of two vfat names. 160 * Case sensitive compare of two vfat names.
164 */ 161 */
165static int vfat_cmp(const struct dentry *parent, const struct inode *pinode, 162static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry,
166 const struct dentry *dentry, const struct inode *inode,
167 unsigned int len, const char *str, const struct qstr *name) 163 unsigned int len, const char *str, const struct qstr *name)
168{ 164{
169 unsigned int alen, blen; 165 unsigned int alen, blen;
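
These hunks track a VFS-wide signature change: ->d_hash() loses its inode argument and ->d_compare() loses both inode arguments, since RCU-walk lookups could not rely on them anyway. A sketch of the post-change hooks for a hypothetical case-sensitive filesystem (the demo_* names are assumptions):

static int demo_hash(const struct dentry *dentry, struct qstr *qstr)
{
        qstr->hash = full_name_hash(qstr->name, qstr->len);
        return 0;
}

static int demo_compare(const struct dentry *parent,
                        const struct dentry *dentry,
                        unsigned int len, const char *str,
                        const struct qstr *name)
{
        /* return 0 on a match, nonzero otherwise */
        if (len != name->len)
                return 1;
        return memcmp(str, name->name, len) ? 1 : 0;
}

static const struct dentry_operations demo_dops = {
        .d_hash         = demo_hash,
        .d_compare      = demo_compare,
};
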
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 6599222536eb..65343c3741ff 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -730,14 +730,14 @@ static int __init fcntl_init(void)
730 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 730 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
731 * is defined as O_NONBLOCK on some platforms and not on others. 731 * is defined as O_NONBLOCK on some platforms and not on others.
732 */ 732 */
733 BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 733 BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
734 O_RDONLY | O_WRONLY | O_RDWR | 734 O_RDONLY | O_WRONLY | O_RDWR |
735 O_CREAT | O_EXCL | O_NOCTTY | 735 O_CREAT | O_EXCL | O_NOCTTY |
736 O_TRUNC | O_APPEND | /* O_NONBLOCK | */ 736 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
737 __O_SYNC | O_DSYNC | FASYNC | 737 __O_SYNC | O_DSYNC | FASYNC |
738 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 738 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
739 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 739 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
740 __FMODE_EXEC | O_PATH 740 __FMODE_EXEC | O_PATH | __O_TMPFILE
741 )); 741 ));
742 742
743 fasync_cache = kmem_cache_create("fasync_cache", 743 fasync_cache = kmem_cache_create("fasync_cache",
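
The constant rises from 19 to 20 because __O_TMPFILE contributes one more distinct flag bit; HWEIGHT32() is a compile-time population count, so the assertion breaks the build whenever a new O_* flag is added without updating the count. A userspace analog of the trick (a sketch):

#include <assert.h>

#define FLAG_A 0x01
#define FLAG_B 0x02
#define FLAG_C 0x08

/* the build fails if the flags overlap or the expected count drifts */
static_assert(__builtin_popcount(FLAG_A | FLAG_B | FLAG_C) == 3,
              "distinct flag-bit count changed");
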
diff --git a/fs/file_table.c b/fs/file_table.c
index 485dc0eddd67..b44e4c559786 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -227,7 +227,7 @@ static void __fput(struct file *file)
227{ 227{
228 struct dentry *dentry = file->f_path.dentry; 228 struct dentry *dentry = file->f_path.dentry;
229 struct vfsmount *mnt = file->f_path.mnt; 229 struct vfsmount *mnt = file->f_path.mnt;
230 struct inode *inode = dentry->d_inode; 230 struct inode *inode = file->f_inode;
231 231
232 might_sleep(); 232 might_sleep();
233 233
@@ -265,18 +265,15 @@ static void __fput(struct file *file)
265 mntput(mnt); 265 mntput(mnt);
266} 266}
267 267
268static DEFINE_SPINLOCK(delayed_fput_lock); 268static LLIST_HEAD(delayed_fput_list);
269static LIST_HEAD(delayed_fput_list);
270static void delayed_fput(struct work_struct *unused) 269static void delayed_fput(struct work_struct *unused)
271{ 270{
272 LIST_HEAD(head); 271 struct llist_node *node = llist_del_all(&delayed_fput_list);
273 spin_lock_irq(&delayed_fput_lock); 272 struct llist_node *next;
274 list_splice_init(&delayed_fput_list, &head); 273
275 spin_unlock_irq(&delayed_fput_lock); 274 for (; node; node = next) {
276 while (!list_empty(&head)) { 275 next = llist_next(node);
277 struct file *f = list_first_entry(&head, struct file, f_u.fu_list); 276 __fput(llist_entry(node, struct file, f_u.fu_llist));
278 list_del_init(&f->f_u.fu_list);
279 __fput(f);
280 } 277 }
281} 278}
282 279
@@ -306,18 +303,22 @@ void fput(struct file *file)
306{ 303{
307 if (atomic_long_dec_and_test(&file->f_count)) { 304 if (atomic_long_dec_and_test(&file->f_count)) {
308 struct task_struct *task = current; 305 struct task_struct *task = current;
309 unsigned long flags;
310 306
311 file_sb_list_del(file); 307 file_sb_list_del(file);
312 if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { 308 if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
313 init_task_work(&file->f_u.fu_rcuhead, ____fput); 309 init_task_work(&file->f_u.fu_rcuhead, ____fput);
314 if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) 310 if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
315 return; 311 return;
312 /*
313 * After this task has run exit_task_work(),
314 * task_work_add() will fail. free_ipc_ns()->
315 * shm_destroy() can do this. Fall through to delayed
316 * fput to avoid leaking *file.
317 */
316 } 318 }
317 spin_lock_irqsave(&delayed_fput_lock, flags); 319
318 list_add(&file->f_u.fu_list, &delayed_fput_list); 320 if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
319 schedule_work(&delayed_fput_work); 321 schedule_work(&delayed_fput_work);
320 spin_unlock_irqrestore(&delayed_fput_lock, flags);
321 } 322 }
322} 323}
323 324
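
The delayed-fput rework replaces a spinlock-protected list with a lock-free llist. llist_add() returns true only when the list was previously empty, so exactly one producer per batch schedules the worker, and llist_del_all() lets the worker detach the whole batch in a single atomic exchange. The pattern in isolation (a sketch; the demo_* names are assumptions):

struct demo_item {
        struct llist_node node;
};

static void demo_consume(struct demo_item *it);  /* hypothetical handler */

static LLIST_HEAD(demo_pending);

static void demo_worker(struct work_struct *unused)
{
        struct llist_node *node = llist_del_all(&demo_pending);
        struct llist_node *next;

        for (; node; node = next) {
                next = llist_next(node);
                demo_consume(llist_entry(node, struct demo_item, node));
        }
}

static DECLARE_WORK(demo_work, demo_worker);

static void demo_produce(struct demo_item *it)
{
        /* only the push that finds the list empty schedules the worker */
        if (llist_add(&it->node, &demo_pending))
                schedule_work(&demo_work);
}
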
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 664b07a53870..25d4099a4aea 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -49,7 +49,7 @@
49 49
50 50
51static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int); 51static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int);
52static int vxfs_readdir(struct file *, void *, filldir_t); 52static int vxfs_readdir(struct file *, struct dir_context *);
53 53
54const struct inode_operations vxfs_dir_inode_ops = { 54const struct inode_operations vxfs_dir_inode_ops = {
55 .lookup = vxfs_lookup, 55 .lookup = vxfs_lookup,
@@ -58,7 +58,7 @@ const struct inode_operations vxfs_dir_inode_ops = {
58const struct file_operations vxfs_dir_operations = { 58const struct file_operations vxfs_dir_operations = {
59 .llseek = generic_file_llseek, 59 .llseek = generic_file_llseek,
60 .read = generic_read_dir, 60 .read = generic_read_dir,
61 .readdir = vxfs_readdir, 61 .iterate = vxfs_readdir,
62}; 62};
63 63
64 64
@@ -235,7 +235,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
235 * Zero. 235 * Zero.
236 */ 236 */
237static int 237static int
238vxfs_readdir(struct file *fp, void *retp, filldir_t filler) 238vxfs_readdir(struct file *fp, struct dir_context *ctx)
239{ 239{
240 struct inode *ip = file_inode(fp); 240 struct inode *ip = file_inode(fp);
241 struct super_block *sbp = ip->i_sb; 241 struct super_block *sbp = ip->i_sb;
@@ -243,20 +243,17 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
243 u_long page, npages, block, pblocks, nblocks, offset; 243 u_long page, npages, block, pblocks, nblocks, offset;
244 loff_t pos; 244 loff_t pos;
245 245
246 switch ((long)fp->f_pos) { 246 if (ctx->pos == 0) {
247 case 0: 247 if (!dir_emit_dot(fp, ctx))
248 if (filler(retp, ".", 1, fp->f_pos, ip->i_ino, DT_DIR) < 0) 248 return 0;
249 goto out; 249 ctx->pos = 1;
250 fp->f_pos++;
251 /* fallthrough */
252 case 1:
253 if (filler(retp, "..", 2, fp->f_pos, VXFS_INO(ip)->vii_dotdot, DT_DIR) < 0)
254 goto out;
255 fp->f_pos++;
256 /* fallthrough */
257 } 250 }
258 251 if (ctx->pos == 1) {
259 pos = fp->f_pos - 2; 252 if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR))
253 return 0;
254 ctx->pos = 2;
255 }
256 pos = ctx->pos - 2;
260 257
261 if (pos > VXFS_DIRROUND(ip->i_size)) 258 if (pos > VXFS_DIRROUND(ip->i_size))
262 return 0; 259 return 0;
@@ -270,16 +267,16 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
270 block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks; 267 block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks;
271 268
272 for (; page < npages; page++, block = 0) { 269 for (; page < npages; page++, block = 0) {
273 caddr_t kaddr; 270 char *kaddr;
274 struct page *pp; 271 struct page *pp;
275 272
276 pp = vxfs_get_page(ip->i_mapping, page); 273 pp = vxfs_get_page(ip->i_mapping, page);
277 if (IS_ERR(pp)) 274 if (IS_ERR(pp))
278 continue; 275 continue;
279 kaddr = (caddr_t)page_address(pp); 276 kaddr = (char *)page_address(pp);
280 277
281 for (; block <= nblocks && block <= pblocks; block++) { 278 for (; block <= nblocks && block <= pblocks; block++) {
282 caddr_t baddr, limit; 279 char *baddr, *limit;
283 struct vxfs_dirblk *dbp; 280 struct vxfs_dirblk *dbp;
284 struct vxfs_direct *de; 281 struct vxfs_direct *de;
285 282
@@ -292,21 +289,18 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
292 (kaddr + offset) : 289 (kaddr + offset) :
293 (baddr + VXFS_DIRBLKOV(dbp))); 290 (baddr + VXFS_DIRBLKOV(dbp)));
294 291
295 for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) { 292 for (; (char *)de <= limit; de = vxfs_next_entry(de)) {
296 int over;
297
298 if (!de->d_reclen) 293 if (!de->d_reclen)
299 break; 294 break;
300 if (!de->d_ino) 295 if (!de->d_ino)
301 continue; 296 continue;
302 297
303 offset = (caddr_t)de - kaddr; 298 offset = (char *)de - kaddr;
304 over = filler(retp, de->d_name, de->d_namelen, 299 ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
305 ((page << PAGE_CACHE_SHIFT) | offset) + 2, 300 if (!dir_emit(ctx, de->d_name, de->d_namelen,
306 de->d_ino, DT_UNKNOWN); 301 de->d_ino, DT_UNKNOWN)) {
307 if (over) {
308 vxfs_put_page(pp); 302 vxfs_put_page(pp);
309 goto done; 303 return 0;
310 } 304 }
311 } 305 }
312 offset = 0; 306 offset = 0;
@@ -314,9 +308,6 @@ vxfs_readdir(struct file *fp, void *retp, filldir_t filler)
314 vxfs_put_page(pp); 308 vxfs_put_page(pp);
315 offset = 0; 309 offset = 0;
316 } 310 }
317 311 ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
318done:
319 fp->f_pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
320out:
321 return 0; 312 return 0;
322} 313}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3be57189efd5..68851ff2fd41 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,6 +45,7 @@ struct wb_writeback_work {
45 unsigned int for_kupdate:1; 45 unsigned int for_kupdate:1;
46 unsigned int range_cyclic:1; 46 unsigned int range_cyclic:1;
47 unsigned int for_background:1; 47 unsigned int for_background:1;
48 unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
48 enum wb_reason reason; /* why was writeback initiated? */ 49 enum wb_reason reason; /* why was writeback initiated? */
49 50
50 struct list_head list; /* pending work list */ 51 struct list_head list; /* pending work list */
@@ -443,9 +444,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
443 /* 444 /*
444 * Make sure to wait on the data before writing out the metadata. 445 * Make sure to wait on the data before writing out the metadata.
445 * This is important for filesystems that modify metadata on data 446 * This is important for filesystems that modify metadata on data
446 * I/O completion. 447 * I/O completion. We don't do it for sync(2) writeback because it has a
448 * separate, external IO completion path and ->sync_fs for guaranteeing
449 * inode metadata is written back correctly.
447 */ 450 */
448 if (wbc->sync_mode == WB_SYNC_ALL) { 451 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
449 int err = filemap_fdatawait(mapping); 452 int err = filemap_fdatawait(mapping);
450 if (ret == 0) 453 if (ret == 0)
451 ret = err; 454 ret = err;
@@ -578,6 +581,7 @@ static long writeback_sb_inodes(struct super_block *sb,
578 .tagged_writepages = work->tagged_writepages, 581 .tagged_writepages = work->tagged_writepages,
579 .for_kupdate = work->for_kupdate, 582 .for_kupdate = work->for_kupdate,
580 .for_background = work->for_background, 583 .for_background = work->for_background,
584 .for_sync = work->for_sync,
581 .range_cyclic = work->range_cyclic, 585 .range_cyclic = work->range_cyclic,
582 .range_start = 0, 586 .range_start = 0,
583 .range_end = LLONG_MAX, 587 .range_end = LLONG_MAX,
@@ -959,7 +963,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
959/* 963/*
960 * Retrieve work items and do the writeback they describe 964 * Retrieve work items and do the writeback they describe
961 */ 965 */
962long wb_do_writeback(struct bdi_writeback *wb, int force_wait) 966static long wb_do_writeback(struct bdi_writeback *wb)
963{ 967{
964 struct backing_dev_info *bdi = wb->bdi; 968 struct backing_dev_info *bdi = wb->bdi;
965 struct wb_writeback_work *work; 969 struct wb_writeback_work *work;
@@ -967,12 +971,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
967 971
968 set_bit(BDI_writeback_running, &wb->bdi->state); 972 set_bit(BDI_writeback_running, &wb->bdi->state);
969 while ((work = get_next_work_item(bdi)) != NULL) { 973 while ((work = get_next_work_item(bdi)) != NULL) {
970 /*
971 * Override sync mode, in case we must wait for completion
972 * because this thread is exiting now.
973 */
974 if (force_wait)
975 work->sync_mode = WB_SYNC_ALL;
976 974
977 trace_writeback_exec(bdi, work); 975 trace_writeback_exec(bdi, work);
978 976
@@ -1021,7 +1019,7 @@ void bdi_writeback_workfn(struct work_struct *work)
1021 * rescuer as work_list needs to be drained. 1019 * rescuer as work_list needs to be drained.
1022 */ 1020 */
1023 do { 1021 do {
1024 pages_written = wb_do_writeback(wb, 0); 1022 pages_written = wb_do_writeback(wb);
1025 trace_writeback_pages_written(pages_written); 1023 trace_writeback_pages_written(pages_written);
1026 } while (!list_empty(&bdi->work_list)); 1024 } while (!list_empty(&bdi->work_list));
1027 } else { 1025 } else {
@@ -1362,6 +1360,7 @@ void sync_inodes_sb(struct super_block *sb)
1362 .range_cyclic = 0, 1360 .range_cyclic = 0,
1363 .done = &done, 1361 .done = &done,
1364 .reason = WB_REASON_SYNC, 1362 .reason = WB_REASON_SYNC,
1363 .for_sync = 1,
1365 }; 1364 };
1366 1365
1367 /* Nothing to do? */ 1366 /* Nothing to do? */
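
The new for_sync bit tells __writeback_single_inode() to skip its per-inode filemap_fdatawait() when writeback was initiated by sync(2), which waits for data through its own completion path and relies on ->sync_fs for metadata. Condensed from the sync_inodes_sb() hunk above, the sync path marks its work item like this (a fragment; sb and done are supplied by the caller):

struct wb_writeback_work work = {
        .sb             = sb,
        .sync_mode      = WB_SYNC_ALL,
        .nr_pages       = LONG_MAX,
        .range_cyclic   = 0,
        .done           = &done,
        .reason         = WB_REASON_SYNC,
        .for_sync       = 1,    /* per-inode writeback must not wait on data */
};
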
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index b52aed1dca97..f7cff367db7f 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -115,7 +115,7 @@ struct fscache_cache *fscache_select_cache_for_object(
115 struct fscache_object, cookie_link); 115 struct fscache_object, cookie_link);
116 116
117 cache = object->cache; 117 cache = object->cache;
118 if (object->state >= FSCACHE_OBJECT_DYING || 118 if (fscache_object_is_dying(object) ||
119 test_bit(FSCACHE_IOERROR, &cache->flags)) 119 test_bit(FSCACHE_IOERROR, &cache->flags))
120 cache = NULL; 120 cache = NULL;
121 121
@@ -224,8 +224,10 @@ int fscache_add_cache(struct fscache_cache *cache,
224 BUG_ON(!ifsdef); 224 BUG_ON(!ifsdef);
225 225
226 cache->flags = 0; 226 cache->flags = 0;
227 ifsdef->event_mask = ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); 227 ifsdef->event_mask =
228 ifsdef->state = FSCACHE_OBJECT_ACTIVE; 228 ((1 << NR_FSCACHE_OBJECT_EVENTS) - 1) &
229 ~(1 << FSCACHE_OBJECT_EV_CLEARED);
230 __set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &ifsdef->flags);
229 231
230 if (!tagname) 232 if (!tagname)
231 tagname = cache->identifier; 233 tagname = cache->identifier;
@@ -330,25 +332,25 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache,
330{ 332{
331 struct fscache_object *object; 333 struct fscache_object *object;
332 334
333 spin_lock(&cache->object_list_lock);
334
335 while (!list_empty(&cache->object_list)) { 335 while (!list_empty(&cache->object_list)) {
336 object = list_entry(cache->object_list.next, 336 spin_lock(&cache->object_list_lock);
337 struct fscache_object, cache_link);
338 list_move_tail(&object->cache_link, dying_objects);
339 337
340 _debug("withdraw %p", object->cookie); 338 if (!list_empty(&cache->object_list)) {
339 object = list_entry(cache->object_list.next,
340 struct fscache_object, cache_link);
341 list_move_tail(&object->cache_link, dying_objects);
341 342
342 spin_lock(&object->lock); 343 _debug("withdraw %p", object->cookie);
343 spin_unlock(&cache->object_list_lock); 344
344 fscache_raise_event(object, FSCACHE_OBJECT_EV_WITHDRAW); 345 /* This must be done under object_list_lock to prevent
345 spin_unlock(&object->lock); 346 * a race with fscache_drop_object().
347 */
348 fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
349 }
346 350
351 spin_unlock(&cache->object_list_lock);
347 cond_resched(); 352 cond_resched();
348 spin_lock(&cache->object_list_lock);
349 } 353 }
350
351 spin_unlock(&cache->object_list_lock);
352} 354}
353 355
354/** 356/**
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index e2cba1f60c21..0e91a3c9fdb2 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -95,6 +95,11 @@ struct fscache_cookie *__fscache_acquire_cookie(
95 atomic_set(&cookie->usage, 1); 95 atomic_set(&cookie->usage, 1);
96 atomic_set(&cookie->n_children, 0); 96 atomic_set(&cookie->n_children, 0);
97 97
98 /* We keep the active count elevated until relinquishment to prevent an
99 * attempt to wake up every time the object operations queue quiesces.
100 */
101 atomic_set(&cookie->n_active, 1);
102
98 atomic_inc(&parent->usage); 103 atomic_inc(&parent->usage);
99 atomic_inc(&parent->n_children); 104 atomic_inc(&parent->n_children);
100 105
@@ -177,7 +182,6 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
177 182
178 cookie->flags = 183 cookie->flags =
179 (1 << FSCACHE_COOKIE_LOOKING_UP) | 184 (1 << FSCACHE_COOKIE_LOOKING_UP) |
180 (1 << FSCACHE_COOKIE_CREATING) |
181 (1 << FSCACHE_COOKIE_NO_DATA_YET); 185 (1 << FSCACHE_COOKIE_NO_DATA_YET);
182 186
183 /* ask the cache to allocate objects for this cookie and its parent 187 /* ask the cache to allocate objects for this cookie and its parent
@@ -205,7 +209,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
205 209
206 /* initiate the process of looking up all the objects in the chain 210 /* initiate the process of looking up all the objects in the chain
207 * (done by fscache_initialise_object()) */ 211 * (done by fscache_initialise_object()) */
208 fscache_enqueue_object(object); 212 fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD);
209 213
210 spin_unlock(&cookie->lock); 214 spin_unlock(&cookie->lock);
211 215
@@ -285,7 +289,7 @@ static int fscache_alloc_object(struct fscache_cache *cache,
285 289
286object_already_extant: 290object_already_extant:
287 ret = -ENOBUFS; 291 ret = -ENOBUFS;
288 if (object->state >= FSCACHE_OBJECT_DYING) { 292 if (fscache_object_is_dead(object)) {
289 spin_unlock(&cookie->lock); 293 spin_unlock(&cookie->lock);
290 goto error; 294 goto error;
291 } 295 }
@@ -321,7 +325,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
321 ret = -EEXIST; 325 ret = -EEXIST;
322 hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { 326 hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) {
323 if (p->cache == object->cache) { 327 if (p->cache == object->cache) {
324 if (p->state >= FSCACHE_OBJECT_DYING) 328 if (fscache_object_is_dying(p))
325 ret = -ENOBUFS; 329 ret = -ENOBUFS;
326 goto cant_attach_object; 330 goto cant_attach_object;
327 } 331 }
@@ -332,7 +336,7 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
332 hlist_for_each_entry(p, &cookie->parent->backing_objects, 336 hlist_for_each_entry(p, &cookie->parent->backing_objects,
333 cookie_link) { 337 cookie_link) {
334 if (p->cache == object->cache) { 338 if (p->cache == object->cache) {
335 if (p->state >= FSCACHE_OBJECT_DYING) { 339 if (fscache_object_is_dying(p)) {
336 ret = -ENOBUFS; 340 ret = -ENOBUFS;
337 spin_unlock(&cookie->parent->lock); 341 spin_unlock(&cookie->parent->lock);
338 goto cant_attach_object; 342 goto cant_attach_object;
@@ -400,7 +404,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
400 object = hlist_entry(cookie->backing_objects.first, 404 object = hlist_entry(cookie->backing_objects.first,
401 struct fscache_object, 405 struct fscache_object,
402 cookie_link); 406 cookie_link);
403 if (object->state < FSCACHE_OBJECT_DYING) 407 if (fscache_object_is_live(object))
404 fscache_raise_event( 408 fscache_raise_event(
405 object, FSCACHE_OBJECT_EV_INVALIDATE); 409 object, FSCACHE_OBJECT_EV_INVALIDATE);
406 } 410 }
@@ -467,9 +471,7 @@ EXPORT_SYMBOL(__fscache_update_cookie);
467 */ 471 */
468void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire) 472void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
469{ 473{
470 struct fscache_cache *cache;
471 struct fscache_object *object; 474 struct fscache_object *object;
472 unsigned long event;
473 475
474 fscache_stat(&fscache_n_relinquishes); 476 fscache_stat(&fscache_n_relinquishes);
475 if (retire) 477 if (retire)
@@ -481,8 +483,11 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
481 return; 483 return;
482 } 484 }
483 485
484 _enter("%p{%s,%p},%d", 486 _enter("%p{%s,%p,%d},%d",
485 cookie, cookie->def->name, cookie->netfs_data, retire); 487 cookie, cookie->def->name, cookie->netfs_data,
488 atomic_read(&cookie->n_active), retire);
489
490 ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
486 491
487 if (atomic_read(&cookie->n_children) != 0) { 492 if (atomic_read(&cookie->n_children) != 0) {
488 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", 493 printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
@@ -490,62 +495,28 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
490 BUG(); 495 BUG();
491 } 496 }
492 497
493 /* wait for the cookie to finish being instantiated (or to fail) */ 498 /* No further netfs-accessing operations on this cookie permitted */
494 if (test_bit(FSCACHE_COOKIE_CREATING, &cookie->flags)) { 499 set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
495 fscache_stat(&fscache_n_relinquishes_waitcrt); 500 if (retire)
496 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_CREATING, 501 set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
497 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
498 }
499
500 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
501 502
502try_again:
503 spin_lock(&cookie->lock); 503 spin_lock(&cookie->lock);
504 504 hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
505 /* break links with all the active objects */ 505 fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
506 while (!hlist_empty(&cookie->backing_objects)) {
507 int n_reads;
508 object = hlist_entry(cookie->backing_objects.first,
509 struct fscache_object,
510 cookie_link);
511
512 _debug("RELEASE OBJ%x", object->debug_id);
513
514 set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
515 n_reads = atomic_read(&object->n_reads);
516 if (n_reads) {
517 int n_ops = object->n_ops;
518 int n_in_progress = object->n_in_progress;
519 spin_unlock(&cookie->lock);
520 printk(KERN_ERR "FS-Cache:"
521 " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
522 cookie->def->name,
523 n_reads, n_ops, n_in_progress);
524 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
525 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
526 printk("Wait finished\n");
527 goto try_again;
528 }
529
530 /* detach each cache object from the object cookie */
531 spin_lock(&object->lock);
532 hlist_del_init(&object->cookie_link);
533
534 cache = object->cache;
535 object->cookie = NULL;
536 fscache_raise_event(object, event);
537 spin_unlock(&object->lock);
538
539 if (atomic_dec_and_test(&cookie->usage))
540 /* the cookie refcount shouldn't be reduced to 0 yet */
541 BUG();
542 } 506 }
507 spin_unlock(&cookie->lock);
543 508
544 /* detach pointers back to the netfs */ 509 /* Wait for cessation of activity requiring access to the netfs (when
510 * n_active reaches 0).
511 */
512 if (!atomic_dec_and_test(&cookie->n_active))
513 wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
514 TASK_UNINTERRUPTIBLE);
515
516 /* Clear pointers back to the netfs */
545 cookie->netfs_data = NULL; 517 cookie->netfs_data = NULL;
546 cookie->def = NULL; 518 cookie->def = NULL;
547 519 BUG_ON(cookie->stores.rnode);
548 spin_unlock(&cookie->lock);
549 520
550 if (cookie->parent) { 521 if (cookie->parent) {
551 ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); 522 ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0);
@@ -553,7 +524,7 @@ try_again:
553 atomic_dec(&cookie->parent->n_children); 524 atomic_dec(&cookie->parent->n_children);
554 } 525 }
555 526
556 /* finally dispose of the cookie */ 527 /* Dispose of the netfs's link to the cookie */
557 ASSERTCMP(atomic_read(&cookie->usage), >, 0); 528 ASSERTCMP(atomic_read(&cookie->usage), >, 0);
558 fscache_cookie_put(cookie); 529 fscache_cookie_put(cookie);
559 530
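
The new n_active counter is a quiesce handshake: it starts at 1, every in-flight netfs operation takes a reference, and relinquishment drops the initial reference and then sleeps until the count reaches zero. A sketch of that pattern (the demo_* names are assumptions; fscache_wait_atomic_t() is the sleeper added by this series):

struct demo_obj {
        atomic_t n_active;      /* starts at 1 (the owner's reference) */
};

static void demo_get(struct demo_obj *obj)
{
        atomic_inc(&obj->n_active);
}

static void demo_put(struct demo_obj *obj)
{
        /* the last put wakes the relinquisher below */
        if (atomic_dec_and_test(&obj->n_active))
                wake_up_atomic_t(&obj->n_active);
}

static void demo_relinquish(struct demo_obj *obj)
{
        /* drop the initial reference, then wait out the remaining users */
        if (!atomic_dec_and_test(&obj->n_active))
                wait_on_atomic_t(&obj->n_active, fscache_wait_atomic_t,
                                 TASK_UNINTERRUPTIBLE);
}
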
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index f5b4baee7352..10a2ade0bdf8 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -55,6 +55,7 @@ static struct fscache_cookie_def fscache_fsdef_index_def = {
55 55
56struct fscache_cookie fscache_fsdef_index = { 56struct fscache_cookie fscache_fsdef_index = {
57 .usage = ATOMIC_INIT(1), 57 .usage = ATOMIC_INIT(1),
58 .n_active = ATOMIC_INIT(1),
58 .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), 59 .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
59 .backing_objects = HLIST_HEAD_INIT, 60 .backing_objects = HLIST_HEAD_INIT,
60 .def = &fscache_fsdef_index_def, 61 .def = &fscache_fsdef_index_def,
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index ee38fef4be51..12d505bedb5c 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -93,14 +93,11 @@ static inline bool fscache_object_congested(void)
93 93
94extern int fscache_wait_bit(void *); 94extern int fscache_wait_bit(void *);
95extern int fscache_wait_bit_interruptible(void *); 95extern int fscache_wait_bit_interruptible(void *);
96extern int fscache_wait_atomic_t(atomic_t *);
96 97
97/* 98/*
98 * object.c 99 * object.c
99 */ 100 */
100extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
101
102extern void fscache_withdrawing_object(struct fscache_cache *,
103 struct fscache_object *);
104extern void fscache_enqueue_object(struct fscache_object *); 101extern void fscache_enqueue_object(struct fscache_object *);
105 102
106/* 103/*
@@ -110,8 +107,10 @@ extern void fscache_enqueue_object(struct fscache_object *);
110extern const struct file_operations fscache_objlist_fops; 107extern const struct file_operations fscache_objlist_fops;
111 108
112extern void fscache_objlist_add(struct fscache_object *); 109extern void fscache_objlist_add(struct fscache_object *);
110extern void fscache_objlist_remove(struct fscache_object *);
113#else 111#else
114#define fscache_objlist_add(object) do {} while(0) 112#define fscache_objlist_add(object) do {} while(0)
113#define fscache_objlist_remove(object) do {} while(0)
115#endif 114#endif
116 115
117/* 116/*
@@ -291,6 +290,10 @@ static inline void fscache_raise_event(struct fscache_object *object,
291 unsigned event) 290 unsigned event)
292{ 291{
293 BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS); 292 BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
293#if 0
294 printk("*** fscache_raise_event(OBJ%d{%lx},%x)\n",
295 object->debug_id, object->event_mask, (1 << event));
296#endif
294 if (!test_and_set_bit(event, &object->events) && 297 if (!test_and_set_bit(event, &object->events) &&
295 test_bit(event, &object->event_mask)) 298 test_bit(event, &object->event_mask))
296 fscache_enqueue_object(object); 299 fscache_enqueue_object(object);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index f9d856773f79..7c27907e650c 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -205,7 +205,6 @@ int fscache_wait_bit(void *flags)
205 schedule(); 205 schedule();
206 return 0; 206 return 0;
207} 207}
208EXPORT_SYMBOL(fscache_wait_bit);
209 208
210/* 209/*
211 * wait_on_bit() sleep function for interruptible waiting 210 * wait_on_bit() sleep function for interruptible waiting
@@ -215,4 +214,12 @@ int fscache_wait_bit_interruptible(void *flags)
215 schedule(); 214 schedule();
216 return signal_pending(current); 215 return signal_pending(current);
217} 216}
218EXPORT_SYMBOL(fscache_wait_bit_interruptible); 217
218/*
219 * wait_on_atomic_t() sleep function for uninterruptible waiting
220 */
221int fscache_wait_atomic_t(atomic_t *p)
222{
223 schedule();
224 return 0;
225}
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index e028b8eb1c40..b1bb6117473a 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -40,6 +40,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
 	/* initialise the primary index cookie */
 	atomic_set(&netfs->primary_index->usage, 1);
 	atomic_set(&netfs->primary_index->n_children, 0);
+	atomic_set(&netfs->primary_index->n_active, 1);
 
 	netfs->primary_index->def	= &fscache_fsdef_netfs_def;
 	netfs->primary_index->parent	= &fscache_fsdef_index;
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index f27c89d17885..e1959efad64f 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -70,13 +70,10 @@ void fscache_objlist_add(struct fscache_object *obj)
 	write_unlock(&fscache_object_list_lock);
 }
 
-/**
- * fscache_object_destroy - Note that a cache object is about to be destroyed
- * @object: The object to be destroyed
- *
- * Note the imminent destruction and deallocation of a cache object record.
+/*
+ * Remove an object from the object list.
  */
-void fscache_object_destroy(struct fscache_object *obj)
+void fscache_objlist_remove(struct fscache_object *obj)
 {
 	write_lock(&fscache_object_list_lock);
 
@@ -85,7 +82,6 @@ void fscache_object_destroy(struct fscache_object *obj)
 
 	write_unlock(&fscache_object_list_lock);
 }
-EXPORT_SYMBOL(fscache_object_destroy);
 
 /*
  * find the object in the tree on or after the specified index
@@ -166,15 +162,14 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 {
 	struct fscache_objlist_data *data = m->private;
 	struct fscache_object *obj = v;
+	struct fscache_cookie *cookie;
 	unsigned long config = data->config;
-	uint16_t keylen, auxlen;
 	char _type[3], *type;
-	bool no_cookie;
 	u8 *buf = data->buf, *p;
 
 	if ((unsigned long) v == 1) {
 		seq_puts(m, "OBJECT   PARENT   STAT CHLDN OPS OOP IPR EX READS"
-			   " EM EV F S"
+			   " EM EV FL S"
 			   " | NETFS_COOKIE_DEF TY FL NETFS_DATA");
 		if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
 			      FSCACHE_OBJLIST_CONFIG_AUX))
@@ -193,7 +188,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
193 188
194 if ((unsigned long) v == 2) { 189 if ((unsigned long) v == 2) {
195 seq_puts(m, "======== ======== ==== ===== === === === == =====" 190 seq_puts(m, "======== ======== ==== ===== === === === == ====="
196 " == == = =" 191 " == == == ="
197 " | ================ == == ================"); 192 " | ================ == == ================");
198 if (config & (FSCACHE_OBJLIST_CONFIG_KEY | 193 if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
199 FSCACHE_OBJLIST_CONFIG_AUX)) 194 FSCACHE_OBJLIST_CONFIG_AUX))
@@ -216,10 +211,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		}						\
 	} while(0)
 
+	cookie = obj->cookie;
 	if (~config) {
-		FILTER(obj->cookie,
+		FILTER(cookie->def,
 		       COOKIE, NOCOOKIE);
-		FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
+		FILTER(fscache_object_is_active(obj) ||
 		       obj->n_ops != 0 ||
 		       obj->n_obj_ops != 0 ||
 		       obj->flags ||
@@ -235,10 +231,10 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 	}
 
 	seq_printf(m,
-		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
+		   "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ",
 		   obj->debug_id,
 		   obj->parent ? obj->parent->debug_id : -1,
-		   fscache_object_states_short[obj->state],
+		   obj->state->short_name,
 		   obj->n_children,
 		   obj->n_ops,
 		   obj->n_obj_ops,
@@ -250,48 +246,40 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 		   obj->flags,
 		   work_busy(&obj->work));
 
-	no_cookie = true;
-	keylen = auxlen = 0;
-	if (obj->cookie) {
-		spin_lock(&obj->lock);
-		if (obj->cookie) {
-			switch (obj->cookie->def->type) {
-			case 0:
-				type = "IX";
-				break;
-			case 1:
-				type = "DT";
-				break;
-			default:
-				sprintf(_type, "%02u",
-					obj->cookie->def->type);
-				type = _type;
-				break;
-			}
-
-			seq_printf(m, "%-16s %s %2lx %16p",
-				   obj->cookie->def->name,
-				   type,
-				   obj->cookie->flags,
-				   obj->cookie->netfs_data);
-
-			if (obj->cookie->def->get_key &&
-			    config & FSCACHE_OBJLIST_CONFIG_KEY)
-				keylen = obj->cookie->def->get_key(
-					obj->cookie->netfs_data,
-					buf, 400);
-
-			if (obj->cookie->def->get_aux &&
-			    config & FSCACHE_OBJLIST_CONFIG_AUX)
-				auxlen = obj->cookie->def->get_aux(
-					obj->cookie->netfs_data,
-					buf + keylen, 512 - keylen);
-
-			no_cookie = false;
-		}
-		spin_unlock(&obj->lock);
-
-		if (!no_cookie && (keylen > 0 || auxlen > 0)) {
+	if (fscache_use_cookie(obj)) {
+		uint16_t keylen = 0, auxlen = 0;
+
+		switch (cookie->def->type) {
+		case 0:
+			type = "IX";
+			break;
+		case 1:
+			type = "DT";
+			break;
+		default:
+			sprintf(_type, "%02u", cookie->def->type);
+			type = _type;
+			break;
+		}
+
+		seq_printf(m, "%-16s %s %2lx %16p",
+			   cookie->def->name,
+			   type,
+			   cookie->flags,
+			   cookie->netfs_data);
+
+		if (cookie->def->get_key &&
+		    config & FSCACHE_OBJLIST_CONFIG_KEY)
+			keylen = cookie->def->get_key(cookie->netfs_data,
+						      buf, 400);
+
+		if (cookie->def->get_aux &&
+		    config & FSCACHE_OBJLIST_CONFIG_AUX)
+			auxlen = cookie->def->get_aux(cookie->netfs_data,
+						      buf + keylen, 512 - keylen);
+		fscache_unuse_cookie(obj);
+
+		if (keylen > 0 || auxlen > 0) {
 			seq_printf(m, " ");
 			for (p = buf; keylen > 0; keylen--)
 				seq_printf(m, "%02x", *p++);
@@ -302,12 +290,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
 				seq_printf(m, "%02x", *p++);
 			}
 		}
-	}
 
-	if (no_cookie)
-		seq_printf(m, "<no_cookie>\n");
-	else
-		seq_printf(m, "\n");
+		seq_printf(m, "\n");
+	} else {
+		seq_printf(m, "<no_netfs>\n");
+	}
 	return 0;
 }
 
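The reader now pins the cookie with fscache_use_cookie()/fscache_unuse_cookie() instead of the old double-checked obj->cookie test under the object lock. A minimal sketch of that guard pattern, assuming the use/unuse helpers added by this series succeed only while the cookie's n_active count is non-zero (the surrounding function is hypothetical):

	static void example_peek_cookie(struct fscache_object *object)
	{
		if (fscache_use_cookie(object)) {
			/* object->cookie cannot be detached while in use */
			printk(KERN_DEBUG "cookie %s\n",
			       object->cookie->def->name);
			fscache_unuse_cookie(object);
		}
	}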
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 50d41c180211..86d75a60b20c 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -15,52 +15,131 @@
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/prefetch.h>
 #include "internal.h"
 
-const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
-	[FSCACHE_OBJECT_INIT]		= "OBJECT_INIT",
-	[FSCACHE_OBJECT_LOOKING_UP]	= "OBJECT_LOOKING_UP",
-	[FSCACHE_OBJECT_CREATING]	= "OBJECT_CREATING",
-	[FSCACHE_OBJECT_AVAILABLE]	= "OBJECT_AVAILABLE",
-	[FSCACHE_OBJECT_ACTIVE]		= "OBJECT_ACTIVE",
-	[FSCACHE_OBJECT_INVALIDATING]	= "OBJECT_INVALIDATING",
-	[FSCACHE_OBJECT_UPDATING]	= "OBJECT_UPDATING",
-	[FSCACHE_OBJECT_DYING]		= "OBJECT_DYING",
-	[FSCACHE_OBJECT_LC_DYING]	= "OBJECT_LC_DYING",
-	[FSCACHE_OBJECT_ABORT_INIT]	= "OBJECT_ABORT_INIT",
-	[FSCACHE_OBJECT_RELEASING]	= "OBJECT_RELEASING",
-	[FSCACHE_OBJECT_RECYCLING]	= "OBJECT_RECYCLING",
-	[FSCACHE_OBJECT_WITHDRAWING]	= "OBJECT_WITHDRAWING",
-	[FSCACHE_OBJECT_DEAD]		= "OBJECT_DEAD",
-};
-EXPORT_SYMBOL(fscache_object_states);
-
-const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
-	[FSCACHE_OBJECT_INIT]		= "INIT",
-	[FSCACHE_OBJECT_LOOKING_UP]	= "LOOK",
-	[FSCACHE_OBJECT_CREATING]	= "CRTN",
-	[FSCACHE_OBJECT_AVAILABLE]	= "AVBL",
-	[FSCACHE_OBJECT_ACTIVE]		= "ACTV",
-	[FSCACHE_OBJECT_INVALIDATING]	= "INVL",
-	[FSCACHE_OBJECT_UPDATING]	= "UPDT",
-	[FSCACHE_OBJECT_DYING]		= "DYNG",
-	[FSCACHE_OBJECT_LC_DYING]	= "LCDY",
-	[FSCACHE_OBJECT_ABORT_INIT]	= "ABTI",
-	[FSCACHE_OBJECT_RELEASING]	= "RELS",
-	[FSCACHE_OBJECT_RECYCLING]	= "RCYC",
-	[FSCACHE_OBJECT_WITHDRAWING]	= "WTHD",
-	[FSCACHE_OBJECT_DEAD]		= "DEAD",
-};
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_drop_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_invalidate_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *, int);
+static const struct fscache_state *fscache_kill_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *, int);
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
+static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
+
+#define __STATE_NAME(n) fscache_osm_##n
+#define STATE(n) (&__STATE_NAME(n))
+
+/*
+ * Define a work state.  Work states are execution states.  No event processing
+ * is performed by them.  The function attached to a work state returns a
+ * pointer indicating the next state to which the state machine should
+ * transition.  Returning NO_TRANSIT repeats the current state, but goes back
+ * to the scheduler first.
+ */
+#define WORK_STATE(n, sn, f) \
+	const struct fscache_state __STATE_NAME(n) = {		\
+		.name = #n,					\
+		.short_name = sn,				\
+		.work = f					\
+	}
+
+/*
+ * Returns from work states.
+ */
+#define transit_to(state) ({ prefetch(&STATE(state)->work); STATE(state); })
+
+#define NO_TRANSIT ((struct fscache_state *)NULL)
+
+/*
+ * Define a wait state.  Wait states are event processing states.  No execution
+ * is performed by them.  Wait states are just tables of "if event X occurs,
+ * clear it and transition to state Y".  The dispatcher returns to the
+ * scheduler if none of the events in which the wait state has an interest are
+ * currently pending.
+ */
+#define WAIT_STATE(n, sn, ...) \
+	const struct fscache_state __STATE_NAME(n) = {		\
+		.name = #n,					\
+		.short_name = sn,				\
+		.work = NULL,					\
+		.transitions = { __VA_ARGS__, { 0, NULL } }	\
+	}
+
+#define TRANSIT_TO(state, emask) \
+	{ .events = (emask), .transit_to = STATE(state) }
+
+/*
+ * The object state machine.
+ */
+static WORK_STATE(INIT_OBJECT,		"INIT", fscache_initialise_object);
+static WORK_STATE(PARENT_READY,		"PRDY", fscache_parent_ready);
+static WORK_STATE(ABORT_INIT,		"ABRT", fscache_abort_initialisation);
+static WORK_STATE(LOOK_UP_OBJECT,	"LOOK", fscache_look_up_object);
+static WORK_STATE(CREATE_OBJECT,	"CRTO", fscache_look_up_object);
+static WORK_STATE(OBJECT_AVAILABLE,	"AVBL", fscache_object_available);
+static WORK_STATE(JUMPSTART_DEPS,	"JUMP", fscache_jumpstart_dependents);
+
+static WORK_STATE(INVALIDATE_OBJECT,	"INVL", fscache_invalidate_object);
+static WORK_STATE(UPDATE_OBJECT,	"UPDT", fscache_update_object);
+
+static WORK_STATE(LOOKUP_FAILURE,	"LCFL", fscache_lookup_failure);
+static WORK_STATE(KILL_OBJECT,		"KILL", fscache_kill_object);
+static WORK_STATE(KILL_DEPENDENTS,	"KDEP", fscache_kill_dependents);
+static WORK_STATE(DROP_OBJECT,		"DROP", fscache_drop_object);
+static WORK_STATE(OBJECT_DEAD,		"DEAD", (void*)2UL);
+
+static WAIT_STATE(WAIT_FOR_INIT,	"?INI",
+		  TRANSIT_TO(INIT_OBJECT,	1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_PARENT,	"?PRN",
+		  TRANSIT_TO(PARENT_READY,	1 << FSCACHE_OBJECT_EV_PARENT_READY));
+
+static WAIT_STATE(WAIT_FOR_CMD,		"?CMD",
+		  TRANSIT_TO(INVALIDATE_OBJECT,	1 << FSCACHE_OBJECT_EV_INVALIDATE),
+		  TRANSIT_TO(UPDATE_OBJECT,	1 << FSCACHE_OBJECT_EV_UPDATE),
+		  TRANSIT_TO(JUMPSTART_DEPS,	1 << FSCACHE_OBJECT_EV_NEW_CHILD));
+
+static WAIT_STATE(WAIT_FOR_CLEARANCE,	"?CLR",
+		  TRANSIT_TO(KILL_OBJECT,	1 << FSCACHE_OBJECT_EV_CLEARED));
+
+/*
+ * Out-of-band event transition tables.  These are for handling unexpected
+ * events, such as an I/O error.  If an OOB event occurs, the state machine
+ * clears and disables the event and forces a transition to the nominated work
+ * state (a currently executing work state will complete first).
+ *
+ * In such a situation, object->state remembers the state the machine should
+ * have been in/gone to and returning NO_TRANSIT returns to that.
+ */
+static const struct fscache_transition fscache_osm_init_oob[] = {
+	   TRANSIT_TO(ABORT_INIT,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_lookup_oob[] = {
+	   TRANSIT_TO(LOOKUP_FAILURE,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
+};
+
+static const struct fscache_transition fscache_osm_run_oob[] = {
+	   TRANSIT_TO(KILL_OBJECT,
+		      (1 << FSCACHE_OBJECT_EV_ERROR) |
+		      (1 << FSCACHE_OBJECT_EV_KILL)),
+	   { 0, NULL }
+};
 
 static int fscache_get_object(struct fscache_object *);
 static void fscache_put_object(struct fscache_object *);
-static void fscache_initialise_object(struct fscache_object *);
-static void fscache_lookup_object(struct fscache_object *);
-static void fscache_object_available(struct fscache_object *);
-static void fscache_invalidate_object(struct fscache_object *);
-static void fscache_release_object(struct fscache_object *);
-static void fscache_withdraw_object(struct fscache_object *);
-static void fscache_enqueue_dependents(struct fscache_object *);
+static bool fscache_enqueue_dependents(struct fscache_object *, int);
 static void fscache_dequeue_object(struct fscache_object *);
 
 /*
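
Taken together, the macros mean a state is just a named function and a transition is a tail-return. A sketch of a hypothetical work state under these macros (EXAMPLE_STATE and its handler are not part of the patch; real handlers such as fscache_initialise_object() follow below, and genuine errors are routed through the OOB tables rather than tested in-line like this):

	static const struct fscache_state *example_work(struct fscache_object *object,
							int event)
	{
		if (test_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags))
			return transit_to(WAIT_FOR_CMD);  /* tail-transition */
		return NO_TRANSIT;  /* stay in this state; back to scheduler */
	}
	static WORK_STATE(EXAMPLE_STATE, "EXMP", example_work);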
@@ -75,295 +154,116 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
 	       object->debug_id, parent->debug_id, parent->n_ops);
 
 	spin_lock_nested(&parent->lock, 1);
-	parent->n_ops--;
 	parent->n_obj_ops--;
+	parent->n_ops--;
 	if (parent->n_ops == 0)
 		fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
 	spin_unlock(&parent->lock);
 }
 
 /*
- * Notify netfs of invalidation completion.
+ * Object state machine dispatcher.
  */
-static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
+static void fscache_object_sm_dispatcher(struct fscache_object *object)
 {
-	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
-		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
-}
-
-/*
- * process events that have been sent to an object's state machine
- * - initiates parent lookup
- * - does object lookup
- * - does object creation
- * - does object recycling and retirement
- * - does object withdrawal
- */
-static void fscache_object_state_machine(struct fscache_object *object)
-{
-	enum fscache_object_state new_state;
-	struct fscache_cookie *cookie;
-	int event;
+	const struct fscache_transition *t;
+	const struct fscache_state *state, *new_state;
+	unsigned long events, event_mask;
+	int event = -1;
 
 	ASSERT(object != NULL);
 
 	_enter("{OBJ%x,%s,%lx}",
-	       object->debug_id, fscache_object_states[object->state],
-	       object->events);
-
-	switch (object->state) {
-		/* wait for the parent object to become ready */
-	case FSCACHE_OBJECT_INIT:
-		object->event_mask =
-			FSCACHE_OBJECT_EVENTS_MASK &
-			~(1 << FSCACHE_OBJECT_EV_CLEARED);
-		fscache_initialise_object(object);
-		goto done;
-
-		/* look up the object metadata on disk */
-	case FSCACHE_OBJECT_LOOKING_UP:
-		fscache_lookup_object(object);
-		goto lookup_transit;
-
-		/* create the object metadata on disk */
-	case FSCACHE_OBJECT_CREATING:
-		fscache_lookup_object(object);
-		goto lookup_transit;
-
-		/* handle an object becoming available; start pending
-		 * operations and queue dependent operations for processing */
-	case FSCACHE_OBJECT_AVAILABLE:
-		fscache_object_available(object);
-		goto active_transit;
-
-		/* normal running state */
-	case FSCACHE_OBJECT_ACTIVE:
-		goto active_transit;
-
-		/* Invalidate an object on disk */
-	case FSCACHE_OBJECT_INVALIDATING:
-		clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
-		fscache_stat(&fscache_n_invalidates_run);
-		fscache_stat(&fscache_n_cop_invalidate_object);
-		fscache_invalidate_object(object);
-		fscache_stat_d(&fscache_n_cop_invalidate_object);
-		fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
-		goto active_transit;
-
-		/* update the object metadata on disk */
-	case FSCACHE_OBJECT_UPDATING:
-		clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
-		fscache_stat(&fscache_n_updates_run);
-		fscache_stat(&fscache_n_cop_update_object);
-		object->cache->ops->update_object(object);
-		fscache_stat_d(&fscache_n_cop_update_object);
-		goto active_transit;
-
-		/* handle an object dying during lookup or creation */
-	case FSCACHE_OBJECT_LC_DYING:
-		object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
-		fscache_stat(&fscache_n_cop_lookup_complete);
-		object->cache->ops->lookup_complete(object);
-		fscache_stat_d(&fscache_n_cop_lookup_complete);
-
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_DYING;
-		cookie = object->cookie;
-		if (cookie) {
-			if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
-					       &cookie->flags))
-				wake_up_bit(&cookie->flags,
-					    FSCACHE_COOKIE_LOOKING_UP);
-			if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
-					       &cookie->flags))
-				wake_up_bit(&cookie->flags,
-					    FSCACHE_COOKIE_CREATING);
-		}
-		spin_unlock(&object->lock);
-
-		fscache_done_parent_op(object);
-
-		/* wait for completion of all active operations on this object
-		 * and the death of all child objects of this object */
-	case FSCACHE_OBJECT_DYING:
-	dying:
-		clear_bit(FSCACHE_OBJECT_EV_CLEARED, &object->events);
-		spin_lock(&object->lock);
-		_debug("dying OBJ%x {%d,%d}",
-		       object->debug_id, object->n_ops, object->n_children);
-		if (object->n_ops == 0 && object->n_children == 0) {
-			object->event_mask &=
-				~(1 << FSCACHE_OBJECT_EV_CLEARED);
-			object->event_mask |=
-				(1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-				(1 << FSCACHE_OBJECT_EV_RETIRE) |
-				(1 << FSCACHE_OBJECT_EV_RELEASE) |
-				(1 << FSCACHE_OBJECT_EV_ERROR);
-		} else {
-			object->event_mask &=
-				~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-				  (1 << FSCACHE_OBJECT_EV_RETIRE) |
-				  (1 << FSCACHE_OBJECT_EV_RELEASE) |
-				  (1 << FSCACHE_OBJECT_EV_ERROR));
-			object->event_mask |=
-				1 << FSCACHE_OBJECT_EV_CLEARED;
-		}
-		spin_unlock(&object->lock);
-		fscache_enqueue_dependents(object);
-		fscache_start_operations(object);
-		goto terminal_transit;
-
-		/* handle an abort during initialisation */
-	case FSCACHE_OBJECT_ABORT_INIT:
-		_debug("handle abort init %lx", object->events);
-		object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
-
-		spin_lock(&object->lock);
-		fscache_dequeue_object(object);
-
-		object->state = FSCACHE_OBJECT_DYING;
-		if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
-				       &object->cookie->flags))
-			wake_up_bit(&object->cookie->flags,
-				    FSCACHE_COOKIE_CREATING);
-		spin_unlock(&object->lock);
-		goto dying;
-
-		/* handle the netfs releasing an object and possibly marking it
-		 * obsolete too */
-	case FSCACHE_OBJECT_RELEASING:
-	case FSCACHE_OBJECT_RECYCLING:
-		object->event_mask &=
-			~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-			  (1 << FSCACHE_OBJECT_EV_RETIRE) |
-			  (1 << FSCACHE_OBJECT_EV_RELEASE) |
-			  (1 << FSCACHE_OBJECT_EV_ERROR));
-		fscache_release_object(object);
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_DEAD;
-		spin_unlock(&object->lock);
-		fscache_stat(&fscache_n_object_dead);
-		goto terminal_transit;
-
-		/* handle the parent cache of this object being withdrawn from
-		 * active service */
-	case FSCACHE_OBJECT_WITHDRAWING:
-		object->event_mask &=
-			~((1 << FSCACHE_OBJECT_EV_WITHDRAW) |
-			  (1 << FSCACHE_OBJECT_EV_RETIRE) |
-			  (1 << FSCACHE_OBJECT_EV_RELEASE) |
-			  (1 << FSCACHE_OBJECT_EV_ERROR));
-		fscache_withdraw_object(object);
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_DEAD;
-		spin_unlock(&object->lock);
-		fscache_stat(&fscache_n_object_dead);
-		goto terminal_transit;
-
-		/* complain about the object being woken up once it is
-		 * deceased */
-	case FSCACHE_OBJECT_DEAD:
-		printk(KERN_ERR "FS-Cache:"
-		       " Unexpected event in dead state %lx\n",
-		       object->events & object->event_mask);
-		BUG();
-
-	default:
-		printk(KERN_ERR "FS-Cache: Unknown object state %u\n",
-		       object->state);
-		BUG();
-	}
-
-	/* determine the transition from a lookup state */
-lookup_transit:
-	event = fls(object->events & object->event_mask) - 1;
-	switch (event) {
-	case FSCACHE_OBJECT_EV_WITHDRAW:
-	case FSCACHE_OBJECT_EV_RETIRE:
-	case FSCACHE_OBJECT_EV_RELEASE:
-	case FSCACHE_OBJECT_EV_ERROR:
-		new_state = FSCACHE_OBJECT_LC_DYING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_INVALIDATE:
-		new_state = FSCACHE_OBJECT_INVALIDATING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_REQUEUE:
-		goto done;
-	case -1:
-		goto done; /* sleep until event */
-	default:
-		goto unsupported_event;
-	}
-
-	/* determine the transition from an active state */
-active_transit:
-	event = fls(object->events & object->event_mask) - 1;
-	switch (event) {
-	case FSCACHE_OBJECT_EV_WITHDRAW:
-	case FSCACHE_OBJECT_EV_RETIRE:
-	case FSCACHE_OBJECT_EV_RELEASE:
-	case FSCACHE_OBJECT_EV_ERROR:
-		new_state = FSCACHE_OBJECT_DYING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_INVALIDATE:
-		new_state = FSCACHE_OBJECT_INVALIDATING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_UPDATE:
-		new_state = FSCACHE_OBJECT_UPDATING;
-		goto change_state;
-	case -1:
-		new_state = FSCACHE_OBJECT_ACTIVE;
-		goto change_state; /* sleep until event */
-	default:
-		goto unsupported_event;
-	}
-
-	/* determine the transition from a terminal state */
-terminal_transit:
-	event = fls(object->events & object->event_mask) - 1;
-	switch (event) {
-	case FSCACHE_OBJECT_EV_WITHDRAW:
-		new_state = FSCACHE_OBJECT_WITHDRAWING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_RETIRE:
-		new_state = FSCACHE_OBJECT_RECYCLING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_RELEASE:
-		new_state = FSCACHE_OBJECT_RELEASING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_ERROR:
-		new_state = FSCACHE_OBJECT_WITHDRAWING;
-		goto change_state;
-	case FSCACHE_OBJECT_EV_CLEARED:
-		new_state = FSCACHE_OBJECT_DYING;
-		goto change_state;
-	case -1:
-		goto done; /* sleep until event */
-	default:
-		goto unsupported_event;
-	}
-
-change_state:
-	spin_lock(&object->lock);
-	object->state = new_state;
-	spin_unlock(&object->lock);
-
-done:
-	_leave(" [->%s]", fscache_object_states[object->state]);
-	return;
-
-unsupported_event:
-	printk(KERN_ERR "FS-Cache:"
-	       " Unsupported event %d [%lx/%lx] in state %s\n",
-	       event, object->events, object->event_mask,
-	       fscache_object_states[object->state]);
-	BUG();
+	       object->debug_id, object->state->name, object->events);
+
+	event_mask = object->event_mask;
+restart:
+	object->event_mask = 0; /* Mask normal event handling */
+	state = object->state;
+restart_masked:
+	events = object->events;
+
+	/* Handle any out-of-band events (typically an error) */
+	if (events & object->oob_event_mask) {
+		_debug("{OBJ%x} oob %lx",
+		       object->debug_id, events & object->oob_event_mask);
+		for (t = object->oob_table; t->events; t++) {
+			if (events & t->events) {
+				state = t->transit_to;
+				ASSERT(state->work != NULL);
+				event = fls(events & t->events) - 1;
+				__clear_bit(event, &object->oob_event_mask);
+				clear_bit(event, &object->events);
+				goto execute_work_state;
+			}
+		}
+	}
+
+	/* Wait states are just transition tables */
+	if (!state->work) {
+		if (events & event_mask) {
+			for (t = state->transitions; t->events; t++) {
+				if (events & t->events) {
+					new_state = t->transit_to;
+					event = fls(events & t->events) - 1;
+					clear_bit(event, &object->events);
+					_debug("{OBJ%x} ev %d: %s -> %s",
+					       object->debug_id, event,
+					       state->name, new_state->name);
+					object->state = state = new_state;
+					goto execute_work_state;
+				}
+			}
+
+			/* The event mask didn't include all the tabled bits */
+			BUG();
+		}
+		/* Randomly woke up */
+		goto unmask_events;
+	}
+
+execute_work_state:
+	_debug("{OBJ%x} exec %s", object->debug_id, state->name);
+
+	new_state = state->work(object, event);
+	event = -1;
+	if (new_state == NO_TRANSIT) {
+		_debug("{OBJ%x} %s notrans", object->debug_id, state->name);
+		fscache_enqueue_object(object);
+		event_mask = object->oob_event_mask;
+		goto unmask_events;
+	}
+
+	_debug("{OBJ%x} %s -> %s",
+	       object->debug_id, state->name, new_state->name);
+	object->state = state = new_state;
+
+	if (state->work) {
+		if (unlikely(state->work == ((void *)2UL))) {
+			_leave(" [dead]");
+			return;
+		}
+		goto restart_masked;
+	}
+
+	/* Transited to wait state */
+	event_mask = object->oob_event_mask;
+	for (t = state->transitions; t->events; t++)
+		event_mask |= t->events;
+
+unmask_events:
+	object->event_mask = event_mask;
+	smp_mb();
+	events = object->events;
+	if (events & event_mask)
+		goto restart;
+	_leave(" [msk %lx]", event_mask);
 }
 
 /*
  * execute an object
  */
-void fscache_object_work_func(struct work_struct *work)
+static void fscache_object_work_func(struct work_struct *work)
 {
 	struct fscache_object *object =
 		container_of(work, struct fscache_object, work);
@@ -372,14 +272,70 @@ void fscache_object_work_func(struct work_struct *work)
 	_enter("{OBJ%x}", object->debug_id);
 
 	start = jiffies;
-	fscache_object_state_machine(object);
+	fscache_object_sm_dispatcher(object);
 	fscache_hist(fscache_objs_histogram, start);
-	if (object->events & object->event_mask)
-		fscache_enqueue_object(object);
-	clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
 	fscache_put_object(object);
 }
-EXPORT_SYMBOL(fscache_object_work_func);
+
+/**
+ * fscache_object_init - Initialise a cache object description
+ * @object: Object description
+ * @cookie: Cookie object will be attached to
+ * @cache: Cache in which backing object will be found
+ *
+ * Initialise a cache object description to its basic values.
+ *
+ * See Documentation/filesystems/caching/backend-api.txt for a complete
+ * description.
+ */
+void fscache_object_init(struct fscache_object *object,
+			 struct fscache_cookie *cookie,
+			 struct fscache_cache *cache)
+{
+	const struct fscache_transition *t;
+
+	atomic_inc(&cache->object_count);
+
+	object->state = STATE(WAIT_FOR_INIT);
+	object->oob_table = fscache_osm_init_oob;
+	object->flags = 1 << FSCACHE_OBJECT_IS_LIVE;
+	spin_lock_init(&object->lock);
+	INIT_LIST_HEAD(&object->cache_link);
+	INIT_HLIST_NODE(&object->cookie_link);
+	INIT_WORK(&object->work, fscache_object_work_func);
+	INIT_LIST_HEAD(&object->dependents);
+	INIT_LIST_HEAD(&object->dep_link);
+	INIT_LIST_HEAD(&object->pending_ops);
+	object->n_children = 0;
+	object->n_ops = object->n_in_progress = object->n_exclusive = 0;
+	object->events = 0;
+	object->store_limit = 0;
+	object->store_limit_l = 0;
+	object->cache = cache;
+	object->cookie = cookie;
+	object->parent = NULL;
+
+	object->oob_event_mask = 0;
+	for (t = object->oob_table; t->events; t++)
+		object->oob_event_mask |= t->events;
+	object->event_mask = object->oob_event_mask;
+	for (t = object->state->transitions; t->events; t++)
+		object->event_mask |= t->events;
+}
+EXPORT_SYMBOL(fscache_object_init);
+
+/*
+ * Abort object initialisation before we start it.
+ */
+static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object,
+								int event)
+{
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_event_mask = 0;
+	fscache_dequeue_object(object);
+	return transit_to(KILL_OBJECT);
+}
 
 /*
  * initialise an object
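A backend embeds struct fscache_object in its own object type and runs this initialiser from its ->alloc_object() op, leaving the machine parked in WAIT_FOR_INIT. A sketch under that assumption (example_object and the function are hypothetical; cachefiles_alloc_object() is the in-tree analogue):

	struct example_object {
		struct fscache_object	fscache;
		atomic_t		usage;
		/* ... backend-private state ... */
	};

	static struct fscache_object *example_alloc_object(struct fscache_cache *cache,
							   struct fscache_cookie *cookie)
	{
		struct example_object *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

		if (!obj)
			return NULL;
		atomic_set(&obj->usage, 1);
		fscache_object_init(&obj->fscache, cookie, cache);
		return &obj->fscache;	/* parked in WAIT_FOR_INIT */
	}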
@@ -387,130 +343,136 @@ EXPORT_SYMBOL(fscache_object_work_func);
  *   immediately to do a creation
  * - we may need to start the process of creating a parent and we need to wait
  *   for the parent's lookup and creation to complete if it's not there yet
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- *   leaf-most cookies of the object and all its children
  */
-static void fscache_initialise_object(struct fscache_object *object)
+static const struct fscache_state *fscache_initialise_object(struct fscache_object *object,
+							     int event)
 {
 	struct fscache_object *parent;
+	bool success;
 
-	_enter("");
-	ASSERT(object->cookie != NULL);
-	ASSERT(object->cookie->parent != NULL);
-
-	if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
-			      (1 << FSCACHE_OBJECT_EV_RELEASE) |
-			      (1 << FSCACHE_OBJECT_EV_RETIRE) |
-			      (1 << FSCACHE_OBJECT_EV_WITHDRAW))) {
-		_debug("abort init %lx", object->events);
-		spin_lock(&object->lock);
-		object->state = FSCACHE_OBJECT_ABORT_INIT;
-		spin_unlock(&object->lock);
-		return;
-	}
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	spin_lock(&object->cookie->lock);
-	spin_lock_nested(&object->cookie->parent->lock, 1);
+	ASSERT(list_empty(&object->dep_link));
 
 	parent = object->parent;
 	if (!parent) {
-		_debug("no parent");
-		set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
-	} else {
-		spin_lock(&object->lock);
-		spin_lock_nested(&parent->lock, 1);
-		_debug("parent %s", fscache_object_states[parent->state]);
-
-		if (parent->state >= FSCACHE_OBJECT_DYING) {
-			_debug("bad parent");
-			set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
-		} else if (parent->state < FSCACHE_OBJECT_AVAILABLE) {
-			_debug("wait");
-
-			/* we may get woken up in this state by child objects
-			 * binding on to us, so we need to make sure we don't
-			 * add ourself to the list multiple times */
-			if (list_empty(&object->dep_link)) {
-				fscache_stat(&fscache_n_cop_grab_object);
-				object->cache->ops->grab_object(object);
-				fscache_stat_d(&fscache_n_cop_grab_object);
-				list_add(&object->dep_link,
-					 &parent->dependents);
-
-				/* fscache_acquire_non_index_cookie() uses this
-				 * to wake the chain up */
-				if (parent->state == FSCACHE_OBJECT_INIT)
-					fscache_enqueue_object(parent);
-			}
-		} else {
-			_debug("go");
-			parent->n_ops++;
-			parent->n_obj_ops++;
-			object->lookup_jif = jiffies;
-			object->state = FSCACHE_OBJECT_LOOKING_UP;
-			set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-		}
-
-		spin_unlock(&parent->lock);
-		spin_unlock(&object->lock);
-	}
-
-	spin_unlock(&object->cookie->parent->lock);
-	spin_unlock(&object->cookie->lock);
+		_leave(" [no parent]");
+		return transit_to(DROP_OBJECT);
+	}
+
+	_debug("parent: %s of:%lx", parent->state->name, parent->flags);
+
+	if (fscache_object_is_dying(parent)) {
+		_leave(" [bad parent]");
+		return transit_to(DROP_OBJECT);
+	}
+
+	if (fscache_object_is_available(parent)) {
+		_leave(" [ready]");
+		return transit_to(PARENT_READY);
+	}
+
+	_debug("wait");
+
+	spin_lock(&parent->lock);
+	fscache_stat(&fscache_n_cop_grab_object);
+	success = false;
+	if (fscache_object_is_live(parent) &&
+	    object->cache->ops->grab_object(object)) {
+		list_add(&object->dep_link, &parent->dependents);
+		success = true;
+	}
+	fscache_stat_d(&fscache_n_cop_grab_object);
+	spin_unlock(&parent->lock);
+	if (!success) {
+		_leave(" [grab failed]");
+		return transit_to(DROP_OBJECT);
+	}
+
+	/* fscache_acquire_non_index_cookie() uses this
+	 * to wake the chain up */
+	fscache_raise_event(parent, FSCACHE_OBJECT_EV_NEW_CHILD);
+	_leave(" [wait]");
+	return transit_to(WAIT_FOR_PARENT);
+}
+
+/*
+ * Once the parent object is ready, we should kick off our lookup op.
+ */
+static const struct fscache_state *fscache_parent_ready(struct fscache_object *object,
+							int event)
+{
+	struct fscache_object *parent = object->parent;
+
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	ASSERT(parent != NULL);
+
+	spin_lock(&parent->lock);
+	parent->n_ops++;
+	parent->n_obj_ops++;
+	object->lookup_jif = jiffies;
+	spin_unlock(&parent->lock);
 
 	_leave("");
+	return transit_to(LOOK_UP_OBJECT);
 }
 
 /*
  * look an object up in the cache from which it was allocated
  * - we hold an "access lock" on the parent object, so the parent object cannot
  *   be withdrawn by either party till we've finished
- * - an object's cookie is pinned until we clear FSCACHE_COOKIE_CREATING on the
- *   leaf-most cookies of the object and all its children
  */
-static void fscache_lookup_object(struct fscache_object *object)
+static const struct fscache_state *fscache_look_up_object(struct fscache_object *object,
+							   int event)
 {
 	struct fscache_cookie *cookie = object->cookie;
-	struct fscache_object *parent;
+	struct fscache_object *parent = object->parent;
 	int ret;
 
-	_enter("");
+	_enter("{OBJ%x},%d", object->debug_id, event);
+
+	object->oob_table = fscache_osm_lookup_oob;
 
-	parent = object->parent;
 	ASSERT(parent != NULL);
 	ASSERTCMP(parent->n_ops, >, 0);
 	ASSERTCMP(parent->n_obj_ops, >, 0);
 
 	/* make sure the parent is still available */
-	ASSERTCMP(parent->state, >=, FSCACHE_OBJECT_AVAILABLE);
+	ASSERT(fscache_object_is_available(parent));
 
-	if (parent->state >= FSCACHE_OBJECT_DYING ||
-	    test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
-		_debug("unavailable");
-		set_bit(FSCACHE_OBJECT_EV_WITHDRAW, &object->events);
-		_leave("");
-		return;
+	if (fscache_object_is_dying(parent) ||
+	    test_bit(FSCACHE_IOERROR, &object->cache->flags) ||
+	    !fscache_use_cookie(object)) {
+		_leave(" [unavailable]");
+		return transit_to(LOOKUP_FAILURE);
 	}
 
-	_debug("LOOKUP \"%s/%s\" in \"%s\"",
-	       parent->cookie->def->name, cookie->def->name,
-	       object->cache->tag->name);
+	_debug("LOOKUP \"%s\" in \"%s\"",
+	       cookie->def->name, object->cache->tag->name);
 
 	fscache_stat(&fscache_n_object_lookups);
 	fscache_stat(&fscache_n_cop_lookup_object);
 	ret = object->cache->ops->lookup_object(object);
 	fscache_stat_d(&fscache_n_cop_lookup_object);
 
-	if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
-		set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+	fscache_unuse_cookie(object);
 
 	if (ret == -ETIMEDOUT) {
 		/* probably stuck behind another object, so move this one to
 		 * the back of the queue */
 		fscache_stat(&fscache_n_object_lookups_timed_out);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
+		_leave(" [timeout]");
+		return NO_TRANSIT;
 	}
 
-	_leave("");
+	if (ret < 0) {
+		_leave(" [error]");
+		return transit_to(LOOKUP_FAILURE);
+	}
+
+	_leave(" [ok]");
+	return transit_to(OBJECT_AVAILABLE);
 }
 
 /**
@@ -524,32 +486,20 @@ void fscache_object_lookup_negative(struct fscache_object *object)
 {
 	struct fscache_cookie *cookie = object->cookie;
 
-	_enter("{OBJ%x,%s}",
-	       object->debug_id, fscache_object_states[object->state]);
+	_enter("{OBJ%x,%s}", object->debug_id, object->state->name);
 
-	spin_lock(&object->lock);
-	if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+	if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
 		fscache_stat(&fscache_n_object_lookups_negative);
 
-		/* transit here to allow write requests to begin stacking up
-		 * and read requests to begin returning ENODATA */
-		object->state = FSCACHE_OBJECT_CREATING;
-		spin_unlock(&object->lock);
-
-		set_bit(FSCACHE_COOKIE_PENDING_FILL, &cookie->flags);
+		/* Allow write requests to begin stacking up and read requests to begin
+		 * returning ENODATA.
+		 */
 		set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
 
 		_debug("wake up lookup %p", &cookie->flags);
-		smp_mb__before_clear_bit();
-		clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
-		smp_mb__after_clear_bit();
+		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
 		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-	} else {
-		ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
-		spin_unlock(&object->lock);
 	}
-
 	_leave("");
 }
 EXPORT_SYMBOL(fscache_object_lookup_negative);
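
The clear_bit_unlock()/wake_up_bit() pair above releases waiters that went to sleep on the LOOKING_UP bit with the fscache_wait_bit* sleep functions from main.c. A sketch of the waiter side, assuming the wait_on_bit() calling convention of this kernel (compare fscache_wait_for_deferred_lookup() in page.c):

	static int example_wait_for_lookup(struct fscache_cookie *cookie)
	{
		/* Returns non-zero if interrupted by a signal */
		return wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
				   fscache_wait_bit_interruptible,
				   TASK_INTERRUPTIBLE);
	}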
@@ -568,38 +518,26 @@ void fscache_obtained_object(struct fscache_object *object)
 {
 	struct fscache_cookie *cookie = object->cookie;
 
-	_enter("{OBJ%x,%s}",
-	       object->debug_id, fscache_object_states[object->state]);
+	_enter("{OBJ%x,%s}", object->debug_id, object->state->name);
 
 	/* if we were still looking up, then we must have a positive lookup
 	 * result, in which case there may be data available */
-	spin_lock(&object->lock);
-	if (object->state == FSCACHE_OBJECT_LOOKING_UP) {
+	if (!test_and_set_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
 		fscache_stat(&fscache_n_object_lookups_positive);
 
-		clear_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+		/* We do (presumably) have data */
+		clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
 
-		object->state = FSCACHE_OBJECT_AVAILABLE;
-		spin_unlock(&object->lock);
-
-		smp_mb__before_clear_bit();
-		clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
-		smp_mb__after_clear_bit();
+		/* Allow write requests to begin stacking up and read requests
+		 * to begin shovelling data.
+		 */
+		clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
 		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
 	} else {
-		ASSERTCMP(object->state, ==, FSCACHE_OBJECT_CREATING);
 		fscache_stat(&fscache_n_object_created);
-
-		object->state = FSCACHE_OBJECT_AVAILABLE;
-		spin_unlock(&object->lock);
-		set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
-		smp_wmb();
 	}
 
-	if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &cookie->flags))
-		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_CREATING);
-
+	set_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
 	_leave("");
 }
 EXPORT_SYMBOL(fscache_obtained_object);
@@ -607,15 +545,14 @@ EXPORT_SYMBOL(fscache_obtained_object);
 /*
  * handle an object that has just become available
  */
-static void fscache_object_available(struct fscache_object *object)
+static const struct fscache_state *fscache_object_available(struct fscache_object *object,
+							    int event)
 {
-	_enter("{OBJ%x}", object->debug_id);
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	spin_lock(&object->lock);
+	object->oob_table = fscache_osm_run_oob;
 
-	if (object->cookie &&
-	    test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
-		wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
+	spin_lock(&object->lock);
 
 	fscache_done_parent_op(object);
 	if (object->n_in_progress == 0) {
@@ -631,130 +568,158 @@ static void fscache_object_available(struct fscache_object *object)
 	fscache_stat(&fscache_n_cop_lookup_complete);
 	object->cache->ops->lookup_complete(object);
 	fscache_stat_d(&fscache_n_cop_lookup_complete);
-	fscache_enqueue_dependents(object);
 
 	fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
 	fscache_stat(&fscache_n_object_avail);
 
 	_leave("");
+	return transit_to(JUMPSTART_DEPS);
 }
 
 /*
- * drop an object's attachments
+ * Wake up this object's dependent objects now that we've become available.
  */
-static void fscache_drop_object(struct fscache_object *object)
+static const struct fscache_state *fscache_jumpstart_dependents(struct fscache_object *object,
+								int event)
 {
-	struct fscache_object *parent = object->parent;
-	struct fscache_cache *cache = object->cache;
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	_enter("{OBJ%x,%d}", object->debug_id, object->n_children);
+	if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_PARENT_READY))
+		return NO_TRANSIT; /* Not finished; requeue */
+	return transit_to(WAIT_FOR_CMD);
+}
 
-	ASSERTCMP(object->cookie, ==, NULL);
-	ASSERT(hlist_unhashed(&object->cookie_link));
+/*
+ * Handle lookup or creation failure.
+ */
+static const struct fscache_state *fscache_lookup_failure(struct fscache_object *object,
+							  int event)
+{
+	struct fscache_cookie *cookie;
 
-	spin_lock(&cache->object_list_lock);
-	list_del_init(&object->cache_link);
-	spin_unlock(&cache->object_list_lock);
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	fscache_stat(&fscache_n_cop_drop_object);
-	cache->ops->drop_object(object);
-	fscache_stat_d(&fscache_n_cop_drop_object);
+	object->oob_event_mask = 0;
 
-	if (parent) {
-		_debug("release parent OBJ%x {%d}",
-		       parent->debug_id, parent->n_children);
+	fscache_stat(&fscache_n_cop_lookup_complete);
+	object->cache->ops->lookup_complete(object);
+	fscache_stat_d(&fscache_n_cop_lookup_complete);
 
-		spin_lock(&parent->lock);
-		parent->n_children--;
-		if (parent->n_children == 0)
-			fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
-		spin_unlock(&parent->lock);
-		object->parent = NULL;
+	cookie = object->cookie;
+	set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
+	if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags))
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP);
+
+	fscache_done_parent_op(object);
+	return transit_to(KILL_OBJECT);
+}
+
+/*
+ * Wait for completion of all active operations on this object and the death of
+ * all child objects of this object.
+ */
+static const struct fscache_state *fscache_kill_object(struct fscache_object *object,
+						       int event)
+{
+	_enter("{OBJ%x,%d,%d},%d",
+	       object->debug_id, object->n_ops, object->n_children, event);
+
+	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
+	object->oob_event_mask = 0;
+
+	if (list_empty(&object->dependents) &&
+	    object->n_ops == 0 &&
+	    object->n_children == 0)
+		return transit_to(DROP_OBJECT);
+
+	if (object->n_in_progress == 0) {
+		spin_lock(&object->lock);
+		if (object->n_ops > 0 && object->n_in_progress == 0)
+			fscache_start_operations(object);
+		spin_unlock(&object->lock);
 	}
 
-	/* this just shifts the object release to the work processor */
-	fscache_put_object(object);
+	if (!list_empty(&object->dependents))
+		return transit_to(KILL_DEPENDENTS);
 
-	_leave("");
+	return transit_to(WAIT_FOR_CLEARANCE);
 }
 
 /*
- * release or recycle an object that the netfs has discarded
+ * Kill dependent objects.
  */
-static void fscache_release_object(struct fscache_object *object)
+static const struct fscache_state *fscache_kill_dependents(struct fscache_object *object,
+							   int event)
 {
-	_enter("");
+	_enter("{OBJ%x},%d", object->debug_id, event);
 
-	fscache_drop_object(object);
+	if (!fscache_enqueue_dependents(object, FSCACHE_OBJECT_EV_KILL))
+		return NO_TRANSIT; /* Not finished */
+	return transit_to(WAIT_FOR_CLEARANCE);
 }
 
 /*
- * withdraw an object from active service
+ * Drop an object's attachments
  */
-static void fscache_withdraw_object(struct fscache_object *object)
+static const struct fscache_state *fscache_drop_object(struct fscache_object *object,
+						       int event)
 {
-	struct fscache_cookie *cookie;
-	bool detached;
+	struct fscache_object *parent = object->parent;
+	struct fscache_cookie *cookie = object->cookie;
+	struct fscache_cache *cache = object->cache;
+	bool awaken = false;
 
-	_enter("");
+	_enter("{OBJ%x,%d},%d", object->debug_id, object->n_children, event);
 
-	spin_lock(&object->lock);
-	cookie = object->cookie;
-	if (cookie) {
-		/* need to get the cookie lock before the object lock, starting
-		 * from the object pointer */
-		atomic_inc(&cookie->usage);
-		spin_unlock(&object->lock);
+	ASSERT(cookie != NULL);
+	ASSERT(!hlist_unhashed(&object->cookie_link));
 
-		detached = false;
-		spin_lock(&cookie->lock);
-		spin_lock(&object->lock);
+	/* Make sure the cookie no longer points here and that the netfs isn't
+	 * waiting for us.
+	 */
+	spin_lock(&cookie->lock);
+	hlist_del_init(&object->cookie_link);
+	if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+		awaken = true;
+	spin_unlock(&cookie->lock);
 
-		if (object->cookie == cookie) {
-			hlist_del_init(&object->cookie_link);
-			object->cookie = NULL;
-			fscache_invalidation_complete(cookie);
-			detached = true;
-		}
-		spin_unlock(&cookie->lock);
-		fscache_cookie_put(cookie);
-		if (detached)
-			fscache_cookie_put(cookie);
-	}
+	if (awaken)
+		wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
 
+	/* Prevent a race with our last child, which has to signal EV_CLEARED
+	 * before dropping our spinlock.
+	 */
+	spin_lock(&object->lock);
 	spin_unlock(&object->lock);
 
-	fscache_drop_object(object);
-}
+	/* Discard from the cache's collection of objects */
+	spin_lock(&cache->object_list_lock);
+	list_del_init(&object->cache_link);
+	spin_unlock(&cache->object_list_lock);
 
-/*
- * withdraw an object from active service at the behest of the cache
- * - need break the links to a cached object cookie
- * - called under two situations:
- *   (1) recycler decides to reclaim an in-use object
- *   (2) a cache is unmounted
- * - have to take care as the cookie can be being relinquished by the netfs
- *   simultaneously
- * - the object is pinned by the caller holding a refcount on it
- */
-void fscache_withdrawing_object(struct fscache_cache *cache,
-				struct fscache_object *object)
-{
-	bool enqueue = false;
+	fscache_stat(&fscache_n_cop_drop_object);
+	cache->ops->drop_object(object);
+	fscache_stat_d(&fscache_n_cop_drop_object);
 
-	_enter(",OBJ%x", object->debug_id);
+	/* The parent object wants to know when all its dependents have gone */
+	if (parent) {
+		_debug("release parent OBJ%x {%d}",
+		       parent->debug_id, parent->n_children);
 
-	spin_lock(&object->lock);
-	if (object->state < FSCACHE_OBJECT_WITHDRAWING) {
-		object->state = FSCACHE_OBJECT_WITHDRAWING;
-		enqueue = true;
+		spin_lock(&parent->lock);
+		parent->n_children--;
+		if (parent->n_children == 0)
+			fscache_raise_event(parent, FSCACHE_OBJECT_EV_CLEARED);
+		spin_unlock(&parent->lock);
+		object->parent = NULL;
 	}
-	spin_unlock(&object->lock);
 
-	if (enqueue)
-		fscache_enqueue_object(object);
+	/* this just shifts the object release to the work processor */
+	fscache_put_object(object);
+	fscache_stat(&fscache_n_object_dead);
 
 	_leave("");
+	return transit_to(OBJECT_DEAD);
 }
 
 /*
@@ -771,7 +736,7 @@ static int fscache_get_object(struct fscache_object *object)
 }
 
 /*
- * discard a ref on a work item
+ * Discard a ref on an object
  */
 static void fscache_put_object(struct fscache_object *object)
 {
@@ -780,6 +745,22 @@ static void fscache_put_object(struct fscache_object *object)
 	fscache_stat_d(&fscache_n_cop_put_object);
 }
 
+/**
+ * fscache_object_destroy - Note that a cache object is about to be destroyed
+ * @object: The object to be destroyed
+ *
+ * Note the imminent destruction and deallocation of a cache object record.
+ */
+void fscache_object_destroy(struct fscache_object *object)
+{
+	fscache_objlist_remove(object);
+
+	/* We can get rid of the cookie now */
+	fscache_cookie_put(object->cookie);
+	object->cookie = NULL;
+}
+EXPORT_SYMBOL(fscache_object_destroy);
+
 /*
  * enqueue an object for metadata-type processing
  */
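
With the objlist removal folded into it, fscache_object_destroy() becomes the backend's single teardown notification: it unhooks the object from the /proc object list and drops the cookie reference. A sketch of the expected call site, reusing the hypothetical example_object wrapper from the earlier sketch (cachefiles_put_object() is the in-tree analogue):

	static void example_put_object(struct fscache_object *_object)
	{
		struct example_object *obj =
			container_of(_object, struct example_object, fscache);

		if (atomic_dec_and_test(&obj->usage)) {
			fscache_object_destroy(&obj->fscache);
			kfree(obj);
		}
	}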
@@ -803,7 +784,7 @@ void fscache_enqueue_object(struct fscache_object *object)
 
 /**
  * fscache_object_sleep_till_congested - Sleep until object wq is congested
- * @timoutp: Scheduler sleep timeout
+ * @timeoutp: Scheduler sleep timeout
  *
  * Allow an object handler to sleep until the object workqueue is congested.
  *
@@ -831,18 +812,21 @@ bool fscache_object_sleep_till_congested(signed long *timeoutp)
 EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
 
 /*
- * enqueue the dependents of an object for metadata-type processing
- * - the caller must hold the object's lock
- * - this may cause an already locked object to wind up being processed again
+ * Enqueue the dependents of an object for metadata-type processing.
+ *
+ * If we don't manage to finish the list before the scheduler wants to run
+ * again then return false immediately.  We return true if the list was
+ * cleared.
  */
-static void fscache_enqueue_dependents(struct fscache_object *object)
+static bool fscache_enqueue_dependents(struct fscache_object *object, int event)
 {
 	struct fscache_object *dep;
+	bool ret = true;
 
 	_enter("{OBJ%x}", object->debug_id);
 
 	if (list_empty(&object->dependents))
-		return;
+		return true;
 
 	spin_lock(&object->lock);
 
@@ -851,23 +835,23 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
 			       struct fscache_object, dep_link);
 		list_del_init(&dep->dep_link);
 
-
-		/* sort onto appropriate lists */
-		fscache_enqueue_object(dep);
+		fscache_raise_event(dep, event);
 		fscache_put_object(dep);
 
-		if (!list_empty(&object->dependents))
-			cond_resched_lock(&object->lock);
+		if (!list_empty(&object->dependents) && need_resched()) {
+			ret = false;
+			break;
+		}
 	}
 
 	spin_unlock(&object->lock);
+	return ret;
 }
 
 /*
  * remove an object from whatever queue it's waiting on
- * - the caller must hold object->lock
  */
-void fscache_dequeue_object(struct fscache_object *object)
+static void fscache_dequeue_object(struct fscache_object *object)
 {
 	_enter("{OBJ%x}", object->debug_id);
 
@@ -886,7 +870,10 @@ void fscache_dequeue_object(struct fscache_object *object)
  * @data: The auxiliary data for the object
  * @datalen: The size of the auxiliary data
  *
- * This function consults the netfs about the coherency state of an object
+ * This function consults the netfs about the coherency state of an object.
+ * The caller must be holding a ref on cookie->n_active (held by
+ * fscache_look_up_object() on behalf of the cache backend during object lookup
+ * and creation).
  */
 enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
 					const void *data, uint16_t datalen)
@@ -927,12 +914,23 @@ EXPORT_SYMBOL(fscache_check_aux);
927/* 914/*
928 * Asynchronously invalidate an object. 915 * Asynchronously invalidate an object.
929 */ 916 */
930static void fscache_invalidate_object(struct fscache_object *object) 917static const struct fscache_state *_fscache_invalidate_object(struct fscache_object *object,
918 int event)
931{ 919{
932 struct fscache_operation *op; 920 struct fscache_operation *op;
933 struct fscache_cookie *cookie = object->cookie; 921 struct fscache_cookie *cookie = object->cookie;
934 922
935 _enter("{OBJ%x}", object->debug_id); 923 _enter("{OBJ%x},%d", object->debug_id, event);
924
925 /* We're going to need the cookie. If the cookie is not available then
926 * retire the object instead.
927 */
928 if (!fscache_use_cookie(object)) {
929 ASSERT(object->cookie->stores.rnode == NULL);
930 set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
931 _leave(" [no cookie]");
932 return transit_to(KILL_OBJECT);
933 }
936 934
937 /* Reject any new read/write ops and abort any that are pending. */ 935 /* Reject any new read/write ops and abort any that are pending. */
938 fscache_invalidate_writes(cookie); 936 fscache_invalidate_writes(cookie);
@@ -941,14 +939,13 @@ static void fscache_invalidate_object(struct fscache_object *object)
941 939
942 /* Now we have to wait for in-progress reads and writes */ 940 /* Now we have to wait for in-progress reads and writes */
943 op = kzalloc(sizeof(*op), GFP_KERNEL); 941 op = kzalloc(sizeof(*op), GFP_KERNEL);
944 if (!op) { 942 if (!op)
945 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR); 943 goto nomem;
946 _leave(" [ENOMEM]");
947 return;
948 }
949 944
950 fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); 945 fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
951 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); 946 op->flags = FSCACHE_OP_ASYNC |
947 (1 << FSCACHE_OP_EXCLUSIVE) |
948 (1 << FSCACHE_OP_UNUSE_COOKIE);
952 949
953 spin_lock(&cookie->lock); 950 spin_lock(&cookie->lock);
954 if (fscache_submit_exclusive_op(object, op) < 0) 951 if (fscache_submit_exclusive_op(object, op) < 0)
@@ -965,13 +962,50 @@ static void fscache_invalidate_object(struct fscache_object *object)
965 /* We can allow read and write requests to come in once again. They'll 962 /* We can allow read and write requests to come in once again. They'll
966 * queue up behind our exclusive invalidation operation. 963 * queue up behind our exclusive invalidation operation.
967 */ 964 */
968 fscache_invalidation_complete(cookie); 965 if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
969 _leave(""); 966 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
970 return; 967 _leave(" [ok]");
968 return transit_to(UPDATE_OBJECT);
969
970nomem:
971 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
972 fscache_unuse_cookie(object);
973 _leave(" [ENOMEM]");
974 return transit_to(KILL_OBJECT);
971 975
972submit_op_failed: 976submit_op_failed:
977 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
973 spin_unlock(&cookie->lock); 978 spin_unlock(&cookie->lock);
974 kfree(op); 979 kfree(op);
975 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
976 _leave(" [EIO]"); 980 _leave(" [EIO]");
981 return transit_to(KILL_OBJECT);
982}
983
984static const struct fscache_state *fscache_invalidate_object(struct fscache_object *object,
985 int event)
986{
987 const struct fscache_state *s;
988
989 fscache_stat(&fscache_n_invalidates_run);
990 fscache_stat(&fscache_n_cop_invalidate_object);
991 s = _fscache_invalidate_object(object, event);
992 fscache_stat_d(&fscache_n_cop_invalidate_object);
993 return s;
994}
995
996/*
997 * Asynchronously update an object.
998 */
999static const struct fscache_state *fscache_update_object(struct fscache_object *object,
1000 int event)
1001{
1002 _enter("{OBJ%x},%d", object->debug_id, event);
1003
1004 fscache_stat(&fscache_n_updates_run);
1005 fscache_stat(&fscache_n_cop_update_object);
1006 object->cache->ops->update_object(object);
1007 fscache_stat_d(&fscache_n_cop_update_object);
1008
1009 _leave("");
1010 return transit_to(WAIT_FOR_CMD);
977} 1011}
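All of the reworked handlers share one shape: take an object and the triggering event, do the work, and name the next state via transit_to(). A hypothetical handler in that style (a sketch only; KILL_OBJECT and WAIT_FOR_CMD are the states already used by the hunks above):

	static const struct fscache_state *example_handler(struct fscache_object *object,
							   int event)
	{
		_enter("{OBJ%x},%d", object->debug_id, event);

		if (!fscache_use_cookie(object))
			return transit_to(KILL_OBJECT);	/* cookie already gone */

		/* ... the state's actual work would go here ... */

		fscache_unuse_cookie(object);
		return transit_to(WAIT_FOR_CMD);
	}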
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 762a9ec4ffa4..318071aca217 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -35,7 +35,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
35 35
36 ASSERT(list_empty(&op->pend_link)); 36 ASSERT(list_empty(&op->pend_link));
37 ASSERT(op->processor != NULL); 37 ASSERT(op->processor != NULL);
38 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); 38 ASSERT(fscache_object_is_available(op->object));
39 ASSERTCMP(atomic_read(&op->usage), >, 0); 39 ASSERTCMP(atomic_read(&op->usage), >, 0);
40 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS); 40 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
41 41
@@ -119,7 +119,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
119 /* need to issue a new write op after this */ 119 /* need to issue a new write op after this */
120 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); 120 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
121 ret = 0; 121 ret = 0;
122 } else if (object->state == FSCACHE_OBJECT_CREATING) { 122 } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
123 op->object = object; 123 op->object = object;
124 object->n_ops++; 124 object->n_ops++;
125 object->n_exclusive++; /* reads and writes must wait */ 125 object->n_exclusive++; /* reads and writes must wait */
@@ -144,7 +144,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
144 */ 144 */
145static void fscache_report_unexpected_submission(struct fscache_object *object, 145static void fscache_report_unexpected_submission(struct fscache_object *object,
146 struct fscache_operation *op, 146 struct fscache_operation *op,
147 unsigned long ostate) 147 const struct fscache_state *ostate)
148{ 148{
149 static bool once_only; 149 static bool once_only;
150 struct fscache_operation *p; 150 struct fscache_operation *p;
@@ -155,11 +155,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
155 once_only = true; 155 once_only = true;
156 156
157 kdebug("unexpected submission OP%x [OBJ%x %s]", 157 kdebug("unexpected submission OP%x [OBJ%x %s]",
158 op->debug_id, object->debug_id, 158 op->debug_id, object->debug_id, object->state->name);
159 fscache_object_states[object->state]); 159 kdebug("objstate=%s [%s]", object->state->name, ostate->name);
160 kdebug("objstate=%s [%s]",
161 fscache_object_states[object->state],
162 fscache_object_states[ostate]);
163 kdebug("objflags=%lx", object->flags); 160 kdebug("objflags=%lx", object->flags);
164 kdebug("objevent=%lx [%lx]", object->events, object->event_mask); 161 kdebug("objevent=%lx [%lx]", object->events, object->event_mask);
165 kdebug("ops=%u inp=%u exc=%u", 162 kdebug("ops=%u inp=%u exc=%u",
@@ -190,7 +187,7 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
190int fscache_submit_op(struct fscache_object *object, 187int fscache_submit_op(struct fscache_object *object,
191 struct fscache_operation *op) 188 struct fscache_operation *op)
192{ 189{
193 unsigned long ostate; 190 const struct fscache_state *ostate;
194 int ret; 191 int ret;
195 192
196 _enter("{OBJ%x OP%x},{%u}", 193 _enter("{OBJ%x OP%x},{%u}",
@@ -226,16 +223,14 @@ int fscache_submit_op(struct fscache_object *object,
226 fscache_run_op(object, op); 223 fscache_run_op(object, op);
227 } 224 }
228 ret = 0; 225 ret = 0;
229 } else if (object->state == FSCACHE_OBJECT_CREATING) { 226 } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
230 op->object = object; 227 op->object = object;
231 object->n_ops++; 228 object->n_ops++;
232 atomic_inc(&op->usage); 229 atomic_inc(&op->usage);
233 list_add_tail(&op->pend_link, &object->pending_ops); 230 list_add_tail(&op->pend_link, &object->pending_ops);
234 fscache_stat(&fscache_n_op_pend); 231 fscache_stat(&fscache_n_op_pend);
235 ret = 0; 232 ret = 0;
236 } else if (object->state == FSCACHE_OBJECT_DYING || 233 } else if (fscache_object_is_dying(object)) {
237 object->state == FSCACHE_OBJECT_LC_DYING ||
238 object->state == FSCACHE_OBJECT_WITHDRAWING) {
239 fscache_stat(&fscache_n_op_rejected); 234 fscache_stat(&fscache_n_op_rejected);
240 op->state = FSCACHE_OP_ST_CANCELLED; 235 op->state = FSCACHE_OP_ST_CANCELLED;
241 ret = -ENOBUFS; 236 ret = -ENOBUFS;
@@ -265,8 +260,8 @@ void fscache_abort_object(struct fscache_object *object)
265} 260}
266 261
267/* 262/*
268 * jump start the operation processing on an object 263 * Jump start the operation processing on an object. The caller must hold
269 * - caller must hold object->lock 264 * object->lock.
270 */ 265 */
271void fscache_start_operations(struct fscache_object *object) 266void fscache_start_operations(struct fscache_object *object)
272{ 267{
@@ -428,14 +423,10 @@ void fscache_put_operation(struct fscache_operation *op)
428 423
429 object = op->object; 424 object = op->object;
430 425
431 if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) { 426 if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
432 if (atomic_dec_and_test(&object->n_reads)) { 427 atomic_dec(&object->n_reads);
433 clear_bit(FSCACHE_COOKIE_WAITING_ON_READS, 428 if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags))
434 &object->cookie->flags); 429 fscache_unuse_cookie(object);
435 wake_up_bit(&object->cookie->flags,
436 FSCACHE_COOKIE_WAITING_ON_READS);
437 }
438 }
439 430
440 /* now... we may get called with the object spinlock held, so we 431 /* now... we may get called with the object spinlock held, so we
441 * complete the cleanup here only if we can immediately acquire the 432 * complete the cleanup here only if we can immediately acquire the
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index ff000e52072d..d479ab3c63e4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -109,7 +109,7 @@ page_busy:
109 * allocator as the work threads writing to the cache may all end up 109 * allocator as the work threads writing to the cache may all end up
110 * sleeping on memory allocation, so we may need to impose a timeout 110 * sleeping on memory allocation, so we may need to impose a timeout
111 * too. */ 111 * too. */
112 if (!(gfp & __GFP_WAIT)) { 112 if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
113 fscache_stat(&fscache_n_store_vmscan_busy); 113 fscache_stat(&fscache_n_store_vmscan_busy);
114 return false; 114 return false;
115 } 115 }
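The widened test only lets reclaim wait on the cache when the allocation context may both sleep and re-enter filesystem code; either restriction alone risks deadlocking against the work threads writing to the cache. The same condition as a standalone predicate (sketch; __GFP_WAIT was the "may sleep" flag of this era):

	static inline bool may_wait_for_store(gfp_t gfp)
	{
		return (gfp & __GFP_WAIT) && (gfp & __GFP_FS);
	}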
@@ -163,10 +163,12 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
163 163
164 fscache_stat(&fscache_n_attr_changed_calls); 164 fscache_stat(&fscache_n_attr_changed_calls);
165 165
166 if (fscache_object_is_active(object)) { 166 if (fscache_object_is_active(object) &&
167 fscache_use_cookie(object)) {
167 fscache_stat(&fscache_n_cop_attr_changed); 168 fscache_stat(&fscache_n_cop_attr_changed);
168 ret = object->cache->ops->attr_changed(object); 169 ret = object->cache->ops->attr_changed(object);
169 fscache_stat_d(&fscache_n_cop_attr_changed); 170 fscache_stat_d(&fscache_n_cop_attr_changed);
171 fscache_unuse_cookie(object);
170 if (ret < 0) 172 if (ret < 0)
171 fscache_abort_object(object); 173 fscache_abort_object(object);
172 } 174 }
@@ -233,7 +235,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
233 235
234 _enter("{OP%x}", op->op.debug_id); 236 _enter("{OP%x}", op->op.debug_id);
235 237
236 ASSERTCMP(op->n_pages, ==, 0); 238 ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
237 239
238 fscache_hist(fscache_retrieval_histogram, op->start_time); 240 fscache_hist(fscache_retrieval_histogram, op->start_time);
239 if (op->context) 241 if (op->context)
@@ -246,6 +248,7 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
246 * allocate a retrieval op 248 * allocate a retrieval op
247 */ 249 */
248static struct fscache_retrieval *fscache_alloc_retrieval( 250static struct fscache_retrieval *fscache_alloc_retrieval(
251 struct fscache_cookie *cookie,
249 struct address_space *mapping, 252 struct address_space *mapping,
250 fscache_rw_complete_t end_io_func, 253 fscache_rw_complete_t end_io_func,
251 void *context) 254 void *context)
@@ -260,7 +263,10 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
260 } 263 }
261 264
262 fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); 265 fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
263 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); 266 atomic_inc(&cookie->n_active);
267 op->op.flags = FSCACHE_OP_MYTHREAD |
268 (1UL << FSCACHE_OP_WAITING) |
269 (1UL << FSCACHE_OP_UNUSE_COOKIE);
264 op->mapping = mapping; 270 op->mapping = mapping;
265 op->end_io_func = end_io_func; 271 op->end_io_func = end_io_func;
266 op->context = context; 272 op->context = context;
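The n_active count taken here is what FSCACHE_OP_UNUSE_COOKIE refers to in the fscache_put_operation() hunk earlier in this diff: a submitted op drops the count automatically on put, while an op that never gets submitted must undo it by hand, as the nobufs paths below do. A sketch of that contract (simplified, no submission shown):

	static int example_alloc_and_abandon(struct fscache_cookie *cookie,
					     struct address_space *mapping)
	{
		struct fscache_retrieval *op;

		op = fscache_alloc_retrieval(cookie, mapping, NULL, NULL);
		if (!op)
			return -ENOMEM;		/* n_active was not touched */
		atomic_set(&op->n_pages, 1);

		/* Never submitted: drop the count manually, exactly as the
		 * nobufs_unlock error paths do. */
		atomic_dec(&cookie->n_active);
		kfree(op);
		return 0;
	}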
@@ -310,7 +316,7 @@ static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
310 struct fscache_retrieval *op = 316 struct fscache_retrieval *op =
311 container_of(_op, struct fscache_retrieval, op); 317 container_of(_op, struct fscache_retrieval, op);
312 318
313 op->n_pages = 0; 319 atomic_set(&op->n_pages, 0);
314} 320}
315 321
316/* 322/*
@@ -394,12 +400,13 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
394 if (fscache_wait_for_deferred_lookup(cookie) < 0) 400 if (fscache_wait_for_deferred_lookup(cookie) < 0)
395 return -ERESTARTSYS; 401 return -ERESTARTSYS;
396 402
397 op = fscache_alloc_retrieval(page->mapping, end_io_func, context); 403 op = fscache_alloc_retrieval(cookie, page->mapping,
 404 end_io_func, context);
398 if (!op) { 405 if (!op) {
399 _leave(" = -ENOMEM"); 406 _leave(" = -ENOMEM");
400 return -ENOMEM; 407 return -ENOMEM;
401 } 408 }
402 op->n_pages = 1; 409 atomic_set(&op->n_pages, 1);
403 410
404 spin_lock(&cookie->lock); 411 spin_lock(&cookie->lock);
405 412
@@ -408,7 +415,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
408 object = hlist_entry(cookie->backing_objects.first, 415 object = hlist_entry(cookie->backing_objects.first,
409 struct fscache_object, cookie_link); 416 struct fscache_object, cookie_link);
410 417
411 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); 418 ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
412 419
413 atomic_inc(&object->n_reads); 420 atomic_inc(&object->n_reads);
414 __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); 421 __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
@@ -465,6 +472,7 @@ nobufs_unlock_dec:
465 atomic_dec(&object->n_reads); 472 atomic_dec(&object->n_reads);
466nobufs_unlock: 473nobufs_unlock:
467 spin_unlock(&cookie->lock); 474 spin_unlock(&cookie->lock);
475 atomic_dec(&cookie->n_active);
468 kfree(op); 476 kfree(op);
469nobufs: 477nobufs:
470 fscache_stat(&fscache_n_retrievals_nobufs); 478 fscache_stat(&fscache_n_retrievals_nobufs);
@@ -522,10 +530,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
522 if (fscache_wait_for_deferred_lookup(cookie) < 0) 530 if (fscache_wait_for_deferred_lookup(cookie) < 0)
523 return -ERESTARTSYS; 531 return -ERESTARTSYS;
524 532
525 op = fscache_alloc_retrieval(mapping, end_io_func, context); 533 op = fscache_alloc_retrieval(cookie, mapping, end_io_func, context);
526 if (!op) 534 if (!op)
527 return -ENOMEM; 535 return -ENOMEM;
528 op->n_pages = *nr_pages; 536 atomic_set(&op->n_pages, *nr_pages);
529 537
530 spin_lock(&cookie->lock); 538 spin_lock(&cookie->lock);
531 539
@@ -589,6 +597,7 @@ nobufs_unlock_dec:
589 atomic_dec(&object->n_reads); 597 atomic_dec(&object->n_reads);
590nobufs_unlock: 598nobufs_unlock:
591 spin_unlock(&cookie->lock); 599 spin_unlock(&cookie->lock);
600 atomic_dec(&cookie->n_active);
592 kfree(op); 601 kfree(op);
593nobufs: 602nobufs:
594 fscache_stat(&fscache_n_retrievals_nobufs); 603 fscache_stat(&fscache_n_retrievals_nobufs);
@@ -631,10 +640,10 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
631 if (fscache_wait_for_deferred_lookup(cookie) < 0) 640 if (fscache_wait_for_deferred_lookup(cookie) < 0)
632 return -ERESTARTSYS; 641 return -ERESTARTSYS;
633 642
634 op = fscache_alloc_retrieval(page->mapping, NULL, NULL); 643 op = fscache_alloc_retrieval(cookie, page->mapping, NULL, NULL);
635 if (!op) 644 if (!op)
636 return -ENOMEM; 645 return -ENOMEM;
637 op->n_pages = 1; 646 atomic_set(&op->n_pages, 1);
638 647
639 spin_lock(&cookie->lock); 648 spin_lock(&cookie->lock);
640 649
@@ -675,6 +684,7 @@ error:
675 684
676nobufs_unlock: 685nobufs_unlock:
677 spin_unlock(&cookie->lock); 686 spin_unlock(&cookie->lock);
687 atomic_dec(&cookie->n_active);
678 kfree(op); 688 kfree(op);
679nobufs: 689nobufs:
680 fscache_stat(&fscache_n_allocs_nobufs); 690 fscache_stat(&fscache_n_allocs_nobufs);
@@ -729,8 +739,9 @@ static void fscache_write_op(struct fscache_operation *_op)
729 */ 739 */
730 spin_unlock(&object->lock); 740 spin_unlock(&object->lock);
731 fscache_op_complete(&op->op, false); 741 fscache_op_complete(&op->op, false);
732 _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}", 742 _leave(" [cancel] op{f=%lx s=%u} obj{s=%s f=%lx}",
733 _op->flags, _op->state, object->state, object->flags); 743 _op->flags, _op->state, object->state->short_name,
744 object->flags);
734 return; 745 return;
735 } 746 }
736 747
@@ -796,11 +807,16 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
796 807
797 _enter(""); 808 _enter("");
798 809
799 while (spin_lock(&cookie->stores_lock), 810 for (;;) {
800 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0, 811 spin_lock(&cookie->stores_lock);
801 ARRAY_SIZE(results), 812 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
802 FSCACHE_COOKIE_PENDING_TAG), 813 ARRAY_SIZE(results),
803 n > 0) { 814 FSCACHE_COOKIE_PENDING_TAG);
815 if (n == 0) {
816 spin_unlock(&cookie->stores_lock);
817 break;
818 }
819
804 for (i = n - 1; i >= 0; i--) { 820 for (i = n - 1; i >= 0; i--) {
805 page = results[i]; 821 page = results[i];
806 radix_tree_delete(&cookie->stores, page->index); 822 radix_tree_delete(&cookie->stores, page->index);
@@ -812,7 +828,6 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
812 page_cache_release(results[i]); 828 page_cache_release(results[i]);
813 } 829 }
814 830
815 spin_unlock(&cookie->stores_lock);
816 _leave(""); 831 _leave("");
817} 832}
818 833
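The explicit for (;;) form makes the locking visible: the lock is taken at the top of every pass and is always released before the loop is left. The same drain-everything-with-this-tag pattern as a self-contained helper (hypothetical, simplified; it additionally counts what it released):

	static unsigned int example_drain_pending(struct fscache_cookie *cookie)
	{
		void *results[16];
		unsigned int total = 0;
		struct page *page;
		int n, i;

		for (;;) {
			spin_lock(&cookie->stores_lock);
			n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
						       ARRAY_SIZE(results),
						       FSCACHE_COOKIE_PENDING_TAG);
			if (n == 0) {
				spin_unlock(&cookie->stores_lock);
				return total;
			}
			for (i = n - 1; i >= 0; i--) {
				page = results[i];
				radix_tree_delete(&cookie->stores, page->index);
			}
			spin_unlock(&cookie->stores_lock);

			/* page refs must be dropped outside the lock */
			for (i = 0; i < n; i++)
				page_cache_release(results[i]);
			total += n;
		}
	}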
@@ -829,14 +844,12 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
829 * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is 844 * (1) negative lookup, object not yet created (FSCACHE_COOKIE_CREATING is
830 * set) 845 * set)
831 * 846 *
832 * (a) no writes yet (set FSCACHE_COOKIE_PENDING_FILL and queue deferred 847 * (a) no writes yet
833 * fill op)
834 * 848 *
835 * (b) writes deferred till post-creation (mark page for writing and 849 * (b) writes deferred till post-creation (mark page for writing and
836 * return immediately) 850 * return immediately)
837 * 851 *
838 * (2) negative lookup, object created, initial fill being made from netfs 852 * (2) negative lookup, object created, initial fill being made from netfs
839 * (FSCACHE_COOKIE_INITIAL_FILL is set)
840 * 853 *
841 * (a) fill point not yet reached this page (mark page for writing and 854 * (a) fill point not yet reached this page (mark page for writing and
842 * return) 855 * return)
@@ -873,7 +886,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
873 886
874 fscache_operation_init(&op->op, fscache_write_op, 887 fscache_operation_init(&op->op, fscache_write_op,
875 fscache_release_write_op); 888 fscache_release_write_op);
876 op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); 889 op->op.flags = FSCACHE_OP_ASYNC |
890 (1 << FSCACHE_OP_WAITING) |
891 (1 << FSCACHE_OP_UNUSE_COOKIE);
877 892
878 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 893 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
879 if (ret < 0) 894 if (ret < 0)
@@ -919,6 +934,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
919 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); 934 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
920 op->store_limit = object->store_limit; 935 op->store_limit = object->store_limit;
921 936
937 atomic_inc(&cookie->n_active);
922 if (fscache_submit_op(object, &op->op) < 0) 938 if (fscache_submit_op(object, &op->op) < 0)
923 goto submit_failed; 939 goto submit_failed;
924 940
@@ -945,6 +961,7 @@ already_pending:
945 return 0; 961 return 0;
946 962
947submit_failed: 963submit_failed:
964 atomic_dec(&cookie->n_active);
948 spin_lock(&cookie->stores_lock); 965 spin_lock(&cookie->stores_lock);
949 radix_tree_delete(&cookie->stores, page->index); 966 radix_tree_delete(&cookie->stores, page->index);
950 spin_unlock(&cookie->stores_lock); 967 spin_unlock(&cookie->stores_lock);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f3f783dc4f75..72a5d5b04494 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -14,7 +14,7 @@
14#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) 17static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
18{ 18{
19 struct fuse_conn *fc = get_fuse_conn(dir); 19 struct fuse_conn *fc = get_fuse_conn(dir);
20 struct fuse_inode *fi = get_fuse_inode(dir); 20 struct fuse_inode *fi = get_fuse_inode(dir);
@@ -25,7 +25,7 @@ static bool fuse_use_readdirplus(struct inode *dir, struct file *filp)
25 return true; 25 return true;
26 if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) 26 if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
27 return true; 27 return true;
28 if (filp->f_pos == 0) 28 if (ctx->pos == 0)
29 return true; 29 return true;
30 return false; 30 return false;
31} 31}
@@ -1165,25 +1165,23 @@ static int fuse_permission(struct inode *inode, int mask)
1165} 1165}
1166 1166
1167static int parse_dirfile(char *buf, size_t nbytes, struct file *file, 1167static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
1168 void *dstbuf, filldir_t filldir) 1168 struct dir_context *ctx)
1169{ 1169{
1170 while (nbytes >= FUSE_NAME_OFFSET) { 1170 while (nbytes >= FUSE_NAME_OFFSET) {
1171 struct fuse_dirent *dirent = (struct fuse_dirent *) buf; 1171 struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
1172 size_t reclen = FUSE_DIRENT_SIZE(dirent); 1172 size_t reclen = FUSE_DIRENT_SIZE(dirent);
1173 int over;
1174 if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) 1173 if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
1175 return -EIO; 1174 return -EIO;
1176 if (reclen > nbytes) 1175 if (reclen > nbytes)
1177 break; 1176 break;
1178 1177
1179 over = filldir(dstbuf, dirent->name, dirent->namelen, 1178 if (!dir_emit(ctx, dirent->name, dirent->namelen,
1180 file->f_pos, dirent->ino, dirent->type); 1179 dirent->ino, dirent->type))
1181 if (over)
1182 break; 1180 break;
1183 1181
1184 buf += reclen; 1182 buf += reclen;
1185 nbytes -= reclen; 1183 nbytes -= reclen;
1186 file->f_pos = dirent->off; 1184 ctx->pos = dirent->off;
1187 } 1185 }
1188 1186
1189 return 0; 1187 return 0;
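parse_dirfile() is now written against the dir_context/dir_emit API that replaces filldir in this cycle: the actor returns false to stop, and the iterator itself advances ctx->pos. The smallest possible iterate() method has this shape (illustrative sketch, not fuse code; the entry name and inode number are invented):

	static int example_iterate(struct file *file, struct dir_context *ctx)
	{
		/* Emit "." and "..", then a single fixed entry at pos 2. */
		if (!dir_emit_dots(file, ctx))
			return 0;
		if (ctx->pos == 2) {
			if (!dir_emit(ctx, "hello", 5, 3 /* ino */, DT_REG))
				return 0;
			ctx->pos++;
		}
		return 0;
	}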
@@ -1225,30 +1223,46 @@ static int fuse_direntplus_link(struct file *file,
1225 if (name.name[1] == '.' && name.len == 2) 1223 if (name.name[1] == '.' && name.len == 2)
1226 return 0; 1224 return 0;
1227 } 1225 }
1226
1227 if (invalid_nodeid(o->nodeid))
1228 return -EIO;
1229 if (!fuse_valid_type(o->attr.mode))
1230 return -EIO;
1231
1228 fc = get_fuse_conn(dir); 1232 fc = get_fuse_conn(dir);
1229 1233
1230 name.hash = full_name_hash(name.name, name.len); 1234 name.hash = full_name_hash(name.name, name.len);
1231 dentry = d_lookup(parent, &name); 1235 dentry = d_lookup(parent, &name);
1232 if (dentry && dentry->d_inode) { 1236 if (dentry) {
1233 inode = dentry->d_inode; 1237 inode = dentry->d_inode;
1234 if (get_node_id(inode) == o->nodeid) { 1238 if (!inode) {
1239 d_drop(dentry);
1240 } else if (get_node_id(inode) != o->nodeid ||
1241 ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
1242 err = d_invalidate(dentry);
1243 if (err)
1244 goto out;
1245 } else if (is_bad_inode(inode)) {
1246 err = -EIO;
1247 goto out;
1248 } else {
1235 struct fuse_inode *fi; 1249 struct fuse_inode *fi;
1236 fi = get_fuse_inode(inode); 1250 fi = get_fuse_inode(inode);
1237 spin_lock(&fc->lock); 1251 spin_lock(&fc->lock);
1238 fi->nlookup++; 1252 fi->nlookup++;
1239 spin_unlock(&fc->lock); 1253 spin_unlock(&fc->lock);
1240 1254
1255 fuse_change_attributes(inode, &o->attr,
1256 entry_attr_timeout(o),
1257 attr_version);
1258
1241 /* 1259 /*
1242 * The other branch to 'found' comes via fuse_iget() 1260 * The other branch to 'found' comes via fuse_iget()
1243 * which bumps nlookup inside 1261 * which bumps nlookup inside
1244 */ 1262 */
1245 goto found; 1263 goto found;
1246 } 1264 }
1247 err = d_invalidate(dentry);
1248 if (err)
1249 goto out;
1250 dput(dentry); 1265 dput(dentry);
1251 dentry = NULL;
1252 } 1266 }
1253 1267
1254 dentry = d_alloc(parent, &name); 1268 dentry = d_alloc(parent, &name);
@@ -1261,30 +1275,35 @@ static int fuse_direntplus_link(struct file *file,
1261 if (!inode) 1275 if (!inode)
1262 goto out; 1276 goto out;
1263 1277
1264 alias = d_materialise_unique(dentry, inode); 1278 if (S_ISDIR(inode->i_mode)) {
1265 err = PTR_ERR(alias); 1279 mutex_lock(&fc->inst_mutex);
1266 if (IS_ERR(alias)) 1280 alias = fuse_d_add_directory(dentry, inode);
1267 goto out; 1281 mutex_unlock(&fc->inst_mutex);
1282 err = PTR_ERR(alias);
1283 if (IS_ERR(alias)) {
1284 iput(inode);
1285 goto out;
1286 }
1287 } else {
1288 alias = d_splice_alias(inode, dentry);
1289 }
1290
1268 if (alias) { 1291 if (alias) {
1269 dput(dentry); 1292 dput(dentry);
1270 dentry = alias; 1293 dentry = alias;
1271 } 1294 }
1272 1295
1273found: 1296found:
1274 fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o),
1275 attr_version);
1276
1277 fuse_change_entry_timeout(dentry, o); 1297 fuse_change_entry_timeout(dentry, o);
1278 1298
1279 err = 0; 1299 err = 0;
1280out: 1300out:
1281 if (dentry) 1301 dput(dentry);
1282 dput(dentry);
1283 return err; 1302 return err;
1284} 1303}
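The new mode comparison catches a server that reused a nodeid for an object of a different type, in which case the cached dentry is stale and must be invalidated. The test isolated as a predicate (sketch):

	/* Two mode values describe the same kind of object iff their
	 * file-type bits agree. */
	static inline bool same_file_type(umode_t a, umode_t b)
	{
		return ((a ^ b) & S_IFMT) == 0;
	}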
1285 1304
1286static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, 1305static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
1287 void *dstbuf, filldir_t filldir, u64 attr_version) 1306 struct dir_context *ctx, u64 attr_version)
1288{ 1307{
1289 struct fuse_direntplus *direntplus; 1308 struct fuse_direntplus *direntplus;
1290 struct fuse_dirent *dirent; 1309 struct fuse_dirent *dirent;
@@ -1309,10 +1328,9 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
1309 we need to send a FORGET for each of those 1328 we need to send a FORGET for each of those
1310 which we did not link. 1329 which we did not link.
1311 */ 1330 */
1312 over = filldir(dstbuf, dirent->name, dirent->namelen, 1331 over = !dir_emit(ctx, dirent->name, dirent->namelen,
1313 file->f_pos, dirent->ino, 1332 dirent->ino, dirent->type);
1314 dirent->type); 1333 ctx->pos = dirent->off;
1315 file->f_pos = dirent->off;
1316 } 1334 }
1317 1335
1318 buf += reclen; 1336 buf += reclen;
@@ -1326,7 +1344,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
1326 return 0; 1344 return 0;
1327} 1345}
1328 1346
1329static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) 1347static int fuse_readdir(struct file *file, struct dir_context *ctx)
1330{ 1348{
1331 int plus, err; 1349 int plus, err;
1332 size_t nbytes; 1350 size_t nbytes;
@@ -1349,17 +1367,17 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
1349 return -ENOMEM; 1367 return -ENOMEM;
1350 } 1368 }
1351 1369
1352 plus = fuse_use_readdirplus(inode, file); 1370 plus = fuse_use_readdirplus(inode, ctx);
1353 req->out.argpages = 1; 1371 req->out.argpages = 1;
1354 req->num_pages = 1; 1372 req->num_pages = 1;
1355 req->pages[0] = page; 1373 req->pages[0] = page;
1356 req->page_descs[0].length = PAGE_SIZE; 1374 req->page_descs[0].length = PAGE_SIZE;
1357 if (plus) { 1375 if (plus) {
1358 attr_version = fuse_get_attr_version(fc); 1376 attr_version = fuse_get_attr_version(fc);
1359 fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, 1377 fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
1360 FUSE_READDIRPLUS); 1378 FUSE_READDIRPLUS);
1361 } else { 1379 } else {
1362 fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, 1380 fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
1363 FUSE_READDIR); 1381 FUSE_READDIR);
1364 } 1382 }
1365 fuse_request_send(fc, req); 1383 fuse_request_send(fc, req);
@@ -1369,11 +1387,11 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
1369 if (!err) { 1387 if (!err) {
1370 if (plus) { 1388 if (plus) {
1371 err = parse_dirplusfile(page_address(page), nbytes, 1389 err = parse_dirplusfile(page_address(page), nbytes,
1372 file, dstbuf, filldir, 1390 file, ctx,
1373 attr_version); 1391 attr_version);
1374 } else { 1392 } else {
1375 err = parse_dirfile(page_address(page), nbytes, file, 1393 err = parse_dirfile(page_address(page), nbytes, file,
1376 dstbuf, filldir); 1394 ctx);
1377 } 1395 }
1378 } 1396 }
1379 1397
@@ -1886,7 +1904,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
1886static const struct file_operations fuse_dir_operations = { 1904static const struct file_operations fuse_dir_operations = {
1887 .llseek = generic_file_llseek, 1905 .llseek = generic_file_llseek,
1888 .read = generic_read_dir, 1906 .read = generic_read_dir,
1889 .readdir = fuse_readdir, 1907 .iterate = fuse_readdir,
1890 .open = fuse_dir_open, 1908 .open = fuse_dir_open,
1891 .release = fuse_dir_release, 1909 .release = fuse_dir_release,
1892 .fsync = fuse_dir_fsync, 1910 .fsync = fuse_dir_fsync,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 35f281033142..5c121fe19c5f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -548,8 +548,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
548 res = io->bytes < 0 ? io->size : io->bytes; 548 res = io->bytes < 0 ? io->size : io->bytes;
549 549
550 if (!is_sync_kiocb(io->iocb)) { 550 if (!is_sync_kiocb(io->iocb)) {
551 struct path *path = &io->iocb->ki_filp->f_path; 551 struct inode *inode = file_inode(io->iocb->ki_filp);
552 struct inode *inode = path->dentry->d_inode;
553 struct fuse_conn *fc = get_fuse_conn(inode); 552 struct fuse_conn *fc = get_fuse_conn(inode);
554 struct fuse_inode *fi = get_fuse_inode(inode); 553 struct fuse_inode *fi = get_fuse_inode(inode);
555 554
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9a0cdde14a08..0b578598c6ac 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -785,7 +785,7 @@ static const struct super_operations fuse_super_operations = {
785static void sanitize_global_limit(unsigned *limit) 785static void sanitize_global_limit(unsigned *limit)
786{ 786{
787 if (*limit == 0) 787 if (*limit == 0)
788 *limit = ((num_physpages << PAGE_SHIFT) >> 13) / 788 *limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
789 sizeof(struct fuse_req); 789 sizeof(struct fuse_req);
790 790
791 if (*limit >= 1 << 16) 791 if (*limit >= 1 << 16)
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 5a376ab81feb..90c6a8faaecb 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -20,13 +20,12 @@ config GFS2_FS
20 be found here: http://sources.redhat.com/cluster 20 be found here: http://sources.redhat.com/cluster
21 21
22 The "nolock" lock module is now built in to GFS2 by default. If 22 The "nolock" lock module is now built in to GFS2 by default. If
23 you want to use the DLM, be sure to enable HOTPLUG and IPv4/6 23 you want to use the DLM, be sure to enable IPv4/6 networking.
24 networking.
25 24
26config GFS2_FS_LOCKING_DLM 25config GFS2_FS_LOCKING_DLM
27 bool "GFS2 DLM locking" 26 bool "GFS2 DLM locking"
28 depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ 27 depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \
29 HOTPLUG && CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS) 28 CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS)
30 help 29 help
31 Multiple node locking module for GFS2 30 Multiple node locking module for GFS2
32 31
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0bad69ed6336..ee48ad37d9c0 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -110,7 +110,7 @@ static int gfs2_writepage_common(struct page *page,
110 /* Is the page fully outside i_size? (truncate in progress) */ 110 /* Is the page fully outside i_size? (truncate in progress) */
111 offset = i_size & (PAGE_CACHE_SIZE-1); 111 offset = i_size & (PAGE_CACHE_SIZE-1);
112 if (page->index > end_index || (page->index == end_index && !offset)) { 112 if (page->index > end_index || (page->index == end_index && !offset)) {
113 page->mapping->a_ops->invalidatepage(page, 0); 113 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
114 goto out; 114 goto out;
115 } 115 }
116 return 1; 116 return 1;
@@ -299,7 +299,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
299 299
300 /* Is the page fully outside i_size? (truncate in progress) */ 300 /* Is the page fully outside i_size? (truncate in progress) */
301 if (page->index > end_index || (page->index == end_index && !offset)) { 301 if (page->index > end_index || (page->index == end_index && !offset)) {
302 page->mapping->a_ops->invalidatepage(page, 0); 302 page->mapping->a_ops->invalidatepage(page, 0,
303 PAGE_CACHE_SIZE);
303 unlock_page(page); 304 unlock_page(page);
304 continue; 305 continue;
305 } 306 }
@@ -943,27 +944,33 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
943 unlock_buffer(bh); 944 unlock_buffer(bh);
944} 945}
945 946
946static void gfs2_invalidatepage(struct page *page, unsigned long offset) 947static void gfs2_invalidatepage(struct page *page, unsigned int offset,
948 unsigned int length)
947{ 949{
948 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); 950 struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
951 unsigned int stop = offset + length;
952 int partial_page = (offset || length < PAGE_CACHE_SIZE);
949 struct buffer_head *bh, *head; 953 struct buffer_head *bh, *head;
950 unsigned long pos = 0; 954 unsigned long pos = 0;
951 955
952 BUG_ON(!PageLocked(page)); 956 BUG_ON(!PageLocked(page));
953 if (offset == 0) 957 if (!partial_page)
954 ClearPageChecked(page); 958 ClearPageChecked(page);
955 if (!page_has_buffers(page)) 959 if (!page_has_buffers(page))
956 goto out; 960 goto out;
957 961
958 bh = head = page_buffers(page); 962 bh = head = page_buffers(page);
959 do { 963 do {
964 if (pos + bh->b_size > stop)
965 return;
966
960 if (offset <= pos) 967 if (offset <= pos)
961 gfs2_discard(sdp, bh); 968 gfs2_discard(sdp, bh);
962 pos += bh->b_size; 969 pos += bh->b_size;
963 bh = bh->b_this_page; 970 bh = bh->b_this_page;
964 } while (bh != head); 971 } while (bh != head);
965out: 972out:
966 if (offset == 0) 973 if (!partial_page)
967 try_to_release_page(page, 0); 974 try_to_release_page(page, 0);
968} 975}
969 976
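Now that invalidatepage carries a length as well as an offset, gfs2 has to distinguish a full-page invalidation, where it may clear the checked flag and try to release the page, from a partial one, where it may only discard buffers lying wholly inside the range. The test used above, pulled out (sketch; PAGE_CACHE_SIZE was the page-size macro of this era):

	static inline bool invalidate_is_partial(unsigned int offset,
						 unsigned int length)
	{
		return offset != 0 || length < PAGE_CACHE_SIZE;
	}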
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 93b5809c20bb..5e2f56fccf6b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1232,7 +1232,9 @@ static int do_grow(struct inode *inode, u64 size)
1232 unstuff = 1; 1232 unstuff = 1;
1233 } 1233 }
1234 1234
1235 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0); 1235 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
1236 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
1237 0 : RES_QUOTA), 0);
1236 if (error) 1238 if (error)
1237 goto do_grow_release; 1239 goto do_grow_release;
1238 1240
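do_grow() now also reserves room for a quota change, but only when quotas are enabled on the mount. The reservation sizing as a helper (sketch):

	static inline unsigned int example_grow_reservation(const struct gfs2_sbd *sdp)
	{
		return RES_DINODE + RES_STATFS + RES_RG_BIT +
		       (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ? 0 : RES_QUOTA);
	}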
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4fddb3c22d25..f2448ab2aac5 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -109,8 +109,7 @@ fail:
109 return 0; 109 return 0;
110} 110}
111 111
112static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode, 112static int gfs2_dhash(const struct dentry *dentry, struct qstr *str)
113 struct qstr *str)
114{ 113{
115 str->hash = gfs2_disk_hash(str->name, str->len); 114 str->hash = gfs2_disk_hash(str->name, str->len);
116 return 0; 115 return 0;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b631c9043460..0cb4c1557f20 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1125,13 +1125,14 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1125 if (IS_ERR(hc)) 1125 if (IS_ERR(hc))
1126 return PTR_ERR(hc); 1126 return PTR_ERR(hc);
1127 1127
1128 h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN); 1128 hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN);
1129 if (hc2 == NULL) 1129 if (hc2 == NULL)
1130 hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); 1130 hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL);
1131 1131
1132 if (!hc2) 1132 if (!hc2)
1133 return -ENOMEM; 1133 return -ENOMEM;
1134 1134
1135 h = hc2;
1135 error = gfs2_meta_inode_buffer(dip, &dibh); 1136 error = gfs2_meta_inode_buffer(dip, &dibh);
1136 if (error) 1137 if (error)
1137 goto out_kfree; 1138 goto out_kfree;
@@ -1212,9 +1213,7 @@ static int compare_dents(const void *a, const void *b)
1212/** 1213/**
1213 * do_filldir_main - read out directory entries 1214 * do_filldir_main - read out directory entries
1214 * @dip: The GFS2 inode 1215 * @dip: The GFS2 inode
1215 * @offset: The offset in the file to read from 1216 * @ctx: what to feed the entries to
1216 * @opaque: opaque data to pass to filldir
1217 * @filldir: The function to pass entries to
1218 * @darr: an array of struct gfs2_dirent pointers to read 1217 * @darr: an array of struct gfs2_dirent pointers to read
1219 * @entries: the number of entries in darr 1218 * @entries: the number of entries in darr
 1220 * @copied: pointer to int that's non-zero if an entry has been copied out 1219
@@ -1224,11 +1223,10 @@ static int compare_dents(const void *a, const void *b)
1224 * the possibility that they will fall into different readdir buffers or 1223 * the possibility that they will fall into different readdir buffers or
1225 * that someone will want to seek to that location. 1224 * that someone will want to seek to that location.
1226 * 1225 *
1227 * Returns: errno, >0 on exception from filldir 1226 * Returns: errno, >0 if the actor tells you to stop
1228 */ 1227 */
1229 1228
1230static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, 1229static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
1231 void *opaque, filldir_t filldir,
1232 const struct gfs2_dirent **darr, u32 entries, 1230 const struct gfs2_dirent **darr, u32 entries,
1233 int *copied) 1231 int *copied)
1234{ 1232{
@@ -1236,7 +1234,6 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1236 u64 off, off_next; 1234 u64 off, off_next;
1237 unsigned int x, y; 1235 unsigned int x, y;
1238 int run = 0; 1236 int run = 0;
1239 int error = 0;
1240 1237
1241 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); 1238 sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
1242 1239
@@ -1253,9 +1250,9 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1253 off_next = be32_to_cpu(dent_next->de_hash); 1250 off_next = be32_to_cpu(dent_next->de_hash);
1254 off_next = gfs2_disk_hash2offset(off_next); 1251 off_next = gfs2_disk_hash2offset(off_next);
1255 1252
1256 if (off < *offset) 1253 if (off < ctx->pos)
1257 continue; 1254 continue;
1258 *offset = off; 1255 ctx->pos = off;
1259 1256
1260 if (off_next == off) { 1257 if (off_next == off) {
1261 if (*copied && !run) 1258 if (*copied && !run)
@@ -1264,26 +1261,25 @@ static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
1264 } else 1261 } else
1265 run = 0; 1262 run = 0;
1266 } else { 1263 } else {
1267 if (off < *offset) 1264 if (off < ctx->pos)
1268 continue; 1265 continue;
1269 *offset = off; 1266 ctx->pos = off;
1270 } 1267 }
1271 1268
1272 error = filldir(opaque, (const char *)(dent + 1), 1269 if (!dir_emit(ctx, (const char *)(dent + 1),
1273 be16_to_cpu(dent->de_name_len), 1270 be16_to_cpu(dent->de_name_len),
1274 off, be64_to_cpu(dent->de_inum.no_addr), 1271 be64_to_cpu(dent->de_inum.no_addr),
1275 be16_to_cpu(dent->de_type)); 1272 be16_to_cpu(dent->de_type)))
1276 if (error)
1277 return 1; 1273 return 1;
1278 1274
1279 *copied = 1; 1275 *copied = 1;
1280 } 1276 }
1281 1277
1282 /* Increment the *offset by one, so the next time we come into the 1278 /* Increment the ctx->pos by one, so the next time we come into the
1283 do_filldir fxn, we get the next entry instead of the last one in the 1279 do_filldir fxn, we get the next entry instead of the last one in the
1284 current leaf */ 1280 current leaf */
1285 1281
1286 (*offset)++; 1282 ctx->pos++;
1287 1283
1288 return 0; 1284 return 0;
1289} 1285}
@@ -1307,8 +1303,8 @@ static void gfs2_free_sort_buffer(void *ptr)
1307 kfree(ptr); 1303 kfree(ptr);
1308} 1304}
1309 1305
1310static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, 1306static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
1311 filldir_t filldir, int *copied, unsigned *depth, 1307 int *copied, unsigned *depth,
1312 u64 leaf_no) 1308 u64 leaf_no)
1313{ 1309{
1314 struct gfs2_inode *ip = GFS2_I(inode); 1310 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1386,8 +1382,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
1386 } while(lfn); 1382 } while(lfn);
1387 1383
1388 BUG_ON(entries2 != entries); 1384 BUG_ON(entries2 != entries);
1389 error = do_filldir_main(ip, offset, opaque, filldir, darr, 1385 error = do_filldir_main(ip, ctx, darr, entries, copied);
1390 entries, copied);
1391out_free: 1386out_free:
1392 for(i = 0; i < leaf; i++) 1387 for(i = 0; i < leaf; i++)
1393 brelse(larr[i]); 1388 brelse(larr[i]);
@@ -1446,15 +1441,13 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index,
1446/** 1441/**
1447 * dir_e_read - Reads the entries from a directory into a filldir buffer 1442 * dir_e_read - Reads the entries from a directory into a filldir buffer
1448 * @dip: dinode pointer 1443 * @dip: dinode pointer
1449 * @offset: the hash of the last entry read shifted to the right once 1444 * @ctx: actor to feed the entries to
1450 * @opaque: buffer for the filldir function to fill
1451 * @filldir: points to the filldir function to use
1452 * 1445 *
1453 * Returns: errno 1446 * Returns: errno
1454 */ 1447 */
1455 1448
1456static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, 1449static int dir_e_read(struct inode *inode, struct dir_context *ctx,
1457 filldir_t filldir, struct file_ra_state *f_ra) 1450 struct file_ra_state *f_ra)
1458{ 1451{
1459 struct gfs2_inode *dip = GFS2_I(inode); 1452 struct gfs2_inode *dip = GFS2_I(inode);
1460 u32 hsize, len = 0; 1453 u32 hsize, len = 0;
@@ -1465,7 +1458,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1465 unsigned depth = 0; 1458 unsigned depth = 0;
1466 1459
1467 hsize = 1 << dip->i_depth; 1460 hsize = 1 << dip->i_depth;
1468 hash = gfs2_dir_offset2hash(*offset); 1461 hash = gfs2_dir_offset2hash(ctx->pos);
1469 index = hash >> (32 - dip->i_depth); 1462 index = hash >> (32 - dip->i_depth);
1470 1463
1471 if (dip->i_hash_cache == NULL) 1464 if (dip->i_hash_cache == NULL)
@@ -1477,7 +1470,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1477 gfs2_dir_readahead(inode, hsize, index, f_ra); 1470 gfs2_dir_readahead(inode, hsize, index, f_ra);
1478 1471
1479 while (index < hsize) { 1472 while (index < hsize) {
1480 error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, 1473 error = gfs2_dir_read_leaf(inode, ctx,
1481 &copied, &depth, 1474 &copied, &depth,
1482 be64_to_cpu(lp[index])); 1475 be64_to_cpu(lp[index]));
1483 if (error) 1476 if (error)
@@ -1492,8 +1485,8 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1492 return error; 1485 return error;
1493} 1486}
1494 1487
1495int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 1488int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
1496 filldir_t filldir, struct file_ra_state *f_ra) 1489 struct file_ra_state *f_ra)
1497{ 1490{
1498 struct gfs2_inode *dip = GFS2_I(inode); 1491 struct gfs2_inode *dip = GFS2_I(inode);
1499 struct gfs2_sbd *sdp = GFS2_SB(inode); 1492 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1507,7 +1500,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1507 return 0; 1500 return 0;
1508 1501
1509 if (dip->i_diskflags & GFS2_DIF_EXHASH) 1502 if (dip->i_diskflags & GFS2_DIF_EXHASH)
1510 return dir_e_read(inode, offset, opaque, filldir, f_ra); 1503 return dir_e_read(inode, ctx, f_ra);
1511 1504
1512 if (!gfs2_is_stuffed(dip)) { 1505 if (!gfs2_is_stuffed(dip)) {
1513 gfs2_consist_inode(dip); 1506 gfs2_consist_inode(dip);
@@ -1539,7 +1532,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1539 error = -EIO; 1532 error = -EIO;
1540 goto out; 1533 goto out;
1541 } 1534 }
1542 error = do_filldir_main(dip, offset, opaque, filldir, darr, 1535 error = do_filldir_main(dip, ctx, darr,
1543 dip->i_entries, &copied); 1536 dip->i_entries, &copied);
1544out: 1537out:
1545 kfree(darr); 1538 kfree(darr);
@@ -1555,9 +1548,9 @@ out:
1555 1548
1556/** 1549/**
1557 * gfs2_dir_search - Search a directory 1550 * gfs2_dir_search - Search a directory
1558 * @dip: The GFS2 inode 1551 * @dip: The GFS2 dir inode
1559 * @filename: 1552 * @name: The name we are looking up
1560 * @inode: 1553 * @fail_on_exist: Fail if the name exists rather than looking it up
1561 * 1554 *
1562 * This routine searches a directory for a file or another directory. 1555 * This routine searches a directory for a file or another directory.
1563 * Assumes a glock is held on dip. 1556 * Assumes a glock is held on dip.
@@ -1565,22 +1558,25 @@ out:
1565 * Returns: errno 1558 * Returns: errno
1566 */ 1559 */
1567 1560
1568struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name) 1561struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
1562 bool fail_on_exist)
1569{ 1563{
1570 struct buffer_head *bh; 1564 struct buffer_head *bh;
1571 struct gfs2_dirent *dent; 1565 struct gfs2_dirent *dent;
1572 struct inode *inode; 1566 u64 addr, formal_ino;
1567 u16 dtype;
1573 1568
1574 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); 1569 dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
1575 if (dent) { 1570 if (dent) {
1576 if (IS_ERR(dent)) 1571 if (IS_ERR(dent))
1577 return ERR_CAST(dent); 1572 return ERR_CAST(dent);
1578 inode = gfs2_inode_lookup(dir->i_sb, 1573 dtype = be16_to_cpu(dent->de_type);
1579 be16_to_cpu(dent->de_type), 1574 addr = be64_to_cpu(dent->de_inum.no_addr);
1580 be64_to_cpu(dent->de_inum.no_addr), 1575 formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
1581 be64_to_cpu(dent->de_inum.no_formal_ino), 0);
1582 brelse(bh); 1576 brelse(bh);
1583 return inode; 1577 if (fail_on_exist)
1578 return ERR_PTR(-EEXIST);
1579 return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
1584 } 1580 }
1585 return ERR_PTR(-ENOENT); 1581 return ERR_PTR(-ENOENT);
1586} 1582}
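With fail_on_exist a caller can probe whether a name is taken without paying for inode instantiation, useful for detecting a lost create race. A hypothetical caller (sketch):

	/* Returns 0 if @name is free in @dir, -EEXIST if taken, or another
	 * errno on failure (hypothetical helper). */
	static int example_name_is_free(struct inode *dir, const struct qstr *name)
	{
		struct inode *inode = gfs2_dir_search(dir, name, true);

		if (inode == ERR_PTR(-ENOENT))
			return 0;
		if (IS_ERR(inode))
			return PTR_ERR(inode);	/* includes -EEXIST */
		iput(inode);			/* not expected with fail_on_exist */
		return -EEXIST;
	}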
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 98c960beab35..4f03bbd1873f 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -18,14 +18,15 @@ struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19 19
20extern struct inode *gfs2_dir_search(struct inode *dir, 20extern struct inode *gfs2_dir_search(struct inode *dir,
21 const struct qstr *filename); 21 const struct qstr *filename,
22 bool fail_on_exist);
22extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 23extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
23 const struct gfs2_inode *ip); 24 const struct gfs2_inode *ip);
24extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 25extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
25 const struct gfs2_inode *ip); 26 const struct gfs2_inode *ip);
26extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); 27extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
27extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 28extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
28 filldir_t filldir, struct file_ra_state *f_ra); 29 struct file_ra_state *f_ra);
29extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 30extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
30 const struct gfs2_inode *nip, unsigned int new_type); 31 const struct gfs2_inode *nip, unsigned int new_type);
31 32
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9973df4ff565..8b9b3775e2e7 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -64,6 +64,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
64} 64}
65 65
66struct get_name_filldir { 66struct get_name_filldir {
67 struct dir_context ctx;
67 struct gfs2_inum_host inum; 68 struct gfs2_inum_host inum;
68 char *name; 69 char *name;
69}; 70};
@@ -88,9 +89,11 @@ static int gfs2_get_name(struct dentry *parent, char *name,
88 struct inode *dir = parent->d_inode; 89 struct inode *dir = parent->d_inode;
89 struct inode *inode = child->d_inode; 90 struct inode *inode = child->d_inode;
90 struct gfs2_inode *dip, *ip; 91 struct gfs2_inode *dip, *ip;
91 struct get_name_filldir gnfd; 92 struct get_name_filldir gnfd = {
93 .ctx.actor = get_name_filldir,
94 .name = name
95 };
92 struct gfs2_holder gh; 96 struct gfs2_holder gh;
93 u64 offset = 0;
94 int error; 97 int error;
95 struct file_ra_state f_ra = { .start = 0 }; 98 struct file_ra_state f_ra = { .start = 0 };
96 99
@@ -106,13 +109,12 @@ static int gfs2_get_name(struct dentry *parent, char *name,
106 *name = 0; 109 *name = 0;
107 gnfd.inum.no_addr = ip->i_no_addr; 110 gnfd.inum.no_addr = ip->i_no_addr;
108 gnfd.inum.no_formal_ino = ip->i_no_formal_ino; 111 gnfd.inum.no_formal_ino = ip->i_no_formal_ino;
109 gnfd.name = name;
110 112
111 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); 113 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
112 if (error) 114 if (error)
113 return error; 115 return error;
114 116
115 error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir, &f_ra); 117 error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra);
116 118
117 gfs2_glock_dq_uninit(&gh); 119 gfs2_glock_dq_uninit(&gh);
118 120
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ad0dc38d87ab..72c3866a7320 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -82,35 +82,28 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
82} 82}
83 83
84/** 84/**
85 * gfs2_readdir - Read directory entries from a directory 85 * gfs2_readdir - Iterator for a directory
86 * @file: The directory to read from 86 * @file: The directory to read from
87 * @dirent: Buffer for dirents 87 * @ctx: What to feed directory entries to
88 * @filldir: Function used to do the copying
89 * 88 *
90 * Returns: errno 89 * Returns: errno
91 */ 90 */
92 91
93static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) 92static int gfs2_readdir(struct file *file, struct dir_context *ctx)
94{ 93{
95 struct inode *dir = file->f_mapping->host; 94 struct inode *dir = file->f_mapping->host;
96 struct gfs2_inode *dip = GFS2_I(dir); 95 struct gfs2_inode *dip = GFS2_I(dir);
97 struct gfs2_holder d_gh; 96 struct gfs2_holder d_gh;
98 u64 offset = file->f_pos;
99 int error; 97 int error;
100 98
101 gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); 99 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
102 error = gfs2_glock_nq(&d_gh); 100 if (error)
103 if (error) {
104 gfs2_holder_uninit(&d_gh);
105 return error; 101 return error;
106 }
107 102
108 error = gfs2_dir_read(dir, &offset, dirent, filldir, &file->f_ra); 103 error = gfs2_dir_read(dir, ctx, &file->f_ra);
109 104
110 gfs2_glock_dq_uninit(&d_gh); 105 gfs2_glock_dq_uninit(&d_gh);
111 106
112 file->f_pos = offset;
113
114 return error; 107 return error;
115} 108}
116 109
@@ -538,21 +531,30 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
538} 531}
539 532
540/** 533/**
541 * gfs2_open - open a file 534 * gfs2_open_common - This is common to open and atomic_open
542 * @inode: the inode to open 535 * @inode: The inode being opened
543 * @file: the struct file for this opening 536 * @file: The file being opened
544 * 537 *
 545 * Returns: errno 538 * This may be called under a glock or not depending upon how it has
539 * been called. We must always be called under a glock for regular
540 * files, however. For other file types, it does not matter whether
541 * we hold the glock or not.
542 *
543 * Returns: Error code or 0 for success
546 */ 544 */
547 545
548static int gfs2_open(struct inode *inode, struct file *file) 546int gfs2_open_common(struct inode *inode, struct file *file)
549{ 547{
550 struct gfs2_inode *ip = GFS2_I(inode);
551 struct gfs2_holder i_gh;
552 struct gfs2_file *fp; 548 struct gfs2_file *fp;
553 int error; 549 int ret;
550
551 if (S_ISREG(inode->i_mode)) {
552 ret = generic_file_open(inode, file);
553 if (ret)
554 return ret;
555 }
554 556
555 fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); 557 fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS);
556 if (!fp) 558 if (!fp)
557 return -ENOMEM; 559 return -ENOMEM;
558 560
@@ -560,29 +562,43 @@ static int gfs2_open(struct inode *inode, struct file *file)
560 562
561 gfs2_assert_warn(GFS2_SB(inode), !file->private_data); 563 gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
562 file->private_data = fp; 564 file->private_data = fp;
565 return 0;
566}
567
568/**
569 * gfs2_open - open a file
570 * @inode: the inode to open
571 * @file: the struct file for this opening
572 *
573 * After atomic_open, this function is only used for opening files
574 * which are already cached. We must still get the glock for regular
575 * files to ensure that we have the file size uptodate for the large
576 * file check which is in the common code. That is only an issue for
577 * regular files though.
578 *
579 * Returns: errno
580 */
581
582static int gfs2_open(struct inode *inode, struct file *file)
583{
584 struct gfs2_inode *ip = GFS2_I(inode);
585 struct gfs2_holder i_gh;
586 int error;
587 bool need_unlock = false;
563 588
564 if (S_ISREG(ip->i_inode.i_mode)) { 589 if (S_ISREG(ip->i_inode.i_mode)) {
565 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, 590 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
566 &i_gh); 591 &i_gh);
567 if (error) 592 if (error)
568 goto fail; 593 return error;
594 need_unlock = true;
595 }
569 596
570 if (!(file->f_flags & O_LARGEFILE) && 597 error = gfs2_open_common(inode, file);
571 i_size_read(inode) > MAX_NON_LFS) {
572 error = -EOVERFLOW;
573 goto fail_gunlock;
574 }
575 598
599 if (need_unlock)
576 gfs2_glock_dq_uninit(&i_gh); 600 gfs2_glock_dq_uninit(&i_gh);
577 }
578 601
579 return 0;
580
581fail_gunlock:
582 gfs2_glock_dq_uninit(&i_gh);
583fail:
584 file->private_data = NULL;
585 kfree(fp);
586 return error; 602 return error;
587} 603}
588 604
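The split leaves gfs2_open() as a thin wrapper: regular files take a shared glock around the common open so that i_size is current for the O_LARGEFILE check inside generic_file_open(); other file types skip the lock. Its shape, reduced to essentials (a sketch restating the hunk above, not a replacement for it):

	static int example_open(struct inode *inode, struct file *file)
	{
		struct gfs2_inode *ip = GFS2_I(inode);
		struct gfs2_holder gh;
		int error;

		if (!S_ISREG(inode->i_mode))
			return gfs2_open_common(inode, file);

		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
		if (error)
			return error;
		error = gfs2_open_common(inode, file);
		gfs2_glock_dq_uninit(&gh);
		return error;
	}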
@@ -896,7 +912,7 @@ out_uninit:
896 * cluster; until we do, disable leases (by just returning -EINVAL), 912 * cluster; until we do, disable leases (by just returning -EINVAL),
897 * unless the administrator has requested purely local locking. 913 * unless the administrator has requested purely local locking.
898 * 914 *
899 * Locking: called under lock_flocks 915 * Locking: called under i_lock
900 * 916 *
901 * Returns: errno 917 * Returns: errno
902 */ 918 */
@@ -1048,7 +1064,7 @@ const struct file_operations gfs2_file_fops = {
1048}; 1064};
1049 1065
1050const struct file_operations gfs2_dir_fops = { 1066const struct file_operations gfs2_dir_fops = {
1051 .readdir = gfs2_readdir, 1067 .iterate = gfs2_readdir,
1052 .unlocked_ioctl = gfs2_ioctl, 1068 .unlocked_ioctl = gfs2_ioctl,
1053 .open = gfs2_open, 1069 .open = gfs2_open,
1054 .release = gfs2_release, 1070 .release = gfs2_release,
@@ -1078,7 +1094,7 @@ const struct file_operations gfs2_file_fops_nolock = {
1078}; 1094};
1079 1095
1080const struct file_operations gfs2_dir_fops_nolock = { 1096const struct file_operations gfs2_dir_fops_nolock = {
1081 .readdir = gfs2_readdir, 1097 .iterate = gfs2_readdir,
1082 .unlocked_ioctl = gfs2_ioctl, 1098 .unlocked_ioctl = gfs2_ioctl,
1083 .open = gfs2_open, 1099 .open = gfs2_open,
1084 .release = gfs2_release, 1100 .release = gfs2_release,
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c66e99c97571..5f2e5224c51c 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -54,7 +54,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
54 struct gfs2_bufdata *bd, *tmp; 54 struct gfs2_bufdata *bd, *tmp;
55 struct buffer_head *bh; 55 struct buffer_head *bh;
56 const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); 56 const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock);
57 sector_t blocknr;
58 57
59 gfs2_log_lock(sdp); 58 gfs2_log_lock(sdp);
60 spin_lock(&sdp->sd_ail_lock); 59 spin_lock(&sdp->sd_ail_lock);
@@ -65,13 +64,6 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
65 continue; 64 continue;
66 gfs2_ail_error(gl, bh); 65 gfs2_ail_error(gl, bh);
67 } 66 }
68 blocknr = bh->b_blocknr;
69 bh->b_private = NULL;
70 gfs2_remove_from_ail(bd); /* drops ref on bh */
71
72 bd->bd_bh = NULL;
73 bd->bd_blkno = blocknr;
74
75 gfs2_trans_add_revoke(sdp, bd); 67 gfs2_trans_add_revoke(sdp, bd);
76 } 68 }
77 GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); 69 GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 62b484e4a9e4..bbb2715171cd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -313,7 +313,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
313 goto out; 313 goto out;
314 } 314 }
315 315
316 inode = gfs2_dir_search(dir, name); 316 inode = gfs2_dir_search(dir, name, false);
317 if (IS_ERR(inode)) 317 if (IS_ERR(inode))
318 error = PTR_ERR(inode); 318 error = PTR_ERR(inode);
319out: 319out:
@@ -346,17 +346,6 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
346 if (!dip->i_inode.i_nlink) 346 if (!dip->i_inode.i_nlink)
347 return -ENOENT; 347 return -ENOENT;
348 348
349 error = gfs2_dir_check(&dip->i_inode, name, NULL);
350 switch (error) {
351 case -ENOENT:
352 error = 0;
353 break;
354 case 0:
355 return -EEXIST;
356 default:
357 return error;
358 }
359
360 if (dip->i_entries == (u32)-1) 349 if (dip->i_entries == (u32)-1)
361 return -EFBIG; 350 return -EFBIG;
362 if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) 351 if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
@@ -546,6 +535,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
546 * gfs2_create_inode - Create a new inode 535 * gfs2_create_inode - Create a new inode
547 * @dir: The parent directory 536 * @dir: The parent directory
548 * @dentry: The new dentry 537 * @dentry: The new dentry
538 * @file: If non-NULL, the file which is being opened
549 * @mode: The permissions on the new inode 539 * @mode: The permissions on the new inode
550 * @dev: For device nodes, this is the device number 540 * @dev: For device nodes, this is the device number
551 * @symname: For symlinks, this is the link destination 541 * @symname: For symlinks, this is the link destination
@@ -555,8 +545,9 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
555 */ 545 */
556 546
557static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, 547static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
548 struct file *file,
558 umode_t mode, dev_t dev, const char *symname, 549 umode_t mode, dev_t dev, const char *symname,
559 unsigned int size, int excl) 550 unsigned int size, int excl, int *opened)
560{ 551{
561 const struct qstr *name = &dentry->d_name; 552 const struct qstr *name = &dentry->d_name;
562 struct gfs2_holder ghs[2]; 553 struct gfs2_holder ghs[2];
@@ -564,6 +555,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
564 struct gfs2_inode *dip = GFS2_I(dir), *ip; 555 struct gfs2_inode *dip = GFS2_I(dir), *ip;
565 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 556 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
566 struct gfs2_glock *io_gl; 557 struct gfs2_glock *io_gl;
558 struct dentry *d;
567 int error; 559 int error;
568 u32 aflags = 0; 560 u32 aflags = 0;
569 int arq; 561 int arq;
@@ -584,15 +576,30 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
584 goto fail; 576 goto fail;
585 577
586 error = create_ok(dip, name, mode); 578 error = create_ok(dip, name, mode);
587 if ((error == -EEXIST) && S_ISREG(mode) && !excl) {
588 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
589 gfs2_glock_dq_uninit(ghs);
590 d_instantiate(dentry, inode);
591 return IS_ERR(inode) ? PTR_ERR(inode) : 0;
592 }
593 if (error) 579 if (error)
594 goto fail_gunlock; 580 goto fail_gunlock;
595 581
582 inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
583 error = PTR_ERR(inode);
584 if (!IS_ERR(inode)) {
585 d = d_splice_alias(inode, dentry);
586 error = 0;
587 if (file && !IS_ERR(d)) {
588 if (d == NULL)
589 d = dentry;
590 if (S_ISREG(inode->i_mode))
591 error = finish_open(file, d, gfs2_open_common, opened);
592 else
593 error = finish_no_open(file, d);
594 }
595 gfs2_glock_dq_uninit(ghs);
596 if (IS_ERR(d))
597 return PTR_RET(d);
598 return error;
599 } else if (error != -ENOENT) {
600 goto fail_gunlock;
601 }
602
596 arq = error = gfs2_diradd_alloc_required(dir, name); 603 arq = error = gfs2_diradd_alloc_required(dir, name);
597 if (error < 0) 604 if (error < 0)
598 goto fail_gunlock; 605 goto fail_gunlock;
@@ -686,10 +693,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
686 goto fail_gunlock3; 693 goto fail_gunlock3;
687 694
688 mark_inode_dirty(inode); 695 mark_inode_dirty(inode);
696 d_instantiate(dentry, inode);
697 if (file)
698 error = finish_open(file, dentry, gfs2_open_common, opened);
689 gfs2_glock_dq_uninit(ghs); 699 gfs2_glock_dq_uninit(ghs);
690 gfs2_glock_dq_uninit(ghs + 1); 700 gfs2_glock_dq_uninit(ghs + 1);
691 d_instantiate(dentry, inode); 701 return error;
692 return 0;
693 702
694fail_gunlock3: 703fail_gunlock3:
695 gfs2_glock_dq_uninit(ghs + 1); 704 gfs2_glock_dq_uninit(ghs + 1);
@@ -729,36 +738,56 @@ fail:
729static int gfs2_create(struct inode *dir, struct dentry *dentry, 738static int gfs2_create(struct inode *dir, struct dentry *dentry,
730 umode_t mode, bool excl) 739 umode_t mode, bool excl)
731{ 740{
732 return gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0, excl); 741 return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl, NULL);
733} 742}
734 743
735/** 744/**
736 * gfs2_lookup - Look up a filename in a directory and return its inode 745 * __gfs2_lookup - Look up a filename in a directory and return its inode
737 * @dir: The directory inode 746 * @dir: The directory inode
738 * @dentry: The dentry of the new inode 747 * @dentry: The dentry of the new inode
739 * @nd: passed from Linux VFS, ignored by us 748 * @file: File to be opened
749 * @opened: atomic_open flags
740 * 750 *
741 * Called by the VFS layer. Lock dir and call gfs2_lookupi()
742 * 751 *
743 * Returns: errno 752 * Returns: errno
744 */ 753 */
745 754
746static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, 755static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
747 unsigned int flags) 756 struct file *file, int *opened)
748{ 757{
749 struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0); 758 struct inode *inode;
750 if (inode && !IS_ERR(inode)) { 759 struct dentry *d;
751 struct gfs2_glock *gl = GFS2_I(inode)->i_gl; 760 struct gfs2_holder gh;
752 struct gfs2_holder gh; 761 struct gfs2_glock *gl;
753 int error; 762 int error;
754 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); 763
755 if (error) { 764 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
756 iput(inode); 765 if (!inode)
757 return ERR_PTR(error); 766 return NULL;
758 } 767 if (IS_ERR(inode))
759 gfs2_glock_dq_uninit(&gh); 768 return ERR_CAST(inode);
769
770 gl = GFS2_I(inode)->i_gl;
771 error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
772 if (error) {
773 iput(inode);
774 return ERR_PTR(error);
760 } 775 }
761 return d_splice_alias(inode, dentry); 776
777 d = d_splice_alias(inode, dentry);
778 if (file && S_ISREG(inode->i_mode))
779 error = finish_open(file, dentry, gfs2_open_common, opened);
780
781 gfs2_glock_dq_uninit(&gh);
782 if (error)
783 return ERR_PTR(error);
784 return d;
785}
786
787static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
788 unsigned flags)
789{
790 return __gfs2_lookup(dir, dentry, NULL, NULL);
762} 791}
763 792
764/** 793/**
@@ -1076,7 +1105,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
1076 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) 1105 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
1077 return -ENAMETOOLONG; 1106 return -ENAMETOOLONG;
1078 1107
1079 return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size, 0); 1108 return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0, NULL);
1080} 1109}
1081 1110
1082/** 1111/**
@@ -1092,7 +1121,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1092{ 1121{
1093 struct gfs2_sbd *sdp = GFS2_SB(dir); 1122 struct gfs2_sbd *sdp = GFS2_SB(dir);
1094 unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); 1123 unsigned dsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
1095 return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, dsize, 0); 1124 return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0, NULL);
1096} 1125}
1097 1126
1098/** 1127/**
@@ -1107,7 +1136,43 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1107static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, 1136static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
1108 dev_t dev) 1137 dev_t dev)
1109{ 1138{
1110 return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0, 0); 1139 return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0, NULL);
1140}
1141
1142/**
1143 * gfs2_atomic_open - Atomically open a file
1144 * @dir: The directory
1145 * @dentry: The proposed new entry
1146 * @file: The proposed new struct file
1147 * @flags: open flags
1148 * @mode: File mode
1149 * @opened: Flag to say whether the file has been opened or not
1150 *
1151 * Returns: error code or 0 for success
1152 */
1153
1154static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
1155 struct file *file, unsigned flags,
1156 umode_t mode, int *opened)
1157{
1158 struct dentry *d;
1159 bool excl = !!(flags & O_EXCL);
1160
1161 d = __gfs2_lookup(dir, dentry, file, opened);
1162 if (IS_ERR(d))
1163 return PTR_ERR(d);
1164 if (d == NULL)
1165 d = dentry;
1166 if (d->d_inode) {
1167 if (!(*opened & FILE_OPENED))
1168 return finish_no_open(file, d);
1169 return 0;
1170 }
1171
1172 if (!(flags & O_CREAT))
1173 return -ENOENT;
1174
1175 return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl, opened);
1111} 1176}
1112 1177
1113/* 1178/*
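For orientation, the ->atomic_open() contract this hunk implements: the VFS calls it with the parent directory locked; the method either opens the file itself via finish_open(), which sets FILE_OPENED in *opened, or hands the dentry back via finish_no_open() so the VFS falls through to the ordinary lookup-then-open path. A skeletal sketch of that shape, with the examplefs_* helpers hypothetical:

	static int examplefs_atomic_open(struct inode *dir, struct dentry *dentry,
					 struct file *file, unsigned flags,
					 umode_t mode, int *opened)
	{
		/* hypothetical helper: a lookup that may splice an alias
		 * and may already have opened the file */
		struct dentry *d = examplefs_lookup_open(dir, dentry, file, opened);

		if (IS_ERR(d))
			return PTR_ERR(d);
		if (d)
			dentry = d;
		if (dentry->d_inode) {
			if (!(*opened & FILE_OPENED))
				return finish_no_open(file, d);
			return 0;
		}
		if (!(flags & O_CREAT))
			return -ENOENT;
		/* hypothetical helper: create and open under the dir lock */
		return examplefs_create_open(dir, dentry, file, mode, opened);
	}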
@@ -1787,6 +1852,7 @@ const struct inode_operations gfs2_dir_iops = {
1787 .removexattr = gfs2_removexattr, 1852 .removexattr = gfs2_removexattr,
1788 .fiemap = gfs2_fiemap, 1853 .fiemap = gfs2_fiemap,
1789 .get_acl = gfs2_get_acl, 1854 .get_acl = gfs2_get_acl,
1855 .atomic_open = gfs2_atomic_open,
1790}; 1856};
1791 1857
1792const struct inode_operations gfs2_symlink_iops = { 1858const struct inode_operations gfs2_symlink_iops = {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c53c7477f6da..ba4d9492d422 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -109,6 +109,7 @@ extern int gfs2_permission(struct inode *inode, int mask);
109extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); 109extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr);
110extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); 110extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
111extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 111extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
112extern int gfs2_open_common(struct inode *inode, struct file *file);
112 113
113extern const struct inode_operations gfs2_file_iops; 114extern const struct inode_operations gfs2_file_iops;
114extern const struct inode_operations gfs2_dir_iops; 115extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b404f4853034..610613fb65b5 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -211,15 +211,16 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
211static int gfs2_ail1_empty(struct gfs2_sbd *sdp) 211static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
212{ 212{
213 struct gfs2_trans *tr, *s; 213 struct gfs2_trans *tr, *s;
214 int oldest_tr = 1;
214 int ret; 215 int ret;
215 216
216 spin_lock(&sdp->sd_ail_lock); 217 spin_lock(&sdp->sd_ail_lock);
217 list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { 218 list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
218 gfs2_ail1_empty_one(sdp, tr); 219 gfs2_ail1_empty_one(sdp, tr);
219 if (list_empty(&tr->tr_ail1_list)) 220 if (list_empty(&tr->tr_ail1_list) && oldest_tr)
220 list_move(&tr->tr_list, &sdp->sd_ail2_list); 221 list_move(&tr->tr_list, &sdp->sd_ail2_list);
221 else 222 else
222 break; 223 oldest_tr = 0;
223 } 224 }
224 ret = list_empty(&sdp->sd_ail1_list); 225 ret = list_empty(&sdp->sd_ail1_list);
225 spin_unlock(&sdp->sd_ail_lock); 226 spin_unlock(&sdp->sd_ail_lock);
@@ -317,7 +318,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
317 318
318int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) 319int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
319{ 320{
320 unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize); 321 unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize);
321 unsigned wanted = blks + reserved_blks; 322 unsigned wanted = blks + reserved_blks;
322 DEFINE_WAIT(wait); 323 DEFINE_WAIT(wait);
323 int did_wait = 0; 324 int did_wait = 0;
@@ -545,6 +546,76 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip)
545 spin_unlock(&sdp->sd_ordered_lock); 546 spin_unlock(&sdp->sd_ordered_lock);
546} 547}
547 548
549void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
550{
551 struct buffer_head *bh = bd->bd_bh;
552 struct gfs2_glock *gl = bd->bd_gl;
553
554 gfs2_remove_from_ail(bd);
555 bd->bd_bh = NULL;
556 bh->b_private = NULL;
557 bd->bd_blkno = bh->b_blocknr;
558 bd->bd_ops = &gfs2_revoke_lops;
559 sdp->sd_log_num_revoke++;
560 atomic_inc(&gl->gl_revokes);
561 set_bit(GLF_LFLUSH, &gl->gl_flags);
562 list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
563}
564
565void gfs2_write_revokes(struct gfs2_sbd *sdp)
566{
567 struct gfs2_trans *tr;
568 struct gfs2_bufdata *bd, *tmp;
569 int have_revokes = 0;
570 int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
571
572 gfs2_ail1_empty(sdp);
573 spin_lock(&sdp->sd_ail_lock);
574 list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
575 list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
576 if (list_empty(&bd->bd_list)) {
577 have_revokes = 1;
578 goto done;
579 }
580 }
581 }
582done:
583 spin_unlock(&sdp->sd_ail_lock);
584 if (have_revokes == 0)
585 return;
586 while (sdp->sd_log_num_revoke > max_revokes)
587 max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64);
588 max_revokes -= sdp->sd_log_num_revoke;
589 if (!sdp->sd_log_num_revoke) {
590 atomic_dec(&sdp->sd_log_blks_free);
591 /* If no blocks have been reserved, we need to also
592 * reserve a block for the header */
593 if (!sdp->sd_log_blks_reserved)
594 atomic_dec(&sdp->sd_log_blks_free);
595 }
596 gfs2_log_lock(sdp);
597 spin_lock(&sdp->sd_ail_lock);
598 list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
599 list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
600 if (max_revokes == 0)
601 goto out_of_blocks;
602 if (!list_empty(&bd->bd_list))
603 continue;
604 gfs2_add_revoke(sdp, bd);
605 max_revokes--;
606 }
607 }
608out_of_blocks:
609 spin_unlock(&sdp->sd_ail_lock);
610 gfs2_log_unlock(sdp);
611
612 if (!sdp->sd_log_num_revoke) {
613 atomic_inc(&sdp->sd_log_blks_free);
614 if (!sdp->sd_log_blks_reserved)
615 atomic_inc(&sdp->sd_log_blks_free);
616 }
617}
618
548/** 619/**
549 * log_write_header - Get and initialize a journal header buffer 620 * log_write_header - Get and initialize a journal header buffer
550 * @sdp: The GFS2 superblock 621 * @sdp: The GFS2 superblock
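The max_revokes arithmetic above is worth unpacking. Revokes are written as one log descriptor block followed, if needed, by plain metadata blocks, with each revoke record an 8-byte block number. Assuming the on-disk header sizes of this era (sizeof(struct gfs2_log_descriptor) == 72 and sizeof(struct gfs2_meta_header) == 24), a 4096-byte block gives (4096 - 72) / 8 = 503 revokes in the descriptor block and (4096 - 24) / 8 = 509 per continuation block, which is why the while loop grows max_revokes in continuation-block-sized steps before subtracting the revokes already queued.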
@@ -562,7 +633,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
562 lh = page_address(page); 633 lh = page_address(page);
563 clear_page(lh); 634 clear_page(lh);
564 635
565 gfs2_ail1_empty(sdp);
566 tail = current_tail(sdp); 636 tail = current_tail(sdp);
567 637
568 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 638 lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 3566f35915e0..37216634f0aa 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -72,5 +72,7 @@ extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
72extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); 72extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
73extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); 73extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
74extern int gfs2_logd(void *data); 74extern int gfs2_logd(void *data);
75extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
76extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
75 77
76#endif /* __LOG_DOT_H__ */ 78#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 6c33d7b6e0c4..17c5b5d7dc88 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -16,6 +16,7 @@
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/bio.h> 17#include <linux/bio.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/list_sort.h>
19 20
20#include "gfs2.h" 21#include "gfs2.h"
21#include "incore.h" 22#include "incore.h"
@@ -401,6 +402,20 @@ static void gfs2_check_magic(struct buffer_head *bh)
401 kunmap_atomic(kaddr); 402 kunmap_atomic(kaddr);
402} 403}
403 404
405static int blocknr_cmp(void *priv, struct list_head *a, struct list_head *b)
406{
407 struct gfs2_bufdata *bda, *bdb;
408
409 bda = list_entry(a, struct gfs2_bufdata, bd_list);
410 bdb = list_entry(b, struct gfs2_bufdata, bd_list);
411
412 if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr)
413 return -1;
414 if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr)
415 return 1;
416 return 0;
417}
418
404static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, 419static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
405 unsigned int total, struct list_head *blist, 420 unsigned int total, struct list_head *blist,
406 bool is_databuf) 421 bool is_databuf)
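list_sort() from <linux/list_sort.h> is a stable merge sort over a list_head list; the comparator receives the opaque priv pointer plus two nodes and returns negative, zero, or positive, exactly as blocknr_cmp does above. Sorting the buffers by block number means the log writes go out in mostly ascending order, which the block layer can merge far better. The API in isolation, on a hypothetical struct:

	#include <linux/list.h>
	#include <linux/list_sort.h>

	struct item {
		struct list_head link;
		u64 key;
	};

	static int item_cmp(void *priv, struct list_head *a, struct list_head *b)
	{
		struct item *ia = list_entry(a, struct item, link);
		struct item *ib = list_entry(b, struct item, link);

		if (ia->key < ib->key)
			return -1;
		return ia->key > ib->key;	/* 1 if greater, 0 if equal */
	}

	/* usage: list_sort(NULL, &my_list, item_cmp); */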
@@ -413,6 +428,7 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
413 __be64 *ptr; 428 __be64 *ptr;
414 429
415 gfs2_log_lock(sdp); 430 gfs2_log_lock(sdp);
431 list_sort(NULL, blist, blocknr_cmp);
416 bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list); 432 bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list);
417 while(total) { 433 while(total) {
418 num = total; 434 num = total;
@@ -590,6 +606,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
590 struct page *page; 606 struct page *page;
591 unsigned int length; 607 unsigned int length;
592 608
609 gfs2_write_revokes(sdp);
593 if (!sdp->sd_log_num_revoke) 610 if (!sdp->sd_log_num_revoke)
594 return; 611 return;
595 612
@@ -836,10 +853,6 @@ const struct gfs2_log_operations gfs2_revoke_lops = {
836 .lo_name = "revoke", 853 .lo_name = "revoke",
837}; 854};
838 855
839const struct gfs2_log_operations gfs2_rg_lops = {
840 .lo_name = "rg",
841};
842
843const struct gfs2_log_operations gfs2_databuf_lops = { 856const struct gfs2_log_operations gfs2_databuf_lops = {
844 .lo_before_commit = databuf_lo_before_commit, 857 .lo_before_commit = databuf_lo_before_commit,
845 .lo_after_commit = databuf_lo_after_commit, 858 .lo_after_commit = databuf_lo_after_commit,
@@ -851,7 +864,6 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
851const struct gfs2_log_operations *gfs2_log_ops[] = { 864const struct gfs2_log_operations *gfs2_log_ops[] = {
852 &gfs2_databuf_lops, 865 &gfs2_databuf_lops,
853 &gfs2_buf_lops, 866 &gfs2_buf_lops,
854 &gfs2_rg_lops,
855 &gfs2_revoke_lops, 867 &gfs2_revoke_lops,
856 NULL, 868 NULL,
857}; 869};
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 87e062e05c92..9ca2e6438419 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -23,7 +23,6 @@
23extern const struct gfs2_log_operations gfs2_glock_lops; 23extern const struct gfs2_log_operations gfs2_glock_lops;
24extern const struct gfs2_log_operations gfs2_buf_lops; 24extern const struct gfs2_log_operations gfs2_buf_lops;
25extern const struct gfs2_log_operations gfs2_revoke_lops; 25extern const struct gfs2_log_operations gfs2_revoke_lops;
26extern const struct gfs2_log_operations gfs2_rg_lops;
27extern const struct gfs2_log_operations gfs2_databuf_lops; 26extern const struct gfs2_log_operations gfs2_databuf_lops;
28 27
29extern const struct gfs2_log_operations *gfs2_log_ops[]; 28extern const struct gfs2_log_operations *gfs2_log_ops[];
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 1a89afb68472..0da390686c08 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -296,10 +296,6 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
296 if (bd) { 296 if (bd) {
297 spin_lock(&sdp->sd_ail_lock); 297 spin_lock(&sdp->sd_ail_lock);
298 if (bd->bd_tr) { 298 if (bd->bd_tr) {
299 gfs2_remove_from_ail(bd);
300 bh->b_private = NULL;
301 bd->bd_bh = NULL;
302 bd->bd_blkno = bh->b_blocknr;
303 gfs2_trans_add_revoke(sdp, bd); 299 gfs2_trans_add_revoke(sdp, bd);
304 } 300 }
305 spin_unlock(&sdp->sd_ail_lock); 301 spin_unlock(&sdp->sd_ail_lock);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 60ede2a0f43f..0262c190b6f9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -916,16 +916,16 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
916 goto fail_quotad; 916 goto fail_quotad;
917 917
918 p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); 918 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
919 error = IS_ERR(p); 919 if (IS_ERR(p)) {
920 if (error) { 920 error = PTR_ERR(p);
921 fs_err(sdp, "can't start logd thread: %d\n", error); 921 fs_err(sdp, "can't start logd thread: %d\n", error);
922 return error; 922 return error;
923 } 923 }
924 sdp->sd_logd_process = p; 924 sdp->sd_logd_process = p;
925 925
926 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); 926 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
927 error = IS_ERR(p); 927 if (IS_ERR(p)) {
928 if (error) { 928 error = PTR_ERR(p);
929 fs_err(sdp, "can't start quotad thread: %d\n", error); 929 fs_err(sdp, "can't start quotad thread: %d\n", error);
930 goto fail; 930 goto fail;
931 } 931 }
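The two hunks above fix a classic ERR_PTR slip: kthread_run() never returns NULL, so IS_ERR(p) evaluates to 0 or 1, and the old code stored that boolean as the error, reporting errno 1 instead of the real failure. The correct idiom:

	struct task_struct *p;

	p = kthread_run(threadfn, data, "examplefs_worker");
	if (IS_ERR(p))
		return PTR_ERR(p);	/* decode the real negative errno */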
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c253b13722e8..3768c2f40e43 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1154,11 +1154,6 @@ int gfs2_quota_sync(struct super_block *sb, int type)
1154 return error; 1154 return error;
1155} 1155}
1156 1156
1157static int gfs2_quota_sync_timeo(struct super_block *sb, int type)
1158{
1159 return gfs2_quota_sync(sb, type);
1160}
1161
1162int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) 1157int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
1163{ 1158{
1164 struct gfs2_quota_data *qd; 1159 struct gfs2_quota_data *qd;
@@ -1414,7 +1409,7 @@ int gfs2_quotad(void *data)
1414 &tune->gt_statfs_quantum); 1409 &tune->gt_statfs_quantum);
1415 1410
1416 /* Update quota file */ 1411 /* Update quota file */
1417 quotad_check_timeo(sdp, "sync", gfs2_quota_sync_timeo, t, 1412 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
1418 &quotad_timeo, &tune->gt_quota_quantum); 1413 &quotad_timeo, &tune->gt_quota_quantum);
1419 1414
1420 /* Check for & recover partially truncated inodes */ 1415 /* Check for & recover partially truncated inodes */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9809156e3d04..69317435faa7 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1288,13 +1288,15 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
1288 minlen = max_t(u64, r.minlen, 1288 minlen = max_t(u64, r.minlen,
1289 q->limits.discard_granularity) >> bs_shift; 1289 q->limits.discard_granularity) >> bs_shift;
1290 1290
1291 if (end <= start || minlen > sdp->sd_max_rg_data)
1292 return -EINVAL;
1293
1291 rgd = gfs2_blk2rgrpd(sdp, start, 0); 1294 rgd = gfs2_blk2rgrpd(sdp, start, 0);
1292 rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0); 1295 rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1293 1296
1294 if (end <= start || 1297 if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1295 minlen > sdp->sd_max_rg_data || 1298 && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1296 start > rgd_end->rd_data0 + rgd_end->rd_data) 1299 return -EINVAL; /* start is beyond the end of the fs */
1297 return -EINVAL;
1298 1300
1299 while (1) { 1301 while (1) {
1300 1302
@@ -1336,7 +1338,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
1336 } 1338 }
1337 1339
1338out: 1340out:
1339 r.len = trimmed << 9; 1341 r.len = trimmed << bs_shift;
1340 if (copy_to_user(argp, &r, sizeof(r))) 1342 if (copy_to_user(argp, &r, sizeof(r)))
1341 return -EFAULT; 1343 return -EFAULT;
1342 1344
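Two fixes land in this fitrim hunk. First, the range sanity checks are hoisted ahead of the rgrp lookups, so a bogus range can no longer be dereferenced through a possibly invalid rgd_end. Second, the reported length gets the right units: trimmed is counted in filesystem blocks, so on a 4 KiB-block filesystem the old "trimmed << 9" under-reported by a factor of 2^(12-9) = 8, while "trimmed << bs_shift" (here a shift of 12) returns the byte count FITRIM expects.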
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 7374907742a8..2b20d7046bf3 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -270,19 +270,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
270 270
271void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) 271void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
272{ 272{
273 struct gfs2_glock *gl = bd->bd_gl;
274 struct gfs2_trans *tr = current->journal_info; 273 struct gfs2_trans *tr = current->journal_info;
275 274
276 BUG_ON(!list_empty(&bd->bd_list)); 275 BUG_ON(!list_empty(&bd->bd_list));
277 BUG_ON(!list_empty(&bd->bd_ail_st_list)); 276 gfs2_add_revoke(sdp, bd);
278 BUG_ON(!list_empty(&bd->bd_ail_gl_list));
279 bd->bd_ops = &gfs2_revoke_lops;
280 tr->tr_touched = 1; 277 tr->tr_touched = 1;
281 tr->tr_num_revoke++; 278 tr->tr_num_revoke++;
282 sdp->sd_log_num_revoke++;
283 atomic_inc(&gl->gl_revokes);
284 set_bit(GLF_LFLUSH, &gl->gl_flags);
285 list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
286} 279}
287 280
288void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) 281void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index e0101b6fb0d7..145566851e7a 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -51,9 +51,9 @@ done:
51/* 51/*
52 * hfs_readdir 52 * hfs_readdir
53 */ 53 */
54static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 54static int hfs_readdir(struct file *file, struct dir_context *ctx)
55{ 55{
56 struct inode *inode = file_inode(filp); 56 struct inode *inode = file_inode(file);
57 struct super_block *sb = inode->i_sb; 57 struct super_block *sb = inode->i_sb;
58 int len, err; 58 int len, err;
59 char strbuf[HFS_MAX_NAMELEN]; 59 char strbuf[HFS_MAX_NAMELEN];
@@ -62,7 +62,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
62 struct hfs_readdir_data *rd; 62 struct hfs_readdir_data *rd;
63 u16 type; 63 u16 type;
64 64
65 if (filp->f_pos >= inode->i_size) 65 if (ctx->pos >= inode->i_size)
66 return 0; 66 return 0;
67 67
68 err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); 68 err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
@@ -73,14 +73,13 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
73 if (err) 73 if (err)
74 goto out; 74 goto out;
75 75
76 switch ((u32)filp->f_pos) { 76 if (ctx->pos == 0) {
77 case 0:
78 /* This is completely artificial... */ 77 /* This is completely artificial... */
79 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) 78 if (!dir_emit_dot(file, ctx))
80 goto out; 79 goto out;
81 filp->f_pos++; 80 ctx->pos = 1;
82 /* fall through */ 81 }
83 case 1: 82 if (ctx->pos == 1) {
84 if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { 83 if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
85 err = -EIO; 84 err = -EIO;
86 goto out; 85 goto out;
@@ -97,18 +96,16 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
97 // err = -EIO; 96 // err = -EIO;
98 // goto out; 97 // goto out;
99 //} 98 //}
100 if (filldir(dirent, "..", 2, 1, 99 if (!dir_emit(ctx, "..", 2,
101 be32_to_cpu(entry.thread.ParID), DT_DIR)) 100 be32_to_cpu(entry.thread.ParID), DT_DIR))
102 goto out; 101 goto out;
103 filp->f_pos++; 102 ctx->pos = 2;
104 /* fall through */
105 default:
106 if (filp->f_pos >= inode->i_size)
107 goto out;
108 err = hfs_brec_goto(&fd, filp->f_pos - 1);
109 if (err)
110 goto out;
111 } 103 }
104 if (ctx->pos >= inode->i_size)
105 goto out;
106 err = hfs_brec_goto(&fd, ctx->pos - 1);
107 if (err)
108 goto out;
112 109
113 for (;;) { 110 for (;;) {
114 if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { 111 if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) {
@@ -131,7 +128,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
131 err = -EIO; 128 err = -EIO;
132 goto out; 129 goto out;
133 } 130 }
134 if (filldir(dirent, strbuf, len, filp->f_pos, 131 if (!dir_emit(ctx, strbuf, len,
135 be32_to_cpu(entry.dir.DirID), DT_DIR)) 132 be32_to_cpu(entry.dir.DirID), DT_DIR))
136 break; 133 break;
137 } else if (type == HFS_CDR_FIL) { 134 } else if (type == HFS_CDR_FIL) {
@@ -140,7 +137,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
140 err = -EIO; 137 err = -EIO;
141 goto out; 138 goto out;
142 } 139 }
143 if (filldir(dirent, strbuf, len, filp->f_pos, 140 if (!dir_emit(ctx, strbuf, len,
144 be32_to_cpu(entry.file.FlNum), DT_REG)) 141 be32_to_cpu(entry.file.FlNum), DT_REG))
145 break; 142 break;
146 } else { 143 } else {
@@ -148,22 +145,22 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
148 err = -EIO; 145 err = -EIO;
149 goto out; 146 goto out;
150 } 147 }
151 filp->f_pos++; 148 ctx->pos++;
152 if (filp->f_pos >= inode->i_size) 149 if (ctx->pos >= inode->i_size)
153 goto out; 150 goto out;
154 err = hfs_brec_goto(&fd, 1); 151 err = hfs_brec_goto(&fd, 1);
155 if (err) 152 if (err)
156 goto out; 153 goto out;
157 } 154 }
158 rd = filp->private_data; 155 rd = file->private_data;
159 if (!rd) { 156 if (!rd) {
160 rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); 157 rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL);
161 if (!rd) { 158 if (!rd) {
162 err = -ENOMEM; 159 err = -ENOMEM;
163 goto out; 160 goto out;
164 } 161 }
165 filp->private_data = rd; 162 file->private_data = rd;
166 rd->file = filp; 163 rd->file = file;
167 list_add(&rd->list, &HFS_I(inode)->open_dir_list); 164 list_add(&rd->list, &HFS_I(inode)->open_dir_list);
168 } 165 }
169 memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key)); 166 memcpy(&rd->key, &fd.key, sizeof(struct hfs_cat_key));
@@ -306,7 +303,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
306 303
307const struct file_operations hfs_dir_operations = { 304const struct file_operations hfs_dir_operations = {
308 .read = generic_read_dir, 305 .read = generic_read_dir,
309 .readdir = hfs_readdir, 306 .iterate = hfs_readdir,
310 .llseek = generic_file_llseek, 307 .llseek = generic_file_llseek,
311 .release = hfs_dir_release, 308 .release = hfs_dir_release,
312}; 309};
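This is the first of several readdir conversions in this merge (hfsplus, hostfs and hpfs below follow the same recipe): ->readdir(file, dirent, filldir) becomes ->iterate(file, ctx), position state moves from file->f_pos to ctx->pos, and dir_emit() returns false when the user buffer is full instead of filldir's negative return. A minimal sketch of the new shape, with the examplefs_* helpers hypothetical:

	static int examplefs_iterate(struct file *file, struct dir_context *ctx)
	{
		struct inode *inode = file_inode(file);

		if (!dir_emit_dots(file, ctx))	/* "." and ".." as needed */
			return 0;

		while (ctx->pos < examplefs_nr_entries(inode)) {
			const struct examplefs_dirent *de =
				examplefs_entry(inode, ctx->pos);

			if (!dir_emit(ctx, de->name, de->namelen,
				      de->ino, DT_UNKNOWN))
				return 0;	/* buffer full; resume later */
			ctx->pos++;
		}
		return 0;
	}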
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index a73b11839a41..0524cda47a6e 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -229,13 +229,10 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
229/* string.c */ 229/* string.c */
230extern const struct dentry_operations hfs_dentry_operations; 230extern const struct dentry_operations hfs_dentry_operations;
231 231
232extern int hfs_hash_dentry(const struct dentry *, const struct inode *, 232extern int hfs_hash_dentry(const struct dentry *, struct qstr *);
233 struct qstr *);
234extern int hfs_strcmp(const unsigned char *, unsigned int, 233extern int hfs_strcmp(const unsigned char *, unsigned int,
235 const unsigned char *, unsigned int); 234 const unsigned char *, unsigned int);
236extern int hfs_compare_dentry(const struct dentry *parent, 235extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
237 const struct inode *pinode,
238 const struct dentry *dentry, const struct inode *inode,
239 unsigned int len, const char *str, const struct qstr *name); 236 unsigned int len, const char *str, const struct qstr *name);
240 237
241/* trans.c */ 238/* trans.c */
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 495a976a3cc9..85b610c3909f 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,8 +51,7 @@ static unsigned char caseorder[256] = {
51/* 51/*
52 * Hash a string to an integer in a case-independent way 52 * Hash a string to an integer in a case-independent way
53 */ 53 */
54int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, 54int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this)
55 struct qstr *this)
56{ 55{
57 const unsigned char *name = this->name; 56 const unsigned char *name = this->name;
58 unsigned int hash, len = this->len; 57 unsigned int hash, len = this->len;
@@ -93,8 +92,7 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
93 * Test for equality of two strings in the HFS filename character ordering. 92 * Test for equality of two strings in the HFS filename character ordering.
94 * return 1 on failure and 0 on success 93 * return 1 on failure and 0 on success
95 */ 94 */
96int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode, 95int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
97 const struct dentry *dentry, const struct inode *inode,
98 unsigned int len, const char *str, const struct qstr *name) 96 unsigned int len, const char *str, const struct qstr *name)
99{ 97{
100 const unsigned char *n1, *n2; 98 const unsigned char *n1, *n2;
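The prototype churn in hfs_fs.h and string.c (mirrored below for the hfsplus and hpfs dentry ops) tracks a VFS-wide change in this merge: d_hash and d_compare no longer receive inode pointers, only the dentries. The resulting shapes, as implied by the declarations above:

	/* dentry_operations hooks, 3.11-era signatures */
	int (*d_hash)(const struct dentry *dentry, struct qstr *name);
	int (*d_compare)(const struct dentry *parent, const struct dentry *dentry,
			 unsigned int len, const char *str,
			 const struct qstr *name);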
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index a37ac934732f..d8ce4bd17fc5 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -121,9 +121,9 @@ fail:
121 return ERR_PTR(err); 121 return ERR_PTR(err);
122} 122}
123 123
124static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir) 124static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
125{ 125{
126 struct inode *inode = file_inode(filp); 126 struct inode *inode = file_inode(file);
127 struct super_block *sb = inode->i_sb; 127 struct super_block *sb = inode->i_sb;
128 int len, err; 128 int len, err;
129 char strbuf[HFSPLUS_MAX_STRLEN + 1]; 129 char strbuf[HFSPLUS_MAX_STRLEN + 1];
@@ -132,7 +132,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
132 struct hfsplus_readdir_data *rd; 132 struct hfsplus_readdir_data *rd;
133 u16 type; 133 u16 type;
134 134
135 if (filp->f_pos >= inode->i_size) 135 if (file->f_pos >= inode->i_size)
136 return 0; 136 return 0;
137 137
138 err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); 138 err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
@@ -143,14 +143,13 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
143 if (err) 143 if (err)
144 goto out; 144 goto out;
145 145
146 switch ((u32)filp->f_pos) { 146 if (ctx->pos == 0) {
147 case 0:
148 /* This is completely artificial... */ 147 /* This is completely artificial... */
149 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) 148 if (!dir_emit_dot(file, ctx))
150 goto out; 149 goto out;
151 filp->f_pos++; 150 ctx->pos = 1;
152 /* fall through */ 151 }
153 case 1: 152 if (ctx->pos == 1) {
154 if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { 153 if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) {
155 err = -EIO; 154 err = -EIO;
156 goto out; 155 goto out;
@@ -168,19 +167,16 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
168 err = -EIO; 167 err = -EIO;
169 goto out; 168 goto out;
170 } 169 }
171 if (filldir(dirent, "..", 2, 1, 170 if (!dir_emit(ctx, "..", 2,
172 be32_to_cpu(entry.thread.parentID), DT_DIR)) 171 be32_to_cpu(entry.thread.parentID), DT_DIR))
173 goto out; 172 goto out;
174 filp->f_pos++; 173 ctx->pos = 2;
175 /* fall through */
176 default:
177 if (filp->f_pos >= inode->i_size)
178 goto out;
179 err = hfs_brec_goto(&fd, filp->f_pos - 1);
180 if (err)
181 goto out;
182 } 174 }
183 175 if (ctx->pos >= inode->i_size)
176 goto out;
177 err = hfs_brec_goto(&fd, ctx->pos - 1);
178 if (err)
179 goto out;
184 for (;;) { 180 for (;;) {
185 if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) { 181 if (be32_to_cpu(fd.key->cat.parent) != inode->i_ino) {
186 pr_err("walked past end of dir\n"); 182 pr_err("walked past end of dir\n");
@@ -211,7 +207,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
211 HFSPLUS_SB(sb)->hidden_dir->i_ino == 207 HFSPLUS_SB(sb)->hidden_dir->i_ino ==
212 be32_to_cpu(entry.folder.id)) 208 be32_to_cpu(entry.folder.id))
213 goto next; 209 goto next;
214 if (filldir(dirent, strbuf, len, filp->f_pos, 210 if (!dir_emit(ctx, strbuf, len,
215 be32_to_cpu(entry.folder.id), DT_DIR)) 211 be32_to_cpu(entry.folder.id), DT_DIR))
216 break; 212 break;
217 } else if (type == HFSPLUS_FILE) { 213 } else if (type == HFSPLUS_FILE) {
@@ -220,7 +216,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
220 err = -EIO; 216 err = -EIO;
221 goto out; 217 goto out;
222 } 218 }
223 if (filldir(dirent, strbuf, len, filp->f_pos, 219 if (!dir_emit(ctx, strbuf, len,
224 be32_to_cpu(entry.file.id), DT_REG)) 220 be32_to_cpu(entry.file.id), DT_REG))
225 break; 221 break;
226 } else { 222 } else {
@@ -229,22 +225,22 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
229 goto out; 225 goto out;
230 } 226 }
231next: 227next:
232 filp->f_pos++; 228 ctx->pos++;
233 if (filp->f_pos >= inode->i_size) 229 if (ctx->pos >= inode->i_size)
234 goto out; 230 goto out;
235 err = hfs_brec_goto(&fd, 1); 231 err = hfs_brec_goto(&fd, 1);
236 if (err) 232 if (err)
237 goto out; 233 goto out;
238 } 234 }
239 rd = filp->private_data; 235 rd = file->private_data;
240 if (!rd) { 236 if (!rd) {
241 rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL); 237 rd = kmalloc(sizeof(struct hfsplus_readdir_data), GFP_KERNEL);
242 if (!rd) { 238 if (!rd) {
243 err = -ENOMEM; 239 err = -ENOMEM;
244 goto out; 240 goto out;
245 } 241 }
246 filp->private_data = rd; 242 file->private_data = rd;
247 rd->file = filp; 243 rd->file = file;
248 list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list); 244 list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
249 } 245 }
250 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); 246 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
@@ -538,7 +534,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
538const struct file_operations hfsplus_dir_operations = { 534const struct file_operations hfsplus_dir_operations = {
539 .fsync = hfsplus_file_fsync, 535 .fsync = hfsplus_file_fsync,
540 .read = generic_read_dir, 536 .read = generic_read_dir,
541 .readdir = hfsplus_readdir, 537 .iterate = hfsplus_readdir,
542 .unlocked_ioctl = hfsplus_ioctl, 538 .unlocked_ioctl = hfsplus_ioctl,
543 .llseek = generic_file_llseek, 539 .llseek = generic_file_llseek,
544 .release = hfsplus_dir_release, 540 .release = hfsplus_dir_release,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 60b0a3388b26..ede79317cfb8 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -495,11 +495,8 @@ int hfsplus_uni2asc(struct super_block *,
495 const struct hfsplus_unistr *, char *, int *); 495 const struct hfsplus_unistr *, char *, int *);
496int hfsplus_asc2uni(struct super_block *, 496int hfsplus_asc2uni(struct super_block *,
497 struct hfsplus_unistr *, int, const char *, int); 497 struct hfsplus_unistr *, int, const char *, int);
498int hfsplus_hash_dentry(const struct dentry *dentry, 498int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
499 const struct inode *inode, struct qstr *str); 499int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
500int hfsplus_compare_dentry(const struct dentry *parent,
501 const struct inode *pinode,
502 const struct dentry *dentry, const struct inode *inode,
503 unsigned int len, const char *str, const struct qstr *name); 500 unsigned int len, const char *str, const struct qstr *name);
504 501
505/* wrapper.c */ 502/* wrapper.c */
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 2c2e47dcfdd8..e8ef121a4d8b 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -334,8 +334,7 @@ int hfsplus_asc2uni(struct super_block *sb,
334 * Composed unicode characters are decomposed and case-folding is performed 334 * Composed unicode characters are decomposed and case-folding is performed
335 * if the appropriate bits are (un)set on the superblock. 335 * if the appropriate bits are (un)set on the superblock.
336 */ 336 */
337int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode, 337int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
338 struct qstr *str)
339{ 338{
340 struct super_block *sb = dentry->d_sb; 339 struct super_block *sb = dentry->d_sb;
341 const char *astr; 340 const char *astr;
@@ -386,9 +385,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
386 * Composed unicode characters are decomposed and case-folding is performed 385 * Composed unicode characters are decomposed and case-folding is performed
387 * if the appropriate bits are (un)set on the superblock. 386 * if the appropriate bits are (un)set on the superblock.
388 */ 387 */
389int hfsplus_compare_dentry(const struct dentry *parent, 388int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
390 const struct inode *pinode,
391 const struct dentry *dentry, const struct inode *inode,
392 unsigned int len, const char *str, const struct qstr *name) 389 unsigned int len, const char *str, const struct qstr *name)
393{ 390{
394 struct super_block *sb = parent->d_sb; 391 struct super_block *sb = parent->d_sb;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 32f35f187989..cddb05217512 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -277,7 +277,7 @@ static const struct super_operations hostfs_sbops = {
277 .show_options = hostfs_show_options, 277 .show_options = hostfs_show_options,
278}; 278};
279 279
280int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) 280int hostfs_readdir(struct file *file, struct dir_context *ctx)
281{ 281{
282 void *dir; 282 void *dir;
283 char *name; 283 char *name;
@@ -292,12 +292,11 @@ int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
292 __putname(name); 292 __putname(name);
293 if (dir == NULL) 293 if (dir == NULL)
294 return -error; 294 return -error;
295 next = file->f_pos; 295 next = ctx->pos;
296 while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) { 296 while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) {
297 error = (*filldir)(ent, name, len, file->f_pos, 297 if (!dir_emit(ctx, name, len, ino, type))
298 ino, type); 298 break;
299 if (error) break; 299 ctx->pos = next;
300 file->f_pos = next;
301 } 300 }
302 close_dir(dir); 301 close_dir(dir);
303 return 0; 302 return 0;
@@ -393,7 +392,7 @@ static const struct file_operations hostfs_file_fops = {
393 392
394static const struct file_operations hostfs_dir_fops = { 393static const struct file_operations hostfs_dir_fops = {
395 .llseek = generic_file_llseek, 394 .llseek = generic_file_llseek,
396 .readdir = hostfs_readdir, 395 .iterate = hostfs_readdir,
397 .read = generic_read_dir, 396 .read = generic_read_dir,
398}; 397};
399 398
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index f49d1498aa2e..4d0a1afa058c 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -7,8 +7,37 @@
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/blkdev.h>
10#include "hpfs_fn.h" 11#include "hpfs_fn.h"
11 12
13void hpfs_prefetch_sectors(struct super_block *s, unsigned secno, int n)
14{
15 struct buffer_head *bh;
16 struct blk_plug plug;
17
18 if (n <= 0 || unlikely(secno >= hpfs_sb(s)->sb_fs_size))
19 return;
20
21 bh = sb_find_get_block(s, secno);
22 if (bh) {
23 if (buffer_uptodate(bh)) {
24 brelse(bh);
25 return;
26 }
27 brelse(bh);
28 };
29
30 blk_start_plug(&plug);
31 while (n > 0) {
32 if (unlikely(secno >= hpfs_sb(s)->sb_fs_size))
33 break;
34 sb_breadahead(s, secno);
35 secno++;
36 n--;
37 }
38 blk_finish_plug(&plug);
39}
40
12/* Map a sector into a buffer and return pointers to it and to the buffer. */ 41/* Map a sector into a buffer and return pointers to it and to the buffer. */
13 42
14void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp, 43void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head **bhp,
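hpfs_prefetch_sectors() above is the standard plugged-readahead pattern: queue a batch of asynchronous reads inside a blk_plug so the block layer can merge them into larger requests before they are dispatched. The core of the pattern, as a sketch:

	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < n; i++)
		sb_breadahead(sb, secno + i);	/* async; does not wait for I/O */
	blk_finish_plug(&plug);		/* unplug: submit the merged batch */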
@@ -18,6 +47,8 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head
18 47
19 hpfs_lock_assert(s); 48 hpfs_lock_assert(s);
20 49
50 hpfs_prefetch_sectors(s, secno, ahead);
51
21 cond_resched(); 52 cond_resched();
22 53
23 *bhp = bh = sb_bread(s, secno); 54 *bhp = bh = sb_bread(s, secno);
@@ -67,6 +98,8 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
67 return NULL; 98 return NULL;
68 } 99 }
69 100
101 hpfs_prefetch_sectors(s, secno, 4 + ahead);
102
70 qbh->data = data = kmalloc(2048, GFP_NOFS); 103 qbh->data = data = kmalloc(2048, GFP_NOFS);
71 if (!data) { 104 if (!data) {
72 printk("HPFS: hpfs_map_4sectors: out of memory\n"); 105 printk("HPFS: hpfs_map_4sectors: out of memory\n");
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 05d4816e4e77..fa27980f2229 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,8 +12,7 @@
12 * Note: the dentry argument is the parent dentry. 12 * Note: the dentry argument is the parent dentry.
13 */ 13 */
14 14
15static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode, 15static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
16 struct qstr *qstr)
17{ 16{
18 unsigned long hash; 17 unsigned long hash;
19 int i; 18 int i;
@@ -35,9 +34,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *ino
35 return 0; 34 return 0;
36} 35}
37 36
38static int hpfs_compare_dentry(const struct dentry *parent, 37static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
39 const struct inode *pinode,
40 const struct dentry *dentry, const struct inode *inode,
41 unsigned int len, const char *str, const struct qstr *name) 38 unsigned int len, const char *str, const struct qstr *name)
42{ 39{
43 unsigned al = len; 40 unsigned al = len;
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 834ac13c04b7..292b1acb9b81 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -57,14 +57,14 @@ fail:
57 return -ESPIPE; 57 return -ESPIPE;
58} 58}
59 59
60static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 60static int hpfs_readdir(struct file *file, struct dir_context *ctx)
61{ 61{
62 struct inode *inode = file_inode(filp); 62 struct inode *inode = file_inode(file);
63 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); 63 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
64 struct quad_buffer_head qbh; 64 struct quad_buffer_head qbh;
65 struct hpfs_dirent *de; 65 struct hpfs_dirent *de;
66 int lc; 66 int lc;
67 long old_pos; 67 loff_t next_pos;
68 unsigned char *tempname; 68 unsigned char *tempname;
69 int c1, c2 = 0; 69 int c1, c2 = 0;
70 int ret = 0; 70 int ret = 0;
@@ -105,11 +105,11 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
105 } 105 }
106 } 106 }
107 lc = hpfs_sb(inode->i_sb)->sb_lowercase; 107 lc = hpfs_sb(inode->i_sb)->sb_lowercase;
108 if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */ 108 if (ctx->pos == 12) { /* diff -r requires this (note, that diff -r */
109 filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */ 109 ctx->pos = 13; /* also fails on msdos filesystem in 2.0) */
110 goto out; 110 goto out;
111 } 111 }
112 if (filp->f_pos == 13) { 112 if (ctx->pos == 13) {
113 ret = -ENOENT; 113 ret = -ENOENT;
114 goto out; 114 goto out;
115 } 115 }
@@ -120,33 +120,34 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
120 accepted by filldir, but what can I do? 120 accepted by filldir, but what can I do?
121 maybe killall -9 ls helps */ 121 maybe killall -9 ls helps */
122 if (hpfs_sb(inode->i_sb)->sb_chk) 122 if (hpfs_sb(inode->i_sb)->sb_chk)
123 if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) { 123 if (hpfs_stop_cycles(inode->i_sb, ctx->pos, &c1, &c2, "hpfs_readdir")) {
124 ret = -EFSERROR; 124 ret = -EFSERROR;
125 goto out; 125 goto out;
126 } 126 }
127 if (filp->f_pos == 12) 127 if (ctx->pos == 12)
128 goto out; 128 goto out;
129 if (filp->f_pos == 3 || filp->f_pos == 4 || filp->f_pos == 5) { 129 if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) {
130 printk("HPFS: warning: pos==%d\n",(int)filp->f_pos); 130 printk("HPFS: warning: pos==%d\n",(int)ctx->pos);
131 goto out; 131 goto out;
132 } 132 }
133 if (filp->f_pos == 0) { 133 if (ctx->pos == 0) {
134 if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) 134 if (!dir_emit_dot(file, ctx))
135 goto out; 135 goto out;
136 filp->f_pos = 11; 136 ctx->pos = 11;
137 } 137 }
138 if (filp->f_pos == 11) { 138 if (ctx->pos == 11) {
139 if (filldir(dirent, "..", 2, filp->f_pos, hpfs_inode->i_parent_dir, DT_DIR) < 0) 139 if (!dir_emit(ctx, "..", 2, hpfs_inode->i_parent_dir, DT_DIR))
140 goto out; 140 goto out;
141 filp->f_pos = 1; 141 ctx->pos = 1;
142 } 142 }
143 if (filp->f_pos == 1) { 143 if (ctx->pos == 1) {
144 filp->f_pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; 144 ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1;
145 hpfs_add_pos(inode, &filp->f_pos); 145 hpfs_add_pos(inode, &file->f_pos);
146 filp->f_version = inode->i_version; 146 file->f_version = inode->i_version;
147 } 147 }
148 old_pos = filp->f_pos; 148 next_pos = ctx->pos;
149 if (!(de = map_pos_dirent(inode, &filp->f_pos, &qbh))) { 149 if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) {
150 ctx->pos = next_pos;
150 ret = -EIOERROR; 151 ret = -EIOERROR;
151 goto out; 152 goto out;
152 } 153 }
@@ -154,20 +155,21 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
154 if (hpfs_sb(inode->i_sb)->sb_chk) { 155 if (hpfs_sb(inode->i_sb)->sb_chk) {
155 if (de->first && !de->last && (de->namelen != 2 156 if (de->first && !de->last && (de->namelen != 2
156 || de ->name[0] != 1 || de->name[1] != 1)) 157 || de ->name[0] != 1 || de->name[1] != 1))
157 hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", old_pos); 158 hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08lx", (unsigned long)ctx->pos);
158 if (de->last && (de->namelen != 1 || de ->name[0] != 255)) 159 if (de->last && (de->namelen != 1 || de ->name[0] != 255))
159 hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", old_pos); 160 hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08lx", (unsigned long)ctx->pos);
160 } 161 }
161 hpfs_brelse4(&qbh); 162 hpfs_brelse4(&qbh);
163 ctx->pos = next_pos;
162 goto again; 164 goto again;
163 } 165 }
164 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3); 166 tempname = hpfs_translate_name(inode->i_sb, de->name, de->namelen, lc, de->not_8x3);
165 if (filldir(dirent, tempname, de->namelen, old_pos, le32_to_cpu(de->fnode), DT_UNKNOWN) < 0) { 167 if (!dir_emit(ctx, tempname, de->namelen, le32_to_cpu(de->fnode), DT_UNKNOWN)) {
166 filp->f_pos = old_pos;
167 if (tempname != de->name) kfree(tempname); 168 if (tempname != de->name) kfree(tempname);
168 hpfs_brelse4(&qbh); 169 hpfs_brelse4(&qbh);
169 goto out; 170 goto out;
170 } 171 }
172 ctx->pos = next_pos;
171 if (tempname != de->name) kfree(tempname); 173 if (tempname != de->name) kfree(tempname);
172 hpfs_brelse4(&qbh); 174 hpfs_brelse4(&qbh);
173 } 175 }
@@ -322,7 +324,7 @@ const struct file_operations hpfs_dir_ops =
322{ 324{
323 .llseek = hpfs_dir_lseek, 325 .llseek = hpfs_dir_lseek,
324 .read = generic_read_dir, 326 .read = generic_read_dir,
325 .readdir = hpfs_readdir, 327 .iterate = hpfs_readdir,
326 .release = hpfs_dir_release, 328 .release = hpfs_dir_release,
327 .fsync = hpfs_file_fsync, 329 .fsync = hpfs_file_fsync,
328}; 330};
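One detail worth noting in the hpfs conversion above: the new code computes next_pos first, emits, and only publishes it to ctx->pos afterwards, so when the getdents buffer fills mid-directory the next call resumes exactly at the entry that was not emitted. The pattern in isolation (names as in the hunk above):

	next_pos = ctx->pos;
	de = map_pos_dirent(inode, &next_pos, &qbh);	/* advances next_pos */
	...
	if (!dir_emit(ctx, tempname, de->namelen, ino, DT_UNKNOWN))
		goto out;	/* ctx->pos still names this entry */
	ctx->pos = next_pos;	/* committed only after a successful emit */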
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index e4ba5fe4c3b5..4e9dabcf1f4c 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include "hpfs_fn.h" 9#include "hpfs_fn.h"
10#include <linux/mpage.h>
10 11
11#define BLOCKS(size) (((size) + 511) >> 9) 12#define BLOCKS(size) (((size) + 511) >> 9)
12 13
@@ -34,7 +35,7 @@ int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
34 * so we must ignore such errors. 35 * so we must ignore such errors.
35 */ 36 */
36 37
37static secno hpfs_bmap(struct inode *inode, unsigned file_secno) 38static secno hpfs_bmap(struct inode *inode, unsigned file_secno, unsigned *n_secs)
38{ 39{
39 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode); 40 struct hpfs_inode_info *hpfs_inode = hpfs_i(inode);
40 unsigned n, disk_secno; 41 unsigned n, disk_secno;
@@ -42,11 +43,20 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
42 struct buffer_head *bh; 43 struct buffer_head *bh;
43 if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0; 44 if (BLOCKS(hpfs_i(inode)->mmu_private) <= file_secno) return 0;
44 n = file_secno - hpfs_inode->i_file_sec; 45 n = file_secno - hpfs_inode->i_file_sec;
45 if (n < hpfs_inode->i_n_secs) return hpfs_inode->i_disk_sec + n; 46 if (n < hpfs_inode->i_n_secs) {
47 *n_secs = hpfs_inode->i_n_secs - n;
48 return hpfs_inode->i_disk_sec + n;
49 }
46 if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0; 50 if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0;
47 disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh); 51 disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh);
48 if (disk_secno == -1) return 0; 52 if (disk_secno == -1) return 0;
49 if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0; 53 if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0;
54 n = file_secno - hpfs_inode->i_file_sec;
55 if (n < hpfs_inode->i_n_secs) {
56 *n_secs = hpfs_inode->i_n_secs - n;
57 return hpfs_inode->i_disk_sec + n;
58 }
59 *n_secs = 1;
50 return disk_secno; 60 return disk_secno;
51} 61}
52 62
@@ -67,10 +77,14 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
67{ 77{
68 int r; 78 int r;
69 secno s; 79 secno s;
80 unsigned n_secs;
70 hpfs_lock(inode->i_sb); 81 hpfs_lock(inode->i_sb);
71 s = hpfs_bmap(inode, iblock); 82 s = hpfs_bmap(inode, iblock, &n_secs);
72 if (s) { 83 if (s) {
84 if (bh_result->b_size >> 9 < n_secs)
85 n_secs = bh_result->b_size >> 9;
73 map_bh(bh_result, inode->i_sb, s); 86 map_bh(bh_result, inode->i_sb, s);
87 bh_result->b_size = n_secs << 9;
74 goto ret_0; 88 goto ret_0;
75 } 89 }
76 if (!create) goto ret_0; 90 if (!create) goto ret_0;
@@ -95,14 +109,26 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he
95 return r; 109 return r;
96} 110}
97 111
112static int hpfs_readpage(struct file *file, struct page *page)
113{
114 return mpage_readpage(page, hpfs_get_block);
115}
116
98static int hpfs_writepage(struct page *page, struct writeback_control *wbc) 117static int hpfs_writepage(struct page *page, struct writeback_control *wbc)
99{ 118{
100 return block_write_full_page(page,hpfs_get_block, wbc); 119 return block_write_full_page(page, hpfs_get_block, wbc);
101} 120}
102 121
103static int hpfs_readpage(struct file *file, struct page *page) 122static int hpfs_readpages(struct file *file, struct address_space *mapping,
123 struct list_head *pages, unsigned nr_pages)
124{
125 return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block);
126}
127
128static int hpfs_writepages(struct address_space *mapping,
129 struct writeback_control *wbc)
104{ 130{
105 return block_read_full_page(page,hpfs_get_block); 131 return mpage_writepages(mapping, wbc, hpfs_get_block);
106} 132}
107 133
108static void hpfs_write_failed(struct address_space *mapping, loff_t to) 134static void hpfs_write_failed(struct address_space *mapping, loff_t to)
@@ -161,6 +187,8 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
161const struct address_space_operations hpfs_aops = { 187const struct address_space_operations hpfs_aops = {
162 .readpage = hpfs_readpage, 188 .readpage = hpfs_readpage,
163 .writepage = hpfs_writepage, 189 .writepage = hpfs_writepage,
190 .readpages = hpfs_readpages,
191 .writepages = hpfs_writepages,
164 .write_begin = hpfs_write_begin, 192 .write_begin = hpfs_write_begin,
165 .write_end = hpfs_write_end, 193 .write_end = hpfs_write_end,
166 .bmap = _hpfs_bmap 194 .bmap = _hpfs_bmap
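
The hpfs_bmap()/hpfs_get_block() hunks above carry the substance of this change: hpfs_bmap() now reports through *n_secs how many sectors are known to be contiguous, and hpfs_get_block() clamps that count to the size the caller asked for before widening bh_result->b_size, which is what lets the mpage readpages/writepages paths wired up here build multi-sector BIOs instead of one request per block. A minimal sketch of the same pattern outside hpfs (example_get_block and example_extent_lookup are made-up names, not kernel API):

    #include <linux/fs.h>
    #include <linux/buffer_head.h>

    /* Hypothetical helper: physical sector backing @iblock, length of the
     * contiguous run in *n_secs; returns 0 for a hole. */
    static sector_t example_extent_lookup(struct inode *inode, sector_t iblock,
                                          unsigned *n_secs);

    static int example_get_block(struct inode *inode, sector_t iblock,
                                 struct buffer_head *bh_result, int create)
    {
            unsigned n_secs;
            sector_t phys = example_extent_lookup(inode, iblock, &n_secs);

            if (!phys)
                    return 0;       /* hole; block allocation elided in this sketch */
            if (n_secs > bh_result->b_size >> 9)
                    n_secs = bh_result->b_size >> 9;  /* never map past the request */
            map_bh(bh_result, inode->i_sb, phys);
            bh_result->b_size = (size_t)n_secs << 9;  /* advertise the whole run */
            return 0;
    }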
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index b7ae286646b5..1b398636e990 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -27,8 +27,9 @@
27#define ALLOC_FWD_MAX 128 27#define ALLOC_FWD_MAX 128
28#define ALLOC_M 1 28#define ALLOC_M 1
29#define FNODE_RD_AHEAD 16 29#define FNODE_RD_AHEAD 16
30#define ANODE_RD_AHEAD 16 30#define ANODE_RD_AHEAD 0
31#define DNODE_RD_AHEAD 4 31#define DNODE_RD_AHEAD 72
32#define COUNT_RD_AHEAD 62
32 33
33#define FREE_DNODES_ADD 58 34#define FREE_DNODES_ADD 58
34#define FREE_DNODES_DEL 29 35#define FREE_DNODES_DEL 29
@@ -207,6 +208,7 @@ void hpfs_remove_fnode(struct super_block *, fnode_secno fno);
207 208
208/* buffer.c */ 209/* buffer.c */
209 210
211void hpfs_prefetch_sectors(struct super_block *, unsigned, int);
210void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int); 212void *hpfs_map_sector(struct super_block *, unsigned, struct buffer_head **, int);
211void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **); 213void *hpfs_get_sector(struct super_block *, unsigned, struct buffer_head **);
212void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int); 214void *hpfs_map_4sectors(struct super_block *, unsigned, struct quad_buffer_head *, int);
@@ -271,6 +273,7 @@ void hpfs_evict_inode(struct inode *);
271 273
272__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *); 274__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
273__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *); 275__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
276void hpfs_prefetch_bitmap(struct super_block *, unsigned);
274unsigned char *hpfs_load_code_page(struct super_block *, secno); 277unsigned char *hpfs_load_code_page(struct super_block *, secno);
275__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp); 278__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
276struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **); 279struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index 4acb19d78359..3aa66ae1031e 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -17,7 +17,9 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
17 struct quad_buffer_head *qbh, char *id) 17 struct quad_buffer_head *qbh, char *id)
18{ 18{
19 secno sec; 19 secno sec;
20 if (hpfs_sb(s)->sb_chk) if (bmp_block * 16384 > hpfs_sb(s)->sb_fs_size) { 20 __le32 *ret;
21 unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
22 if (hpfs_sb(s)->sb_chk) if (bmp_block >= n_bands) {
21 hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id); 23 hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id);
22 return NULL; 24 return NULL;
23 } 25 }
@@ -26,7 +28,23 @@ __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
26 hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id); 28 hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id);
27 return NULL; 29 return NULL;
28 } 30 }
29 return hpfs_map_4sectors(s, sec, qbh, 4); 31 ret = hpfs_map_4sectors(s, sec, qbh, 4);
32 if (ret) hpfs_prefetch_bitmap(s, bmp_block + 1);
33 return ret;
34}
35
36void hpfs_prefetch_bitmap(struct super_block *s, unsigned bmp_block)
37{
38 unsigned to_prefetch, next_prefetch;
39 unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
40 if (unlikely(bmp_block >= n_bands))
41 return;
42 to_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]);
43 if (unlikely(bmp_block + 1 >= n_bands))
44 next_prefetch = 0;
45 else
46 next_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block + 1]);
47 hpfs_prefetch_sectors(s, to_prefetch, 4 + 4 * (to_prefetch + 4 == next_prefetch));
30} 48}
31 49
32/* 50/*
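
The arithmetic in hpfs_prefetch_bitmap() is worth spelling out: one HPFS allocation bitmap is 4 sectors of 512 bytes, i.e. 2048 bytes or 16384 bits, one bit per sector, so each bitmap governs a 16384-sector band, and (sb_fs_size + 0x3fff) >> 14 is a round-up division of the volume size by 16384. The prefetch then doubles from 4 to 8 sectors when the next band's bitmap happens to sit directly behind the current one on disk (to_prefetch + 4 == next_prefetch). A sketch of the band computation (example_n_bands is a made-up name):

    /* Round-up division by 16384, the sectors covered per bitmap band. */
    static inline unsigned example_n_bands(unsigned fs_size_in_sectors)
    {
            return (fs_size_in_sectors + 0x3fff) >> 14;
    }
    /* A 1 GiB volume has 2097152 sectors, so example_n_bands() == 128. */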
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a0617e706957..4334cda8dba1 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -121,7 +121,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
121 unsigned long *bits; 121 unsigned long *bits;
122 unsigned count; 122 unsigned count;
123 123
124 bits = hpfs_map_4sectors(s, secno, &qbh, 4); 124 bits = hpfs_map_4sectors(s, secno, &qbh, 0);
125 if (!bits) 125 if (!bits)
126 return 0; 126 return 0;
127 count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); 127 count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
@@ -134,8 +134,13 @@ static unsigned count_bitmaps(struct super_block *s)
134 unsigned n, count, n_bands; 134 unsigned n, count, n_bands;
135 n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; 135 n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14;
136 count = 0; 136 count = 0;
137 for (n = 0; n < n_bands; n++) 137 for (n = 0; n < COUNT_RD_AHEAD; n++) {
138 hpfs_prefetch_bitmap(s, n);
139 }
140 for (n = 0; n < n_bands; n++) {
141 hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD);
138 count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); 142 count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
143 }
139 return count; 144 return count;
140} 145}
141 146
@@ -558,7 +563,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
558 sbi->sb_cp_table = NULL; 563 sbi->sb_cp_table = NULL;
559 sbi->sb_c_bitmap = -1; 564 sbi->sb_c_bitmap = -1;
560 sbi->sb_max_fwd_alloc = 0xffffff; 565 sbi->sb_max_fwd_alloc = 0xffffff;
561 566
567 if (sbi->sb_fs_size >= 0x80000000) {
568 hpfs_error(s, "invalid size in superblock: %08x",
569 (unsigned)sbi->sb_fs_size);
570 goto bail4;
571 }
572
562 /* Load bitmap directory */ 573 /* Load bitmap directory */
563 if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps)))) 574 if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, le32_to_cpu(superblock->bitmaps))))
564 goto bail4; 575 goto bail4;
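
The new size check in hpfs_fill_super() and the reworked guard in hpfs_map_bitmap() fit together: with sb_fs_size capped below 0x80000000, n_bands never exceeds 0x20000, and comparing band indices (bmp_block >= n_bands) sidesteps the 32-bit multiply in the old test, which could wrap; that is presumably why the index comparison replaced it. An illustrative case of the wrap the old guard allowed:

    /* Old guard: bmp_block * 16384 > sb_fs_size, all 32-bit unsigned.
     * With bmp_block = 0x40001 the product is 0x100004000, which wraps
     * to 0x4000; for sb_fs_size = 0x5000 the comparison 0x4000 > 0x5000
     * is false, so a wildly out-of-range bmp_block slipped through.
     * bmp_block >= n_bands involves no multiply and cannot wrap. */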
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index cd3e38972c86..4338ff32959d 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -69,7 +69,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
69 struct dentry *parent; 69 struct dentry *parent;
70 char *root, *name; 70 char *root, *name;
71 const char *seg_name; 71 const char *seg_name;
72 int len, seg_len; 72 int len, seg_len, root_len;
73 73
74 len = 0; 74 len = 0;
75 parent = dentry; 75 parent = dentry;
@@ -81,7 +81,8 @@ static char *dentry_name(struct dentry *dentry, int extra)
81 } 81 }
82 82
83 root = "proc"; 83 root = "proc";
84 len += strlen(root); 84 root_len = strlen(root);
85 len += root_len;
85 name = kmalloc(len + extra + 1, GFP_KERNEL); 86 name = kmalloc(len + extra + 1, GFP_KERNEL);
86 if (name == NULL) 87 if (name == NULL)
87 return NULL; 88 return NULL;
@@ -91,7 +92,7 @@ static char *dentry_name(struct dentry *dentry, int extra)
91 while (parent->d_parent != parent) { 92 while (parent->d_parent != parent) {
92 if (is_pid(parent)) { 93 if (is_pid(parent)) {
93 seg_name = "pid"; 94 seg_name = "pid";
94 seg_len = strlen("pid"); 95 seg_len = strlen(seg_name);
95 } 96 }
96 else { 97 else {
97 seg_name = parent->d_name.name; 98 seg_name = parent->d_name.name;
@@ -100,10 +101,10 @@ static char *dentry_name(struct dentry *dentry, int extra)
100 101
101 len -= seg_len + 1; 102 len -= seg_len + 1;
102 name[len] = '/'; 103 name[len] = '/';
103 strncpy(&name[len + 1], seg_name, seg_len); 104 memcpy(&name[len + 1], seg_name, seg_len);
104 parent = parent->d_parent; 105 parent = parent->d_parent;
105 } 106 }
106 strncpy(name, root, strlen(root)); 107 memcpy(name, root, root_len);
107 return name; 108 return name;
108} 109}
109 110
@@ -542,8 +543,8 @@ static const struct file_operations hppfs_file_fops = {
542}; 543};
543 544
544struct hppfs_dirent { 545struct hppfs_dirent {
545 void *vfs_dirent; 546 struct dir_context ctx;
546 filldir_t filldir; 547 struct dir_context *caller;
547 struct dentry *dentry; 548 struct dentry *dentry;
548}; 549};
549 550
@@ -555,34 +556,29 @@ static int hppfs_filldir(void *d, const char *name, int size,
555 if (file_removed(dirent->dentry, name)) 556 if (file_removed(dirent->dentry, name))
556 return 0; 557 return 0;
557 558
558 return (*dirent->filldir)(dirent->vfs_dirent, name, size, offset, 559 dirent->caller->pos = dirent->ctx.pos;
559 inode, type); 560 return !dir_emit(dirent->caller, name, size, inode, type);
560} 561}
561 562
562static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) 563static int hppfs_readdir(struct file *file, struct dir_context *ctx)
563{ 564{
564 struct hppfs_private *data = file->private_data; 565 struct hppfs_private *data = file->private_data;
565 struct file *proc_file = data->proc_file; 566 struct file *proc_file = data->proc_file;
566 int (*readdir)(struct file *, void *, filldir_t); 567 struct hppfs_dirent d = {
567 struct hppfs_dirent dirent = ((struct hppfs_dirent) 568 .ctx.actor = hppfs_filldir,
568 { .vfs_dirent = ent, 569 .caller = ctx,
569 .filldir = filldir, 570 .dentry = file->f_path.dentry
570 .dentry = file->f_path.dentry 571 };
571 });
572 int err; 572 int err;
573 573 proc_file->f_pos = ctx->pos;
574 readdir = file_inode(proc_file)->i_fop->readdir; 574 err = iterate_dir(proc_file, &d.ctx);
575 575 ctx->pos = d.ctx.pos;
576 proc_file->f_pos = file->f_pos;
577 err = (*readdir)(proc_file, &dirent, hppfs_filldir);
578 file->f_pos = proc_file->f_pos;
579
580 return err; 576 return err;
581} 577}
582 578
583static const struct file_operations hppfs_dir_fops = { 579static const struct file_operations hppfs_dir_fops = {
584 .owner = NULL, 580 .owner = NULL,
585 .readdir = hppfs_readdir, 581 .iterate = hppfs_readdir,
586 .open = hppfs_dir_open, 582 .open = hppfs_dir_open,
587 .llseek = default_llseek, 583 .llseek = default_llseek,
588 .release = hppfs_release, 584 .release = hppfs_release,
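
The hppfs conversion shows the standard way to stack one directory iterator on another under the new dir_context API: embed a struct dir_context whose actor filters entries, keep a pointer to the caller's context, and forward surviving entries with dir_emit(); a nonzero return from the actor stops iterate_dir(). A generic sketch of the same shape (the filter_* names are hypothetical; the actor signature matches the filldir_t of this kernel):

    #include <linux/fs.h>

    struct filter_dirent {
            struct dir_context ctx;   /* must come first: iterate_dir() gets &ctx */
            struct dir_context *caller;
    };

    static int filter_actor(void *d, const char *name, int len,
                            loff_t offset, u64 ino, unsigned int type)
    {
            struct filter_dirent *f = d;

            if (len > 2 && name[0] == '.')
                    return 0;                 /* crude filter: skip, keep iterating */
            f->caller->pos = f->ctx.pos;      /* keep the outer position in sync */
            return !dir_emit(f->caller, name, len, ino, type);
    }

    static int filter_readdir(struct file *backing, struct dir_context *ctx)
    {
            struct filter_dirent f = {
                    .ctx.actor = filter_actor,
                    .caller = ctx,
            };
            int err;

            backing->f_pos = ctx->pos;
            err = iterate_dir(backing, &f.ctx);
            ctx->pos = f.ctx.pos;
            return err;
    }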
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a3f868ae3fd4..34423978b170 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -463,6 +463,14 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
463 return inode; 463 return inode;
464} 464}
465 465
466/*
467 * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never
468 * be taken from reclaim -- unlike regular filesystems. This needs an
469 * annotation because huge_pmd_share() does an allocation under
470 * i_mmap_mutex.
471 */
472struct lock_class_key hugetlbfs_i_mmap_mutex_key;
473
466static struct inode *hugetlbfs_get_inode(struct super_block *sb, 474static struct inode *hugetlbfs_get_inode(struct super_block *sb,
467 struct inode *dir, 475 struct inode *dir,
468 umode_t mode, dev_t dev) 476 umode_t mode, dev_t dev)
@@ -474,6 +482,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
474 struct hugetlbfs_inode_info *info; 482 struct hugetlbfs_inode_info *info;
475 inode->i_ino = get_next_ino(); 483 inode->i_ino = get_next_ino();
476 inode_init_owner(inode, dir, mode); 484 inode_init_owner(inode, dir, mode);
485 lockdep_set_class(&inode->i_mapping->i_mmap_mutex,
486 &hugetlbfs_i_mmap_mutex_key);
477 inode->i_mapping->a_ops = &hugetlbfs_aops; 487 inode->i_mapping->a_ops = &hugetlbfs_aops;
478 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 488 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
479 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 489 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
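
The hugetlbfs hunk is a pure lockdep annotation: huge_pmd_share() allocates while holding i_mmap_mutex, which for a regular filesystem would look like a reclaim deadlock, since reclaim can take that mutex. Hugetlbfs pages are never reclaimed, so the pattern is safe, but only if its i_mmap_mutex instances live in their own lock class, separate from the default one assigned at inode setup. Reduced to its essentials (examplefs_* names are hypothetical):

    #include <linux/lockdep.h>

    /* One key per filesystem type puts all of its i_mmap_mutex instances
     * in a dedicated lockdep class. */
    static struct lock_class_key examplefs_i_mmap_mutex_key;

    static void examplefs_mark_mapping(struct inode *inode)
    {
            lockdep_set_class(&inode->i_mapping->i_mmap_mutex,
                              &examplefs_i_mmap_mutex_key);
    }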
diff --git a/fs/inode.c b/fs/inode.c
index 00d5fc3b86e1..d6dfb09c8280 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -333,8 +333,10 @@ EXPORT_SYMBOL(set_nlink);
333 */ 333 */
334void inc_nlink(struct inode *inode) 334void inc_nlink(struct inode *inode)
335{ 335{
336 if (WARN_ON(inode->i_nlink == 0)) 336 if (unlikely(inode->i_nlink == 0)) {
337 WARN_ON(!(inode->i_state & I_LINKABLE));
337 atomic_long_dec(&inode->i_sb->s_remove_count); 338 atomic_long_dec(&inode->i_sb->s_remove_count);
339 }
338 340
339 inode->__i_nlink++; 341 inode->__i_nlink++;
340} 342}
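
The inc_nlink() change is the VFS side of O_TMPFILE from this merge window: an inode created without a name legitimately sits at i_nlink == 0 and linkat() may later raise it to 1, so the old unconditional WARN_ON is narrowed to inodes that lack the I_LINKABLE flag. From userspace the legal 0 -> 1 transition looks roughly like this (illustrative, error handling trimmed):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char path[64];
            /* Unnamed file: the inode exists with i_nlink == 0 and is
             * marked I_LINKABLE by the kernel at open time. */
            int fd = open("/tmp", O_TMPFILE | O_WRONLY, 0600);
            if (fd < 0)
                    return 1;
            snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
            /* Give it a name: i_nlink goes 0 -> 1 without tripping the WARN. */
            if (linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/now-visible",
                       AT_SYMLINK_FOLLOW) < 0)
                    perror("linkat");
            return close(fd);
    }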
diff --git a/fs/internal.h b/fs/internal.h
index 68121584ae37..7c5f01cf619d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -96,11 +96,12 @@ struct open_flags {
96 umode_t mode; 96 umode_t mode;
97 int acc_mode; 97 int acc_mode;
98 int intent; 98 int intent;
99 int lookup_flags;
99}; 100};
100extern struct file *do_filp_open(int dfd, struct filename *pathname, 101extern struct file *do_filp_open(int dfd, struct filename *pathname,
101 const struct open_flags *op, int flags); 102 const struct open_flags *op);
102extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, 103extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
103 const char *, const struct open_flags *, int lookup_flags); 104 const char *, const struct open_flags *);
104 105
105extern long do_handle_open(int mountdirfd, 106extern long do_handle_open(int mountdirfd,
106 struct file_handle __user *ufh, int open_flag); 107 struct file_handle __user *ufh, int open_flag);
@@ -130,6 +131,7 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
130 * read_write.c 131 * read_write.c
131 */ 132 */
132extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); 133extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
134extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
133 135
134/* 136/*
135 * splice.c 137 * splice.c
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index a7d5c3c3d4e6..b943cbd963bb 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -78,8 +78,8 @@ int get_acorn_filename(struct iso_directory_record *de,
78/* 78/*
79 * This should _really_ be cleaned up some day.. 79 * This should _really_ be cleaned up some day..
80 */ 80 */
81static int do_isofs_readdir(struct inode *inode, struct file *filp, 81static int do_isofs_readdir(struct inode *inode, struct file *file,
82 void *dirent, filldir_t filldir, 82 struct dir_context *ctx,
83 char *tmpname, struct iso_directory_record *tmpde) 83 char *tmpname, struct iso_directory_record *tmpde)
84{ 84{
85 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); 85 unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
@@ -94,10 +94,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
94 struct iso_directory_record *de; 94 struct iso_directory_record *de;
95 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); 95 struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
96 96
97 offset = filp->f_pos & (bufsize - 1); 97 offset = ctx->pos & (bufsize - 1);
98 block = filp->f_pos >> bufbits; 98 block = ctx->pos >> bufbits;
99 99
100 while (filp->f_pos < inode->i_size) { 100 while (ctx->pos < inode->i_size) {
101 int de_len; 101 int de_len;
102 102
103 if (!bh) { 103 if (!bh) {
@@ -108,7 +108,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
108 108
109 de = (struct iso_directory_record *) (bh->b_data + offset); 109 de = (struct iso_directory_record *) (bh->b_data + offset);
110 110
111 de_len = *(unsigned char *) de; 111 de_len = *(unsigned char *)de;
112 112
113 /* 113 /*
114 * If the length byte is zero, we should move on to the next 114 * If the length byte is zero, we should move on to the next
@@ -119,8 +119,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
119 if (de_len == 0) { 119 if (de_len == 0) {
120 brelse(bh); 120 brelse(bh);
121 bh = NULL; 121 bh = NULL;
122 filp->f_pos = (filp->f_pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); 122 ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1);
123 block = filp->f_pos >> bufbits; 123 block = ctx->pos >> bufbits;
124 offset = 0; 124 offset = 0;
125 continue; 125 continue;
126 } 126 }
@@ -164,16 +164,16 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
164 164
165 if (de->flags[-sbi->s_high_sierra] & 0x80) { 165 if (de->flags[-sbi->s_high_sierra] & 0x80) {
166 first_de = 0; 166 first_de = 0;
167 filp->f_pos += de_len; 167 ctx->pos += de_len;
168 continue; 168 continue;
169 } 169 }
170 first_de = 1; 170 first_de = 1;
171 171
172 /* Handle the case of the '.' directory */ 172 /* Handle the case of the '.' directory */
173 if (de->name_len[0] == 1 && de->name[0] == 0) { 173 if (de->name_len[0] == 1 && de->name[0] == 0) {
174 if (filldir(dirent, ".", 1, filp->f_pos, inode->i_ino, DT_DIR) < 0) 174 if (!dir_emit_dot(file, ctx))
175 break; 175 break;
176 filp->f_pos += de_len; 176 ctx->pos += de_len;
177 continue; 177 continue;
178 } 178 }
179 179
@@ -181,10 +181,9 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
181 181
182 /* Handle the case of the '..' directory */ 182 /* Handle the case of the '..' directory */
183 if (de->name_len[0] == 1 && de->name[0] == 1) { 183 if (de->name_len[0] == 1 && de->name[0] == 1) {
184 inode_number = parent_ino(filp->f_path.dentry); 184 if (!dir_emit_dotdot(file, ctx))
185 if (filldir(dirent, "..", 2, filp->f_pos, inode_number, DT_DIR) < 0)
186 break; 185 break;
187 filp->f_pos += de_len; 186 ctx->pos += de_len;
188 continue; 187 continue;
189 } 188 }
190 189
@@ -198,7 +197,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
198 if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) || 197 if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) ||
199 (!sbi->s_showassoc && 198 (!sbi->s_showassoc &&
200 (de->flags[-sbi->s_high_sierra] & 4))) { 199 (de->flags[-sbi->s_high_sierra] & 4))) {
201 filp->f_pos += de_len; 200 ctx->pos += de_len;
202 continue; 201 continue;
203 } 202 }
204 203
@@ -230,10 +229,10 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
230 } 229 }
231 } 230 }
232 if (len > 0) { 231 if (len > 0) {
233 if (filldir(dirent, p, len, filp->f_pos, inode_number, DT_UNKNOWN) < 0) 232 if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN))
234 break; 233 break;
235 } 234 }
236 filp->f_pos += de_len; 235 ctx->pos += de_len;
237 236
238 continue; 237 continue;
239 } 238 }
@@ -247,13 +246,12 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp,
247 * handling split directory entries.. The real work is done by 246 * handling split directory entries.. The real work is done by
248 * "do_isofs_readdir()". 247 * "do_isofs_readdir()".
249 */ 248 */
250static int isofs_readdir(struct file *filp, 249static int isofs_readdir(struct file *file, struct dir_context *ctx)
251 void *dirent, filldir_t filldir)
252{ 250{
253 int result; 251 int result;
254 char *tmpname; 252 char *tmpname;
255 struct iso_directory_record *tmpde; 253 struct iso_directory_record *tmpde;
256 struct inode *inode = file_inode(filp); 254 struct inode *inode = file_inode(file);
257 255
258 tmpname = (char *)__get_free_page(GFP_KERNEL); 256 tmpname = (char *)__get_free_page(GFP_KERNEL);
259 if (tmpname == NULL) 257 if (tmpname == NULL)
@@ -261,7 +259,7 @@ static int isofs_readdir(struct file *filp,
261 259
262 tmpde = (struct iso_directory_record *) (tmpname+1024); 260 tmpde = (struct iso_directory_record *) (tmpname+1024);
263 261
264 result = do_isofs_readdir(inode, filp, dirent, filldir, tmpname, tmpde); 262 result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde);
265 263
266 free_page((unsigned long) tmpname); 264 free_page((unsigned long) tmpname);
267 return result; 265 return result;
@@ -271,7 +269,7 @@ const struct file_operations isofs_dir_operations =
271{ 269{
272 .llseek = generic_file_llseek, 270 .llseek = generic_file_llseek,
273 .read = generic_read_dir, 271 .read = generic_read_dir,
274 .readdir = isofs_readdir, 272 .iterate = isofs_readdir,
275}; 273};
276 274
277/* 275/*
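
Note how the isofs conversion drops its parent_ino() call: dir_emit_dot() and dir_emit_dotdot() synthesize "." and ".." from the file's own dentry, so filesystems no longer hand-roll those two entries. A typical iterate() prologue using the helpers (examplefs_iterate is a made-up name):

    #include <linux/fs.h>

    static int examplefs_iterate(struct file *file, struct dir_context *ctx)
    {
            /* Emits "." and ".." as needed, advancing ctx->pos to 2;
             * returns false if the caller's buffer filled up. */
            if (!dir_emit_dots(file, ctx))
                    return 0;
            /* real entries follow, starting at ctx->pos == 2 */
            return 0;
    }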
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d9b8aebdeb22..c348d6d88624 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -28,31 +28,23 @@
28 28
29#define BEQUIET 29#define BEQUIET
30 30
31static int isofs_hashi(const struct dentry *parent, const struct inode *inode, 31static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
32 struct qstr *qstr); 32static int isofs_hash(const struct dentry *parent, struct qstr *qstr);
33static int isofs_hash(const struct dentry *parent, const struct inode *inode,
34 struct qstr *qstr);
35static int isofs_dentry_cmpi(const struct dentry *parent, 33static int isofs_dentry_cmpi(const struct dentry *parent,
36 const struct inode *pinode, 34 const struct dentry *dentry,
37 const struct dentry *dentry, const struct inode *inode,
38 unsigned int len, const char *str, const struct qstr *name); 35 unsigned int len, const char *str, const struct qstr *name);
39static int isofs_dentry_cmp(const struct dentry *parent, 36static int isofs_dentry_cmp(const struct dentry *parent,
40 const struct inode *pinode, 37 const struct dentry *dentry,
41 const struct dentry *dentry, const struct inode *inode,
42 unsigned int len, const char *str, const struct qstr *name); 38 unsigned int len, const char *str, const struct qstr *name);
43 39
44#ifdef CONFIG_JOLIET 40#ifdef CONFIG_JOLIET
45static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode, 41static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
46 struct qstr *qstr); 42static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr);
47static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
48 struct qstr *qstr);
49static int isofs_dentry_cmpi_ms(const struct dentry *parent, 43static int isofs_dentry_cmpi_ms(const struct dentry *parent,
50 const struct inode *pinode, 44 const struct dentry *dentry,
51 const struct dentry *dentry, const struct inode *inode,
52 unsigned int len, const char *str, const struct qstr *name); 45 unsigned int len, const char *str, const struct qstr *name);
53static int isofs_dentry_cmp_ms(const struct dentry *parent, 46static int isofs_dentry_cmp_ms(const struct dentry *parent,
54 const struct inode *pinode, 47 const struct dentry *dentry,
55 const struct dentry *dentry, const struct inode *inode,
56 unsigned int len, const char *str, const struct qstr *name); 48 unsigned int len, const char *str, const struct qstr *name);
57#endif 49#endif
58 50
@@ -265,30 +257,26 @@ static int isofs_dentry_cmp_common(
265} 257}
266 258
267static int 259static int
268isofs_hash(const struct dentry *dentry, const struct inode *inode, 260isofs_hash(const struct dentry *dentry, struct qstr *qstr)
269 struct qstr *qstr)
270{ 261{
271 return isofs_hash_common(dentry, qstr, 0); 262 return isofs_hash_common(dentry, qstr, 0);
272} 263}
273 264
274static int 265static int
275isofs_hashi(const struct dentry *dentry, const struct inode *inode, 266isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
276 struct qstr *qstr)
277{ 267{
278 return isofs_hashi_common(dentry, qstr, 0); 268 return isofs_hashi_common(dentry, qstr, 0);
279} 269}
280 270
281static int 271static int
282isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode, 272isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry,
283 const struct dentry *dentry, const struct inode *inode,
284 unsigned int len, const char *str, const struct qstr *name) 273 unsigned int len, const char *str, const struct qstr *name)
285{ 274{
286 return isofs_dentry_cmp_common(len, str, name, 0, 0); 275 return isofs_dentry_cmp_common(len, str, name, 0, 0);
287} 276}
288 277
289static int 278static int
290isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode, 279isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
291 const struct dentry *dentry, const struct inode *inode,
292 unsigned int len, const char *str, const struct qstr *name) 280 unsigned int len, const char *str, const struct qstr *name)
293{ 281{
294 return isofs_dentry_cmp_common(len, str, name, 0, 1); 282 return isofs_dentry_cmp_common(len, str, name, 0, 1);
@@ -296,30 +284,26 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
296 284
297#ifdef CONFIG_JOLIET 285#ifdef CONFIG_JOLIET
298static int 286static int
299isofs_hash_ms(const struct dentry *dentry, const struct inode *inode, 287isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
300 struct qstr *qstr)
301{ 288{
302 return isofs_hash_common(dentry, qstr, 1); 289 return isofs_hash_common(dentry, qstr, 1);
303} 290}
304 291
305static int 292static int
306isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode, 293isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
307 struct qstr *qstr)
308{ 294{
309 return isofs_hashi_common(dentry, qstr, 1); 295 return isofs_hashi_common(dentry, qstr, 1);
310} 296}
311 297
312static int 298static int
313isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode, 299isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry,
314 const struct dentry *dentry, const struct inode *inode,
315 unsigned int len, const char *str, const struct qstr *name) 300 unsigned int len, const char *str, const struct qstr *name)
316{ 301{
317 return isofs_dentry_cmp_common(len, str, name, 1, 0); 302 return isofs_dentry_cmp_common(len, str, name, 1, 0);
318} 303}
319 304
320static int 305static int
321isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode, 306isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry,
322 const struct dentry *dentry, const struct inode *inode,
323 unsigned int len, const char *str, const struct qstr *name) 307 unsigned int len, const char *str, const struct qstr *name)
324{ 308{
325 return isofs_dentry_cmp_common(len, str, name, 1, 1); 309 return isofs_dentry_cmp_common(len, str, name, 1, 1);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index c167028844ed..95295640d9c8 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -37,8 +37,7 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
37 37
38 qstr.name = compare; 38 qstr.name = compare;
39 qstr.len = dlen; 39 qstr.len = dlen;
40 return dentry->d_op->d_compare(NULL, NULL, NULL, NULL, 40 return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
41 dentry->d_name.len, dentry->d_name.name, &qstr);
42} 41}
43 42
44/* 43/*
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e3e255c0a509..be0c39b66fe0 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -2019,16 +2019,20 @@ zap_buffer_unlocked:
2019 * void journal_invalidatepage() - invalidate a journal page 2019 * void journal_invalidatepage() - invalidate a journal page
2020 * @journal: journal to use for flush 2020 * @journal: journal to use for flush
2021 * @page: page to flush 2021 * @page: page to flush
2022 * @offset: length of page to invalidate. 2022 * @offset: offset of the range to invalidate
2023 * @length: length of the range to invalidate
2023 * 2024 *
2024 * Reap page buffers containing data after offset in page. 2025 * Reap page buffers containing data in specified range in page.
2025 */ 2026 */
2026void journal_invalidatepage(journal_t *journal, 2027void journal_invalidatepage(journal_t *journal,
2027 struct page *page, 2028 struct page *page,
2028 unsigned long offset) 2029 unsigned int offset,
2030 unsigned int length)
2029{ 2031{
2030 struct buffer_head *head, *bh, *next; 2032 struct buffer_head *head, *bh, *next;
2033 unsigned int stop = offset + length;
2031 unsigned int curr_off = 0; 2034 unsigned int curr_off = 0;
2035 int partial_page = (offset || length < PAGE_CACHE_SIZE);
2032 int may_free = 1; 2036 int may_free = 1;
2033 2037
2034 if (!PageLocked(page)) 2038 if (!PageLocked(page))
@@ -2036,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal,
2036 if (!page_has_buffers(page)) 2040 if (!page_has_buffers(page))
2037 return; 2041 return;
2038 2042
2043 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
2044
2039 /* We will potentially be playing with lists other than just the 2045 /* We will potentially be playing with lists other than just the
2040 * data lists (especially for journaled data mode), so be 2046 * data lists (especially for journaled data mode), so be
2041 * cautious in our locking. */ 2047 * cautious in our locking. */
@@ -2045,11 +2051,14 @@ void journal_invalidatepage(journal_t *journal,
2045 unsigned int next_off = curr_off + bh->b_size; 2051 unsigned int next_off = curr_off + bh->b_size;
2046 next = bh->b_this_page; 2052 next = bh->b_this_page;
2047 2053
2054 if (next_off > stop)
2055 return;
2056
2048 if (offset <= curr_off) { 2057 if (offset <= curr_off) {
2049 /* This block is wholly outside the truncation point */ 2058 /* This block is wholly outside the truncation point */
2050 lock_buffer(bh); 2059 lock_buffer(bh);
2051 may_free &= journal_unmap_buffer(journal, bh, 2060 may_free &= journal_unmap_buffer(journal, bh,
2052 offset > 0); 2061 partial_page);
2053 unlock_buffer(bh); 2062 unlock_buffer(bh);
2054 } 2063 }
2055 curr_off = next_off; 2064 curr_off = next_off;
@@ -2057,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal,
2057 2066
2058 } while (bh != head); 2067 } while (bh != head);
2059 2068
2060 if (!offset) { 2069 if (!partial_page) {
2061 if (may_free && try_to_free_buffers(page)) 2070 if (may_free && try_to_free_buffers(page))
2062 J_ASSERT(!page_has_buffers(page)); 2071 J_ASSERT(!page_has_buffers(page));
2063 } 2072 }
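
The jbd change teaches journal_invalidatepage() about partial ranges: stop = offset + length bounds the scan, any buffer ending beyond stop is left untouched, and partial_page (true whenever the range is not the whole page) both softens journal_unmap_buffer() and suppresses the final try_to_free_buffers(). Walking the loop through one concrete case makes the bounds clear:

    /* 4096-byte page, 1024-byte buffers, invalidating offset=1024,
     * length=2048, so stop == 3072 and partial_page is true:
     *   buffer @0    : next_off 1024 <= stop, offset > curr_off -> kept
     *   buffer @1024 : offset <= curr_off                       -> unmapped
     *   buffer @2048 : offset <= curr_off                       -> unmapped
     *   buffer @3072 : next_off 4096 > stop                     -> early return
     * partial_page is true, so try_to_free_buffers() is not attempted. */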
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 69a48c2944da..5a9f5534d57b 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -20,7 +20,7 @@ config JBD2
20 20
21config JBD2_DEBUG 21config JBD2_DEBUG
22 bool "JBD2 (ext4) debugging support" 22 bool "JBD2 (ext4) debugging support"
23 depends on JBD2 && DEBUG_FS 23 depends on JBD2
24 help 24 help
25 If you are using the ext4 journaled file system (or 25 If you are using the ext4 journaled file system (or
26 potentially any other filesystem/device using JBD2), this option 26 potentially any other filesystem/device using JBD2), this option
@@ -29,7 +29,7 @@ config JBD2_DEBUG
29 By default, the debugging output will be turned off. 29 By default, the debugging output will be turned off.
30 30
31 If you select Y here, then you will be able to turn on debugging 31 If you select Y here, then you will be able to turn on debugging
32 with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a 32 with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
33 number between 1 and 5. The higher the number, the more debugging 33 number between 1 and 5. The higher the number, the more debugging
34 output is generated. To turn debugging off again, do 34 output is generated. To turn debugging off again, do
35 "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug". 35 "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index c78841ee81cf..7f34f4716165 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -120,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
120 int nblocks, space_left; 120 int nblocks, space_left;
121 /* assert_spin_locked(&journal->j_state_lock); */ 121 /* assert_spin_locked(&journal->j_state_lock); */
122 122
123 nblocks = jbd_space_needed(journal); 123 nblocks = jbd2_space_needed(journal);
124 while (__jbd2_log_space_left(journal) < nblocks) { 124 while (jbd2_log_space_left(journal) < nblocks) {
125 if (journal->j_flags & JBD2_ABORT) 125 if (journal->j_flags & JBD2_ABORT)
126 return; 126 return;
127 write_unlock(&journal->j_state_lock); 127 write_unlock(&journal->j_state_lock);
@@ -140,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
140 */ 140 */
141 write_lock(&journal->j_state_lock); 141 write_lock(&journal->j_state_lock);
142 spin_lock(&journal->j_list_lock); 142 spin_lock(&journal->j_list_lock);
143 nblocks = jbd_space_needed(journal); 143 nblocks = jbd2_space_needed(journal);
144 space_left = __jbd2_log_space_left(journal); 144 space_left = jbd2_log_space_left(journal);
145 if (space_left < nblocks) { 145 if (space_left < nblocks) {
146 int chkpt = journal->j_checkpoint_transactions != NULL; 146 int chkpt = journal->j_checkpoint_transactions != NULL;
147 tid_t tid = 0; 147 tid_t tid = 0;
@@ -156,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal)
156 /* We were able to recover space; yay! */ 156 /* We were able to recover space; yay! */
157 ; 157 ;
158 } else if (tid) { 158 } else if (tid) {
159 /*
160 * jbd2_journal_commit_transaction() may want
161 * to take the checkpoint_mutex if JBD2_FLUSHED
162 * is set. So we need to temporarily drop it.
163 */
164 mutex_unlock(&journal->j_checkpoint_mutex);
159 jbd2_log_wait_commit(journal, tid); 165 jbd2_log_wait_commit(journal, tid);
166 write_lock(&journal->j_state_lock);
167 continue;
160 } else { 168 } else {
161 printk(KERN_ERR "%s: needed %d blocks and " 169 printk(KERN_ERR "%s: needed %d blocks and "
162 "only had %d space available\n", 170 "only had %d space available\n",
@@ -625,10 +633,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
625 633
626 __jbd2_journal_drop_transaction(journal, transaction); 634 __jbd2_journal_drop_transaction(journal, transaction);
627 jbd2_journal_free_transaction(transaction); 635 jbd2_journal_free_transaction(transaction);
628
629 /* Just in case anybody was waiting for more transactions to be
630 checkpointed... */
631 wake_up(&journal->j_wait_logspace);
632 ret = 1; 636 ret = 1;
633out: 637out:
634 return ret; 638 return ret;
@@ -690,9 +694,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
690 J_ASSERT(transaction->t_state == T_FINISHED); 694 J_ASSERT(transaction->t_state == T_FINISHED);
691 J_ASSERT(transaction->t_buffers == NULL); 695 J_ASSERT(transaction->t_buffers == NULL);
692 J_ASSERT(transaction->t_forget == NULL); 696 J_ASSERT(transaction->t_forget == NULL);
693 J_ASSERT(transaction->t_iobuf_list == NULL);
694 J_ASSERT(transaction->t_shadow_list == NULL); 697 J_ASSERT(transaction->t_shadow_list == NULL);
695 J_ASSERT(transaction->t_log_list == NULL);
696 J_ASSERT(transaction->t_checkpoint_list == NULL); 698 J_ASSERT(transaction->t_checkpoint_list == NULL);
697 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 699 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
698 J_ASSERT(atomic_read(&transaction->t_updates) == 0); 700 J_ASSERT(atomic_read(&transaction->t_updates) == 0);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0f53946f13c1..559bec1a37b4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -30,15 +30,22 @@
30#include <trace/events/jbd2.h> 30#include <trace/events/jbd2.h>
31 31
32/* 32/*
33 * Default IO end handler for temporary BJ_IO buffer_heads. 33 * IO end handler for temporary buffer_heads handling writes to the journal.
34 */ 34 */
35static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) 35static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
36{ 36{
37 struct buffer_head *orig_bh = bh->b_private;
38
37 BUFFER_TRACE(bh, ""); 39 BUFFER_TRACE(bh, "");
38 if (uptodate) 40 if (uptodate)
39 set_buffer_uptodate(bh); 41 set_buffer_uptodate(bh);
40 else 42 else
41 clear_buffer_uptodate(bh); 43 clear_buffer_uptodate(bh);
44 if (orig_bh) {
45 clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
46 smp_mb__after_clear_bit();
47 wake_up_bit(&orig_bh->b_state, BH_Shadow);
48 }
42 unlock_buffer(bh); 49 unlock_buffer(bh);
43} 50}
44 51
@@ -85,8 +92,7 @@ nope:
85 __brelse(bh); 92 __brelse(bh);
86} 93}
87 94
88static void jbd2_commit_block_csum_set(journal_t *j, 95static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
89 struct journal_head *descriptor)
90{ 96{
91 struct commit_header *h; 97 struct commit_header *h;
92 __u32 csum; 98 __u32 csum;
@@ -94,12 +100,11 @@ static void jbd2_commit_block_csum_set(journal_t *j,
94 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 100 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
95 return; 101 return;
96 102
97 h = (struct commit_header *)(jh2bh(descriptor)->b_data); 103 h = (struct commit_header *)(bh->b_data);
98 h->h_chksum_type = 0; 104 h->h_chksum_type = 0;
99 h->h_chksum_size = 0; 105 h->h_chksum_size = 0;
100 h->h_chksum[0] = 0; 106 h->h_chksum[0] = 0;
101 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 107 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
102 j->j_blocksize);
103 h->h_chksum[0] = cpu_to_be32(csum); 108 h->h_chksum[0] = cpu_to_be32(csum);
104} 109}
105 110
@@ -116,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal,
116 struct buffer_head **cbh, 121 struct buffer_head **cbh,
117 __u32 crc32_sum) 122 __u32 crc32_sum)
118{ 123{
119 struct journal_head *descriptor;
120 struct commit_header *tmp; 124 struct commit_header *tmp;
121 struct buffer_head *bh; 125 struct buffer_head *bh;
122 int ret; 126 int ret;
@@ -127,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal,
127 if (is_journal_aborted(journal)) 131 if (is_journal_aborted(journal))
128 return 0; 132 return 0;
129 133
130 descriptor = jbd2_journal_get_descriptor_buffer(journal); 134 bh = jbd2_journal_get_descriptor_buffer(journal);
131 if (!descriptor) 135 if (!bh)
132 return 1; 136 return 1;
133 137
134 bh = jh2bh(descriptor);
135
136 tmp = (struct commit_header *)bh->b_data; 138 tmp = (struct commit_header *)bh->b_data;
137 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 139 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
138 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); 140 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
@@ -146,9 +148,9 @@ static int journal_submit_commit_record(journal_t *journal,
146 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; 148 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
147 tmp->h_chksum[0] = cpu_to_be32(crc32_sum); 149 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
148 } 150 }
149 jbd2_commit_block_csum_set(journal, descriptor); 151 jbd2_commit_block_csum_set(journal, bh);
150 152
151 JBUFFER_TRACE(descriptor, "submit commit block"); 153 BUFFER_TRACE(bh, "submit commit block");
152 lock_buffer(bh); 154 lock_buffer(bh);
153 clear_buffer_dirty(bh); 155 clear_buffer_dirty(bh);
154 set_buffer_uptodate(bh); 156 set_buffer_uptodate(bh);
@@ -180,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal,
180 if (unlikely(!buffer_uptodate(bh))) 182 if (unlikely(!buffer_uptodate(bh)))
181 ret = -EIO; 183 ret = -EIO;
182 put_bh(bh); /* One for getblk() */ 184 put_bh(bh); /* One for getblk() */
183 jbd2_journal_put_journal_head(bh2jh(bh));
184 185
185 return ret; 186 return ret;
186} 187}
@@ -321,7 +322,7 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
321} 322}
322 323
323static void jbd2_descr_block_csum_set(journal_t *j, 324static void jbd2_descr_block_csum_set(journal_t *j,
324 struct journal_head *descriptor) 325 struct buffer_head *bh)
325{ 326{
326 struct jbd2_journal_block_tail *tail; 327 struct jbd2_journal_block_tail *tail;
327 __u32 csum; 328 __u32 csum;
@@ -329,12 +330,10 @@ static void jbd2_descr_block_csum_set(journal_t *j,
329 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 330 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
330 return; 331 return;
331 332
332 tail = (struct jbd2_journal_block_tail *) 333 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
333 (jh2bh(descriptor)->b_data + j->j_blocksize -
334 sizeof(struct jbd2_journal_block_tail)); 334 sizeof(struct jbd2_journal_block_tail));
335 tail->t_checksum = 0; 335 tail->t_checksum = 0;
336 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 336 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
337 j->j_blocksize);
338 tail->t_checksum = cpu_to_be32(csum); 337 tail->t_checksum = cpu_to_be32(csum);
339} 338}
340 339
@@ -343,20 +342,21 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
343{ 342{
344 struct page *page = bh->b_page; 343 struct page *page = bh->b_page;
345 __u8 *addr; 344 __u8 *addr;
346 __u32 csum; 345 __u32 csum32;
347 346
348 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 347 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
349 return; 348 return;
350 349
351 sequence = cpu_to_be32(sequence); 350 sequence = cpu_to_be32(sequence);
352 addr = kmap_atomic(page); 351 addr = kmap_atomic(page);
353 csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 352 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
354 sizeof(sequence)); 353 sizeof(sequence));
355 csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data), 354 csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
356 bh->b_size); 355 bh->b_size);
357 kunmap_atomic(addr); 356 kunmap_atomic(addr);
358 357
359 tag->t_checksum = cpu_to_be32(csum); 358 /* We only have space to store the lower 16 bits of the crc32c. */
359 tag->t_checksum = cpu_to_be16(csum32);
360} 360}
361/* 361/*
362 * jbd2_journal_commit_transaction 362 * jbd2_journal_commit_transaction
@@ -368,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
368{ 368{
369 struct transaction_stats_s stats; 369 struct transaction_stats_s stats;
370 transaction_t *commit_transaction; 370 transaction_t *commit_transaction;
371 struct journal_head *jh, *new_jh, *descriptor; 371 struct journal_head *jh;
372 struct buffer_head *descriptor;
372 struct buffer_head **wbuf = journal->j_wbuf; 373 struct buffer_head **wbuf = journal->j_wbuf;
373 int bufs; 374 int bufs;
374 int flags; 375 int flags;
@@ -392,6 +393,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
392 tid_t first_tid; 393 tid_t first_tid;
393 int update_tail; 394 int update_tail;
394 int csum_size = 0; 395 int csum_size = 0;
396 LIST_HEAD(io_bufs);
397 LIST_HEAD(log_bufs);
395 398
396 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 399 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
397 csum_size = sizeof(struct jbd2_journal_block_tail); 400 csum_size = sizeof(struct jbd2_journal_block_tail);
@@ -424,13 +427,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
424 J_ASSERT(journal->j_committing_transaction == NULL); 427 J_ASSERT(journal->j_committing_transaction == NULL);
425 428
426 commit_transaction = journal->j_running_transaction; 429 commit_transaction = journal->j_running_transaction;
427 J_ASSERT(commit_transaction->t_state == T_RUNNING);
428 430
429 trace_jbd2_start_commit(journal, commit_transaction); 431 trace_jbd2_start_commit(journal, commit_transaction);
430 jbd_debug(1, "JBD2: starting commit of transaction %d\n", 432 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
431 commit_transaction->t_tid); 433 commit_transaction->t_tid);
432 434
433 write_lock(&journal->j_state_lock); 435 write_lock(&journal->j_state_lock);
436 J_ASSERT(commit_transaction->t_state == T_RUNNING);
434 commit_transaction->t_state = T_LOCKED; 437 commit_transaction->t_state = T_LOCKED;
435 438
436 trace_jbd2_commit_locking(journal, commit_transaction); 439 trace_jbd2_commit_locking(journal, commit_transaction);
@@ -520,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
520 */ 523 */
521 jbd2_journal_switch_revoke_table(journal); 524 jbd2_journal_switch_revoke_table(journal);
522 525
526 /*
527 * Reserved credits cannot be claimed anymore, free them
528 */
529 atomic_sub(atomic_read(&journal->j_reserved_credits),
530 &commit_transaction->t_outstanding_credits);
531
523 trace_jbd2_commit_flushing(journal, commit_transaction); 532 trace_jbd2_commit_flushing(journal, commit_transaction);
524 stats.run.rs_flushing = jiffies; 533 stats.run.rs_flushing = jiffies;
525 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, 534 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -533,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
533 wake_up(&journal->j_wait_transaction_locked); 542 wake_up(&journal->j_wait_transaction_locked);
534 write_unlock(&journal->j_state_lock); 543 write_unlock(&journal->j_state_lock);
535 544
536 jbd_debug(3, "JBD2: commit phase 2\n"); 545 jbd_debug(3, "JBD2: commit phase 2a\n");
537 546
538 /* 547 /*
539 * Now start flushing things to disk, in the order they appear 548 * Now start flushing things to disk, in the order they appear
@@ -545,10 +554,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
545 554
546 blk_start_plug(&plug); 555 blk_start_plug(&plug);
547 jbd2_journal_write_revoke_records(journal, commit_transaction, 556 jbd2_journal_write_revoke_records(journal, commit_transaction,
548 WRITE_SYNC); 557 &log_bufs, WRITE_SYNC);
549 blk_finish_plug(&plug); 558 blk_finish_plug(&plug);
550 559
551 jbd_debug(3, "JBD2: commit phase 2\n"); 560 jbd_debug(3, "JBD2: commit phase 2b\n");
552 561
553 /* 562 /*
554 * Way to go: we have now written out all of the data for a 563 * Way to go: we have now written out all of the data for a
@@ -571,8 +580,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
571 atomic_read(&commit_transaction->t_outstanding_credits)); 580 atomic_read(&commit_transaction->t_outstanding_credits));
572 581
573 err = 0; 582 err = 0;
574 descriptor = NULL;
575 bufs = 0; 583 bufs = 0;
584 descriptor = NULL;
576 blk_start_plug(&plug); 585 blk_start_plug(&plug);
577 while (commit_transaction->t_buffers) { 586 while (commit_transaction->t_buffers) {
578 587
@@ -604,8 +613,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
604 record the metadata buffer. */ 613 record the metadata buffer. */
605 614
606 if (!descriptor) { 615 if (!descriptor) {
607 struct buffer_head *bh;
608
609 J_ASSERT (bufs == 0); 616 J_ASSERT (bufs == 0);
610 617
611 jbd_debug(4, "JBD2: get descriptor\n"); 618 jbd_debug(4, "JBD2: get descriptor\n");
@@ -616,26 +623,26 @@ void jbd2_journal_commit_transaction(journal_t *journal)
616 continue; 623 continue;
617 } 624 }
618 625
619 bh = jh2bh(descriptor);
620 jbd_debug(4, "JBD2: got buffer %llu (%p)\n", 626 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
621 (unsigned long long)bh->b_blocknr, bh->b_data); 627 (unsigned long long)descriptor->b_blocknr,
622 header = (journal_header_t *)&bh->b_data[0]; 628 descriptor->b_data);
629 header = (journal_header_t *)descriptor->b_data;
623 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 630 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
624 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); 631 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
625 header->h_sequence = cpu_to_be32(commit_transaction->t_tid); 632 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
626 633
627 tagp = &bh->b_data[sizeof(journal_header_t)]; 634 tagp = &descriptor->b_data[sizeof(journal_header_t)];
628 space_left = bh->b_size - sizeof(journal_header_t); 635 space_left = descriptor->b_size -
636 sizeof(journal_header_t);
629 first_tag = 1; 637 first_tag = 1;
630 set_buffer_jwrite(bh); 638 set_buffer_jwrite(descriptor);
631 set_buffer_dirty(bh); 639 set_buffer_dirty(descriptor);
632 wbuf[bufs++] = bh; 640 wbuf[bufs++] = descriptor;
633 641
634 /* Record it so that we can wait for IO 642 /* Record it so that we can wait for IO
635 completion later */ 643 completion later */
636 BUFFER_TRACE(bh, "ph3: file as descriptor"); 644 BUFFER_TRACE(descriptor, "ph3: file as descriptor");
637 jbd2_journal_file_buffer(descriptor, commit_transaction, 645 jbd2_file_log_bh(&log_bufs, descriptor);
638 BJ_LogCtl);
639 } 646 }
640 647
641 /* Where is the buffer to be written? */ 648 /* Where is the buffer to be written? */
@@ -658,29 +665,22 @@ void jbd2_journal_commit_transaction(journal_t *journal)
658 665
659 /* Bump b_count to prevent truncate from stumbling over 666 /* Bump b_count to prevent truncate from stumbling over
660 the shadowed buffer! @@@ This can go if we ever get 667 the shadowed buffer! @@@ This can go if we ever get
661 rid of the BJ_IO/BJ_Shadow pairing of buffers. */ 668 rid of the shadow pairing of buffers. */
662 atomic_inc(&jh2bh(jh)->b_count); 669 atomic_inc(&jh2bh(jh)->b_count);
663 670
664 /* Make a temporary IO buffer with which to write it out
665 (this will requeue both the metadata buffer and the
666 temporary IO buffer). new_bh goes on BJ_IO*/
667
668 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
669 /* 671 /*
670 * akpm: jbd2_journal_write_metadata_buffer() sets 672 * Make a temporary IO buffer with which to write it out
671 * new_bh->b_transaction to commit_transaction. 673 * (this will requeue the metadata buffer to BJ_Shadow).
672 * We need to clean this up before we release new_bh
673 * (which is of type BJ_IO)
674 */ 674 */
675 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
675 JBUFFER_TRACE(jh, "ph3: write metadata"); 676 JBUFFER_TRACE(jh, "ph3: write metadata");
676 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 677 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
677 jh, &new_jh, blocknr); 678 jh, &wbuf[bufs], blocknr);
678 if (flags < 0) { 679 if (flags < 0) {
679 jbd2_journal_abort(journal, flags); 680 jbd2_journal_abort(journal, flags);
680 continue; 681 continue;
681 } 682 }
682 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 683 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
683 wbuf[bufs++] = jh2bh(new_jh);
684 684
685 /* Record the new block's tag in the current descriptor 685 /* Record the new block's tag in the current descriptor
686 buffer */ 686 buffer */
@@ -694,10 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
694 tag = (journal_block_tag_t *) tagp; 694 tag = (journal_block_tag_t *) tagp;
695 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); 695 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
696 tag->t_flags = cpu_to_be16(tag_flag); 696 tag->t_flags = cpu_to_be16(tag_flag);
697 jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh), 697 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
698 commit_transaction->t_tid); 698 commit_transaction->t_tid);
699 tagp += tag_bytes; 699 tagp += tag_bytes;
700 space_left -= tag_bytes; 700 space_left -= tag_bytes;
701 bufs++;
701 702
702 if (first_tag) { 703 if (first_tag) {
703 memcpy (tagp, journal->j_uuid, 16); 704 memcpy (tagp, journal->j_uuid, 16);
@@ -809,7 +810,7 @@ start_journal_io:
809 the log. Before we can commit it, wait for the IO so far to 810 the log. Before we can commit it, wait for the IO so far to
810 complete. Control buffers being written are on the 811 complete. Control buffers being written are on the
811 transaction's t_log_list queue, and metadata buffers are on 812 transaction's t_log_list queue, and metadata buffers are on
812 the t_iobuf_list queue. 813 the io_bufs list.
813 814
814 Wait for the buffers in reverse order. That way we are 815 Wait for the buffers in reverse order. That way we are
815 less likely to be woken up until all IOs have completed, and 816 less likely to be woken up until all IOs have completed, and
@@ -818,47 +819,33 @@ start_journal_io:
818 819
819 jbd_debug(3, "JBD2: commit phase 3\n"); 820 jbd_debug(3, "JBD2: commit phase 3\n");
820 821
821 /* 822 while (!list_empty(&io_bufs)) {
822 * akpm: these are BJ_IO, and j_list_lock is not needed. 823 struct buffer_head *bh = list_entry(io_bufs.prev,
823 * See __journal_try_to_free_buffer. 824 struct buffer_head,
824 */ 825 b_assoc_buffers);
825wait_for_iobuf:
826 while (commit_transaction->t_iobuf_list != NULL) {
827 struct buffer_head *bh;
828 826
829 jh = commit_transaction->t_iobuf_list->b_tprev; 827 wait_on_buffer(bh);
830 bh = jh2bh(jh); 828 cond_resched();
831 if (buffer_locked(bh)) {
832 wait_on_buffer(bh);
833 goto wait_for_iobuf;
834 }
835 if (cond_resched())
836 goto wait_for_iobuf;
837 829
838 if (unlikely(!buffer_uptodate(bh))) 830 if (unlikely(!buffer_uptodate(bh)))
839 err = -EIO; 831 err = -EIO;
840 832 jbd2_unfile_log_bh(bh);
841 clear_buffer_jwrite(bh);
842
843 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
844 jbd2_journal_unfile_buffer(journal, jh);
845 833
846 /* 834 /*
847 * ->t_iobuf_list should contain only dummy buffer_heads 835 * The list contains temporary buffer heads created by
848 * which were created by jbd2_journal_write_metadata_buffer(). 836 * jbd2_journal_write_metadata_buffer().
849 */ 837 */
850 BUFFER_TRACE(bh, "dumping temporary bh"); 838 BUFFER_TRACE(bh, "dumping temporary bh");
851 jbd2_journal_put_journal_head(jh);
852 __brelse(bh); 839 __brelse(bh);
853 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); 840 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
854 free_buffer_head(bh); 841 free_buffer_head(bh);
855 842
856 /* We also have to unlock and free the corresponding 843 /* We also have to refile the corresponding shadowed buffer */
857 shadowed buffer */
858 jh = commit_transaction->t_shadow_list->b_tprev; 844 jh = commit_transaction->t_shadow_list->b_tprev;
859 bh = jh2bh(jh); 845 bh = jh2bh(jh);
860 clear_bit(BH_JWrite, &bh->b_state); 846 clear_buffer_jwrite(bh);
861 J_ASSERT_BH(bh, buffer_jbddirty(bh)); 847 J_ASSERT_BH(bh, buffer_jbddirty(bh));
848 J_ASSERT_BH(bh, !buffer_shadow(bh));
862 849
863 /* The metadata is now released for reuse, but we need 850 /* The metadata is now released for reuse, but we need
864 to remember it against this transaction so that when 851 to remember it against this transaction so that when
@@ -866,14 +853,6 @@ wait_for_iobuf:
866 required. */ 853 required. */
867 JBUFFER_TRACE(jh, "file as BJ_Forget"); 854 JBUFFER_TRACE(jh, "file as BJ_Forget");
868 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 855 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
869 /*
870 * Wake up any transactions which were waiting for this IO to
871 * complete. The barrier must be here so that changes by
872 * jbd2_journal_file_buffer() take effect before wake_up_bit()
873 * does the waitqueue check.
874 */
875 smp_mb();
876 wake_up_bit(&bh->b_state, BH_Unshadow);
877 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 856 JBUFFER_TRACE(jh, "brelse shadowed buffer");
878 __brelse(bh); 857 __brelse(bh);
879 } 858 }
@@ -883,26 +862,19 @@ wait_for_iobuf:
883 jbd_debug(3, "JBD2: commit phase 4\n"); 862 jbd_debug(3, "JBD2: commit phase 4\n");
884 863
885 /* Here we wait for the revoke record and descriptor record buffers */ 864 /* Here we wait for the revoke record and descriptor record buffers */
886 wait_for_ctlbuf: 865 while (!list_empty(&log_bufs)) {
887 while (commit_transaction->t_log_list != NULL) {
888 struct buffer_head *bh; 866 struct buffer_head *bh;
889 867
890 jh = commit_transaction->t_log_list->b_tprev; 868 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
891 bh = jh2bh(jh); 869 wait_on_buffer(bh);
892 if (buffer_locked(bh)) { 870 cond_resched();
893 wait_on_buffer(bh);
894 goto wait_for_ctlbuf;
895 }
896 if (cond_resched())
897 goto wait_for_ctlbuf;
898 871
899 if (unlikely(!buffer_uptodate(bh))) 872 if (unlikely(!buffer_uptodate(bh)))
900 err = -EIO; 873 err = -EIO;
901 874
902 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); 875 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
903 clear_buffer_jwrite(bh); 876 clear_buffer_jwrite(bh);
904 jbd2_journal_unfile_buffer(journal, jh); 877 jbd2_unfile_log_bh(bh);
905 jbd2_journal_put_journal_head(jh);
906 __brelse(bh); /* One for getblk */ 878 __brelse(bh); /* One for getblk */
907 /* AKPM: bforget here */ 879 /* AKPM: bforget here */
908 } 880 }
@@ -952,9 +924,7 @@ wait_for_iobuf:
952 J_ASSERT(list_empty(&commit_transaction->t_inode_list)); 924 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
953 J_ASSERT(commit_transaction->t_buffers == NULL); 925 J_ASSERT(commit_transaction->t_buffers == NULL);
954 J_ASSERT(commit_transaction->t_checkpoint_list == NULL); 926 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
955 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
956 J_ASSERT(commit_transaction->t_shadow_list == NULL); 927 J_ASSERT(commit_transaction->t_shadow_list == NULL);
957 J_ASSERT(commit_transaction->t_log_list == NULL);
958 928
959restart_loop: 929restart_loop:
960 /* 930 /*
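
With BJ_IO and BJ_LogCtl gone, the commit path tracks its temporary buffers on the two local lists (io_bufs, log_bufs) through b_assoc_buffers, and the shadow handshake moves into buffer state: journal_end_buffer_io_sync() clears BH_Shadow on the original buffer and wakes any bit-waiters. The sleeping side lives in do_get_write_access(); reconstructed from this wakeup (the exact code is outside this diff, so treat it as a sketch), it looks approximately like:

    /* Inside do_get_write_access(), under jbd_lock_bh_state(bh): */
    if (buffer_shadow(bh)) {
            JBUFFER_TRACE(jh, "on shadow: sleep");
            jbd_unlock_bh_state(bh);
            wait_on_bit(&bh->b_state, BH_Shadow,
                        sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
            goto repeat;    /* retake the lock and re-check the buffer */
    }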
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 95457576e434..02c7ad9d7a41 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -103,6 +103,24 @@ EXPORT_SYMBOL(jbd2_inode_cache);
103static void __journal_abort_soft (journal_t *journal, int errno); 103static void __journal_abort_soft (journal_t *journal, int errno);
104static int jbd2_journal_create_slab(size_t slab_size); 104static int jbd2_journal_create_slab(size_t slab_size);
105 105
106#ifdef CONFIG_JBD2_DEBUG
107void __jbd2_debug(int level, const char *file, const char *func,
108 unsigned int line, const char *fmt, ...)
109{
110 struct va_format vaf;
111 va_list args;
112
113 if (level > jbd2_journal_enable_debug)
114 return;
115 va_start(args, fmt);
116 vaf.fmt = fmt;
117 vaf.va = &args;
118 printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
119 va_end(args);
120}
121EXPORT_SYMBOL(__jbd2_debug);
122#endif
123
106/* Checksumming functions */ 124/* Checksumming functions */
107int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
108{ 126{
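
The new out-of-line __jbd2_debug() is meant to be driven by a call-site macro that supplies the file, function, and line; a plausible sketch of that wrapper (the real definition lives in include/linux/jbd2.h):

	#define jbd_debug(n, fmt, a...) \
		__jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a)
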
@@ -310,14 +328,12 @@ static void journal_kill_thread(journal_t *journal)
310 * 328 *
311 * If the source buffer has already been modified by a new transaction 329 * If the source buffer has already been modified by a new transaction
312 * since we took the last commit snapshot, we use the frozen copy of 330 * since we took the last commit snapshot, we use the frozen copy of
313 * that data for IO. If we end up using the existing buffer_head's data 331 * that data for IO. If we end up using the existing buffer_head's data
314 * for the write, then we *have* to lock the buffer to prevent anyone 332 * for the write, then we have to make sure nobody modifies it while the
315 * else from using and possibly modifying it while the IO is in 333 * IO is in progress. do_get_write_access() handles this.
316 * progress.
317 * 334 *
318 * The function returns a pointer to the buffer_heads to be used for IO. 335 * The function returns a pointer to the buffer_head to be used for IO.
319 * 336 *
320 * We assume that the journal has already been locked in this function.
321 * 337 *
322 * Return value: 338 * Return value:
323 * <0: Error 339 * <0: Error
@@ -330,15 +346,14 @@ static void journal_kill_thread(journal_t *journal)
330 346
331int jbd2_journal_write_metadata_buffer(transaction_t *transaction, 347int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
332 struct journal_head *jh_in, 348 struct journal_head *jh_in,
333 struct journal_head **jh_out, 349 struct buffer_head **bh_out,
334 unsigned long long blocknr) 350 sector_t blocknr)
335{ 351{
336 int need_copy_out = 0; 352 int need_copy_out = 0;
337 int done_copy_out = 0; 353 int done_copy_out = 0;
338 int do_escape = 0; 354 int do_escape = 0;
339 char *mapped_data; 355 char *mapped_data;
340 struct buffer_head *new_bh; 356 struct buffer_head *new_bh;
341 struct journal_head *new_jh;
342 struct page *new_page; 357 struct page *new_page;
343 unsigned int new_offset; 358 unsigned int new_offset;
344 struct buffer_head *bh_in = jh2bh(jh_in); 359 struct buffer_head *bh_in = jh2bh(jh_in);
@@ -368,14 +383,13 @@ retry_alloc:
368 383
369 /* keep subsequent assertions sane */ 384 /* keep subsequent assertions sane */
370 atomic_set(&new_bh->b_count, 1); 385 atomic_set(&new_bh->b_count, 1);
371 new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
372 386
387 jbd_lock_bh_state(bh_in);
388repeat:
373 /* 389 /*
374 * If a new transaction has already done a buffer copy-out, then 390 * If a new transaction has already done a buffer copy-out, then
375 * we use that version of the data for the commit. 391 * we use that version of the data for the commit.
376 */ 392 */
377 jbd_lock_bh_state(bh_in);
378repeat:
379 if (jh_in->b_frozen_data) { 393 if (jh_in->b_frozen_data) {
380 done_copy_out = 1; 394 done_copy_out = 1;
381 new_page = virt_to_page(jh_in->b_frozen_data); 395 new_page = virt_to_page(jh_in->b_frozen_data);
@@ -415,7 +429,7 @@ repeat:
415 jbd_unlock_bh_state(bh_in); 429 jbd_unlock_bh_state(bh_in);
416 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); 430 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
417 if (!tmp) { 431 if (!tmp) {
418 jbd2_journal_put_journal_head(new_jh); 432 brelse(new_bh);
419 return -ENOMEM; 433 return -ENOMEM;
420 } 434 }
421 jbd_lock_bh_state(bh_in); 435 jbd_lock_bh_state(bh_in);
@@ -426,7 +440,7 @@ repeat:
426 440
427 jh_in->b_frozen_data = tmp; 441 jh_in->b_frozen_data = tmp;
428 mapped_data = kmap_atomic(new_page); 442 mapped_data = kmap_atomic(new_page);
429 memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); 443 memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
430 kunmap_atomic(mapped_data); 444 kunmap_atomic(mapped_data);
431 445
432 new_page = virt_to_page(tmp); 446 new_page = virt_to_page(tmp);
@@ -452,14 +466,14 @@ repeat:
452 } 466 }
453 467
454 set_bh_page(new_bh, new_page, new_offset); 468 set_bh_page(new_bh, new_page, new_offset);
455 new_jh->b_transaction = NULL; 469 new_bh->b_size = bh_in->b_size;
456 new_bh->b_size = jh2bh(jh_in)->b_size; 470 new_bh->b_bdev = journal->j_dev;
457 new_bh->b_bdev = transaction->t_journal->j_dev;
458 new_bh->b_blocknr = blocknr; 471 new_bh->b_blocknr = blocknr;
472 new_bh->b_private = bh_in;
459 set_buffer_mapped(new_bh); 473 set_buffer_mapped(new_bh);
460 set_buffer_dirty(new_bh); 474 set_buffer_dirty(new_bh);
461 475
462 *jh_out = new_jh; 476 *bh_out = new_bh;
463 477
464 /* 478 /*
465 * The to-be-written buffer needs to get moved to the io queue, 479 * The to-be-written buffer needs to get moved to the io queue,
@@ -470,11 +484,9 @@ repeat:
470 spin_lock(&journal->j_list_lock); 484 spin_lock(&journal->j_list_lock);
471 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); 485 __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
472 spin_unlock(&journal->j_list_lock); 486 spin_unlock(&journal->j_list_lock);
487 set_buffer_shadow(bh_in);
473 jbd_unlock_bh_state(bh_in); 488 jbd_unlock_bh_state(bh_in);
474 489
475 JBUFFER_TRACE(new_jh, "file as BJ_IO");
476 jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
477
478 return do_escape | (done_copy_out << 1); 490 return do_escape | (done_copy_out << 1);
479} 491}
480 492
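
The return value still packs two bits: bit 0 is do_escape (the block starts with the JBD2 magic and must be escaped on disk) and bit 1 is done_copy_out (the frozen copy, not the live buffer, backs the write). A hedged sketch of how a commit-path caller might decode it; wbuf and tag_flag are hypothetical locals, not part of this patch:

	int flags = jbd2_journal_write_metadata_buffer(commit_transaction, jh,
						       &wbuf[bufs], blocknr);
	if (flags < 0)
		return flags;			/* allocation failure, etc. */
	if (flags & 1)				/* do_escape */
		tag_flag |= JBD2_FLAG_ESCAPE;	/* mark the on-disk tag */
	if (flags & 2)				/* done_copy_out */
		;	/* IO is backed by b_frozen_data, not the live buffer */
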
@@ -484,35 +496,6 @@ repeat:
484 */ 496 */
485 497
486/* 498/*
487 * __jbd2_log_space_left: Return the number of free blocks left in the journal.
488 *
489 * Called with the journal already locked.
490 *
491 * Called under j_state_lock
492 */
493
494int __jbd2_log_space_left(journal_t *journal)
495{
496 int left = journal->j_free;
497
498 /* assert_spin_locked(&journal->j_state_lock); */
499
500 /*
501 * Be pessimistic here about the number of those free blocks which
502 * might be required for log descriptor control blocks.
503 */
504
505#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
506
507 left -= MIN_LOG_RESERVED_BLOCKS;
508
509 if (left <= 0)
510 return 0;
511 left -= (left >> 3);
512 return left;
513}
514
515/*
516 * Called with j_state_lock locked for writing. 499 * Called with j_state_lock locked for writing.
517 * Returns true if a transaction commit was started. 500 * Returns true if a transaction commit was started.
518 */ 501 */
@@ -564,20 +547,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
564} 547}
565 548
566/* 549/*
567 * Force and wait upon a commit if the calling process is not within 550 * Force and wait any uncommitted transactions. We can only force the running
568 * transaction. This is used for forcing out undo-protected data which contains 551 * transaction if we don't have an active handle, otherwise, we will deadlock.
569 * bitmaps, when the fs is running out of space. 552 * Returns: <0 in case of error,
570 * 553 * 0 if nothing to commit,
571 * We can only force the running transaction if we don't have an active handle; 554 * 1 if transaction was successfully committed.
572 * otherwise, we will deadlock.
573 *
574 * Returns true if a transaction was started.
575 */ 555 */
576int jbd2_journal_force_commit_nested(journal_t *journal) 556static int __jbd2_journal_force_commit(journal_t *journal)
577{ 557{
578 transaction_t *transaction = NULL; 558 transaction_t *transaction = NULL;
579 tid_t tid; 559 tid_t tid;
580 int need_to_start = 0; 560 int need_to_start = 0, ret = 0;
581 561
582 read_lock(&journal->j_state_lock); 562 read_lock(&journal->j_state_lock);
583 if (journal->j_running_transaction && !current->journal_info) { 563 if (journal->j_running_transaction && !current->journal_info) {
@@ -588,16 +568,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
588 transaction = journal->j_committing_transaction; 568 transaction = journal->j_committing_transaction;
589 569
590 if (!transaction) { 570 if (!transaction) {
571 /* Nothing to commit */
591 read_unlock(&journal->j_state_lock); 572 read_unlock(&journal->j_state_lock);
592 return 0; /* Nothing to retry */ 573 return 0;
593 } 574 }
594
595 tid = transaction->t_tid; 575 tid = transaction->t_tid;
596 read_unlock(&journal->j_state_lock); 576 read_unlock(&journal->j_state_lock);
597 if (need_to_start) 577 if (need_to_start)
598 jbd2_log_start_commit(journal, tid); 578 jbd2_log_start_commit(journal, tid);
599 jbd2_log_wait_commit(journal, tid); 579 ret = jbd2_log_wait_commit(journal, tid);
600 return 1; 580 if (!ret)
581 ret = 1;
582
583 return ret;
584}
585
586/**
 587 * Force and wait upon a commit if the calling process is not within a
 588 * transaction. This is used for forcing out undo-protected data which contains
589 * bitmaps, when the fs is running out of space.
590 *
591 * @journal: journal to force
592 * Returns true if progress was made.
593 */
594int jbd2_journal_force_commit_nested(journal_t *journal)
595{
596 int ret;
597
598 ret = __jbd2_journal_force_commit(journal);
599 return ret > 0;
600}
601
602/**
603 * int journal_force_commit() - force any uncommitted transactions
604 * @journal: journal to force
605 *
 606 * Caller wants an unconditional commit. We can only force the running transaction
 607 * if we don't have an active handle; otherwise we will deadlock.
608 */
609int jbd2_journal_force_commit(journal_t *journal)
610{
611 int ret;
612
613 J_ASSERT(!current->journal_info);
614 ret = __jbd2_journal_force_commit(journal);
615 if (ret > 0)
616 ret = 0;
617 return ret;
601} 618}
602 619
603/* 620/*
@@ -798,7 +815,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
798 * But we don't bother doing that, so there will be coherency problems with 815 * But we don't bother doing that, so there will be coherency problems with
799 * mmaps of blockdevs which hold live JBD-controlled filesystems. 816 * mmaps of blockdevs which hold live JBD-controlled filesystems.
800 */ 817 */
801struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) 818struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
802{ 819{
803 struct buffer_head *bh; 820 struct buffer_head *bh;
804 unsigned long long blocknr; 821 unsigned long long blocknr;
@@ -817,7 +834,7 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
817 set_buffer_uptodate(bh); 834 set_buffer_uptodate(bh);
818 unlock_buffer(bh); 835 unlock_buffer(bh);
819 BUFFER_TRACE(bh, "return this buffer"); 836 BUFFER_TRACE(bh, "return this buffer");
820 return jbd2_journal_add_journal_head(bh); 837 return bh;
821} 838}
822 839
823/* 840/*
@@ -1062,11 +1079,10 @@ static journal_t * journal_init_common (void)
1062 return NULL; 1079 return NULL;
1063 1080
1064 init_waitqueue_head(&journal->j_wait_transaction_locked); 1081 init_waitqueue_head(&journal->j_wait_transaction_locked);
1065 init_waitqueue_head(&journal->j_wait_logspace);
1066 init_waitqueue_head(&journal->j_wait_done_commit); 1082 init_waitqueue_head(&journal->j_wait_done_commit);
1067 init_waitqueue_head(&journal->j_wait_checkpoint);
1068 init_waitqueue_head(&journal->j_wait_commit); 1083 init_waitqueue_head(&journal->j_wait_commit);
1069 init_waitqueue_head(&journal->j_wait_updates); 1084 init_waitqueue_head(&journal->j_wait_updates);
1085 init_waitqueue_head(&journal->j_wait_reserved);
1070 mutex_init(&journal->j_barrier); 1086 mutex_init(&journal->j_barrier);
1071 mutex_init(&journal->j_checkpoint_mutex); 1087 mutex_init(&journal->j_checkpoint_mutex);
1072 spin_lock_init(&journal->j_revoke_lock); 1088 spin_lock_init(&journal->j_revoke_lock);
@@ -1076,6 +1092,7 @@ static journal_t * journal_init_common (void)
1076 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 1092 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
1077 journal->j_min_batch_time = 0; 1093 journal->j_min_batch_time = 0;
1078 journal->j_max_batch_time = 15000; /* 15ms */ 1094 journal->j_max_batch_time = 15000; /* 15ms */
1095 atomic_set(&journal->j_reserved_credits, 0);
1079 1096
1080 /* The journal is marked for error until we succeed with recovery! */ 1097 /* The journal is marked for error until we succeed with recovery! */
1081 journal->j_flags = JBD2_ABORT; 1098 journal->j_flags = JBD2_ABORT;
@@ -1318,6 +1335,7 @@ static int journal_reset(journal_t *journal)
1318static void jbd2_write_superblock(journal_t *journal, int write_op) 1335static void jbd2_write_superblock(journal_t *journal, int write_op)
1319{ 1336{
1320 struct buffer_head *bh = journal->j_sb_buffer; 1337 struct buffer_head *bh = journal->j_sb_buffer;
1338 journal_superblock_t *sb = journal->j_superblock;
1321 int ret; 1339 int ret;
1322 1340
1323 trace_jbd2_write_superblock(journal, write_op); 1341 trace_jbd2_write_superblock(journal, write_op);
@@ -1339,6 +1357,7 @@ static void jbd2_write_superblock(journal_t *journal, int write_op)
1339 clear_buffer_write_io_error(bh); 1357 clear_buffer_write_io_error(bh);
1340 set_buffer_uptodate(bh); 1358 set_buffer_uptodate(bh);
1341 } 1359 }
1360 jbd2_superblock_csum_set(journal, sb);
1342 get_bh(bh); 1361 get_bh(bh);
1343 bh->b_end_io = end_buffer_write_sync; 1362 bh->b_end_io = end_buffer_write_sync;
1344 ret = submit_bh(write_op, bh); 1363 ret = submit_bh(write_op, bh);
@@ -1435,7 +1454,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
1435 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", 1454 jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
1436 journal->j_errno); 1455 journal->j_errno);
1437 sb->s_errno = cpu_to_be32(journal->j_errno); 1456 sb->s_errno = cpu_to_be32(journal->j_errno);
1438 jbd2_superblock_csum_set(journal, sb);
1439 read_unlock(&journal->j_state_lock); 1457 read_unlock(&journal->j_state_lock);
1440 1458
1441 jbd2_write_superblock(journal, WRITE_SYNC); 1459 jbd2_write_superblock(journal, WRITE_SYNC);
@@ -2325,13 +2343,13 @@ static struct journal_head *journal_alloc_journal_head(void)
2325#ifdef CONFIG_JBD2_DEBUG 2343#ifdef CONFIG_JBD2_DEBUG
2326 atomic_inc(&nr_journal_heads); 2344 atomic_inc(&nr_journal_heads);
2327#endif 2345#endif
2328 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 2346 ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
2329 if (!ret) { 2347 if (!ret) {
2330 jbd_debug(1, "out of memory for journal_head\n"); 2348 jbd_debug(1, "out of memory for journal_head\n");
2331 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); 2349 pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
2332 while (!ret) { 2350 while (!ret) {
2333 yield(); 2351 yield();
2334 ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS); 2352 ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
2335 } 2353 }
2336 } 2354 }
2337 return ret; 2355 return ret;
@@ -2393,10 +2411,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
2393 struct journal_head *new_jh = NULL; 2411 struct journal_head *new_jh = NULL;
2394 2412
2395repeat: 2413repeat:
2396 if (!buffer_jbd(bh)) { 2414 if (!buffer_jbd(bh))
2397 new_jh = journal_alloc_journal_head(); 2415 new_jh = journal_alloc_journal_head();
2398 memset(new_jh, 0, sizeof(*new_jh));
2399 }
2400 2416
2401 jbd_lock_bh_journal_head(bh); 2417 jbd_lock_bh_journal_head(bh);
2402 if (buffer_jbd(bh)) { 2418 if (buffer_jbd(bh)) {
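
The switch to kmem_cache_zalloc() moves the zeroing into the allocator, which is why the explicit memset() disappears from jbd2_journal_add_journal_head() above; schematically:

	/* before: zero at the call site */
	jh = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
	if (jh)
		memset(jh, 0, sizeof(*jh));

	/* after: the allocator returns zeroed memory */
	jh = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
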
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 626846bac32f..d4851464b57e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -399,18 +399,17 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, 399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
400 void *buf, __u32 sequence) 400 void *buf, __u32 sequence)
401{ 401{
402 __u32 provided, calculated; 402 __u32 csum32;
403 403
404 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 404 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
405 return 1; 405 return 1;
406 406
407 sequence = cpu_to_be32(sequence); 407 sequence = cpu_to_be32(sequence);
408 calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence, 408 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
409 sizeof(sequence)); 409 sizeof(sequence));
410 calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize); 410 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
411 provided = be32_to_cpu(tag->t_checksum);
412 411
413 return provided == cpu_to_be32(calculated); 412 return tag->t_checksum == cpu_to_be16(csum32);
414} 413}
415 414
416static int do_one_pass(journal_t *journal, 415static int do_one_pass(journal_t *journal,
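
Since the on-disk journal_block_tag_t.t_checksum field is only 16 bits wide, the rewritten verify folds the 32-bit crc32c down by truncation; spelled out, the new comparison is equivalent to (a sketch, assuming t_checksum is a __be16):

	/* cpu_to_be16(csum32) keeps only the low 16 bits of the checksum */
	return be16_to_cpu(tag->t_checksum) == (csum32 & 0xffff);
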
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index f30b80b4ce8b..198c9c10276d 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,9 +122,10 @@ struct jbd2_revoke_table_s
122 122
123#ifdef __KERNEL__ 123#ifdef __KERNEL__
124static void write_one_revoke_record(journal_t *, transaction_t *, 124static void write_one_revoke_record(journal_t *, transaction_t *,
125 struct journal_head **, int *, 125 struct list_head *,
126 struct buffer_head **, int *,
126 struct jbd2_revoke_record_s *, int); 127 struct jbd2_revoke_record_s *, int);
127static void flush_descriptor(journal_t *, struct journal_head *, int, int); 128static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
128#endif 129#endif
129 130
130/* Utility functions to maintain the revoke table */ 131/* Utility functions to maintain the revoke table */
@@ -531,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
531 */ 532 */
532void jbd2_journal_write_revoke_records(journal_t *journal, 533void jbd2_journal_write_revoke_records(journal_t *journal,
533 transaction_t *transaction, 534 transaction_t *transaction,
535 struct list_head *log_bufs,
534 int write_op) 536 int write_op)
535{ 537{
536 struct journal_head *descriptor; 538 struct buffer_head *descriptor;
537 struct jbd2_revoke_record_s *record; 539 struct jbd2_revoke_record_s *record;
538 struct jbd2_revoke_table_s *revoke; 540 struct jbd2_revoke_table_s *revoke;
539 struct list_head *hash_list; 541 struct list_head *hash_list;
@@ -553,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
553 while (!list_empty(hash_list)) { 555 while (!list_empty(hash_list)) {
554 record = (struct jbd2_revoke_record_s *) 556 record = (struct jbd2_revoke_record_s *)
555 hash_list->next; 557 hash_list->next;
556 write_one_revoke_record(journal, transaction, 558 write_one_revoke_record(journal, transaction, log_bufs,
557 &descriptor, &offset, 559 &descriptor, &offset,
558 record, write_op); 560 record, write_op);
559 count++; 561 count++;
@@ -573,13 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
573 575
574static void write_one_revoke_record(journal_t *journal, 576static void write_one_revoke_record(journal_t *journal,
575 transaction_t *transaction, 577 transaction_t *transaction,
576 struct journal_head **descriptorp, 578 struct list_head *log_bufs,
579 struct buffer_head **descriptorp,
577 int *offsetp, 580 int *offsetp,
578 struct jbd2_revoke_record_s *record, 581 struct jbd2_revoke_record_s *record,
579 int write_op) 582 int write_op)
580{ 583{
581 int csum_size = 0; 584 int csum_size = 0;
582 struct journal_head *descriptor; 585 struct buffer_head *descriptor;
583 int offset; 586 int offset;
584 journal_header_t *header; 587 journal_header_t *header;
585 588
@@ -609,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal,
609 descriptor = jbd2_journal_get_descriptor_buffer(journal); 612 descriptor = jbd2_journal_get_descriptor_buffer(journal);
610 if (!descriptor) 613 if (!descriptor)
611 return; 614 return;
612 header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; 615 header = (journal_header_t *)descriptor->b_data;
613 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); 616 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
614 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); 617 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
615 header->h_sequence = cpu_to_be32(transaction->t_tid); 618 header->h_sequence = cpu_to_be32(transaction->t_tid);
616 619
617 /* Record it so that we can wait for IO completion later */ 620 /* Record it so that we can wait for IO completion later */
618 JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); 621 BUFFER_TRACE(descriptor, "file in log_bufs");
619 jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl); 622 jbd2_file_log_bh(log_bufs, descriptor);
620 623
621 offset = sizeof(jbd2_journal_revoke_header_t); 624 offset = sizeof(jbd2_journal_revoke_header_t);
622 *descriptorp = descriptor; 625 *descriptorp = descriptor;
623 } 626 }
624 627
625 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { 628 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
626 * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) = 629 * ((__be64 *)(&descriptor->b_data[offset])) =
627 cpu_to_be64(record->blocknr); 630 cpu_to_be64(record->blocknr);
628 offset += 8; 631 offset += 8;
629 632
630 } else { 633 } else {
631 * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = 634 * ((__be32 *)(&descriptor->b_data[offset])) =
632 cpu_to_be32(record->blocknr); 635 cpu_to_be32(record->blocknr);
633 offset += 4; 636 offset += 4;
634 } 637 }
@@ -636,8 +639,7 @@ static void write_one_revoke_record(journal_t *journal,
636 *offsetp = offset; 639 *offsetp = offset;
637} 640}
638 641
639static void jbd2_revoke_csum_set(journal_t *j, 642static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
640 struct journal_head *descriptor)
641{ 643{
642 struct jbd2_journal_revoke_tail *tail; 644 struct jbd2_journal_revoke_tail *tail;
643 __u32 csum; 645 __u32 csum;
@@ -645,12 +647,10 @@ static void jbd2_revoke_csum_set(journal_t *j,
645 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 647 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
646 return; 648 return;
647 649
648 tail = (struct jbd2_journal_revoke_tail *) 650 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
649 (jh2bh(descriptor)->b_data + j->j_blocksize -
650 sizeof(struct jbd2_journal_revoke_tail)); 651 sizeof(struct jbd2_journal_revoke_tail));
651 tail->r_checksum = 0; 652 tail->r_checksum = 0;
652 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data, 653 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
653 j->j_blocksize);
654 tail->r_checksum = cpu_to_be32(csum); 654 tail->r_checksum = cpu_to_be32(csum);
655} 655}
656 656
@@ -662,25 +662,24 @@ static void jbd2_revoke_csum_set(journal_t *j,
662 */ 662 */
663 663
664static void flush_descriptor(journal_t *journal, 664static void flush_descriptor(journal_t *journal,
665 struct journal_head *descriptor, 665 struct buffer_head *descriptor,
666 int offset, int write_op) 666 int offset, int write_op)
667{ 667{
668 jbd2_journal_revoke_header_t *header; 668 jbd2_journal_revoke_header_t *header;
669 struct buffer_head *bh = jh2bh(descriptor);
670 669
671 if (is_journal_aborted(journal)) { 670 if (is_journal_aborted(journal)) {
672 put_bh(bh); 671 put_bh(descriptor);
673 return; 672 return;
674 } 673 }
675 674
676 header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data; 675 header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
677 header->r_count = cpu_to_be32(offset); 676 header->r_count = cpu_to_be32(offset);
678 jbd2_revoke_csum_set(journal, descriptor); 677 jbd2_revoke_csum_set(journal, descriptor);
679 678
680 set_buffer_jwrite(bh); 679 set_buffer_jwrite(descriptor);
681 BUFFER_TRACE(bh, "write"); 680 BUFFER_TRACE(descriptor, "write");
682 set_buffer_dirty(bh); 681 set_buffer_dirty(descriptor);
683 write_dirty_buffer(bh, write_op); 682 write_dirty_buffer(descriptor, write_op);
684} 683}
685#endif 684#endif
686 685
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 10f524c59ea8..7aa9a32573bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -89,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
89 transaction->t_expires = jiffies + journal->j_commit_interval; 89 transaction->t_expires = jiffies + journal->j_commit_interval;
90 spin_lock_init(&transaction->t_handle_lock); 90 spin_lock_init(&transaction->t_handle_lock);
91 atomic_set(&transaction->t_updates, 0); 91 atomic_set(&transaction->t_updates, 0);
92 atomic_set(&transaction->t_outstanding_credits, 0); 92 atomic_set(&transaction->t_outstanding_credits,
93 atomic_read(&journal->j_reserved_credits));
93 atomic_set(&transaction->t_handle_count, 0); 94 atomic_set(&transaction->t_handle_count, 0);
94 INIT_LIST_HEAD(&transaction->t_inode_list); 95 INIT_LIST_HEAD(&transaction->t_inode_list);
95 INIT_LIST_HEAD(&transaction->t_private_list); 96 INIT_LIST_HEAD(&transaction->t_private_list);
@@ -141,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction,
141} 142}
142 143
143/* 144/*
 145 * Wait until the running transaction passes the T_LOCKED state. Also starts the
 146 * commit if needed. The function expects the running transaction to exist and
 147 * releases j_state_lock.
148 */
149static void wait_transaction_locked(journal_t *journal)
150 __releases(journal->j_state_lock)
151{
152 DEFINE_WAIT(wait);
153 int need_to_start;
154 tid_t tid = journal->j_running_transaction->t_tid;
155
156 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
157 TASK_UNINTERRUPTIBLE);
158 need_to_start = !tid_geq(journal->j_commit_request, tid);
159 read_unlock(&journal->j_state_lock);
160 if (need_to_start)
161 jbd2_log_start_commit(journal, tid);
162 schedule();
163 finish_wait(&journal->j_wait_transaction_locked, &wait);
164}
165
166static void sub_reserved_credits(journal_t *journal, int blocks)
167{
168 atomic_sub(blocks, &journal->j_reserved_credits);
169 wake_up(&journal->j_wait_reserved);
170}
171
172/*
173 * Wait until we can add credits for handle to the running transaction. Called
174 * with j_state_lock held for reading. Returns 0 if handle joined the running
175 * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
176 * caller must retry.
177 */
178static int add_transaction_credits(journal_t *journal, int blocks,
179 int rsv_blocks)
180{
181 transaction_t *t = journal->j_running_transaction;
182 int needed;
183 int total = blocks + rsv_blocks;
184
185 /*
186 * If the current transaction is locked down for commit, wait
187 * for the lock to be released.
188 */
189 if (t->t_state == T_LOCKED) {
190 wait_transaction_locked(journal);
191 return 1;
192 }
193
194 /*
195 * If there is not enough space left in the log to write all
196 * potential buffers requested by this operation, we need to
197 * stall pending a log checkpoint to free some more log space.
198 */
199 needed = atomic_add_return(total, &t->t_outstanding_credits);
200 if (needed > journal->j_max_transaction_buffers) {
201 /*
202 * If the current transaction is already too large,
203 * then start to commit it: we can then go back and
204 * attach this handle to a new transaction.
205 */
206 atomic_sub(total, &t->t_outstanding_credits);
207 wait_transaction_locked(journal);
208 return 1;
209 }
210
211 /*
212 * The commit code assumes that it can get enough log space
213 * without forcing a checkpoint. This is *critical* for
214 * correctness: a checkpoint of a buffer which is also
215 * associated with a committing transaction creates a deadlock,
216 * so commit simply cannot force through checkpoints.
217 *
218 * We must therefore ensure the necessary space in the journal
219 * *before* starting to dirty potentially checkpointed buffers
220 * in the new transaction.
221 */
222 if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
223 atomic_sub(total, &t->t_outstanding_credits);
224 read_unlock(&journal->j_state_lock);
225 write_lock(&journal->j_state_lock);
226 if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
227 __jbd2_log_wait_for_space(journal);
228 write_unlock(&journal->j_state_lock);
229 return 1;
230 }
231
232 /* No reservation? We are done... */
233 if (!rsv_blocks)
234 return 0;
235
236 needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
237 /* We allow at most half of a transaction to be reserved */
238 if (needed > journal->j_max_transaction_buffers / 2) {
239 sub_reserved_credits(journal, rsv_blocks);
240 atomic_sub(total, &t->t_outstanding_credits);
241 read_unlock(&journal->j_state_lock);
242 wait_event(journal->j_wait_reserved,
243 atomic_read(&journal->j_reserved_credits) + rsv_blocks
244 <= journal->j_max_transaction_buffers / 2);
245 return 1;
246 }
247 return 0;
248}
249
250/*
144 * start_this_handle: Given a handle, deal with any locking or stalling 251 * start_this_handle: Given a handle, deal with any locking or stalling
145 * needed to make sure that there is enough journal space for the handle 252 * needed to make sure that there is enough journal space for the handle
146 * to begin. Attach the handle to a transaction and set up the 253 * to begin. Attach the handle to a transaction and set up the
@@ -151,18 +258,24 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
151 gfp_t gfp_mask) 258 gfp_t gfp_mask)
152{ 259{
153 transaction_t *transaction, *new_transaction = NULL; 260 transaction_t *transaction, *new_transaction = NULL;
154 tid_t tid; 261 int blocks = handle->h_buffer_credits;
155 int needed, need_to_start; 262 int rsv_blocks = 0;
156 int nblocks = handle->h_buffer_credits;
157 unsigned long ts = jiffies; 263 unsigned long ts = jiffies;
158 264
159 if (nblocks > journal->j_max_transaction_buffers) { 265 /*
 266 * 1/2 of a transaction can be reserved, so we can practically handle
 267 * only 1/2 of the maximum transaction size per operation.
268 */
269 if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
160 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n", 270 printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
161 current->comm, nblocks, 271 current->comm, blocks,
162 journal->j_max_transaction_buffers); 272 journal->j_max_transaction_buffers / 2);
163 return -ENOSPC; 273 return -ENOSPC;
164 } 274 }
165 275
276 if (handle->h_rsv_handle)
277 rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
278
166alloc_transaction: 279alloc_transaction:
167 if (!journal->j_running_transaction) { 280 if (!journal->j_running_transaction) {
168 new_transaction = kmem_cache_zalloc(transaction_cache, 281 new_transaction = kmem_cache_zalloc(transaction_cache,
@@ -199,8 +312,12 @@ repeat:
199 return -EROFS; 312 return -EROFS;
200 } 313 }
201 314
202 /* Wait on the journal's transaction barrier if necessary */ 315 /*
203 if (journal->j_barrier_count) { 316 * Wait on the journal's transaction barrier if necessary. Specifically
317 * we allow reserved handles to proceed because otherwise commit could
318 * deadlock on page writeback not being able to complete.
319 */
320 if (!handle->h_reserved && journal->j_barrier_count) {
204 read_unlock(&journal->j_state_lock); 321 read_unlock(&journal->j_state_lock);
205 wait_event(journal->j_wait_transaction_locked, 322 wait_event(journal->j_wait_transaction_locked,
206 journal->j_barrier_count == 0); 323 journal->j_barrier_count == 0);
@@ -213,7 +330,7 @@ repeat:
213 goto alloc_transaction; 330 goto alloc_transaction;
214 write_lock(&journal->j_state_lock); 331 write_lock(&journal->j_state_lock);
215 if (!journal->j_running_transaction && 332 if (!journal->j_running_transaction &&
216 !journal->j_barrier_count) { 333 (handle->h_reserved || !journal->j_barrier_count)) {
217 jbd2_get_transaction(journal, new_transaction); 334 jbd2_get_transaction(journal, new_transaction);
218 new_transaction = NULL; 335 new_transaction = NULL;
219 } 336 }
@@ -223,85 +340,18 @@ repeat:
223 340
224 transaction = journal->j_running_transaction; 341 transaction = journal->j_running_transaction;
225 342
226 /* 343 if (!handle->h_reserved) {
227 * If the current transaction is locked down for commit, wait for the 344 /* We may have dropped j_state_lock - restart in that case */
228 * lock to be released. 345 if (add_transaction_credits(journal, blocks, rsv_blocks))
229 */ 346 goto repeat;
230 if (transaction->t_state == T_LOCKED) { 347 } else {
231 DEFINE_WAIT(wait);
232
233 prepare_to_wait(&journal->j_wait_transaction_locked,
234 &wait, TASK_UNINTERRUPTIBLE);
235 read_unlock(&journal->j_state_lock);
236 schedule();
237 finish_wait(&journal->j_wait_transaction_locked, &wait);
238 goto repeat;
239 }
240
241 /*
242 * If there is not enough space left in the log to write all potential
243 * buffers requested by this operation, we need to stall pending a log
244 * checkpoint to free some more log space.
245 */
246 needed = atomic_add_return(nblocks,
247 &transaction->t_outstanding_credits);
248
249 if (needed > journal->j_max_transaction_buffers) {
250 /* 348 /*
251 * If the current transaction is already too large, then start 349 * We have handle reserved so we are allowed to join T_LOCKED
252 * to commit it: we can then go back and attach this handle to 350 * transaction and we don't have to check for transaction size
253 * a new transaction. 351 * and journal space.
254 */ 352 */
255 DEFINE_WAIT(wait); 353 sub_reserved_credits(journal, blocks);
256 354 handle->h_reserved = 0;
257 jbd_debug(2, "Handle %p starting new commit...\n", handle);
258 atomic_sub(nblocks, &transaction->t_outstanding_credits);
259 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
260 TASK_UNINTERRUPTIBLE);
261 tid = transaction->t_tid;
262 need_to_start = !tid_geq(journal->j_commit_request, tid);
263 read_unlock(&journal->j_state_lock);
264 if (need_to_start)
265 jbd2_log_start_commit(journal, tid);
266 schedule();
267 finish_wait(&journal->j_wait_transaction_locked, &wait);
268 goto repeat;
269 }
270
271 /*
272 * The commit code assumes that it can get enough log space
273 * without forcing a checkpoint. This is *critical* for
274 * correctness: a checkpoint of a buffer which is also
275 * associated with a committing transaction creates a deadlock,
276 * so commit simply cannot force through checkpoints.
277 *
278 * We must therefore ensure the necessary space in the journal
279 * *before* starting to dirty potentially checkpointed buffers
280 * in the new transaction.
281 *
282 * The worst part is, any transaction currently committing can
283 * reduce the free space arbitrarily. Be careful to account for
284 * those buffers when checkpointing.
285 */
286
287 /*
288 * @@@ AKPM: This seems rather over-defensive. We're giving commit
289 * a _lot_ of headroom: 1/4 of the journal plus the size of
290 * the committing transaction. Really, we only need to give it
291 * committing_transaction->t_outstanding_credits plus "enough" for
292 * the log control blocks.
293 * Also, this test is inconsistent with the matching one in
294 * jbd2_journal_extend().
295 */
296 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
297 jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
298 atomic_sub(nblocks, &transaction->t_outstanding_credits);
299 read_unlock(&journal->j_state_lock);
300 write_lock(&journal->j_state_lock);
301 if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
302 __jbd2_log_wait_for_space(journal);
303 write_unlock(&journal->j_state_lock);
304 goto repeat;
305 } 355 }
306 356
307 /* OK, account for the buffers that this operation expects to 357 /* OK, account for the buffers that this operation expects to
@@ -309,15 +359,16 @@ repeat:
309 */ 359 */
310 update_t_max_wait(transaction, ts); 360 update_t_max_wait(transaction, ts);
311 handle->h_transaction = transaction; 361 handle->h_transaction = transaction;
312 handle->h_requested_credits = nblocks; 362 handle->h_requested_credits = blocks;
313 handle->h_start_jiffies = jiffies; 363 handle->h_start_jiffies = jiffies;
314 atomic_inc(&transaction->t_updates); 364 atomic_inc(&transaction->t_updates);
315 atomic_inc(&transaction->t_handle_count); 365 atomic_inc(&transaction->t_handle_count);
316 jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", 366 jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
317 handle, nblocks, 367 handle, blocks,
318 atomic_read(&transaction->t_outstanding_credits), 368 atomic_read(&transaction->t_outstanding_credits),
319 __jbd2_log_space_left(journal)); 369 jbd2_log_space_left(journal));
320 read_unlock(&journal->j_state_lock); 370 read_unlock(&journal->j_state_lock);
371 current->journal_info = handle;
321 372
322 lock_map_acquire(&handle->h_lockdep_map); 373 lock_map_acquire(&handle->h_lockdep_map);
323 jbd2_journal_free_transaction(new_transaction); 374 jbd2_journal_free_transaction(new_transaction);
@@ -348,16 +399,21 @@ static handle_t *new_handle(int nblocks)
348 * 399 *
349 * We make sure that the transaction can guarantee at least nblocks of 400 * We make sure that the transaction can guarantee at least nblocks of
350 * modified buffers in the log. We block until the log can guarantee 401 * modified buffers in the log. We block until the log can guarantee
351 * that much space. 402 * that much space. Additionally, if rsv_blocks > 0, we also create another
352 * 403 * handle with rsv_blocks reserved blocks in the journal. This handle is
353 * This function is visible to journal users (like ext3fs), so is not 404 * is stored in h_rsv_handle. It is not attached to any particular transaction
354 * called with the journal already locked. 405 * and thus doesn't block transaction commit. If the caller uses this reserved
406 * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
407 * on the parent handle will dispose the reserved one. Reserved handle has to
408 * be converted to a normal handle using jbd2_journal_start_reserved() before
409 * it can be used.
355 * 410 *
356 * Return a pointer to a newly allocated handle, or an ERR_PTR() value 411 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
357 * on failure. 412 * on failure.
358 */ 413 */
359handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, 414handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
360 unsigned int type, unsigned int line_no) 415 gfp_t gfp_mask, unsigned int type,
416 unsigned int line_no)
361{ 417{
362 handle_t *handle = journal_current_handle(); 418 handle_t *handle = journal_current_handle();
363 int err; 419 int err;
@@ -374,13 +430,24 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask,
374 handle = new_handle(nblocks); 430 handle = new_handle(nblocks);
375 if (!handle) 431 if (!handle)
376 return ERR_PTR(-ENOMEM); 432 return ERR_PTR(-ENOMEM);
433 if (rsv_blocks) {
434 handle_t *rsv_handle;
377 435
378 current->journal_info = handle; 436 rsv_handle = new_handle(rsv_blocks);
437 if (!rsv_handle) {
438 jbd2_free_handle(handle);
439 return ERR_PTR(-ENOMEM);
440 }
441 rsv_handle->h_reserved = 1;
442 rsv_handle->h_journal = journal;
443 handle->h_rsv_handle = rsv_handle;
444 }
379 445
380 err = start_this_handle(journal, handle, gfp_mask); 446 err = start_this_handle(journal, handle, gfp_mask);
381 if (err < 0) { 447 if (err < 0) {
448 if (handle->h_rsv_handle)
449 jbd2_free_handle(handle->h_rsv_handle);
382 jbd2_free_handle(handle); 450 jbd2_free_handle(handle);
383 current->journal_info = NULL;
384 return ERR_PTR(err); 451 return ERR_PTR(err);
385 } 452 }
386 handle->h_type = type; 453 handle->h_type = type;
@@ -395,10 +462,65 @@ EXPORT_SYMBOL(jbd2__journal_start);
395 462
396handle_t *jbd2_journal_start(journal_t *journal, int nblocks) 463handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
397{ 464{
398 return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0); 465 return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
399} 466}
400EXPORT_SYMBOL(jbd2_journal_start); 467EXPORT_SYMBOL(jbd2_journal_start);
401 468
469void jbd2_journal_free_reserved(handle_t *handle)
470{
471 journal_t *journal = handle->h_journal;
472
473 WARN_ON(!handle->h_reserved);
474 sub_reserved_credits(journal, handle->h_buffer_credits);
475 jbd2_free_handle(handle);
476}
477EXPORT_SYMBOL(jbd2_journal_free_reserved);
478
479/**
480 * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle
481 * @handle: handle to start
482 *
483 * Start handle that has been previously reserved with jbd2_journal_reserve().
484 * This attaches @handle to the running transaction (or creates one if there's
 485 * no transaction running). Unlike jbd2_journal_start() this function cannot
486 * block on journal commit, checkpointing, or similar stuff. It can block on
 487 * memory allocation or a frozen journal, though.
488 *
489 * Return 0 on success, non-zero on error - handle is freed in that case.
490 */
491int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
492 unsigned int line_no)
493{
494 journal_t *journal = handle->h_journal;
495 int ret = -EIO;
496
497 if (WARN_ON(!handle->h_reserved)) {
 498 /* Someone passed in a normal handle? Just stop it. */
499 jbd2_journal_stop(handle);
500 return ret;
501 }
502 /*
 503 * Usefulness of mixing reserved and unreserved handles is
504 * questionable. So far nobody seems to need it so just error out.
505 */
506 if (WARN_ON(current->journal_info)) {
507 jbd2_journal_free_reserved(handle);
508 return ret;
509 }
510
511 handle->h_journal = NULL;
512 /*
513 * GFP_NOFS is here because callers are likely from writeback or
514 * similarly constrained call sites
515 */
516 ret = start_this_handle(journal, handle, GFP_NOFS);
517 if (ret < 0)
518 jbd2_journal_free_reserved(handle);
519 handle->h_type = type;
520 handle->h_line_no = line_no;
521 return ret;
522}
523EXPORT_SYMBOL(jbd2_journal_start_reserved);
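
Taken together with jbd2__journal_start() above, a hedged sketch of the intended reserved-handle life cycle (illustrative only; the filesystem-side callers landed separately):

	handle_t *handle, *rsv;
	int err;

	/* reserve rsv_blocks while starting a normal handle */
	handle = jbd2__journal_start(journal, nblocks, rsv_blocks,
				     GFP_NOFS, 0, 0);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	rsv = handle->h_rsv_handle;
	handle->h_rsv_handle = NULL;	/* keep rsv past jbd2_journal_stop() */
	err = jbd2_journal_stop(handle);

	/* later, e.g. from writeback: bind the reservation to a transaction */
	err = jbd2_journal_start_reserved(rsv, 0, 0);
	if (err)
		return err;		/* rsv was already freed on error */
	/* ... use rsv like any other handle ... */
	err = jbd2_journal_stop(rsv);
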
402 524
403/** 525/**
404 * int jbd2_journal_extend() - extend buffer credits. 526 * int jbd2_journal_extend() - extend buffer credits.
@@ -423,49 +545,53 @@ EXPORT_SYMBOL(jbd2_journal_start);
423int jbd2_journal_extend(handle_t *handle, int nblocks) 545int jbd2_journal_extend(handle_t *handle, int nblocks)
424{ 546{
425 transaction_t *transaction = handle->h_transaction; 547 transaction_t *transaction = handle->h_transaction;
426 journal_t *journal = transaction->t_journal; 548 journal_t *journal;
427 int result; 549 int result;
428 int wanted; 550 int wanted;
429 551
430 result = -EIO; 552 WARN_ON(!transaction);
431 if (is_handle_aborted(handle)) 553 if (is_handle_aborted(handle))
432 goto out; 554 return -EROFS;
555 journal = transaction->t_journal;
433 556
434 result = 1; 557 result = 1;
435 558
436 read_lock(&journal->j_state_lock); 559 read_lock(&journal->j_state_lock);
437 560
438 /* Don't extend a locked-down transaction! */ 561 /* Don't extend a locked-down transaction! */
439 if (handle->h_transaction->t_state != T_RUNNING) { 562 if (transaction->t_state != T_RUNNING) {
440 jbd_debug(3, "denied handle %p %d blocks: " 563 jbd_debug(3, "denied handle %p %d blocks: "
441 "transaction not running\n", handle, nblocks); 564 "transaction not running\n", handle, nblocks);
442 goto error_out; 565 goto error_out;
443 } 566 }
444 567
445 spin_lock(&transaction->t_handle_lock); 568 spin_lock(&transaction->t_handle_lock);
446 wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; 569 wanted = atomic_add_return(nblocks,
570 &transaction->t_outstanding_credits);
447 571
448 if (wanted > journal->j_max_transaction_buffers) { 572 if (wanted > journal->j_max_transaction_buffers) {
449 jbd_debug(3, "denied handle %p %d blocks: " 573 jbd_debug(3, "denied handle %p %d blocks: "
450 "transaction too large\n", handle, nblocks); 574 "transaction too large\n", handle, nblocks);
575 atomic_sub(nblocks, &transaction->t_outstanding_credits);
451 goto unlock; 576 goto unlock;
452 } 577 }
453 578
454 if (wanted > __jbd2_log_space_left(journal)) { 579 if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
580 jbd2_log_space_left(journal)) {
455 jbd_debug(3, "denied handle %p %d blocks: " 581 jbd_debug(3, "denied handle %p %d blocks: "
456 "insufficient log space\n", handle, nblocks); 582 "insufficient log space\n", handle, nblocks);
583 atomic_sub(nblocks, &transaction->t_outstanding_credits);
457 goto unlock; 584 goto unlock;
458 } 585 }
459 586
460 trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, 587 trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
461 handle->h_transaction->t_tid, 588 transaction->t_tid,
462 handle->h_type, handle->h_line_no, 589 handle->h_type, handle->h_line_no,
463 handle->h_buffer_credits, 590 handle->h_buffer_credits,
464 nblocks); 591 nblocks);
465 592
466 handle->h_buffer_credits += nblocks; 593 handle->h_buffer_credits += nblocks;
467 handle->h_requested_credits += nblocks; 594 handle->h_requested_credits += nblocks;
468 atomic_add(nblocks, &transaction->t_outstanding_credits);
469 result = 0; 595 result = 0;
470 596
471 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); 597 jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -473,7 +599,6 @@ unlock:
473 spin_unlock(&transaction->t_handle_lock); 599 spin_unlock(&transaction->t_handle_lock);
474error_out: 600error_out:
475 read_unlock(&journal->j_state_lock); 601 read_unlock(&journal->j_state_lock);
476out:
477 return result; 602 return result;
478} 603}
479 604
@@ -490,19 +615,22 @@ out:
490 * to a running handle, a call to jbd2_journal_restart will commit the 615 * to a running handle, a call to jbd2_journal_restart will commit the
491 * handle's transaction so far and reattach the handle to a new 616 * handle's transaction so far and reattach the handle to a new
 492 * transaction capable of guaranteeing the requested number of 617 * transaction capable of guaranteeing the requested number of
 493 * credits. 618 * credits. We preserve the reserved handle if one is attached to the
 619 * passed-in handle.
494 */ 620 */
495int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) 621int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
496{ 622{
497 transaction_t *transaction = handle->h_transaction; 623 transaction_t *transaction = handle->h_transaction;
498 journal_t *journal = transaction->t_journal; 624 journal_t *journal;
499 tid_t tid; 625 tid_t tid;
500 int need_to_start, ret; 626 int need_to_start, ret;
501 627
628 WARN_ON(!transaction);
502 /* If we've had an abort of any type, don't even think about 629 /* If we've had an abort of any type, don't even think about
503 * actually doing the restart! */ 630 * actually doing the restart! */
504 if (is_handle_aborted(handle)) 631 if (is_handle_aborted(handle))
505 return 0; 632 return 0;
633 journal = transaction->t_journal;
506 634
507 /* 635 /*
508 * First unlink the handle from its current transaction, and start the 636 * First unlink the handle from its current transaction, and start the
@@ -515,12 +643,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
515 spin_lock(&transaction->t_handle_lock); 643 spin_lock(&transaction->t_handle_lock);
516 atomic_sub(handle->h_buffer_credits, 644 atomic_sub(handle->h_buffer_credits,
517 &transaction->t_outstanding_credits); 645 &transaction->t_outstanding_credits);
646 if (handle->h_rsv_handle) {
647 sub_reserved_credits(journal,
648 handle->h_rsv_handle->h_buffer_credits);
649 }
518 if (atomic_dec_and_test(&transaction->t_updates)) 650 if (atomic_dec_and_test(&transaction->t_updates))
519 wake_up(&journal->j_wait_updates); 651 wake_up(&journal->j_wait_updates);
652 tid = transaction->t_tid;
520 spin_unlock(&transaction->t_handle_lock); 653 spin_unlock(&transaction->t_handle_lock);
654 handle->h_transaction = NULL;
655 current->journal_info = NULL;
521 656
522 jbd_debug(2, "restarting handle %p\n", handle); 657 jbd_debug(2, "restarting handle %p\n", handle);
523 tid = transaction->t_tid;
524 need_to_start = !tid_geq(journal->j_commit_request, tid); 658 need_to_start = !tid_geq(journal->j_commit_request, tid);
525 read_unlock(&journal->j_state_lock); 659 read_unlock(&journal->j_state_lock);
526 if (need_to_start) 660 if (need_to_start)
@@ -557,6 +691,14 @@ void jbd2_journal_lock_updates(journal_t *journal)
557 write_lock(&journal->j_state_lock); 691 write_lock(&journal->j_state_lock);
558 ++journal->j_barrier_count; 692 ++journal->j_barrier_count;
559 693
694 /* Wait until there are no reserved handles */
695 if (atomic_read(&journal->j_reserved_credits)) {
696 write_unlock(&journal->j_state_lock);
697 wait_event(journal->j_wait_reserved,
698 atomic_read(&journal->j_reserved_credits) == 0);
699 write_lock(&journal->j_state_lock);
700 }
701
560 /* Wait until there are no running updates */ 702 /* Wait until there are no running updates */
561 while (1) { 703 while (1) {
562 transaction_t *transaction = journal->j_running_transaction; 704 transaction_t *transaction = journal->j_running_transaction;
@@ -619,6 +761,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
619 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 761 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
620} 762}
621 763
764static int sleep_on_shadow_bh(void *word)
765{
766 io_schedule();
767 return 0;
768}
769
622/* 770/*
623 * If the buffer is already part of the current transaction, then there 771 * If the buffer is already part of the current transaction, then there
624 * is nothing we need to do. If it is already part of a prior 772 * is nothing we need to do. If it is already part of a prior
@@ -634,17 +782,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
634 int force_copy) 782 int force_copy)
635{ 783{
636 struct buffer_head *bh; 784 struct buffer_head *bh;
637 transaction_t *transaction; 785 transaction_t *transaction = handle->h_transaction;
638 journal_t *journal; 786 journal_t *journal;
639 int error; 787 int error;
640 char *frozen_buffer = NULL; 788 char *frozen_buffer = NULL;
641 int need_copy = 0; 789 int need_copy = 0;
642 unsigned long start_lock, time_lock; 790 unsigned long start_lock, time_lock;
643 791
792 WARN_ON(!transaction);
644 if (is_handle_aborted(handle)) 793 if (is_handle_aborted(handle))
645 return -EROFS; 794 return -EROFS;
646
647 transaction = handle->h_transaction;
648 journal = transaction->t_journal; 795 journal = transaction->t_journal;
649 796
650 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); 797 jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -754,41 +901,29 @@ repeat:
754 * journaled. If the primary copy is already going to 901 * journaled. If the primary copy is already going to
755 * disk then we cannot do copy-out here. */ 902 * disk then we cannot do copy-out here. */
756 903
757 if (jh->b_jlist == BJ_Shadow) { 904 if (buffer_shadow(bh)) {
758 DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
759 wait_queue_head_t *wqh;
760
761 wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
762
763 JBUFFER_TRACE(jh, "on shadow: sleep"); 905 JBUFFER_TRACE(jh, "on shadow: sleep");
764 jbd_unlock_bh_state(bh); 906 jbd_unlock_bh_state(bh);
765 /* commit wakes up all shadow buffers after IO */ 907 wait_on_bit(&bh->b_state, BH_Shadow,
766 for ( ; ; ) { 908 sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
767 prepare_to_wait(wqh, &wait.wait,
768 TASK_UNINTERRUPTIBLE);
769 if (jh->b_jlist != BJ_Shadow)
770 break;
771 schedule();
772 }
773 finish_wait(wqh, &wait.wait);
774 goto repeat; 909 goto repeat;
775 } 910 }
776 911
777 /* Only do the copy if the currently-owning transaction 912 /*
778 * still needs it. If it is on the Forget list, the 913 * Only do the copy if the currently-owning transaction still
 779 * committing transaction is past that stage. The 914 * needs it. If the buffer isn't on the BJ_Metadata list, the
780 * buffer had better remain locked during the kmalloc, 915 * committing transaction is past that stage (here we use the
781 * but that should be true --- we hold the journal lock 916 * fact that BH_Shadow is set under bh_state lock together with
782 * still and the buffer is already on the BUF_JOURNAL 917 * refiling to BJ_Shadow list and at this point we know the
783 * list so won't be flushed. 918 * buffer doesn't have BH_Shadow set).
784 * 919 *
785 * Subtle point, though: if this is a get_undo_access, 920 * Subtle point, though: if this is a get_undo_access,
786 * then we will be relying on the frozen_data to contain 921 * then we will be relying on the frozen_data to contain
787 * the new value of the committed_data record after the 922 * the new value of the committed_data record after the
788 * transaction, so we HAVE to force the frozen_data copy 923 * transaction, so we HAVE to force the frozen_data copy
789 * in that case. */ 924 * in that case.
790 925 */
791 if (jh->b_jlist != BJ_Forget || force_copy) { 926 if (jh->b_jlist == BJ_Metadata || force_copy) {
792 JBUFFER_TRACE(jh, "generate frozen data"); 927 JBUFFER_TRACE(jh, "generate frozen data");
793 if (!frozen_buffer) { 928 if (!frozen_buffer) {
794 JBUFFER_TRACE(jh, "allocate memory for buffer"); 929 JBUFFER_TRACE(jh, "allocate memory for buffer");
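
In this kernel generation wait_on_bit() takes an action callback that performs the actual sleep; sleep_on_shadow_bh() (added earlier in this patch) just calls io_schedule(). The matching waker on the commit side presumably clears the bit and wakes the bit waitqueue, following the ordering rule from the comment deleted in commit.c; a sketch of the pairing:

	/* waiter: sleeps until BH_Shadow is cleared */
	wait_on_bit(&bh->b_state, BH_Shadow, sleep_on_shadow_bh,
		    TASK_UNINTERRUPTIBLE);

	/* waker (sketch): publish the state change before the wakeup */
	clear_buffer_shadow(bh);
	smp_mb();
	wake_up_bit(&bh->b_state, BH_Shadow);
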
@@ -915,14 +1050,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
915int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) 1050int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
916{ 1051{
917 transaction_t *transaction = handle->h_transaction; 1052 transaction_t *transaction = handle->h_transaction;
918 journal_t *journal = transaction->t_journal; 1053 journal_t *journal;
919 struct journal_head *jh = jbd2_journal_add_journal_head(bh); 1054 struct journal_head *jh = jbd2_journal_add_journal_head(bh);
920 int err; 1055 int err;
921 1056
922 jbd_debug(5, "journal_head %p\n", jh); 1057 jbd_debug(5, "journal_head %p\n", jh);
1058 WARN_ON(!transaction);
923 err = -EROFS; 1059 err = -EROFS;
924 if (is_handle_aborted(handle)) 1060 if (is_handle_aborted(handle))
925 goto out; 1061 goto out;
1062 journal = transaction->t_journal;
926 err = 0; 1063 err = 0;
927 1064
928 JBUFFER_TRACE(jh, "entry"); 1065 JBUFFER_TRACE(jh, "entry");
@@ -1128,12 +1265,14 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
1128int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) 1265int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1129{ 1266{
1130 transaction_t *transaction = handle->h_transaction; 1267 transaction_t *transaction = handle->h_transaction;
1131 journal_t *journal = transaction->t_journal; 1268 journal_t *journal;
1132 struct journal_head *jh; 1269 struct journal_head *jh;
1133 int ret = 0; 1270 int ret = 0;
1134 1271
1272 WARN_ON(!transaction);
1135 if (is_handle_aborted(handle)) 1273 if (is_handle_aborted(handle))
1136 goto out; 1274 return -EROFS;
1275 journal = transaction->t_journal;
1137 jh = jbd2_journal_grab_journal_head(bh); 1276 jh = jbd2_journal_grab_journal_head(bh);
1138 if (!jh) { 1277 if (!jh) {
1139 ret = -EUCLEAN; 1278 ret = -EUCLEAN;
@@ -1227,7 +1366,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1227 1366
1228 JBUFFER_TRACE(jh, "file as BJ_Metadata"); 1367 JBUFFER_TRACE(jh, "file as BJ_Metadata");
1229 spin_lock(&journal->j_list_lock); 1368 spin_lock(&journal->j_list_lock);
1230 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); 1369 __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
1231 spin_unlock(&journal->j_list_lock); 1370 spin_unlock(&journal->j_list_lock);
1232out_unlock_bh: 1371out_unlock_bh:
1233 jbd_unlock_bh_state(bh); 1372 jbd_unlock_bh_state(bh);
@@ -1258,12 +1397,17 @@ out:
1258int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) 1397int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1259{ 1398{
1260 transaction_t *transaction = handle->h_transaction; 1399 transaction_t *transaction = handle->h_transaction;
1261 journal_t *journal = transaction->t_journal; 1400 journal_t *journal;
1262 struct journal_head *jh; 1401 struct journal_head *jh;
1263 int drop_reserve = 0; 1402 int drop_reserve = 0;
1264 int err = 0; 1403 int err = 0;
1265 int was_modified = 0; 1404 int was_modified = 0;
1266 1405
1406 WARN_ON(!transaction);
1407 if (is_handle_aborted(handle))
1408 return -EROFS;
1409 journal = transaction->t_journal;
1410
1267 BUFFER_TRACE(bh, "entry"); 1411 BUFFER_TRACE(bh, "entry");
1268 1412
1269 jbd_lock_bh_state(bh); 1413 jbd_lock_bh_state(bh);
@@ -1290,7 +1434,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1290 */ 1434 */
1291 jh->b_modified = 0; 1435 jh->b_modified = 0;
1292 1436
1293 if (jh->b_transaction == handle->h_transaction) { 1437 if (jh->b_transaction == transaction) {
1294 J_ASSERT_JH(jh, !jh->b_frozen_data); 1438 J_ASSERT_JH(jh, !jh->b_frozen_data);
1295 1439
1296 /* If we are forgetting a buffer which is already part 1440 /* If we are forgetting a buffer which is already part
@@ -1385,19 +1529,21 @@ drop:
1385int jbd2_journal_stop(handle_t *handle) 1529int jbd2_journal_stop(handle_t *handle)
1386{ 1530{
1387 transaction_t *transaction = handle->h_transaction; 1531 transaction_t *transaction = handle->h_transaction;
1388 journal_t *journal = transaction->t_journal; 1532 journal_t *journal;
1389 int err, wait_for_commit = 0; 1533 int err = 0, wait_for_commit = 0;
1390 tid_t tid; 1534 tid_t tid;
1391 pid_t pid; 1535 pid_t pid;
1392 1536
1537 if (!transaction)
1538 goto free_and_exit;
1539 journal = transaction->t_journal;
1540
1393 J_ASSERT(journal_current_handle() == handle); 1541 J_ASSERT(journal_current_handle() == handle);
1394 1542
1395 if (is_handle_aborted(handle)) 1543 if (is_handle_aborted(handle))
1396 err = -EIO; 1544 err = -EIO;
1397 else { 1545 else
1398 J_ASSERT(atomic_read(&transaction->t_updates) > 0); 1546 J_ASSERT(atomic_read(&transaction->t_updates) > 0);
1399 err = 0;
1400 }
1401 1547
1402 if (--handle->h_ref > 0) { 1548 if (--handle->h_ref > 0) {
1403 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, 1549 jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1407,7 +1553,7 @@ int jbd2_journal_stop(handle_t *handle)
1407 1553
1408 jbd_debug(4, "Handle %p going down\n", handle); 1554 jbd_debug(4, "Handle %p going down\n", handle);
1409 trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, 1555 trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
1410 handle->h_transaction->t_tid, 1556 transaction->t_tid,
1411 handle->h_type, handle->h_line_no, 1557 handle->h_type, handle->h_line_no,
1412 jiffies - handle->h_start_jiffies, 1558 jiffies - handle->h_start_jiffies,
1413 handle->h_sync, handle->h_requested_credits, 1559 handle->h_sync, handle->h_requested_credits,
@@ -1518,33 +1664,13 @@ int jbd2_journal_stop(handle_t *handle)
1518 1664
1519 lock_map_release(&handle->h_lockdep_map); 1665 lock_map_release(&handle->h_lockdep_map);
1520 1666
1667 if (handle->h_rsv_handle)
1668 jbd2_journal_free_reserved(handle->h_rsv_handle);
1669free_and_exit:
1521 jbd2_free_handle(handle); 1670 jbd2_free_handle(handle);
1522 return err; 1671 return err;
1523} 1672}
1524 1673
1525/**
1526 * int jbd2_journal_force_commit() - force any uncommitted transactions
1527 * @journal: journal to force
1528 *
1529 * For synchronous operations: force any uncommitted transactions
1530 * to disk. May seem kludgy, but it reuses all the handle batching
1531 * code in a very simple manner.
1532 */
1533int jbd2_journal_force_commit(journal_t *journal)
1534{
1535 handle_t *handle;
1536 int ret;
1537
1538 handle = jbd2_journal_start(journal, 1);
1539 if (IS_ERR(handle)) {
1540 ret = PTR_ERR(handle);
1541 } else {
1542 handle->h_sync = 1;
1543 ret = jbd2_journal_stop(handle);
1544 }
1545 return ret;
1546}
1547
1548/* 1674/*
1549 * 1675 *
1550 * List management code snippets: various functions for manipulating the 1676 * List management code snippets: various functions for manipulating the
@@ -1601,10 +1727,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1601 * Remove a buffer from the appropriate transaction list. 1727 * Remove a buffer from the appropriate transaction list.
1602 * 1728 *
1603 * Note that this function can *change* the value of 1729 * Note that this function can *change* the value of
1604 * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, 1730 * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
1605 * t_log_list or t_reserved_list. If the caller is holding onto a copy of one 1731 * t_reserved_list. If the caller is holding onto a copy of one of these
1606 * of these pointers, it could go bad. Generally the caller needs to re-read 1732 * pointers, it could go bad. Generally the caller needs to re-read the
1607 * the pointer from the transaction_t. 1733 * pointer from the transaction_t.
1608 * 1734 *
1609 * Called under j_list_lock. 1735 * Called under j_list_lock.
1610 */ 1736 */
@@ -1634,15 +1760,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1634 case BJ_Forget: 1760 case BJ_Forget:
1635 list = &transaction->t_forget; 1761 list = &transaction->t_forget;
1636 break; 1762 break;
1637 case BJ_IO:
1638 list = &transaction->t_iobuf_list;
1639 break;
1640 case BJ_Shadow: 1763 case BJ_Shadow:
1641 list = &transaction->t_shadow_list; 1764 list = &transaction->t_shadow_list;
1642 break; 1765 break;
1643 case BJ_LogCtl:
1644 list = &transaction->t_log_list;
1645 break;
1646 case BJ_Reserved: 1766 case BJ_Reserved:
1647 list = &transaction->t_reserved_list; 1767 list = &transaction->t_reserved_list;
1648 break; 1768 break;
@@ -2034,18 +2154,23 @@ zap_buffer_unlocked:
2034 * void jbd2_journal_invalidatepage() 2154 * void jbd2_journal_invalidatepage()
2035 * @journal: journal to use for flush... 2155 * @journal: journal to use for flush...
2036 * @page: page to flush 2156 * @page: page to flush
2037 * @offset: length of page to invalidate. 2157 * @offset: start of the range to invalidate
2158 * @length: length of the range to invalidate
2038 * 2159 *
2039 * Reap page buffers containing data after offset in page. Can return -EBUSY 2160 * Reap page buffers containing data in the specified range of the page.
2040 * if buffers are part of the committing transaction and the page is straddling 2161 * Can return -EBUSY if buffers are part of the committing transaction and
2041 * i_size. Caller then has to wait for current commit and try again. 2162 * the page is straddling i_size. Caller then has to wait for current commit
2163 * and try again.
2042 */ 2164 */
2043int jbd2_journal_invalidatepage(journal_t *journal, 2165int jbd2_journal_invalidatepage(journal_t *journal,
2044 struct page *page, 2166 struct page *page,
2045 unsigned long offset) 2167 unsigned int offset,
2168 unsigned int length)
2046{ 2169{
2047 struct buffer_head *head, *bh, *next; 2170 struct buffer_head *head, *bh, *next;
2171 unsigned int stop = offset + length;
2048 unsigned int curr_off = 0; 2172 unsigned int curr_off = 0;
2173 int partial_page = (offset || length < PAGE_CACHE_SIZE);
2049 int may_free = 1; 2174 int may_free = 1;
2050 int ret = 0; 2175 int ret = 0;
2051 2176
@@ -2054,6 +2179,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
2054 if (!page_has_buffers(page)) 2179 if (!page_has_buffers(page))
2055 return 0; 2180 return 0;
2056 2181
2182 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
2183
2057 /* We will potentially be playing with lists other than just the 2184 /* We will potentially be playing with lists other than just the
2058 * data lists (especially for journaled data mode), so be 2185 * data lists (especially for journaled data mode), so be
2059 * cautious in our locking. */ 2186 * cautious in our locking. */
@@ -2063,10 +2190,13 @@ int jbd2_journal_invalidatepage(journal_t *journal,
2063 unsigned int next_off = curr_off + bh->b_size; 2190 unsigned int next_off = curr_off + bh->b_size;
2064 next = bh->b_this_page; 2191 next = bh->b_this_page;
2065 2192
2193 if (next_off > stop)
2194 return 0;
2195
2066 if (offset <= curr_off) { 2196 if (offset <= curr_off) {
2067 /* This block is wholly outside the truncation point */ 2197 /* This block is wholly outside the truncation point */
2068 lock_buffer(bh); 2198 lock_buffer(bh);
2069 ret = journal_unmap_buffer(journal, bh, offset > 0); 2199 ret = journal_unmap_buffer(journal, bh, partial_page);
2070 unlock_buffer(bh); 2200 unlock_buffer(bh);
2071 if (ret < 0) 2201 if (ret < 0)
2072 return ret; 2202 return ret;
@@ -2077,7 +2207,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
2077 2207
2078 } while (bh != head); 2208 } while (bh != head);
2079 2209
2080 if (!offset) { 2210 if (!partial_page) {
2081 if (may_free && try_to_free_buffers(page)) 2211 if (may_free && try_to_free_buffers(page))
2082 J_ASSERT(!page_has_buffers(page)); 2212 J_ASSERT(!page_has_buffers(page));
2083 } 2213 }
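
The jbd2_journal_invalidatepage() hunks above are part of the series teaching ->invalidatepage about sub-page ranges: the new length argument bounds the buffer walk at stop = offset + length, and partial_page replaces the old !offset test so buffers are only freed when the whole page is going away. A minimal user-space sketch of just that range arithmetic (PAGE_SIZE and walk_buffers() are stand-ins, not the kernel code):

#include <stdio.h>

#define PAGE_SIZE 4096u   /* stand-in for PAGE_CACHE_SIZE */

/* Walk a page's buffers and report which fall wholly inside the
 * invalidated range [offset, offset + length); mirrors the new
 * next_off/stop/partial_page logic above in miniature. */
static void walk_buffers(unsigned int offset, unsigned int length,
                         unsigned int bh_size)
{
    unsigned int stop = offset + length;
    int partial_page = (offset || length < PAGE_SIZE);
    unsigned int curr_off;

    for (curr_off = 0; curr_off < PAGE_SIZE; curr_off += bh_size) {
        unsigned int next_off = curr_off + bh_size;

        if (next_off > stop)          /* buffer ends past the range */
            return;
        if (offset <= curr_off)       /* buffer wholly inside range */
            printf("invalidate buffer at %u (partial_page=%d)\n",
                   curr_off, partial_page);
    }
}

int main(void)
{
    walk_buffers(0, PAGE_SIZE, 1024);   /* whole page: all 4 buffers */
    walk_buffers(1024, 2048, 1024);     /* bytes 1024..3071 only */
    return 0;
}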
@@ -2138,15 +2268,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
2138 case BJ_Forget: 2268 case BJ_Forget:
2139 list = &transaction->t_forget; 2269 list = &transaction->t_forget;
2140 break; 2270 break;
2141 case BJ_IO:
2142 list = &transaction->t_iobuf_list;
2143 break;
2144 case BJ_Shadow: 2271 case BJ_Shadow:
2145 list = &transaction->t_shadow_list; 2272 list = &transaction->t_shadow_list;
2146 break; 2273 break;
2147 case BJ_LogCtl:
2148 list = &transaction->t_log_list;
2149 break;
2150 case BJ_Reserved: 2274 case BJ_Reserved:
2151 list = &transaction->t_reserved_list; 2275 list = &transaction->t_reserved_list;
2152 break; 2276 break;
@@ -2248,10 +2372,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2248int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) 2372int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
2249{ 2373{
2250 transaction_t *transaction = handle->h_transaction; 2374 transaction_t *transaction = handle->h_transaction;
2251 journal_t *journal = transaction->t_journal; 2375 journal_t *journal;
2252 2376
2377 WARN_ON(!transaction);
2253 if (is_handle_aborted(handle)) 2378 if (is_handle_aborted(handle))
2254 return -EIO; 2379 return -EROFS;
2380 journal = transaction->t_journal;
2255 2381
2256 jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, 2382 jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2257 transaction->t_tid); 2383 transaction->t_tid);
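
Every jbd2 hunk above makes the same defensive move: handle->h_transaction is no longer trusted unconditionally, so transaction->t_journal is read only after the WARN_ON/abort checks have passed. A minimal user-space sketch of the shape of that pattern (all types and the error value are stand-ins for the kernel's handle_t/transaction_t and -EROFS):

#include <stdio.h>

struct journal { const char *name; };
struct transaction { struct journal *t_journal; };
struct handle {
    struct transaction *h_transaction;
    int h_aborted;
};

static int is_handle_aborted(const struct handle *h)
{
    return h->h_aborted;
}

/* Only dereference transaction->t_journal once we know the handle
 * is live; an aborted/finished handle may not have a usable one. */
static int journal_op(struct handle *handle)
{
    struct transaction *transaction = handle->h_transaction;
    struct journal *journal;

    if (!transaction || is_handle_aborted(handle))
        return -30;                     /* stand-in for -EROFS */
    journal = transaction->t_journal;   /* safe past the checks */

    printf("operating on journal %s\n", journal->name);
    return 0;
}

int main(void)
{
    struct journal j = { "jbd2" };
    struct transaction t = { &j };
    struct handle live = { &t, 0 };
    struct handle dead = { &t, 1 };

    printf("live: %d\n", journal_op(&live));
    printf("dead: %d\n", journal_op(&dead));
    return 0;
}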
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index acd46a4160cb..e3aac222472e 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -22,7 +22,7 @@
22#include <linux/time.h> 22#include <linux/time.h>
23#include "nodelist.h" 23#include "nodelist.h"
24 24
25static int jffs2_readdir (struct file *, void *, filldir_t); 25static int jffs2_readdir (struct file *, struct dir_context *);
26 26
27static int jffs2_create (struct inode *,struct dentry *,umode_t, 27static int jffs2_create (struct inode *,struct dentry *,umode_t,
28 bool); 28 bool);
@@ -40,7 +40,7 @@ static int jffs2_rename (struct inode *, struct dentry *,
40const struct file_operations jffs2_dir_operations = 40const struct file_operations jffs2_dir_operations =
41{ 41{
42 .read = generic_read_dir, 42 .read = generic_read_dir,
43 .readdir = jffs2_readdir, 43 .iterate = jffs2_readdir,
44 .unlocked_ioctl=jffs2_ioctl, 44 .unlocked_ioctl=jffs2_ioctl,
45 .fsync = jffs2_fsync, 45 .fsync = jffs2_fsync,
46 .llseek = generic_file_llseek, 46 .llseek = generic_file_llseek,
@@ -114,60 +114,40 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
114/***********************************************************************/ 114/***********************************************************************/
115 115
116 116
117static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) 117static int jffs2_readdir(struct file *file, struct dir_context *ctx)
118{ 118{
119 struct jffs2_inode_info *f; 119 struct inode *inode = file_inode(file);
120 struct inode *inode = file_inode(filp); 120 struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
121 struct jffs2_full_dirent *fd; 121 struct jffs2_full_dirent *fd;
122 unsigned long offset, curofs; 122 unsigned long curofs = 1;
123 123
124 jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", 124 jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino);
125 file_inode(filp)->i_ino);
126 125
127 f = JFFS2_INODE_INFO(inode); 126 if (!dir_emit_dots(file, ctx))
128 127 return 0;
129 offset = filp->f_pos;
130
131 if (offset == 0) {
132 jffs2_dbg(1, "Dirent 0: \".\", ino #%lu\n", inode->i_ino);
133 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
134 goto out;
135 offset++;
136 }
137 if (offset == 1) {
138 unsigned long pino = parent_ino(filp->f_path.dentry);
139 jffs2_dbg(1, "Dirent 1: \"..\", ino #%lu\n", pino);
140 if (filldir(dirent, "..", 2, 1, pino, DT_DIR) < 0)
141 goto out;
142 offset++;
143 }
144 128
145 curofs=1;
146 mutex_lock(&f->sem); 129 mutex_lock(&f->sem);
147 for (fd = f->dents; fd; fd = fd->next) { 130 for (fd = f->dents; fd; fd = fd->next) {
148
149 curofs++; 131 curofs++;
150 /* First loop: curofs = 2; offset = 2 */ 132 /* First loop: curofs = 2; pos = 2 */
151 if (curofs < offset) { 133 if (curofs < ctx->pos) {
152 jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n", 134 jffs2_dbg(2, "Skipping dirent: \"%s\", ino #%u, type %d, because curofs %ld < offset %ld\n",
153 fd->name, fd->ino, fd->type, curofs, offset); 135 fd->name, fd->ino, fd->type, curofs, (unsigned long)ctx->pos);
154 continue; 136 continue;
155 } 137 }
156 if (!fd->ino) { 138 if (!fd->ino) {
157 jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n", 139 jffs2_dbg(2, "Skipping deletion dirent \"%s\"\n",
158 fd->name); 140 fd->name);
159 offset++; 141 ctx->pos++;
160 continue; 142 continue;
161 } 143 }
162 jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n", 144 jffs2_dbg(2, "Dirent %ld: \"%s\", ino #%u, type %d\n",
163 offset, fd->name, fd->ino, fd->type); 145 (unsigned long)ctx->pos, fd->name, fd->ino, fd->type);
164 if (filldir(dirent, fd->name, strlen(fd->name), offset, fd->ino, fd->type) < 0) 146 if (!dir_emit(ctx, fd->name, strlen(fd->name), fd->ino, fd->type))
165 break; 147 break;
166 offset++; 148 ctx->pos++;
167 } 149 }
168 mutex_unlock(&f->sem); 150 mutex_unlock(&f->sem);
169 out:
170 filp->f_pos = offset;
171 return 0; 151 return 0;
172} 152}
173 153
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 9a55f53be5ff..370d7b6c5942 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -346,8 +346,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
346 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", 346 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
347 (unsigned long long) blkno, 347 (unsigned long long) blkno,
348 (unsigned long long) nblocks); 348 (unsigned long long) nblocks);
349 jfs_error(ip->i_sb, 349 jfs_error(ip->i_sb, "block to be freed is outside the map\n");
350 "dbFree: block to be freed is outside the map");
351 return -EIO; 350 return -EIO;
352 } 351 }
353 352
@@ -384,7 +383,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
384 383
385 /* free the blocks. */ 384 /* free the blocks. */
386 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { 385 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
387 jfs_error(ip->i_sb, "dbFree: error in block map\n"); 386 jfs_error(ip->i_sb, "error in block map\n");
388 release_metapage(mp); 387 release_metapage(mp);
389 IREAD_UNLOCK(ipbmap); 388 IREAD_UNLOCK(ipbmap);
390 return (rc); 389 return (rc);
@@ -441,8 +440,7 @@ dbUpdatePMap(struct inode *ipbmap,
441 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", 440 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
442 (unsigned long long) blkno, 441 (unsigned long long) blkno,
443 (unsigned long long) nblocks); 442 (unsigned long long) nblocks);
444 jfs_error(ipbmap->i_sb, 443 jfs_error(ipbmap->i_sb, "blocks are outside the map\n");
445 "dbUpdatePMap: blocks are outside the map");
446 return -EIO; 444 return -EIO;
447 } 445 }
448 446
@@ -726,7 +724,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
726 724
727 /* the hint should be within the map */ 725 /* the hint should be within the map */
728 if (hint >= mapSize) { 726 if (hint >= mapSize) {
729 jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map"); 727 jfs_error(ip->i_sb, "the hint is outside the map\n");
730 return -EIO; 728 return -EIO;
731 } 729 }
732 730
@@ -1057,8 +1055,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1057 bmp = sbi->bmap; 1055 bmp = sbi->bmap;
1058 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { 1056 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
1059 IREAD_UNLOCK(ipbmap); 1057 IREAD_UNLOCK(ipbmap);
1060 jfs_error(ip->i_sb, 1058 jfs_error(ip->i_sb, "the block is outside the filesystem\n");
1061 "dbExtend: the block is outside the filesystem");
1062 return -EIO; 1059 return -EIO;
1063 } 1060 }
1064 1061
@@ -1134,8 +1131,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1134 u32 mask; 1131 u32 mask;
1135 1132
1136 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { 1133 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1137 jfs_error(bmp->db_ipbmap->i_sb, 1134 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
1138 "dbAllocNext: Corrupt dmap page");
1139 return -EIO; 1135 return -EIO;
1140 } 1136 }
1141 1137
@@ -1265,8 +1261,7 @@ dbAllocNear(struct bmap * bmp,
1265 s8 *leaf; 1261 s8 *leaf;
1266 1262
1267 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { 1263 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1268 jfs_error(bmp->db_ipbmap->i_sb, 1264 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
1269 "dbAllocNear: Corrupt dmap page");
1270 return -EIO; 1265 return -EIO;
1271 } 1266 }
1272 1267
@@ -1381,8 +1376,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1381 */ 1376 */
1382 if (l2nb > bmp->db_agl2size) { 1377 if (l2nb > bmp->db_agl2size) {
1383 jfs_error(bmp->db_ipbmap->i_sb, 1378 jfs_error(bmp->db_ipbmap->i_sb,
1384 "dbAllocAG: allocation request is larger than the " 1379 "allocation request is larger than the allocation group size\n");
1385 "allocation group size");
1386 return -EIO; 1380 return -EIO;
1387 } 1381 }
1388 1382
@@ -1417,7 +1411,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1417 (unsigned long long) blkno, 1411 (unsigned long long) blkno,
1418 (unsigned long long) nblocks); 1412 (unsigned long long) nblocks);
1419 jfs_error(bmp->db_ipbmap->i_sb, 1413 jfs_error(bmp->db_ipbmap->i_sb,
1420 "dbAllocAG: dbAllocCtl failed in free AG"); 1414 "dbAllocCtl failed in free AG\n");
1421 } 1415 }
1422 return (rc); 1416 return (rc);
1423 } 1417 }
@@ -1433,8 +1427,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1433 budmin = dcp->budmin; 1427 budmin = dcp->budmin;
1434 1428
1435 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 1429 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1436 jfs_error(bmp->db_ipbmap->i_sb, 1430 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
1437 "dbAllocAG: Corrupt dmapctl page");
1438 release_metapage(mp); 1431 release_metapage(mp);
1439 return -EIO; 1432 return -EIO;
1440 } 1433 }
@@ -1475,7 +1468,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1475 } 1468 }
1476 if (n == 4) { 1469 if (n == 4) {
1477 jfs_error(bmp->db_ipbmap->i_sb, 1470 jfs_error(bmp->db_ipbmap->i_sb,
1478 "dbAllocAG: failed descending stree"); 1471 "failed descending stree\n");
1479 release_metapage(mp); 1472 release_metapage(mp);
1480 return -EIO; 1473 return -EIO;
1481 } 1474 }
@@ -1515,8 +1508,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1515 &blkno))) { 1508 &blkno))) {
1516 if (rc == -ENOSPC) { 1509 if (rc == -ENOSPC) {
1517 jfs_error(bmp->db_ipbmap->i_sb, 1510 jfs_error(bmp->db_ipbmap->i_sb,
1518 "dbAllocAG: control page " 1511 "control page inconsistent\n");
1519 "inconsistent");
1520 return -EIO; 1512 return -EIO;
1521 } 1513 }
1522 return (rc); 1514 return (rc);
@@ -1528,7 +1520,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1528 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); 1520 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1529 if (rc == -ENOSPC) { 1521 if (rc == -ENOSPC) {
1530 jfs_error(bmp->db_ipbmap->i_sb, 1522 jfs_error(bmp->db_ipbmap->i_sb,
1531 "dbAllocAG: unable to allocate blocks"); 1523 "unable to allocate blocks\n");
1532 rc = -EIO; 1524 rc = -EIO;
1533 } 1525 }
1534 return (rc); 1526 return (rc);
@@ -1587,8 +1579,7 @@ static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1587 */ 1579 */
1588 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); 1580 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1589 if (rc == -ENOSPC) { 1581 if (rc == -ENOSPC) {
1590 jfs_error(bmp->db_ipbmap->i_sb, 1582 jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n");
1591 "dbAllocAny: unable to allocate blocks");
1592 return -EIO; 1583 return -EIO;
1593 } 1584 }
1594 return (rc); 1585 return (rc);
@@ -1652,8 +1643,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
1652 range_cnt = min_t(u64, max_ranges + 1, 32 * 1024); 1643 range_cnt = min_t(u64, max_ranges + 1, 32 * 1024);
1653 totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS); 1644 totrim = kmalloc(sizeof(struct range2trim) * range_cnt, GFP_NOFS);
1654 if (totrim == NULL) { 1645 if (totrim == NULL) {
1655 jfs_error(bmp->db_ipbmap->i_sb, 1646 jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n");
1656 "dbDiscardAG: no memory for trim array");
1657 IWRITE_UNLOCK(ipbmap); 1647 IWRITE_UNLOCK(ipbmap);
1658 return 0; 1648 return 0;
1659 } 1649 }
@@ -1682,8 +1672,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
1682 nblocks = 1 << l2nb; 1672 nblocks = 1 << l2nb;
1683 } else { 1673 } else {
1684 /* Trim any already allocated blocks */ 1674 /* Trim any already allocated blocks */
1685 jfs_error(bmp->db_ipbmap->i_sb, 1675 jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n");
1686 "dbDiscardAG: -EIO");
1687 break; 1676 break;
1688 } 1677 }
1689 1678
@@ -1761,7 +1750,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1761 1750
1762 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 1751 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1763 jfs_error(bmp->db_ipbmap->i_sb, 1752 jfs_error(bmp->db_ipbmap->i_sb,
1764 "dbFindCtl: Corrupt dmapctl page"); 1753 "Corrupt dmapctl page\n");
1765 release_metapage(mp); 1754 release_metapage(mp);
1766 return -EIO; 1755 return -EIO;
1767 } 1756 }
@@ -1782,7 +1771,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1782 if (rc) { 1771 if (rc) {
1783 if (lev != level) { 1772 if (lev != level) {
1784 jfs_error(bmp->db_ipbmap->i_sb, 1773 jfs_error(bmp->db_ipbmap->i_sb,
1785 "dbFindCtl: dmap inconsistent"); 1774 "dmap inconsistent\n");
1786 return -EIO; 1775 return -EIO;
1787 } 1776 }
1788 return -ENOSPC; 1777 return -ENOSPC;
@@ -1906,7 +1895,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1906 if (dp->tree.stree[ROOT] != L2BPERDMAP) { 1895 if (dp->tree.stree[ROOT] != L2BPERDMAP) {
1907 release_metapage(mp); 1896 release_metapage(mp);
1908 jfs_error(bmp->db_ipbmap->i_sb, 1897 jfs_error(bmp->db_ipbmap->i_sb,
1909 "dbAllocCtl: the dmap is not all free"); 1898 "the dmap is not all free\n");
1910 rc = -EIO; 1899 rc = -EIO;
1911 goto backout; 1900 goto backout;
1912 } 1901 }
@@ -1953,7 +1942,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1953 * to indicate that we have leaked blocks. 1942 * to indicate that we have leaked blocks.
1954 */ 1943 */
1955 jfs_error(bmp->db_ipbmap->i_sb, 1944 jfs_error(bmp->db_ipbmap->i_sb,
1956 "dbAllocCtl: I/O Error: Block Leakage."); 1945 "I/O Error: Block Leakage\n");
1957 continue; 1946 continue;
1958 } 1947 }
1959 dp = (struct dmap *) mp->data; 1948 dp = (struct dmap *) mp->data;
@@ -1965,8 +1954,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1965 * to indicate that we have leaked blocks. 1954 * to indicate that we have leaked blocks.
1966 */ 1955 */
1967 release_metapage(mp); 1956 release_metapage(mp);
1968 jfs_error(bmp->db_ipbmap->i_sb, 1957 jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n");
1969 "dbAllocCtl: Block Leakage.");
1970 continue; 1958 continue;
1971 } 1959 }
1972 1960
@@ -2263,8 +2251,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2263 for (; nwords > 0; nwords -= nw) { 2251 for (; nwords > 0; nwords -= nw) {
2264 if (leaf[word] < BUDMIN) { 2252 if (leaf[word] < BUDMIN) {
2265 jfs_error(bmp->db_ipbmap->i_sb, 2253 jfs_error(bmp->db_ipbmap->i_sb,
2266 "dbAllocBits: leaf page " 2254 "leaf page corrupt\n");
2267 "corrupt");
2268 break; 2255 break;
2269 } 2256 }
2270 2257
@@ -2536,8 +2523,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2536 dcp = (struct dmapctl *) mp->data; 2523 dcp = (struct dmapctl *) mp->data;
2537 2524
2538 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 2525 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
2539 jfs_error(bmp->db_ipbmap->i_sb, 2526 jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
2540 "dbAdjCtl: Corrupt dmapctl page");
2541 release_metapage(mp); 2527 release_metapage(mp);
2542 return -EIO; 2528 return -EIO;
2543 } 2529 }
@@ -2638,8 +2624,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2638 assert(level == bmp->db_maxlevel); 2624 assert(level == bmp->db_maxlevel);
2639 if (bmp->db_maxfreebud != oldroot) { 2625 if (bmp->db_maxfreebud != oldroot) {
2640 jfs_error(bmp->db_ipbmap->i_sb, 2626 jfs_error(bmp->db_ipbmap->i_sb,
2641 "dbAdjCtl: the maximum free buddy is " 2627 "the maximum free buddy is not the old root\n");
2642 "not the old root");
2643 } 2628 }
2644 bmp->db_maxfreebud = dcp->stree[ROOT]; 2629 bmp->db_maxfreebud = dcp->stree[ROOT];
2645 } 2630 }
@@ -3481,7 +3466,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3481 p = BMAPBLKNO + nbperpage; /* L2 page */ 3466 p = BMAPBLKNO + nbperpage; /* L2 page */
3482 l2mp = read_metapage(ipbmap, p, PSIZE, 0); 3467 l2mp = read_metapage(ipbmap, p, PSIZE, 0);
3483 if (!l2mp) { 3468 if (!l2mp) {
3484 jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read"); 3469 jfs_error(ipbmap->i_sb, "L2 page could not be read\n");
3485 return -EIO; 3470 return -EIO;
3486 } 3471 }
3487 l2dcp = (struct dmapctl *) l2mp->data; 3472 l2dcp = (struct dmapctl *) l2mp->data;
@@ -3646,8 +3631,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3646 } 3631 }
3647 } /* for each L1 in a L2 */ 3632 } /* for each L1 in a L2 */
3648 3633
3649 jfs_error(ipbmap->i_sb, 3634 jfs_error(ipbmap->i_sb, "function has not returned as expected\n");
3650 "dbExtendFS: function has not returned as expected");
3651errout: 3635errout:
3652 if (l0mp) 3636 if (l0mp)
3653 release_metapage(l0mp); 3637 release_metapage(l0mp);
@@ -3717,7 +3701,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3717 } 3701 }
3718 if (bmp->db_agpref >= bmp->db_numag) { 3702 if (bmp->db_agpref >= bmp->db_numag) {
3719 jfs_error(ipbmap->i_sb, 3703 jfs_error(ipbmap->i_sb,
3720 "cannot find ag with average freespace"); 3704 "cannot find ag with average freespace\n");
3721 } 3705 }
3722 } 3706 }
3723 3707
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0ddbeceafc62..8743ba9c6742 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -124,21 +124,21 @@ struct dtsplit {
124#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) 124#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot)
125 125
126/* get page buffer for specified block address */ 126/* get page buffer for specified block address */
127#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ 127#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
128{\ 128do { \
129 BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\ 129 BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \
130 if (!(RC))\ 130 if (!(RC)) { \
131 {\ 131 if (((P)->header.nextindex > \
132 if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\ 132 (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
133 ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\ 133 ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \
134 {\ 134 BT_PUTPAGE(MP); \
135 BT_PUTPAGE(MP);\ 135 jfs_error((IP)->i_sb, \
136 jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\ 136 "DT_GETPAGE: dtree page corrupt\n"); \
137 MP = NULL;\ 137 MP = NULL; \
138 RC = -EIO;\ 138 RC = -EIO; \
139 }\ 139 } \
140 }\ 140 } \
141} 141} while (0)
142 142
143/* for consistency */ 143/* for consistency */
144#define DT_PUTPAGE(MP) BT_PUTPAGE(MP) 144#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
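
The DT_GETPAGE rewrite above (and XT_GETPAGE further down) wraps the macro body in do { ... } while (0), the standard idiom for multi-statement macros: the expansion becomes a single statement that requires a trailing semicolon, so it composes correctly with if/else. A standalone illustration (LOG_GOOD/LOG_BAD are hypothetical macros, not from jfs):

#include <stdio.h>

/* Brace-only body: expands to a compound statement, so writing
 * "if (x) LOG_BAD(...); else ..." leaves a stray ';' and the
 * else no longer parses. */
#define LOG_BAD(msg) \
    { printf("bad:  %s\n", msg); }

/* do/while(0) body: expands to a single statement that requires
 * the trailing ';', so it nests safely inside if/else. */
#define LOG_GOOD(msg) \
    do { printf("good: %s\n", msg); } while (0)

int main(void)
{
    int err = 1;

    if (err)
        LOG_GOOD("error path");
    else
        printf("ok\n");
    /* The same if/else written with LOG_BAD would not compile. */
    return 0;
}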
@@ -776,7 +776,7 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
776 /* Something's corrupted, mark filesystem dirty so 776 /* Something's corrupted, mark filesystem dirty so
777 * chkdsk will fix it. 777 * chkdsk will fix it.
778 */ 778 */
779 jfs_error(sb, "stack overrun in dtSearch!"); 779 jfs_error(sb, "stack overrun!\n");
780 BT_STACK_DUMP(btstack); 780 BT_STACK_DUMP(btstack);
781 rc = -EIO; 781 rc = -EIO;
782 goto out; 782 goto out;
@@ -3002,9 +3002,9 @@ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent)
3002 * return: offset = (pn, index) of start entry 3002 * return: offset = (pn, index) of start entry
3003 * of next jfs_readdir()/dtRead() 3003 * of next jfs_readdir()/dtRead()
3004 */ 3004 */
3005int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 3005int jfs_readdir(struct file *file, struct dir_context *ctx)
3006{ 3006{
3007 struct inode *ip = file_inode(filp); 3007 struct inode *ip = file_inode(file);
3008 struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; 3008 struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab;
3009 int rc = 0; 3009 int rc = 0;
3010 loff_t dtpos; /* legacy OS/2 style position */ 3010 loff_t dtpos; /* legacy OS/2 style position */
@@ -3033,7 +3033,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3033 int overflow, fix_page, page_fixed = 0; 3033 int overflow, fix_page, page_fixed = 0;
3034 static int unique_pos = 2; /* If we can't fix broken index */ 3034 static int unique_pos = 2; /* If we can't fix broken index */
3035 3035
3036 if (filp->f_pos == DIREND) 3036 if (ctx->pos == DIREND)
3037 return 0; 3037 return 0;
3038 3038
3039 if (DO_INDEX(ip)) { 3039 if (DO_INDEX(ip)) {
@@ -3045,7 +3045,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3045 */ 3045 */
3046 do_index = 1; 3046 do_index = 1;
3047 3047
3048 dir_index = (u32) filp->f_pos; 3048 dir_index = (u32) ctx->pos;
3049 3049
3050 if (dir_index > 1) { 3050 if (dir_index > 1) {
3051 struct dir_table_slot dirtab_slot; 3051 struct dir_table_slot dirtab_slot;
@@ -3053,25 +3053,25 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3053 if (dtEmpty(ip) || 3053 if (dtEmpty(ip) ||
3054 (dir_index >= JFS_IP(ip)->next_index)) { 3054 (dir_index >= JFS_IP(ip)->next_index)) {
3055 /* Stale position. Directory has shrunk */ 3055 /* Stale position. Directory has shrunk */
3056 filp->f_pos = DIREND; 3056 ctx->pos = DIREND;
3057 return 0; 3057 return 0;
3058 } 3058 }
3059 repeat: 3059 repeat:
3060 rc = read_index(ip, dir_index, &dirtab_slot); 3060 rc = read_index(ip, dir_index, &dirtab_slot);
3061 if (rc) { 3061 if (rc) {
3062 filp->f_pos = DIREND; 3062 ctx->pos = DIREND;
3063 return rc; 3063 return rc;
3064 } 3064 }
3065 if (dirtab_slot.flag == DIR_INDEX_FREE) { 3065 if (dirtab_slot.flag == DIR_INDEX_FREE) {
3066 if (loop_count++ > JFS_IP(ip)->next_index) { 3066 if (loop_count++ > JFS_IP(ip)->next_index) {
3067 jfs_err("jfs_readdir detected " 3067 jfs_err("jfs_readdir detected "
3068 "infinite loop!"); 3068 "infinite loop!");
3069 filp->f_pos = DIREND; 3069 ctx->pos = DIREND;
3070 return 0; 3070 return 0;
3071 } 3071 }
3072 dir_index = le32_to_cpu(dirtab_slot.addr2); 3072 dir_index = le32_to_cpu(dirtab_slot.addr2);
3073 if (dir_index == -1) { 3073 if (dir_index == -1) {
3074 filp->f_pos = DIREND; 3074 ctx->pos = DIREND;
3075 return 0; 3075 return 0;
3076 } 3076 }
3077 goto repeat; 3077 goto repeat;
@@ -3080,13 +3080,13 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3080 index = dirtab_slot.slot; 3080 index = dirtab_slot.slot;
3081 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); 3081 DT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
3082 if (rc) { 3082 if (rc) {
3083 filp->f_pos = DIREND; 3083 ctx->pos = DIREND;
3084 return 0; 3084 return 0;
3085 } 3085 }
3086 if (p->header.flag & BT_INTERNAL) { 3086 if (p->header.flag & BT_INTERNAL) {
3087 jfs_err("jfs_readdir: bad index table"); 3087 jfs_err("jfs_readdir: bad index table");
3088 DT_PUTPAGE(mp); 3088 DT_PUTPAGE(mp);
3089 filp->f_pos = -1; 3089 ctx->pos = -1;
3090 return 0; 3090 return 0;
3091 } 3091 }
3092 } else { 3092 } else {
@@ -3094,23 +3094,22 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3094 /* 3094 /*
3095 * self "." 3095 * self "."
3096 */ 3096 */
3097 filp->f_pos = 0; 3097 ctx->pos = 0;
3098 if (filldir(dirent, ".", 1, 0, ip->i_ino, 3098 if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
3099 DT_DIR))
3100 return 0; 3099 return 0;
3101 } 3100 }
3102 /* 3101 /*
3103 * parent ".." 3102 * parent ".."
3104 */ 3103 */
3105 filp->f_pos = 1; 3104 ctx->pos = 1;
3106 if (filldir(dirent, "..", 2, 1, PARENT(ip), DT_DIR)) 3105 if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
3107 return 0; 3106 return 0;
3108 3107
3109 /* 3108 /*
3110 * Find first entry of left-most leaf 3109 * Find first entry of left-most leaf
3111 */ 3110 */
3112 if (dtEmpty(ip)) { 3111 if (dtEmpty(ip)) {
3113 filp->f_pos = DIREND; 3112 ctx->pos = DIREND;
3114 return 0; 3113 return 0;
3115 } 3114 }
3116 3115
@@ -3128,23 +3127,19 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3128 * pn > 0: Real entries, pn=1 -> leftmost page 3127 * pn > 0: Real entries, pn=1 -> leftmost page
3129 * pn = index = -1: No more entries 3128 * pn = index = -1: No more entries
3130 */ 3129 */
3131 dtpos = filp->f_pos; 3130 dtpos = ctx->pos;
3132 if (dtpos == 0) { 3131 if (dtpos == 0) {
3133 /* build "." entry */ 3132 /* build "." entry */
3134 3133 if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR))
3135 if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino,
3136 DT_DIR))
3137 return 0; 3134 return 0;
3138 dtoffset->index = 1; 3135 dtoffset->index = 1;
3139 filp->f_pos = dtpos; 3136 ctx->pos = dtpos;
3140 } 3137 }
3141 3138
3142 if (dtoffset->pn == 0) { 3139 if (dtoffset->pn == 0) {
3143 if (dtoffset->index == 1) { 3140 if (dtoffset->index == 1) {
3144 /* build ".." entry */ 3141 /* build ".." entry */
3145 3142 if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR))
3146 if (filldir(dirent, "..", 2, filp->f_pos,
3147 PARENT(ip), DT_DIR))
3148 return 0; 3143 return 0;
3149 } else { 3144 } else {
3150 jfs_err("jfs_readdir called with " 3145 jfs_err("jfs_readdir called with "
@@ -3152,18 +3147,18 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3152 } 3147 }
3153 dtoffset->pn = 1; 3148 dtoffset->pn = 1;
3154 dtoffset->index = 0; 3149 dtoffset->index = 0;
3155 filp->f_pos = dtpos; 3150 ctx->pos = dtpos;
3156 } 3151 }
3157 3152
3158 if (dtEmpty(ip)) { 3153 if (dtEmpty(ip)) {
3159 filp->f_pos = DIREND; 3154 ctx->pos = DIREND;
3160 return 0; 3155 return 0;
3161 } 3156 }
3162 3157
3163 if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) { 3158 if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) {
3164 jfs_err("jfs_readdir: unexpected rc = %d " 3159 jfs_err("jfs_readdir: unexpected rc = %d "
3165 "from dtReadNext", rc); 3160 "from dtReadNext", rc);
3166 filp->f_pos = DIREND; 3161 ctx->pos = DIREND;
3167 return 0; 3162 return 0;
3168 } 3163 }
3169 /* get start leaf page and index */ 3164 /* get start leaf page and index */
@@ -3171,7 +3166,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3171 3166
3172 /* offset beyond directory eof ? */ 3167 /* offset beyond directory eof ? */
3173 if (bn < 0) { 3168 if (bn < 0) {
3174 filp->f_pos = DIREND; 3169 ctx->pos = DIREND;
3175 return 0; 3170 return 0;
3176 } 3171 }
3177 } 3172 }
@@ -3180,7 +3175,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3180 if (dirent_buf == 0) { 3175 if (dirent_buf == 0) {
3181 DT_PUTPAGE(mp); 3176 DT_PUTPAGE(mp);
3182 jfs_warn("jfs_readdir: __get_free_page failed!"); 3177 jfs_warn("jfs_readdir: __get_free_page failed!");
3183 filp->f_pos = DIREND; 3178 ctx->pos = DIREND;
3184 return -ENOMEM; 3179 return -ENOMEM;
3185 } 3180 }
3186 3181
@@ -3252,8 +3247,7 @@ int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
3252 /* Sanity Check */ 3247 /* Sanity Check */
3253 if (d_namleft == 0) { 3248 if (d_namleft == 0) {
3254 jfs_error(ip->i_sb, 3249 jfs_error(ip->i_sb,
3255 "JFS:Dtree error: ino = " 3250 "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n",
3256 "%ld, bn=%Ld, index = %d",
3257 (long)ip->i_ino, 3251 (long)ip->i_ino,
3258 (long long)bn, 3252 (long long)bn,
3259 i); 3253 i);
@@ -3295,9 +3289,9 @@ skip_one:
3295 3289
3296 jfs_dirent = (struct jfs_dirent *) dirent_buf; 3290 jfs_dirent = (struct jfs_dirent *) dirent_buf;
3297 while (jfs_dirents--) { 3291 while (jfs_dirents--) {
3298 filp->f_pos = jfs_dirent->position; 3292 ctx->pos = jfs_dirent->position;
3299 if (filldir(dirent, jfs_dirent->name, 3293 if (!dir_emit(ctx, jfs_dirent->name,
3300 jfs_dirent->name_len, filp->f_pos, 3294 jfs_dirent->name_len,
3301 jfs_dirent->ino, DT_UNKNOWN)) 3295 jfs_dirent->ino, DT_UNKNOWN))
3302 goto out; 3296 goto out;
3303 jfs_dirent = next_jfs_dirent(jfs_dirent); 3297 jfs_dirent = next_jfs_dirent(jfs_dirent);
@@ -3309,7 +3303,7 @@ skip_one:
3309 } 3303 }
3310 3304
3311 if (!overflow && (bn == 0)) { 3305 if (!overflow && (bn == 0)) {
3312 filp->f_pos = DIREND; 3306 ctx->pos = DIREND;
3313 break; 3307 break;
3314 } 3308 }
3315 3309
@@ -3373,7 +3367,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack)
3373 */ 3367 */
3374 if (BT_STACK_FULL(btstack)) { 3368 if (BT_STACK_FULL(btstack)) {
3375 DT_PUTPAGE(mp); 3369 DT_PUTPAGE(mp);
3376 jfs_error(ip->i_sb, "dtReadFirst: btstack overrun"); 3370 jfs_error(ip->i_sb, "btstack overrun\n");
3377 BT_STACK_DUMP(btstack); 3371 BT_STACK_DUMP(btstack);
3378 return -EIO; 3372 return -EIO;
3379 } 3373 }
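
The reworded messages above sit on an unchanged guard: dtSearch() and dtReadFirst() check BT_STACK_FULL before pushing a parent frame and fail with -EIO rather than overrun the fixed-size btstack, treating an over-deep tree as corruption. A standalone sketch of that bounded-stack pattern (sizes, names and the error value are stand-ins):

#include <stdio.h>

#define STACK_MAX 8   /* stand-in for the kernel's fixed stack depth */

struct btframe { long bn; int index; };
struct btstack {
    struct btframe frames[STACK_MAX];
    int top;
};

static int bt_stack_full(const struct btstack *s)
{
    return s->top >= STACK_MAX;
}

/* Push a parent frame during descent; a full stack means the tree
 * is deeper than any valid one, so treat it as corruption. */
static int bt_push(struct btstack *s, long bn, int index)
{
    if (bt_stack_full(s)) {
        fprintf(stderr, "btstack overrun\n");
        return -5;                     /* stand-in for -EIO */
    }
    s->frames[s->top].bn = bn;
    s->frames[s->top].index = index;
    s->top++;
    return 0;
}

int main(void)
{
    struct btstack s = { .top = 0 };
    long bn;

    for (bn = 0; bn < 10; bn++)        /* pretend 10-level descent */
        if (bt_push(&s, bn, 0))
            break;                     /* abort the walk on overrun */
    printf("pushed %d frames\n", s.top);
    return 0;
}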
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 2545bb317235..fd4169e6e698 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -265,5 +265,5 @@ extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key,
265extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key, 265extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key,
266 ino_t * orig_ino, ino_t new_ino, int flag); 266 ino_t * orig_ino, ino_t new_ino, int flag);
267 267
268extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir); 268extern int jfs_readdir(struct file *file, struct dir_context *ctx);
269#endif /* !_H_JFS_DTREE */ 269#endif /* !_H_JFS_DTREE */
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index e5fe8506ed16..2ae7d59ab10a 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -388,7 +388,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
388 388
389 if ((rc == 0) && xlen) { 389 if ((rc == 0) && xlen) {
390 if (xlen != nbperpage) { 390 if (xlen != nbperpage) {
391 jfs_error(ip->i_sb, "extHint: corrupt xtree"); 391 jfs_error(ip->i_sb, "corrupt xtree\n");
392 rc = -EIO; 392 rc = -EIO;
393 } 393 }
394 XADaddress(xp, xaddr); 394 XADaddress(xp, xaddr);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f7e042b63ddb..f321986e73d2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -386,7 +386,7 @@ int diRead(struct inode *ip)
386 dp += rel_inode; 386 dp += rel_inode;
387 387
388 if (ip->i_ino != le32_to_cpu(dp->di_number)) { 388 if (ip->i_ino != le32_to_cpu(dp->di_number)) {
389 jfs_error(ip->i_sb, "diRead: i_ino != di_number"); 389 jfs_error(ip->i_sb, "i_ino != di_number\n");
390 rc = -EIO; 390 rc = -EIO;
391 } else if (le32_to_cpu(dp->di_nlink) == 0) 391 } else if (le32_to_cpu(dp->di_nlink) == 0)
392 rc = -ESTALE; 392 rc = -ESTALE;
@@ -625,7 +625,7 @@ int diWrite(tid_t tid, struct inode *ip)
625 if (!addressPXD(&(jfs_ip->ixpxd)) || 625 if (!addressPXD(&(jfs_ip->ixpxd)) ||
626 (lengthPXD(&(jfs_ip->ixpxd)) != 626 (lengthPXD(&(jfs_ip->ixpxd)) !=
627 JFS_IP(ipimap)->i_imap->im_nbperiext)) { 627 JFS_IP(ipimap)->i_imap->im_nbperiext)) {
628 jfs_error(ip->i_sb, "diWrite: ixpxd invalid"); 628 jfs_error(ip->i_sb, "ixpxd invalid\n");
629 return -EIO; 629 return -EIO;
630 } 630 }
631 631
@@ -893,8 +893,7 @@ int diFree(struct inode *ip)
893 if (iagno >= imap->im_nextiag) { 893 if (iagno >= imap->im_nextiag) {
894 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, 894 print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4,
895 imap, 32, 0); 895 imap, 32, 0);
896 jfs_error(ip->i_sb, 896 jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n",
897 "diFree: inum = %d, iagno = %d, nextiag = %d",
898 (uint) inum, iagno, imap->im_nextiag); 897 (uint) inum, iagno, imap->im_nextiag);
899 return -EIO; 898 return -EIO;
900 } 899 }
@@ -930,15 +929,14 @@ int diFree(struct inode *ip)
930 mask = HIGHORDER >> bitno; 929 mask = HIGHORDER >> bitno;
931 930
932 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 931 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
933 jfs_error(ip->i_sb, 932 jfs_error(ip->i_sb, "wmap shows inode already free\n");
934 "diFree: wmap shows inode already free");
935 } 933 }
936 934
937 if (!addressPXD(&iagp->inoext[extno])) { 935 if (!addressPXD(&iagp->inoext[extno])) {
938 release_metapage(mp); 936 release_metapage(mp);
939 IREAD_UNLOCK(ipimap); 937 IREAD_UNLOCK(ipimap);
940 AG_UNLOCK(imap, agno); 938 AG_UNLOCK(imap, agno);
941 jfs_error(ip->i_sb, "diFree: invalid inoext"); 939 jfs_error(ip->i_sb, "invalid inoext\n");
942 return -EIO; 940 return -EIO;
943 } 941 }
944 942
@@ -950,7 +948,7 @@ int diFree(struct inode *ip)
950 release_metapage(mp); 948 release_metapage(mp);
951 IREAD_UNLOCK(ipimap); 949 IREAD_UNLOCK(ipimap);
952 AG_UNLOCK(imap, agno); 950 AG_UNLOCK(imap, agno);
953 jfs_error(ip->i_sb, "diFree: numfree > numinos"); 951 jfs_error(ip->i_sb, "numfree > numinos\n");
954 return -EIO; 952 return -EIO;
955 } 953 }
956 /* 954 /*
@@ -1199,7 +1197,7 @@ int diFree(struct inode *ip)
1199 * for the inode being freed. 1197 * for the inode being freed.
1200 */ 1198 */
1201 if (iagp->pmap[extno] != 0) { 1199 if (iagp->pmap[extno] != 0) {
1202 jfs_error(ip->i_sb, "diFree: the pmap does not show inode free"); 1200 jfs_error(ip->i_sb, "the pmap does not show inode free\n");
1203 } 1201 }
1204 iagp->wmap[extno] = 0; 1202 iagp->wmap[extno] = 0;
1205 PXDlength(&iagp->inoext[extno], 0); 1203 PXDlength(&iagp->inoext[extno], 0);
@@ -1518,8 +1516,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
1518 release_metapage(mp); 1516 release_metapage(mp);
1519 AG_UNLOCK(imap, agno); 1517 AG_UNLOCK(imap, agno);
1520 jfs_error(ip->i_sb, 1518 jfs_error(ip->i_sb,
1521 "diAlloc: can't find free bit " 1519 "can't find free bit in wmap\n");
1522 "in wmap");
1523 return -EIO; 1520 return -EIO;
1524 } 1521 }
1525 1522
@@ -1660,7 +1657,7 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1660 numinos = imap->im_agctl[agno].numinos; 1657 numinos = imap->im_agctl[agno].numinos;
1661 1658
1662 if (numfree > numinos) { 1659 if (numfree > numinos) {
1663 jfs_error(ip->i_sb, "diAllocAG: numfree > numinos"); 1660 jfs_error(ip->i_sb, "numfree > numinos\n");
1664 return -EIO; 1661 return -EIO;
1665 } 1662 }
1666 1663
@@ -1811,8 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1811 if (!iagp->nfreeinos) { 1808 if (!iagp->nfreeinos) {
1812 IREAD_UNLOCK(imap->im_ipimap); 1809 IREAD_UNLOCK(imap->im_ipimap);
1813 release_metapage(mp); 1810 release_metapage(mp);
1814 jfs_error(ip->i_sb, 1811 jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n");
1815 "diAllocIno: nfreeinos = 0, but iag on freelist");
1816 return -EIO; 1812 return -EIO;
1817 } 1813 }
1818 1814
@@ -1824,7 +1820,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1824 IREAD_UNLOCK(imap->im_ipimap); 1820 IREAD_UNLOCK(imap->im_ipimap);
1825 release_metapage(mp); 1821 release_metapage(mp);
1826 jfs_error(ip->i_sb, 1822 jfs_error(ip->i_sb,
1827 "diAllocIno: free inode not found in summary map"); 1823 "free inode not found in summary map\n");
1828 return -EIO; 1824 return -EIO;
1829 } 1825 }
1830 1826
@@ -1839,7 +1835,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1839 if (rem >= EXTSPERSUM) { 1835 if (rem >= EXTSPERSUM) {
1840 IREAD_UNLOCK(imap->im_ipimap); 1836 IREAD_UNLOCK(imap->im_ipimap);
1841 release_metapage(mp); 1837 release_metapage(mp);
1842 jfs_error(ip->i_sb, "diAllocIno: no free extent found"); 1838 jfs_error(ip->i_sb, "no free extent found\n");
1843 return -EIO; 1839 return -EIO;
1844 } 1840 }
1845 extno = (sword << L2EXTSPERSUM) + rem; 1841 extno = (sword << L2EXTSPERSUM) + rem;
@@ -1850,7 +1846,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
1850 if (rem >= INOSPEREXT) { 1846 if (rem >= INOSPEREXT) {
1851 IREAD_UNLOCK(imap->im_ipimap); 1847 IREAD_UNLOCK(imap->im_ipimap);
1852 release_metapage(mp); 1848 release_metapage(mp);
1853 jfs_error(ip->i_sb, "diAllocIno: free inode not found"); 1849 jfs_error(ip->i_sb, "free inode not found\n");
1854 return -EIO; 1850 return -EIO;
1855 } 1851 }
1856 1852
@@ -1936,7 +1932,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1936 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); 1932 IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
1937 if ((rc = diIAGRead(imap, iagno, &mp))) { 1933 if ((rc = diIAGRead(imap, iagno, &mp))) {
1938 IREAD_UNLOCK(imap->im_ipimap); 1934 IREAD_UNLOCK(imap->im_ipimap);
1939 jfs_error(ip->i_sb, "diAllocExt: error reading iag"); 1935 jfs_error(ip->i_sb, "error reading iag\n");
1940 return rc; 1936 return rc;
1941 } 1937 }
1942 iagp = (struct iag *) mp->data; 1938 iagp = (struct iag *) mp->data;
@@ -1948,8 +1944,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1948 if (sword >= SMAPSZ) { 1944 if (sword >= SMAPSZ) {
1949 release_metapage(mp); 1945 release_metapage(mp);
1950 IREAD_UNLOCK(imap->im_ipimap); 1946 IREAD_UNLOCK(imap->im_ipimap);
1951 jfs_error(ip->i_sb, 1947 jfs_error(ip->i_sb, "free ext summary map not found\n");
1952 "diAllocExt: free ext summary map not found");
1953 return -EIO; 1948 return -EIO;
1954 } 1949 }
1955 if (~iagp->extsmap[sword]) 1950 if (~iagp->extsmap[sword])
@@ -1962,7 +1957,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
1962 if (rem >= EXTSPERSUM) { 1957 if (rem >= EXTSPERSUM) {
1963 release_metapage(mp); 1958 release_metapage(mp);
1964 IREAD_UNLOCK(imap->im_ipimap); 1959 IREAD_UNLOCK(imap->im_ipimap);
1965 jfs_error(ip->i_sb, "diAllocExt: free extent not found"); 1960 jfs_error(ip->i_sb, "free extent not found\n");
1966 return -EIO; 1961 return -EIO;
1967 } 1962 }
1968 extno = (sword << L2EXTSPERSUM) + rem; 1963 extno = (sword << L2EXTSPERSUM) + rem;
@@ -2081,8 +2076,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2081 if (bmp) 2076 if (bmp)
2082 release_metapage(bmp); 2077 release_metapage(bmp);
2083 2078
2084 jfs_error(imap->im_ipimap->i_sb, 2079 jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n");
2085 "diAllocBit: iag inconsistent");
2086 return -EIO; 2080 return -EIO;
2087 } 2081 }
2088 2082
@@ -2189,7 +2183,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2189 /* better have free extents. 2183 /* better have free extents.
2190 */ 2184 */
2191 if (!iagp->nfreeexts) { 2185 if (!iagp->nfreeexts) {
2192 jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents"); 2186 jfs_error(imap->im_ipimap->i_sb, "no free extents\n");
2193 return -EIO; 2187 return -EIO;
2194 } 2188 }
2195 2189
@@ -2261,7 +2255,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2261 } 2255 }
2262 if (ciagp == NULL) { 2256 if (ciagp == NULL) {
2263 jfs_error(imap->im_ipimap->i_sb, 2257 jfs_error(imap->im_ipimap->i_sb,
2264 "diNewExt: ciagp == NULL"); 2258 "ciagp == NULL\n");
2265 rc = -EIO; 2259 rc = -EIO;
2266 goto error_out; 2260 goto error_out;
2267 } 2261 }
@@ -2498,7 +2492,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2498 IWRITE_UNLOCK(ipimap); 2492 IWRITE_UNLOCK(ipimap);
2499 IAGFREE_UNLOCK(imap); 2493 IAGFREE_UNLOCK(imap);
2500 jfs_error(imap->im_ipimap->i_sb, 2494 jfs_error(imap->im_ipimap->i_sb,
2501 "diNewIAG: ipimap->i_size is wrong"); 2495 "ipimap->i_size is wrong\n");
2502 return -EIO; 2496 return -EIO;
2503 } 2497 }
2504 2498
@@ -2758,8 +2752,7 @@ diUpdatePMap(struct inode *ipimap,
2758 iagno = INOTOIAG(inum); 2752 iagno = INOTOIAG(inum);
2759 /* make sure that the iag is contained within the map */ 2753 /* make sure that the iag is contained within the map */
2760 if (iagno >= imap->im_nextiag) { 2754 if (iagno >= imap->im_nextiag) {
2761 jfs_error(ipimap->i_sb, 2755 jfs_error(ipimap->i_sb, "the iag is outside the map\n");
2762 "diUpdatePMap: the iag is outside the map");
2763 return -EIO; 2756 return -EIO;
2764 } 2757 }
2765 /* read the iag */ 2758 /* read the iag */
@@ -2788,13 +2781,13 @@ diUpdatePMap(struct inode *ipimap,
2788 */ 2781 */
2789 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 2782 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2790 jfs_error(ipimap->i_sb, 2783 jfs_error(ipimap->i_sb,
2791 "diUpdatePMap: inode %ld not marked as " 2784 "inode %ld not marked as allocated in wmap!\n",
2792 "allocated in wmap!", inum); 2785 inum);
2793 } 2786 }
2794 if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { 2787 if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) {
2795 jfs_error(ipimap->i_sb, 2788 jfs_error(ipimap->i_sb,
2796 "diUpdatePMap: inode %ld not marked as " 2789 "inode %ld not marked as allocated in pmap!\n",
2797 "allocated in pmap!", inum); 2790 inum);
2798 } 2791 }
2799 /* update the bitmap for the extent of the freed inode */ 2792 /* update the bitmap for the extent of the freed inode */
2800 iagp->pmap[extno] &= cpu_to_le32(~mask); 2793 iagp->pmap[extno] &= cpu_to_le32(~mask);
@@ -2809,15 +2802,13 @@ diUpdatePMap(struct inode *ipimap,
2809 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { 2802 if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
2810 release_metapage(mp); 2803 release_metapage(mp);
2811 jfs_error(ipimap->i_sb, 2804 jfs_error(ipimap->i_sb,
2812 "diUpdatePMap: the inode is not allocated in " 2805 "the inode is not allocated in the working map\n");
2813 "the working map");
2814 return -EIO; 2806 return -EIO;
2815 } 2807 }
2816 if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { 2808 if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) {
2817 release_metapage(mp); 2809 release_metapage(mp);
2818 jfs_error(ipimap->i_sb, 2810 jfs_error(ipimap->i_sb,
2819 "diUpdatePMap: the inode is not free in the " 2811 "the inode is not free in the persistent map\n");
2820 "persistent map");
2821 return -EIO; 2812 return -EIO;
2822 } 2813 }
2823 /* update the bitmap for the extent of the allocated inode */ 2814 /* update the bitmap for the extent of the allocated inode */
@@ -2909,8 +2900,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2909 iagp = (struct iag *) bp->data; 2900 iagp = (struct iag *) bp->data;
2910 if (le32_to_cpu(iagp->iagnum) != i) { 2901 if (le32_to_cpu(iagp->iagnum) != i) {
2911 release_metapage(bp); 2902 release_metapage(bp);
2912 jfs_error(ipimap->i_sb, 2903 jfs_error(ipimap->i_sb, "unexpected value of iagnum\n");
2913 "diExtendFs: unexpected value of iagnum");
2914 return -EIO; 2904 return -EIO;
2915 } 2905 }
2916 2906
@@ -2986,8 +2976,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
2986 2976
2987 if (xnuminos != atomic_read(&imap->im_numinos) || 2977 if (xnuminos != atomic_read(&imap->im_numinos) ||
2988 xnumfree != atomic_read(&imap->im_numfree)) { 2978 xnumfree != atomic_read(&imap->im_numfree)) {
2989 jfs_error(ipimap->i_sb, 2979 jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n");
2990 "diExtendFs: numinos or numfree incorrect");
2991 return -EIO; 2980 return -EIO;
2992 } 2981 }
2993 2982
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 6740d34cd82b..d165cde0c68d 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -571,9 +571,10 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
571 return ret; 571 return ret;
572} 572}
573 573
574static void metapage_invalidatepage(struct page *page, unsigned long offset) 574static void metapage_invalidatepage(struct page *page, unsigned int offset,
575 unsigned int length)
575{ 576{
576 BUG_ON(offset); 577 BUG_ON(offset || length < PAGE_CACHE_SIZE);
577 578
578 BUG_ON(PageWriteback(page)); 579 BUG_ON(PageWriteback(page));
579 580
@@ -646,7 +647,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
646 if (mp) { 647 if (mp) {
647 if (mp->logical_size != size) { 648 if (mp->logical_size != size) {
648 jfs_error(inode->i_sb, 649 jfs_error(inode->i_sb,
649 "__get_metapage: mp->logical_size != size"); 650 "get_mp->logical_size != size\n");
650 jfs_err("logical_size = %d, size = %d", 651 jfs_err("logical_size = %d, size = %d",
651 mp->logical_size, size); 652 mp->logical_size, size);
652 dump_stack(); 653 dump_stack();
@@ -657,8 +658,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
657 if (test_bit(META_discard, &mp->flag)) { 658 if (test_bit(META_discard, &mp->flag)) {
658 if (!new) { 659 if (!new) {
659 jfs_error(inode->i_sb, 660 jfs_error(inode->i_sb,
660 "__get_metapage: using a " 661 "using a discarded metapage\n");
661 "discarded metapage");
662 discard_metapage(mp); 662 discard_metapage(mp);
663 goto unlock; 663 goto unlock;
664 } 664 }
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h
index 884fc21ab8ee..04847b8d3070 100644
--- a/fs/jfs/jfs_superblock.h
+++ b/fs/jfs/jfs_superblock.h
@@ -108,6 +108,7 @@ struct jfs_superblock {
108 108
109extern int readSuper(struct super_block *, struct buffer_head **); 109extern int readSuper(struct super_block *, struct buffer_head **);
110extern int updateSuper(struct super_block *, uint); 110extern int updateSuper(struct super_block *, uint);
111__printf(2, 3)
111extern void jfs_error(struct super_block *, const char *, ...); 112extern void jfs_error(struct super_block *, const char *, ...);
112extern int jfs_mount(struct super_block *); 113extern int jfs_mount(struct super_block *);
113extern int jfs_mount_rw(struct super_block *, int); 114extern int jfs_mount_rw(struct super_block *, int);
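
This one-line annotation is what makes the jfs_error() message cleanups above checkable: __printf(2, 3) is the kernel's shorthand for GCC/Clang's format(printf, ...) attribute, so the compiler now verifies each caller's format string against its arguments (catching mismatches like the bn=%Ld → bn=%lld fix in jfs_dtree.c). A standalone sketch of such a helper (jfs_error_demo and its message prefix are illustrative, not the kernel implementation):

#include <stdarg.h>
#include <stdio.h>

/* Same definition the kernel uses for __printf() */
#define __printf(a, b) __attribute__((format(printf, a, b)))

struct super_block { const char *s_id; };

/* printf-style error reporter: the attribute makes the compiler
 * type-check every caller's format string against its arguments. */
__printf(2, 3)
static void jfs_error_demo(struct super_block *sb, const char *fmt, ...)
{
    va_list args;

    va_start(args, fmt);
    fprintf(stderr, "ERROR: (device %s): ", sb->s_id);
    vfprintf(stderr, fmt, args);
    va_end(args);
}

int main(void)
{
    struct super_block sb = { "sda1" };

    jfs_error_demo(&sb, "inum = %d, iagno = %d\n", 42, 7);
    /* jfs_error_demo(&sb, "inum = %d\n", "oops");  <- now warns */
    return 0;
}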
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 5fcc02eaa64c..564c4f279ac6 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2684,7 +2684,7 @@ void txAbort(tid_t tid, int dirty)
2684 * mark filesystem dirty 2684 * mark filesystem dirty
2685 */ 2685 */
2686 if (dirty) 2686 if (dirty)
2687 jfs_error(tblk->sb, "txAbort"); 2687 jfs_error(tblk->sb, "\n");
2688 2688
2689 return; 2689 return;
2690} 2690}
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 6c50871e6220..5ad7748860ce 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -64,22 +64,23 @@
64 64
65/* get page buffer for specified block address */ 65/* get page buffer for specified block address */
66/* ToDo: Replace this ugly macro with a function */ 66/* ToDo: Replace this ugly macro with a function */
67#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ 67#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
68{\ 68do { \
69 BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ 69 BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \
70 if (!(RC))\ 70 if (!(RC)) { \
71 {\ 71 if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \
72 if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ 72 (le16_to_cpu((P)->header.nextindex) > \
73 (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ 73 le16_to_cpu((P)->header.maxentry)) || \
74 (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ 74 (le16_to_cpu((P)->header.maxentry) > \
75 {\ 75 (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \
76 jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\ 76 jfs_error((IP)->i_sb, \
77 BT_PUTPAGE(MP);\ 77 "XT_GETPAGE: xtree page corrupt\n"); \
78 MP = NULL;\ 78 BT_PUTPAGE(MP); \
79 RC = -EIO;\ 79 MP = NULL; \
80 }\ 80 RC = -EIO; \
81 }\ 81 } \
82} 82 } \
83} while (0)
83 84
84/* for consistency */ 85/* for consistency */
85#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) 86#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -499,7 +500,7 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
 
 		/* push (bn, index) of the parent page/entry */
 		if (BT_STACK_FULL(btstack)) {
-			jfs_error(ip->i_sb, "stack overrun in xtSearch!");
+			jfs_error(ip->i_sb, "stack overrun!\n");
 			XT_PUTPAGE(mp);
 			return -EIO;
 		}
@@ -1385,7 +1386,7 @@ int xtExtend(tid_t tid, /* transaction id */
 
 	if (cmp != 0) {
 		XT_PUTPAGE(mp);
-		jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent");
+		jfs_error(ip->i_sb, "xtSearch did not find extent\n");
 		return -EIO;
 	}
 
@@ -1393,7 +1394,7 @@ int xtExtend(tid_t tid, /* transaction id */
 	xad = &p->xad[index];
 	if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) {
 		XT_PUTPAGE(mp);
-		jfs_error(ip->i_sb, "xtExtend: extension is not contiguous");
+		jfs_error(ip->i_sb, "extension is not contiguous\n");
 		return -EIO;
 	}
 
@@ -1552,7 +1553,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
 
 	if (cmp != 0) {
 		XT_PUTPAGE(mp);
-		jfs_error(ip->i_sb, "xtTailgate: couldn't find extent");
+		jfs_error(ip->i_sb, "couldn't find extent\n");
 		return -EIO;
 	}
 
@@ -1560,8 +1561,7 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n",
 	nextindex = le16_to_cpu(p->header.nextindex);
 	if (index != nextindex - 1) {
 		XT_PUTPAGE(mp);
-		jfs_error(ip->i_sb,
-			  "xtTailgate: the entry found is not the last entry");
+		jfs_error(ip->i_sb, "the entry found is not the last entry\n");
 		return -EIO;
 	}
 
@@ -1734,7 +1734,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
 
 	if (cmp != 0) {
 		XT_PUTPAGE(mp);
-		jfs_error(ip->i_sb, "xtUpdate: Could not find extent");
+		jfs_error(ip->i_sb, "Could not find extent\n");
 		return -EIO;
 	}
 
@@ -1758,7 +1758,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
 	    (nxoff + nxlen > xoff + xlen)) {
 		XT_PUTPAGE(mp);
 		jfs_error(ip->i_sb,
-			  "xtUpdate: nXAD in not completely contained within XAD");
+			  "nXAD in not completely contained within XAD\n");
 		return -EIO;
 	}
 
@@ -1907,7 +1907,7 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
 
 	if (xoff >= nxoff) {
 		XT_PUTPAGE(mp);
-		jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff");
+		jfs_error(ip->i_sb, "xoff >= nxoff\n");
 		return -EIO;
 	}
 /* #endif _JFS_WIP_COALESCE */
@@ -2048,14 +2048,13 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
 
 		if (cmp != 0) {
 			XT_PUTPAGE(mp);
-			jfs_error(ip->i_sb, "xtUpdate: xtSearch failed");
+			jfs_error(ip->i_sb, "xtSearch failed\n");
 			return -EIO;
 		}
 
 		if (index0 != index) {
 			XT_PUTPAGE(mp);
-			jfs_error(ip->i_sb,
-				  "xtUpdate: unexpected value of index");
+			jfs_error(ip->i_sb, "unexpected value of index\n");
 			return -EIO;
 		}
 	}
@@ -3650,7 +3649,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
       getChild:
 	/* save current parent entry for the child page */
 	if (BT_STACK_FULL(&btstack)) {
-		jfs_error(ip->i_sb, "stack overrun in xtTruncate!");
+		jfs_error(ip->i_sb, "stack overrun!\n");
 		XT_PUTPAGE(mp);
 		return -EIO;
 	}
@@ -3751,8 +3750,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
 
 		if (cmp != 0) {
 			XT_PUTPAGE(mp);
-			jfs_error(ip->i_sb,
-				  "xtTruncate_pmap: did not find extent");
+			jfs_error(ip->i_sb, "did not find extent\n");
 			return -EIO;
 		}
 	} else {
@@ -3851,7 +3849,7 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
      getChild:
 	/* save current parent entry for the child page */
 	if (BT_STACK_FULL(&btstack)) {
-		jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!");
+		jfs_error(ip->i_sb, "stack overrun!\n");
 		XT_PUTPAGE(mp);
 		return -EIO;
 	}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 3b91a7ad6086..aa8a3370631b 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1176,7 +1176,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (!S_ISDIR(old_ip->i_mode) && new_ip)
 			IWRITE_UNLOCK(new_ip);
 		jfs_error(new_ip->i_sb,
-			  "jfs_rename: new_ip->i_nlink != 0");
+			  "new_ip->i_nlink != 0\n");
 		return -EIO;
 	}
 	tblk = tid_to_tblock(tid);
@@ -1529,7 +1529,7 @@ const struct inode_operations jfs_dir_inode_operations = {
 
 const struct file_operations jfs_dir_operations = {
 	.read		= generic_read_dir,
-	.readdir	= jfs_readdir,
+	.iterate	= jfs_readdir,
 	.fsync		= jfs_fsync,
 	.unlocked_ioctl = jfs_ioctl,
 #ifdef CONFIG_COMPAT
@@ -1538,8 +1538,7 @@ const struct file_operations jfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 };
 
-static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
-		struct qstr *this)
+static int jfs_ci_hash(const struct dentry *dir, struct qstr *this)
 {
 	unsigned long hash;
 	int i;
@@ -1552,9 +1551,7 @@ static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
 	return 0;
 }
 
-static int jfs_ci_compare(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	int i, result = 1;
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 8d0c1c7c0820..90b3bc21e9b0 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -530,7 +530,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
 		goto resume;
 
       error_out:
-	jfs_error(sb, "jfs_extendfs");
+	jfs_error(sb, "\n");
 
       resume:
 	/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 788e0a9c1fb0..6669aa2042c3 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -92,16 +92,20 @@ static void jfs_handle_error(struct super_block *sb)
 	/* nothing is done for continue beyond marking the superblock dirty */
 }
 
-void jfs_error(struct super_block *sb, const char * function, ...)
+void jfs_error(struct super_block *sb, const char *fmt, ...)
 {
-	static char error_buf[256];
+	struct va_format vaf;
 	va_list args;
 
-	va_start(args, function);
-	vsnprintf(error_buf, sizeof(error_buf), function, args);
-	va_end(args);
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
 
-	pr_err("ERROR: (device %s): %s\n", sb->s_id, error_buf);
+	pr_err("ERROR: (device %s): %pf: %pV\n",
+	       sb->s_id, __builtin_return_address(0), &vaf);
+
+	va_end(args);
 
 	jfs_handle_error(sb);
 }
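
The rework above centralizes message formatting: jfs_error() now wraps its varargs in a struct va_format and defers expansion to printk's %pV, while %pf with __builtin_return_address(0) names the calling function, which is why every call site in this series can drop its hand-written "function:" prefix (and a bare "\n" becomes a valid format). A rough user-space analogue of the forwarding, with vfprintf standing in for %pV:

    #include <stdarg.h>
    #include <stdio.h>

    /* Sketch: one central reporter receives the caller's format string
     * and argument list, adds the common prefix, and prints once. */
    static void report_error(const char *dev, const char *fmt, ...)
    {
        va_list args;

        va_start(args, fmt);
        fprintf(stderr, "ERROR: (device %s): ", dev);
        vfprintf(stderr, fmt, args);   /* user-space stand-in for %pV */
        va_end(args);
    }

    int main(void)
    {
        report_error("sda1", "xtSearch did not find extent\n");
        return 0;
    }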
@@ -617,7 +621,7 @@ static int jfs_freeze(struct super_block *sb)
 	txQuiesce(sb);
 	rc = lmLogShutdown(log);
 	if (rc) {
-		jfs_error(sb, "jfs_freeze: lmLogShutdown failed");
+		jfs_error(sb, "lmLogShutdown failed\n");
 
 		/* let operations fail rather than hang */
 		txResume(sb);
@@ -646,12 +650,12 @@ static int jfs_unfreeze(struct super_block *sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		rc = updateSuper(sb, FM_MOUNT);
 		if (rc) {
-			jfs_error(sb, "jfs_unfreeze: updateSuper failed");
+			jfs_error(sb, "updateSuper failed\n");
 			goto out;
 		}
 		rc = lmLogInit(log);
 		if (rc)
-			jfs_error(sb, "jfs_unfreeze: lmLogInit failed");
+			jfs_error(sb, "lmLogInit failed\n");
 out:
 		txResume(sb);
 	}
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 42d67f9757bf..d3472f4cd530 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -382,7 +382,7 @@ static int ea_read(struct inode *ip, struct jfs_ea_list *ealist)
 
 	nbytes = sizeDXD(&ji->ea);
 	if (!nbytes) {
-		jfs_error(sb, "ea_read: nbytes is 0");
+		jfs_error(sb, "nbytes is 0\n");
 		return -EIO;
 	}
 
@@ -482,7 +482,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
 		current_blocks = 0;
 	} else {
 		if (!(ji->ea.flag & DXD_EXTENT)) {
-			jfs_error(sb, "ea_get: invalid ea.flag)");
+			jfs_error(sb, "invalid ea.flag\n");
 			return -EIO;
 		}
 		current_blocks = (ea_size + sb->s_blocksize - 1) >>
@@ -1089,8 +1089,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
 }
 
 #ifdef CONFIG_JFS_SECURITY
-int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
-		   void *fs_info)
+static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+			  void *fs_info)
 {
 	const struct xattr *xattr;
 	tid_t *tid = fs_info;
diff --git a/fs/libfs.c b/fs/libfs.c
index 916da8c4158b..3a3a9b53bf5a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -61,7 +61,8 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned
 
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
-	d_set_d_op(dentry, &simple_dentry_operations);
+	if (!dentry->d_sb->s_d_op)
+		d_set_d_op(dentry, &simple_dentry_operations);
 	d_add(dentry, NULL);
 	return NULL;
 }
@@ -135,60 +136,40 @@ static inline unsigned char dt_type(struct inode *inode)
  * both impossible due to the lock on directory.
  */
 
-int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
+int dcache_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
-	struct dentry *cursor = filp->private_data;
+	struct dentry *dentry = file->f_path.dentry;
+	struct dentry *cursor = file->private_data;
 	struct list_head *p, *q = &cursor->d_u.d_child;
-	ino_t ino;
-	int i = filp->f_pos;
 
-	switch (i) {
-		case 0:
-			ino = dentry->d_inode->i_ino;
-			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-				break;
-			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		default:
-			spin_lock(&dentry->d_lock);
-			if (filp->f_pos == 2)
-				list_move(q, &dentry->d_subdirs);
-
-			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
-				struct dentry *next;
-				next = list_entry(p, struct dentry, d_u.d_child);
-				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
-				if (!simple_positive(next)) {
-					spin_unlock(&next->d_lock);
-					continue;
-				}
-
-				spin_unlock(&next->d_lock);
-				spin_unlock(&dentry->d_lock);
-				if (filldir(dirent, next->d_name.name,
-					    next->d_name.len, filp->f_pos,
-					    next->d_inode->i_ino,
-					    dt_type(next->d_inode)) < 0)
-					return 0;
-				spin_lock(&dentry->d_lock);
-				spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
-				/* next is still alive */
-				list_move(q, p);
-				spin_unlock(&next->d_lock);
-				p = q;
-				filp->f_pos++;
-			}
-			spin_unlock(&dentry->d_lock);
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	spin_lock(&dentry->d_lock);
+	if (ctx->pos == 2)
+		list_move(q, &dentry->d_subdirs);
+
+	for (p = q->next; p != &dentry->d_subdirs; p = p->next) {
+		struct dentry *next = list_entry(p, struct dentry, d_u.d_child);
+		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+		if (!simple_positive(next)) {
+			spin_unlock(&next->d_lock);
+			continue;
+		}
+
+		spin_unlock(&next->d_lock);
+		spin_unlock(&dentry->d_lock);
+		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
+			      next->d_inode->i_ino, dt_type(next->d_inode)))
+			return 0;
+		spin_lock(&dentry->d_lock);
+		spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED);
+		/* next is still alive */
+		list_move(q, p);
+		spin_unlock(&next->d_lock);
+		p = q;
+		ctx->pos++;
+	}
+	spin_unlock(&dentry->d_lock);
 	return 0;
 }
 
@@ -202,7 +183,7 @@ const struct file_operations simple_dir_operations = {
 	.release	= dcache_dir_close,
 	.llseek		= dcache_dir_lseek,
 	.read		= generic_read_dir,
-	.readdir	= dcache_readdir,
+	.iterate	= dcache_readdir,
 	.fsync		= noop_fsync,
 };
 
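
dcache_readdir() above is one instance of the tree-wide conversion in this merge from the filldir callback to ->iterate with a struct dir_context: dir_emit_dots() takes care of "." and "..", dir_emit() returns false once the caller's buffer is full, and ctx->pos replaces hand-rolled file->f_pos bookkeeping. The skeleton the converted functions share looks roughly like this; first_entry() and next_entry() are hypothetical stand-ins for per-filesystem lookup code, so this is a sketch of the API shape, not a real filesystem:

    /* Sketch of an ->iterate handler in the style used by this series. */
    static int example_readdir(struct file *file, struct dir_context *ctx)
    {
        struct example_dirent *de;

        if (!dir_emit_dots(file, ctx))   /* emits "." and ".." as needed */
            return 0;

        for (de = first_entry(file, ctx->pos); de; de = next_entry(de)) {
            if (!dir_emit(ctx, de->name, de->namelen, de->ino, de->type))
                return 0;   /* buffer full; getdents() resumes at ctx->pos */
            ctx->pos++;     /* advance only after a successful emit */
        }
        return 0;
    }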
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 01bfe7662751..41e491b8e5d7 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -64,12 +64,17 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
 				     nlm_init->protocol, nlm_version,
 				     nlm_init->hostname, nlm_init->noresvport,
 				     nlm_init->net);
-	if (host == NULL) {
-		lockd_down(nlm_init->net);
-		return ERR_PTR(-ENOLCK);
-	}
+	if (host == NULL)
+		goto out_nohost;
+	if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL)
+		goto out_nobind;
 
 	return host;
+out_nobind:
+	nlmclnt_release_host(host);
+out_nohost:
+	lockd_down(nlm_init->net);
+	return ERR_PTR(-ENOLCK);
 }
 EXPORT_SYMBOL_GPL(nlmclnt_init);
 
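
The nlmclnt_init() change above adopts the kernel's usual goto-unwind shape: each failure jumps to a label that releases exactly what was acquired before the failure point, so there is one teardown path instead of duplicated cleanup. The same pattern in a freestanding sketch (the resources here are hypothetical):

    #include <stdlib.h>

    /* Sketch: later failures unwind earlier acquisitions in reverse
     * order, mirroring the out_nobind/out_nohost labels above. */
    static int setup(void)
    {
        char *a, *b;

        a = malloc(16);
        if (!a)
            goto out;
        b = malloc(32);
        if (!b)
            goto out_free_a;

        /* ... use a and b ... */
        free(b);
        free(a);
        return 0;

    out_free_a:
        free(a);
    out:
        return -1;
    }

    int main(void) { return setup() ? 1 : 0; }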
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 9760ecb9b60f..acd394716349 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -125,14 +125,15 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
 {
 	struct nlm_args	*argp = &req->a_args;
 	struct nlm_lock	*lock = &argp->lock;
+	char *nodename = req->a_host->h_rpcclnt->cl_nodename;
 
 	nlmclnt_next_cookie(&argp->cookie);
 	memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh));
-	lock->caller  = utsname()->nodename;
+	lock->caller  = nodename;
 	lock->oh.data = req->a_owner;
 	lock->oh.len  = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
 				(unsigned int)fl->fl_u.nfs_fl.owner->pid,
-				utsname()->nodename);
+				nodename);
 	lock->svid = fl->fl_u.nfs_fl.owner->pid;
 	lock->fl.fl_start = fl->fl_start;
 	lock->fl.fl_end = fl->fl_end;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index a2aa97d45670..10d6c41aecad 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -305,7 +305,7 @@ static int lockd_start_svc(struct svc_serv *serv)
 	svc_sock_update_bufs(serv);
 	serv->sv_maxconn = nlm_max_connections;
 
-	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
+	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name);
 	if (IS_ERR(nlmsvc_task)) {
 		error = PTR_ERR(nlmsvc_task);
 		printk(KERN_WARNING
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e703318c41df..e066a3902973 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -276,7 +276,7 @@ static int nlmsvc_unlink_block(struct nlm_block *block)
 	dprintk("lockd: unlinking block %p...\n", block);
 
 	/* Remove block from list */
-	status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl);
+	status = posix_unblock_lock(&block->b_call->a_args.lock.fl);
 	nlmsvc_remove_block(block);
 	return status;
 }
@@ -744,8 +744,20 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 	return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
 }
 
+/*
+ * Since NLM uses two "keys" for tracking locks, we need to hash them down
+ * to one for the blocked_hash. Here, we're just xor'ing the host address
+ * with the pid in order to create a key value for picking a hash bucket.
+ */
+static unsigned long
+nlmsvc_owner_key(struct file_lock *fl)
+{
+	return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid;
+}
+
 const struct lock_manager_operations nlmsvc_lock_operations = {
 	.lm_compare_owner = nlmsvc_same_owner,
+	.lm_owner_key = nlmsvc_owner_key,
 	.lm_notify = nlmsvc_notify_blocked,
 	.lm_grant = nlmsvc_grant_deferred,
 };
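
lm_owner_key, added here, lets each lock manager collapse its notion of lock ownership into one unsigned long so the new blocked_hash in fs/locks.c can pick a bucket; NLM xors the owner pointer with the pid because the pair, not either field alone, identifies an owner. A toy demonstration of why both fields must feed the key (types simplified; 128 matches the 2^7 buckets defined later in this series):

    #include <stdio.h>

    static unsigned long owner_key(const void *owner, unsigned long pid)
    {
        return (unsigned long)owner ^ pid;  /* as nlmsvc_owner_key() does */
    }

    int main(void)
    {
        int owner;   /* stand-in for fl->fl_owner */

        /* same owner pointer, different pids: distinct keys/buckets */
        printf("pid 100 -> bucket %lu\n", owner_key(&owner, 100) % 128);
        printf("pid 101 -> bucket %lu\n", owner_key(&owner, 101) % 128);
        return 0;
    }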
@@ -939,6 +951,7 @@ nlmsvc_retry_blocked(void)
 	unsigned long	timeout = MAX_SCHEDULE_TIMEOUT;
 	struct nlm_block *block;
 
+	spin_lock(&nlm_blocked_lock);
 	while (!list_empty(&nlm_blocked) && !kthread_should_stop()) {
 		block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
 
@@ -948,6 +961,7 @@ nlmsvc_retry_blocked(void)
 			timeout = block->b_when - jiffies;
 			break;
 		}
+		spin_unlock(&nlm_blocked_lock);
 
 		dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
 			block, block->b_when);
@@ -957,7 +971,9 @@ nlmsvc_retry_blocked(void)
 			retry_deferred_block(block);
 		} else
 			nlmsvc_grant_blocked(block);
+		spin_lock(&nlm_blocked_lock);
 	}
+	spin_unlock(&nlm_blocked_lock);
 
 	return timeout;
 }
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 97e87415b145..dc5c75930f0f 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -169,7 +169,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
 
 again:
 	file->f_locks = 0;
-	lock_flocks(); /* protects i_flock list */
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops != &nlmsvc_lock_operations)
 			continue;
@@ -181,7 +181,7 @@ again:
 		if (match(lockhost, host)) {
 			struct file_lock lock = *fl;
 
-			unlock_flocks();
+			spin_unlock(&inode->i_lock);
 			lock.fl_type  = F_UNLCK;
 			lock.fl_start = 0;
 			lock.fl_end   = OFFSET_MAX;
@@ -193,7 +193,7 @@ again:
 			goto again;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	return 0;
 }
@@ -228,14 +228,14 @@ nlm_file_inuse(struct nlm_file *file)
 	if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
 		return 1;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops == &nlmsvc_lock_operations) {
-			unlock_flocks();
+			spin_unlock(&inode->i_lock);
 			return 1;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	file->f_locks = 0;
 	return 0;
 }
diff --git a/fs/locks.c b/fs/locks.c
index cb424a4fed71..b27a3005d78d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -126,6 +126,9 @@
 #include <linux/time.h>
 #include <linux/rcupdate.h>
 #include <linux/pid_namespace.h>
+#include <linux/hashtable.h>
+#include <linux/percpu.h>
+#include <linux/lglock.h>
 
 #include <asm/uaccess.h>
 
@@ -153,30 +156,53 @@ int lease_break_time = 45;
 #define for_each_lock(inode, lockp) \
 	for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
 
-static LIST_HEAD(file_lock_list);
-static LIST_HEAD(blocked_list);
-static DEFINE_SPINLOCK(file_lock_lock);
+/*
+ * The global file_lock_list is only used for displaying /proc/locks, so we
+ * keep a list on each CPU, with each list protected by its own spinlock via
+ * the file_lock_lglock. Note that alterations to the list also require that
+ * the relevant i_lock is held.
+ */
+DEFINE_STATIC_LGLOCK(file_lock_lglock);
+static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
 
 /*
- * Protects the two list heads above, plus the inode->i_flock list
+ * The blocked_hash is used to find POSIX lock loops for deadlock detection.
+ * It is protected by blocked_lock_lock.
+ *
+ * We hash locks by lockowner in order to optimize searching for the lock a
+ * particular lockowner is waiting on.
+ *
+ * FIXME: make this value scale via some heuristic? We generally will want more
+ * buckets when we have more lockowners holding locks, but that's a little
+ * difficult to determine without knowing what the workload will look like.
  */
-void lock_flocks(void)
-{
-	spin_lock(&file_lock_lock);
-}
-EXPORT_SYMBOL_GPL(lock_flocks);
+#define BLOCKED_HASH_BITS	7
+static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
 
-void unlock_flocks(void)
-{
-	spin_unlock(&file_lock_lock);
-}
-EXPORT_SYMBOL_GPL(unlock_flocks);
+/*
+ * This lock protects the blocked_hash. Generally, if you're accessing it, you
+ * want to be holding this lock.
+ *
+ * In addition, it also protects the fl->fl_block list, and the fl->fl_next
+ * pointer for file_lock structures that are acting as lock requests (in
+ * contrast to those that are acting as records of acquired locks).
+ *
+ * Note that when we acquire this lock in order to change the above fields,
+ * we often hold the i_lock as well. In certain cases, when reading the fields
+ * protected by this lock, we can skip acquiring it iff we already hold the
+ * i_lock.
+ *
+ * In particular, adding an entry to the fl_block list requires that you hold
+ * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting
+ * an entry from the list however only requires the file_lock_lock.
+ */
+static DEFINE_SPINLOCK(blocked_lock_lock);
 
 static struct kmem_cache *filelock_cache __read_mostly;
 
 static void locks_init_lock_heads(struct file_lock *fl)
 {
-	INIT_LIST_HEAD(&fl->fl_link);
+	INIT_HLIST_NODE(&fl->fl_link);
 	INIT_LIST_HEAD(&fl->fl_block);
 	init_waitqueue_head(&fl->fl_wait);
 }
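
The blocked_hash declared above is a fixed-size kernel hashtable with 2^7 bucket heads; hash_add() buckets an entry by the supplied key, and hash_for_each_possible() walks only the bucket a given key maps to. Condensed from the helpers this patch adds further down (same names and fields as in the patch):

    /* Sketch of the <linux/hashtable.h> usage pattern behind blocked_hash. */
    static void add_waiter(struct file_lock *waiter)
    {
        /* bucket chosen from the owner key, computed once at insert */
        hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
    }

    static struct file_lock *find_by_owner(struct file_lock *block_fl)
    {
        struct file_lock *fl;

        /* walk only the bucket this owner key maps to */
        hash_for_each_possible(blocked_hash, fl, fl_link,
                               posix_owner_key(block_fl))
            if (posix_same_owner(fl, block_fl))
                return fl;
        return NULL;
    }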
@@ -210,7 +236,7 @@ void locks_free_lock(struct file_lock *fl)
 {
 	BUG_ON(waitqueue_active(&fl->fl_wait));
 	BUG_ON(!list_empty(&fl->fl_block));
-	BUG_ON(!list_empty(&fl->fl_link));
+	BUG_ON(!hlist_unhashed(&fl->fl_link));
 
 	locks_release_private(fl);
 	kmem_cache_free(filelock_cache, fl);
@@ -484,47 +510,118 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 	return fl1->fl_owner == fl2->fl_owner;
 }
 
+/* Must be called with the i_lock held! */
+static inline void
+locks_insert_global_locks(struct file_lock *fl)
+{
+	lg_local_lock(&file_lock_lglock);
+	fl->fl_link_cpu = smp_processor_id();
+	hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
+	lg_local_unlock(&file_lock_lglock);
+}
+
+/* Must be called with the i_lock held! */
+static inline void
+locks_delete_global_locks(struct file_lock *fl)
+{
+	/*
+	 * Avoid taking lock if already unhashed. This is safe since this check
+	 * is done while holding the i_lock, and new insertions into the list
+	 * also require that it be held.
+	 */
+	if (hlist_unhashed(&fl->fl_link))
+		return;
+	lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+	hlist_del_init(&fl->fl_link);
+	lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+}
+
+static unsigned long
+posix_owner_key(struct file_lock *fl)
+{
+	if (fl->fl_lmops && fl->fl_lmops->lm_owner_key)
+		return fl->fl_lmops->lm_owner_key(fl);
+	return (unsigned long)fl->fl_owner;
+}
+
+static inline void
+locks_insert_global_blocked(struct file_lock *waiter)
+{
+	hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
+}
+
+static inline void
+locks_delete_global_blocked(struct file_lock *waiter)
+{
+	hash_del(&waiter->fl_link);
+}
+
 /* Remove waiter from blocker's block list.
  * When blocker ends up pointing to itself then the list is empty.
+ *
+ * Must be called with blocked_lock_lock held.
  */
 static void __locks_delete_block(struct file_lock *waiter)
 {
+	locks_delete_global_blocked(waiter);
 	list_del_init(&waiter->fl_block);
-	list_del_init(&waiter->fl_link);
 	waiter->fl_next = NULL;
 }
 
-/*
- */
-void locks_delete_block(struct file_lock *waiter)
+static void locks_delete_block(struct file_lock *waiter)
 {
-	lock_flocks();
+	spin_lock(&blocked_lock_lock);
 	__locks_delete_block(waiter);
-	unlock_flocks();
+	spin_unlock(&blocked_lock_lock);
 }
-EXPORT_SYMBOL(locks_delete_block);
 
 /* Insert waiter into blocker's block list.
  * We use a circular list so that processes can be easily woken up in
  * the order they blocked. The documentation doesn't require this but
  * it seems like the reasonable thing to do.
+ *
+ * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
+ * list itself is protected by the file_lock_list, but by ensuring that the
+ * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
+ * in some cases when we see that the fl_block list is empty.
  */
-static void locks_insert_block(struct file_lock *blocker,
-			       struct file_lock *waiter)
+static void __locks_insert_block(struct file_lock *blocker,
+					struct file_lock *waiter)
 {
 	BUG_ON(!list_empty(&waiter->fl_block));
-	list_add_tail(&waiter->fl_block, &blocker->fl_block);
 	waiter->fl_next = blocker;
+	list_add_tail(&waiter->fl_block, &blocker->fl_block);
 	if (IS_POSIX(blocker))
-		list_add(&waiter->fl_link, &blocked_list);
+		locks_insert_global_blocked(waiter);
 }
 
-/* Wake up processes blocked waiting for blocker.
- * If told to wait then schedule the processes until the block list
- * is empty, otherwise empty the block list ourselves.
+/* Must be called with i_lock held. */
+static void locks_insert_block(struct file_lock *blocker,
+					struct file_lock *waiter)
+{
+	spin_lock(&blocked_lock_lock);
+	__locks_insert_block(blocker, waiter);
+	spin_unlock(&blocked_lock_lock);
+}
+
+/*
+ * Wake up processes blocked waiting for blocker.
+ *
+ * Must be called with the inode->i_lock held!
  */
 static void locks_wake_up_blocks(struct file_lock *blocker)
 {
+	/*
+	 * Avoid taking global lock if list is empty. This is safe since new
+	 * blocked requests are only added to the list under the i_lock, and
+	 * the i_lock is always held here. Note that removal from the fl_block
+	 * list does not require the i_lock, so we must recheck list_empty()
+	 * after acquiring the blocked_lock_lock.
+	 */
+	if (list_empty(&blocker->fl_block))
+		return;
+
+	spin_lock(&blocked_lock_lock);
 	while (!list_empty(&blocker->fl_block)) {
 		struct file_lock *waiter;
 
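
The comments in this hunk pin down the lock hierarchy the rest of the file relies on: i_lock first, then blocked_lock_lock, with the list_empty() fast paths safe only because insertions always hold both. Restated as code, the only legal nesting when both locks are needed is the following (names taken from this patch; the reverse order would risk deadlock):

    static void block_on(struct inode *inode, struct file_lock *blocker,
                         struct file_lock *waiter)
    {
        spin_lock(&inode->i_lock);          /* 1: per-inode lock */
        spin_lock(&blocked_lock_lock);      /* 2: global blocked-hash lock */
        __locks_insert_block(blocker, waiter);
        spin_unlock(&blocked_lock_lock);
        spin_unlock(&inode->i_lock);
    }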
@@ -536,20 +633,23 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
 		else
 			wake_up(&waiter->fl_wait);
 	}
+	spin_unlock(&blocked_lock_lock);
 }
 
 /* Insert file lock fl into an inode's lock list at the position indicated
  * by pos. At the same time add the lock to the global file lock list.
+ *
+ * Must be called with the i_lock held!
  */
 static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
 {
-	list_add(&fl->fl_link, &file_lock_list);
-
 	fl->fl_nspid = get_pid(task_tgid(current));
 
 	/* insert into file's list */
 	fl->fl_next = *pos;
 	*pos = fl;
+
+	locks_insert_global_locks(fl);
 }
 
 /*
@@ -557,14 +657,17 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
  * Wake up processes that are blocked waiting for this lock,
  * notify the FS that the lock has been cleared and
  * finally free the lock.
+ *
+ * Must be called with the i_lock held!
  */
 static void locks_delete_lock(struct file_lock **thisfl_p)
 {
 	struct file_lock *fl = *thisfl_p;
 
+	locks_delete_global_locks(fl);
+
 	*thisfl_p = fl->fl_next;
 	fl->fl_next = NULL;
-	list_del_init(&fl->fl_link);
 
 	if (fl->fl_nspid) {
 		put_pid(fl->fl_nspid);
@@ -625,8 +728,9 @@ void
 posix_test_lock(struct file *filp, struct file_lock *fl)
 {
 	struct file_lock *cfl;
+	struct inode *inode = file_inode(filp);
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) {
 		if (!IS_POSIX(cfl))
 			continue;
@@ -639,7 +743,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 		fl->fl_pid = pid_vnr(cfl->fl_nspid);
 	} else
 		fl->fl_type = F_UNLCK;
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return;
 }
 EXPORT_SYMBOL(posix_test_lock);
@@ -676,13 +780,14 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
 {
 	struct file_lock *fl;
 
-	list_for_each_entry(fl, &blocked_list, fl_link) {
+	hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
 		if (posix_same_owner(fl, block_fl))
 			return fl->fl_next;
 	}
 	return NULL;
 }
 
+/* Must be called with the blocked_lock_lock held! */
 static int posix_locks_deadlock(struct file_lock *caller_fl,
 				struct file_lock *block_fl)
 {
@@ -718,7 +823,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 		return -ENOMEM;
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	if (request->fl_flags & FL_ACCESS)
 		goto find_conflict;
 
@@ -748,9 +853,9 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	 * give it the opportunity to lock the file.
 	 */
 	if (found) {
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		cond_resched();
-		lock_flocks();
+		spin_lock(&inode->i_lock);
 	}
 
 find_conflict:
@@ -777,7 +882,7 @@ find_conflict:
 	error = 0;
 
 out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	if (new_fl)
 		locks_free_lock(new_fl);
 	return error;
@@ -791,7 +896,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	struct file_lock *left = NULL;
 	struct file_lock *right = NULL;
 	struct file_lock **before;
-	int error, added = 0;
+	int error;
+	bool added = false;
 
 	/*
 	 * We may need two file_lock structures for this operation,
@@ -806,7 +912,12 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		new_fl2 = locks_alloc_lock();
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
+	/*
+	 * New lock request. Walk all POSIX locks and look for conflicts. If
+	 * there are any, either return error or put the request on the
+	 * blocker's list of waiters and the global blocked_hash.
+	 */
 	if (request->fl_type != F_UNLCK) {
 		for_each_lock(inode, before) {
 			fl = *before;
@@ -819,11 +930,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			error = -EAGAIN;
 			if (!(request->fl_flags & FL_SLEEP))
 				goto out;
+			/*
+			 * Deadlock detection and insertion into the blocked
+			 * locks list must be done while holding the same lock!
+			 */
 			error = -EDEADLK;
-			if (posix_locks_deadlock(request, fl))
-				goto out;
-			error = FILE_LOCK_DEFERRED;
-			locks_insert_block(fl, request);
+			spin_lock(&blocked_lock_lock);
+			if (likely(!posix_locks_deadlock(request, fl))) {
+				error = FILE_LOCK_DEFERRED;
+				__locks_insert_block(fl, request);
+			}
+			spin_unlock(&blocked_lock_lock);
 			goto out;
 		}
 	}
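
The hunk above is the heart of the change: deadlock detection and enqueueing on the blocker now happen inside one blocked_lock_lock critical section, since a wait-for graph checked under one lock and modified under another could race. Conceptually the detector is just pointer chasing over that graph; the sketch below is simplified and omits the iteration bound the real posix_locks_deadlock() applies:

    /* Sketch: follow "which lock is that owner waiting on?" links from
     * the lock we would block on; meeting our own owner closes a cycle. */
    static int would_deadlock(struct file_lock *caller_fl,
                              struct file_lock *block_fl)
    {
        while (block_fl) {
            if (posix_same_owner(caller_fl, block_fl))
                return 1;    /* granting this wait would deadlock */
            block_fl = what_owner_is_waiting_for(block_fl);
        }
        return 0;
    }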
@@ -845,7 +962,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		before = &fl->fl_next;
 	}
 
-	/* Process locks with this owner.  */
+	/* Process locks with this owner. */
 	while ((fl = *before) && posix_same_owner(request, fl)) {
 		/* Detect adjacent or overlapping regions (if same lock type)
 		 */
@@ -880,7 +997,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 				continue;
 			}
 			request = fl;
-			added = 1;
+			added = true;
 		}
 		else {
 			/* Processing for different lock types is a bit
@@ -891,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			if (fl->fl_start > request->fl_end)
 				break;
 			if (request->fl_type == F_UNLCK)
-				added = 1;
+				added = true;
 			if (fl->fl_start < request->fl_start)
 				left = fl;
 			/* If the next lock in the list has a higher end
@@ -921,7 +1038,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 				locks_release_private(fl);
 				locks_copy_private(fl, request);
 				request = fl;
-				added = 1;
+				added = true;
 			}
 		}
 		/* Go on to next lock.
@@ -931,10 +1048,9 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	}
 
 	/*
-	 * The above code only modifies existing locks in case of
-	 * merging or replacing. If new lock(s) need to be inserted
-	 * all modifications are done bellow this, so it's safe yet to
-	 * bail out.
+	 * The above code only modifies existing locks in case of merging or
+	 * replacing. If new lock(s) need to be inserted all modifications are
+	 * done below this, so it's safe yet to bail out.
 	 */
 	error = -ENOLCK; /* "no luck" */
 	if (right && left == right && !new_fl2)
@@ -974,7 +1090,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		locks_wake_up_blocks(left);
 	}
  out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	/*
 	 * Free any unused locks.
 	 */
@@ -1049,14 +1165,14 @@ int locks_mandatory_locked(struct inode *inode)
 	/*
 	 * Search the lock list for this inode for any POSIX locks.
 	 */
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!IS_POSIX(fl))
 			continue;
 		if (fl->fl_owner != owner)
 			break;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return fl ? -EAGAIN : 0;
 }
 
@@ -1199,7 +1315,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
 	if (IS_ERR(new_fl))
 		return PTR_ERR(new_fl);
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 
 	time_out_leases(inode);
 
@@ -1249,11 +1365,11 @@ restart:
 			break_time++;
 	}
 	locks_insert_block(flock, new_fl);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
 						!new_fl->fl_next, break_time);
-	lock_flocks();
-	__locks_delete_block(new_fl);
+	spin_lock(&inode->i_lock);
+	locks_delete_block(new_fl);
 	if (error >= 0) {
 		if (error == 0)
 			time_out_leases(inode);
@@ -1270,7 +1386,7 @@ restart:
 	}
 
 out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	locks_free_lock(new_fl);
 	return error;
 }
@@ -1323,9 +1439,10 @@ EXPORT_SYMBOL(lease_get_mtime);
 int fcntl_getlease(struct file *filp)
 {
 	struct file_lock *fl;
+	struct inode *inode = file_inode(filp);
 	int type = F_UNLCK;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	time_out_leases(file_inode(filp));
 	for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);
 			fl = fl->fl_next) {
@@ -1334,11 +1451,11 @@ int fcntl_getlease(struct file *filp)
 			break;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return type;
 }
 
-int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
+static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
 	struct dentry *dentry = filp->f_path.dentry;
@@ -1351,7 +1468,7 @@ int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
 	if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
 		goto out;
 	if ((arg == F_WRLCK)
-	    && ((dentry->d_count > 1)
+	    && ((d_count(dentry) > 1)
 		|| (atomic_read(&inode->i_count) > 1)))
 		goto out;
 
@@ -1403,7 +1520,7 @@ out:
 	return error;
 }
 
-int generic_delete_lease(struct file *filp, struct file_lock **flp)
+static int generic_delete_lease(struct file *filp, struct file_lock **flp)
 {
 	struct file_lock *fl, **before;
 	struct dentry *dentry = filp->f_path.dentry;
@@ -1428,7 +1545,7 @@ int generic_delete_lease(struct file *filp, struct file_lock **flp)
  * The (input) flp->fl_lmops->lm_break function is required
  * by break_lease().
  *
- * Called with file_lock_lock held.
+ * Called with inode->i_lock held.
  */
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
@@ -1497,11 +1614,12 @@ static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 
 int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 {
+	struct inode *inode = file_inode(filp);
 	int error;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	error = __vfs_setlease(filp, arg, lease);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	return error;
 }
@@ -1519,6 +1637,7 @@ static int do_fcntl_delete_lease(struct file *filp)
 static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 {
 	struct file_lock *fl, *ret;
+	struct inode *inode = file_inode(filp);
 	struct fasync_struct *new;
 	int error;
 
@@ -1532,10 +1651,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 		return -ENOMEM;
 	}
 	ret = fl;
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	error = __vfs_setlease(filp, arg, &ret);
 	if (error) {
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		locks_free_lock(fl);
 		goto out_free_fasync;
 	}
@@ -1552,7 +1671,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 	new = NULL;
 
 	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 out_free_fasync:
 	if (new)
@@ -2076,7 +2195,7 @@ void locks_remove_flock(struct file *filp)
 		fl.fl_ops->fl_release_private(&fl);
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	before = &inode->i_flock;
 
 	while ((fl = *before) != NULL) {
@@ -2094,30 +2213,28 @@ void locks_remove_flock(struct file *filp)
 		}
 		before = &fl->fl_next;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 }
 
 /**
  * posix_unblock_lock - stop waiting for a file lock
- *	@filp:   how the file was opened
  *	@waiter: the lock which was waiting
  *
  *	lockd needs to block waiting for locks.
  */
 int
-posix_unblock_lock(struct file *filp, struct file_lock *waiter)
+posix_unblock_lock(struct file_lock *waiter)
 {
 	int status = 0;
 
-	lock_flocks();
+	spin_lock(&blocked_lock_lock);
 	if (waiter->fl_next)
 		__locks_delete_block(waiter);
 	else
 		status = -ENOENT;
-	unlock_flocks();
+	spin_unlock(&blocked_lock_lock);
 	return status;
 }
-
 EXPORT_SYMBOL(posix_unblock_lock);
 
 /**
@@ -2140,6 +2257,11 @@ EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
+struct locks_iterator {
+	int	li_cpu;
+	loff_t	li_pos;
+};
+
 static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 			    loff_t id, char *pfx)
 {
@@ -2213,37 +2335,41 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 
 static int locks_show(struct seq_file *f, void *v)
 {
+	struct locks_iterator *iter = f->private;
 	struct file_lock *fl, *bfl;
 
-	fl = list_entry(v, struct file_lock, fl_link);
+	fl = hlist_entry(v, struct file_lock, fl_link);
 
-	lock_get_status(f, fl, *((loff_t *)f->private), "");
+	lock_get_status(f, fl, iter->li_pos, "");
 
 	list_for_each_entry(bfl, &fl->fl_block, fl_block)
-		lock_get_status(f, bfl, *((loff_t *)f->private), " ->");
+		lock_get_status(f, bfl, iter->li_pos, " ->");
 
 	return 0;
 }
 
 static void *locks_start(struct seq_file *f, loff_t *pos)
 {
-	loff_t *p = f->private;
+	struct locks_iterator *iter = f->private;
 
-	lock_flocks();
-	*p = (*pos + 1);
-	return seq_list_start(&file_lock_list, *pos);
+	iter->li_pos = *pos + 1;
+	lg_global_lock(&file_lock_lglock);
+	spin_lock(&blocked_lock_lock);
+	return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
 }
 
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 {
-	loff_t *p = f->private;
-	++*p;
-	return seq_list_next(v, &file_lock_list, pos);
+	struct locks_iterator *iter = f->private;
+
+	++iter->li_pos;
+	return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
 }
 
 static void locks_stop(struct seq_file *f, void *v)
 {
-	unlock_flocks();
+	spin_unlock(&blocked_lock_lock);
+	lg_global_unlock(&file_lock_lglock);
 }
 
 static const struct seq_operations locks_seq_operations = {
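
With file_lock_list split per CPU, the /proc/locks cursor needs both a position and a CPU (struct locks_iterator) so seq_hlist_next_percpu() can hop from one CPU's list to the next, and locks_start() must take every per-CPU spinlock via lg_global_lock() to get a stable snapshot. The division of labor the lglock buys, in outline (names from this patch):

    /* Writers touch only their own CPU's list under the cheap local lock;
     * the rare global reader (this seq_file) locks all CPUs at once. */
    static void writer_insert(struct file_lock *fl)
    {
        lg_local_lock(&file_lock_lglock);
        fl->fl_link_cpu = smp_processor_id();
        hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
        lg_local_unlock(&file_lock_lglock);
    }

    static void reader_walk_all(void)
    {
        lg_global_lock(&file_lock_lglock);   /* every CPU's spinlock */
        /* ... seq_hlist_start_percpu()/seq_hlist_next_percpu() walk ... */
        lg_global_unlock(&file_lock_lglock);
    }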
@@ -2255,7 +2381,8 @@ static const struct seq_operations locks_seq_operations = {
 
 static int locks_open(struct inode *inode, struct file *filp)
 {
-	return seq_open_private(filp, &locks_seq_operations, sizeof(loff_t));
+	return seq_open_private(filp, &locks_seq_operations,
+					sizeof(struct locks_iterator));
 }
 
 static const struct file_operations proc_locks_operations = {
@@ -2290,7 +2417,8 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_flocks();
+
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if (fl->fl_type == F_RDLCK)
@@ -2307,7 +2435,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 			result = 0;
 			break;
 		}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return result;
 }
 
@@ -2330,7 +2458,8 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_flocks();
+
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2345,7 +2474,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 			result = 0;
 			break;
 		}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return result;
 }
 
@@ -2353,9 +2482,16 @@ EXPORT_SYMBOL(lock_may_write);
 
 static int __init filelock_init(void)
 {
+	int i;
+
 	filelock_cache = kmem_cache_create("file_lock_cache",
 			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
 
+	lg_lock_init(&file_lock_lglock, "file_lock_lglock");
+
+	for_each_possible_cpu(i)
+		INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+
 	return 0;
 }
 
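
filelock_init() initializes the list head of every possible CPU, not just the online ones, because locks may later be inserted from a CPU that was offline at boot. The same boot-time pattern in isolation (the per-CPU variable name here is hypothetical):

    /* Sketch of the per-CPU container init used above. */
    static DEFINE_PER_CPU(struct hlist_head, example_list);

    static int __init example_init(void)
    {
        int i;

        for_each_possible_cpu(i)
            INIT_HLIST_HEAD(per_cpu_ptr(&example_list, i));
        return 0;
    }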
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index b82751082112..6bdc347008f5 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -281,17 +281,23 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
281 281
282/* FIXME: readdir currently has it's own dir_walk code. I don't see a good 282/* FIXME: readdir currently has it's own dir_walk code. I don't see a good
283 * way to combine the two copies */ 283 * way to combine the two copies */
284#define IMPLICIT_NODES 2 284static int logfs_readdir(struct file *file, struct dir_context *ctx)
285static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
286{ 285{
287 struct inode *dir = file_inode(file); 286 struct inode *dir = file_inode(file);
288 loff_t pos = file->f_pos - IMPLICIT_NODES; 287 loff_t pos;
289 struct page *page; 288 struct page *page;
290 struct logfs_disk_dentry *dd; 289 struct logfs_disk_dentry *dd;
291 int full;
292 290
291 if (ctx->pos < 0)
292 return -EINVAL;
293
294 if (!dir_emit_dots(file, ctx))
295 return 0;
296
297 pos = ctx->pos - 2;
293 BUG_ON(pos < 0); 298 BUG_ON(pos < 0);
294 for (;; pos++) { 299 for (;; pos++, ctx->pos++) {
300 bool full;
295 if (beyond_eof(dir, pos)) 301 if (beyond_eof(dir, pos))
296 break; 302 break;
297 if (!logfs_exist_block(dir, pos)) { 303 if (!logfs_exist_block(dir, pos)) {
@@ -306,42 +312,17 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
306 dd = kmap(page); 312 dd = kmap(page);
307 BUG_ON(dd->namelen == 0); 313 BUG_ON(dd->namelen == 0);
308 314
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), 315 full = !dir_emit(ctx, (char *)dd->name,
310 pos, be64_to_cpu(dd->ino), dd->type); 316 be16_to_cpu(dd->namelen),
317 be64_to_cpu(dd->ino), dd->type);
311 kunmap(page); 318 kunmap(page);
312 page_cache_release(page); 319 page_cache_release(page);
313 if (full) 320 if (full)
314 break; 321 break;
315 } 322 }
316
317 file->f_pos = pos + IMPLICIT_NODES;
318 return 0; 323 return 0;
319} 324}
320 325
321static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
322{
323 struct inode *inode = file_inode(file);
324 ino_t pino = parent_ino(file->f_dentry);
325 int err;
326
327 if (file->f_pos < 0)
328 return -EINVAL;
329
330 if (file->f_pos == 0) {
331 if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
332 return 0;
333 file->f_pos++;
334 }
335 if (file->f_pos == 1) {
336 if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
337 return 0;
338 file->f_pos++;
339 }
340
341 err = __logfs_readdir(file, buf, filldir);
342 return err;
343}
344
345static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name) 326static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
346{ 327{
347 dd->namelen = cpu_to_be16(name->len); 328 dd->namelen = cpu_to_be16(name->len);
@@ -814,7 +795,7 @@ const struct inode_operations logfs_dir_iops = {
814const struct file_operations logfs_dir_fops = { 795const struct file_operations logfs_dir_fops = {
815 .fsync = logfs_fsync, 796 .fsync = logfs_fsync,
816 .unlocked_ioctl = logfs_ioctl, 797 .unlocked_ioctl = logfs_ioctl,
817 .readdir = logfs_readdir, 798 .iterate = logfs_readdir,
818 .read = generic_read_dir, 799 .read = generic_read_dir,
819 .llseek = default_llseek, 800 .llseek = default_llseek,
820}; 801};
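
This logfs conversion is the template for the whole series: ->readdir(file, buf, filldir) becomes ->iterate(file, ctx), the position lives in ctx->pos rather than file->f_pos, dir_emit_dots() replaces the hand-rolled "." and ".." emission, and dir_emit() returns false once the user buffer is full instead of a filldir error code. A minimal iterate method under those 3.11-era assumptions (examplefs and its single fixed entry are hypothetical):

static int examplefs_iterate(struct file *file, struct dir_context *ctx)
{
	/* emits "." and "..", advancing ctx->pos past position 1 */
	if (!dir_emit_dots(file, ctx))
		return 0;

	/* one fabricated entry at position 2; a false return from
	 * dir_emit() means the buffer is full, which is not an error */
	if (ctx->pos == 2) {
		if (!dir_emit(ctx, "example", 7, 100 /* ino */, DT_REG))
			return 0;
		ctx->pos++;
	}
	return 0;
}
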
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index c2219a6dd3c8..57914fc32b62 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -159,7 +159,8 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
159 return __logfs_writepage(page); 159 return __logfs_writepage(page);
160} 160}
161 161
162static void logfs_invalidatepage(struct page *page, unsigned long offset) 162static void logfs_invalidatepage(struct page *page, unsigned int offset,
163 unsigned int length)
163{ 164{
164 struct logfs_block *block = logfs_block(page); 165 struct logfs_block *block = logfs_block(page);
165 166
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 038da0991794..d448a777166b 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -884,7 +884,8 @@ static struct logfs_area *alloc_area(struct super_block *sb)
884 return area; 884 return area;
885} 885}
886 886
887static void map_invalidatepage(struct page *page, unsigned long l) 887static void map_invalidatepage(struct page *page, unsigned int o,
888 unsigned int l)
888{ 889{
889 return; 890 return;
890} 891}
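
Both logfs hunks above track an independent VFS change: ->invalidatepage now takes an (offset, length) pair so a sub-range of a page can be invalidated, where the old single-offset form always meant "from offset to the end of the page". A sketch of the new signature (examplefs is hypothetical; PAGE_CACHE_SIZE is the 3.11-era page-cache unit):

static void examplefs_invalidatepage(struct page *page, unsigned int offset,
				     unsigned int length)
{
	/* only a whole-page invalidation drops private state in this
	 * sketch; partial ranges leave the page's metadata in place */
	if (offset == 0 && length == PAGE_CACHE_SIZE)
		try_to_release_page(page, 0);
}
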
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index a9ed6f36e6ea..dfaf6fa9b7b5 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -16,12 +16,12 @@
16typedef struct minix_dir_entry minix_dirent; 16typedef struct minix_dir_entry minix_dirent;
17typedef struct minix3_dir_entry minix3_dirent; 17typedef struct minix3_dir_entry minix3_dirent;
18 18
19static int minix_readdir(struct file *, void *, filldir_t); 19static int minix_readdir(struct file *, struct dir_context *);
20 20
21const struct file_operations minix_dir_operations = { 21const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek, 22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .iterate = minix_readdir,
25 .fsync = generic_file_fsync, 25 .fsync = generic_file_fsync,
26}; 26};
27 27
@@ -82,22 +82,23 @@ static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
82 return (void*)((char*)de + sbi->s_dirsize); 82 return (void*)((char*)de + sbi->s_dirsize);
83} 83}
84 84
85static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) 85static int minix_readdir(struct file *file, struct dir_context *ctx)
86{ 86{
87 unsigned long pos = filp->f_pos; 87 struct inode *inode = file_inode(file);
88 struct inode *inode = file_inode(filp);
89 struct super_block *sb = inode->i_sb; 88 struct super_block *sb = inode->i_sb;
90 unsigned offset = pos & ~PAGE_CACHE_MASK;
91 unsigned long n = pos >> PAGE_CACHE_SHIFT;
92 unsigned long npages = dir_pages(inode);
93 struct minix_sb_info *sbi = minix_sb(sb); 89 struct minix_sb_info *sbi = minix_sb(sb);
94 unsigned chunk_size = sbi->s_dirsize; 90 unsigned chunk_size = sbi->s_dirsize;
95 char *name; 91 unsigned long npages = dir_pages(inode);
96 __u32 inumber; 92 unsigned long pos = ctx->pos;
93 unsigned offset;
94 unsigned long n;
97 95
98 pos = (pos + chunk_size-1) & ~(chunk_size-1); 96 ctx->pos = pos = ALIGN(pos, chunk_size);
99 if (pos >= inode->i_size) 97 if (pos >= inode->i_size)
100 goto done; 98 return 0;
99
100 offset = pos & ~PAGE_CACHE_MASK;
101 n = pos >> PAGE_CACHE_SHIFT;
101 102
102 for ( ; n < npages; n++, offset = 0) { 103 for ( ; n < npages; n++, offset = 0) {
103 char *p, *kaddr, *limit; 104 char *p, *kaddr, *limit;
@@ -109,6 +110,8 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
109 p = kaddr+offset; 110 p = kaddr+offset;
110 limit = kaddr + minix_last_byte(inode, n) - chunk_size; 111 limit = kaddr + minix_last_byte(inode, n) - chunk_size;
111 for ( ; p <= limit; p = minix_next_entry(p, sbi)) { 112 for ( ; p <= limit; p = minix_next_entry(p, sbi)) {
113 const char *name;
114 __u32 inumber;
112 if (sbi->s_version == MINIX_V3) { 115 if (sbi->s_version == MINIX_V3) {
113 minix3_dirent *de3 = (minix3_dirent *)p; 116 minix3_dirent *de3 = (minix3_dirent *)p;
114 name = de3->name; 117 name = de3->name;
@@ -119,24 +122,17 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir)
119 inumber = de->inode; 122 inumber = de->inode;
120 } 123 }
121 if (inumber) { 124 if (inumber) {
122 int over;
123
124 unsigned l = strnlen(name, sbi->s_namelen); 125 unsigned l = strnlen(name, sbi->s_namelen);
125 offset = p - kaddr; 126 if (!dir_emit(ctx, name, l,
126 over = filldir(dirent, name, l, 127 inumber, DT_UNKNOWN)) {
127 (n << PAGE_CACHE_SHIFT) | offset,
128 inumber, DT_UNKNOWN);
129 if (over) {
130 dir_put_page(page); 128 dir_put_page(page);
131 goto done; 129 return 0;
132 } 130 }
133 } 131 }
132 ctx->pos += chunk_size;
134 } 133 }
135 dir_put_page(page); 134 dir_put_page(page);
136 } 135 }
137
138done:
139 filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset;
140 return 0; 136 return 0;
141} 137}
142 138
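
minix addresses a directory by byte offset into fixed-size on-disk entries, so the rewrite rounds ctx->pos up to an entry boundary with ALIGN() and advances it by chunk_size only after an entry is successfully emitted, which keeps resumption stable across getdents() calls. The bookkeeping, isolated as a hypothetical helper:

static bool emit_fixed_size_entry(struct dir_context *ctx, const char *name,
				  unsigned len, u32 inumber,
				  unsigned chunk_size)
{
	if (!dir_emit(ctx, name, len, inumber, DT_UNKNOWN))
		return false;	/* buffer full; ctx->pos still names this entry */
	ctx->pos += chunk_size;	/* resume at the next fixed-size slot */
	return true;
}
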
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 0db73d9dd668..cd950e2331b6 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -54,6 +54,18 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode,
54 return error; 54 return error;
55} 55}
56 56
57static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
58{
59 int error;
60 struct inode *inode = minix_new_inode(dir, mode, &error);
61 if (inode) {
62 minix_set_inode(inode, 0);
63 mark_inode_dirty(inode);
64 d_tmpfile(dentry, inode);
65 }
66 return error;
67}
68
57static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, 69static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode,
58 bool excl) 70 bool excl)
59{ 71{
@@ -254,4 +266,5 @@ const struct inode_operations minix_dir_inode_operations = {
254 .mknod = minix_mknod, 266 .mknod = minix_mknod,
255 .rename = minix_rename, 267 .rename = minix_rename,
256 .getattr = minix_getattr, 268 .getattr = minix_getattr,
269 .tmpfile = minix_tmpfile,
257}; 270};
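
minix_tmpfile() is the filesystem half of the new O_TMPFILE support in this merge: allocate an inode with a zero link count and attach it to the dentry with d_tmpfile(). From userspace the feature looks like the sketch below; it assumes a 3.11-era kernel, a filesystem with a ->tmpfile method, and a libc that exposes O_TMPFILE (otherwise open() fails, typically with EOPNOTSUPP or EINVAL):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* an unnamed, already-unlinked regular file inside /tmp */
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);
	if (fd < 0) {
		perror("open(O_TMPFILE)");
		return 1;
	}
	write(fd, "scratch data\n", 13);
	close(fd);	/* contents vanish: the inode was never linked */
	return 0;
}
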
diff --git a/fs/namei.c b/fs/namei.c
index 9ed9361223c0..89a612e392eb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1352,7 +1352,7 @@ static int lookup_fast(struct nameidata *nd,
1352 */ 1352 */
1353 if (nd->flags & LOOKUP_RCU) { 1353 if (nd->flags & LOOKUP_RCU) {
1354 unsigned seq; 1354 unsigned seq;
1355 dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode); 1355 dentry = __d_lookup_rcu(parent, &nd->last, &seq);
1356 if (!dentry) 1356 if (!dentry)
1357 goto unlazy; 1357 goto unlazy;
1358 1358
@@ -1787,8 +1787,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1787 struct dentry *parent = nd->path.dentry; 1787 struct dentry *parent = nd->path.dentry;
1788 nd->flags &= ~LOOKUP_JUMPED; 1788 nd->flags &= ~LOOKUP_JUMPED;
1789 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 1789 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1790 err = parent->d_op->d_hash(parent, nd->inode, 1790 err = parent->d_op->d_hash(parent, &this);
1791 &this);
1792 if (err < 0) 1791 if (err < 0)
1793 break; 1792 break;
1794 } 1793 }
@@ -2121,7 +2120,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2121 * to use its own hash.. 2120 * to use its own hash..
2122 */ 2121 */
2123 if (base->d_flags & DCACHE_OP_HASH) { 2122 if (base->d_flags & DCACHE_OP_HASH) {
2124 int err = base->d_op->d_hash(base, base->d_inode, &this); 2123 int err = base->d_op->d_hash(base, &this);
2125 if (err < 0) 2124 if (err < 0)
2126 return ERR_PTR(err); 2125 return ERR_PTR(err);
2127 } 2126 }
@@ -2690,28 +2689,10 @@ static int do_last(struct nameidata *nd, struct path *path,
2690 nd->flags &= ~LOOKUP_PARENT; 2689 nd->flags &= ~LOOKUP_PARENT;
2691 nd->flags |= op->intent; 2690 nd->flags |= op->intent;
2692 2691
2693 switch (nd->last_type) { 2692 if (nd->last_type != LAST_NORM) {
2694 case LAST_DOTDOT:
2695 case LAST_DOT:
2696 error = handle_dots(nd, nd->last_type); 2693 error = handle_dots(nd, nd->last_type);
2697 if (error) 2694 if (error)
2698 return error; 2695 return error;
2699 /* fallthrough */
2700 case LAST_ROOT:
2701 error = complete_walk(nd);
2702 if (error)
2703 return error;
2704 audit_inode(name, nd->path.dentry, 0);
2705 if (open_flag & O_CREAT) {
2706 error = -EISDIR;
2707 goto out;
2708 }
2709 goto finish_open;
2710 case LAST_BIND:
2711 error = complete_walk(nd);
2712 if (error)
2713 return error;
2714 audit_inode(name, dir, 0);
2715 goto finish_open; 2696 goto finish_open;
2716 } 2697 }
2717 2698
@@ -2841,19 +2822,19 @@ finish_lookup:
2841 } 2822 }
2842 nd->inode = inode; 2823 nd->inode = inode;
2843 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ 2824 /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
2825finish_open:
2844 error = complete_walk(nd); 2826 error = complete_walk(nd);
2845 if (error) { 2827 if (error) {
2846 path_put(&save_parent); 2828 path_put(&save_parent);
2847 return error; 2829 return error;
2848 } 2830 }
2831 audit_inode(name, nd->path.dentry, 0);
2849 error = -EISDIR; 2832 error = -EISDIR;
2850 if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) 2833 if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
2851 goto out; 2834 goto out;
2852 error = -ENOTDIR; 2835 error = -ENOTDIR;
2853 if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode)) 2836 if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode))
2854 goto out; 2837 goto out;
2855 audit_inode(name, nd->path.dentry, 0);
2856finish_open:
2857 if (!S_ISREG(nd->inode->i_mode)) 2838 if (!S_ISREG(nd->inode->i_mode))
2858 will_truncate = false; 2839 will_truncate = false;
2859 2840
@@ -2920,6 +2901,67 @@ stale_open:
2920 goto retry_lookup; 2901 goto retry_lookup;
2921} 2902}
2922 2903
2904static int do_tmpfile(int dfd, struct filename *pathname,
2905 struct nameidata *nd, int flags,
2906 const struct open_flags *op,
2907 struct file *file, int *opened)
2908{
2909 static const struct qstr name = QSTR_INIT("/", 1);
2910 struct dentry *dentry, *child;
2911 struct inode *dir;
2912 int error = path_lookupat(dfd, pathname->name,
2913 flags | LOOKUP_DIRECTORY, nd);
2914 if (unlikely(error))
2915 return error;
2916 error = mnt_want_write(nd->path.mnt);
2917 if (unlikely(error))
2918 goto out;
2919 /* we want directory to be writable */
2920 error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
2921 if (error)
2922 goto out2;
2923 dentry = nd->path.dentry;
2924 dir = dentry->d_inode;
2925 if (!dir->i_op->tmpfile) {
2926 error = -EOPNOTSUPP;
2927 goto out2;
2928 }
2929 child = d_alloc(dentry, &name);
2930 if (unlikely(!child)) {
2931 error = -ENOMEM;
2932 goto out2;
2933 }
2934 nd->flags &= ~LOOKUP_DIRECTORY;
2935 nd->flags |= op->intent;
2936 dput(nd->path.dentry);
2937 nd->path.dentry = child;
2938 error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
2939 if (error)
2940 goto out2;
2941 audit_inode(pathname, nd->path.dentry, 0);
2942 error = may_open(&nd->path, op->acc_mode, op->open_flag);
2943 if (error)
2944 goto out2;
2945 file->f_path.mnt = nd->path.mnt;
2946 error = finish_open(file, nd->path.dentry, NULL, opened);
2947 if (error)
2948 goto out2;
2949 error = open_check_o_direct(file);
2950 if (error) {
2951 fput(file);
2952 } else if (!(op->open_flag & O_EXCL)) {
2953 struct inode *inode = file_inode(file);
2954 spin_lock(&inode->i_lock);
2955 inode->i_state |= I_LINKABLE;
2956 spin_unlock(&inode->i_lock);
2957 }
2958out2:
2959 mnt_drop_write(nd->path.mnt);
2960out:
2961 path_put(&nd->path);
2962 return error;
2963}
2964
2923static struct file *path_openat(int dfd, struct filename *pathname, 2965static struct file *path_openat(int dfd, struct filename *pathname,
2924 struct nameidata *nd, const struct open_flags *op, int flags) 2966 struct nameidata *nd, const struct open_flags *op, int flags)
2925{ 2967{
@@ -2935,6 +2977,11 @@ static struct file *path_openat(int dfd, struct filename *pathname,
2935 2977
2936 file->f_flags = op->open_flag; 2978 file->f_flags = op->open_flag;
2937 2979
2980 if (unlikely(file->f_flags & __O_TMPFILE)) {
2981 error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
2982 goto out;
2983 }
2984
2938 error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base); 2985 error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
2939 if (unlikely(error)) 2986 if (unlikely(error))
2940 goto out; 2987 goto out;
@@ -2987,9 +3034,10 @@ out:
2987} 3034}
2988 3035
2989struct file *do_filp_open(int dfd, struct filename *pathname, 3036struct file *do_filp_open(int dfd, struct filename *pathname,
2990 const struct open_flags *op, int flags) 3037 const struct open_flags *op)
2991{ 3038{
2992 struct nameidata nd; 3039 struct nameidata nd;
3040 int flags = op->lookup_flags;
2993 struct file *filp; 3041 struct file *filp;
2994 3042
2995 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); 3043 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
@@ -3001,17 +3049,16 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
3001} 3049}
3002 3050
3003struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, 3051struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3004 const char *name, const struct open_flags *op, int flags) 3052 const char *name, const struct open_flags *op)
3005{ 3053{
3006 struct nameidata nd; 3054 struct nameidata nd;
3007 struct file *file; 3055 struct file *file;
3008 struct filename filename = { .name = name }; 3056 struct filename filename = { .name = name };
3057 int flags = op->lookup_flags | LOOKUP_ROOT;
3009 3058
3010 nd.root.mnt = mnt; 3059 nd.root.mnt = mnt;
3011 nd.root.dentry = dentry; 3060 nd.root.dentry = dentry;
3012 3061
3013 flags |= LOOKUP_ROOT;
3014
3015 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) 3062 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
3016 return ERR_PTR(-ELOOP); 3063 return ERR_PTR(-ELOOP);
3017 3064
@@ -3586,12 +3633,18 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
3586 3633
3587 mutex_lock(&inode->i_mutex); 3634 mutex_lock(&inode->i_mutex);
3588 /* Make sure we don't allow creating hardlink to an unlinked file */ 3635 /* Make sure we don't allow creating hardlink to an unlinked file */
3589 if (inode->i_nlink == 0) 3636 if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
3590 error = -ENOENT; 3637 error = -ENOENT;
3591 else if (max_links && inode->i_nlink >= max_links) 3638 else if (max_links && inode->i_nlink >= max_links)
3592 error = -EMLINK; 3639 error = -EMLINK;
3593 else 3640 else
3594 error = dir->i_op->link(old_dentry, dir, new_dentry); 3641 error = dir->i_op->link(old_dentry, dir, new_dentry);
3642
3643 if (!error && (inode->i_state & I_LINKABLE)) {
3644 spin_lock(&inode->i_lock);
3645 inode->i_state &= ~I_LINKABLE;
3646 spin_unlock(&inode->i_lock);
3647 }
3595 mutex_unlock(&inode->i_mutex); 3648 mutex_unlock(&inode->i_mutex);
3596 if (!error) 3649 if (!error)
3597 fsnotify_link(dir, inode, new_dentry); 3650 fsnotify_link(dir, inode, new_dentry);
@@ -3618,15 +3671,11 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3618 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) 3671 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
3619 return -EINVAL; 3672 return -EINVAL;
3620 /* 3673 /*
3621 * To use null names we require CAP_DAC_READ_SEARCH 3674 * Using empty names is equivalent to using AT_SYMLINK_FOLLOW
3622 * This ensures that not everyone will be able to create 3675 * on /proc/self/fd/<fd>.
3623 * handlink using the passed filedescriptor.
3624 */ 3676 */
3625 if (flags & AT_EMPTY_PATH) { 3677 if (flags & AT_EMPTY_PATH)
3626 if (!capable(CAP_DAC_READ_SEARCH))
3627 return -ENOENT;
3628 how = LOOKUP_EMPTY; 3678 how = LOOKUP_EMPTY;
3629 }
3630 3679
3631 if (flags & AT_SYMLINK_FOLLOW) 3680 if (flags & AT_SYMLINK_FOLLOW)
3632 how |= LOOKUP_FOLLOW; 3681 how |= LOOKUP_FOLLOW;
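
Two of the vfs_link()/linkat() hunks above cooperate with O_TMPFILE: do_tmpfile() marks the new inode I_LINKABLE (unless O_EXCL was given), so vfs_link() now accepts it despite i_nlink == 0, and AT_EMPTY_PATH no longer requires CAP_DAC_READ_SEARCH because it is equivalent to following /proc/self/fd/<fd>. Publishing a temporary file under a real name, as a sketch (publish_tmpfile() and the paths are illustrative; /proc must be mounted):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* create an anonymous file in dirpath, then link it in as name */
static int publish_tmpfile(const char *dirpath, const char *name)
{
	char proc[64];
	int fd = open(dirpath, O_TMPFILE | O_RDWR, 0600);
	if (fd < 0)
		return -1;
	/* the kernel marked this inode I_LINKABLE, so linkat() may
	 * give it a name even though i_nlink is still zero */
	snprintf(proc, sizeof(proc), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, proc, AT_FDCWD, name, AT_SYMLINK_FOLLOW) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}
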
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 6792ce11f2bf..3be047474bfc 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -23,12 +23,12 @@
23 23
24#include "ncp_fs.h" 24#include "ncp_fs.h"
25 25
26static void ncp_read_volume_list(struct file *, void *, filldir_t, 26static void ncp_read_volume_list(struct file *, struct dir_context *,
27 struct ncp_cache_control *); 27 struct ncp_cache_control *);
28static void ncp_do_readdir(struct file *, void *, filldir_t, 28static void ncp_do_readdir(struct file *, struct dir_context *,
29 struct ncp_cache_control *); 29 struct ncp_cache_control *);
30 30
31static int ncp_readdir(struct file *, void *, filldir_t); 31static int ncp_readdir(struct file *, struct dir_context *);
32 32
33static int ncp_create(struct inode *, struct dentry *, umode_t, bool); 33static int ncp_create(struct inode *, struct dentry *, umode_t, bool);
34static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int); 34static struct dentry *ncp_lookup(struct inode *, struct dentry *, unsigned int);
@@ -49,7 +49,7 @@ const struct file_operations ncp_dir_operations =
49{ 49{
50 .llseek = generic_file_llseek, 50 .llseek = generic_file_llseek,
51 .read = generic_read_dir, 51 .read = generic_read_dir,
52 .readdir = ncp_readdir, 52 .iterate = ncp_readdir,
53 .unlocked_ioctl = ncp_ioctl, 53 .unlocked_ioctl = ncp_ioctl,
54#ifdef CONFIG_COMPAT 54#ifdef CONFIG_COMPAT
55 .compat_ioctl = ncp_compat_ioctl, 55 .compat_ioctl = ncp_compat_ioctl,
@@ -73,10 +73,8 @@ const struct inode_operations ncp_dir_inode_operations =
73 * Dentry operations routines 73 * Dentry operations routines
74 */ 74 */
75static int ncp_lookup_validate(struct dentry *, unsigned int); 75static int ncp_lookup_validate(struct dentry *, unsigned int);
76static int ncp_hash_dentry(const struct dentry *, const struct inode *, 76static int ncp_hash_dentry(const struct dentry *, struct qstr *);
77 struct qstr *); 77static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
78static int ncp_compare_dentry(const struct dentry *, const struct inode *,
79 const struct dentry *, const struct inode *,
80 unsigned int, const char *, const struct qstr *); 78 unsigned int, const char *, const struct qstr *);
81static int ncp_delete_dentry(const struct dentry *); 79static int ncp_delete_dentry(const struct dentry *);
82 80
@@ -119,11 +117,19 @@ static inline int ncp_case_sensitive(const struct inode *i)
119/* 117/*
120 * Note: leave the hash unchanged if the directory 118 * Note: leave the hash unchanged if the directory
121 * is case-sensitive. 119 * is case-sensitive.
120 *
121 * Accessing the parent inode can be racy under RCU pathwalking.
122 * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
123 * the callers will handle races.
122 */ 124 */
123static int 125static int
124ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode, 126ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
125 struct qstr *this)
126{ 127{
128 struct inode *inode = ACCESS_ONCE(dentry->d_inode);
129
130 if (!inode)
131 return 0;
132
127 if (!ncp_case_sensitive(inode)) { 133 if (!ncp_case_sensitive(inode)) {
128 struct super_block *sb = dentry->d_sb; 134 struct super_block *sb = dentry->d_sb;
129 struct nls_table *t; 135 struct nls_table *t;
@@ -140,14 +146,24 @@ ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
140 return 0; 146 return 0;
141} 147}
142 148
149/*
150 * Accessing the parent inode can be racy under RCU pathwalking.
151 * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
152 * the callers will handle races.
153 */
143static int 154static int
144ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode, 155ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
145 const struct dentry *dentry, const struct inode *inode,
146 unsigned int len, const char *str, const struct qstr *name) 156 unsigned int len, const char *str, const struct qstr *name)
147{ 157{
158 struct inode *pinode;
159
148 if (len != name->len) 160 if (len != name->len)
149 return 1; 161 return 1;
150 162
163 pinode = ACCESS_ONCE(parent->d_inode);
164 if (!pinode)
165 return 1;
166
151 if (ncp_case_sensitive(pinode)) 167 if (ncp_case_sensitive(pinode))
152 return strncmp(str, name->name, len); 168 return strncmp(str, name->name, len);
153 169
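
With the signature change, d_hash() and d_compare() no longer receive the inode from the caller, and under RCU-walk dentry->d_inode can change or go NULL concurrently; hence the ACCESS_ONCE() snapshots above. The defensive shape of the new callbacks, as a sketch (examplefs is hypothetical):

static int examplefs_hash(const struct dentry *dentry, struct qstr *q)
{
	/* snapshot d_inode exactly once; RCU-walk races with unlink */
	struct inode *inode = ACCESS_ONCE(dentry->d_inode);

	if (!inode)
		return 0;	/* leave the hash alone; callers handle races */
	/* ... case-fold q->name according to per-inode settings ... */
	return 0;
}
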
@@ -424,9 +440,9 @@ static time_t ncp_obtain_mtime(struct dentry *dentry)
424 return ncp_date_dos2unix(i.modifyTime, i.modifyDate); 440 return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
425} 441}
426 442
427static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir) 443static int ncp_readdir(struct file *file, struct dir_context *ctx)
428{ 444{
429 struct dentry *dentry = filp->f_path.dentry; 445 struct dentry *dentry = file->f_path.dentry;
430 struct inode *inode = dentry->d_inode; 446 struct inode *inode = dentry->d_inode;
431 struct page *page = NULL; 447 struct page *page = NULL;
432 struct ncp_server *server = NCP_SERVER(inode); 448 struct ncp_server *server = NCP_SERVER(inode);
@@ -440,7 +456,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
440 456
441 DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n", 457 DDPRINTK("ncp_readdir: reading %s/%s, pos=%d\n",
442 dentry->d_parent->d_name.name, dentry->d_name.name, 458 dentry->d_parent->d_name.name, dentry->d_name.name,
443 (int) filp->f_pos); 459 (int) ctx->pos);
444 460
445 result = -EIO; 461 result = -EIO;
446 /* Do not generate '.' and '..' when server is dead. */ 462 /* Do not generate '.' and '..' when server is dead. */
@@ -448,16 +464,8 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
448 goto out; 464 goto out;
449 465
450 result = 0; 466 result = 0;
451 if (filp->f_pos == 0) { 467 if (!dir_emit_dots(file, ctx))
452 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR)) 468 goto out;
453 goto out;
454 filp->f_pos = 1;
455 }
456 if (filp->f_pos == 1) {
457 if (filldir(dirent, "..", 2, 1, parent_ino(dentry), DT_DIR))
458 goto out;
459 filp->f_pos = 2;
460 }
461 469
462 page = grab_cache_page(&inode->i_data, 0); 470 page = grab_cache_page(&inode->i_data, 0);
463 if (!page) 471 if (!page)
@@ -469,7 +477,7 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
469 if (!PageUptodate(page) || !ctl.head.eof) 477 if (!PageUptodate(page) || !ctl.head.eof)
470 goto init_cache; 478 goto init_cache;
471 479
472 if (filp->f_pos == 2) { 480 if (ctx->pos == 2) {
473 if (jiffies - ctl.head.time >= NCP_MAX_AGE(server)) 481 if (jiffies - ctl.head.time >= NCP_MAX_AGE(server))
474 goto init_cache; 482 goto init_cache;
475 483
@@ -479,10 +487,10 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
479 goto init_cache; 487 goto init_cache;
480 } 488 }
481 489
482 if (filp->f_pos > ctl.head.end) 490 if (ctx->pos > ctl.head.end)
483 goto finished; 491 goto finished;
484 492
485 ctl.fpos = filp->f_pos + (NCP_DIRCACHE_START - 2); 493 ctl.fpos = ctx->pos + (NCP_DIRCACHE_START - 2);
486 ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE; 494 ctl.ofs = ctl.fpos / NCP_DIRCACHE_SIZE;
487 ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE; 495 ctl.idx = ctl.fpos % NCP_DIRCACHE_SIZE;
488 496
@@ -497,21 +505,21 @@ static int ncp_readdir(struct file *filp, void *dirent, filldir_t filldir)
497 } 505 }
498 while (ctl.idx < NCP_DIRCACHE_SIZE) { 506 while (ctl.idx < NCP_DIRCACHE_SIZE) {
499 struct dentry *dent; 507 struct dentry *dent;
500 int res; 508 bool over;
501 509
502 dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], 510 dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx],
503 dentry, filp->f_pos); 511 dentry, ctx->pos);
504 if (!dent) 512 if (!dent)
505 goto invalid_cache; 513 goto invalid_cache;
506 res = filldir(dirent, dent->d_name.name, 514 over = !dir_emit(ctx, dent->d_name.name,
507 dent->d_name.len, filp->f_pos, 515 dent->d_name.len,
508 dent->d_inode->i_ino, DT_UNKNOWN); 516 dent->d_inode->i_ino, DT_UNKNOWN);
509 dput(dent); 517 dput(dent);
510 if (res) 518 if (over)
511 goto finished; 519 goto finished;
512 filp->f_pos += 1; 520 ctx->pos += 1;
513 ctl.idx += 1; 521 ctl.idx += 1;
514 if (filp->f_pos > ctl.head.end) 522 if (ctx->pos > ctl.head.end)
515 goto finished; 523 goto finished;
516 } 524 }
517 if (ctl.page) { 525 if (ctl.page) {
@@ -548,9 +556,9 @@ init_cache:
548 ctl.valid = 1; 556 ctl.valid = 1;
549read_really: 557read_really:
550 if (ncp_is_server_root(inode)) { 558 if (ncp_is_server_root(inode)) {
551 ncp_read_volume_list(filp, dirent, filldir, &ctl); 559 ncp_read_volume_list(file, ctx, &ctl);
552 } else { 560 } else {
553 ncp_do_readdir(filp, dirent, filldir, &ctl); 561 ncp_do_readdir(file, ctx, &ctl);
554 } 562 }
555 ctl.head.end = ctl.fpos - 1; 563 ctl.head.end = ctl.fpos - 1;
556 ctl.head.eof = ctl.valid; 564 ctl.head.eof = ctl.valid;
@@ -573,11 +581,11 @@ out:
573} 581}
574 582
575static int 583static int
576ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 584ncp_fill_cache(struct file *file, struct dir_context *ctx,
577 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, 585 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
578 int inval_childs) 586 int inval_childs)
579{ 587{
580 struct dentry *newdent, *dentry = filp->f_path.dentry; 588 struct dentry *newdent, *dentry = file->f_path.dentry;
581 struct inode *dir = dentry->d_inode; 589 struct inode *dir = dentry->d_inode;
582 struct ncp_cache_control ctl = *ctrl; 590 struct ncp_cache_control ctl = *ctrl;
583 struct qstr qname; 591 struct qstr qname;
@@ -666,15 +674,13 @@ ncp_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
666end_advance: 674end_advance:
667 if (!valid) 675 if (!valid)
668 ctl.valid = 0; 676 ctl.valid = 0;
669 if (!ctl.filled && (ctl.fpos == filp->f_pos)) { 677 if (!ctl.filled && (ctl.fpos == ctx->pos)) {
670 if (!ino)
671 ino = find_inode_number(dentry, &qname);
672 if (!ino) 678 if (!ino)
673 ino = iunique(dir->i_sb, 2); 679 ino = iunique(dir->i_sb, 2);
674 ctl.filled = filldir(dirent, qname.name, qname.len, 680 ctl.filled = !dir_emit(ctx, qname.name, qname.len,
675 filp->f_pos, ino, DT_UNKNOWN); 681 ino, DT_UNKNOWN);
676 if (!ctl.filled) 682 if (!ctl.filled)
677 filp->f_pos += 1; 683 ctx->pos += 1;
678 } 684 }
679 ctl.fpos += 1; 685 ctl.fpos += 1;
680 ctl.idx += 1; 686 ctl.idx += 1;
@@ -683,10 +689,10 @@ end_advance:
683} 689}
684 690
685static void 691static void
686ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir, 692ncp_read_volume_list(struct file *file, struct dir_context *ctx,
687 struct ncp_cache_control *ctl) 693 struct ncp_cache_control *ctl)
688{ 694{
689 struct dentry *dentry = filp->f_path.dentry; 695 struct dentry *dentry = file->f_path.dentry;
690 struct inode *inode = dentry->d_inode; 696 struct inode *inode = dentry->d_inode;
691 struct ncp_server *server = NCP_SERVER(inode); 697 struct ncp_server *server = NCP_SERVER(inode);
692 struct ncp_volume_info info; 698 struct ncp_volume_info info;
@@ -694,7 +700,7 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
694 int i; 700 int i;
695 701
696 DPRINTK("ncp_read_volume_list: pos=%ld\n", 702 DPRINTK("ncp_read_volume_list: pos=%ld\n",
697 (unsigned long) filp->f_pos); 703 (unsigned long) ctx->pos);
698 704
699 for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) { 705 for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
700 int inval_dentry; 706 int inval_dentry;
@@ -715,16 +721,16 @@ ncp_read_volume_list(struct file *filp, void *dirent, filldir_t filldir,
715 } 721 }
716 inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL); 722 inval_dentry = ncp_update_known_namespace(server, entry.i.volNumber, NULL);
717 entry.volume = entry.i.volNumber; 723 entry.volume = entry.i.volNumber;
718 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, inval_dentry)) 724 if (!ncp_fill_cache(file, ctx, ctl, &entry, inval_dentry))
719 return; 725 return;
720 } 726 }
721} 727}
722 728
723static void 729static void
724ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, 730ncp_do_readdir(struct file *file, struct dir_context *ctx,
725 struct ncp_cache_control *ctl) 731 struct ncp_cache_control *ctl)
726{ 732{
727 struct dentry *dentry = filp->f_path.dentry; 733 struct dentry *dentry = file->f_path.dentry;
728 struct inode *dir = dentry->d_inode; 734 struct inode *dir = dentry->d_inode;
729 struct ncp_server *server = NCP_SERVER(dir); 735 struct ncp_server *server = NCP_SERVER(dir);
730 struct nw_search_sequence seq; 736 struct nw_search_sequence seq;
@@ -736,7 +742,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
736 742
737 DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n", 743 DPRINTK("ncp_do_readdir: %s/%s, fpos=%ld\n",
738 dentry->d_parent->d_name.name, dentry->d_name.name, 744 dentry->d_parent->d_name.name, dentry->d_name.name,
739 (unsigned long) filp->f_pos); 745 (unsigned long) ctx->pos);
740 PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n", 746 PPRINTK("ncp_do_readdir: init %s, volnum=%d, dirent=%u\n",
741 dentry->d_name.name, NCP_FINFO(dir)->volNumber, 747 dentry->d_name.name, NCP_FINFO(dir)->volNumber,
742 NCP_FINFO(dir)->dirEntNum); 748 NCP_FINFO(dir)->dirEntNum);
@@ -778,7 +784,7 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir,
778 rpl += onerpl; 784 rpl += onerpl;
779 rpls -= onerpl; 785 rpls -= onerpl;
780 entry.volume = entry.i.volNumber; 786 entry.volume = entry.i.volNumber;
781 if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry, 0)) 787 if (!ncp_fill_cache(file, ctx, ctl, &entry, 0))
782 break; 788 break;
783 } 789 }
784 } while (more); 790 } while (more);
@@ -1131,17 +1137,6 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1131 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1137 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1132 new_dentry->d_parent->d_name.name, new_dentry->d_name.name); 1138 new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
1133 1139
1134 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) {
1135 /*
1136 * fail with EBUSY if there are still references to this
1137 * directory.
1138 */
1139 dentry_unhash(new_dentry);
1140 error = -EBUSY;
1141 if (!d_unhashed(new_dentry))
1142 goto out;
1143 }
1144
1145 ncp_age_dentry(server, old_dentry); 1140 ncp_age_dentry(server, old_dentry);
1146 ncp_age_dentry(server, new_dentry); 1141 ncp_age_dentry(server, new_dentry);
1147 1142
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 26910c8154da..4659da67e7f6 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -403,18 +403,24 @@ static int ncp_parse_options(struct ncp_mount_data_kernel *data, char *options)
403 switch (optval) { 403 switch (optval) {
404 case 'u': 404 case 'u':
405 data->uid = make_kuid(current_user_ns(), optint); 405 data->uid = make_kuid(current_user_ns(), optint);
406 if (!uid_valid(data->uid)) 406 if (!uid_valid(data->uid)) {
407 ret = -EINVAL;
407 goto err; 408 goto err;
409 }
408 break; 410 break;
409 case 'g': 411 case 'g':
410 data->gid = make_kgid(current_user_ns(), optint); 412 data->gid = make_kgid(current_user_ns(), optint);
411 if (!gid_valid(data->gid)) 413 if (!gid_valid(data->gid)) {
414 ret = -EINVAL;
412 goto err; 415 goto err;
416 }
413 break; 417 break;
414 case 'o': 418 case 'o':
415 data->mounted_uid = make_kuid(current_user_ns(), optint); 419 data->mounted_uid = make_kuid(current_user_ns(), optint);
416 if (!uid_valid(data->mounted_uid)) 420 if (!uid_valid(data->mounted_uid)) {
421 ret = -EINVAL;
417 goto err; 422 goto err;
423 }
418 break; 424 break;
419 case 'm': 425 case 'm':
420 data->file_mode = optint; 426 data->file_mode = optint;
@@ -891,6 +897,10 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
891 if (!server) /* How could this happen? */ 897 if (!server) /* How could this happen? */
892 goto out; 898 goto out;
893 899
900 result = -EPERM;
901 if (IS_DEADDIR(dentry->d_inode))
902 goto out;
903
894 /* ageing the dentry to force validation */ 904 /* ageing the dentry to force validation */
895 ncp_age_dentry(server, dentry); 905 ncp_age_dentry(server, dentry);
896 906
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index ee24df5af1f9..3c5dd55d284c 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
117 return -EINVAL; 117 return -EINVAL;
118 /* we do not support files bigger than 4GB... for now we 118 /* we do not support files bigger than 4GB... for now we
119 support just 4GB... */ 119 support just 4GB... */
120 if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff 120 if (vma_pages(vma) + vma->vm_pgoff
121 > (1U << (32 - PAGE_SHIFT))) 121 > (1U << (32 - PAGE_SHIFT)))
122 return -EFBIG; 122 return -EFBIG;
123 123
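
The ncp_mmap() hunk is a pure cleanup: vma_pages(vma) is the stock helper for (vma->vm_end - vma->vm_start) >> PAGE_SHIFT, so the guard reads as "pages in the mapping plus the starting page offset must stay within a 32-bit page index". Spelled out as a hypothetical predicate:

static bool ncp_mapping_fits_4gb(struct vm_area_struct *vma)
{
	/* vma_pages() expands to (vm_end - vm_start) >> PAGE_SHIFT */
	return vma_pages(vma) + vma->vm_pgoff <= (1UL << (32 - PAGE_SHIFT));
}
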
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 13ca196385f5..b5e80b0af315 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -104,6 +104,15 @@ config NFS_V4_1
104 104
105 If unsure, say N. 105 If unsure, say N.
106 106
107config NFS_V4_2
108 bool "NFS client support for NFSv4.2"
109 depends on NFS_V4_1
110 help
111 This option enables support for minor version 2 of the NFSv4 protocol
112 in the kernel's NFS client.
113
114 If unsure, say N.
115
107config PNFS_FILE_LAYOUT 116config PNFS_FILE_LAYOUT
108 tristate 117 tristate
109 depends on NFS_V4_1 118 depends on NFS_V4_1
@@ -131,6 +140,11 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
131 If the NFS client is unchanged from the upstream kernel, this 140 If the NFS client is unchanged from the upstream kernel, this
132 option should be set to the default "kernel.org". 141 option should be set to the default "kernel.org".
133 142
143config NFS_V4_SECURITY_LABEL
144 bool
145 depends on NFS_V4_2 && SECURITY
146 default y
147
134config ROOT_NFS 148config ROOT_NFS
135 bool "Root file system on NFS" 149 bool "Root file system on NFS"
136 depends on NFS_FS=y && IP_PNP 150 depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index cce2c057bd2d..e0bb048e9576 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,8 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ 7nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
8 direct.o pagelist.o read.o symlink.o unlink.o \ 8 direct.o pagelist.o read.o symlink.o unlink.o \
9 write.o namespace.o mount_clnt.o \ 9 write.o namespace.o mount_clnt.o
10 dns_resolve.o cache_lib.o
11nfs-$(CONFIG_ROOT_NFS) += nfsroot.o 10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
12nfs-$(CONFIG_SYSCTL) += sysctl.o 11nfs-$(CONFIG_SYSCTL) += sysctl.o
13nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 12nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
@@ -22,7 +21,8 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
22obj-$(CONFIG_NFS_V4) += nfsv4.o 21obj-$(CONFIG_NFS_V4) += nfsv4.o
23nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ 22nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
24 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ 23 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
25 nfs4namespace.o nfs4getroot.o nfs4client.o 24 nfs4namespace.o nfs4getroot.o nfs4client.o dns_resolve.o
25nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
27nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o 27nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o
28 28
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 434b93ec0970..e242bbf72972 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1089,9 +1089,10 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
1089 dev->pgbase = 0; 1089 dev->pgbase = 0;
1090 dev->pglen = PAGE_SIZE * max_pages; 1090 dev->pglen = PAGE_SIZE * max_pages;
1091 dev->mincount = 0; 1091 dev->mincount = 0;
1092 dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
1092 1093
1093 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); 1094 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
1094 rc = nfs4_proc_getdeviceinfo(server, dev); 1095 rc = nfs4_proc_getdeviceinfo(server, dev, NULL);
1095 dprintk("%s getdevice info returns %d\n", __func__, rc); 1096 dprintk("%s getdevice info returns %d\n", __func__, rc);
1096 if (rc) { 1097 if (rc) {
1097 rv = ERR_PTR(rc); 1098 rv = ERR_PTR(rc);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index cff089a412c7..67cd73213168 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -211,7 +211,6 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
211 struct svc_rqst *rqstp; 211 struct svc_rqst *rqstp;
212 int (*callback_svc)(void *vrqstp); 212 int (*callback_svc)(void *vrqstp);
213 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; 213 struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
214 char svc_name[12];
215 int ret; 214 int ret;
216 215
217 nfs_callback_bc_serv(minorversion, xprt, serv); 216 nfs_callback_bc_serv(minorversion, xprt, serv);
@@ -235,10 +234,10 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
235 234
236 svc_sock_update_bufs(serv); 235 svc_sock_update_bufs(serv);
237 236
238 sprintf(svc_name, "nfsv4.%u-svc", minorversion);
239 cb_info->serv = serv; 237 cb_info->serv = serv;
240 cb_info->rqst = rqstp; 238 cb_info->rqst = rqstp;
241 cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name); 239 cb_info->task = kthread_run(callback_svc, cb_info->rqst,
240 "nfsv4.%u-svc", minorversion);
242 if (IS_ERR(cb_info->task)) { 241 if (IS_ERR(cb_info->task)) {
243 ret = PTR_ERR(cb_info->task); 242 ret = PTR_ERR(cb_info->task);
244 svc_exit_thread(cb_info->rqst); 243 svc_exit_thread(cb_info->rqst);
@@ -282,6 +281,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
282 ret = nfs4_callback_up_net(serv, net); 281 ret = nfs4_callback_up_net(serv, net);
283 break; 282 break;
284 case 1: 283 case 1:
284 case 2:
285 ret = nfs41_callback_up_net(serv, net); 285 ret = nfs41_callback_up_net(serv, net);
286 break; 286 break;
287 default: 287 default:
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index efd54f0a4c46..84326e9fb47a 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -32,6 +32,8 @@ enum nfs4_callback_opnum {
32 OP_CB_WANTS_CANCELLED = 12, 32 OP_CB_WANTS_CANCELLED = 12,
33 OP_CB_NOTIFY_LOCK = 13, 33 OP_CB_NOTIFY_LOCK = 13,
34 OP_CB_NOTIFY_DEVICEID = 14, 34 OP_CB_NOTIFY_DEVICEID = 14,
35/* Callback operations new to NFSv4.2 */
36 OP_CB_OFFLOAD = 15,
35 OP_CB_ILLEGAL = 10044, 37 OP_CB_ILLEGAL = 10044,
36}; 38};
37 39
@@ -39,6 +41,7 @@ struct cb_process_state {
39 __be32 drc_status; 41 __be32 drc_status;
40 struct nfs_client *clp; 42 struct nfs_client *clp;
41 u32 slotid; 43 u32 slotid;
44 u32 minorversion;
42 struct net *net; 45 struct net *net;
43}; 46};
44 47
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 0bc27684ebfa..e6ebc4c38c81 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -406,7 +406,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
406 int i; 406 int i;
407 __be32 status = htonl(NFS4ERR_BADSESSION); 407 __be32 status = htonl(NFS4ERR_BADSESSION);
408 408
409 clp = nfs4_find_client_sessionid(cps->net, args->csa_addr, &args->csa_sessionid); 409 clp = nfs4_find_client_sessionid(cps->net, args->csa_addr,
410 &args->csa_sessionid, cps->minorversion);
410 if (clp == NULL) 411 if (clp == NULL)
411 goto out; 412 goto out;
412 413
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a35582c9d444..f4ccfe6521ec 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -166,9 +166,9 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
166 if (unlikely(p == NULL)) 166 if (unlikely(p == NULL))
167 return htonl(NFS4ERR_RESOURCE); 167 return htonl(NFS4ERR_RESOURCE);
168 hdr->minorversion = ntohl(*p++); 168 hdr->minorversion = ntohl(*p++);
169 /* Check minor version is zero or one. */ 169 /* Check for minor version support */
170 if (hdr->minorversion <= 1) { 170 if (hdr->minorversion <= NFS4_MAX_MINOR_VERSION) {
171 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 */ 171 hdr->cb_ident = ntohl(*p++); /* ignored by v4.1 and v4.2 */
172 } else { 172 } else {
173 pr_warn_ratelimited("NFS: %s: NFSv4 server callback with " 173 pr_warn_ratelimited("NFS: %s: NFSv4 server callback with "
174 "illegal minor version %u!\n", 174 "illegal minor version %u!\n",
@@ -786,6 +786,26 @@ static void nfs4_cb_free_slot(struct cb_process_state *cps)
786} 786}
787#endif /* CONFIG_NFS_V4_1 */ 787#endif /* CONFIG_NFS_V4_1 */
788 788
789#ifdef CONFIG_NFS_V4_2
790static __be32
791preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
792{
793 __be32 status = preprocess_nfs41_op(nop, op_nr, op);
794 if (status != htonl(NFS4ERR_OP_ILLEGAL))
795 return status;
796
797 if (op_nr == OP_CB_OFFLOAD)
798 return htonl(NFS4ERR_NOTSUPP);
799 return htonl(NFS4ERR_OP_ILLEGAL);
800}
801#else /* CONFIG_NFS_V4_2 */
802static __be32
803preprocess_nfs42_op(int nop, unsigned int op_nr, struct callback_op **op)
804{
805 return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
806}
807#endif /* CONFIG_NFS_V4_2 */
808
789static __be32 809static __be32
790preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) 810preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
791{ 811{
@@ -801,8 +821,7 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op)
801 return htonl(NFS_OK); 821 return htonl(NFS_OK);
802} 822}
803 823
804static __be32 process_op(uint32_t minorversion, int nop, 824static __be32 process_op(int nop, struct svc_rqst *rqstp,
805 struct svc_rqst *rqstp,
806 struct xdr_stream *xdr_in, void *argp, 825 struct xdr_stream *xdr_in, void *argp,
807 struct xdr_stream *xdr_out, void *resp, 826 struct xdr_stream *xdr_out, void *resp,
808 struct cb_process_state *cps) 827 struct cb_process_state *cps)
@@ -819,10 +838,22 @@ static __be32 process_op(uint32_t minorversion, int nop,
819 return status; 838 return status;
820 839
821 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", 840 dprintk("%s: minorversion=%d nop=%d op_nr=%u\n",
822 __func__, minorversion, nop, op_nr); 841 __func__, cps->minorversion, nop, op_nr);
842
843 switch (cps->minorversion) {
844 case 0:
845 status = preprocess_nfs4_op(op_nr, &op);
846 break;
847 case 1:
848 status = preprocess_nfs41_op(nop, op_nr, &op);
849 break;
850 case 2:
851 status = preprocess_nfs42_op(nop, op_nr, &op);
852 break;
853 default:
854 status = htonl(NFS4ERR_MINOR_VERS_MISMATCH);
855 }
823 856
824 status = minorversion ? preprocess_nfs41_op(nop, op_nr, &op) :
825 preprocess_nfs4_op(op_nr, &op);
826 if (status == htonl(NFS4ERR_OP_ILLEGAL)) 857 if (status == htonl(NFS4ERR_OP_ILLEGAL))
827 op_nr = OP_CB_ILLEGAL; 858 op_nr = OP_CB_ILLEGAL;
828 if (status) 859 if (status)
@@ -885,14 +916,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
885 return rpc_drop_reply; 916 return rpc_drop_reply;
886 } 917 }
887 918
919 cps.minorversion = hdr_arg.minorversion;
888 hdr_res.taglen = hdr_arg.taglen; 920 hdr_res.taglen = hdr_arg.taglen;
889 hdr_res.tag = hdr_arg.tag; 921 hdr_res.tag = hdr_arg.tag;
890 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) 922 if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
891 return rpc_system_err; 923 return rpc_system_err;
892 924
893 while (status == 0 && nops != hdr_arg.nops) { 925 while (status == 0 && nops != hdr_arg.nops) {
894 status = process_op(hdr_arg.minorversion, nops, rqstp, 926 status = process_op(nops, rqstp, &xdr_in,
895 &xdr_in, argp, &xdr_out, resp, &cps); 927 argp, &xdr_out, resp, &cps);
896 nops++; 928 nops++;
897 } 929 }
898 930
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index c513b0cc835f..340b1eff0267 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -753,8 +753,6 @@ static int nfs_init_server(struct nfs_server *server,
753 data->timeo, data->retrans); 753 data->timeo, data->retrans);
754 if (data->flags & NFS_MOUNT_NORESVPORT) 754 if (data->flags & NFS_MOUNT_NORESVPORT)
755 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 755 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
756 if (server->options & NFS_OPTION_MIGRATION)
757 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
758 756
759 /* Allocate or find a client reference we can use */ 757 /* Allocate or find a client reference we can use */
760 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); 758 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
@@ -1076,7 +1074,7 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
1076 } 1074 }
1077 1075
1078 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1076 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1079 error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr); 1077 error = nfs_mod->rpc_ops->getattr(server, mount_info->mntfh, fattr, NULL);
1080 if (error < 0) { 1078 if (error < 0) {
1081 dprintk("nfs_create_server: getattr error = %d\n", -error); 1079 dprintk("nfs_create_server: getattr error = %d\n", -error);
1082 goto error; 1080 goto error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 57db3244f4d9..7ec4814e298d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -73,20 +73,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
73 if (inode->i_flock == NULL) 73 if (inode->i_flock == NULL)
74 goto out; 74 goto out;
75 75
76 /* Protect inode->i_flock using the file locks lock */ 76 /* Protect inode->i_flock using the i_lock */
77 lock_flocks(); 77 spin_lock(&inode->i_lock);
78 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 78 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
79 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 79 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
80 continue; 80 continue;
81 if (nfs_file_open_context(fl->fl_file) != ctx) 81 if (nfs_file_open_context(fl->fl_file) != ctx)
82 continue; 82 continue;
83 unlock_flocks(); 83 spin_unlock(&inode->i_lock);
84 status = nfs4_lock_delegation_recall(fl, state, stateid); 84 status = nfs4_lock_delegation_recall(fl, state, stateid);
85 if (status < 0) 85 if (status < 0)
86 goto out; 86 goto out;
87 lock_flocks(); 87 spin_lock(&inode->i_lock);
88 } 88 }
89 unlock_flocks(); 89 spin_unlock(&inode->i_lock);
90out: 90out:
91 return status; 91 return status;
92} 92}
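
The delegation.c hunk is the same lock_flocks() to i_lock conversion as in fs/locks.c, with one wrinkle: nfs4_lock_delegation_recall() issues RPCs and can sleep, so the spinlock is dropped around the call and retaken before the walk continues. The shape of that pattern, as a sketch (do_blocking_recall() is hypothetical):

static int do_blocking_recall(struct file_lock *fl);	/* may sleep */

static int recall_all_locks(struct inode *inode)
{
	struct file_lock *fl;
	int status = 0;

	spin_lock(&inode->i_lock);
	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
		spin_unlock(&inode->i_lock);	/* cannot sleep under a spinlock */
		status = do_blocking_recall(fl);
		if (status < 0)
			return status;
		spin_lock(&inode->i_lock);	/* retake before reading fl_next */
	}
	spin_unlock(&inode->i_lock);
	return status;
}
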
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e093e73178b7..e474ca2b2bfe 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,6 +33,7 @@
33#include <linux/pagevec.h> 33#include <linux/pagevec.h>
34#include <linux/namei.h> 34#include <linux/namei.h>
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/swap.h>
36#include <linux/sched.h> 37#include <linux/sched.h>
37#include <linux/kmemleak.h> 38#include <linux/kmemleak.h>
38#include <linux/xattr.h> 39#include <linux/xattr.h>
@@ -46,7 +47,7 @@
46 47
47static int nfs_opendir(struct inode *, struct file *); 48static int nfs_opendir(struct inode *, struct file *);
48static int nfs_closedir(struct inode *, struct file *); 49static int nfs_closedir(struct inode *, struct file *);
49static int nfs_readdir(struct file *, void *, filldir_t); 50static int nfs_readdir(struct file *, struct dir_context *);
50static int nfs_fsync_dir(struct file *, loff_t, loff_t, int); 51static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
51static loff_t nfs_llseek_dir(struct file *, loff_t, int); 52static loff_t nfs_llseek_dir(struct file *, loff_t, int);
52static void nfs_readdir_clear_array(struct page*); 53static void nfs_readdir_clear_array(struct page*);
@@ -54,7 +55,7 @@ static void nfs_readdir_clear_array(struct page*);
54const struct file_operations nfs_dir_operations = { 55const struct file_operations nfs_dir_operations = {
55 .llseek = nfs_llseek_dir, 56 .llseek = nfs_llseek_dir,
56 .read = generic_read_dir, 57 .read = generic_read_dir,
57 .readdir = nfs_readdir, 58 .iterate = nfs_readdir,
58 .open = nfs_opendir, 59 .open = nfs_opendir,
59 .release = nfs_closedir, 60 .release = nfs_closedir,
60 .fsync = nfs_fsync_dir, 61 .fsync = nfs_fsync_dir,
@@ -147,6 +148,7 @@ typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, int);
147typedef struct { 148typedef struct {
148 struct file *file; 149 struct file *file;
149 struct page *page; 150 struct page *page;
151 struct dir_context *ctx;
150 unsigned long page_index; 152 unsigned long page_index;
151 u64 *dir_cookie; 153 u64 *dir_cookie;
152 u64 last_cookie; 154 u64 last_cookie;
@@ -252,7 +254,7 @@ out:
252static 254static
253int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) 255int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
254{ 256{
255 loff_t diff = desc->file->f_pos - desc->current_index; 257 loff_t diff = desc->ctx->pos - desc->current_index;
256 unsigned int index; 258 unsigned int index;
257 259
258 if (diff < 0) 260 if (diff < 0)
@@ -289,7 +291,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
289 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) { 291 || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
290 ctx->duped = 0; 292 ctx->duped = 0;
291 ctx->attr_gencount = nfsi->attr_gencount; 293 ctx->attr_gencount = nfsi->attr_gencount;
292 } else if (new_pos < desc->file->f_pos) { 294 } else if (new_pos < desc->ctx->pos) {
293 if (ctx->duped > 0 295 if (ctx->duped > 0
294 && ctx->dup_cookie == *desc->dir_cookie) { 296 && ctx->dup_cookie == *desc->dir_cookie) {
295 if (printk_ratelimit()) { 297 if (printk_ratelimit()) {
@@ -307,7 +309,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
307 ctx->dup_cookie = *desc->dir_cookie; 309 ctx->dup_cookie = *desc->dir_cookie;
308 ctx->duped = -1; 310 ctx->duped = -1;
309 } 311 }
310 desc->file->f_pos = new_pos; 312 desc->ctx->pos = new_pos;
311 desc->cache_entry_index = i; 313 desc->cache_entry_index = i;
312 return 0; 314 return 0;
313 } 315 }
@@ -405,13 +407,13 @@ different:
405} 407}
406 408
407static 409static
408bool nfs_use_readdirplus(struct inode *dir, struct file *filp) 410bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
409{ 411{
410 if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS)) 412 if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
411 return false; 413 return false;
412 if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags)) 414 if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
413 return true; 415 return true;
414 if (filp->f_pos == 0) 416 if (ctx->pos == 0)
415 return true; 417 return true;
416 return false; 418 return false;
417} 419}
@@ -435,6 +437,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
435 struct dentry *alias; 437 struct dentry *alias;
436 struct inode *dir = parent->d_inode; 438 struct inode *dir = parent->d_inode;
437 struct inode *inode; 439 struct inode *inode;
440 int status;
438 441
439 if (filename.name[0] == '.') { 442 if (filename.name[0] == '.') {
440 if (filename.len == 1) 443 if (filename.len == 1)
@@ -447,7 +450,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
447 dentry = d_lookup(parent, &filename); 450 dentry = d_lookup(parent, &filename);
448 if (dentry != NULL) { 451 if (dentry != NULL) {
449 if (nfs_same_file(dentry, entry)) { 452 if (nfs_same_file(dentry, entry)) {
450 nfs_refresh_inode(dentry->d_inode, entry->fattr); 453 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
454 status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
455 if (!status)
456 nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
451 goto out; 457 goto out;
452 } else { 458 } else {
453 if (d_invalidate(dentry) != 0) 459 if (d_invalidate(dentry) != 0)
@@ -460,7 +466,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
460 if (dentry == NULL) 466 if (dentry == NULL)
461 return; 467 return;
462 468
463 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr); 469 inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
464 if (IS_ERR(inode)) 470 if (IS_ERR(inode))
465 goto out; 471 goto out;
466 472
@@ -585,10 +591,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
585 if (entry.fh == NULL || entry.fattr == NULL) 591 if (entry.fh == NULL || entry.fattr == NULL)
586 goto out; 592 goto out;
587 593
594 entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
595 if (IS_ERR(entry.label)) {
596 status = PTR_ERR(entry.label);
597 goto out;
598 }
599
588 array = nfs_readdir_get_array(page); 600 array = nfs_readdir_get_array(page);
589 if (IS_ERR(array)) { 601 if (IS_ERR(array)) {
590 status = PTR_ERR(array); 602 status = PTR_ERR(array);
591 goto out; 603 goto out_label_free;
592 } 604 }
593 memset(array, 0, sizeof(struct nfs_cache_array)); 605 memset(array, 0, sizeof(struct nfs_cache_array));
594 array->eof_index = -1; 606 array->eof_index = -1;
@@ -614,6 +626,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
614 nfs_readdir_free_large_page(pages_ptr, pages, array_size); 626 nfs_readdir_free_large_page(pages_ptr, pages, array_size);
615out_release_array: 627out_release_array:
616 nfs_readdir_release_array(page); 628 nfs_readdir_release_array(page);
629out_label_free:
630 nfs4_label_free(entry.label);
617out: 631out:
618 nfs_free_fattr(entry.fattr); 632 nfs_free_fattr(entry.fattr);
619 nfs_free_fhandle(entry.fh); 633 nfs_free_fhandle(entry.fh);
@@ -702,8 +716,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
702 * Once we've found the start of the dirent within a page: fill 'er up... 716 * Once we've found the start of the dirent within a page: fill 'er up...
703 */ 717 */
704static 718static
705int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, 719int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
706 filldir_t filldir)
707{ 720{
708 struct file *file = desc->file; 721 struct file *file = desc->file;
709 int i = 0; 722 int i = 0;
@@ -721,13 +734,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
721 struct nfs_cache_array_entry *ent; 734 struct nfs_cache_array_entry *ent;
722 735
723 ent = &array->array[i]; 736 ent = &array->array[i];
724 if (filldir(dirent, ent->string.name, ent->string.len, 737 if (!dir_emit(desc->ctx, ent->string.name, ent->string.len,
725 file->f_pos, nfs_compat_user_ino64(ent->ino), 738 nfs_compat_user_ino64(ent->ino), ent->d_type)) {
726 ent->d_type) < 0) {
727 desc->eof = 1; 739 desc->eof = 1;
728 break; 740 break;
729 } 741 }
730 file->f_pos++; 742 desc->ctx->pos++;
731 if (i < (array->size-1)) 743 if (i < (array->size-1))
732 *desc->dir_cookie = array->array[i+1].cookie; 744 *desc->dir_cookie = array->array[i+1].cookie;
733 else 745 else
@@ -759,8 +771,7 @@ out:
759 * directory in the page cache by the time we get here. 771 * directory in the page cache by the time we get here.
760 */ 772 */
761static inline 773static inline
762int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, 774int uncached_readdir(nfs_readdir_descriptor_t *desc)
763 filldir_t filldir)
764{ 775{
765 struct page *page = NULL; 776 struct page *page = NULL;
766 int status; 777 int status;
@@ -785,7 +796,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
785 if (status < 0) 796 if (status < 0)
786 goto out_release; 797 goto out_release;
787 798
788 status = nfs_do_filldir(desc, dirent, filldir); 799 status = nfs_do_filldir(desc);
789 800
790 out: 801 out:
791 dfprintk(DIRCACHE, "NFS: %s: returns %d\n", 802 dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
@@ -800,35 +811,37 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
800 last cookie cache takes care of the common case of reading the 811 last cookie cache takes care of the common case of reading the
801 whole directory. 812 whole directory.
802 */ 813 */
803static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 814static int nfs_readdir(struct file *file, struct dir_context *ctx)
804{ 815{
805 struct dentry *dentry = filp->f_path.dentry; 816 struct dentry *dentry = file->f_path.dentry;
806 struct inode *inode = dentry->d_inode; 817 struct inode *inode = dentry->d_inode;
807 nfs_readdir_descriptor_t my_desc, 818 nfs_readdir_descriptor_t my_desc,
808 *desc = &my_desc; 819 *desc = &my_desc;
809 struct nfs_open_dir_context *dir_ctx = filp->private_data; 820 struct nfs_open_dir_context *dir_ctx = file->private_data;
810 int res; 821 int res = 0;
811 822
812 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 823 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
813 dentry->d_parent->d_name.name, dentry->d_name.name, 824 dentry->d_parent->d_name.name, dentry->d_name.name,
814 (long long)filp->f_pos); 825 (long long)ctx->pos);
815 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS); 826 nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
816 827
817 /* 828 /*
818 * filp->f_pos points to the dirent entry number. 829 * ctx->pos points to the dirent entry number.
819 * *desc->dir_cookie has the cookie for the next entry. We have 830 * *desc->dir_cookie has the cookie for the next entry. We have
820 * to either find the entry with the appropriate number or 831 * to either find the entry with the appropriate number or
821 * revalidate the cookie. 832 * revalidate the cookie.
822 */ 833 */
823 memset(desc, 0, sizeof(*desc)); 834 memset(desc, 0, sizeof(*desc));
824 835
825 desc->file = filp; 836 desc->file = file;
837 desc->ctx = ctx;
826 desc->dir_cookie = &dir_ctx->dir_cookie; 838 desc->dir_cookie = &dir_ctx->dir_cookie;
827 desc->decode = NFS_PROTO(inode)->decode_dirent; 839 desc->decode = NFS_PROTO(inode)->decode_dirent;
828 desc->plus = nfs_use_readdirplus(inode, filp) ? 1 : 0; 840 desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
829 841
830 nfs_block_sillyrename(dentry); 842 nfs_block_sillyrename(dentry);
831 res = nfs_revalidate_mapping(inode, filp->f_mapping); 843 if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
844 res = nfs_revalidate_mapping(inode, file->f_mapping);
832 if (res < 0) 845 if (res < 0)
833 goto out; 846 goto out;
834 847
@@ -840,7 +853,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
840 /* This means either end of directory */ 853 /* This means either end of directory */
841 if (*desc->dir_cookie && desc->eof == 0) { 854 if (*desc->dir_cookie && desc->eof == 0) {
842 /* Or that the server has 'lost' a cookie */ 855 /* Or that the server has 'lost' a cookie */
843 res = uncached_readdir(desc, dirent, filldir); 856 res = uncached_readdir(desc);
844 if (res == 0) 857 if (res == 0)
845 continue; 858 continue;
846 } 859 }
@@ -857,7 +870,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
857 if (res < 0) 870 if (res < 0)
858 break; 871 break;
859 872
860 res = nfs_do_filldir(desc, dirent, filldir); 873 res = nfs_do_filldir(desc);
861 if (res < 0) 874 if (res < 0)
862 break; 875 break;
863 } while (!desc->eof); 876 } while (!desc->eof);
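
The hunks above are part of the 3.11 VFS conversion from the old filldir callback to the dir_context/dir_emit() API: the actor and position travel in struct dir_context, and the filesystem advances ctx->pos itself rather than poking file->f_pos. A minimal sketch of the new calling convention follows; "demofs" and its single fixed entry are invented for illustration.

#include <linux/fs.h>

static int demofs_iterate(struct file *file, struct dir_context *ctx)
{
        if (!dir_emit_dots(file, ctx))  /* "." and "..", pos 0 and 1 */
                return 0;
        if (ctx->pos == 2) {
                /* dir_emit() returns false once the caller's buffer fills */
                if (!dir_emit(ctx, "hello", 5, 42, DT_REG))
                        return 0;
                ctx->pos++;     /* the fs advances ctx->pos; f_pos is untouched */
        }
        return 0;
}

static const struct file_operations demofs_dir_ops = {
        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .iterate        = demofs_iterate,
};

This is why nfs_do_filldir() loses its dirent/filldir arguments above and bumps desc->ctx->pos instead of file->f_pos.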
@@ -1040,6 +1053,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1040 struct dentry *parent; 1053 struct dentry *parent;
1041 struct nfs_fh *fhandle = NULL; 1054 struct nfs_fh *fhandle = NULL;
1042 struct nfs_fattr *fattr = NULL; 1055 struct nfs_fattr *fattr = NULL;
1056 struct nfs4_label *label = NULL;
1043 int error; 1057 int error;
1044 1058
1045 if (flags & LOOKUP_RCU) 1059 if (flags & LOOKUP_RCU)
@@ -1082,7 +1096,11 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1082 if (fhandle == NULL || fattr == NULL) 1096 if (fhandle == NULL || fattr == NULL)
1083 goto out_error; 1097 goto out_error;
1084 1098
1085 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1099 label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
1100 if (IS_ERR(label))
1101 goto out_error;
1102
1103 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
1086 if (error) 1104 if (error)
1087 goto out_bad; 1105 goto out_bad;
1088 if (nfs_compare_fh(NFS_FH(inode), fhandle)) 1106 if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1090,8 +1108,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1090 if ((error = nfs_refresh_inode(inode, fattr)) != 0) 1108 if ((error = nfs_refresh_inode(inode, fattr)) != 0)
1091 goto out_bad; 1109 goto out_bad;
1092 1110
1111 nfs_setsecurity(inode, fattr, label);
1112
1093 nfs_free_fattr(fattr); 1113 nfs_free_fattr(fattr);
1094 nfs_free_fhandle(fhandle); 1114 nfs_free_fhandle(fhandle);
1115 nfs4_label_free(label);
1116
1095out_set_verifier: 1117out_set_verifier:
1096 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1118 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1097 out_valid: 1119 out_valid:
@@ -1108,6 +1130,7 @@ out_zap_parent:
1108 out_bad: 1130 out_bad:
1109 nfs_free_fattr(fattr); 1131 nfs_free_fattr(fattr);
1110 nfs_free_fhandle(fhandle); 1132 nfs_free_fhandle(fhandle);
1133 nfs4_label_free(label);
1111 nfs_mark_for_revalidate(dir); 1134 nfs_mark_for_revalidate(dir);
1112 if (inode && S_ISDIR(inode->i_mode)) { 1135 if (inode && S_ISDIR(inode->i_mode)) {
1113 /* Purge readdir caches. */ 1136 /* Purge readdir caches. */
@@ -1128,6 +1151,7 @@ out_zap_parent:
1128out_error: 1151out_error:
1129 nfs_free_fattr(fattr); 1152 nfs_free_fattr(fattr);
1130 nfs_free_fhandle(fhandle); 1153 nfs_free_fhandle(fhandle);
1154 nfs4_label_free(label);
1131 dput(parent); 1155 dput(parent);
1132 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n", 1156 dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) lookup returned error %d\n",
1133 __func__, dentry->d_parent->d_name.name, 1157 __func__, dentry->d_parent->d_name.name,
@@ -1256,6 +1280,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
1256 struct inode *inode = NULL; 1280 struct inode *inode = NULL;
1257 struct nfs_fh *fhandle = NULL; 1281 struct nfs_fh *fhandle = NULL;
1258 struct nfs_fattr *fattr = NULL; 1282 struct nfs_fattr *fattr = NULL;
1283 struct nfs4_label *label = NULL;
1259 int error; 1284 int error;
1260 1285
1261 dfprintk(VFS, "NFS: lookup(%s/%s)\n", 1286 dfprintk(VFS, "NFS: lookup(%s/%s)\n",
@@ -1282,17 +1307,21 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
1282 if (fhandle == NULL || fattr == NULL) 1307 if (fhandle == NULL || fattr == NULL)
1283 goto out; 1308 goto out;
1284 1309
1310 label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
1311 if (IS_ERR(label))
1312 goto out;
1313
1285 parent = dentry->d_parent; 1314 parent = dentry->d_parent;
1286 /* Protect against concurrent sillydeletes */ 1315 /* Protect against concurrent sillydeletes */
1287 nfs_block_sillyrename(parent); 1316 nfs_block_sillyrename(parent);
1288 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1317 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
1289 if (error == -ENOENT) 1318 if (error == -ENOENT)
1290 goto no_entry; 1319 goto no_entry;
1291 if (error < 0) { 1320 if (error < 0) {
1292 res = ERR_PTR(error); 1321 res = ERR_PTR(error);
1293 goto out_unblock_sillyrename; 1322 goto out_unblock_sillyrename;
1294 } 1323 }
1295 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1324 inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
1296 res = ERR_CAST(inode); 1325 res = ERR_CAST(inode);
1297 if (IS_ERR(res)) 1326 if (IS_ERR(res))
1298 goto out_unblock_sillyrename; 1327 goto out_unblock_sillyrename;
@@ -1310,6 +1339,7 @@ no_entry:
1310 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1339 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1311out_unblock_sillyrename: 1340out_unblock_sillyrename:
1312 nfs_unblock_sillyrename(parent); 1341 nfs_unblock_sillyrename(parent);
1342 nfs4_label_free(label);
1313out: 1343out:
1314 nfs_free_fattr(fattr); 1344 nfs_free_fattr(fattr);
1315 nfs_free_fhandle(fhandle); 1345 nfs_free_fhandle(fhandle);
@@ -1357,18 +1387,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
1357{ 1387{
1358 int err; 1388 int err;
1359 1389
1360 if (ctx->dentry != dentry) {
1361 dput(ctx->dentry);
1362 ctx->dentry = dget(dentry);
1363 }
1364
1365 /* If the open_intent is for execute, we have an extra check to make */
1366 if (ctx->mode & FMODE_EXEC) {
1367 err = nfs_may_open(dentry->d_inode, ctx->cred, open_flags);
1368 if (err < 0)
1369 goto out;
1370 }
1371
1372 err = finish_open(file, dentry, do_open, opened); 1390 err = finish_open(file, dentry, do_open, opened);
1373 if (err) 1391 if (err)
1374 goto out; 1392 goto out;
@@ -1427,13 +1445,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1427 1445
1428 nfs_block_sillyrename(dentry->d_parent); 1446 nfs_block_sillyrename(dentry->d_parent);
1429 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr); 1447 inode = NFS_PROTO(dir)->open_context(dir, ctx, open_flags, &attr);
1430 d_drop(dentry); 1448 nfs_unblock_sillyrename(dentry->d_parent);
1431 if (IS_ERR(inode)) { 1449 if (IS_ERR(inode)) {
1432 nfs_unblock_sillyrename(dentry->d_parent);
1433 put_nfs_open_context(ctx); 1450 put_nfs_open_context(ctx);
1434 err = PTR_ERR(inode); 1451 err = PTR_ERR(inode);
1435 switch (err) { 1452 switch (err) {
1436 case -ENOENT: 1453 case -ENOENT:
1454 d_drop(dentry);
1437 d_add(dentry, NULL); 1455 d_add(dentry, NULL);
1438 break; 1456 break;
1439 case -EISDIR: 1457 case -EISDIR:
@@ -1449,16 +1467,8 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
1449 } 1467 }
1450 goto out; 1468 goto out;
1451 } 1469 }
1452 res = d_add_unique(dentry, inode);
1453 if (res != NULL)
1454 dentry = res;
1455
1456 nfs_unblock_sillyrename(dentry->d_parent);
1457 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1458
1459 err = nfs_finish_open(ctx, dentry, file, open_flags, opened);
1460 1470
1461 dput(res); 1471 err = nfs_finish_open(ctx, ctx->dentry, file, open_flags, opened);
1462out: 1472out:
1463 return err; 1473 return err;
1464 1474
@@ -1528,7 +1538,8 @@ no_open:
1528 * Code common to create, mkdir, and mknod. 1538 * Code common to create, mkdir, and mknod.
1529 */ 1539 */
1530int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, 1540int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1531 struct nfs_fattr *fattr) 1541 struct nfs_fattr *fattr,
1542 struct nfs4_label *label)
1532{ 1543{
1533 struct dentry *parent = dget_parent(dentry); 1544 struct dentry *parent = dget_parent(dentry);
1534 struct inode *dir = parent->d_inode; 1545 struct inode *dir = parent->d_inode;
@@ -1541,18 +1552,18 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1541 if (dentry->d_inode) 1552 if (dentry->d_inode)
1542 goto out; 1553 goto out;
1543 if (fhandle->size == 0) { 1554 if (fhandle->size == 0) {
1544 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1555 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL);
1545 if (error) 1556 if (error)
1546 goto out_error; 1557 goto out_error;
1547 } 1558 }
1548 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1559 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1549 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1560 if (!(fattr->valid & NFS_ATTR_FATTR)) {
1550 struct nfs_server *server = NFS_SB(dentry->d_sb); 1561 struct nfs_server *server = NFS_SB(dentry->d_sb);
1551 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); 1562 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr, NULL);
1552 if (error < 0) 1563 if (error < 0)
1553 goto out_error; 1564 goto out_error;
1554 } 1565 }
1555 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1566 inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
1556 error = PTR_ERR(inode); 1567 error = PTR_ERR(inode);
1557 if (IS_ERR(inode)) 1568 if (IS_ERR(inode))
1558 goto out_error; 1569 goto out_error;
@@ -1721,7 +1732,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
1721 dir->i_ino, dentry->d_name.name); 1732 dir->i_ino, dentry->d_name.name);
1722 1733
1723 spin_lock(&dentry->d_lock); 1734 spin_lock(&dentry->d_lock);
1724 if (dentry->d_count > 1) { 1735 if (d_count(dentry) > 1) {
1725 spin_unlock(&dentry->d_lock); 1736 spin_unlock(&dentry->d_lock);
1726 /* Start asynchronous writeout of the inode */ 1737 /* Start asynchronous writeout of the inode */
1727 write_inode_now(dentry->d_inode, 0); 1738 write_inode_now(dentry->d_inode, 0);
@@ -1759,7 +1770,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
1759 */ 1770 */
1760int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) 1771int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1761{ 1772{
1762 struct pagevec lru_pvec;
1763 struct page *page; 1773 struct page *page;
1764 char *kaddr; 1774 char *kaddr;
1765 struct iattr attr; 1775 struct iattr attr;
@@ -1799,11 +1809,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1799 * No big deal if we can't add this page to the page cache here. 1809 * No big deal if we can't add this page to the page cache here.
1800 * READLINK will get the missing page from the server if needed. 1810 * READLINK will get the missing page from the server if needed.
1801 */ 1811 */
1802 pagevec_init(&lru_pvec, 0); 1812 if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0,
1803 if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
1804 GFP_KERNEL)) { 1813 GFP_KERNEL)) {
1805 pagevec_add(&lru_pvec, page);
1806 pagevec_lru_add_file(&lru_pvec);
1807 SetPageUptodate(page); 1814 SetPageUptodate(page);
1808 unlock_page(page); 1815 unlock_page(page);
1809 } else 1816 } else
@@ -1870,7 +1877,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1870 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", 1877 dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n",
1871 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1878 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1872 new_dentry->d_parent->d_name.name, new_dentry->d_name.name, 1879 new_dentry->d_parent->d_name.name, new_dentry->d_name.name,
1873 new_dentry->d_count); 1880 d_count(new_dentry));
1874 1881
1875 /* 1882 /*
1876 * For non-directories, check whether the target is busy and if so, 1883 * For non-directories, check whether the target is busy and if so,
@@ -1888,7 +1895,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1888 rehash = new_dentry; 1895 rehash = new_dentry;
1889 } 1896 }
1890 1897
1891 if (new_dentry->d_count > 2) { 1898 if (d_count(new_dentry) > 2) {
1892 int err; 1899 int err;
1893 1900
1894 /* copy the target dentry's name */ 1901 /* copy the target dentry's name */
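
Throughout the dir.c hunks above, a struct nfs4_label is allocated before each LOOKUP-class call and released on every exit path (out_label_free, out_bad, out_error, out_unblock_sillyrename). A condensed sketch of the lifecycle those paths rely on; nfs4_label_alloc() itself is added in the inode.c hunk further down:

struct nfs4_label *label;
int error;

/*
 * nfs4_label_alloc() returns NULL when the server cannot do labels
 * (minor version < 2 or no NFS_CAP_SECURITY_LABEL), ERR_PTR(-ENOMEM)
 * on allocation failure, and otherwise an NFS4_MAXLABELLEN buffer.
 */
label = nfs4_label_alloc(NFS_SERVER(dir), GFP_NOWAIT);
if (IS_ERR(label))
        return PTR_ERR(label);

error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
/* ... consume fattr and label ... */
nfs4_label_free(label);         /* must also accept the NULL "unsupported" case */

Because NULL doubles as "labels unsupported", callers never branch on it; nfs4_label_free() has to be a no-op for NULL, just as the early error paths above assume.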
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 945527092295..fc0f95ec7358 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -29,7 +29,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
29 kfree(ip_addr); 29 kfree(ip_addr);
30 return ret; 30 return ret;
31} 31}
32EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
33 32
34#else 33#else
35 34
@@ -351,7 +350,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name,
351 ret = -ESRCH; 350 ret = -ESRCH;
352 return ret; 351 return ret;
353} 352}
354EXPORT_SYMBOL_GPL(nfs_dns_resolve_name);
355 353
356static struct cache_detail nfs_dns_resolve_template = { 354static struct cache_detail nfs_dns_resolve_template = {
357 .owner = THIS_MODULE, 355 .owner = THIS_MODULE,
@@ -396,6 +394,21 @@ void nfs_dns_resolver_cache_destroy(struct net *net)
396 cache_destroy_net(nn->nfs_dns_resolve, net); 394 cache_destroy_net(nn->nfs_dns_resolve, net);
397} 395}
398 396
397static int nfs4_dns_net_init(struct net *net)
398{
399 return nfs_dns_resolver_cache_init(net);
400}
401
402static void nfs4_dns_net_exit(struct net *net)
403{
404 nfs_dns_resolver_cache_destroy(net);
405}
406
407static struct pernet_operations nfs4_dns_resolver_ops = {
408 .init = nfs4_dns_net_init,
409 .exit = nfs4_dns_net_exit,
410};
411
399static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, 412static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
400 void *ptr) 413 void *ptr)
401{ 414{
@@ -432,11 +445,24 @@ static struct notifier_block nfs_dns_resolver_block = {
432 445
433int nfs_dns_resolver_init(void) 446int nfs_dns_resolver_init(void)
434{ 447{
435 return rpc_pipefs_notifier_register(&nfs_dns_resolver_block); 448 int err;
449
450 err = register_pernet_subsys(&nfs4_dns_resolver_ops);
451 if (err < 0)
452 goto out;
453 err = rpc_pipefs_notifier_register(&nfs_dns_resolver_block);
454 if (err < 0)
455 goto out1;
456 return 0;
457out1:
458 unregister_pernet_subsys(&nfs4_dns_resolver_ops);
459out:
460 return err;
436} 461}
437 462
438void nfs_dns_resolver_destroy(void) 463void nfs_dns_resolver_destroy(void)
439{ 464{
440 rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block); 465 rpc_pipefs_notifier_unregister(&nfs_dns_resolver_block);
466 unregister_pernet_subsys(&nfs4_dns_resolver_ops);
441} 467}
442#endif 468#endif
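
The resolver cache moves out of the generic NFS pernet hooks (see the inode.c hunk below) and into its own pernet_operations, registered once from nfs_dns_resolver_init(). The idiom, reduced to a neutral sketch with placeholder demo_* names:

#include <linux/module.h>
#include <net/net_namespace.h>

static int demo_net_init(struct net *net)
{
        /* allocate this namespace's private state */
        return 0;
}

static void demo_net_exit(struct net *net)
{
        /* tear the per-namespace state back down */
}

static struct pernet_operations demo_net_ops = {
        .init = demo_net_init,
        .exit = demo_net_exit,
};

register_pernet_subsys() runs .init for every namespace that exists now or is created later; unregister_pernet_subsys() runs .exit for each and removes the hooks, which is exactly the pairing nfs_dns_resolver_init()/_destroy() implement above.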
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a87a44f84113..94e94bd11aae 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -451,11 +451,13 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
451 * - Called if either PG_private or PG_fscache is set on the page 451 * - Called if either PG_private or PG_fscache is set on the page
452 * - Caller holds page lock 452 * - Caller holds page lock
453 */ 453 */
454static void nfs_invalidate_page(struct page *page, unsigned long offset) 454static void nfs_invalidate_page(struct page *page, unsigned int offset,
455 unsigned int length)
455{ 456{
456 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset); 457 dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
458 page, offset, length);
457 459
458 if (offset != 0) 460 if (offset != 0 || length < PAGE_CACHE_SIZE)
459 return; 461 return;
460 /* Cancel any unstarted writes on this page */ 462 /* Cancel any unstarted writes on this page */
461 nfs_wb_page_cancel(page_file_mapping(page)->host, page); 463 nfs_wb_page_cancel(page_file_mapping(page)->host, page);
@@ -493,6 +495,35 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
493 return nfs_fscache_release_page(page, gfp); 495 return nfs_fscache_release_page(page, gfp);
494} 496}
495 497
498static void nfs_check_dirty_writeback(struct page *page,
499 bool *dirty, bool *writeback)
500{
501 struct nfs_inode *nfsi;
502 struct address_space *mapping = page_file_mapping(page);
503
504 if (!mapping || PageSwapCache(page))
505 return;
506
507 /*
508 * Check if an unstable page is currently being committed and
509 * if so, have the VM treat it as if the page is under writeback
510 * so it will not block due to pages that will shortly be freeable.
511 */
512 nfsi = NFS_I(mapping->host);
513 if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
514 *writeback = true;
515 return;
516 }
517
518 /*
519 * If PagePrivate() is set, then the page is not freeable and as the
520 * inode is not being committed, it's not going to be cleaned in the
521 * near future so treat it as dirty
522 */
523 if (PagePrivate(page))
524 *dirty = true;
525}
526
496/* 527/*
497 * Attempt to clear the private state associated with a page when an error 528 * Attempt to clear the private state associated with a page when an error
498 * occurs that requires the cached contents of an inode to be written back or 529 * occurs that requires the cached contents of an inode to be written back or
@@ -540,6 +571,7 @@ const struct address_space_operations nfs_file_aops = {
540 .direct_IO = nfs_direct_IO, 571 .direct_IO = nfs_direct_IO,
541 .migratepage = nfs_migrate_page, 572 .migratepage = nfs_migrate_page,
542 .launder_page = nfs_launder_page, 573 .launder_page = nfs_launder_page,
574 .is_dirty_writeback = nfs_check_dirty_writeback,
543 .error_remove_page = generic_error_remove_page, 575 .error_remove_page = generic_error_remove_page,
544#ifdef CONFIG_NFS_SWAP 576#ifdef CONFIG_NFS_SWAP
545 .swap_activate = nfs_swap_activate, 577 .swap_activate = nfs_swap_activate,
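
Two address_space_operations changes land in this file: ->invalidatepage() gains an (offset, length) pair so partial-page invalidation is distinguishable from whole-page teardown, and the new ->is_dirty_writeback() hook lets reclaim refine the generic PageDirty()/PageWriteback() answer with fs-private state. A sketch of the latter, with demofs_* as a placeholder:

static void demofs_is_dirty_writeback(struct page *page,
                                      bool *dirty, bool *writeback)
{
        /*
         * Leaving the flags untouched accepts the generic answer; set
         * them when private state makes the page effectively dirty or
         * effectively under writeback, as NFS does above for pages
         * whose unstable writes are mid-commit.
         */
        if (PagePrivate(page))
                *dirty = true;
}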
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 44efaa8c5f78..66984a9aafaa 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -95,7 +95,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
95 goto out; 95 goto out;
96 } 96 }
97 97
98 inode = nfs_fhget(sb, mntfh, fsinfo.fattr); 98 inode = nfs_fhget(sb, mntfh, fsinfo.fattr, NULL);
99 if (IS_ERR(inode)) { 99 if (IS_ERR(inode)) {
100 dprintk("nfs_get_root: get root inode failed\n"); 100 dprintk("nfs_get_root: get root inode failed\n");
101 ret = ERR_CAST(inode); 101 ret = ERR_CAST(inode);
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index c516da5873fd..c2c4163d5683 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -262,29 +262,42 @@ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen,
262 return desclen; 262 return desclen;
263} 263}
264 264
265static ssize_t nfs_idmap_request_key(struct key_type *key_type, 265static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
266 const char *name, size_t namelen, 266 const char *type, struct idmap *idmap)
267 const char *type, void *data,
268 size_t data_size, struct idmap *idmap)
269{ 267{
270 const struct cred *saved_cred;
271 struct key *rkey;
272 char *desc; 268 char *desc;
273 struct user_key_payload *payload; 269 struct key *rkey;
274 ssize_t ret; 270 ssize_t ret;
275 271
276 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); 272 ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc);
277 if (ret <= 0) 273 if (ret <= 0)
278 goto out; 274 return ERR_PTR(ret);
275
276 rkey = request_key(&key_type_id_resolver, desc, "");
277 if (IS_ERR(rkey)) {
278 mutex_lock(&idmap->idmap_mutex);
279 rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
280 desc, "", 0, idmap);
281 mutex_unlock(&idmap->idmap_mutex);
282 }
283
284 kfree(desc);
285 return rkey;
286}
287
288static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
289 const char *type, void *data,
290 size_t data_size, struct idmap *idmap)
291{
292 const struct cred *saved_cred;
293 struct key *rkey;
294 struct user_key_payload *payload;
295 ssize_t ret;
279 296
280 saved_cred = override_creds(id_resolver_cache); 297 saved_cred = override_creds(id_resolver_cache);
281 if (idmap) 298 rkey = nfs_idmap_request_key(name, namelen, type, idmap);
282 rkey = request_key_with_auxdata(key_type, desc, "", 0, idmap);
283 else
284 rkey = request_key(&key_type_id_resolver, desc, "");
285 revert_creds(saved_cred); 299 revert_creds(saved_cred);
286 300
287 kfree(desc);
288 if (IS_ERR(rkey)) { 301 if (IS_ERR(rkey)) {
289 ret = PTR_ERR(rkey); 302 ret = PTR_ERR(rkey);
290 goto out; 303 goto out;
@@ -316,23 +329,6 @@ out:
316 return ret; 329 return ret;
317} 330}
318 331
319static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
320 const char *type, void *data,
321 size_t data_size, struct idmap *idmap)
322{
323 ssize_t ret = nfs_idmap_request_key(&key_type_id_resolver,
324 name, namelen, type, data,
325 data_size, NULL);
326 if (ret < 0) {
327 mutex_lock(&idmap->idmap_mutex);
328 ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
329 name, namelen, type, data,
330 data_size, idmap);
331 mutex_unlock(&idmap->idmap_mutex);
332 }
333 return ret;
334}
335
336/* ID -> Name */ 332/* ID -> Name */
337static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, 333static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf,
338 size_t buflen, struct idmap *idmap) 334 size_t buflen, struct idmap *idmap)
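
Condensed from the interleaved hunk above, the refactored lookup path reads:

rkey = request_key(&key_type_id_resolver, desc, "");
if (IS_ERR(rkey)) {
        /* legacy idmapper fallback, serialized by idmap_mutex */
        mutex_lock(&idmap->idmap_mutex);
        rkey = request_key_with_auxdata(&key_type_id_resolver_legacy,
                                        desc, "", 0, idmap);
        mutex_unlock(&idmap->idmap_mutex);
}
kfree(desc);

Folding both attempts into one helper means the key description is built and freed once rather than once per attempt, and nfs_idmap_get_key() is left with only the payload-copying half of the job.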
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c1c7a9d78722..941246f2b43d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -48,7 +48,6 @@
48#include "iostat.h" 48#include "iostat.h"
49#include "internal.h" 49#include "internal.h"
50#include "fscache.h" 50#include "fscache.h"
51#include "dns_resolve.h"
52#include "pnfs.h" 51#include "pnfs.h"
53#include "nfs.h" 52#include "nfs.h"
54#include "netns.h" 53#include "netns.h"
@@ -79,7 +78,7 @@ int nfs_wait_bit_killable(void *word)
79{ 78{
80 if (fatal_signal_pending(current)) 79 if (fatal_signal_pending(current))
81 return -ERESTARTSYS; 80 return -ERESTARTSYS;
82 freezable_schedule(); 81 freezable_schedule_unsafe();
83 return 0; 82 return 0;
84} 83}
85EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); 84EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
@@ -162,11 +161,19 @@ static void nfs_zap_caches_locked(struct inode *inode)
162 161
163 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); 162 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
164 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { 163 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
165 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
166 nfs_fscache_invalidate(inode); 164 nfs_fscache_invalidate(inode);
167 } else { 165 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
168 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; 166 | NFS_INO_INVALID_LABEL
169 } 167 | NFS_INO_INVALID_DATA
168 | NFS_INO_INVALID_ACCESS
169 | NFS_INO_INVALID_ACL
170 | NFS_INO_REVAL_PAGECACHE;
171 } else
172 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
173 | NFS_INO_INVALID_LABEL
174 | NFS_INO_INVALID_ACCESS
175 | NFS_INO_INVALID_ACL
176 | NFS_INO_REVAL_PAGECACHE;
170} 177}
171 178
172void nfs_zap_caches(struct inode *inode) 179void nfs_zap_caches(struct inode *inode)
@@ -257,12 +264,72 @@ nfs_init_locked(struct inode *inode, void *opaque)
257 return 0; 264 return 0;
258} 265}
259 266
267#ifdef CONFIG_NFS_V4_SECURITY_LABEL
268void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
269 struct nfs4_label *label)
270{
271 int error;
272
273 if (label == NULL)
274 return;
275
276 if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
277 return;
278
279 if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
280 return;
281
282 if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
283 error = security_inode_notifysecctx(inode, label->label,
284 label->len);
285 if (error)
286 printk(KERN_ERR "%s() %s %d "
287 "security_inode_notifysecctx() %d\n",
288 __func__,
289 (char *)label->label,
290 label->len, error);
291 }
292}
293
294struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
295{
296 struct nfs4_label *label = NULL;
297 int minor_version = server->nfs_client->cl_minorversion;
298
299 if (minor_version < 2)
300 return label;
301
302 if (!(server->caps & NFS_CAP_SECURITY_LABEL))
303 return label;
304
305 label = kzalloc(sizeof(struct nfs4_label), flags);
306 if (label == NULL)
307 return ERR_PTR(-ENOMEM);
308
309 label->label = kzalloc(NFS4_MAXLABELLEN, flags);
310 if (label->label == NULL) {
311 kfree(label);
312 return ERR_PTR(-ENOMEM);
313 }
314 label->len = NFS4_MAXLABELLEN;
315
316 return label;
317}
318EXPORT_SYMBOL_GPL(nfs4_label_alloc);
319#else
320void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
321 struct nfs4_label *label)
322{
323}
324#endif
325EXPORT_SYMBOL_GPL(nfs_setsecurity);
326
260/* 327/*
261 * This is our front-end to iget that looks up inodes by file handle 328 * This is our front-end to iget that looks up inodes by file handle
262 * instead of inode number. 329 * instead of inode number.
263 */ 330 */
264struct inode * 331struct inode *
265nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) 332nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label)
266{ 333{
267 struct nfs_find_desc desc = { 334 struct nfs_find_desc desc = {
268 .fh = fh, 335 .fh = fh,
@@ -384,6 +451,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
384 */ 451 */
385 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); 452 inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
386 } 453 }
454
455 nfs_setsecurity(inode, fattr, label);
456
387 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 457 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
388 nfsi->attrtimeo_timestamp = now; 458 nfsi->attrtimeo_timestamp = now;
389 nfsi->access_cache = RB_ROOT; 459 nfsi->access_cache = RB_ROOT;
@@ -449,7 +519,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
449 NFS_PROTO(inode)->return_delegation(inode); 519 NFS_PROTO(inode)->return_delegation(inode);
450 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); 520 error = NFS_PROTO(inode)->setattr(dentry, fattr, attr);
451 if (error == 0) 521 if (error == 0)
452 nfs_refresh_inode(inode, fattr); 522 error = nfs_refresh_inode(inode, fattr);
453 nfs_free_fattr(fattr); 523 nfs_free_fattr(fattr);
454out: 524out:
455 return error; 525 return error;
@@ -713,16 +783,23 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context);
713 * Ensure that mmap has a recent RPC credential for use when writing out 783 * Ensure that mmap has a recent RPC credential for use when writing out
714 * shared pages 784 * shared pages
715 */ 785 */
716void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) 786void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
717{ 787{
718 struct inode *inode = file_inode(filp); 788 struct inode *inode = ctx->dentry->d_inode;
719 struct nfs_inode *nfsi = NFS_I(inode); 789 struct nfs_inode *nfsi = NFS_I(inode);
720 790
721 filp->private_data = get_nfs_open_context(ctx);
722 spin_lock(&inode->i_lock); 791 spin_lock(&inode->i_lock);
723 list_add(&ctx->list, &nfsi->open_files); 792 list_add(&ctx->list, &nfsi->open_files);
724 spin_unlock(&inode->i_lock); 793 spin_unlock(&inode->i_lock);
725} 794}
795EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
796
797void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx)
798{
799 filp->private_data = get_nfs_open_context(ctx);
800 if (list_empty(&ctx->list))
801 nfs_inode_attach_open_context(ctx);
802}
726EXPORT_SYMBOL_GPL(nfs_file_set_open_context); 803EXPORT_SYMBOL_GPL(nfs_file_set_open_context);
727 804
728/* 805/*
@@ -748,10 +825,11 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
748 825
749static void nfs_file_clear_open_context(struct file *filp) 826static void nfs_file_clear_open_context(struct file *filp)
750{ 827{
751 struct inode *inode = file_inode(filp);
752 struct nfs_open_context *ctx = nfs_file_open_context(filp); 828 struct nfs_open_context *ctx = nfs_file_open_context(filp);
753 829
754 if (ctx) { 830 if (ctx) {
831 struct inode *inode = ctx->dentry->d_inode;
832
755 filp->private_data = NULL; 833 filp->private_data = NULL;
756 spin_lock(&inode->i_lock); 834 spin_lock(&inode->i_lock);
757 list_move_tail(&ctx->list, &NFS_I(inode)->open_files); 835 list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -790,6 +868,7 @@ int
790__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 868__nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
791{ 869{
792 int status = -ESTALE; 870 int status = -ESTALE;
871 struct nfs4_label *label = NULL;
793 struct nfs_fattr *fattr = NULL; 872 struct nfs_fattr *fattr = NULL;
794 struct nfs_inode *nfsi = NFS_I(inode); 873 struct nfs_inode *nfsi = NFS_I(inode);
795 874
@@ -807,7 +886,14 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
807 goto out; 886 goto out;
808 887
809 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); 888 nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
810 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr); 889
890 label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
891 if (IS_ERR(label)) {
892 status = PTR_ERR(label);
893 goto out;
894 }
895
896 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
811 if (status != 0) { 897 if (status != 0) {
812 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", 898 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
813 inode->i_sb->s_id, 899 inode->i_sb->s_id,
@@ -817,7 +903,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
817 if (!S_ISDIR(inode->i_mode)) 903 if (!S_ISDIR(inode->i_mode))
818 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); 904 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
819 } 905 }
820 goto out; 906 goto err_out;
821 } 907 }
822 908
823 status = nfs_refresh_inode(inode, fattr); 909 status = nfs_refresh_inode(inode, fattr);
@@ -825,7 +911,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
825 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", 911 dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
826 inode->i_sb->s_id, 912 inode->i_sb->s_id,
827 (long long)NFS_FILEID(inode), status); 913 (long long)NFS_FILEID(inode), status);
828 goto out; 914 goto err_out;
829 } 915 }
830 916
831 if (nfsi->cache_validity & NFS_INO_INVALID_ACL) 917 if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
@@ -835,7 +921,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
835 inode->i_sb->s_id, 921 inode->i_sb->s_id,
836 (long long)NFS_FILEID(inode)); 922 (long long)NFS_FILEID(inode));
837 923
838 out: 924err_out:
925 nfs4_label_free(label);
926out:
839 nfs_free_fattr(fattr); 927 nfs_free_fattr(fattr);
840 return status; 928 return status;
841} 929}
@@ -847,7 +935,7 @@ int nfs_attribute_timeout(struct inode *inode)
847 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 935 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
848} 936}
849 937
850static int nfs_attribute_cache_expired(struct inode *inode) 938int nfs_attribute_cache_expired(struct inode *inode)
851{ 939{
852 if (nfs_have_delegated_attributes(inode)) 940 if (nfs_have_delegated_attributes(inode))
853 return 0; 941 return 0;
@@ -863,7 +951,8 @@ static int nfs_attribute_cache_expired(struct inode *inode)
863 */ 951 */
864int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) 952int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
865{ 953{
866 if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR) 954 if (!(NFS_I(inode)->cache_validity &
955 (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
867 && !nfs_attribute_cache_expired(inode)) 956 && !nfs_attribute_cache_expired(inode))
868 return NFS_STALE(inode) ? -ESTALE : 0; 957 return NFS_STALE(inode) ? -ESTALE : 0;
869 return __nfs_revalidate_inode(server, inode); 958 return __nfs_revalidate_inode(server, inode);
@@ -873,9 +962,15 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
873static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) 962static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
874{ 963{
875 struct nfs_inode *nfsi = NFS_I(inode); 964 struct nfs_inode *nfsi = NFS_I(inode);
876 965 int ret;
966
877 if (mapping->nrpages != 0) { 967 if (mapping->nrpages != 0) {
878 int ret = invalidate_inode_pages2(mapping); 968 if (S_ISREG(inode->i_mode)) {
969 ret = nfs_sync_mapping(mapping);
970 if (ret < 0)
971 return ret;
972 }
973 ret = invalidate_inode_pages2(mapping);
879 if (ret < 0) 974 if (ret < 0)
880 return ret; 975 return ret;
881 } 976 }
@@ -1243,6 +1338,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1243 spin_lock(&inode->i_lock); 1338 spin_lock(&inode->i_lock);
1244 status = nfs_post_op_update_inode_locked(inode, fattr); 1339 status = nfs_post_op_update_inode_locked(inode, fattr);
1245 spin_unlock(&inode->i_lock); 1340 spin_unlock(&inode->i_lock);
1341
1246 return status; 1342 return status;
1247} 1343}
1248EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); 1344EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
@@ -1483,7 +1579,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1483 inode->i_blocks = fattr->du.nfs2.blocks; 1579 inode->i_blocks = fattr->du.nfs2.blocks;
1484 1580
1485 /* Update attrtimeo value if we're out of the unstable period */ 1581 /* Update attrtimeo value if we're out of the unstable period */
1486 if (invalid & NFS_INO_INVALID_ATTR) { 1582 if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) {
1487 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); 1583 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
1488 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 1584 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
1489 nfsi->attrtimeo_timestamp = now; 1585 nfsi->attrtimeo_timestamp = now;
@@ -1496,6 +1592,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1496 } 1592 }
1497 } 1593 }
1498 invalid &= ~NFS_INO_INVALID_ATTR; 1594 invalid &= ~NFS_INO_INVALID_ATTR;
1595 invalid &= ~NFS_INO_INVALID_LABEL;
1499 /* Don't invalidate the data if we were to blame */ 1596 /* Don't invalidate the data if we were to blame */
1500 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) 1597 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
1501 || S_ISLNK(inode->i_mode))) 1598 || S_ISLNK(inode->i_mode)))
@@ -1638,12 +1735,11 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
1638static int nfs_net_init(struct net *net) 1735static int nfs_net_init(struct net *net)
1639{ 1736{
1640 nfs_clients_init(net); 1737 nfs_clients_init(net);
1641 return nfs_dns_resolver_cache_init(net); 1738 return 0;
1642} 1739}
1643 1740
1644static void nfs_net_exit(struct net *net) 1741static void nfs_net_exit(struct net *net)
1645{ 1742{
1646 nfs_dns_resolver_cache_destroy(net);
1647 nfs_cleanup_cb_ident_idr(net); 1743 nfs_cleanup_cb_ident_idr(net);
1648} 1744}
1649 1745
@@ -1661,10 +1757,6 @@ static int __init init_nfs_fs(void)
1661{ 1757{
1662 int err; 1758 int err;
1663 1759
1664 err = nfs_dns_resolver_init();
1665 if (err < 0)
1666 goto out10;
1667
1668 err = register_pernet_subsys(&nfs_net_ops); 1760 err = register_pernet_subsys(&nfs_net_ops);
1669 if (err < 0) 1761 if (err < 0)
1670 goto out9; 1762 goto out9;
@@ -1730,8 +1822,6 @@ out7:
1730out8: 1822out8:
1731 unregister_pernet_subsys(&nfs_net_ops); 1823 unregister_pernet_subsys(&nfs_net_ops);
1732out9: 1824out9:
1733 nfs_dns_resolver_destroy();
1734out10:
1735 return err; 1825 return err;
1736} 1826}
1737 1827
@@ -1744,7 +1834,6 @@ static void __exit exit_nfs_fs(void)
1744 nfs_destroy_nfspagecache(); 1834 nfs_destroy_nfspagecache();
1745 nfs_fscache_unregister(); 1835 nfs_fscache_unregister();
1746 unregister_pernet_subsys(&nfs_net_ops); 1836 unregister_pernet_subsys(&nfs_net_ops);
1747 nfs_dns_resolver_destroy();
1748#ifdef CONFIG_PROC_FS 1837#ifdef CONFIG_PROC_FS
1749 rpc_proc_unregister(&init_net, "nfs"); 1838 rpc_proc_unregister(&init_net, "nfs");
1750#endif 1839#endif
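
Three quieter changes in this file are easy to miss in the interleave: nfs_zap_caches_locked() now raises NFS_INO_INVALID_LABEL alongside the other validity bits, nfs_setattr() propagates the nfs_refresh_inode() result instead of discarding it, and nfs_invalidate_mapping() flushes dirty data before purging the page cache. The last one, condensed:

if (mapping->nrpages != 0) {
        if (S_ISREG(inode->i_mode)) {
                ret = nfs_sync_mapping(mapping);        /* write back first */
                if (ret < 0)
                        return ret;
        }
        ret = invalidate_inode_pages2(mapping);         /* then invalidate */
        if (ret < 0)
                return ret;
}

Without the flush, invalidate_inode_pages2() can trip over still-dirty pages produced by a racing writer after revalidation decided the cache was stale.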
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 91e59a39fc08..3c8373f90ab3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -165,7 +165,7 @@ extern void nfs_free_client(struct nfs_client *);
165extern struct nfs_client *nfs4_find_client_ident(struct net *, int); 165extern struct nfs_client *nfs4_find_client_ident(struct net *, int);
166extern struct nfs_client * 166extern struct nfs_client *
167nfs4_find_client_sessionid(struct net *, const struct sockaddr *, 167nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
168 struct nfs4_sessionid *); 168 struct nfs4_sessionid *, u32);
169extern struct nfs_server *nfs_create_server(struct nfs_mount_info *, 169extern struct nfs_server *nfs_create_server(struct nfs_mount_info *,
170 struct nfs_subversion *); 170 struct nfs_subversion *);
171extern struct nfs_server *nfs4_create_server( 171extern struct nfs_server *nfs4_create_server(
@@ -255,6 +255,7 @@ extern int nfs4_decode_dirent(struct xdr_stream *,
255#ifdef CONFIG_NFS_V4_1 255#ifdef CONFIG_NFS_V4_1
256extern const u32 nfs41_maxread_overhead; 256extern const u32 nfs41_maxread_overhead;
257extern const u32 nfs41_maxwrite_overhead; 257extern const u32 nfs41_maxwrite_overhead;
258extern const u32 nfs41_maxgetdevinfo_overhead;
258#endif 259#endif
259 260
260/* nfs4proc.c */ 261/* nfs4proc.c */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 91a6faf811ac..99a45283b9ee 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -139,7 +139,10 @@ struct mnt_fhstatus {
139 * nfs_mount - Obtain an NFS file handle for the given host and path 139 * nfs_mount - Obtain an NFS file handle for the given host and path
140 * @info: pointer to mount request arguments 140 * @info: pointer to mount request arguments
141 * 141 *
142 * Uses default timeout parameters specified by underlying transport. 142 * Uses default timeout parameters specified by underlying transport. On
143 * successful return, the auth_flavs list and auth_flav_len will be populated
144 * with the list from the server or a faked-up list if the server didn't
145 * provide one.
143 */ 146 */
144int nfs_mount(struct nfs_mount_request *info) 147int nfs_mount(struct nfs_mount_request *info)
145{ 148{
@@ -195,6 +198,15 @@ int nfs_mount(struct nfs_mount_request *info)
195 dprintk("NFS: MNT request succeeded\n"); 198 dprintk("NFS: MNT request succeeded\n");
196 status = 0; 199 status = 0;
197 200
201 /*
202 * If the server didn't provide a flavor list, allow the
203 * client to try any flavor.
204 */
205 if (info->version != NFS_MNT3_VERSION || *info->auth_flav_len == 0) {
206 dprintk("NFS: Faking up auth_flavs list\n");
207 info->auth_flavs[0] = RPC_AUTH_NULL;
208 *info->auth_flav_len = 1;
209 }
198out: 210out:
199 return status; 211 return status;
200 212
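
A hypothetical caller, to make the amended contract concrete: on success the flavor list now always holds at least one entry, with RPC_AUTH_NULL standing in for "server expressed no preference". The consider_flavor() helper is invented; NFS_MAX_SECFLAVORS is the client's usual list bound.

rpc_authflavor_t flavs[NFS_MAX_SECFLAVORS];
unsigned int flav_len = NFS_MAX_SECFLAVORS;
unsigned int i;
struct nfs_mount_request request = {
        /* transport and path fields elided */
        .auth_flavs     = flavs,
        .auth_flav_len  = &flav_len,
};

if (nfs_mount(&request) == 0)
        for (i = 0; i < flav_len; i++)
                consider_flavor(flavs[i]);      /* hypothetical helper */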
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index fc8dc20fdeb9..348b535cd786 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -280,7 +280,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry,
280 struct dentry *parent = dget_parent(dentry); 280 struct dentry *parent = dget_parent(dentry);
281 281
282 /* Look it up again to get its attributes */ 282 /* Look it up again to get its attributes */
283 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr); 283 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL);
284 dput(parent); 284 dput(parent);
285 if (err != 0) 285 if (err != 0)
286 return ERR_PTR(err); 286 return ERR_PTR(err);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 43ea96ced28c..f5c84c3efbca 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -33,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
33 res = rpc_call_sync(clnt, msg, flags); 33 res = rpc_call_sync(clnt, msg, flags);
34 if (res != -EJUKEBOX) 34 if (res != -EJUKEBOX)
35 break; 35 break;
36 freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); 36 freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
37 res = -ERESTARTSYS; 37 res = -ERESTARTSYS;
38 } while (!fatal_signal_pending(current)); 38 } while (!fatal_signal_pending(current));
39 return res; 39 return res;
@@ -98,7 +98,7 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
98 */ 98 */
99static int 99static int
100nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, 100nfs3_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
101 struct nfs_fattr *fattr) 101 struct nfs_fattr *fattr, struct nfs4_label *label)
102{ 102{
103 struct rpc_message msg = { 103 struct rpc_message msg = {
104 .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR], 104 .rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR],
@@ -143,7 +143,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
143 143
144static int 144static int
145nfs3_proc_lookup(struct inode *dir, struct qstr *name, 145nfs3_proc_lookup(struct inode *dir, struct qstr *name,
146 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 146 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
147 struct nfs4_label *label)
147{ 148{
148 struct nfs3_diropargs arg = { 149 struct nfs3_diropargs arg = {
149 .fh = NFS_FH(dir), 150 .fh = NFS_FH(dir),
@@ -300,7 +301,7 @@ static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_
300 status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); 301 status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
301 nfs_post_op_update_inode(dir, data->res.dir_attr); 302 nfs_post_op_update_inode(dir, data->res.dir_attr);
302 if (status == 0) 303 if (status == 0)
303 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); 304 status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
304 return status; 305 return status;
305} 306}
306 307
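
This file, like nfs4proc.c below, switches its retry sleeps to the *_unsafe freezer helpers, the variants provided for callers that may sleep while holding state the plain freezable helpers now object to. The jukebox retry loop itself, condensed from the hunk above:

do {
        res = rpc_call_sync(clnt, msg, flags);
        if (res != -EJUKEBOX)
                break;
        /* freezer-friendly, killable sleep between attempts */
        freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
        res = -ERESTARTSYS;
} while (!fatal_signal_pending(current));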
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a1dd768d0a35..ee81e354bce7 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -194,7 +194,7 @@ struct nfs4_state_recovery_ops {
194 int (*recover_lock)(struct nfs4_state *, struct file_lock *); 194 int (*recover_lock)(struct nfs4_state *, struct file_lock *);
195 int (*establish_clid)(struct nfs_client *, struct rpc_cred *); 195 int (*establish_clid)(struct nfs_client *, struct rpc_cred *);
196 struct rpc_cred * (*get_clid_cred)(struct nfs_client *); 196 struct rpc_cred * (*get_clid_cred)(struct nfs_client *);
197 int (*reclaim_complete)(struct nfs_client *); 197 int (*reclaim_complete)(struct nfs_client *, struct rpc_cred *);
198 int (*detect_trunking)(struct nfs_client *, struct nfs_client **, 198 int (*detect_trunking)(struct nfs_client *, struct nfs_client **,
199 struct rpc_cred *); 199 struct rpc_cred *);
200}; 200};
@@ -303,10 +303,10 @@ is_ds_client(struct nfs_client *clp)
303extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; 303extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
304 304
305extern const u32 nfs4_fattr_bitmap[3]; 305extern const u32 nfs4_fattr_bitmap[3];
306extern const u32 nfs4_statfs_bitmap[2]; 306extern const u32 nfs4_statfs_bitmap[3];
307extern const u32 nfs4_pathconf_bitmap[2]; 307extern const u32 nfs4_pathconf_bitmap[3];
308extern const u32 nfs4_fsinfo_bitmap[3]; 308extern const u32 nfs4_fsinfo_bitmap[3];
309extern const u32 nfs4_fs_locations_bitmap[2]; 309extern const u32 nfs4_fs_locations_bitmap[3];
310 310
311void nfs4_free_client(struct nfs_client *); 311void nfs4_free_client(struct nfs_client *);
312 312
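
The bitmap arrays widen from [2] to [3] here because NFSv4.2 attributes, the security label among them, live in bitmap word 2; sizing every mask at three words lets the XDR encoder treat them uniformly. A sketch of the resulting shape, using constants that appear elsewhere in this diff:

static const u32 demo_bitmap[3] = {
        FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE,        /* word 0 */
        FATTR4_WORD1_TIME_MODIFY,                       /* word 1 */
        FATTR4_WORD2_SECURITY_LABEL,                    /* word 2 */
};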
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 4cbad5d6b276..90dce91dd5b5 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -66,6 +66,11 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
66 if (err) 66 if (err)
67 goto error; 67 goto error;
68 68
69 if (cl_init->minorversion > NFS4_MAX_MINOR_VERSION) {
70 err = -EINVAL;
71 goto error;
72 }
73
69 spin_lock_init(&clp->cl_lock); 74 spin_lock_init(&clp->cl_lock);
70 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 75 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
71 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); 76 rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
@@ -562,14 +567,14 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
562 */ 567 */
563struct nfs_client * 568struct nfs_client *
564nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, 569nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
565 struct nfs4_sessionid *sid) 570 struct nfs4_sessionid *sid, u32 minorversion)
566{ 571{
567 struct nfs_client *clp; 572 struct nfs_client *clp;
568 struct nfs_net *nn = net_generic(net, nfs_net_id); 573 struct nfs_net *nn = net_generic(net, nfs_net_id);
569 574
570 spin_lock(&nn->nfs_client_lock); 575 spin_lock(&nn->nfs_client_lock);
571 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { 576 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
572 if (nfs4_cb_match_client(addr, clp, 1) == false) 577 if (nfs4_cb_match_client(addr, clp, minorversion) == false)
573 continue; 578 continue;
574 579
575 if (!nfs4_has_session(clp)) 580 if (!nfs4_has_session(clp))
@@ -592,7 +597,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
592 597
593struct nfs_client * 598struct nfs_client *
594nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, 599nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
595 struct nfs4_sessionid *sid) 600 struct nfs4_sessionid *sid, u32 minorversion)
596{ 601{
597 return NULL; 602 return NULL;
598} 603}
@@ -626,6 +631,8 @@ static int nfs4_set_client(struct nfs_server *server,
626 631
627 if (server->flags & NFS_MOUNT_NORESVPORT) 632 if (server->flags & NFS_MOUNT_NORESVPORT)
628 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 633 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
634 if (server->options & NFS_OPTION_MIGRATION)
635 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
629 636
630 /* Allocate or find a client reference we can use */ 637 /* Allocate or find a client reference we can use */
631 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); 638 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
@@ -730,7 +737,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
730 return -ENOMEM; 737 return -ENOMEM;
731 738
732 /* We must ensure the session is initialised first */ 739 /* We must ensure the session is initialised first */
733 error = nfs4_init_session(server); 740 error = nfs4_init_session(server->nfs_client);
734 if (error < 0) 741 if (error < 0)
735 goto out; 742 goto out;
736 743
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 13e6bb3e3fe5..e5b804dd944c 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -69,7 +69,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
69 goto out_drop; 69 goto out_drop;
70 } 70 }
71 } 71 }
72 iput(inode);
73 if (inode != dentry->d_inode) 72 if (inode != dentry->d_inode)
74 goto out_drop; 73 goto out_drop;
75 74
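
The dropped iput() was an unbalanced put: nfs4_file_open() never took its own reference on that inode, so the call released a count the function did not own. The general rule, as a sketch:

struct inode *inode = igrab(dentry->d_inode);   /* take our own ref */
if (inode == NULL)
        return -ESTALE;
/* ... use inode ... */
iput(inode);    /* pairs with the igrab() above, and only with it */

dentry->d_inode itself is a borrowed reference and must not be handed to iput() directly.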
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 22d10623f5ee..17ed87ef9de8 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -643,7 +643,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
643 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, 643 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
644 NFS_SERVER(lo->plh_inode)->nfs_client, id); 644 NFS_SERVER(lo->plh_inode)->nfs_client, id);
645 if (d == NULL) { 645 if (d == NULL) {
646 dsaddr = filelayout_get_device_info(lo->plh_inode, id, gfp_flags); 646 dsaddr = filelayout_get_device_info(lo->plh_inode, id,
647 lo->plh_lc_cred, gfp_flags);
647 if (dsaddr == NULL) 648 if (dsaddr == NULL)
648 goto out; 649 goto out;
649 } else 650 } else
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 235ff952d3c8..cebd20e7e923 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -150,6 +150,7 @@ struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
152struct nfs4_file_layout_dsaddr * 152struct nfs4_file_layout_dsaddr *
153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); 153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
154 struct rpc_cred *cred, gfp_t gfp_flags);
154 155
155#endif /* FS_NFS_NFS4FILELAYOUT_H */ 156#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 661a0f611215..95604f64cab8 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -668,7 +668,10 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
668 * of available devices, and return it. 668 * of available devices, and return it.
669 */ 669 */
670struct nfs4_file_layout_dsaddr * 670struct nfs4_file_layout_dsaddr *
671filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags) 671filelayout_get_device_info(struct inode *inode,
672 struct nfs4_deviceid *dev_id,
673 struct rpc_cred *cred,
674 gfp_t gfp_flags)
672{ 675{
673 struct pnfs_device *pdev = NULL; 676 struct pnfs_device *pdev = NULL;
674 u32 max_resp_sz; 677 u32 max_resp_sz;
@@ -708,8 +711,9 @@ filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gf
708 pdev->pgbase = 0; 711 pdev->pgbase = 0;
709 pdev->pglen = max_resp_sz; 712 pdev->pglen = max_resp_sz;
710 pdev->mincount = 0; 713 pdev->mincount = 0;
714 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
711 715
712 rc = nfs4_proc_getdeviceinfo(server, pdev); 716 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
713 dprintk("%s getdevice info returns %d\n", __func__, rc); 717 dprintk("%s getdevice info returns %d\n", __func__, rc);
714 if (rc) 718 if (rc)
715 goto out_free; 719 goto out_free;
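
GETDEVICEINFO now advertises how large a reply the client can absorb. The budget set up above, condensed: the whole reply buffer, minus the fixed per-operation overhead exported from internal.h, is offered for the device address body.

pdev->pgbase   = 0;
pdev->pglen    = max_resp_sz;           /* reply pages available */
pdev->mincount = 0;
pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;

rc = nfs4_proc_getdeviceinfo(server, pdev, cred);       /* now takes a cred */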
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d7ba5616989c..108a774095f7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,15 +77,68 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
77static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 77static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
78static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 78static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
79static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); 79static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
80static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *); 80static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
81static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 81static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
82static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 82static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
83 struct nfs_fattr *fattr, struct iattr *sattr, 83 struct nfs_fattr *fattr, struct iattr *sattr,
84 struct nfs4_state *state); 84 struct nfs4_state *state, struct nfs4_label *ilabel,
85 struct nfs4_label *olabel);
85#ifdef CONFIG_NFS_V4_1 86#ifdef CONFIG_NFS_V4_1
86static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *); 87static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
87static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *); 88 struct rpc_cred *);
89static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
90 struct rpc_cred *);
88#endif 91#endif
92
93#ifdef CONFIG_NFS_V4_SECURITY_LABEL
94static inline struct nfs4_label *
95nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
96 struct iattr *sattr, struct nfs4_label *label)
97{
98 int err;
99
100 if (label == NULL)
101 return NULL;
102
103 if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
104 return NULL;
105
106 if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
107 return NULL;
108
109 err = security_dentry_init_security(dentry, sattr->ia_mode,
110 &dentry->d_name, (void **)&label->label, &label->len);
111 if (err == 0)
112 return label;
113
114 return NULL;
115}
116static inline void
117nfs4_label_release_security(struct nfs4_label *label)
118{
119 if (label)
120 security_release_secctx(label->label, label->len);
121}
122static inline u32 *nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
123{
124 if (label)
125 return server->attr_bitmask;
126
127 return server->attr_bitmask_nl;
128}
129#else
130static inline struct nfs4_label *
131nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
132 struct iattr *sattr, struct nfs4_label *l)
133{ return NULL; }
134static inline void
135nfs4_label_release_security(struct nfs4_label *label)
136{ return; }
137static inline u32 *
138nfs4_bitmask(struct nfs_server *server, struct nfs4_label *label)
139{ return server->attr_bitmask; }
140#endif
141
89/* Prevent leaks of NFSv4 errors into userland */ 142/* Prevent leaks of NFSv4 errors into userland */
90static int nfs4_map_errors(int err) 143static int nfs4_map_errors(int err)
91{ 144{
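
The label helpers above follow the standard kernel pattern for optional features: real functions under the config symbol, empty static inlines otherwise, so every call site stays unconditional and costs nothing in disabled builds. The idiom in isolation, with a made-up config symbol:

#ifdef CONFIG_DEMO_FEATURE
void demo_feature_apply(struct inode *inode);
#else
static inline void demo_feature_apply(struct inode *inode)
{
}
#endif

Callers simply invoke demo_feature_apply(); when CONFIG_DEMO_FEATURE is off, the compiler folds the call away.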
@@ -134,7 +187,10 @@ const u32 nfs4_fattr_bitmap[3] = {
 	| FATTR4_WORD1_SPACE_USED
 	| FATTR4_WORD1_TIME_ACCESS
 	| FATTR4_WORD1_TIME_METADATA
-	| FATTR4_WORD1_TIME_MODIFY
+	| FATTR4_WORD1_TIME_MODIFY,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+	FATTR4_WORD2_SECURITY_LABEL
+#endif
 };
 
 static const u32 nfs4_pnfs_open_bitmap[3] = {
@@ -161,7 +217,7 @@ static const u32 nfs4_open_noattr_bitmap[3] = {
 	| FATTR4_WORD0_FILEID,
 };
 
-const u32 nfs4_statfs_bitmap[2] = {
+const u32 nfs4_statfs_bitmap[3] = {
 	FATTR4_WORD0_FILES_AVAIL
 	| FATTR4_WORD0_FILES_FREE
 	| FATTR4_WORD0_FILES_TOTAL,
@@ -170,7 +226,7 @@ const u32 nfs4_statfs_bitmap[2] = {
 	| FATTR4_WORD1_SPACE_TOTAL
 };
 
-const u32 nfs4_pathconf_bitmap[2] = {
+const u32 nfs4_pathconf_bitmap[3] = {
 	FATTR4_WORD0_MAXLINK
 	| FATTR4_WORD0_MAXNAME,
 	0
@@ -185,7 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
 	FATTR4_WORD2_LAYOUT_BLKSIZE
 };
 
-const u32 nfs4_fs_locations_bitmap[2] = {
+const u32 nfs4_fs_locations_bitmap[3] = {
 	FATTR4_WORD0_TYPE
 	| FATTR4_WORD0_CHANGE
 	| FATTR4_WORD0_SIZE
@@ -201,7 +257,7 @@ const u32 nfs4_fs_locations_bitmap[2] = {
 	| FATTR4_WORD1_TIME_ACCESS
 	| FATTR4_WORD1_TIME_METADATA
 	| FATTR4_WORD1_TIME_MODIFY
-	| FATTR4_WORD1_MOUNTED_ON_FILEID
+	| FATTR4_WORD1_MOUNTED_ON_FILEID,
 };
 
 static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
@@ -268,7 +324,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 		*timeout = NFS4_POLL_RETRY_MIN;
 	if (*timeout > NFS4_POLL_RETRY_MAX)
 		*timeout = NFS4_POLL_RETRY_MAX;
-	freezable_schedule_timeout_killable(*timeout);
+	freezable_schedule_timeout_killable_unsafe(*timeout);
 	if (fatal_signal_pending(current))
 		res = -ERESTARTSYS;
 	*timeout <<= 1;
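nfs4_delay() above implements a capped exponential backoff between NFS4ERR_DELAY retries: clamp the timeout into [MIN, MAX], sleep, then double it for the next round. A self-contained sketch of that schedule; the constants are placeholders, not the real NFS4_POLL_RETRY_* values.

#include <stdio.h>

#define POLL_RETRY_MIN 1	/* illustrative stand-ins */
#define POLL_RETRY_MAX 16

/* Clamp, "sleep", then double, exactly as nfs4_delay() does. */
static long next_delay(long *timeout)
{
	if (*timeout < POLL_RETRY_MIN)
		*timeout = POLL_RETRY_MIN;
	if (*timeout > POLL_RETRY_MAX)
		*timeout = POLL_RETRY_MAX;
	/* a real implementation would sleep for *timeout here */
	*timeout <<= 1;
	return *timeout;
}

int main(void)
{
	long t = 0;
	for (int i = 0; i < 6; i++)
		printf("slept, next timeout = %ld\n", next_delay(&t));
	/* prints 2, 4, 8, 16, 32, 32: doubling, then pinned at the cap */
	return 0;
}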
@@ -762,6 +818,7 @@ struct nfs4_opendata {
 	struct nfs4_string owner_name;
 	struct nfs4_string group_name;
 	struct nfs_fattr f_attr;
+	struct nfs4_label *f_label;
 	struct dentry *dir;
 	struct dentry *dentry;
 	struct nfs4_state_owner *owner;
@@ -807,6 +864,7 @@ nfs4_map_atomic_open_claim(struct nfs_server *server,
 static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 {
 	p->o_res.f_attr = &p->f_attr;
+	p->o_res.f_label = p->f_label;
 	p->o_res.seqid = p->o_arg.seqid;
 	p->c_res.seqid = p->c_arg.seqid;
 	p->o_res.server = p->o_arg.server;
@@ -818,6 +876,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 		struct nfs4_state_owner *sp, fmode_t fmode, int flags,
 		const struct iattr *attrs,
+		struct nfs4_label *label,
 		enum open_claim_type4 claim,
 		gfp_t gfp_mask)
 {
@@ -829,9 +888,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p = kzalloc(sizeof(*p), gfp_mask);
 	if (p == NULL)
 		goto err;
+
+	p->f_label = nfs4_label_alloc(server, gfp_mask);
+	if (IS_ERR(p->f_label))
+		goto err_free_p;
+
 	p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
 	if (p->o_arg.seqid == NULL)
-		goto err_free;
+		goto err_free_label;
 	nfs_sb_active(dentry->d_sb);
 	p->dentry = dget(dentry);
 	p->dir = parent;
@@ -852,8 +916,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
 	p->o_arg.name = &dentry->d_name;
 	p->o_arg.server = server;
-	p->o_arg.bitmask = server->attr_bitmask;
+	p->o_arg.bitmask = nfs4_bitmask(server, label);
 	p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0];
+	p->o_arg.label = label;
 	p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim);
 	switch (p->o_arg.claim) {
 	case NFS4_OPEN_CLAIM_NULL:
@@ -884,7 +949,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	nfs4_init_opendata_res(p);
 	kref_init(&p->kref);
 	return p;
-err_free:
+
+err_free_label:
+	nfs4_label_free(p->f_label);
+err_free_p:
 	kfree(p);
 err:
 	dput(parent);
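The relabelled error path above follows the kernel's staged-unwind convention: each new allocation gains a matching cleanup label, and a failure jumps to the label that frees everything allocated so far, in reverse order. A compact userspace illustration of the same structure (the struct and names are invented for the example):

#include <stdlib.h>

struct opendata { char *label; int *seqid; };

/* Two-stage allocation with reverse-order unwind: a failure at stage N
 * jumps to the label that releases stages N-1 .. 1, then the object. */
static struct opendata *opendata_alloc(void)
{
	struct opendata *p = calloc(1, sizeof(*p));
	if (p == NULL)
		goto err;

	p->label = malloc(32);
	if (p->label == NULL)
		goto err_free_p;

	p->seqid = malloc(sizeof(*p->seqid));
	if (p->seqid == NULL)
		goto err_free_label;
	return p;

err_free_label:
	free(p->label);
err_free_p:
	free(p);
err:
	return NULL;
}

int main(void)
{
	struct opendata *p = opendata_alloc();
	if (p) {
		free(p->seqid);
		free(p->label);
		free(p);
	}
	return 0;
}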
@@ -901,6 +969,9 @@ static void nfs4_opendata_free(struct kref *kref)
 	if (p->state != NULL)
 		nfs4_put_open_state(p->state);
 	nfs4_put_state_owner(p->owner);
+
+	nfs4_label_free(p->f_label);
+
 	dput(p->dir);
 	dput(p->dentry);
 	nfs_sb_deactive(sb);
@@ -1179,6 +1250,8 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
 	if (ret)
 		goto err;
 
+	nfs_setsecurity(inode, &data->f_attr, data->f_label);
+
 	if (data->o_res.delegation_type != 0)
 		nfs4_opendata_check_deleg(data, state);
 	update_open_stateid(state, &data->o_res.stateid, NULL,
@@ -1205,7 +1278,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 	ret = -EAGAIN;
 	if (!(data->f_attr.valid & NFS_ATTR_FATTR))
 		goto err;
-	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr);
+	inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
 	ret = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto err;
@@ -1258,7 +1331,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
 	struct nfs4_opendata *opendata;
 
 	opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0,
-			NULL, claim, GFP_NOFS);
+			NULL, NULL, claim, GFP_NOFS);
 	if (opendata == NULL)
 		return ERR_PTR(-ENOMEM);
 	opendata->state = state;
@@ -1784,7 +1857,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 		return status;
 	}
 	if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
-		_nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr);
+		_nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
 	return 0;
 }
 
@@ -1855,18 +1928,30 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	nfs4_stateid *stateid = &state->stateid;
-	int status;
+	struct nfs_delegation *delegation;
+	struct rpc_cred *cred = NULL;
+	int status = -NFS4ERR_BAD_STATEID;
 
 	/* If a state reset has been done, test_stateid is unneeded */
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
 		return;
 
-	status = nfs41_test_stateid(server, stateid);
+	/* Get the delegation credential for use by test/free_stateid */
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
+	if (delegation != NULL &&
+	    nfs4_stateid_match(&delegation->stateid, stateid)) {
+		cred = get_rpccred(delegation->cred);
+		rcu_read_unlock();
+		status = nfs41_test_stateid(server, stateid, cred);
+	} else
+		rcu_read_unlock();
+
 	if (status != NFS_OK) {
 		/* Free the stateid unless the server explicitly
 		 * informs us the stateid is unrecognized. */
 		if (status != -NFS4ERR_BAD_STATEID)
-			nfs41_free_stateid(server, stateid);
+			nfs41_free_stateid(server, stateid, cred);
 		nfs_remove_bad_delegation(state->inode);
 
 		write_seqlock(&state->seqlock);
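The new code above pins the delegation credential (get_rpccred) while the RCU read lock is held, drops the lock, uses the credential for the TEST_STATEID/FREE_STATEID calls, and releases it at the end. Below is a simplified userspace model of that take-a-reference-under-the-lock pattern, using a plain mutex in place of RCU; everything here is illustrative, not kernel API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int cred_refcount = 1;	/* stands in for the delegation cred */

/* Pin the credential while the lock is held, then drop the lock;
 * the caller may use the pinned object without holding the lock. */
static int *cred_get(void)
{
	int *cred = NULL;

	pthread_mutex_lock(&lock);
	if (cred_refcount > 0) {	/* "delegation still valid" check */
		cred_refcount++;	/* models get_rpccred() */
		cred = &cred_refcount;
	}
	pthread_mutex_unlock(&lock);
	return cred;
}

static void cred_put(int *cred)
{
	pthread_mutex_lock(&lock);
	(*cred)--;			/* models put_rpccred() */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	int *cred = cred_get();
	if (cred) {
		printf("using cred, refcount=%d\n", *cred);	/* "RPC" here */
		cred_put(cred);
	}
	return 0;
}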
@@ -1874,6 +1959,9 @@ static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
 		write_sequnlock(&state->seqlock);
 		clear_bit(NFS_DELEGATED_STATE, &state->flags);
 	}
+
+	if (cred != NULL)
+		put_rpccred(cred);
 }
 
 /**
@@ -1888,6 +1976,7 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	nfs4_stateid *stateid = &state->open_stateid;
+	struct rpc_cred *cred = state->owner->so_cred;
 	int status;
 
 	/* If a state reset has been done, test_stateid is unneeded */
@@ -1896,12 +1985,12 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
 	    (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
 		return -NFS4ERR_BAD_STATEID;
 
-	status = nfs41_test_stateid(server, stateid);
+	status = nfs41_test_stateid(server, stateid, cred);
 	if (status != NFS_OK) {
 		/* Free the stateid unless the server explicitly
 		 * informs us the stateid is unrecognized. */
 		if (status != -NFS4ERR_BAD_STATEID)
-			nfs41_free_stateid(server, stateid);
+			nfs41_free_stateid(server, stateid, cred);
 
 		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
 		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
@@ -1942,10 +2031,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
 static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 			fmode_t fmode,
 			int flags,
-			struct nfs4_state **res)
+			struct nfs_open_context *ctx)
 {
 	struct nfs4_state_owner *sp = opendata->owner;
 	struct nfs_server *server = sp->so_server;
+	struct dentry *dentry;
 	struct nfs4_state *state;
 	unsigned int seq;
 	int ret;
@@ -1963,13 +2053,31 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	if (server->caps & NFS_CAP_POSIX_LOCK)
 		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
 
+	dentry = opendata->dentry;
+	if (dentry->d_inode == NULL) {
+		/* FIXME: Is this d_drop() ever needed? */
+		d_drop(dentry);
+		dentry = d_add_unique(dentry, igrab(state->inode));
+		if (dentry == NULL) {
+			dentry = opendata->dentry;
+		} else if (dentry != ctx->dentry) {
+			dput(ctx->dentry);
+			ctx->dentry = dget(dentry);
+		}
+		nfs_set_verifier(dentry,
+				nfs_save_change_attribute(opendata->dir->d_inode));
+	}
+
 	ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags);
 	if (ret != 0)
 		goto out;
 
-	if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
-		nfs4_schedule_stateid_recovery(server, state);
-	*res = state;
+	ctx->state = state;
+	if (dentry->d_inode == state->inode) {
+		nfs_inode_attach_open_context(ctx);
+		if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+			nfs4_schedule_stateid_recovery(server, state);
+	}
 out:
 	return ret;
 }
@@ -1978,19 +2086,21 @@ out:
  * Returns a referenced nfs4_state
  */
 static int _nfs4_do_open(struct inode *dir,
-			struct dentry *dentry,
-			fmode_t fmode,
+			struct nfs_open_context *ctx,
 			int flags,
 			struct iattr *sattr,
-			struct rpc_cred *cred,
-			struct nfs4_state **res,
-			struct nfs4_threshold **ctx_th)
+			struct nfs4_label *label)
 {
 	struct nfs4_state_owner *sp;
 	struct nfs4_state *state = NULL;
 	struct nfs_server *server = NFS_SERVER(dir);
 	struct nfs4_opendata *opendata;
+	struct dentry *dentry = ctx->dentry;
+	struct rpc_cred *cred = ctx->cred;
+	struct nfs4_threshold **ctx_th = &ctx->mdsthreshold;
+	fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC);
 	enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL;
+	struct nfs4_label *olabel = NULL;
 	int status;
 
 	/* Protect against reboot recovery conflicts */
@@ -2009,22 +2119,31 @@ static int _nfs4_do_open(struct inode *dir,
 	if (dentry->d_inode)
 		claim = NFS4_OPEN_CLAIM_FH;
 	opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr,
-			claim, GFP_KERNEL);
+			label, claim, GFP_KERNEL);
 	if (opendata == NULL)
 		goto err_put_state_owner;
 
+	if (label) {
+		olabel = nfs4_label_alloc(server, GFP_KERNEL);
+		if (IS_ERR(olabel)) {
+			status = PTR_ERR(olabel);
+			goto err_opendata_put;
+		}
+	}
+
 	if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
 		opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
 		if (!opendata->f_attr.mdsthreshold)
-			goto err_opendata_put;
+			goto err_free_label;
 		opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
 	}
 	if (dentry->d_inode != NULL)
 		opendata->state = nfs4_get_open_state(dentry->d_inode, sp);
 
-	status = _nfs4_open_and_get_state(opendata, fmode, flags, &state);
+	status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx);
 	if (status != 0)
-		goto err_opendata_put;
+		goto err_free_label;
+	state = ctx->state;
 
 	if ((opendata->o_arg.open_flags & O_EXCL) &&
 	    (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
@@ -2033,10 +2152,12 @@ static int _nfs4_do_open(struct inode *dir,
 		nfs_fattr_init(opendata->o_res.f_attr);
 		status = nfs4_do_setattr(state->inode, cred,
 				opendata->o_res.f_attr, sattr,
-				state);
-		if (status == 0)
+				state, label, olabel);
+		if (status == 0) {
 			nfs_setattr_update_inode(state->inode, sattr);
 			nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+			nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
+		}
 	}
 
 	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
@@ -2045,38 +2166,37 @@ static int _nfs4_do_open(struct inode *dir,
 	kfree(opendata->f_attr.mdsthreshold);
 	opendata->f_attr.mdsthreshold = NULL;
 
+	nfs4_label_free(olabel);
+
 	nfs4_opendata_put(opendata);
 	nfs4_put_state_owner(sp);
-	*res = state;
 	return 0;
+err_free_label:
+	nfs4_label_free(olabel);
 err_opendata_put:
 	kfree(opendata->f_attr.mdsthreshold);
 	nfs4_opendata_put(opendata);
 err_put_state_owner:
 	nfs4_put_state_owner(sp);
 out_err:
-	*res = NULL;
 	return status;
 }
 
 
 static struct nfs4_state *nfs4_do_open(struct inode *dir,
-		struct dentry *dentry,
-		fmode_t fmode,
+		struct nfs_open_context *ctx,
 		int flags,
 		struct iattr *sattr,
-		struct rpc_cred *cred,
-		struct nfs4_threshold **ctx_th)
+		struct nfs4_label *label)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	struct nfs4_exception exception = { };
 	struct nfs4_state *res;
 	int status;
 
-	fmode &= FMODE_READ|FMODE_WRITE|FMODE_EXEC;
 	do {
-		status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred,
-				&res, ctx_th);
+		status = _nfs4_do_open(dir, ctx, flags, sattr, label);
+		res = ctx->state;
 		if (status == 0)
 			break;
 		/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -2122,7 +2242,8 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
 
 static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			   struct nfs_fattr *fattr, struct iattr *sattr,
-			   struct nfs4_state *state)
+			   struct nfs4_state *state, struct nfs4_label *ilabel,
+			   struct nfs4_label *olabel)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_setattrargs arg = {
@@ -2130,9 +2251,11 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 		.iap = sattr,
 		.server = server,
 		.bitmask = server->attr_bitmask,
+		.label = ilabel,
 	};
 	struct nfs_setattrres res = {
 		.fattr = fattr,
+		.label = olabel,
 		.server = server,
 	};
 	struct rpc_message msg = {
@@ -2146,6 +2269,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 	bool truncate;
 	int status;
 
+	arg.bitmask = nfs4_bitmask(server, ilabel);
+	if (ilabel)
+		arg.bitmask = nfs4_bitmask(server, olabel);
+
 	nfs_fattr_init(fattr);
 
 	/* Servers should only apply open mode checks for file size changes */
@@ -2172,7 +2299,8 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 
 static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 			   struct nfs_fattr *fattr, struct iattr *sattr,
-			   struct nfs4_state *state)
+			   struct nfs4_state *state, struct nfs4_label *ilabel,
+			   struct nfs4_label *olabel)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs4_exception exception = {
@@ -2181,7 +2309,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 	};
 	int err;
 	do {
-		err = _nfs4_do_setattr(inode, cred, fattr, sattr, state);
+		err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
 		switch (err) {
 		case -NFS4ERR_OPENMODE:
 			if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -2426,14 +2554,18 @@ static struct inode *
 nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx, int open_flags, struct iattr *attr)
 {
 	struct nfs4_state *state;
+	struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
+
+	label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
 
 	/* Protect against concurrent sillydeletes */
-	state = nfs4_do_open(dir, ctx->dentry, ctx->mode, open_flags, attr,
-			ctx->cred, &ctx->mdsthreshold);
+	state = nfs4_do_open(dir, ctx, open_flags, attr, label);
+
+	nfs4_label_release_security(label);
+
 	if (IS_ERR(state))
 		return ERR_CAST(state);
-	ctx->state = state;
-	return igrab(state->inode);
+	return state->inode;
 }
 
 static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
@@ -2489,7 +2621,17 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 		server->caps |= NFS_CAP_CTIME;
 	if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
 		server->caps |= NFS_CAP_MTIME;
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+	if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)
+		server->caps |= NFS_CAP_SECURITY_LABEL;
+#endif
+	memcpy(server->attr_bitmask_nl, res.attr_bitmask,
+			sizeof(server->attr_bitmask));
 
+	if (server->caps & NFS_CAP_SECURITY_LABEL) {
+		server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+		res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+	}
 	memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
 	server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
 	server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
@@ -2515,8 +2657,9 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
 		struct nfs_fsinfo *info)
 {
+	u32 bitmask[3];
 	struct nfs4_lookup_root_arg args = {
-		.bitmask = nfs4_fattr_bitmap,
+		.bitmask = bitmask,
 	};
 	struct nfs4_lookup_res res = {
 		.server = server,
@@ -2529,6 +2672,13 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
 		.rpc_resp = &res,
 	};
 
+	bitmask[0] = nfs4_fattr_bitmap[0];
+	bitmask[1] = nfs4_fattr_bitmap[1];
+	/*
+	 * Process the label in the upcoming getfattr
+	 */
+	bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
+
 	nfs_fattr_init(info->fattr);
 	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
}
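_nfs4_lookup_root() above builds a private three-word copy of the attribute bitmap and clears FATTR4_WORD2_SECURITY_LABEL in word 2, deferring label processing to a later GETATTR. The sketch below shows the same word-and-bit arithmetic on a local copy; the bit position and bitmap contents are made up for the example.

#include <stdint.h>
#include <stdio.h>

#define WORD2_SECURITY_LABEL (1u << 16)	/* illustrative bit, not the real value */

int main(void)
{
	/* NFSv4 attribute bitmaps are arrays of 32-bit words; masking a
	 * capability means clearing one bit in the right word of a copy,
	 * leaving the shared template bitmap untouched. */
	uint32_t full[3]  = { 0x0001u, 0x0002u, WORD2_SECURITY_LABEL | 0x4u };
	uint32_t local[3];

	local[0] = full[0];
	local[1] = full[1];
	local[2] = full[2] & ~WORD2_SECURITY_LABEL;

	printf("word2: %#x -> %#x\n", full[2], local[2]);
	return 0;
}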
@@ -2648,6 +2798,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
 {
 	int error;
 	struct nfs_fattr *fattr = info->fattr;
+	struct nfs4_label *label = NULL;
 
 	error = nfs4_server_capabilities(server, mntfh);
 	if (error < 0) {
@@ -2655,16 +2806,23 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh,
 		return error;
 	}
 
-	error = nfs4_proc_getattr(server, mntfh, fattr);
+	label = nfs4_label_alloc(server, GFP_KERNEL);
+	if (IS_ERR(label))
+		return PTR_ERR(label);
+
+	error = nfs4_proc_getattr(server, mntfh, fattr, label);
 	if (error < 0) {
 		dprintk("nfs4_get_root: getattr error = %d\n", -error);
-		return error;
+		goto err_free_label;
 	}
 
 	if (fattr->valid & NFS_ATTR_FATTR_FSID &&
 	    !nfs_fsid_equal(&server->fsid, &fattr->fsid))
 		memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
 
+err_free_label:
+	nfs4_label_free(label);
+
 	return error;
 }
 
@@ -2711,7 +2869,8 @@ out:
 	return status;
 }
 
-static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+				struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs4_getattr_arg args = {
 		.fh = fhandle,
@@ -2719,6 +2878,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 	};
 	struct nfs4_getattr_res res = {
 		.fattr = fattr,
+		.label = label,
 		.server = server,
 	};
 	struct rpc_message msg = {
@@ -2726,18 +2886,21 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
 
+	args.bitmask = nfs4_bitmask(server, label);
+
 	nfs_fattr_init(fattr);
 	return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
 }
 
-static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
+				struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs4_exception exception = { };
 	int err;
 	do {
 		err = nfs4_handle_exception(server,
-				_nfs4_proc_getattr(server, fhandle, fattr),
+				_nfs4_proc_getattr(server, fhandle, fattr, label),
 				&exception);
 	} while (exception.retry);
 	return err;
@@ -2767,6 +2930,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	struct inode *inode = dentry->d_inode;
 	struct rpc_cred *cred = NULL;
 	struct nfs4_state *state = NULL;
+	struct nfs4_label *label = NULL;
 	int status;
 
 	if (pnfs_ld_layoutret_on_setattr(inode))
@@ -2793,15 +2957,22 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 		}
 	}
 
-	status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
-	if (status == 0)
+	label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (IS_ERR(label))
+		return PTR_ERR(label);
+
+	status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
+	if (status == 0) {
 		nfs_setattr_update_inode(inode, sattr);
+		nfs_setsecurity(inode, fattr, label);
+	}
+	nfs4_label_free(label);
 	return status;
 }
 
 static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
 		const struct qstr *name, struct nfs_fh *fhandle,
-		struct nfs_fattr *fattr)
+		struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	int status;
@@ -2813,6 +2984,7 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
 	struct nfs4_lookup_res res = {
 		.server = server,
 		.fattr = fattr,
+		.label = label,
 		.fh = fhandle,
 	};
 	struct rpc_message msg = {
@@ -2821,6 +2993,8 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
 		.rpc_resp = &res,
 	};
 
+	args.bitmask = nfs4_bitmask(server, label);
+
 	nfs_fattr_init(fattr);
 
 	dprintk("NFS call lookup %s\n", name->name);
@@ -2839,13 +3013,13 @@ static void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr)
 
 static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
 		struct qstr *name, struct nfs_fh *fhandle,
-		struct nfs_fattr *fattr)
+		struct nfs_fattr *fattr, struct nfs4_label *label)
 {
 	struct nfs4_exception exception = { };
 	struct rpc_clnt *client = *clnt;
 	int err;
 	do {
-		err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr);
+		err = _nfs4_proc_lookup(client, dir, name, fhandle, fattr, label);
 		switch (err) {
 		case -NFS4ERR_BADNAME:
 			err = -ENOENT;
@@ -2879,12 +3053,13 @@ out:
 }
 
 static int nfs4_proc_lookup(struct inode *dir, struct qstr *name,
-		struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+		struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+		struct nfs4_label *label)
 {
 	int status;
 	struct rpc_clnt *client = NFS_CLIENT(dir);
 
-	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
+	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, label);
 	if (client != NFS_CLIENT(dir)) {
 		rpc_shutdown_client(client);
 		nfs_fixup_secinfo_attributes(fattr);
@@ -2896,15 +3071,13 @@ struct rpc_clnt *
 nfs4_proc_lookup_mountpoint(struct inode *dir, struct qstr *name,
 			    struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
+	struct rpc_clnt *client = NFS_CLIENT(dir);
 	int status;
-	struct rpc_clnt *client = rpc_clone_client(NFS_CLIENT(dir));
 
-	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr);
-	if (status < 0) {
-		rpc_shutdown_client(client);
+	status = nfs4_proc_lookup_common(&client, dir, name, fhandle, fattr, NULL);
+	if (status < 0)
 		return ERR_PTR(status);
-	}
-	return client;
+	return (client == NFS_CLIENT(dir)) ? rpc_clone_client(client) : client;
 }
 
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry)
@@ -2924,7 +3097,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
 		.rpc_cred = entry->cred,
 	};
 	int mode = entry->mask;
-	int status;
+	int status = 0;
 
 	/*
 	 * Determine which access bits we want to ask for...
@@ -3029,6 +3202,7 @@ static int
 nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		 int flags)
 {
+	struct nfs4_label l, *ilabel = NULL;
 	struct nfs_open_context *ctx;
 	struct nfs4_state *state;
 	int status = 0;
@@ -3037,19 +3211,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
+	ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
+
 	sattr->ia_mode &= ~current_umask();
-	state = nfs4_do_open(dir, dentry, ctx->mode,
-			flags, sattr, ctx->cred,
-			&ctx->mdsthreshold);
-	d_drop(dentry);
+	state = nfs4_do_open(dir, ctx, flags, sattr, ilabel);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
 		goto out;
 	}
-	d_add(dentry, igrab(state->inode));
-	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-	ctx->state = state;
 out:
+	nfs4_label_release_security(ilabel);
 	put_nfs_open_context(ctx);
 	return status;
 }
@@ -3098,6 +3269,8 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
 	res->server = server;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 	nfs41_init_sequence(&args->seq_args, &res->seq_res, 1);
+
+	nfs_fattr_init(res->dir_attr);
 }
 
 static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
@@ -3173,7 +3346,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
 		.rpc_resp = &res,
 	};
 	int status = -ENOMEM;
-	
+
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (!status) {
 		update_changeattr(old_dir, &res.old_cinfo);
@@ -3207,6 +3380,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 	};
 	struct nfs4_link_res res = {
 		.server = server,
+		.label = NULL,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LINK],
@@ -3219,11 +3393,24 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 	if (res.fattr == NULL)
 		goto out;
 
+	res.label = nfs4_label_alloc(server, GFP_KERNEL);
+	if (IS_ERR(res.label)) {
+		status = PTR_ERR(res.label);
+		goto out;
+	}
+	arg.bitmask = nfs4_bitmask(server, res.label);
+
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (!status) {
 		update_changeattr(dir, &res.cinfo);
-		nfs_post_op_update_inode(inode, res.fattr);
+		status = nfs_post_op_update_inode(inode, res.fattr);
+		if (!status)
+			nfs_setsecurity(inode, res.fattr, res.label);
 	}
+
+
+	nfs4_label_free(res.label);
+
 out:
 	nfs_free_fattr(res.fattr);
 	return status;
@@ -3247,6 +3434,7 @@ struct nfs4_createdata {
 	struct nfs4_create_res res;
 	struct nfs_fh fh;
 	struct nfs_fattr fattr;
+	struct nfs4_label *label;
 };
 
 static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
@@ -3258,6 +3446,10 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
 	if (data != NULL) {
 		struct nfs_server *server = NFS_SERVER(dir);
 
+		data->label = nfs4_label_alloc(server, GFP_KERNEL);
+		if (IS_ERR(data->label))
+			goto out_free;
+
 		data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
 		data->msg.rpc_argp = &data->arg;
 		data->msg.rpc_resp = &data->res;
@@ -3266,13 +3458,17 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
 		data->arg.name = name;
 		data->arg.attrs = sattr;
 		data->arg.ftype = ftype;
-		data->arg.bitmask = server->attr_bitmask;
+		data->arg.bitmask = nfs4_bitmask(server, data->label);
 		data->res.server = server;
 		data->res.fh = &data->fh;
 		data->res.fattr = &data->fattr;
+		data->res.label = data->label;
 		nfs_fattr_init(data->res.fattr);
 	}
 	return data;
+out_free:
+	kfree(data);
+	return NULL;
 }
 
 static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
@@ -3281,18 +3477,20 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
 			&data->arg.seq_args, &data->res.seq_res, 1);
 	if (status == 0) {
 		update_changeattr(dir, &data->res.dir_cinfo);
-		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
 	}
 	return status;
 }
 
 static void nfs4_free_createdata(struct nfs4_createdata *data)
 {
+	nfs4_label_free(data->label);
 	kfree(data);
 }
 
 static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
-		struct page *page, unsigned int len, struct iattr *sattr)
+		struct page *page, unsigned int len, struct iattr *sattr,
+		struct nfs4_label *label)
 {
 	struct nfs4_createdata *data;
 	int status = -ENAMETOOLONG;
@@ -3308,6 +3506,7 @@ static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
 	data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
 	data->arg.u.symlink.pages = &page;
 	data->arg.u.symlink.len = len;
+	data->arg.label = label;
 
 	status = nfs4_do_create(dir, dentry, data);
 
@@ -3320,18 +3519,24 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
 		struct page *page, unsigned int len, struct iattr *sattr)
 {
 	struct nfs4_exception exception = { };
+	struct nfs4_label l, *label = NULL;
 	int err;
+
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dir),
 				_nfs4_proc_symlink(dir, dentry, page,
-							len, sattr),
+							len, sattr, label),
 				&exception);
 	} while (exception.retry);
+
+	nfs4_label_release_security(label);
 	return err;
 }
 
 static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
-		struct iattr *sattr)
+		struct iattr *sattr, struct nfs4_label *label)
 {
 	struct nfs4_createdata *data;
 	int status = -ENOMEM;
@@ -3340,6 +3545,7 @@ static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 	if (data == NULL)
 		goto out;
 
+	data->arg.label = label;
 	status = nfs4_do_create(dir, dentry, data);
 
 	nfs4_free_createdata(data);
@@ -3351,14 +3557,19 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 		struct iattr *sattr)
 {
 	struct nfs4_exception exception = { };
+	struct nfs4_label l, *label = NULL;
 	int err;
 
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
 	sattr->ia_mode &= ~current_umask();
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dir),
-				_nfs4_proc_mkdir(dir, dentry, sattr),
+				_nfs4_proc_mkdir(dir, dentry, sattr, label),
 				&exception);
 	} while (exception.retry);
+	nfs4_label_release_security(label);
+
 	return err;
 }
 
@@ -3416,7 +3627,7 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 }
 
 static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
-		struct iattr *sattr, dev_t rdev)
+		struct iattr *sattr, struct nfs4_label *label, dev_t rdev)
 {
 	struct nfs4_createdata *data;
 	int mode = sattr->ia_mode;
@@ -3441,7 +3652,8 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
 		status = -EINVAL;
 		goto out_free;
 	}
 
+	data->arg.label = label;
 	status = nfs4_do_create(dir, dentry, data);
 out_free:
 	nfs4_free_createdata(data);
@@ -3453,14 +3665,20 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
 		struct iattr *sattr, dev_t rdev)
 {
 	struct nfs4_exception exception = { };
+	struct nfs4_label l, *label = NULL;
 	int err;
 
+	label = nfs4_label_init_security(dir, dentry, sattr, &l);
+
 	sattr->ia_mode &= ~current_umask();
 	do {
 		err = nfs4_handle_exception(NFS_SERVER(dir),
-				_nfs4_proc_mknod(dir, dentry, sattr, rdev),
+				_nfs4_proc_mknod(dir, dentry, sattr, label, rdev),
 				&exception);
 	} while (exception.retry);
+
+	nfs4_label_release_security(label);
+
 	return err;
 }
 
@@ -4187,6 +4405,155 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 	return err;
 }
 
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static int _nfs4_get_security_label(struct inode *inode, void *buf,
+					size_t buflen)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_fattr fattr;
+	struct nfs4_label label = {0, 0, buflen, buf};
+
+	u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+	struct nfs4_getattr_arg args = {
+		.fh = NFS_FH(inode),
+		.bitmask = bitmask,
+	};
+	struct nfs4_getattr_res res = {
+		.fattr = &fattr,
+		.label = &label,
+		.server = server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int ret;
+
+	nfs_fattr_init(&fattr);
+
+	ret = rpc_call_sync(server->client, &msg, 0);
+	if (ret)
+		return ret;
+	if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
+		return -ENOENT;
+	if (buflen < label.len)
+		return -ERANGE;
+	return 0;
+}
+
+static int nfs4_get_security_label(struct inode *inode, void *buf,
+					size_t buflen)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+		return -EOPNOTSUPP;
+
+	do {
+		err = nfs4_handle_exception(NFS_SERVER(inode),
+				_nfs4_get_security_label(inode, buf, buflen),
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int _nfs4_do_set_security_label(struct inode *inode,
+		struct nfs4_label *ilabel,
+		struct nfs_fattr *fattr,
+		struct nfs4_label *olabel)
+{
+
+	struct iattr sattr = {0};
+	struct nfs_server *server = NFS_SERVER(inode);
+	const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
+	struct nfs_setattrargs args = {
+		.fh = NFS_FH(inode),
+		.iap = &sattr,
+		.server = server,
+		.bitmask = bitmask,
+		.label = ilabel,
+	};
+	struct nfs_setattrres res = {
+		.fattr = fattr,
+		.label = olabel,
+		.server = server,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int status;
+
+	nfs4_stateid_copy(&args.stateid, &zero_stateid);
+
+	status = rpc_call_sync(server->client, &msg, 0);
+	if (status)
+		dprintk("%s failed: %d\n", __func__, status);
+
+	return status;
+}
+
+static int nfs4_do_set_security_label(struct inode *inode,
+		struct nfs4_label *ilabel,
+		struct nfs_fattr *fattr,
+		struct nfs4_label *olabel)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = nfs4_handle_exception(NFS_SERVER(inode),
+				_nfs4_do_set_security_label(inode, ilabel,
+				fattr, olabel),
+				&exception);
+	} while (exception.retry);
+	return err;
+}
+
+static int
+nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+{
+	struct nfs4_label ilabel, *olabel = NULL;
+	struct nfs_fattr fattr;
+	struct rpc_cred *cred;
+	struct inode *inode = dentry->d_inode;
+	int status;
+
+	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
+		return -EOPNOTSUPP;
+
+	nfs_fattr_init(&fattr);
+
+	ilabel.pi = 0;
+	ilabel.lfs = 0;
+	ilabel.label = (char *)buf;
+	ilabel.len = buflen;
+
+	cred = rpc_lookup_cred();
+	if (IS_ERR(cred))
+		return PTR_ERR(cred);
+
+	olabel = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL);
+	if (IS_ERR(olabel)) {
+		status = PTR_ERR(olabel);
+		goto out;
+	}
+
+	status = nfs4_do_set_security_label(inode, &ilabel, &fattr, olabel);
+	if (status == 0)
+		nfs_setsecurity(inode, &fattr, olabel);
+
+	nfs4_label_free(olabel);
+out:
+	put_rpccred(cred);
+	return status;
+}
+#endif	/* CONFIG_NFS_V4_SECURITY_LABEL */
+
+
 static int
 nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
 {
@@ -4345,7 +4712,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 	/* cb_client4 */
 	rcu_read_lock();
 	setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
-				sizeof(setclientid.sc_netid),
+				sizeof(setclientid.sc_netid), "%s",
 				rpc_peeraddr2str(clp->cl_rpcclient,
 							RPC_DISPLAY_NETID));
 	rcu_read_unlock();
@@ -4528,7 +4895,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
 static unsigned long
 nfs4_set_lock_task_retry(unsigned long timeout)
 {
-	freezable_schedule_timeout_killable(timeout);
+	freezable_schedule_timeout_killable_unsafe(timeout);
 	timeout <<= 1;
 	if (timeout > NFS4_LOCK_MAXTIMEOUT)
 		return NFS4_LOCK_MAXTIMEOUT;
@@ -5056,13 +5423,18 @@ static int nfs41_check_expired_locks(struct nfs4_state *state)
 
 	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
 		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-			status = nfs41_test_stateid(server, &lsp->ls_stateid);
+			struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+			status = nfs41_test_stateid(server,
+					&lsp->ls_stateid,
+					cred);
 			if (status != NFS_OK) {
 				/* Free the stateid unless the server
 				 * informs us the stateid is unrecognized. */
 				if (status != -NFS4ERR_BAD_STATEID)
 					nfs41_free_stateid(server,
-							&lsp->ls_stateid);
+							&lsp->ls_stateid,
+							cred);
 				clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
 				ret = status;
 			}
@@ -5295,6 +5667,53 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
 	return len;
 }
 
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static inline int nfs4_server_supports_labels(struct nfs_server *server)
+{
+	return server->caps & NFS_CAP_SECURITY_LABEL;
+}
+
+static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
+				   const void *buf, size_t buflen,
+				   int flags, int type)
+{
+	if (security_ismaclabel(key))
+		return nfs4_set_security_label(dentry, buf, buflen);
+
+	return -EOPNOTSUPP;
+}
+
+static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
+				   void *buf, size_t buflen, int type)
+{
+	if (security_ismaclabel(key))
+		return nfs4_get_security_label(dentry->d_inode, buf, buflen);
+	return -EOPNOTSUPP;
+}
+
+static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
+				       size_t list_len, const char *name,
+				       size_t name_len, int type)
+{
+	size_t len = 0;
+
+	if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) {
+		len = security_inode_listsecurity(dentry->d_inode, NULL, 0);
+		if (list && len <= list_len)
+			security_inode_listsecurity(dentry->d_inode, list, len);
+	}
+	return len;
+}
+
+static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list = nfs4_xattr_list_nfs4_label,
+	.get = nfs4_xattr_get_nfs4_label,
+	.set = nfs4_xattr_set_nfs4_label,
+};
+#endif
+
+
 /*
  * nfs_fhget will use either the mounted_on_fileid or the fileid
 */
@@ -5318,7 +5737,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
5318 struct page *page) 5737 struct page *page)
5319{ 5738{
5320 struct nfs_server *server = NFS_SERVER(dir); 5739 struct nfs_server *server = NFS_SERVER(dir);
5321 u32 bitmask[2] = { 5740 u32 bitmask[3] = {
5322 [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, 5741 [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
5323 }; 5742 };
5324 struct nfs4_fs_locations_arg args = { 5743 struct nfs4_fs_locations_arg args = {
@@ -5505,7 +5924,8 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
5505 struct nfs41_exchange_id_args args = { 5924 struct nfs41_exchange_id_args args = {
5506 .verifier = &verifier, 5925 .verifier = &verifier,
5507 .client = clp, 5926 .client = clp,
5508 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER, 5927 .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
5928 EXCHGID4_FLAG_BIND_PRINC_STATEID,
5509 }; 5929 };
5510 struct nfs41_exchange_id_res res = { 5930 struct nfs41_exchange_id_res res = {
5511 0 5931 0
@@ -5762,17 +6182,14 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
5762 */ 6182 */
5763static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) 6183static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
5764{ 6184{
5765 struct nfs4_session *session = args->client->cl_session; 6185 unsigned int max_rqst_sz, max_resp_sz;
5766 unsigned int mxrqst_sz = session->fc_target_max_rqst_sz, 6186
5767 mxresp_sz = session->fc_target_max_resp_sz; 6187 max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead;
6188 max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead;
5768 6189
5769 if (mxrqst_sz == 0)
5770 mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
5771 if (mxresp_sz == 0)
5772 mxresp_sz = NFS_MAX_FILE_IO_SIZE;
5773 /* Fore channel attributes */ 6190 /* Fore channel attributes */
5774 args->fc_attrs.max_rqst_sz = mxrqst_sz; 6191 args->fc_attrs.max_rqst_sz = max_rqst_sz;
5775 args->fc_attrs.max_resp_sz = mxresp_sz; 6192 args->fc_attrs.max_resp_sz = max_resp_sz;
5776 args->fc_attrs.max_ops = NFS4_MAX_OPS; 6193 args->fc_attrs.max_ops = NFS4_MAX_OPS;
5777 args->fc_attrs.max_reqs = max_session_slots; 6194 args->fc_attrs.max_reqs = max_session_slots;
5778 6195
@@ -6159,12 +6576,14 @@ static const struct rpc_call_ops nfs4_reclaim_complete_call_ops = {
6159/* 6576/*
6160 * Issue a global reclaim complete. 6577 * Issue a global reclaim complete.
6161 */ 6578 */
6162static int nfs41_proc_reclaim_complete(struct nfs_client *clp) 6579static int nfs41_proc_reclaim_complete(struct nfs_client *clp,
6580 struct rpc_cred *cred)
6163{ 6581{
6164 struct nfs4_reclaim_complete_data *calldata; 6582 struct nfs4_reclaim_complete_data *calldata;
6165 struct rpc_task *task; 6583 struct rpc_task *task;
6166 struct rpc_message msg = { 6584 struct rpc_message msg = {
6167 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], 6585 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE],
6586 .rpc_cred = cred,
6168 }; 6587 };
6169 struct rpc_task_setup task_setup_data = { 6588 struct rpc_task_setup task_setup_data = {
6170 .rpc_client = clp->cl_rpcclient, 6589 .rpc_client = clp->cl_rpcclient,
@@ -6348,6 +6767,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
6348 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], 6767 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
6349 .rpc_argp = &lgp->args, 6768 .rpc_argp = &lgp->args,
6350 .rpc_resp = &lgp->res, 6769 .rpc_resp = &lgp->res,
6770 .rpc_cred = lgp->cred,
6351 }; 6771 };
6352 struct rpc_task_setup task_setup_data = { 6772 struct rpc_task_setup task_setup_data = {
6353 .rpc_client = server->client, 6773 .rpc_client = server->client,
@@ -6451,6 +6871,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
6451 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN], 6871 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
6452 .rpc_argp = &lrp->args, 6872 .rpc_argp = &lrp->args,
6453 .rpc_resp = &lrp->res, 6873 .rpc_resp = &lrp->res,
6874 .rpc_cred = lrp->cred,
6454 }; 6875 };
6455 struct rpc_task_setup task_setup_data = { 6876 struct rpc_task_setup task_setup_data = {
6456 .rpc_client = lrp->clp->cl_rpcclient, 6877 .rpc_client = lrp->clp->cl_rpcclient,
@@ -6520,7 +6941,9 @@ int nfs4_proc_getdevicelist(struct nfs_server *server,
6520EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); 6941EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
6521 6942
6522static int 6943static int
6523_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 6944_nfs4_proc_getdeviceinfo(struct nfs_server *server,
6945 struct pnfs_device *pdev,
6946 struct rpc_cred *cred)
6524{ 6947{
6525 struct nfs4_getdeviceinfo_args args = { 6948 struct nfs4_getdeviceinfo_args args = {
6526 .pdev = pdev, 6949 .pdev = pdev,
@@ -6532,6 +6955,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
6532 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO], 6955 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
6533 .rpc_argp = &args, 6956 .rpc_argp = &args,
6534 .rpc_resp = &res, 6957 .rpc_resp = &res,
6958 .rpc_cred = cred,
6535 }; 6959 };
6536 int status; 6960 int status;
6537 6961
@@ -6542,14 +6966,16 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
6542 return status; 6966 return status;
6543} 6967}
6544 6968
6545int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 6969int nfs4_proc_getdeviceinfo(struct nfs_server *server,
6970 struct pnfs_device *pdev,
6971 struct rpc_cred *cred)
6546{ 6972{
6547 struct nfs4_exception exception = { }; 6973 struct nfs4_exception exception = { };
6548 int err; 6974 int err;
6549 6975
6550 do { 6976 do {
6551 err = nfs4_handle_exception(server, 6977 err = nfs4_handle_exception(server,
6552 _nfs4_proc_getdeviceinfo(server, pdev), 6978 _nfs4_proc_getdeviceinfo(server, pdev, cred),
6553 &exception); 6979 &exception);
6554 } while (exception.retry); 6980 } while (exception.retry);
6555 return err; 6981 return err;
@@ -6733,7 +7159,9 @@ out:
6733 return err; 7159 return err;
6734} 7160}
6735 7161
6736static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) 7162static int _nfs41_test_stateid(struct nfs_server *server,
7163 nfs4_stateid *stateid,
7164 struct rpc_cred *cred)
6737{ 7165{
6738 int status; 7166 int status;
6739 struct nfs41_test_stateid_args args = { 7167 struct nfs41_test_stateid_args args = {
@@ -6744,6 +7172,7 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6744 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID], 7172 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_TEST_STATEID],
6745 .rpc_argp = &args, 7173 .rpc_argp = &args,
6746 .rpc_resp = &res, 7174 .rpc_resp = &res,
7175 .rpc_cred = cred,
6747 }; 7176 };
6748 7177
6749 dprintk("NFS call test_stateid %p\n", stateid); 7178 dprintk("NFS call test_stateid %p\n", stateid);
@@ -6764,17 +7193,20 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6764 * 7193 *
6765 * @server: server / transport on which to perform the operation 7194 * @server: server / transport on which to perform the operation
6766 * @stateid: state ID to test 7195 * @stateid: state ID to test
7196 * @cred: credential
6767 * 7197 *
6768 * Returns NFS_OK if the server recognizes that "stateid" is valid. 7198 * Returns NFS_OK if the server recognizes that "stateid" is valid.
6769 * Otherwise a negative NFS4ERR value is returned if the operation 7199 * Otherwise a negative NFS4ERR value is returned if the operation
6770 * failed or the state ID is not currently valid. 7200 * failed or the state ID is not currently valid.
6771 */ 7201 */
6772static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) 7202static int nfs41_test_stateid(struct nfs_server *server,
7203 nfs4_stateid *stateid,
7204 struct rpc_cred *cred)
6773{ 7205{
6774 struct nfs4_exception exception = { }; 7206 struct nfs4_exception exception = { };
6775 int err; 7207 int err;
6776 do { 7208 do {
6777 err = _nfs41_test_stateid(server, stateid); 7209 err = _nfs41_test_stateid(server, stateid, cred);
6778 if (err != -NFS4ERR_DELAY) 7210 if (err != -NFS4ERR_DELAY)
6779 break; 7211 break;
6780 nfs4_handle_exception(server, err, &exception); 7212 nfs4_handle_exception(server, err, &exception);
@@ -6823,10 +7255,12 @@ const struct rpc_call_ops nfs41_free_stateid_ops = {
6823 7255
6824static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, 7256static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
6825 nfs4_stateid *stateid, 7257 nfs4_stateid *stateid,
7258 struct rpc_cred *cred,
6826 bool privileged) 7259 bool privileged)
6827{ 7260{
6828 struct rpc_message msg = { 7261 struct rpc_message msg = {
6829 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], 7262 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID],
7263 .rpc_cred = cred,
6830 }; 7264 };
6831 struct rpc_task_setup task_setup = { 7265 struct rpc_task_setup task_setup = {
6832 .rpc_client = server->client, 7266 .rpc_client = server->client,
@@ -6859,16 +7293,19 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
6859 * 7293 *
6860 * @server: server / transport on which to perform the operation 7294 * @server: server / transport on which to perform the operation
6861 * @stateid: state ID to release 7295 * @stateid: state ID to release
7296 * @cred: credential
6862 * 7297 *
6863 * Returns NFS_OK if the server freed "stateid". Otherwise a 7298 * Returns NFS_OK if the server freed "stateid". Otherwise a
6864 * negative NFS4ERR value is returned. 7299 * negative NFS4ERR value is returned.
6865 */ 7300 */
6866static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) 7301static int nfs41_free_stateid(struct nfs_server *server,
7302 nfs4_stateid *stateid,
7303 struct rpc_cred *cred)
6867{ 7304{
6868 struct rpc_task *task; 7305 struct rpc_task *task;
6869 int ret; 7306 int ret;
6870 7307
6871 task = _nfs41_free_stateid(server, stateid, true); 7308 task = _nfs41_free_stateid(server, stateid, cred, true);
6872 if (IS_ERR(task)) 7309 if (IS_ERR(task))
6873 return PTR_ERR(task); 7310 return PTR_ERR(task);
6874 ret = rpc_wait_for_completion_task(task); 7311 ret = rpc_wait_for_completion_task(task);
@@ -6881,8 +7318,9 @@ static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6881static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) 7318static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
6882{ 7319{
6883 struct rpc_task *task; 7320 struct rpc_task *task;
7321 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
6884 7322
6885 task = _nfs41_free_stateid(server, &lsp->ls_stateid, false); 7323 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
6886 nfs4_free_lock_state(server, lsp); 7324 nfs4_free_lock_state(server, lsp);
6887 if (IS_ERR(task)) 7325 if (IS_ERR(task))
6888 return PTR_ERR(task); 7326 return PTR_ERR(task);
@@ -7004,11 +7442,33 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
7004}; 7442};
7005#endif 7443#endif
7006 7444
7445#if defined(CONFIG_NFS_V4_2)
7446static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
7447 .minor_version = 2,
7448 .init_caps = NFS_CAP_READDIRPLUS
7449 | NFS_CAP_ATOMIC_OPEN
7450 | NFS_CAP_CHANGE_ATTR
7451 | NFS_CAP_POSIX_LOCK
7452 | NFS_CAP_STATEID_NFSV41
7453 | NFS_CAP_ATOMIC_OPEN_V1,
7454 .call_sync = nfs4_call_sync_sequence,
7455 .match_stateid = nfs41_match_stateid,
7456 .find_root_sec = nfs41_find_root_sec,
7457 .free_lock_state = nfs41_free_lock_state,
7458 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
7459 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
7460 .state_renewal_ops = &nfs41_state_renewal_ops,
7461};
7462#endif
7463
7007const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { 7464const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
7008 [0] = &nfs_v4_0_minor_ops, 7465 [0] = &nfs_v4_0_minor_ops,
7009#if defined(CONFIG_NFS_V4_1) 7466#if defined(CONFIG_NFS_V4_1)
7010 [1] = &nfs_v4_1_minor_ops, 7467 [1] = &nfs_v4_1_minor_ops,
7011#endif 7468#endif
7469#if defined(CONFIG_NFS_V4_2)
7470 [2] = &nfs_v4_2_minor_ops,
7471#endif
7012}; 7472};
7013 7473
7014const struct inode_operations nfs4_dir_inode_operations = { 7474const struct inode_operations nfs4_dir_inode_operations = {
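
The new nfs_v4_2_minor_ops entry extends the pattern already used in this file: each minor version's behaviour lives in a const ops vector, and nfs_v4_minor_ops[], indexed by minor version and guarded by config options, selects it at mount time. A stand-alone sketch of that dispatch shape (the struct and callbacks are simplified stand-ins, not the kernel's full vector):

    #include <stdio.h>

    /* Hypothetical, much-reduced ops vector. */
    struct minor_version_ops {
        unsigned int minor_version;
        const char *(*describe)(void);
    };

    static const char *describe_v40(void) { return "NFSv4.0: no sessions"; }
    static const char *describe_v41(void) { return "NFSv4.1: sessions, pNFS"; }

    static const struct minor_version_ops v40_ops = { 0, describe_v40 };
    static const struct minor_version_ops v41_ops = { 1, describe_v41 };

    /* A NULL slot would mean the minor version was compiled out. */
    static const struct minor_version_ops *minor_ops[] = {
        [0] = &v40_ops,
        [1] = &v41_ops,
    };

    int main(void)
    {
        unsigned int i;

        for (i = 0; i < sizeof(minor_ops) / sizeof(minor_ops[0]); i++)
            if (minor_ops[i])
                printf("%u: %s\n", minor_ops[i]->minor_version,
                       minor_ops[i]->describe());
        return 0;
    }
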
@@ -7108,6 +7568,9 @@ static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
 
 const struct xattr_handler *nfs4_xattr_handlers[] = {
     &nfs4_xattr_nfs4_acl_handler,
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+    &nfs4_xattr_nfs4_label_handler,
+#endif
     NULL
 };
 
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index c4e225e4a9af..36e21cb29d65 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -478,48 +478,12 @@ static int nfs41_check_session_ready(struct nfs_client *clp)
     return 0;
 }
 
-int nfs4_init_session(struct nfs_server *server)
+int nfs4_init_session(struct nfs_client *clp)
 {
-    struct nfs_client *clp = server->nfs_client;
-    struct nfs4_session *session;
-    unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE;
-    unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE;
-
     if (!nfs4_has_session(clp))
         return 0;
 
-    if (server->rsize != 0)
-        target_max_resp_sz = server->rsize;
-    target_max_resp_sz += nfs41_maxread_overhead;
-
-    if (server->wsize != 0)
-        target_max_rqst_sz = server->wsize;
-    target_max_rqst_sz += nfs41_maxwrite_overhead;
-
-    session = clp->cl_session;
-    spin_lock(&clp->cl_lock);
-    if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
-        /* Initialise targets and channel attributes */
-        session->fc_target_max_rqst_sz = target_max_rqst_sz;
-        session->fc_attrs.max_rqst_sz = target_max_rqst_sz;
-        session->fc_target_max_resp_sz = target_max_resp_sz;
-        session->fc_attrs.max_resp_sz = target_max_resp_sz;
-    } else {
-        /* Just adjust the targets */
-        if (target_max_rqst_sz > session->fc_target_max_rqst_sz) {
-            session->fc_target_max_rqst_sz = target_max_rqst_sz;
-            set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-        }
-        if (target_max_resp_sz > session->fc_target_max_resp_sz) {
-            session->fc_target_max_resp_sz = target_max_resp_sz;
-            set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
-        }
-    }
-    spin_unlock(&clp->cl_lock);
-
-    if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
-        nfs4_schedule_lease_recovery(clp);
-
+    clear_bit(NFS4_SESSION_INITING, &clp->cl_session->session_state);
     return nfs41_check_session_ready(clp);
 }
 
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index ff7d9f0f8a65..3a153d82b90c 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -66,9 +66,6 @@ struct nfs4_session {
     struct nfs4_channel_attrs bc_attrs;
     struct nfs4_slot_table bc_slot_table;
     struct nfs_client *clp;
-    /* Create session arguments */
-    unsigned int fc_target_max_rqst_sz;
-    unsigned int fc_target_max_resp_sz;
 };
 
 enum nfs4_session_state {
@@ -89,7 +86,7 @@ extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
 
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
 extern void nfs4_destroy_session(struct nfs4_session *session);
-extern int nfs4_init_session(struct nfs_server *server);
+extern int nfs4_init_session(struct nfs_client *clp);
 extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
 
 extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
@@ -122,7 +119,7 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
 
 #else /* defined(CONFIG_NFS_V4_1) */
 
-static inline int nfs4_init_session(struct nfs_server *server)
+static inline int nfs4_init_session(struct nfs_client *clp)
 {
     return 0;
 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1fab140764c4..e22862f13564 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -228,19 +228,8 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
     return status;
 }
 
-/*
- * Back channel returns NFS4ERR_DELAY for new requests when
- * NFS4_SESSION_DRAINING is set so there is no work to be done when draining
- * is ended.
- */
-static void nfs4_end_drain_session(struct nfs_client *clp)
+static void nfs4_end_drain_slot_table(struct nfs4_slot_table *tbl)
 {
-    struct nfs4_session *ses = clp->cl_session;
-    struct nfs4_slot_table *tbl;
-
-    if (ses == NULL)
-        return;
-    tbl = &ses->fc_slot_table;
     if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
         spin_lock(&tbl->slot_tbl_lock);
         nfs41_wake_slot_table(tbl);
@@ -248,6 +237,16 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
     }
 }
 
+static void nfs4_end_drain_session(struct nfs_client *clp)
+{
+    struct nfs4_session *ses = clp->cl_session;
+
+    if (ses != NULL) {
+        nfs4_end_drain_slot_table(&ses->bc_slot_table);
+        nfs4_end_drain_slot_table(&ses->fc_slot_table);
+    }
+}
+
 /*
  * Signal state manager thread if session fore channel is drained
  */
@@ -1194,7 +1193,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
     snprintf(buf, sizeof(buf), "%s-manager",
             rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
     rcu_read_unlock();
-    task = kthread_run(nfs4_run_state_manager, clp, buf);
+    task = kthread_run(nfs4_run_state_manager, clp, "%s", buf);
     if (IS_ERR(task)) {
         printk(KERN_ERR "%s: kthread_run: %ld\n",
             __func__, PTR_ERR(task));
@@ -1373,13 +1372,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
     /* Guard against delegation returns and new lock/unlock calls */
     down_write(&nfsi->rwsem);
     /* Protect inode->i_flock using the BKL */
-    lock_flocks();
+    spin_lock(&inode->i_lock);
     for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
         if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
             continue;
         if (nfs_file_open_context(fl->fl_file)->state != state)
             continue;
-        unlock_flocks();
+        spin_unlock(&inode->i_lock);
         status = ops->recover_lock(state, fl);
         switch (status) {
         case 0:
@@ -1406,9 +1405,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
             /* kill_proc(fl->fl_pid, SIGLOST, 1); */
             status = 0;
         }
-        lock_flocks();
+        spin_lock(&inode->i_lock);
     }
-    unlock_flocks();
+    spin_unlock(&inode->i_lock);
 out:
     up_write(&nfsi->rwsem);
     return status;
@@ -1563,11 +1562,12 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
 }
 
 static void nfs4_reclaim_complete(struct nfs_client *clp,
-                 const struct nfs4_state_recovery_ops *ops)
+                 const struct nfs4_state_recovery_ops *ops,
+                 struct rpc_cred *cred)
 {
     /* Notify the server we're done reclaiming our state */
     if (ops->reclaim_complete)
-        (void)ops->reclaim_complete(clp);
+        (void)ops->reclaim_complete(clp, cred);
 }
 
 static void nfs4_clear_reclaim_server(struct nfs_server *server)
@@ -1612,9 +1612,15 @@ static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp)
 
 static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 {
+    const struct nfs4_state_recovery_ops *ops;
+    struct rpc_cred *cred;
+
     if (!nfs4_state_clear_reclaim_reboot(clp))
         return;
-    nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
+    ops = clp->cl_mvops->reboot_recovery_ops;
+    cred = ops->get_clid_cred(clp);
+    nfs4_reclaim_complete(clp, ops, cred);
+    put_rpccred(cred);
 }
 
 static void nfs_delegation_clear_all(struct nfs_client *clp)
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index a5e1a3026d48..5dbe2d269210 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -9,6 +9,7 @@
 #include "delegation.h"
 #include "internal.h"
 #include "nfs4_fs.h"
+#include "dns_resolve.h"
 #include "pnfs.h"
 #include "nfs.h"
 
@@ -331,18 +332,24 @@ static int __init init_nfs_v4(void)
 {
     int err;
 
-    err = nfs_idmap_init();
+    err = nfs_dns_resolver_init();
     if (err)
         goto out;
 
-    err = nfs4_register_sysctl();
+    err = nfs_idmap_init();
     if (err)
         goto out1;
 
+    err = nfs4_register_sysctl();
+    if (err)
+        goto out2;
+
     register_nfs_version(&nfs_v4);
     return 0;
-out1:
+out2:
     nfs_idmap_quit();
+out1:
+    nfs_dns_resolver_destroy();
 out:
     return err;
 }
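
The reworked init_nfs_v4() keeps the usual kernel error-unwind discipline: each successfully initialised subsystem gets a label, and failure at step N jumps to the label that tears down steps N-1 back to 1 in reverse order. A compilable sketch of the same ladder (the *_init/*_destroy functions are stand-ins, with the last step forced to fail so the unwind runs):

    #include <stdio.h>

    /* Stand-in init/teardown pairs; each init returns 0 on success. */
    static int resolver_init(void)     { puts("resolver up"); return 0; }
    static void resolver_destroy(void) { puts("resolver down"); }
    static int idmap_init(void)        { puts("idmap up"); return 0; }
    static void idmap_quit(void)       { puts("idmap down"); }
    static int sysctl_register(void)   { puts("sysctl fails"); return -1; }

    static int init_example(void)
    {
        int err;

        err = resolver_init();
        if (err)
            goto out;
        err = idmap_init();
        if (err)
            goto out1;
        err = sysctl_register();
        if (err)
            goto out2;
        return 0;
    out2:
        idmap_quit();        /* undo step 2 */
    out1:
        resolver_destroy();  /* undo step 1 */
    out:
        return err;
    }

    int main(void)
    {
        return init_example() ? 1 : 0;
    }
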
@@ -352,6 +359,7 @@ static void __exit exit_nfs_v4(void)
     unregister_nfs_version(&nfs_v4);
     nfs4_unregister_sysctl();
     nfs_idmap_quit();
+    nfs_dns_resolver_destroy();
 }
 
 MODULE_LICENSE("GPL");
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4be8d135ed61..3850b018815f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -102,12 +102,23 @@ static int nfs4_stat_to_errno(int);
 #define nfs4_path_maxsz    (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
 #define nfs4_owner_maxsz   (1 + XDR_QUADLEN(IDMAP_NAMESZ))
 #define nfs4_group_maxsz   (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
+#define nfs4_label_maxsz   (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
+#define encode_readdir_space 24
+#define encode_readdir_bitmask_sz 3
+#else
+#define nfs4_label_maxsz   0
+#define encode_readdir_space 20
+#define encode_readdir_bitmask_sz 2
+#endif
 /* We support only one layout type per file system */
 #define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
 /* This is based on getfattr, which uses the most attributes: */
 #define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \
                 3 + 3 + 3 + nfs4_owner_maxsz + \
-                nfs4_group_maxsz + decode_mdsthreshold_maxsz))
+                nfs4_group_maxsz + nfs4_label_maxsz + \
+                decode_mdsthreshold_maxsz))
 #define nfs4_fattr_maxsz   (nfs4_fattr_bitmap_maxsz + \
                 nfs4_fattr_value_maxsz)
 #define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz)
@@ -115,6 +126,7 @@ static int nfs4_stat_to_errno(int);
                 1 + 2 + 1 + \
                 nfs4_owner_maxsz + \
                 nfs4_group_maxsz + \
+                nfs4_label_maxsz + \
                 4 + 4)
 #define encode_savefh_maxsz (op_encode_hdr_maxsz)
 #define decode_savefh_maxsz (op_decode_hdr_maxsz)
@@ -192,9 +204,11 @@ static int nfs4_stat_to_errno(int);
                 encode_stateid_maxsz + 3)
 #define decode_read_maxsz  (op_decode_hdr_maxsz + 2)
 #define encode_readdir_maxsz (op_encode_hdr_maxsz + \
-                2 + encode_verifier_maxsz + 5)
+                2 + encode_verifier_maxsz + 5 + \
+                nfs4_label_maxsz)
 #define decode_readdir_maxsz (op_decode_hdr_maxsz + \
-                decode_verifier_maxsz)
+                decode_verifier_maxsz + \
+                nfs4_label_maxsz + nfs4_fattr_maxsz)
 #define encode_readlink_maxsz (op_encode_hdr_maxsz)
 #define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
 #define encode_write_maxsz (op_encode_hdr_maxsz + \
@@ -853,6 +867,12 @@ const u32 nfs41_maxread_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
                      decode_sequence_maxsz +
                      decode_putfh_maxsz) *
                     XDR_UNIT);
+
+const u32 nfs41_maxgetdevinfo_overhead = ((RPC_MAX_REPHEADER_WITH_AUTH +
+                       compound_decode_hdr_maxsz +
+                       decode_sequence_maxsz) *
+                      XDR_UNIT);
+EXPORT_SYMBOL_GPL(nfs41_maxgetdevinfo_overhead);
 #endif /* CONFIG_NFS_V4_1 */
 
 static const umode_t nfs_type2fmt[] = {
@@ -968,7 +988,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
     encode_opaque_fixed(xdr, verf->data, NFS4_VERIFIER_SIZE);
 }
 
-static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
+static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
+                const struct nfs4_label *label,
+                const struct nfs_server *server)
 {
     char owner_name[IDMAP_NAMESZ];
     char owner_group[IDMAP_NAMESZ];
@@ -977,17 +999,19 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
     __be32 *p;
     __be32 *q;
     int len;
+    uint32_t bmval_len = 2;
     uint32_t bmval0 = 0;
     uint32_t bmval1 = 0;
+    uint32_t bmval2 = 0;
 
     /*
      * We reserve enough space to write the entire attribute buffer at once.
      * In the worst-case, this would be
-     *   12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
-     * = 36 bytes, plus any contribution from variable-length fields
+     *   16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
+     * = 40 bytes, plus any contribution from variable-length fields
      * such as owner/group.
      */
-    len = 16;
+    len = 8;
 
     /* Sigh */
     if (iap->ia_valid & ATTR_SIZE)
@@ -1025,15 +1049,22 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
         len += 16;
     else if (iap->ia_valid & ATTR_MTIME)
         len += 4;
+    if (label) {
+        len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
+        bmval_len = 3;
+    }
+
+    len += bmval_len << 2;
     p = reserve_space(xdr, len);
 
     /*
      * We write the bitmap length now, but leave the bitmap and the attribute
      * buffer length to be backfilled at the end of this routine.
     */
-    *p++ = cpu_to_be32(2);
+    *p++ = cpu_to_be32(bmval_len);
     q = p;
-    p += 3;
+    /* Skip bitmap entries + attrlen */
+    p += bmval_len + 1;
 
     if (iap->ia_valid & ATTR_SIZE) {
         bmval0 |= FATTR4_WORD0_SIZE;
@@ -1071,6 +1102,13 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
         bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
         *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
     }
+    if (label) {
+        bmval2 |= FATTR4_WORD2_SECURITY_LABEL;
+        *p++ = cpu_to_be32(label->lfs);
+        *p++ = cpu_to_be32(label->pi);
+        *p++ = cpu_to_be32(label->len);
+        p = xdr_encode_opaque_fixed(p, label->label, label->len);
+    }
 
     /*
      * Now we backfill the bitmap and the attribute buffer length.
@@ -1080,9 +1118,11 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
             len, ((char *)p - (char *)q) + 4);
         BUG();
     }
-    len = (char *)p - (char *)q - 12;
     *q++ = htonl(bmval0);
     *q++ = htonl(bmval1);
+    if (bmval_len == 3)
+        *q++ = htonl(bmval2);
+    len = (char *)p - (char *)(q + 1);
     *q = htonl(len);
 
 /* out: */
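
encode_attrs() relies on a reserve-then-backfill pattern: the bitmap word count is known up front, so space for the bitmap words and the attribute length is skipped (q remembers where), the attribute body is emitted, and only afterwards are the bitmap and byte length written back; the change above just makes the skipped region bmval_len + 1 words instead of a fixed 3. A self-contained sketch of the same pattern for a fixed two-word bitmap (the buffer layout and names are illustrative, not the kernel's XDR stream API):

    #include <stdint.h>
    #include <stdio.h>
    #include <arpa/inet.h>  /* htonl, ntohl */

    /* Encode a bitmap-prefixed attribute blob, backfilling the byte count
     * once the body has been written. */
    static size_t encode_example(uint32_t *buf, uint32_t bmval0, uint32_t bmval1,
                                 const uint32_t *body, size_t body_words)
    {
        uint32_t *p = buf, *q;
        size_t i;

        *p++ = htonl(2);   /* bitmap word count, known up front */
        q = p;             /* remember where bitmap + attrlen go */
        p += 2 + 1;        /* skip two bitmap words and the length */
        for (i = 0; i < body_words; i++)
            *p++ = htonl(body[i]);

        /* Backfill: the bitmap words, then the body length in bytes. */
        *q++ = htonl(bmval0);
        *q++ = htonl(bmval1);
        *q = htonl((uint32_t)((char *)p - (char *)(q + 1)));
        return (size_t)(p - buf);
    }

    int main(void)
    {
        uint32_t buf[16];
        const uint32_t body[] = { 0644, 42 };
        size_t n = encode_example(buf, 0x1, 0x2, body, 2);

        printf("encoded %zu words, attrlen=%u bytes\n", n, ntohl(buf[3]));
        return 0;  /* encoded 6 words, attrlen=8 bytes */
    }
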
@@ -1136,7 +1176,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
     }
 
     encode_string(xdr, create->name->len, create->name->name);
-    encode_attrs(xdr, create->attrs, create->server);
+    encode_attrs(xdr, create->attrs, create->label, create->server);
 }
 
 static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1188,8 +1228,10 @@ encode_getattr_three(struct xdr_stream *xdr,
 
 static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
 {
-    encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
-               bitmask[1] & nfs4_fattr_bitmap[1], hdr);
+    encode_getattr_three(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
+               bitmask[1] & nfs4_fattr_bitmap[1],
+               bitmask[2] & nfs4_fattr_bitmap[2],
+               hdr);
 }
 
 static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask,
@@ -1367,11 +1409,11 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
     switch(arg->createmode) {
     case NFS4_CREATE_UNCHECKED:
         *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
-        encode_attrs(xdr, arg->u.attrs, arg->server);
+        encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
         break;
     case NFS4_CREATE_GUARDED:
         *p = cpu_to_be32(NFS4_CREATE_GUARDED);
-        encode_attrs(xdr, arg->u.attrs, arg->server);
+        encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
         break;
     case NFS4_CREATE_EXCLUSIVE:
         *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1381,7 +1423,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
         *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
         encode_nfs4_verifier(xdr, &arg->u.verifier);
         dummy.ia_valid = 0;
-        encode_attrs(xdr, &dummy, arg->server);
+        encode_attrs(xdr, &dummy, arg->label, arg->server);
     }
 }
 
@@ -1532,7 +1574,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 
 static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
 {
-    uint32_t attrs[2] = {
+    uint32_t attrs[3] = {
         FATTR4_WORD0_RDATTR_ERROR,
         FATTR4_WORD1_MOUNTED_ON_FILEID,
     };
@@ -1555,20 +1597,26 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
     encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
     encode_uint64(xdr, readdir->cookie);
     encode_nfs4_verifier(xdr, &readdir->verifier);
-    p = reserve_space(xdr, 20);
+    p = reserve_space(xdr, encode_readdir_space);
     *p++ = cpu_to_be32(dircount);
     *p++ = cpu_to_be32(readdir->count);
-    *p++ = cpu_to_be32(2);
-
+    *p++ = cpu_to_be32(encode_readdir_bitmask_sz);
     *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
     *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
+    if (encode_readdir_bitmask_sz > 2) {
+        if (hdr->minorversion > 1)
+            attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
+        p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
+    }
     memcpy(verf, readdir->verifier.data, sizeof(verf));
-    dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
+
+    dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
         __func__,
         (unsigned long long)readdir->cookie,
         verf[0], verf[1],
         attrs[0] & readdir->bitmask[0],
-        attrs[1] & readdir->bitmask[1]);
+        attrs[1] & readdir->bitmask[1],
+        attrs[2] & readdir->bitmask[2]);
 }
 
 static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
@@ -1627,7 +1675,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
 {
     encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
     encode_nfs4_stateid(xdr, &arg->stateid);
-    encode_attrs(xdr, arg->iap, server);
+    encode_attrs(xdr, arg->iap, arg->label, server);
 }
 
 static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -1889,7 +1937,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
     p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
                     NFS4_DEVICEID4_SIZE);
     *p++ = cpu_to_be32(args->pdev->layout_type);
-    *p++ = cpu_to_be32(args->pdev->pglen);     /* gdia_maxcount */
+    *p++ = cpu_to_be32(args->pdev->maxcount);  /* gdia_maxcount */
     *p++ = cpu_to_be32(0);                     /* bitmap length 0 */
 }
 
@@ -4038,6 +4086,56 @@ static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap,
     return status;
 }
 
+static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
+                    struct nfs4_label *label)
+{
+    uint32_t pi = 0;
+    uint32_t lfs = 0;
+    __u32 len;
+    __be32 *p;
+    int status = 0;
+
+    if (unlikely(bitmap[2] & (FATTR4_WORD2_SECURITY_LABEL - 1U)))
+        return -EIO;
+    if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) {
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+            goto out_overflow;
+        lfs = be32_to_cpup(p++);
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+            goto out_overflow;
+        pi = be32_to_cpup(p++);
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+            goto out_overflow;
+        len = be32_to_cpup(p++);
+        p = xdr_inline_decode(xdr, len);
+        if (unlikely(!p))
+            goto out_overflow;
+        if (len < NFS4_MAXLABELLEN) {
+            if (label) {
+                memcpy(label->label, p, len);
+                label->len = len;
+                label->pi = pi;
+                label->lfs = lfs;
+                status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
+            }
+            bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+        } else
+            printk(KERN_WARNING "%s: label too long (%u)!\n",
+                    __func__, len);
+    }
+    if (label && label->label)
+        dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__,
+            (char *)label->label, label->len, label->pi, label->lfs);
+    return status;
+
+out_overflow:
+    print_overflow_msg(__func__, xdr);
+    return -EIO;
+}
+
 static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
 {
     int status = 0;
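
decode_attr_security_label() reads three fixed words (LFS, PI, length) and then a variable-length opaque, and refuses any label whose advertised length is not strictly below NFS4_MAXLABELLEN rather than silently truncating it. The same bounds discipline in a stand-alone form (host-order lengths and all names here are simplifications for the sketch, not the kernel's XDR helpers):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define MAXLABELLEN 16  /* stand-in for NFS4_MAXLABELLEN */

    /* Reject, rather than truncate, any opaque whose advertised length
     * does not fit the caller's buffer or the remaining input. */
    static int decode_opaque(const uint8_t *in, size_t in_len,
                             uint8_t *out, uint32_t *out_len)
    {
        uint32_t len;

        if (in_len < 4)
            return -1;             /* short buffer */
        memcpy(&len, in, 4);       /* host order, for the sketch */
        if (len >= MAXLABELLEN)
            return -1;             /* too long: refuse outright */
        if (in_len - 4 < len)
            return -1;             /* advertised length overruns input */
        memcpy(out, in + 4, len);
        *out_len = len;
        return 0;
    }

    int main(void)
    {
        uint8_t wire[4 + 7];
        uint8_t label[MAXLABELLEN];
        uint32_t len = 7, n;

        memcpy(wire, &len, 4);
        memcpy(wire + 4, "system0", 7);
        if (decode_opaque(wire, sizeof(wire), label, &n) == 0)
            printf("decoded %u-byte label: %.7s\n", n, label);
        return 0;
    }
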
@@ -4380,7 +4478,7 @@ out_overflow:
 
 static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
         struct nfs_fattr *fattr, struct nfs_fh *fh,
-        struct nfs4_fs_locations *fs_loc,
+        struct nfs4_fs_locations *fs_loc, struct nfs4_label *label,
         const struct nfs_server *server)
 {
     int status;
@@ -4488,6 +4586,13 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
     if (status < 0)
         goto xdr_error;
 
+    if (label) {
+        status = decode_attr_security_label(xdr, bitmap, label);
+        if (status < 0)
+            goto xdr_error;
+        fattr->valid |= status;
+    }
+
 xdr_error:
     dprintk("%s: xdr returned %d\n", __func__, -status);
     return status;
@@ -4495,7 +4600,7 @@ xdr_error:
 
 static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
         struct nfs_fh *fh, struct nfs4_fs_locations *fs_loc,
-        const struct nfs_server *server)
+        struct nfs4_label *label, const struct nfs_server *server)
 {
     unsigned int savep;
     uint32_t attrlen,
@@ -4514,7 +4619,8 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
     if (status < 0)
         goto xdr_error;
 
-    status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc, server);
+    status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, fs_loc,
+                    label, server);
     if (status < 0)
         goto xdr_error;
 
@@ -4524,10 +4630,16 @@ xdr_error:
     return status;
 }
 
+static int decode_getfattr_label(struct xdr_stream *xdr, struct nfs_fattr *fattr,
+        struct nfs4_label *label, const struct nfs_server *server)
+{
+    return decode_getfattr_generic(xdr, fattr, NULL, NULL, label, server);
+}
+
 static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
         const struct nfs_server *server)
 {
-    return decode_getfattr_generic(xdr, fattr, NULL, NULL, server);
+    return decode_getfattr_generic(xdr, fattr, NULL, NULL, NULL, server);
 }
 
 /*
@@ -5919,7 +6031,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
     status = decode_getfh(xdr, res->fh);
     if (status)
         goto out;
-    status = decode_getfattr(xdr, res->fattr, res->server);
+    status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
     return status;
 }
@@ -5945,7 +6057,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
         goto out;
     status = decode_getfh(xdr, res->fh);
     if (status == 0)
-        status = decode_getfattr(xdr, res->fattr, res->server);
+        status = decode_getfattr_label(xdr, res->fattr,
+                        res->label, res->server);
 out:
     return status;
 }
@@ -6036,7 +6149,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
     status = decode_restorefh(xdr);
     if (status)
         goto out;
-    decode_getfattr(xdr, res->fattr, res->server);
+    decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
     return status;
 }
@@ -6065,7 +6178,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
     status = decode_getfh(xdr, res->fh);
     if (status)
         goto out;
-    decode_getfattr(xdr, res->fattr, res->server);
+    decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
     return status;
 }
@@ -6097,7 +6210,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
     status = decode_putfh(xdr);
     if (status)
         goto out;
-    status = decode_getfattr(xdr, res->fattr, res->server);
+    status = decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
     return status;
 }
@@ -6230,7 +6343,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
         goto out;
     if (res->access_request)
         decode_access(xdr, &res->access_supported, &res->access_result);
-    decode_getfattr(xdr, res->f_attr, res->server);
+    decode_getfattr_label(xdr, res->f_attr, res->f_label, res->server);
 out:
     return status;
 }
@@ -6307,7 +6420,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
     status = decode_setattr(xdr);
     if (status)
         goto out;
-    decode_getfattr(xdr, res->fattr, res->server);
+    decode_getfattr_label(xdr, res->fattr, res->label, res->server);
 out:
     return status;
 }
@@ -6696,7 +6809,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
     xdr_enter_page(xdr, PAGE_SIZE);
     status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
                      NULL, res->fs_locations,
-                     res->fs_locations->server);
+                     NULL, res->fs_locations->server);
 out:
     return status;
 }
@@ -7109,7 +7222,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
         goto out_overflow;
 
     if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
-                  NULL, entry->server) < 0)
+                  NULL, entry->label, entry->server) < 0)
         goto out_overflow;
     if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
         entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index a9ebd817278b..e4f9cbfec67b 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -613,8 +613,10 @@ int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
     pd.pgbase = 0;
     pd.pglen = PAGE_SIZE;
     pd.mincount = 0;
+    pd.maxcount = PAGE_SIZE;
 
-    err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+    err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
+            pnfslay->plh_lc_cred);
     dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
     if (err)
         goto err_out;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c5bd758e5637..3a3a79d6bf15 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -360,7 +360,7 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 }
 EXPORT_SYMBOL_GPL(pnfs_put_lseg);
 
-static inline u64
+static u64
 end_offset(u64 start, u64 len)
 {
     u64 end;
@@ -376,9 +376,9 @@ end_offset(u64 start, u64 len)
  *         start2           end2
  *         [----------------)
  */
-static inline int
-lo_seg_contained(struct pnfs_layout_range *l1,
-         struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
+         const struct pnfs_layout_range *l2)
 {
     u64 start1 = l1->offset;
     u64 end1 = end_offset(start1, l1->length);
@@ -395,9 +395,9 @@ lo_seg_contained(struct pnfs_layout_range *l1,
  *                  start2           end2
  *                  [----------------)
  */
-static inline int
-lo_seg_intersecting(struct pnfs_layout_range *l1,
-         struct pnfs_layout_range *l2)
+static bool
+pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
+         const struct pnfs_layout_range *l2)
 {
     u64 start1 = l1->offset;
     u64 end1 = end_offset(start1, l1->length);
@@ -409,12 +409,12 @@ lo_seg_intersecting(struct pnfs_layout_range *l1,
 }
 
 static bool
-should_free_lseg(struct pnfs_layout_range *lseg_range,
-         struct pnfs_layout_range *recall_range)
+should_free_lseg(const struct pnfs_layout_range *lseg_range,
+         const struct pnfs_layout_range *recall_range)
 {
     return (recall_range->iomode == IOMODE_ANY ||
         lseg_range->iomode == recall_range->iomode) &&
-        lo_seg_intersecting(lseg_range, recall_range);
+        pnfs_lseg_range_intersecting(lseg_range, recall_range);
 }
 
 static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
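
The renamed pnfs_lseg_range_* helpers all reduce to half-open interval arithmetic in which a length that overflows the offset means "to end of file", hence the clamping in end_offset(). A user-space sketch of the containment and intersection tests under that convention (struct and names simplified, not the kernel's pnfs_layout_range):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Half-open byte ranges [offset, offset+length). */
    struct range { uint64_t offset, length; };

    static uint64_t range_end(const struct range *r)
    {
        uint64_t end = r->offset + r->length;
        return end >= r->offset ? end : UINT64_MAX;  /* clamp on overflow */
    }

    static bool range_intersecting(const struct range *a, const struct range *b)
    {
        return range_end(a) > b->offset && range_end(b) > a->offset;
    }

    /* Does a fully contain b? */
    static bool range_contained(const struct range *a, const struct range *b)
    {
        return a->offset <= b->offset && range_end(b) <= range_end(a);
    }

    int main(void)
    {
        struct range whole = { 0, UINT64_MAX };  /* "to end of file" */
        struct range mid = { 4096, 8192 };

        printf("intersect: %d, contained: %d\n",
               range_intersecting(&whole, &mid),
               range_contained(&whole, &mid));
        return 0;  /* intersect: 1, contained: 1 */
    }
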
@@ -766,6 +766,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
     lgp->args.inode = ino;
     lgp->args.ctx = get_nfs_open_context(ctx);
     lgp->gfp_flags = gfp_flags;
+    lgp->cred = lo->plh_lc_cred;
 
     /* Synchronously retrieve layout information from server and
      * store in lseg.
@@ -860,6 +861,7 @@ _pnfs_return_layout(struct inode *ino)
     lrp->args.inode = ino;
     lrp->args.layout = lo;
     lrp->clp = NFS_SERVER(ino)->nfs_client;
+    lrp->cred = lo->plh_lc_cred;
 
     status = nfs4_proc_layoutreturn(lrp);
 out:
@@ -984,8 +986,8 @@ out:
  * are seen first.
  */
 static s64
-cmp_layout(struct pnfs_layout_range *l1,
-       struct pnfs_layout_range *l2)
+pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
+       const struct pnfs_layout_range *l2)
 {
     s64 d;
 
@@ -1012,7 +1014,7 @@ pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
     dprintk("%s:Begin\n", __func__);
 
     list_for_each_entry(lp, &lo->plh_segs, pls_list) {
-        if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
+        if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
             continue;
         list_add_tail(&lseg->pls_list, &lp->pls_list);
         dprintk("%s: inserted lseg %p "
@@ -1050,7 +1052,7 @@ alloc_init_layout_hdr(struct inode *ino,
     INIT_LIST_HEAD(&lo->plh_segs);
     INIT_LIST_HEAD(&lo->plh_bulk_destroy);
     lo->plh_inode = ino;
-    lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred);
+    lo->plh_lc_cred = get_rpccred(ctx->cred);
     return lo;
 }
 
@@ -1091,21 +1093,21 @@ out_existing:
  * READ        READ           true
 * READ        RW             true
  */
-static int
-is_matching_lseg(struct pnfs_layout_range *ls_range,
-         struct pnfs_layout_range *range)
+static bool
+pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
+         const struct pnfs_layout_range *range)
 {
     struct pnfs_layout_range range1;
 
     if ((range->iomode == IOMODE_RW &&
          ls_range->iomode != IOMODE_RW) ||
-        !lo_seg_intersecting(ls_range, range))
+        !pnfs_lseg_range_intersecting(ls_range, range))
         return 0;
 
     /* range1 covers only the first byte in the range */
     range1 = *range;
     range1.length = 1;
-    return lo_seg_contained(ls_range, &range1);
+    return pnfs_lseg_range_contained(ls_range, &range1);
 }
 
 /*
@@ -1121,7 +1123,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
 
     list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
         if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
-            is_matching_lseg(&lseg->pls_range, range)) {
+            pnfs_lseg_range_match(&lseg->pls_range, range)) {
             ret = pnfs_get_lseg(lseg);
             break;
         }
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f5f8a470a647..a4f41810a7f4 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -149,9 +149,10 @@ struct pnfs_device {
     struct nfs4_deviceid dev_id;
     unsigned int  layout_type;
    unsigned int  mincount;
+    unsigned int  maxcount;  /* gdia_maxcount */
     struct page **pages;
     unsigned int  pgbase;
-    unsigned int  pglen;
+    unsigned int  pglen;     /* reply buffer length */
 };
 
 #define NFS4_PNFS_GETDEVLIST_MAXNUM 16
@@ -170,7 +171,8 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
                    const struct nfs_fh *fh,
                    struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
-                   struct pnfs_device *dev);
+                   struct pnfs_device *dev,
+                   struct rpc_cred *cred);
 extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index fc8de9016acf..c041c41f7a52 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -98,7 +98,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
  */
 static int
 nfs_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
-        struct nfs_fattr *fattr)
+        struct nfs_fattr *fattr, struct nfs4_label *label)
 {
     struct rpc_message msg = {
         .rpc_proc = &nfs_procedures[NFSPROC_GETATTR],
@@ -146,7 +146,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 
 static int
 nfs_proc_lookup(struct inode *dir, struct qstr *name,
-        struct nfs_fh *fhandle, struct nfs_fattr *fattr)
+        struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+        struct nfs4_label *label)
 {
     struct nfs_diropargs arg = {
         .fh = NFS_FH(dir),
@@ -243,7 +244,7 @@ nfs_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
     status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
     nfs_mark_for_revalidate(dir);
     if (status == 0)
-        status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+        status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
     nfs_free_createdata(data);
 out:
     dprintk("NFS reply create: %d\n", status);
@@ -290,7 +291,7 @@ nfs_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
         status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
     }
     if (status == 0)
-        status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+        status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
     nfs_free_createdata(data);
 out:
     dprintk("NFS reply mknod: %d\n", status);
@@ -442,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
      * should fill in the data with a LOOKUP call on the wire.
      */
     if (status == 0)
-        status = nfs_instantiate(dentry, fh, fattr);
+        status = nfs_instantiate(dentry, fh, fattr, NULL);
 
 out_free:
     nfs_free_fattr(fattr);
@@ -471,7 +472,7 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
     status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
     nfs_mark_for_revalidate(dir);
     if (status == 0)
-        status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+        status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL);
     nfs_free_createdata(data);
 out:
     dprintk("NFS reply mkdir: %d\n", status);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2d7525fbcf25..f6db66d8f647 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -269,7 +269,7 @@ static match_table_t nfs_local_lock_tokens = {
269 269
270enum { 270enum {
271 Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, 271 Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0,
272 Opt_vers_4_1, 272 Opt_vers_4_1, Opt_vers_4_2,
273 273
274 Opt_vers_err 274 Opt_vers_err
275}; 275};
@@ -280,6 +280,7 @@ static match_table_t nfs_vers_tokens = {
280 { Opt_vers_4, "4" }, 280 { Opt_vers_4, "4" },
281 { Opt_vers_4_0, "4.0" }, 281 { Opt_vers_4_0, "4.0" },
282 { Opt_vers_4_1, "4.1" }, 282 { Opt_vers_4_1, "4.1" },
283 { Opt_vers_4_2, "4.2" },
283 284
284 { Opt_vers_err, NULL } 285 { Opt_vers_err, NULL }
285}; 286};
@@ -832,6 +833,7 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root)
832 seq_printf(m, "\n\tnfsv4:\t"); 833 seq_printf(m, "\n\tnfsv4:\t");
833 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); 834 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
834 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); 835 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
836 seq_printf(m, ",bm2=0x%x", nfss->attr_bitmask[2]);
835 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); 837 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
836 show_sessions(m, nfss); 838 show_sessions(m, nfss);
837 show_pnfs(m, nfss); 839 show_pnfs(m, nfss);
@@ -1097,6 +1099,10 @@ static int nfs_parse_version_string(char *string,
1097 mnt->version = 4; 1099 mnt->version = 4;
1098 mnt->minorversion = 1; 1100 mnt->minorversion = 1;
1099 break; 1101 break;
1102 case Opt_vers_4_2:
1103 mnt->version = 4;
1104 mnt->minorversion = 2;
1105 break;
1100 default: 1106 default:
1101 return 0; 1107 return 0;
1102 } 1108 }
@@ -1608,29 +1614,13 @@ out_security_failure:
1608} 1614}
1609 1615
1610/* 1616/*
1611 * Select a security flavor for this mount. The selected flavor 1617 * Ensure that the specified authtype in args->auth_flavors[0] is supported by
1612 * is planted in args->auth_flavors[0]. 1618 * the server. Returns 0 if it's ok, and -EACCES if not.
1613 *
1614 * Returns 0 on success, -EACCES on failure.
1615 */ 1619 */
1616static int nfs_select_flavor(struct nfs_parsed_mount_data *args, 1620static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
1617 struct nfs_mount_request *request) 1621 rpc_authflavor_t *server_authlist, unsigned int count)
1618{ 1622{
1619 unsigned int i, count = *(request->auth_flav_len); 1623 unsigned int i;
1620 rpc_authflavor_t flavor;
1621
1622 /*
1623 * The NFSv2 MNT operation does not return a flavor list.
1624 */
1625 if (args->mount_server.version != NFS_MNT3_VERSION)
1626 goto out_default;
1627
1628 /*
1629 * Certain releases of Linux's mountd return an empty
1630 * flavor list in some cases.
1631 */
1632 if (count == 0)
1633 goto out_default;
1634 1624
1635 /* 1625 /*
1636 * If the sec= mount option is used, the specified flavor or AUTH_NULL 1626 * If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1640,60 +1630,19 @@ static int nfs_select_flavor(struct nfs_parsed_mount_data *args,
1640 * means that the server will ignore the rpc creds, so any flavor 1630 * means that the server will ignore the rpc creds, so any flavor
1641 * can be used. 1631 * can be used.
1642 */ 1632 */
1643 if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
1644 for (i = 0; i < count; i++) {
1645 if (args->auth_flavors[0] == request->auth_flavs[i] ||
1646 request->auth_flavs[i] == RPC_AUTH_NULL)
1647 goto out;
1648 }
1649 dfprintk(MOUNT, "NFS: auth flavor %d not supported by server\n",
1650 args->auth_flavors[0]);
1651 goto out_err;
1652 }
1653
1654 /*
1655 * RFC 2623, section 2.7 suggests we SHOULD prefer the
1656 * flavor listed first. However, some servers list
1657 * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
1658 */
1659 for (i = 0; i < count; i++) {
1660 struct rpcsec_gss_info info;
1661
1662 flavor = request->auth_flavs[i];
1663 switch (flavor) {
1664 case RPC_AUTH_UNIX:
1665 goto out_set;
1666 case RPC_AUTH_NULL:
1667 continue;
1668 default:
1669 if (rpcauth_get_gssinfo(flavor, &info) == 0)
1670 goto out_set;
1671 }
1672 }
1673
1674 /*
1675 * As a last chance, see if the server list contains AUTH_NULL -
1676 * if it does, use the default flavor.
1677 */
1678 for (i = 0; i < count; i++) { 1633 for (i = 0; i < count; i++) {
1679 if (request->auth_flavs[i] == RPC_AUTH_NULL) 1634 if (args->auth_flavors[0] == server_authlist[i] ||
1680 goto out_default; 1635 server_authlist[i] == RPC_AUTH_NULL)
1636 goto out;
1681 } 1637 }
1682 1638
1683 dfprintk(MOUNT, "NFS: no auth flavors in common with server\n"); 1639 dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
1684 goto out_err; 1640 args->auth_flavors[0]);
1641 return -EACCES;
1685 1642
1686out_default:
1687 /* use default if flavor not already set */
1688 flavor = (args->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) ?
1689 RPC_AUTH_UNIX : args->auth_flavors[0];
1690out_set:
1691 args->auth_flavors[0] = flavor;
1692out: 1643out:
1693 dfprintk(MOUNT, "NFS: using auth flavor %d\n", args->auth_flavors[0]); 1644 dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
1694 return 0; 1645 return 0;
1695out_err:
1696 return -EACCES;
1697} 1646}
1698 1647
1699/* 1648/*
@@ -1701,10 +1650,10 @@ out_err:
1701 * corresponding to the provided path. 1650 * corresponding to the provided path.
1702 */ 1651 */
1703static int nfs_request_mount(struct nfs_parsed_mount_data *args, 1652static int nfs_request_mount(struct nfs_parsed_mount_data *args,
1704 struct nfs_fh *root_fh) 1653 struct nfs_fh *root_fh,
1654 rpc_authflavor_t *server_authlist,
1655 unsigned int *server_authlist_len)
1705{ 1656{
1706 rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
1707 unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
1708 struct nfs_mount_request request = { 1657 struct nfs_mount_request request = {
1709 .sap = (struct sockaddr *) 1658 .sap = (struct sockaddr *)
1710 &args->mount_server.address, 1659 &args->mount_server.address,
@@ -1712,7 +1661,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
1712 .protocol = args->mount_server.protocol, 1661 .protocol = args->mount_server.protocol,
1713 .fh = root_fh, 1662 .fh = root_fh,
1714 .noresvport = args->flags & NFS_MOUNT_NORESVPORT, 1663 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1715 .auth_flav_len = &server_authlist_len, 1664 .auth_flav_len = server_authlist_len,
1716 .auth_flavs = server_authlist, 1665 .auth_flavs = server_authlist,
1717 .net = args->net, 1666 .net = args->net,
1718 }; 1667 };
@@ -1756,24 +1705,92 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args,
1756 return status; 1705 return status;
1757 } 1706 }
1758 1707
1759 return nfs_select_flavor(args, &request); 1708 return 0;
1760} 1709}
1761 1710
1762struct dentry *nfs_try_mount(int flags, const char *dev_name, 1711static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_info,
1763 struct nfs_mount_info *mount_info, 1712 struct nfs_subversion *nfs_mod)
1764 struct nfs_subversion *nfs_mod)
1765{ 1713{
1766 int status; 1714 int status;
1767 struct nfs_server *server; 1715 unsigned int i;
1716 bool tried_auth_unix = false;
1717 bool auth_null_in_list = false;
1718 struct nfs_server *server = ERR_PTR(-EACCES);
1719 struct nfs_parsed_mount_data *args = mount_info->parsed;
1720 rpc_authflavor_t authlist[NFS_MAX_SECFLAVORS];
1721 unsigned int authlist_len = ARRAY_SIZE(authlist);
1722
1723 status = nfs_request_mount(args, mount_info->mntfh, authlist,
1724 &authlist_len);
1725 if (status)
1726 return ERR_PTR(status);
1768 1727
1769 if (mount_info->parsed->need_mount) { 1728 /*
1770 status = nfs_request_mount(mount_info->parsed, mount_info->mntfh); 1729 * Was a sec= authflavor specified in the options? First, verify
1730 * whether the server supports it, and then just try to use it if so.
1731 */
1732 if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) {
1733 status = nfs_verify_authflavor(args, authlist, authlist_len);
1734 dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
1771 if (status) 1735 if (status)
1772 return ERR_PTR(status); 1736 return ERR_PTR(status);
1737 return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1738 }
1739
1740 /*
1741 * No sec= option was provided. RFC 2623, section 2.7 suggests we
1742 * SHOULD prefer the flavor listed first. However, some servers list
1743 * AUTH_NULL first. Avoid ever choosing AUTH_NULL.
1744 */
1745 for (i = 0; i < authlist_len; ++i) {
1746 rpc_authflavor_t flavor;
1747 struct rpcsec_gss_info info;
1748
1749 flavor = authlist[i];
1750 switch (flavor) {
1751 case RPC_AUTH_UNIX:
1752 tried_auth_unix = true;
1753 break;
1754 case RPC_AUTH_NULL:
1755 auth_null_in_list = true;
1756 continue;
1757 default:
1758 if (rpcauth_get_gssinfo(flavor, &info) != 0)
1759 continue;
1760 /* Fallthrough */
1761 }
1762 dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
1763 args->auth_flavors[0] = flavor;
1764 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1765 if (!IS_ERR(server))
1766 return server;
1773 } 1767 }
1774 1768
1775 /* Get a volume representation */ 1769 /*
1776 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod); 1770 * Nothing we tried so far worked. At this point, give up if we've
1771 * already tried AUTH_UNIX or if the server's list doesn't contain
1772 * AUTH_NULL
1773 */
1774 if (tried_auth_unix || !auth_null_in_list)
1775 return server;
1776
1777 /* Last chance! Try AUTH_UNIX */
1778 dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
1779 args->auth_flavors[0] = RPC_AUTH_UNIX;
1780 return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1781}
1782
1783struct dentry *nfs_try_mount(int flags, const char *dev_name,
1784 struct nfs_mount_info *mount_info,
1785 struct nfs_subversion *nfs_mod)
1786{
1787 struct nfs_server *server;
1788
1789 if (mount_info->parsed->need_mount)
1790 server = nfs_try_mount_request(mount_info, nfs_mod);
1791 else
1792 server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
1793
1777 if (IS_ERR(server)) 1794 if (IS_ERR(server))
1778 return ERR_CAST(server); 1795 return ERR_CAST(server);
1779 1796
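
Note: the new nfs_try_mount_request() above makes the flavor policy explicit. With an explicit sec= option the flavor is only verified; otherwise the MNT reply list is walked in order, AUTH_NULL is never chosen directly, AUTH_UNIX and usable GSS flavors are each attempted, and AUTH_UNIX is retried last only when AUTH_NULL appeared in the list. A compact userspace sketch of just the candidate ordering (the flavor constants and the GSS probe are stand-ins; the kernel additionally attempts a full mount per candidate before falling back):

#include <stdbool.h>
#include <stdio.h>

enum { AUTH_NULL = 0, AUTH_UNIX = 1, AUTH_GSS_KRB5 = 390003 };

/* Stand-in for rpcauth_get_gssinfo(): pretend only krb5 is loadable. */
static bool gss_usable(int flavor)
{
	return flavor == AUTH_GSS_KRB5;
}

/* First flavor worth trying, or -1 when nothing is acceptable. */
static int first_candidate(const int *list, int n)
{
	bool saw_null = false;

	for (int i = 0; i < n; i++) {
		if (list[i] == AUTH_NULL) {	/* never choose AUTH_NULL itself */
			saw_null = true;
			continue;
		}
		if (list[i] == AUTH_UNIX || gss_usable(list[i]))
			return list[i];
	}
	/* AUTH_NULL in the list means "any flavor works": use AUTH_UNIX. */
	return saw_null ? AUTH_UNIX : -1;
}

int main(void)
{
	int server[] = { AUTH_NULL, AUTH_GSS_KRB5 };

	printf("%d\n", first_candidate(server, 2));	/* 390003, not AUTH_NULL */
	return 0;
}
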
@@ -2412,7 +2429,21 @@ static int nfs_bdi_register(struct nfs_server *server)
2412int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, 2429int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
2413 struct nfs_mount_info *mount_info) 2430 struct nfs_mount_info *mount_info)
2414{ 2431{
2415 return security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts); 2432 int error;
2433 unsigned long kflags = 0, kflags_out = 0;
2434 if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL)
2435 kflags |= SECURITY_LSM_NATIVE_LABELS;
2436
2437 error = security_sb_set_mnt_opts(s, &mount_info->parsed->lsm_opts,
2438 kflags, &kflags_out);
2439 if (error)
2440 goto err;
2441
2442 if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL &&
2443 !(kflags_out & SECURITY_LSM_NATIVE_LABELS))
2444 NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL;
2445err:
2446 return error;
2416} 2447}
2417EXPORT_SYMBOL_GPL(nfs_set_sb_security); 2448EXPORT_SYMBOL_GPL(nfs_set_sb_security);
2418 2449
@@ -2447,6 +2478,10 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
2447 if (server->flags & NFS_MOUNT_NOAC) 2478 if (server->flags & NFS_MOUNT_NOAC)
2448 sb_mntdata.mntflags |= MS_SYNCHRONOUS; 2479 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2449 2480
2481 if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL)
2482 if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS)
2483 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2484
2450 /* Get a superblock - note that we may end up sharing one that already exists */ 2485 /* Get a superblock - note that we may end up sharing one that already exists */
2451 s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata); 2486 s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata);
2452 if (IS_ERR(s)) { 2487 if (IS_ERR(s)) {
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 1f1f38f0c5d5..60395ad3a2e4 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -479,7 +479,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
479 479
480 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n", 480 dfprintk(VFS, "NFS: silly-rename(%s/%s, ct=%d)\n",
481 dentry->d_parent->d_name.name, dentry->d_name.name, 481 dentry->d_parent->d_name.name, dentry->d_name.name,
482 dentry->d_count); 482 d_count(dentry));
483 nfs_inc_stats(dir, NFSIOS_SILLYRENAME); 483 nfs_inc_stats(dir, NFSIOS_SILLYRENAME);
484 484
485 /* 485 /*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a2c7c28049d5..f1bdb7254776 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -888,6 +888,28 @@ out:
888 return PageUptodate(page) != 0; 888 return PageUptodate(page) != 0;
889} 889}
890 890
891/* If we know the page is up to date, and we're not using byte range locks (or
892 * if we have the whole file locked for writing), it may be more efficient to
893 * extend the write to cover the entire page in order to avoid fragmentation
894 * inefficiencies.
895 *
896 * If the file is opened for synchronous writes or if we have a write delegation
897 * from the server then we can just skip the rest of the checks.
898 */
899static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
900{
901 if (file->f_flags & O_DSYNC)
902 return 0;
903 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
904 return 1;
905 if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL ||
906 (inode->i_flock->fl_start == 0 &&
907 inode->i_flock->fl_end == OFFSET_MAX &&
908 inode->i_flock->fl_type != F_RDLCK)))
909 return 1;
910 return 0;
911}
912
891/* 913/*
892 * Update and possibly write a cached page of an NFS file. 914 * Update and possibly write a cached page of an NFS file.
893 * 915 *
@@ -908,14 +930,7 @@ int nfs_updatepage(struct file *file, struct page *page,
908 file->f_path.dentry->d_name.name, count, 930 file->f_path.dentry->d_name.name, count,
909 (long long)(page_file_offset(page) + offset)); 931 (long long)(page_file_offset(page) + offset));
910 932
911 /* If we're not using byte range locks, and we know the page 933 if (nfs_can_extend_write(file, page, inode)) {
912 * is up to date, it may be more efficient to extend the write
913 * to cover the entire page in order to avoid fragmentation
914 * inefficiencies.
915 */
916 if (nfs_write_pageuptodate(page, inode) &&
917 inode->i_flock == NULL &&
918 !(file->f_flags & O_DSYNC)) {
919 count = max(count + offset, nfs_page_length(page)); 934 count = max(count + offset, nfs_page_length(page));
920 offset = 0; 935 offset = 0;
921 } 936 }
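
Note on the arithmetic in nfs_updatepage() once nfs_can_extend_write() agrees: the write is widened to the larger of its own end and the valid page length, restarting from offset 0. A worked example with assumed values (PAGE_SIZE 4096, a fully up-to-date page):

#include <stdio.h>

int main(void)
{
	unsigned int page_len = 4096;	/* assumed nfs_page_length() result */
	unsigned int offset = 512;	/* write begins 512 bytes into the page */
	unsigned int count = 100;	/* caller asked for 100 bytes */

	/* count = max(count + offset, page_len); offset = 0; */
	count = (count + offset > page_len) ? count + offset : page_len;
	offset = 0;

	printf("offset=%u count=%u\n", offset, count);	/* offset=0 count=4096 */
	return 0;
}
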
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 430b6872806f..dc8f1ef665ce 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -81,6 +81,22 @@ config NFSD_V4
81 81
82 If unsure, say N. 82 If unsure, say N.
83 83
84config NFSD_V4_SECURITY_LABEL
85 bool "Provide Security Label support for NFSv4 server"
86 depends on NFSD_V4 && SECURITY
87 help
88
 89	  Say Y here if you want to enable fine-grained security label attribute
90 support for NFS version 4. Security labels allow security modules like
91 SELinux and Smack to label files to facilitate enforcement of their policies.
92 Without this an NFSv4 mount will have the same label on each file.
93
 94	  If you do not wish to enable fine-grained SELinux or Smack security
 95	  label policies on NFSv4 files, say N.
96
97 WARNING: there is still a chance of backwards-incompatible protocol changes.
 98	  For now we recommend "Y" only for developers and testers.
99
84config NFSD_FAULT_INJECTION 100config NFSD_FAULT_INJECTION
85 bool "NFS server manual fault injection" 101 bool "NFS server manual fault injection"
86 depends on NFSD_V4 && DEBUG_KERNEL 102 depends on NFSD_V4 && DEBUG_KERNEL
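
Note: enabling the new option pulls in its Kconfig dependencies; a plausible .config fragment (the module/builtin choices here are illustrative, not prescribed by the patch):

CONFIG_SECURITY=y
CONFIG_NFSD=m
CONFIG_NFSD_V4=y
CONFIG_NFSD_V4_SECURITY_LABEL=y
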
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 27d74a294515..419572f33b72 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -42,6 +42,36 @@
42#include "current_stateid.h" 42#include "current_stateid.h"
43#include "netns.h" 43#include "netns.h"
44 44
45#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
46#include <linux/security.h>
47
48static inline void
49nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
50{
51 struct inode *inode = resfh->fh_dentry->d_inode;
52 int status;
53
54 mutex_lock(&inode->i_mutex);
55 status = security_inode_setsecctx(resfh->fh_dentry,
56 label->data, label->len);
57 mutex_unlock(&inode->i_mutex);
58
59 if (status)
60 /*
61 * XXX: We should really fail the whole open, but we may
62 * already have created a new file, so it may be too
63 * late. For now this seems the least of evils:
64 */
65 bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
66
67 return;
68}
69#else
70static inline void
71nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
72{ }
73#endif
74
45#define NFSDDBG_FACILITY NFSDDBG_PROC 75#define NFSDDBG_FACILITY NFSDDBG_PROC
46 76
47static u32 nfsd_attrmask[] = { 77static u32 nfsd_attrmask[] = {
@@ -239,6 +269,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
239 (u32 *)open->op_verf.data, 269 (u32 *)open->op_verf.data,
240 &open->op_truncate, &open->op_created); 270 &open->op_truncate, &open->op_created);
241 271
272 if (!status && open->op_label.len)
273 nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval);
274
242 /* 275 /*
243 * Following rfc 3530 14.2.16, use the returned bitmask 276 * Following rfc 3530 14.2.16, use the returned bitmask
244 * to indicate which attributes we used to store the 277 * to indicate which attributes we used to store the
@@ -263,7 +296,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
263 296
264 nfsd4_set_open_owner_reply_cache(cstate, open, resfh); 297 nfsd4_set_open_owner_reply_cache(cstate, open, resfh);
265 accmode = NFSD_MAY_NOP; 298 accmode = NFSD_MAY_NOP;
266 if (open->op_created) 299 if (open->op_created ||
300 open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
267 accmode |= NFSD_MAY_OWNER_OVERRIDE; 301 accmode |= NFSD_MAY_OWNER_OVERRIDE;
268 status = do_open_permission(rqstp, resfh, open, accmode); 302 status = do_open_permission(rqstp, resfh, open, accmode);
269 set_change_info(&open->op_cinfo, current_fh); 303 set_change_info(&open->op_cinfo, current_fh);
@@ -637,6 +671,9 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
637 if (status) 671 if (status)
638 goto out; 672 goto out;
639 673
674 if (create->cr_label.len)
675 nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval);
676
640 if (create->cr_acl != NULL) 677 if (create->cr_acl != NULL)
641 do_set_nfs4_acl(rqstp, &resfh, create->cr_acl, 678 do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
642 create->cr_bmval); 679 create->cr_bmval);
@@ -916,6 +953,11 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
916 setattr->sa_acl); 953 setattr->sa_acl);
917 if (status) 954 if (status)
918 goto out; 955 goto out;
956 if (setattr->sa_label.len)
957 status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh,
958 &setattr->sa_label);
959 if (status)
960 goto out;
919 status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, 961 status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
920 0, (time_t)0); 962 0, (time_t)0);
921out: 963out:
@@ -1251,7 +1293,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1251 * According to RFC3010, this takes precedence over all other errors. 1293 * According to RFC3010, this takes precedence over all other errors.
1252 */ 1294 */
1253 status = nfserr_minor_vers_mismatch; 1295 status = nfserr_minor_vers_mismatch;
1254 if (args->minorversion > nfsd_supported_minorversion) 1296 if (nfsd_minorversion(args->minorversion, NFSD_TEST) <= 0)
1255 goto out; 1297 goto out;
1256 1298
1257 status = nfs41_check_op_ordering(args); 1299 status = nfs41_check_op_ordering(args);
@@ -1482,7 +1524,7 @@ static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1482static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1524static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1483{ 1525{
1484 return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ 1526 return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
1485 1 + 1 + 0 + /* eir_flags, spr_how, SP4_NONE (for now) */\ 1527 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\
1486 2 + /*eir_server_owner.so_minor_id */\ 1528 2 + /*eir_server_owner.so_minor_id */\
1487 /* eir_server_owner.so_major_id<> */\ 1529 /* eir_server_owner.so_major_id<> */\
1488 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ 1530 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
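
Note: nfsd4_security_inode_setsecctx() above follows the usual kernel idiom of compiling the real body only under the config option and providing an empty static inline stub otherwise, so call sites stay free of #ifdef clutter. A standalone sketch of the idiom (CONFIG_FOO and do_label() are hypothetical names):

#include <stdio.h>

/* #define CONFIG_FOO 1	-- define to compile the real body */

#ifdef CONFIG_FOO
static inline void do_label(const char *name)
{
	printf("labelling %s\n", name);	/* feature configured in */
}
#else
static inline void do_label(const char *name) { }	/* no-op stub */
#endif

int main(void)
{
	do_label("file");	/* caller needs no #ifdef of its own */
	return 0;
}
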
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 4e9a21db867a..105a3b080d12 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -240,11 +240,16 @@ struct name_list {
240 struct list_head list; 240 struct list_head list;
241}; 241};
242 242
243struct nfs4_dir_ctx {
244 struct dir_context ctx;
245 struct list_head names;
246};
247
243static int 248static int
244nfsd4_build_namelist(void *arg, const char *name, int namlen, 249nfsd4_build_namelist(void *arg, const char *name, int namlen,
245 loff_t offset, u64 ino, unsigned int d_type) 250 loff_t offset, u64 ino, unsigned int d_type)
246{ 251{
247 struct list_head *names = arg; 252 struct nfs4_dir_ctx *ctx = arg;
248 struct name_list *entry; 253 struct name_list *entry;
249 254
250 if (namlen != HEXDIR_LEN - 1) 255 if (namlen != HEXDIR_LEN - 1)
@@ -254,7 +259,7 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,
254 return -ENOMEM; 259 return -ENOMEM;
255 memcpy(entry->name, name, HEXDIR_LEN - 1); 260 memcpy(entry->name, name, HEXDIR_LEN - 1);
256 entry->name[HEXDIR_LEN - 1] = '\0'; 261 entry->name[HEXDIR_LEN - 1] = '\0';
257 list_add(&entry->list, names); 262 list_add(&entry->list, &ctx->names);
258 return 0; 263 return 0;
259} 264}
260 265
@@ -263,7 +268,10 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
263{ 268{
264 const struct cred *original_cred; 269 const struct cred *original_cred;
265 struct dentry *dir = nn->rec_file->f_path.dentry; 270 struct dentry *dir = nn->rec_file->f_path.dentry;
266 LIST_HEAD(names); 271 struct nfs4_dir_ctx ctx = {
272 .ctx.actor = nfsd4_build_namelist,
273 .names = LIST_HEAD_INIT(ctx.names)
274 };
267 int status; 275 int status;
268 276
269 status = nfs4_save_creds(&original_cred); 277 status = nfs4_save_creds(&original_cred);
@@ -276,11 +284,11 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
276 return status; 284 return status;
277 } 285 }
278 286
279 status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names); 287 status = iterate_dir(nn->rec_file, &ctx.ctx);
280 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 288 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
281 while (!list_empty(&names)) { 289 while (!list_empty(&ctx.names)) {
282 struct name_list *entry; 290 struct name_list *entry;
283 entry = list_entry(names.next, struct name_list, list); 291 entry = list_entry(ctx.names.next, struct name_list, list);
284 if (!status) { 292 if (!status) {
285 struct dentry *dentry; 293 struct dentry *dentry;
286 dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); 294 dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
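
Note: the vfs_readdir() to iterate_dir() conversion above relies on struct dir_context being the first member of struct nfs4_dir_ctx, so the actor's argument can be cast straight back to the outer struct. container_of() expresses the same recovery without depending on member order. A userspace sketch of the shape (dir_context, the actor signature and iterate_dir() are mocked here, not the kernel API):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct dir_context {			/* mock of the kernel struct */
	int (*actor)(struct dir_context *, const char *);
};

struct my_ctx {
	struct dir_context ctx;		/* first member: a plain cast also works */
	int count;
};

static int my_actor(struct dir_context *base, const char *name)
{
	/* Recover the outer struct regardless of member position. */
	struct my_ctx *mc = container_of(base, struct my_ctx, ctx);

	mc->count++;
	printf("entry %d: %s\n", mc->count, name);
	return 0;
}

/* Mock iterate_dir(): feeds two fake directory entries to the actor. */
static void iterate_dir_mock(struct dir_context *ctx)
{
	ctx->actor(ctx, "aa");
	ctx->actor(ctx, "bb");
}

int main(void)
{
	struct my_ctx mc = { .ctx.actor = my_actor, .count = 0 };

	iterate_dir_mock(&mc.ctx);
	return 0;
}
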
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 316ec843dec2..43f42290e5df 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -97,19 +97,20 @@ nfs4_lock_state(void)
97 97
98static void free_session(struct nfsd4_session *); 98static void free_session(struct nfsd4_session *);
99 99
100void nfsd4_put_session(struct nfsd4_session *ses) 100static bool is_session_dead(struct nfsd4_session *ses)
101{ 101{
102 atomic_dec(&ses->se_ref); 102 return ses->se_flags & NFS4_SESSION_DEAD;
103} 103}
104 104
105static bool is_session_dead(struct nfsd4_session *ses) 105void nfsd4_put_session(struct nfsd4_session *ses)
106{ 106{
107 return ses->se_flags & NFS4_SESSION_DEAD; 107 if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
108 free_session(ses);
108} 109}
109 110
110static __be32 mark_session_dead_locked(struct nfsd4_session *ses) 111static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me)
111{ 112{
112 if (atomic_read(&ses->se_ref)) 113 if (atomic_read(&ses->se_ref) > ref_held_by_me)
113 return nfserr_jukebox; 114 return nfserr_jukebox;
114 ses->se_flags |= NFS4_SESSION_DEAD; 115 ses->se_flags |= NFS4_SESSION_DEAD;
115 return nfs_ok; 116 return nfs_ok;
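
Note: the reordering above changes the lifetime rule: a session may only be marked dead when no one else holds a reference (callers discount references they hold themselves), and the last put of a dead session frees it. A small sketch of the dec-and-test pattern with C11 atomics (the session struct and free routine are stand-ins; the kernel version also runs under nn->client_lock):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct session {
	atomic_int ref;
	bool dead;
};

static void free_session(struct session *s)
{
	printf("freeing session\n");
}

/* Refuse while other holders remain; -1 stands in for nfserr_jukebox. */
static int mark_dead(struct session *s, int ref_held_by_me)
{
	if (atomic_load(&s->ref) > ref_held_by_me)
		return -1;
	s->dead = true;
	return 0;
}

/* The last put of a dead session frees it. */
static void put_session(struct session *s)
{
	if (atomic_fetch_sub(&s->ref, 1) == 1 && s->dead)
		free_session(s);
}

int main(void)
{
	struct session s = { .ref = 1, .dead = false };

	mark_dead(&s, 1);	/* our own reference does not block it */
	put_session(&s);	/* drops to zero: frees */
	return 0;
}
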
@@ -364,19 +365,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
364} 365}
365 366
366static struct nfs4_delegation * 367static struct nfs4_delegation *
367alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh, u32 type) 368alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
368{ 369{
369 struct nfs4_delegation *dp; 370 struct nfs4_delegation *dp;
370 struct nfs4_file *fp = stp->st_file; 371 struct nfs4_file *fp = stp->st_file;
371 372
372 dprintk("NFSD alloc_init_deleg\n"); 373 dprintk("NFSD alloc_init_deleg\n");
373 /*
374 * Major work on the lease subsystem (for example, to support
 375	 * callbacks on stat) will be required before we can support
376 * write delegations properly.
377 */
378 if (type != NFS4_OPEN_DELEGATE_READ)
379 return NULL;
380 if (fp->fi_had_conflict) 374 if (fp->fi_had_conflict)
381 return NULL; 375 return NULL;
382 if (num_delegations > max_delegations) 376 if (num_delegations > max_delegations)
@@ -397,7 +391,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
397 INIT_LIST_HEAD(&dp->dl_recall_lru); 391 INIT_LIST_HEAD(&dp->dl_recall_lru);
398 get_nfs4_file(fp); 392 get_nfs4_file(fp);
399 dp->dl_file = fp; 393 dp->dl_file = fp;
400 dp->dl_type = type; 394 dp->dl_type = NFS4_OPEN_DELEGATE_READ;
401 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 395 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
402 dp->dl_time = 0; 396 dp->dl_time = 0;
403 atomic_set(&dp->dl_count, 1); 397 atomic_set(&dp->dl_count, 1);
@@ -1188,6 +1182,9 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
1188 target->cr_gid = source->cr_gid; 1182 target->cr_gid = source->cr_gid;
1189 target->cr_group_info = source->cr_group_info; 1183 target->cr_group_info = source->cr_group_info;
1190 get_group_info(target->cr_group_info); 1184 get_group_info(target->cr_group_info);
1185 target->cr_gss_mech = source->cr_gss_mech;
1186 if (source->cr_gss_mech)
1187 gss_mech_get(source->cr_gss_mech);
1191 return 0; 1188 return 0;
1192} 1189}
1193 1190
@@ -1262,6 +1259,33 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
1262 return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); 1259 return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
1263} 1260}
1264 1261
1262static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
1263{
1264 struct svc_cred *cr = &rqstp->rq_cred;
1265 u32 service;
1266
1267 if (!cr->cr_gss_mech)
1268 return false;
1269 service = gss_pseudoflavor_to_service(cr->cr_gss_mech, cr->cr_flavor);
1270 return service == RPC_GSS_SVC_INTEGRITY ||
1271 service == RPC_GSS_SVC_PRIVACY;
1272}
1273
1274static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
1275{
1276 struct svc_cred *cr = &rqstp->rq_cred;
1277
1278 if (!cl->cl_mach_cred)
1279 return true;
1280 if (cl->cl_cred.cr_gss_mech != cr->cr_gss_mech)
1281 return false;
1282 if (!svc_rqst_integrity_protected(rqstp))
1283 return false;
1284 if (!cr->cr_principal)
1285 return false;
1286 return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
1287}
1288
1265static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) 1289static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
1266{ 1290{
1267 static u32 current_clientid = 1; 1291 static u32 current_clientid = 1;
@@ -1639,16 +1663,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1639 if (exid->flags & ~EXCHGID4_FLAG_MASK_A) 1663 if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
1640 return nfserr_inval; 1664 return nfserr_inval;
1641 1665
1642 /* Currently only support SP4_NONE */
1643 switch (exid->spa_how) { 1666 switch (exid->spa_how) {
1667 case SP4_MACH_CRED:
1668 if (!svc_rqst_integrity_protected(rqstp))
1669 return nfserr_inval;
1644 case SP4_NONE: 1670 case SP4_NONE:
1645 break; 1671 break;
1646 default: /* checked by xdr code */ 1672 default: /* checked by xdr code */
1647 WARN_ON_ONCE(1); 1673 WARN_ON_ONCE(1);
1648 case SP4_SSV: 1674 case SP4_SSV:
1649 return nfserr_encr_alg_unsupp; 1675 return nfserr_encr_alg_unsupp;
1650 case SP4_MACH_CRED:
1651 return nfserr_serverfault; /* no excuse :-/ */
1652 } 1676 }
1653 1677
1654 /* Cases below refer to rfc 5661 section 18.35.4: */ 1678 /* Cases below refer to rfc 5661 section 18.35.4: */
@@ -1663,6 +1687,10 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1663 status = nfserr_inval; 1687 status = nfserr_inval;
1664 goto out; 1688 goto out;
1665 } 1689 }
1690 if (!mach_creds_match(conf, rqstp)) {
1691 status = nfserr_wrong_cred;
1692 goto out;
1693 }
1666 if (!creds_match) { /* case 9 */ 1694 if (!creds_match) { /* case 9 */
1667 status = nfserr_perm; 1695 status = nfserr_perm;
1668 goto out; 1696 goto out;
@@ -1709,7 +1737,8 @@ out_new:
1709 status = nfserr_jukebox; 1737 status = nfserr_jukebox;
1710 goto out; 1738 goto out;
1711 } 1739 }
1712 new->cl_minorversion = 1; 1740 new->cl_minorversion = cstate->minorversion;
1741 new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
1713 1742
1714 gen_clid(new, nn); 1743 gen_clid(new, nn);
1715 add_to_unconfirmed(new); 1744 add_to_unconfirmed(new);
@@ -1839,6 +1868,24 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
1839 return nfs_ok; 1868 return nfs_ok;
1840} 1869}
1841 1870
1871static __be32 nfsd4_check_cb_sec(struct nfsd4_cb_sec *cbs)
1872{
1873 switch (cbs->flavor) {
1874 case RPC_AUTH_NULL:
1875 case RPC_AUTH_UNIX:
1876 return nfs_ok;
1877 default:
1878 /*
1879 * GSS case: the spec doesn't allow us to return this
1880 * error. But it also doesn't allow us not to support
1881 * GSS.
1882 * I'd rather this fail hard than return some error the
1883 * client might think it can already handle:
1884 */
1885 return nfserr_encr_alg_unsupp;
1886 }
1887}
1888
1842__be32 1889__be32
1843nfsd4_create_session(struct svc_rqst *rqstp, 1890nfsd4_create_session(struct svc_rqst *rqstp,
1844 struct nfsd4_compound_state *cstate, 1891 struct nfsd4_compound_state *cstate,
@@ -1854,6 +1901,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1854 1901
1855 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) 1902 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
1856 return nfserr_inval; 1903 return nfserr_inval;
1904 status = nfsd4_check_cb_sec(&cr_ses->cb_sec);
1905 if (status)
1906 return status;
1857 status = check_forechannel_attrs(&cr_ses->fore_channel, nn); 1907 status = check_forechannel_attrs(&cr_ses->fore_channel, nn);
1858 if (status) 1908 if (status)
1859 return status; 1909 return status;
@@ -1874,6 +1924,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1874 WARN_ON_ONCE(conf && unconf); 1924 WARN_ON_ONCE(conf && unconf);
1875 1925
1876 if (conf) { 1926 if (conf) {
1927 status = nfserr_wrong_cred;
1928 if (!mach_creds_match(conf, rqstp))
1929 goto out_free_conn;
1877 cs_slot = &conf->cl_cs_slot; 1930 cs_slot = &conf->cl_cs_slot;
1878 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1931 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1879 if (status == nfserr_replay_cache) { 1932 if (status == nfserr_replay_cache) {
@@ -1890,6 +1943,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1890 status = nfserr_clid_inuse; 1943 status = nfserr_clid_inuse;
1891 goto out_free_conn; 1944 goto out_free_conn;
1892 } 1945 }
1946 status = nfserr_wrong_cred;
1947 if (!mach_creds_match(unconf, rqstp))
1948 goto out_free_conn;
1893 cs_slot = &unconf->cl_cs_slot; 1949 cs_slot = &unconf->cl_cs_slot;
1894 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); 1950 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1895 if (status) { 1951 if (status) {
@@ -1957,7 +2013,11 @@ __be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state
1957{ 2013{
1958 struct nfsd4_session *session = cstate->session; 2014 struct nfsd4_session *session = cstate->session;
1959 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 2015 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2016 __be32 status;
1960 2017
2018 status = nfsd4_check_cb_sec(&bc->bc_cb_sec);
2019 if (status)
2020 return status;
1961 spin_lock(&nn->client_lock); 2021 spin_lock(&nn->client_lock);
1962 session->se_cb_prog = bc->bc_cb_program; 2022 session->se_cb_prog = bc->bc_cb_program;
1963 session->se_cb_sec = bc->bc_cb_sec; 2023 session->se_cb_sec = bc->bc_cb_sec;
@@ -1986,6 +2046,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1986 status = nfserr_badsession; 2046 status = nfserr_badsession;
1987 if (!session) 2047 if (!session)
1988 goto out; 2048 goto out;
2049 status = nfserr_wrong_cred;
2050 if (!mach_creds_match(session->se_client, rqstp))
2051 goto out;
1989 status = nfsd4_map_bcts_dir(&bcts->dir); 2052 status = nfsd4_map_bcts_dir(&bcts->dir);
1990 if (status) 2053 if (status)
1991 goto out; 2054 goto out;
@@ -2014,6 +2077,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
2014{ 2077{
2015 struct nfsd4_session *ses; 2078 struct nfsd4_session *ses;
2016 __be32 status; 2079 __be32 status;
2080 int ref_held_by_me = 0;
2017 struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id); 2081 struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
2018 2082
2019 nfs4_lock_state(); 2083 nfs4_lock_state();
@@ -2021,6 +2085,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
2021 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { 2085 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
2022 if (!nfsd4_last_compound_op(r)) 2086 if (!nfsd4_last_compound_op(r))
2023 goto out; 2087 goto out;
2088 ref_held_by_me++;
2024 } 2089 }
2025 dump_sessionid(__func__, &sessionid->sessionid); 2090 dump_sessionid(__func__, &sessionid->sessionid);
2026 spin_lock(&nn->client_lock); 2091 spin_lock(&nn->client_lock);
@@ -2028,17 +2093,22 @@ nfsd4_destroy_session(struct svc_rqst *r,
2028 status = nfserr_badsession; 2093 status = nfserr_badsession;
2029 if (!ses) 2094 if (!ses)
2030 goto out_client_lock; 2095 goto out_client_lock;
2031 status = mark_session_dead_locked(ses); 2096 status = nfserr_wrong_cred;
2032 if (status) 2097 if (!mach_creds_match(ses->se_client, r))
2033 goto out_client_lock; 2098 goto out_client_lock;
2099 nfsd4_get_session_locked(ses);
2100 status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
2101 if (status)
2102 goto out_put_session;
2034 unhash_session(ses); 2103 unhash_session(ses);
2035 spin_unlock(&nn->client_lock); 2104 spin_unlock(&nn->client_lock);
2036 2105
2037 nfsd4_probe_callback_sync(ses->se_client); 2106 nfsd4_probe_callback_sync(ses->se_client);
2038 2107
2039 spin_lock(&nn->client_lock); 2108 spin_lock(&nn->client_lock);
2040 free_session(ses);
2041 status = nfs_ok; 2109 status = nfs_ok;
2110out_put_session:
2111 nfsd4_put_session(ses);
2042out_client_lock: 2112out_client_lock:
2043 spin_unlock(&nn->client_lock); 2113 spin_unlock(&nn->client_lock);
2044out: 2114out:
@@ -2058,26 +2128,31 @@ static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_s
2058 return NULL; 2128 return NULL;
2059} 2129}
2060 2130
2061static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses) 2131static __be32 nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_session *ses)
2062{ 2132{
2063 struct nfs4_client *clp = ses->se_client; 2133 struct nfs4_client *clp = ses->se_client;
2064 struct nfsd4_conn *c; 2134 struct nfsd4_conn *c;
2135 __be32 status = nfs_ok;
2065 int ret; 2136 int ret;
2066 2137
2067 spin_lock(&clp->cl_lock); 2138 spin_lock(&clp->cl_lock);
2068 c = __nfsd4_find_conn(new->cn_xprt, ses); 2139 c = __nfsd4_find_conn(new->cn_xprt, ses);
2069 if (c) { 2140 if (c)
2070 spin_unlock(&clp->cl_lock); 2141 goto out_free;
2071 free_conn(new); 2142 status = nfserr_conn_not_bound_to_session;
2072 return; 2143 if (clp->cl_mach_cred)
2073 } 2144 goto out_free;
2074 __nfsd4_hash_conn(new, ses); 2145 __nfsd4_hash_conn(new, ses);
2075 spin_unlock(&clp->cl_lock); 2146 spin_unlock(&clp->cl_lock);
2076 ret = nfsd4_register_conn(new); 2147 ret = nfsd4_register_conn(new);
2077 if (ret) 2148 if (ret)
2078 /* oops; xprt is already down: */ 2149 /* oops; xprt is already down: */
2079 nfsd4_conn_lost(&new->cn_xpt_user); 2150 nfsd4_conn_lost(&new->cn_xpt_user);
2080 return; 2151 return nfs_ok;
2152out_free:
2153 spin_unlock(&clp->cl_lock);
2154 free_conn(new);
2155 return status;
2081} 2156}
2082 2157
2083static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session) 2158static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
@@ -2169,8 +2244,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2169 if (status) 2244 if (status)
2170 goto out_put_session; 2245 goto out_put_session;
2171 2246
2172 nfsd4_sequence_check_conn(conn, session); 2247 status = nfsd4_sequence_check_conn(conn, session);
2173 conn = NULL; 2248 conn = NULL;
2249 if (status)
2250 goto out_put_session;
2174 2251
2175 /* Success! bump slot seqid */ 2252 /* Success! bump slot seqid */
2176 slot->sl_seqid = seq->seqid; 2253 slot->sl_seqid = seq->seqid;
@@ -2232,7 +2309,10 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2232 status = nfserr_stale_clientid; 2309 status = nfserr_stale_clientid;
2233 goto out; 2310 goto out;
2234 } 2311 }
2235 2312 if (!mach_creds_match(clp, rqstp)) {
2313 status = nfserr_wrong_cred;
2314 goto out;
2315 }
2236 expire_client(clp); 2316 expire_client(clp);
2237out: 2317out:
2238 nfs4_unlock_state(); 2318 nfs4_unlock_state();
@@ -2645,13 +2725,13 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
2645 2725
2646 list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); 2726 list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
2647 2727
2648 /* only place dl_time is set. protected by lock_flocks*/ 2728 /* Only place dl_time is set; protected by i_lock: */
2649 dp->dl_time = get_seconds(); 2729 dp->dl_time = get_seconds();
2650 2730
2651 nfsd4_cb_recall(dp); 2731 nfsd4_cb_recall(dp);
2652} 2732}
2653 2733
2654/* Called from break_lease() with lock_flocks() held. */ 2734/* Called from break_lease() with i_lock held. */
2655static void nfsd_break_deleg_cb(struct file_lock *fl) 2735static void nfsd_break_deleg_cb(struct file_lock *fl)
2656{ 2736{
2657 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; 2737 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
@@ -2940,13 +3020,13 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f
2940 return fl; 3020 return fl;
2941} 3021}
2942 3022
2943static int nfs4_setlease(struct nfs4_delegation *dp, int flag) 3023static int nfs4_setlease(struct nfs4_delegation *dp)
2944{ 3024{
2945 struct nfs4_file *fp = dp->dl_file; 3025 struct nfs4_file *fp = dp->dl_file;
2946 struct file_lock *fl; 3026 struct file_lock *fl;
2947 int status; 3027 int status;
2948 3028
2949 fl = nfs4_alloc_init_lease(dp, flag); 3029 fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
2950 if (!fl) 3030 if (!fl)
2951 return -ENOMEM; 3031 return -ENOMEM;
2952 fl->fl_file = find_readable_file(fp); 3032 fl->fl_file = find_readable_file(fp);
@@ -2964,12 +3044,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
2964 return 0; 3044 return 0;
2965} 3045}
2966 3046
2967static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag) 3047static int nfs4_set_delegation(struct nfs4_delegation *dp)
2968{ 3048{
2969 struct nfs4_file *fp = dp->dl_file; 3049 struct nfs4_file *fp = dp->dl_file;
2970 3050
2971 if (!fp->fi_lease) 3051 if (!fp->fi_lease)
2972 return nfs4_setlease(dp, flag); 3052 return nfs4_setlease(dp);
2973 spin_lock(&recall_lock); 3053 spin_lock(&recall_lock);
2974 if (fp->fi_had_conflict) { 3054 if (fp->fi_had_conflict) {
2975 spin_unlock(&recall_lock); 3055 spin_unlock(&recall_lock);
@@ -3005,6 +3085,9 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
3005 3085
3006/* 3086/*
3007 * Attempt to hand out a delegation. 3087 * Attempt to hand out a delegation.
3088 *
3089 * Note we don't support write delegations, and won't until the vfs has
3090 * proper support for them.
3008 */ 3091 */
3009static void 3092static void
3010nfs4_open_delegation(struct net *net, struct svc_fh *fh, 3093nfs4_open_delegation(struct net *net, struct svc_fh *fh,
@@ -3013,39 +3096,45 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3013 struct nfs4_delegation *dp; 3096 struct nfs4_delegation *dp;
3014 struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); 3097 struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner);
3015 int cb_up; 3098 int cb_up;
3016 int status = 0, flag = 0; 3099 int status = 0;
3017 3100
3018 cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); 3101 cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
3019 flag = NFS4_OPEN_DELEGATE_NONE;
3020 open->op_recall = 0; 3102 open->op_recall = 0;
3021 switch (open->op_claim_type) { 3103 switch (open->op_claim_type) {
3022 case NFS4_OPEN_CLAIM_PREVIOUS: 3104 case NFS4_OPEN_CLAIM_PREVIOUS:
3023 if (!cb_up) 3105 if (!cb_up)
3024 open->op_recall = 1; 3106 open->op_recall = 1;
3025 flag = open->op_delegate_type; 3107 if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ)
3026 if (flag == NFS4_OPEN_DELEGATE_NONE) 3108 goto out_no_deleg;
3027 goto out;
3028 break; 3109 break;
3029 case NFS4_OPEN_CLAIM_NULL: 3110 case NFS4_OPEN_CLAIM_NULL:
3030 /* Let's not give out any delegations till everyone's 3111 /*
3031 * had the chance to reclaim theirs.... */ 3112 * Let's not give out any delegations till everyone's
3113 * had the chance to reclaim theirs....
3114 */
3032 if (locks_in_grace(net)) 3115 if (locks_in_grace(net))
3033 goto out; 3116 goto out_no_deleg;
3034 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) 3117 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
3035 goto out; 3118 goto out_no_deleg;
3119 /*
3120 * Also, if the file was opened for write or
3121 * create, there's a good chance the client's
3122 * about to write to it, resulting in an
3123 * immediate recall (since we don't support
3124 * write delegations):
3125 */
3036 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) 3126 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
3037 flag = NFS4_OPEN_DELEGATE_WRITE; 3127 goto out_no_deleg;
3038 else 3128 if (open->op_create == NFS4_OPEN_CREATE)
3039 flag = NFS4_OPEN_DELEGATE_READ; 3129 goto out_no_deleg;
3040 break; 3130 break;
3041 default: 3131 default:
3042 goto out; 3132 goto out_no_deleg;
3043 } 3133 }
3044 3134 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh);
3045 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh, flag);
3046 if (dp == NULL) 3135 if (dp == NULL)
3047 goto out_no_deleg; 3136 goto out_no_deleg;
3048 status = nfs4_set_delegation(dp, flag); 3137 status = nfs4_set_delegation(dp);
3049 if (status) 3138 if (status)
3050 goto out_free; 3139 goto out_free;
3051 3140
@@ -3053,24 +3142,23 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3053 3142
3054 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", 3143 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
3055 STATEID_VAL(&dp->dl_stid.sc_stateid)); 3144 STATEID_VAL(&dp->dl_stid.sc_stateid));
3056out: 3145 open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
3057 open->op_delegate_type = flag;
3058 if (flag == NFS4_OPEN_DELEGATE_NONE) {
3059 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
3060 open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
3061 dprintk("NFSD: WARNING: refusing delegation reclaim\n");
3062
3063 /* 4.1 client asking for a delegation? */
3064 if (open->op_deleg_want)
3065 nfsd4_open_deleg_none_ext(open, status);
3066 }
3067 return; 3146 return;
3068out_free: 3147out_free:
3069 unhash_stid(&dp->dl_stid); 3148 unhash_stid(&dp->dl_stid);
3070 nfs4_put_delegation(dp); 3149 nfs4_put_delegation(dp);
3071out_no_deleg: 3150out_no_deleg:
3072 flag = NFS4_OPEN_DELEGATE_NONE; 3151 open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
3073 goto out; 3152 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
3153 open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) {
3154 dprintk("NFSD: WARNING: refusing delegation reclaim\n");
3155 open->op_recall = 1;
3156 }
3157
3158 /* 4.1 client asking for a delegation? */
3159 if (open->op_deleg_want)
3160 nfsd4_open_deleg_none_ext(open, status);
3161 return;
3074} 3162}
3075 3163
3076static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, 3164static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
@@ -3427,7 +3515,7 @@ grace_disallows_io(struct net *net, struct inode *inode)
3427/* Returns true iff a is later than b: */ 3515/* Returns true iff a is later than b: */
3428static bool stateid_generation_after(stateid_t *a, stateid_t *b) 3516static bool stateid_generation_after(stateid_t *a, stateid_t *b)
3429{ 3517{
3430 return (s32)a->si_generation - (s32)b->si_generation > 0; 3518 return (s32)(a->si_generation - b->si_generation) > 0;
3431} 3519}
3432 3520
3433static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) 3521static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
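
Note on the stateid_generation_after() change above: subtracting the two u32 generations first is well defined (unsigned arithmetic wraps modulo 2^32), and casting the 32-bit difference to s32 gives a serial-number comparison in the style of RFC 1982. Casting each operand to s32 before subtracting, as the old code did, can overflow a signed int, which is undefined behaviour. A worked example around the wrap point (the generation values are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t a = 0x80000001;	/* "later", just past the wrap */
	uint32_t b = 0x7fffffff;	/* "earlier" */

	/* Fixed form: a - b wraps to 2, the signed view is positive,
	 * so a is later. Prints 1. */
	printf("%d\n", (int32_t)(a - b) > 0);

	/* Old form, (int32_t)a - (int32_t)b, would evaluate roughly
	 * INT32_MIN - INT32_MAX: signed overflow, undefined behaviour. */
	return 0;
}
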
@@ -4435,7 +4523,6 @@ __be32
4435nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 4523nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4436 struct nfsd4_locku *locku) 4524 struct nfsd4_locku *locku)
4437{ 4525{
4438 struct nfs4_lockowner *lo;
4439 struct nfs4_ol_stateid *stp; 4526 struct nfs4_ol_stateid *stp;
4440 struct file *filp = NULL; 4527 struct file *filp = NULL;
4441 struct file_lock *file_lock = NULL; 4528 struct file_lock *file_lock = NULL;
@@ -4468,10 +4555,9 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4468 status = nfserr_jukebox; 4555 status = nfserr_jukebox;
4469 goto out; 4556 goto out;
4470 } 4557 }
4471 lo = lockowner(stp->st_stateowner);
4472 locks_init_lock(file_lock); 4558 locks_init_lock(file_lock);
4473 file_lock->fl_type = F_UNLCK; 4559 file_lock->fl_type = F_UNLCK;
4474 file_lock->fl_owner = (fl_owner_t)lo; 4560 file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
4475 file_lock->fl_pid = current->tgid; 4561 file_lock->fl_pid = current->tgid;
4476 file_lock->fl_file = filp; 4562 file_lock->fl_file = filp;
4477 file_lock->fl_flags = FL_POSIX; 4563 file_lock->fl_flags = FL_POSIX;
@@ -4490,11 +4576,6 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4490 update_stateid(&stp->st_stid.sc_stateid); 4576 update_stateid(&stp->st_stid.sc_stateid);
4491 memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4577 memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4492 4578
4493 if (nfsd4_has_session(cstate) && !check_for_locks(stp->st_file, lo)) {
4494 WARN_ON_ONCE(cstate->replay_owner);
4495 release_lockowner(lo);
4496 }
4497
4498out: 4579out:
4499 nfsd4_bump_seqid(cstate, status); 4580 nfsd4_bump_seqid(cstate, status);
4500 if (!cstate->replay_owner) 4581 if (!cstate->replay_owner)
@@ -4520,7 +4601,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
4520 struct inode *inode = filp->fi_inode; 4601 struct inode *inode = filp->fi_inode;
4521 int status = 0; 4602 int status = 0;
4522 4603
4523 lock_flocks(); 4604 spin_lock(&inode->i_lock);
4524 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 4605 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
4525 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 4606 if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
4526 status = 1; 4607 status = 1;
@@ -4528,7 +4609,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
4528 } 4609 }
4529 } 4610 }
4530out: 4611out:
4531 unlock_flocks(); 4612 spin_unlock(&inode->i_lock);
4532 return status; 4613 return status;
4533} 4614}
4534 4615
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 6cd86e0fe450..c2a4701d7286 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -55,6 +55,11 @@
55#include "cache.h" 55#include "cache.h"
56#include "netns.h" 56#include "netns.h"
57 57
58#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
59#include <linux/security.h>
60#endif
61
62
58#define NFSDDBG_FACILITY NFSDDBG_XDR 63#define NFSDDBG_FACILITY NFSDDBG_XDR
59 64
60/* 65/*
@@ -134,6 +139,19 @@ xdr_error: \
134 } \ 139 } \
135} while (0) 140} while (0)
136 141
142static void next_decode_page(struct nfsd4_compoundargs *argp)
143{
144 argp->pagelist++;
145 argp->p = page_address(argp->pagelist[0]);
146 if (argp->pagelen < PAGE_SIZE) {
147 argp->end = argp->p + (argp->pagelen>>2);
148 argp->pagelen = 0;
149 } else {
150 argp->end = argp->p + (PAGE_SIZE>>2);
151 argp->pagelen -= PAGE_SIZE;
152 }
153}
154
137static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) 155static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
138{ 156{
139 /* We want more bytes than seem to be available. 157 /* We want more bytes than seem to be available.
@@ -161,16 +179,7 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
161 * guarantee p points to at least nbytes bytes. 179 * guarantee p points to at least nbytes bytes.
162 */ 180 */
163 memcpy(p, argp->p, avail); 181 memcpy(p, argp->p, avail);
164 /* step to next page */ 182 next_decode_page(argp);
165 argp->p = page_address(argp->pagelist[0]);
166 argp->pagelist++;
167 if (argp->pagelen < PAGE_SIZE) {
168 argp->end = argp->p + (argp->pagelen>>2);
169 argp->pagelen = 0;
170 } else {
171 argp->end = argp->p + (PAGE_SIZE>>2);
172 argp->pagelen -= PAGE_SIZE;
173 }
174 memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); 183 memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
175 argp->p += XDR_QUADLEN(nbytes - avail); 184 argp->p += XDR_QUADLEN(nbytes - avail);
176 return p; 185 return p;
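
Note: the buffers in next_decode_page() are addressed in 32-bit XDR words, hence the >>2 shifts: pagelen>>2 converts the remaining byte count to words, and XDR_QUADLEN() rounds a byte count up to whole words. A worked example of both conversions (PAGE_SIZE assumed to be 4096):

#include <stdio.h>

#define XDR_QUADLEN(n)	(((n) + 3) >> 2)	/* bytes, rounded up to words */

int main(void)
{
	unsigned int page_size = 4096;

	printf("%u\n", page_size >> 2);		/* 1024 words per full page */
	printf("%u\n", XDR_QUADLEN(10));	/* 10 bytes occupy 3 words */
	return 0;
}
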
@@ -242,7 +251,8 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
242 251
243static __be32 252static __be32
244nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, 253nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
245 struct iattr *iattr, struct nfs4_acl **acl) 254 struct iattr *iattr, struct nfs4_acl **acl,
255 struct xdr_netobj *label)
246{ 256{
247 int expected_len, len = 0; 257 int expected_len, len = 0;
248 u32 dummy32; 258 u32 dummy32;
@@ -380,6 +390,32 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
380 goto xdr_error; 390 goto xdr_error;
381 } 391 }
382 } 392 }
393
394 label->len = 0;
395#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
396 if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
397 READ_BUF(4);
398 len += 4;
399 READ32(dummy32); /* lfs: we don't use it */
400 READ_BUF(4);
401 len += 4;
402 READ32(dummy32); /* pi: we don't use it either */
403 READ_BUF(4);
404 len += 4;
405 READ32(dummy32);
406 READ_BUF(dummy32);
407 if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN)
408 return nfserr_badlabel;
409 len += (XDR_QUADLEN(dummy32) << 2);
410 READMEM(buf, dummy32);
411 label->data = kzalloc(dummy32 + 1, GFP_KERNEL);
412 if (!label->data)
413 return nfserr_jukebox;
414 defer_free(argp, kfree, label->data);
415 memcpy(label->data, buf, dummy32);
416 }
417#endif
418
383 if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 419 if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
384 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 420 || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
385 || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) 421 || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2)
@@ -428,7 +464,11 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
428 /* callback_sec_params4 */ 464 /* callback_sec_params4 */
429 READ_BUF(4); 465 READ_BUF(4);
430 READ32(nr_secflavs); 466 READ32(nr_secflavs);
431 cbs->flavor = (u32)(-1); 467 if (nr_secflavs)
468 cbs->flavor = (u32)(-1);
469 else
470 /* Is this legal? Be generous, take it to mean AUTH_NONE: */
471 cbs->flavor = 0;
432 for (i = 0; i < nr_secflavs; ++i) { 472 for (i = 0; i < nr_secflavs; ++i) {
433 READ_BUF(4); 473 READ_BUF(4);
434 READ32(dummy); 474 READ32(dummy);
@@ -576,7 +616,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
576 return status; 616 return status;
577 617
578 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, 618 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
579 &create->cr_acl); 619 &create->cr_acl, &create->cr_label);
580 if (status) 620 if (status)
581 goto out; 621 goto out;
582 622
@@ -827,7 +867,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
827 case NFS4_CREATE_UNCHECKED: 867 case NFS4_CREATE_UNCHECKED:
828 case NFS4_CREATE_GUARDED: 868 case NFS4_CREATE_GUARDED:
829 status = nfsd4_decode_fattr(argp, open->op_bmval, 869 status = nfsd4_decode_fattr(argp, open->op_bmval,
830 &open->op_iattr, &open->op_acl); 870 &open->op_iattr, &open->op_acl, &open->op_label);
831 if (status) 871 if (status)
832 goto out; 872 goto out;
833 break; 873 break;
@@ -841,7 +881,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 		READ_BUF(NFS4_VERIFIER_SIZE);
 		COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
 		status = nfsd4_decode_fattr(argp, open->op_bmval,
-			&open->op_iattr, &open->op_acl);
+			&open->op_iattr, &open->op_acl, &open->op_label);
 		if (status)
 			goto out;
 		break;
@@ -1063,7 +1103,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
 	if (status)
 		return status;
 	return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
-				  &setattr->sa_acl);
+				  &setattr->sa_acl, &setattr->sa_label);
 }
 
 static __be32
@@ -1567,6 +1607,7 @@ struct nfsd4_minorversion_ops {
 static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
 	[0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
 	[1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
+	[2] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
 };
 
 static __be32
@@ -1953,6 +1994,36 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
 	      FATTR4_WORD0_RDATTR_ERROR)
 #define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
 
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+static inline __be32
+nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+{
+	__be32 *p = *pp;
+
+	if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4))
+		return nfserr_resource;
+
+	/*
+	 * For now we use a 0 here to indicate the null translation; in
+	 * the future we may place a call to translation code here.
+	 */
+	if ((*buflen -= 8) < 0)
+		return nfserr_resource;
+
+	WRITE32(0); /* lfs */
+	WRITE32(0); /* pi */
+	p = xdr_encode_opaque(p, context, len);
+	*buflen -= (XDR_QUADLEN(len) << 2) + 4;
+
+	*pp = p;
+	return 0;
+}
+#else
+static inline __be32
+nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+{ return 0; }
+#endif
+
 static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
 {
 	/* As per referral draft: */
@@ -2012,6 +2083,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	int err;
 	int aclsupport = 0;
 	struct nfs4_acl *acl = NULL;
+	void *context = NULL;
+	int contextlen;
+	bool contextsupport = false;
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	u32 minorversion = resp->cstate.minorversion;
 	struct path path = {
@@ -2065,6 +2139,21 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 		}
 	}
 
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+	if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+			bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
+		err = security_inode_getsecctx(dentry->d_inode,
+						&context, &contextlen);
+		contextsupport = (err == 0);
+		if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+			if (err == -EOPNOTSUPP)
+				bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+			else if (err)
+				goto out_nfserr;
+		}
+	}
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
+
 	if (bmval2) {
 		if ((buflen -= 16) < 0)
 			goto out_resource;
@@ -2093,6 +2182,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 
 	if (!aclsupport)
 		word0 &= ~FATTR4_WORD0_ACL;
+	if (!contextsupport)
+		word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
 	if (!word2) {
 		if ((buflen -= 12) < 0)
 			goto out_resource;
@@ -2400,6 +2491,12 @@ out_acl:
 		get_parent_attributes(exp, &stat);
 		WRITE64(stat.ino);
 	}
+	if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+		status = nfsd4_encode_security_label(rqstp, context,
+				contextlen, &p, &buflen);
+		if (status)
+			goto out;
+	}
 	if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
 		WRITE32(3);
 		WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
@@ -2412,6 +2509,10 @@ out_acl:
 	status = nfs_ok;
 
 out:
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+	if (context)
+		security_release_secctx(context, contextlen);
+#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
 	kfree(acl);
 	if (fhp == &tempfh)
 		fh_put(&tempfh);
@@ -3176,16 +3277,18 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 {
 	__be32 *p;
 
-	RESERVE_SPACE(12);
+	RESERVE_SPACE(16);
 	if (nfserr) {
-		WRITE32(2);
+		WRITE32(3);
+		WRITE32(0);
 		WRITE32(0);
 		WRITE32(0);
 	}
 	else {
-		WRITE32(2);
+		WRITE32(3);
 		WRITE32(setattr->sa_bmval[0]);
 		WRITE32(setattr->sa_bmval[1]);
+		WRITE32(setattr->sa_bmval[2]);
 	}
 	ADJUST_ARGS();
 	return nfserr;
@@ -3226,6 +3329,14 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
 	return nfserr;
 }
 
+static const u32 nfs4_minimal_spo_must_enforce[2] = {
+	[1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+	      1 << (OP_EXCHANGE_ID - 32) |
+	      1 << (OP_CREATE_SESSION - 32) |
+	      1 << (OP_DESTROY_SESSION - 32) |
+	      1 << (OP_DESTROY_CLIENTID - 32)
+};
+
 static __be32
 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
 		struct nfsd4_exchange_id *exid)
@@ -3249,7 +3360,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
 		8 /* eir_clientid */ +
 		4 /* eir_sequenceid */ +
 		4 /* eir_flags */ +
-		4 /* spr_how (SP4_NONE) */ +
+		4 /* spr_how */ +
+		8 /* spo_must_enforce, spo_must_allow */ +
 		8 /* so_minor_id */ +
 		4 /* so_major_id.len */ +
 		(XDR_QUADLEN(major_id_sz) * 4) +
@@ -3261,9 +3373,21 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
 	WRITE32(exid->seqid);
 	WRITE32(exid->flags);
 
-	/* state_protect4_r. Currently only support SP4_NONE */
-	BUG_ON(exid->spa_how != SP4_NONE);
 	WRITE32(exid->spa_how);
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_MACH_CRED:
+		/* spo_must_enforce bitmap: */
+		WRITE32(2);
+		WRITE32(nfs4_minimal_spo_must_enforce[0]);
+		WRITE32(nfs4_minimal_spo_must_enforce[1]);
+		/* empty spo_must_allow bitmap: */
+		WRITE32(0);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+	}
 
 	/* The server_owner struct */
 	WRITE64(minor_id);	/* Minor id */
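The spo_must_enforce words encoded above follow bitmap4 conventions: word i carries operations 32*i through 32*i+31, which is why every shift in nfs4_minimal_spo_must_enforce is written as the op number minus 32. A standalone check of the resulting words, assuming the RFC 5661 operation numbers (41 through 44, and 57):

	#include <stdio.h>

	int main(void)
	{
		/* RFC 5661 op numbers used by the patch (assumed here):
		 * BIND_CONN_TO_SESSION=41, EXCHANGE_ID=42, CREATE_SESSION=43,
		 * DESTROY_SESSION=44, DESTROY_CLIENTID=57. */
		unsigned ops[] = { 41, 42, 43, 44, 57 };
		unsigned i, words[2] = { 0, 0 };

		for (i = 0; i < sizeof(ops) / sizeof(ops[0]); i++)
			words[ops[i] / 32] |= 1u << (ops[i] % 32);

		/* word 0 stays 0; word 1 gets bits 9-12 and 25: 0x2001e00 */
		printf("word0=%#x word1=%#x\n", words[0], words[1]);
		return 0;
	}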
@@ -3635,13 +3759,17 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 	iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
 	BUG_ON(iov->iov_len > PAGE_SIZE);
 	if (nfsd4_has_session(cs)) {
+		struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+		struct nfs4_client *clp = cs->session->se_client;
 		if (cs->status != nfserr_replay_cache) {
 			nfsd4_store_cache_entry(resp);
 			cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
 		}
 		/* Renew the clientid on success and on replay */
-		put_client_renew(cs->session->se_client);
+		spin_lock(&nn->client_lock);
 		nfsd4_put_session(cs->session);
+		spin_unlock(&nn->client_lock);
+		put_client_renew(clp);
 	}
 	return 1;
 }
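The reordering in that last hunk is subtle: the client pointer is saved before nfsd4_put_session() can drop the final session reference, and the put itself now happens under the per-net client_lock. A schematic of the resulting shape (not standalone code; nn, cs and the helpers are the kernel's own):

	/* Sketch only: why clp is cached up front. */
	struct nfs4_client *clp = cs->session->se_client;

	spin_lock(&nn->client_lock);
	nfsd4_put_session(cs->session);	/* may free the session */
	spin_unlock(&nn->client_lock);
	put_client_renew(clp);		/* safe: clp was cached above */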
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 07a473fd49bc..30f34ab02137 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -24,7 +24,7 @@
 /*
  * nfsd version
  */
-#define NFSD_SUPPORTED_MINOR_VERSION	1
+#define NFSD_SUPPORTED_MINOR_VERSION	2
 /*
  * Maximum blocksizes supported by daemon under various circumstances.
  */
@@ -53,7 +53,6 @@ struct readdir_cd {
 extern struct svc_program	nfsd_program;
 extern struct svc_version	nfsd_version2, nfsd_version3,
 				nfsd_version4;
-extern u32			nfsd_supported_minorversion;
 extern struct mutex		nfsd_mutex;
 extern spinlock_t		nfsd_drc_lock;
 extern unsigned long		nfsd_drc_max_mem;
@@ -243,6 +242,12 @@ void nfsd_lockd_shutdown(void);
 #define	nfserr_reject_deleg		cpu_to_be32(NFS4ERR_REJECT_DELEG)
 #define	nfserr_returnconflict		cpu_to_be32(NFS4ERR_RETURNCONFLICT)
 #define	nfserr_deleg_revoked		cpu_to_be32(NFS4ERR_DELEG_REVOKED)
+#define	nfserr_partner_notsupp		cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
+#define	nfserr_partner_no_auth		cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
+#define	nfserr_metadata_notsupp		cpu_to_be32(NFS4ERR_METADATA_NOTSUPP)
+#define	nfserr_offload_denied		cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
+#define	nfserr_wrong_lfs		cpu_to_be32(NFS4ERR_WRONG_LFS)
+#define	nfserr_badlabel			cpu_to_be32(NFS4ERR_BADLABEL)
 
 /* error codes for internal use */
 /* if a request fails due to kmalloc failure, it gets dropped.
@@ -322,6 +327,13 @@ void nfsd_lockd_shutdown(void);
 #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
 	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
 
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
+	(NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL)
+#else
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 0
+#endif
+
 static inline u32 nfsd_suppattrs0(u32 minorversion)
 {
 	return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
@@ -336,8 +348,11 @@ static inline u32 nfsd_suppattrs1(u32 minorversion)
 
 static inline u32 nfsd_suppattrs2(u32 minorversion)
 {
-	return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2
-			    : NFSD4_SUPPORTED_ATTRS_WORD2;
+	switch (minorversion) {
+	default: return NFSD4_2_SUPPORTED_ATTRS_WORD2;
+	case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2;
+	case 0: return NFSD4_SUPPORTED_ATTRS_WORD2;
+	}
 }
 
 /* These will return ERR_INVAL if specified in GETATTR or READDIR. */
@@ -350,7 +365,11 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
 #define NFSD_WRITEABLE_ATTRS_WORD1 \
 	(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
 	| FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL
+#else
 #define NFSD_WRITEABLE_ATTRS_WORD2 0
+#endif
 
 #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
 	NFSD_WRITEABLE_ATTRS_WORD0
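Taken together, the nfsd.h hunks make both the supported and the writeable attribute word 2 a function of minor version and config, and the default case means 4.2 semantics also apply to any future minor version. A toy model of that dispatch, with placeholder values standing in for the real FATTR4 bits:

	#include <stdio.h>

	/* Stand-in values, not the kernel's: */
	#define ATTRS2_40 0x0
	#define ATTRS2_41 0x1	/* + SUPPATTR_EXCLCREAT */
	#define ATTRS2_42 0x3	/* + SECURITY_LABEL when configured */

	static unsigned suppattrs2(unsigned minorversion)
	{
		switch (minorversion) {
		default: return ATTRS2_42;	/* 4.2 and anything newer */
		case 1:  return ATTRS2_41;
		case 0:  return ATTRS2_40;
		}
	}

	int main(void)
	{
		unsigned mv;

		for (mv = 0; mv <= 3; mv++)
			printf("v4.%u -> %#x\n", mv, suppattrs2(mv));
		return 0;
	}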
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 262df5ccbf59..760c85a6f534 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -116,7 +116,10 @@ struct svc_program nfsd_program = {
 
 };
 
-u32 nfsd_supported_minorversion;
+static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
+	[0] = 1,
+	[1] = 1,
+};
 
 int nfsd_vers(int vers, enum vers_op change)
 {
@@ -151,15 +154,13 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
 		return -1;
 	switch(change) {
 	case NFSD_SET:
-		nfsd_supported_minorversion = minorversion;
+		nfsd_supported_minorversions[minorversion] = true;
 		break;
 	case NFSD_CLEAR:
-		if (minorversion == 0)
-			return -1;
-		nfsd_supported_minorversion = minorversion - 1;
+		nfsd_supported_minorversions[minorversion] = false;
 		break;
 	case NFSD_TEST:
-		return minorversion <= nfsd_supported_minorversion;
+		return nfsd_supported_minorversions[minorversion];
 	case NFSD_AVAIL:
 		return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
 	}
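The switch from a single "highest supported" integer to a bool array changes the semantics: minor versions can now be toggled independently, and clearing 4.1 no longer implies anything about 4.0 or 4.2. A small standalone demonstration of the new behaviour:

	#include <stdbool.h>
	#include <stdio.h>

	#define MAXMINOR 2
	static bool supported[MAXMINOR + 1] = { true, true, false };

	int main(void)
	{
		int v;

		supported[1] = false;	/* the NFSD_CLEAR case, on 4.1 only */
		supported[2] = true;	/* the NFSD_SET case, on 4.2 only */
		for (v = 0; v <= MAXMINOR; v++)
			printf("4.%d: %s\n", v, supported[v] ? "on" : "off");
		return 0;
	}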
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 274e2a114e05..424d8f5f2317 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -246,6 +246,7 @@ struct nfs4_client {
 	nfs4_verifier		cl_verifier;	/* generated by client */
 	time_t			cl_time;	/* time of last lease renewal */
 	struct sockaddr_storage	cl_addr;	/* client ipaddress */
+	bool			cl_mach_cred;	/* SP4_MACH_CRED in force */
 	struct svc_cred		cl_cred;	/* setclientid principal */
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 84ce601d8063..c827acb0e943 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -28,6 +28,7 @@
 #include <asm/uaccess.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
+#include <linux/security.h>
 
 #ifdef CONFIG_NFSD_V3
 #include "xdr3.h"
@@ -621,6 +622,33 @@ int nfsd4_is_junction(struct dentry *dentry)
 		return 0;
 	return 1;
 }
+#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct xdr_netobj *label)
+{
+	__be32 error;
+	int host_error;
+	struct dentry *dentry;
+
+	error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
+	if (error)
+		return error;
+
+	dentry = fhp->fh_dentry;
+
+	mutex_lock(&dentry->d_inode->i_mutex);
+	host_error = security_inode_setsecctx(dentry, label->data, label->len);
+	mutex_unlock(&dentry->d_inode->i_mutex);
+	return nfserrno(host_error);
+}
+#else
+__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct xdr_netobj *label)
+{
+	return nfserr_notsupp;
+}
+#endif
+
 #endif /* defined(CONFIG_NFSD_V4) */
 
 #ifdef CONFIG_NFSD_V3
@@ -802,9 +830,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 		flags = O_WRONLY|O_LARGEFILE;
 	}
 	*filp = dentry_open(&path, flags, current_cred());
-	if (IS_ERR(*filp))
+	if (IS_ERR(*filp)) {
 		host_err = PTR_ERR(*filp);
-	else {
+		*filp = NULL;
+	} else {
 		host_err = ima_file_check(*filp, may_flags);
 
 		if (may_flags & NFSD_MAY_64BIT_COOKIE)
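The nfsd_open() hunk fixes an error-path contract: on failure, *filp used to be left holding the ERR_PTR cookie returned by dentry_open(), which later cleanup code could mistake for a valid file pointer. A simplified userspace illustration of the rule the fix establishes (IS_ERR/PTR_ERR are mocked here; the names are mine, not nfsd's):

	#include <stdio.h>

	#define MAX_ERRNO 4095
	#define IS_ERR(p)  ((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)
	#define PTR_ERR(p) ((long)(p))

	/* An out-parameter should be a valid pointer on success or NULL on
	 * error, never an error cookie. */
	static int open_thing(void **out)
	{
		void *p = (void *)(unsigned long)-13;	/* pretend ERR_PTR */

		if (IS_ERR(p)) {
			*out = NULL;	/* the fix: don't leak the cookie */
			return (int)PTR_ERR(p);
		}
		*out = p;
		return 0;
	}

	int main(void)
	{
		void *f;
		int err = open_thing(&f);

		printf("err=%d out=%p\n", err, f);	/* out is NULL, safe */
		return 0;
	}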
@@ -1912,6 +1941,7 @@ struct buffered_dirent {
 };
 
 struct readdir_data {
+	struct dir_context ctx;
 	char		*dirent;
 	size_t		used;
 	int		full;
@@ -1943,13 +1973,15 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen,
 static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
 				    struct readdir_cd *cdp, loff_t *offsetp)
 {
-	struct readdir_data buf;
 	struct buffered_dirent *de;
 	int host_err;
 	int size;
 	loff_t offset;
+	struct readdir_data buf = {
+		.ctx.actor = nfsd_buffered_filldir,
+		.dirent = (void *)__get_free_page(GFP_KERNEL)
+	};
 
-	buf.dirent = (void *)__get_free_page(GFP_KERNEL);
 	if (!buf.dirent)
 		return nfserrno(-ENOMEM);
 
@@ -1963,7 +1995,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func,
 		buf.used = 0;
 		buf.full = 0;
 
-		host_err = vfs_readdir(file, nfsd_buffered_filldir, &buf);
+		host_err = iterate_dir(file, &buf.ctx);
 		if (buf.full)
 			host_err = 0;
 
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 5b5894159f22..a4be2e389670 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -39,7 +39,6 @@
 typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
 
 /* nfsd/vfs.c */
-int		fh_lock_parent(struct svc_fh *, struct dentry *);
 int		nfsd_racache_init(int);
 void		nfsd_racache_shutdown(void);
 int		nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
@@ -56,6 +55,8 @@ int nfsd_mountpoint(struct dentry *, struct svc_export *);
 __be32		nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
 		    struct nfs4_acl *);
 int		nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
+__be32		nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
+		    struct xdr_netobj *);
 #endif /* CONFIG_NFSD_V4 */
 __be32		nfsd_create(struct svc_rqst *, struct svc_fh *,
 				char *name, int len, struct iattr *attrs,
@@ -92,17 +93,13 @@ __be32 nfsd_remove(struct svc_rqst *,
 				struct svc_fh *, char *, int);
 __be32		nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
 				char *name, int len);
-int		nfsd_truncate(struct svc_rqst *, struct svc_fh *,
-				unsigned long size);
 __be32		nfsd_readdir(struct svc_rqst *, struct svc_fh *,
 			     loff_t *, struct readdir_cd *, filldir_t);
 __be32		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 				struct kstatfs *, int access);
 
-int		nfsd_notify_change(struct inode *, struct iattr *);
 __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
 				struct dentry *, int);
-int		nfsd_sync_dir(struct dentry *dp);
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 3b271d2092b6..b3ed6446ed8e 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -40,6 +40,7 @@
 #include "state.h"
 #include "nfsd.h"
 
+#define NFSD4_MAX_SEC_LABEL_LEN	2048
 #define NFSD4_MAX_TAGLEN	128
 #define XDR_LEN(n)		(((n) + 3) & ~3)
 
@@ -118,6 +119,7 @@ struct nfsd4_create {
 	struct iattr	cr_iattr;           /* request */
 	struct nfsd4_change_info  cr_cinfo; /* response */
 	struct nfs4_acl *cr_acl;
+	struct xdr_netobj cr_label;
 };
 #define cr_linklen	u.link.namelen
 #define cr_linkname	u.link.name
@@ -246,6 +248,7 @@ struct nfsd4_open {
 	struct nfs4_file *op_file;          /* used during processing */
 	struct nfs4_ol_stateid *op_stp;	    /* used during processing */
 	struct nfs4_acl *op_acl;
+	struct xdr_netobj op_label;
 };
 #define op_iattr	iattr
 
@@ -330,6 +333,7 @@ struct nfsd4_setattr {
 	u32		sa_bmval[3];        /* request */
 	struct iattr	sa_iattr;           /* request */
 	struct nfs4_acl *sa_acl;
+	struct xdr_netobj sa_label;
 };
 
 struct nfsd4_setclientid {
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index eed4d7b26249..741fd02e0444 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -398,6 +398,69 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
 }
 
 /**
+ * nilfs_palloc_count_desc_blocks - count descriptor blocks number
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: descriptor blocks number [out]
+ */
+static int nilfs_palloc_count_desc_blocks(struct inode *inode,
+					    unsigned long *desc_blocks)
+{
+	unsigned long blknum;
+	int ret;
+
+	ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum);
+	if (likely(!ret))
+		*desc_blocks = DIV_ROUND_UP(
+			blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block);
+	return ret;
+}
+
+/**
+ * nilfs_palloc_mdt_file_can_grow - check potential opportunity for
+ *					MDT file growing
+ * @inode: inode of metadata file using this allocator
+ * @desc_blocks: known current descriptor blocks count
+ */
+static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode,
+						    unsigned long desc_blocks)
+{
+	return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) <
+			nilfs_palloc_groups_count(inode);
+}
+
+/**
+ * nilfs_palloc_count_max_entries - count max number of entries that can be
+ *					described by descriptor blocks count
+ * @inode: inode of metadata file using this allocator
+ * @nused: current number of used entries
+ * @nmaxp: max number of entries [out]
+ */
+int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
+{
+	unsigned long desc_blocks = 0;
+	u64 entries_per_desc_block, nmax;
+	int err;
+
+	err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks);
+	if (unlikely(err))
+		return err;
+
+	entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) *
+				nilfs_palloc_groups_per_desc_block(inode);
+	nmax = entries_per_desc_block * desc_blocks;
+
+	if (nused == nmax &&
+			nilfs_palloc_mdt_file_can_grow(inode, desc_blocks))
+		nmax += entries_per_desc_block;
+
+	if (nused > nmax)
+		return -ERANGE;
+
+	*nmaxp = nmax;
+	return 0;
+}
+
+/**
  * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
  * @inode: inode of metadata file using this allocator
  * @req: nilfs_palloc_req structure exchanged for the allocation
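The arithmetic in nilfs_palloc_count_max_entries() is easiest to see with concrete numbers. A standalone example with made-up geometry (the real values come from the on-disk layout, so these are assumptions for illustration only):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t entries_per_group = 16384;	/* assumed */
		uint64_t groups_per_desc   = 32;	/* assumed */
		uint64_t desc_blocks       = 4;
		uint64_t nused             = 2097152;	/* exactly nmax */

		uint64_t per_desc = entries_per_group * groups_per_desc;
		uint64_t nmax = per_desc * desc_blocks;

		/* If every entry is used and the MDT file can still grow,
		 * count one more descriptor block's worth of entries. */
		if (nused == nmax /* && file can grow */)
			nmax += per_desc;

		printf("per_desc=%llu nmax=%llu\n",
		       (unsigned long long)per_desc,
		       (unsigned long long)nmax);
		return 0;
	}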
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index fb7238100548..4bd6451b5703 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -48,6 +48,8 @@ int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
 void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
 				   const struct buffer_head *, void *);
 
+int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *);
+
 /**
  * nilfs_palloc_req - persistent allocator request and reply
  * @pr_entry_nr: entry number (vblocknr or inode number)
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index f30b017740a7..197a63e9d102 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -256,22 +256,18 @@ static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
 	de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
 }
 
-static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int nilfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	loff_t pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	loff_t pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
 	unsigned int offset = pos & ~PAGE_CACHE_MASK;
 	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 /* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
-	unsigned char *types = NULL;
-	int ret;
 
 	if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
-		goto success;
-
-	types = nilfs_filetype_table;
+		return 0;
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
@@ -281,9 +277,8 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (IS_ERR(page)) {
 			nilfs_error(sb, __func__, "bad page in #%lu",
 				    inode->i_ino);
-			filp->f_pos += PAGE_CACHE_SIZE - offset;
-			ret = -EIO;
-			goto done;
+			ctx->pos += PAGE_CACHE_SIZE - offset;
+			return -EIO;
 		}
 		kaddr = page_address(page);
 		de = (struct nilfs_dir_entry *)(kaddr + offset);
@@ -293,35 +288,28 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			if (de->rec_len == 0) {
 				nilfs_error(sb, __func__,
 					    "zero-length directory entry");
-				ret = -EIO;
 				nilfs_put_page(page);
-				goto done;
+				return -EIO;
 			}
 			if (de->inode) {
-				int over;
-				unsigned char d_type = DT_UNKNOWN;
+				unsigned char t;
 
-				if (types && de->file_type < NILFS_FT_MAX)
-					d_type = types[de->file_type];
+				if (de->file_type < NILFS_FT_MAX)
+					t = nilfs_filetype_table[de->file_type];
+				else
+					t = DT_UNKNOWN;
 
-				offset = (char *)de - kaddr;
-				over = filldir(dirent, de->name, de->name_len,
-						(n<<PAGE_CACHE_SHIFT) | offset,
-						le64_to_cpu(de->inode), d_type);
-				if (over) {
+				if (!dir_emit(ctx, de->name, de->name_len,
+						le64_to_cpu(de->inode), t)) {
 					nilfs_put_page(page);
-					goto success;
+					return 0;
 				}
 			}
-			filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
+			ctx->pos += nilfs_rec_len_from_disk(de->rec_len);
 		}
 		nilfs_put_page(page);
 	}
-
-success:
-	ret = 0;
-done:
-	return ret;
+	return 0;
 }
 
 /*
@@ -678,7 +666,7 @@ not_empty:
 const struct file_operations nilfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= nilfs_readdir,
+	.iterate	= nilfs_readdir,
 	.unlocked_ioctl	= nilfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= nilfs_compat_ioctl,
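This conversion is part of the tree-wide switch from ->readdir/filldir to ->iterate/dir_emit: the actor no longer hand-computes cookie offsets, position lives in ctx->pos, and a full caller buffer is signalled by dir_emit() returning false. A schematic actor under that contract; first_entry(), next_entry() and next_pos() are hypothetical helpers, not kernel API:

	/* Sketch only, not compilable standalone. */
	static int example_readdir(struct file *file, struct dir_context *ctx)
	{
		struct example_entry *de;

		for (de = first_entry(file, ctx->pos); de; de = next_entry(de)) {
			if (!dir_emit(ctx, de->name, de->name_len,
				      de->ino, de->d_type))
				return 0;	/* caller's buffer is full */
			ctx->pos = next_pos(de);	/* resume point */
		}
		return 0;	/* end of directory */
	}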
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index d8e65bde083c..6548c7851b48 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -160,6 +160,28 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
 }
 
 /**
+ * nilfs_ifile_count_free_inodes - calculate free inodes count
+ * @ifile: ifile inode
+ * @nmaxinodes: current maximum of available inodes count [out]
+ * @nfreeinodes: free inodes count [out]
+ */
+int nilfs_ifile_count_free_inodes(struct inode *ifile,
+				   u64 *nmaxinodes, u64 *nfreeinodes)
+{
+	u64 nused;
+	int err;
+
+	*nmaxinodes = 0;
+	*nfreeinodes = 0;
+
+	nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count);
+	err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes);
+	if (likely(!err))
+		*nfreeinodes = *nmaxinodes - nused;
+	return err;
+}
+
+/**
  * nilfs_ifile_read - read or get ifile inode
  * @sb: super block instance
  * @root: root object
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 59b6f2b51df6..679674d13372 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
 int nilfs_ifile_delete_inode(struct inode *, ino_t);
 int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
 
+int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
+
 int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
 		     size_t inode_size, struct nilfs_inode *raw_inode,
 		     struct inode **inodep);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index bccfec8343c5..b1a5277cfd18 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -54,7 +54,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n)
 
 	inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
 	if (root)
-		atomic_add(n, &root->blocks_count);
+		atomic64_add(n, &root->blocks_count);
 }
 
 void nilfs_inode_sub_blocks(struct inode *inode, int n)
@@ -63,7 +63,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n)
 
 	inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
 	if (root)
-		atomic_sub(n, &root->blocks_count);
+		atomic64_sub(n, &root->blocks_count);
 }
 
 /**
@@ -369,7 +369,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
 		goto failed_ifile_create_inode;
 	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
 
-	atomic_inc(&root->inodes_count);
+	atomic64_inc(&root->inodes_count);
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = ino;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -801,7 +801,7 @@ void nilfs_evict_inode(struct inode *inode)
 
 	ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
 	if (!ret)
-		atomic_dec(&ii->i_root->inodes_count);
+		atomic64_dec(&ii->i_root->inodes_count);
 
 	nilfs_clear_inode(inode);
 
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a5752a589932..bd88a7461063 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -835,9 +835,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 		raw_cp->cp_snapshot_list.ssl_next = 0;
 		raw_cp->cp_snapshot_list.ssl_prev = 0;
 		raw_cp->cp_inodes_count =
-			cpu_to_le64(atomic_read(&sci->sc_root->inodes_count));
+			cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
 		raw_cp->cp_blocks_count =
-			cpu_to_le64(atomic_read(&sci->sc_root->blocks_count));
+			cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
 		raw_cp->cp_nblk_inc =
 			cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
 		raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index c7d1f9f18b09..af3ba0478cdf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -554,8 +554,10 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 	if (err)
 		goto failed_bh;
 
-	atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
-	atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
+	atomic64_set(&root->inodes_count,
+			le64_to_cpu(raw_cp->cp_inodes_count));
+	atomic64_set(&root->blocks_count,
+			le64_to_cpu(raw_cp->cp_blocks_count));
 
 	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
 
@@ -609,6 +611,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	unsigned long overhead;
 	unsigned long nrsvblocks;
 	sector_t nfreeblocks;
+	u64 nmaxinodes, nfreeinodes;
 	int err;
 
 	/*
@@ -633,14 +636,34 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (unlikely(err))
 		return err;
 
+	err = nilfs_ifile_count_free_inodes(root->ifile,
+					    &nmaxinodes, &nfreeinodes);
+	if (unlikely(err)) {
+		printk(KERN_WARNING
+			"NILFS warning: fail to count free inodes: err %d.\n",
+			err);
+		if (err == -ERANGE) {
+			/*
+			 * If nilfs_palloc_count_max_entries() returns
+			 * -ERANGE error code then we simply treat
+			 * curent inodes count as maximum possible and
+			 * zero as free inodes value.
+			 */
+			nmaxinodes = atomic64_read(&root->inodes_count);
+			nfreeinodes = 0;
+			err = 0;
+		} else
+			return err;
+	}
+
 	buf->f_type = NILFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = blocks - overhead;
 	buf->f_bfree = nfreeblocks;
 	buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
 		(buf->f_bfree - nrsvblocks) : 0;
-	buf->f_files = atomic_read(&root->inodes_count);
-	buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
+	buf->f_files = nmaxinodes;
+	buf->f_ffree = nfreeinodes;
 	buf->f_namelen = NILFS_NAME_LEN;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
@@ -973,7 +996,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
 
 static int nilfs_tree_was_touched(struct dentry *root_dentry)
 {
-	return root_dentry->d_count > 1;
+	return d_count(root_dentry) > 1;
 }
 
 /**
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 41e6a04a561f..94c451ce6d24 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -764,8 +764,8 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
 	new->ifile = NULL;
 	new->nilfs = nilfs;
 	atomic_set(&new->count, 1);
-	atomic_set(&new->inodes_count, 0);
-	atomic_set(&new->blocks_count, 0);
+	atomic64_set(&new->inodes_count, 0);
+	atomic64_set(&new->blocks_count, 0);
 
 	rb_link_node(&new->rb_node, parent, p);
 	rb_insert_color(&new->rb_node, &nilfs->ns_cptree);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index be1267a34cea..de8cc53b4a5c 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -241,8 +241,8 @@ struct nilfs_root {
 	struct the_nilfs *nilfs;
 	struct inode *ifile;
 
-	atomic_t inodes_count;
-	atomic_t blocks_count;
+	atomic64_t inodes_count;
+	atomic64_t blocks_count;
 };
 
 /* Special checkpoint number */
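The atomic_t to atomic64_t conversion across the nilfs2 hunks above matters because a 32-bit counter silently wraps once a volume tracks more than 2^32 inodes or blocks. A standalone demonstration of the truncation the old type risked, using plain integers:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t real = 5000000000ULL;	/* > 2^32 blocks */
		uint32_t as32 = (uint32_t)real;	/* what a 32-bit counter keeps */

		printf("64-bit: %llu\n", (unsigned long long)real);
		printf("32-bit: %u (wrapped)\n", as32);
		return 0;
	}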
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 2bfe6dc413a0..1fedd5f7ccc4 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -31,7 +31,6 @@ int dir_notify_enable __read_mostly = 1;
 static struct kmem_cache *dnotify_struct_cache __read_mostly;
 static struct kmem_cache *dnotify_mark_cache __read_mostly;
 static struct fsnotify_group *dnotify_group __read_mostly;
-static DEFINE_MUTEX(dnotify_mark_mutex);
 
 /*
  * dnotify will attach one of these to each inode (i_fsnotify_marks) which
@@ -183,7 +182,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 		return;
 	dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 
-	mutex_lock(&dnotify_mark_mutex);
+	mutex_lock(&dnotify_group->mark_mutex);
 
 	spin_lock(&fsn_mark->lock);
 	prev = &dn_mark->dn;
@@ -199,11 +198,12 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 
 	spin_unlock(&fsn_mark->lock);
 
-	/* nothing else could have found us thanks to the dnotify_mark_mutex */
+	/* nothing else could have found us thanks to the dnotify_groups
+	   mark_mutex */
 	if (dn_mark->dn == NULL)
-		fsnotify_destroy_mark(fsn_mark, dnotify_group);
+		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
 
-	mutex_unlock(&dnotify_mark_mutex);
+	mutex_unlock(&dnotify_group->mark_mutex);
 
 	fsnotify_put_mark(fsn_mark);
 }
@@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	new_dn_mark->dn = NULL;
 
 	/* this is needed to prevent the fcntl/close race described below */
-	mutex_lock(&dnotify_mark_mutex);
+	mutex_lock(&dnotify_group->mark_mutex);
 
 	/* add the new_fsn_mark or find an old one. */
 	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
@@ -334,7 +334,8 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 		dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 		spin_lock(&fsn_mark->lock);
 	} else {
-		fsnotify_add_mark(new_fsn_mark, dnotify_group, inode, NULL, 0);
+		fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
+					 NULL, 0);
 		spin_lock(&new_fsn_mark->lock);
 		fsn_mark = new_fsn_mark;
 		dn_mark = new_dn_mark;
@@ -348,9 +349,9 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 
 	/* if (f != filp) means that we lost a race and another task/thread
 	 * actually closed the fd we are still playing with before we grabbed
-	 * the dnotify_mark_mutex and fsn_mark->lock.  Since closing the fd is the
-	 * only time we clean up the marks we need to get our mark off
-	 * the list. */
+	 * the dnotify_groups mark_mutex and fsn_mark->lock.  Since closing the
+	 * fd is the only time we clean up the marks we need to get our mark
+	 * off the list. */
 	if (f != filp) {
 		/* if we added ourselves, shoot ourselves, it's possible that
 		 * the flush actually did shoot this fsn_mark.  That's fine too
@@ -385,9 +386,9 @@ out:
 	spin_unlock(&fsn_mark->lock);
 
 	if (destroy)
-		fsnotify_destroy_mark(fsn_mark, dnotify_group);
+		fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
 
-	mutex_unlock(&dnotify_mark_mutex);
+	mutex_unlock(&dnotify_group->mark_mutex);
 	fsnotify_put_mark(fsn_mark);
 out_err:
 	if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 6c80083a984f..e44cb6427df3 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -122,6 +122,7 @@ static int fill_event_metadata(struct fsnotify_group *group,
 	metadata->event_len = FAN_EVENT_METADATA_LEN;
 	metadata->metadata_len = FAN_EVENT_METADATA_LEN;
 	metadata->vers = FANOTIFY_METADATA_VERSION;
+	metadata->reserved = 0;
 	metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
 	metadata->pid = pid_vnr(event->tgid);
 	if (unlikely(event->mask & FAN_Q_OVERFLOW))
@@ -399,9 +400,6 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 	wake_up(&group->fanotify_data.access_waitq);
 #endif
 
-	if (file->f_flags & FASYNC)
-		fsnotify_fasync(-1, file, 0);
-
 	/* matches the fanotify_init->fsnotify_alloc_group */
 	fsnotify_destroy_group(group);
 
@@ -526,14 +524,18 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 	__u32 removed;
 	int destroy_mark;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
-	if (!fsn_mark)
+	if (!fsn_mark) {
+		mutex_unlock(&group->mark_mutex);
 		return -ENOENT;
+	}
 
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
 	if (destroy_mark)
-		fsnotify_destroy_mark(fsn_mark, group);
+		fsnotify_destroy_mark_locked(fsn_mark, group);
+	mutex_unlock(&group->mark_mutex);
 
 	fsnotify_put_mark(fsn_mark);
 	if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -550,14 +552,19 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 	__u32 removed;
 	int destroy_mark;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
-	if (!fsn_mark)
+	if (!fsn_mark) {
+		mutex_unlock(&group->mark_mutex);
 		return -ENOENT;
+	}
 
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
 	if (destroy_mark)
-		fsnotify_destroy_mark(fsn_mark, group);
+		fsnotify_destroy_mark_locked(fsn_mark, group);
+	mutex_unlock(&group->mark_mutex);
+
 	/* matches the fsnotify_find_inode_mark() */
 	fsnotify_put_mark(fsn_mark);
 	if (removed & inode->i_fsnotify_mask)
@@ -593,35 +600,55 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 	return mask & ~oldmask;
 }
 
+static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
+						   struct inode *inode,
+						   struct vfsmount *mnt)
+{
+	struct fsnotify_mark *mark;
+	int ret;
+
+	if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+		return ERR_PTR(-ENOSPC);
+
+	mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+	if (!mark)
+		return ERR_PTR(-ENOMEM);
+
+	fsnotify_init_mark(mark, fanotify_free_mark);
+	ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
+	if (ret) {
+		fsnotify_put_mark(mark);
+		return ERR_PTR(ret);
+	}
+
+	return mark;
+}
+
+
 static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 				      struct vfsmount *mnt, __u32 mask,
 				      unsigned int flags)
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
-	int ret = 0;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
 	if (!fsn_mark) {
-		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
-			return -ENOSPC;
-
-		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-		if (!fsn_mark)
-			return -ENOMEM;
-
-		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
-		ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
-		if (ret)
-			goto err;
+		fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
+		if (IS_ERR(fsn_mark)) {
+			mutex_unlock(&group->mark_mutex);
+			return PTR_ERR(fsn_mark);
+		}
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	mutex_unlock(&group->mark_mutex);
 
 	if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
 		fsnotify_recalc_vfsmount_mask(mnt);
-err:
+
 	fsnotify_put_mark(fsn_mark);
-	return ret;
+	return 0;
 }
 
 static int fanotify_add_inode_mark(struct fsnotify_group *group,
@@ -630,7 +657,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 {
 	struct fsnotify_mark *fsn_mark;
 	__u32 added;
-	int ret = 0;
 
 	pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
 
@@ -644,27 +670,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 	    (atomic_read(&inode->i_writecount) > 0))
 		return 0;
 
+	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark) {
-		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
-			return -ENOSPC;
-
-		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-		if (!fsn_mark)
-			return -ENOMEM;
-
-		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
-		ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
-		if (ret)
-			goto err;
+		fsn_mark = fanotify_add_new_mark(group, inode, NULL);
+		if (IS_ERR(fsn_mark)) {
+			mutex_unlock(&group->mark_mutex);
+			return PTR_ERR(fsn_mark);
+		}
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+	mutex_unlock(&group->mark_mutex);
 
 	if (added & ~inode->i_fsnotify_mask)
 		fsnotify_recalc_inode_mask(inode);
-err:
+
 	fsnotify_put_mark(fsn_mark);
-	return ret;
+	return 0;
 }
 
 /* fanotify syscalls */
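All three fanotify call sites above converge on one pattern: hold group->mark_mutex across the find and the factored-out create, so the check and the insertion are atomic and the old ad-hoc ENOSPC/ENOMEM unwinding collapses into fanotify_add_new_mark(). Schematically, with invented helper names (a sketch, not the kernel's code):

	/* Find-or-create under the group mutex; the lock makes the pair
	 * atomic with respect to concurrent adds and removes. */
	mutex_lock(&group->mark_mutex);
	mark = find_mark(group, object);		/* hypothetical */
	if (!mark) {
		mark = alloc_and_add_mark_locked(group, object);
		if (IS_ERR(mark)) {
			mutex_unlock(&group->mark_mutex);
			return PTR_ERR(mark);
		}
	}
	update_mask(mark, mask, flags);			/* hypothetical */
	mutex_unlock(&group->mark_mutex);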
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 959815c1e017..60f954a891ab 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -636,7 +636,8 @@ static int inotify_new_watch(struct fsnotify_group *group,
 		goto out_err;
 
 	/* we are on the idr, now get on the inode */
-	ret = fsnotify_add_mark(&tmp_i_mark->fsn_mark, group, inode, NULL, 0);
+	ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
+				       NULL, 0);
 	if (ret) {
 		/* we failed to get on the inode, get off the idr */
 		inotify_remove_from_idr(group, tmp_i_mark);
@@ -660,19 +661,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
 {
 	int ret = 0;
 
-retry:
+	mutex_lock(&group->mark_mutex);
 	/* try to update and existing watch with the new arg */
 	ret = inotify_update_existing_watch(group, inode, arg);
 	/* no mark present, try to add a new one */
 	if (ret == -ENOENT)
 		ret = inotify_new_watch(group, inode, arg);
-	/*
-	 * inotify_new_watch could race with another thread which did an
-	 * inotify_new_watch between the update_existing and the add watch
-	 * here, go back and try to update an existing mark again.
-	 */
-	if (ret == -EEXIST)
-		goto retry;
+	mutex_unlock(&group->mark_mutex);
 
 	return ret;
 }
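With group->mark_mutex held across both calls, inotify_new_watch() can no longer lose a race to a concurrent add between the "update existing" check and the insertion, so the -EEXIST retry loop becomes dead code and is removed. The serialized shape, schematically (pseudo-helpers, illustration only):

	/* One critical section replaces "try, maybe add, retry on EEXIST". */
	mutex_lock(&group->mark_mutex);
	ret = update_existing(group, inode, arg);	/* hypothetical */
	if (ret == -ENOENT)
		ret = add_new(group, inode, arg);	/* cannot race now */
	mutex_unlock(&group->mark_mutex);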
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc6b49bf7360..923fe4a5f503 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -20,28 +20,29 @@
  * fsnotify inode mark locking/lifetime/and refcnting
  *
  * REFCNT:
- * The mark->refcnt tells how many "things" in the kernel currently are
- * referencing this object. The object typically will live inside the kernel
- * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
- * which can find this object holding the appropriete locks, can take a reference
- * and the object itself is guaranteed to survive until the reference is dropped.
+ * The group->recnt and mark->refcnt tell how many "things" in the kernel
+ * currently are referencing the objects. Both kind of objects typically will
+ * live inside the kernel with a refcnt of 2, one for its creation and one for
+ * the reference a group and a mark hold to each other.
+ * If you are holding the appropriate locks, you can take a reference and the
+ * object itself is guaranteed to survive until the reference is dropped.
  *
  * LOCKING:
- * There are 3 spinlocks involved with fsnotify inode marks and they MUST
- * be taken in order as follows:
+ * There are 3 locks involved with fsnotify inode marks and they MUST be taken
+ * in order as follows:
  *
+ * group->mark_mutex
  * mark->lock
- * group->mark_lock
  * inode->i_lock
  *
- * mark->lock protects 2 things, mark->group and mark->inode.  You must hold
- * that lock to dereference either of these things (they could be NULL even with
- * the lock)
- *
- * group->mark_lock protects the marks_list anchored inside a given group
- * and each mark is hooked via the g_list.  It also sorta protects the
- * free_g_list, which when used is anchored by a private list on the stack of the
- * task which held the group->mark_lock.
+ * group->mark_mutex protects the marks_list anchored inside a given group and
+ * each mark is hooked via the g_list. It also protects the groups private
+ * data (i.e group limits).
+
+ * mark->lock protects the marks attributes like its masks and flags.
+ * Furthermore it protects the access to a reference of the group that the mark
+ * is assigned to as well as the access to a reference of the inode/vfsmount
+ * that is being watched by the mark.
  *
  * inode->i_lock protects the i_fsnotify_marks list anchored inside a
  * given inode and each mark is hooked via the i_list. (and sorta the
@@ -64,18 +65,11 @@
64 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each 65 * inode. We take i_lock and walk the i_fsnotify_marks safely. For each
65 * mark on the list we take a reference (so the mark can't disappear under us). 66 * mark on the list we take a reference (so the mark can't disappear under us).
66 * We remove that mark from the inode's list of marks and we add this mark to a 67 * We remove that mark from the inode's list of marks and we add this mark to a
67 * private list anchored on the stack using i_free_list; At this point we no 68 * private list anchored on the stack using i_free_list; we walk i_free_list
68 * longer fear anything finding the mark using the inode's list of marks. 69 * and before we destroy the mark we make sure that we don't race with a
69 * 70 * concurrent destroy_group by getting a ref to the mark's group and taking the
70 * We can safely and locklessly run the private list on the stack of everything 71 * group's mutex.
71 * we just unattached from the original inode. For each mark on the private list 72
72 * we grab the mark->lock and can thus dereference mark->group and mark->inode. If
73 * we see the group and inode are not NULL we take those locks. Now holding all
74 * 3 locks we can completely remove the mark from other tasks finding it in the
75 * future. Remember, 10 things might already be referencing this mark, but they
76 * better be holding a ref. We drop our reference we took before we unhooked it
77 * from the inode. When the ref hits 0 we can free the mark.
78 *
79 * Very similarly for freeing by group, except we use free_g_list. 73 * Very similarly for freeing by group, except we use free_g_list.
80 * 74 *
81 * This has the very interesting property of being able to run concurrently with 75 * This has the very interesting property of being able to run concurrently with
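
The ordering rule documented above can be made concrete with a short, hypothetical sketch (not part of this patch; field names follow the fsnotify structures of this era): any path that detaches a mark takes the locks outermost-first.

	/* Hypothetical sketch: detach a mark while honouring the documented
	 * order group->mark_mutex -> mark->lock -> inode->i_lock. */
	mutex_lock(&group->mark_mutex);		/* guards group->marks_list  */
	spin_lock(&mark->lock);			/* guards mark attributes    */
	spin_lock(&inode->i_lock);		/* guards i_fsnotify_marks   */
	hlist_del_init_rcu(&mark->i.i_list);	/* off the inode's list      */
	spin_unlock(&inode->i_lock);
	list_del_init(&mark->g_list);		/* off the group's list      */
	spin_unlock(&mark->lock);
	mutex_unlock(&group->mark_mutex);
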
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fa9c05f97af4..d267ea6aa1a0 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1372,7 +1372,7 @@ retry_writepage:
1372 * The page may have dirty, unmapped buffers. Make them 1372 * The page may have dirty, unmapped buffers. Make them
1373 * freeable here, so the page does not leak. 1373 * freeable here, so the page does not leak.
1374 */ 1374 */
1375 block_invalidatepage(page, 0); 1375 block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
1376 unlock_page(page); 1376 unlock_page(page);
1377 ntfs_debug("Write outside i_size - truncated?"); 1377 ntfs_debug("Write outside i_size - truncated?");
1378 return 0; 1378 return 0;
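
The hunk above tracks a VFS interface change: ->invalidatepage now receives a length as well as an offset, so callers can invalidate part of a page; a full-page caller like this one simply passes the page size. A hedged sketch of a minimal implementation under the new signature (myfs_invalidatepage is a made-up name):

	static void myfs_invalidatepage(struct page *page, unsigned int offset,
					unsigned int length)
	{
		/* Forward the partial-page range to the buffer-head helper,
		 * as the ntfs and ocfs2 hunks in this diff do. */
		block_invalidatepage(page, offset, length);
	}
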
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index aa411c3f20e9..9e38dafa3bc7 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1004,13 +1004,11 @@ dir_err_out:
1004/** 1004/**
1005 * ntfs_filldir - ntfs specific filldir method 1005 * ntfs_filldir - ntfs specific filldir method
1006 * @vol: current ntfs volume 1006 * @vol: current ntfs volume
1007 * @fpos: position in the directory
1008 * @ndir: ntfs inode of current directory 1007 * @ndir: ntfs inode of current directory
1009 * @ia_page: page in which the index allocation buffer @ie resides 1008 * @ia_page: page in which the index allocation buffer @ie resides
1010 * @ie: current index entry 1009 * @ie: current index entry
1011 * @name: buffer to use for the converted name 1010 * @name: buffer to use for the converted name
1012 * @dirent: vfs filldir callback context 1011 * @actor: what to feed the entries to
1013 * @filldir: vfs filldir callback
1014 * 1012 *
1015 * Convert the Unicode @name to the loaded NLS and pass it to the @filldir 1013 * Convert the Unicode @name to the loaded NLS and pass it to the @actor
1016 * callback. 1014 * callback.
@@ -1024,12 +1022,12 @@ dir_err_out:
1024 * retake the lock if we are returning a non-zero value as ntfs_readdir() 1022 * retake the lock if we are returning a non-zero value as ntfs_readdir()
1025 * would need to drop the lock immediately anyway. 1023 * would need to drop the lock immediately anyway.
1026 */ 1024 */
1027static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, 1025static inline int ntfs_filldir(ntfs_volume *vol,
1028 ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, 1026 ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
1029 u8 *name, void *dirent, filldir_t filldir) 1027 u8 *name, struct dir_context *actor)
1030{ 1028{
1031 unsigned long mref; 1029 unsigned long mref;
1032 int name_len, rc; 1030 int name_len;
1033 unsigned dt_type; 1031 unsigned dt_type;
1034 FILE_NAME_TYPE_FLAGS name_type; 1032 FILE_NAME_TYPE_FLAGS name_type;
1035 1033
@@ -1068,13 +1066,14 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
1068 if (ia_page) 1066 if (ia_page)
1069 unlock_page(ia_page); 1067 unlock_page(ia_page);
1070 ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " 1068 ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
1071 "0x%lx, DT_%s.", name, name_len, fpos, mref, 1069 "0x%lx, DT_%s.", name, name_len, actor->pos, mref,
1072 dt_type == DT_DIR ? "DIR" : "REG"); 1070 dt_type == DT_DIR ? "DIR" : "REG");
1073 rc = filldir(dirent, name, name_len, fpos, mref, dt_type); 1071 if (!dir_emit(actor, name, name_len, mref, dt_type))
1072 return 1;
1074 /* Relock the page but not if we are aborting ->readdir. */ 1073 /* Relock the page but not if we are aborting ->readdir. */
1075 if (!rc && ia_page) 1074 if (ia_page)
1076 lock_page(ia_page); 1075 lock_page(ia_page);
1077 return rc; 1076 return 0;
1078} 1077}
1079 1078
1080/* 1079/*
@@ -1097,11 +1096,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos,
1097 * removes them again after the write is complete after which it 1096 * removes them again after the write is complete after which it
1098 * unlocks the page. 1097 * unlocks the page.
1099 */ 1098 */
1100static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) 1099static int ntfs_readdir(struct file *file, struct dir_context *actor)
1101{ 1100{
1102 s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; 1101 s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
1103 loff_t fpos, i_size; 1102 loff_t i_size;
1104 struct inode *bmp_vi, *vdir = file_inode(filp); 1103 struct inode *bmp_vi, *vdir = file_inode(file);
1105 struct super_block *sb = vdir->i_sb; 1104 struct super_block *sb = vdir->i_sb;
1106 ntfs_inode *ndir = NTFS_I(vdir); 1105 ntfs_inode *ndir = NTFS_I(vdir);
1107 ntfs_volume *vol = NTFS_SB(sb); 1106 ntfs_volume *vol = NTFS_SB(sb);
@@ -1116,33 +1115,16 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1116 u8 *kaddr, *bmp, *index_end; 1115 u8 *kaddr, *bmp, *index_end;
1117 ntfs_attr_search_ctx *ctx; 1116 ntfs_attr_search_ctx *ctx;
1118 1117
1119 fpos = filp->f_pos;
1120 ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", 1118 ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
1121 vdir->i_ino, fpos); 1119 vdir->i_ino, actor->pos);
1122 rc = err = 0; 1120 rc = err = 0;
1123 /* Are we at end of dir yet? */ 1121 /* Are we at end of dir yet? */
1124 i_size = i_size_read(vdir); 1122 i_size = i_size_read(vdir);
1125 if (fpos >= i_size + vol->mft_record_size) 1123 if (actor->pos >= i_size + vol->mft_record_size)
1126 goto done; 1124 return 0;
1127 /* Emulate . and .. for all directories. */ 1125 /* Emulate . and .. for all directories. */
1128 if (!fpos) { 1126 if (!dir_emit_dots(file, actor))
1129 ntfs_debug("Calling filldir for . with len 1, fpos 0x0, " 1127 return 0;
1130 "inode 0x%lx, DT_DIR.", vdir->i_ino);
1131 rc = filldir(dirent, ".", 1, fpos, vdir->i_ino, DT_DIR);
1132 if (rc)
1133 goto done;
1134 fpos++;
1135 }
1136 if (fpos == 1) {
1137 ntfs_debug("Calling filldir for .. with len 2, fpos 0x1, "
1138 "inode 0x%lx, DT_DIR.",
1139 (unsigned long)parent_ino(filp->f_path.dentry));
1140 rc = filldir(dirent, "..", 2, fpos,
1141 parent_ino(filp->f_path.dentry), DT_DIR);
1142 if (rc)
1143 goto done;
1144 fpos++;
1145 }
1146 m = NULL; 1128 m = NULL;
1147 ctx = NULL; 1129 ctx = NULL;
1148 /* 1130 /*
@@ -1155,7 +1137,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1155 goto err_out; 1137 goto err_out;
1156 } 1138 }
1157 /* Are we jumping straight into the index allocation attribute? */ 1139 /* Are we jumping straight into the index allocation attribute? */
1158 if (fpos >= vol->mft_record_size) 1140 if (actor->pos >= vol->mft_record_size)
1159 goto skip_index_root; 1141 goto skip_index_root;
1160 /* Get hold of the mft record for the directory. */ 1142 /* Get hold of the mft record for the directory. */
1161 m = map_mft_record(ndir); 1143 m = map_mft_record(ndir);
@@ -1170,7 +1152,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1170 goto err_out; 1152 goto err_out;
1171 } 1153 }
1172 /* Get the offset into the index root attribute. */ 1154 /* Get the offset into the index root attribute. */
1173 ir_pos = (s64)fpos; 1155 ir_pos = (s64)actor->pos;
1174 /* Find the index root attribute in the mft record. */ 1156 /* Find the index root attribute in the mft record. */
1175 err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, 1157 err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
1176 0, ctx); 1158 0, ctx);
@@ -1226,10 +1208,9 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1226 if (ir_pos > (u8*)ie - (u8*)ir) 1208 if (ir_pos > (u8*)ie - (u8*)ir)
1227 continue; 1209 continue;
1228 /* Advance the position even if going to skip the entry. */ 1210 /* Advance the position even if going to skip the entry. */
1229 fpos = (u8*)ie - (u8*)ir; 1211 actor->pos = (u8*)ie - (u8*)ir;
1230 /* Submit the name to the filldir callback. */ 1212 /* Submit the name to the filldir callback. */
1231 rc = ntfs_filldir(vol, fpos, ndir, NULL, ie, name, dirent, 1213 rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
1232 filldir);
1233 if (rc) { 1214 if (rc) {
1234 kfree(ir); 1215 kfree(ir);
1235 goto abort; 1216 goto abort;
@@ -1242,12 +1223,12 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
1242 if (!NInoIndexAllocPresent(ndir)) 1223 if (!NInoIndexAllocPresent(ndir))
1243 goto EOD; 1224 goto EOD;
1244 /* Advance fpos to the beginning of the index allocation. */ 1225 /* Advance fpos to the beginning of the index allocation. */
1245 fpos = vol->mft_record_size; 1226 actor->pos = vol->mft_record_size;
1246skip_index_root: 1227skip_index_root:
1247 kaddr = NULL; 1228 kaddr = NULL;
1248 prev_ia_pos = -1LL; 1229 prev_ia_pos = -1LL;
1249 /* Get the offset into the index allocation attribute. */ 1230 /* Get the offset into the index allocation attribute. */
1250 ia_pos = (s64)fpos - vol->mft_record_size; 1231 ia_pos = (s64)actor->pos - vol->mft_record_size;
1251 ia_mapping = vdir->i_mapping; 1232 ia_mapping = vdir->i_mapping;
1252 ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); 1233 ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
1253 bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); 1234 bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
@@ -1409,7 +1390,7 @@ find_next_index_buffer:
1409 if (ia_pos - ia_start > (u8*)ie - (u8*)ia) 1390 if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
1410 continue; 1391 continue;
1411 /* Advance the position even if going to skip the entry. */ 1392 /* Advance the position even if going to skip the entry. */
1412 fpos = (u8*)ie - (u8*)ia + 1393 actor->pos = (u8*)ie - (u8*)ia +
1413 (sle64_to_cpu(ia->index_block_vcn) << 1394 (sle64_to_cpu(ia->index_block_vcn) <<
1414 ndir->itype.index.vcn_size_bits) + 1395 ndir->itype.index.vcn_size_bits) +
1415 vol->mft_record_size; 1396 vol->mft_record_size;
@@ -1419,8 +1400,7 @@ find_next_index_buffer:
1419 * before returning, unless a non-zero value is returned in 1400 * before returning, unless a non-zero value is returned in
1420 * which case the page is left unlocked. 1401 * which case the page is left unlocked.
1421 */ 1402 */
1422 rc = ntfs_filldir(vol, fpos, ndir, ia_page, ie, name, dirent, 1403 rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
1423 filldir);
1424 if (rc) { 1404 if (rc) {
1425 /* @ia_page is already unlocked in this case. */ 1405 /* @ia_page is already unlocked in this case. */
1426 ntfs_unmap_page(ia_page); 1406 ntfs_unmap_page(ia_page);
@@ -1439,18 +1419,9 @@ unm_EOD:
1439 iput(bmp_vi); 1419 iput(bmp_vi);
1440EOD: 1420EOD:
1441 /* We are finished, set fpos to EOD. */ 1421 /* We are finished, set fpos to EOD. */
1442 fpos = i_size + vol->mft_record_size; 1422 actor->pos = i_size + vol->mft_record_size;
1443abort: 1423abort:
1444 kfree(name); 1424 kfree(name);
1445done:
1446#ifdef DEBUG
1447 if (!rc)
1448 ntfs_debug("EOD, fpos 0x%llx, returning 0.", fpos);
1449 else
1450 ntfs_debug("filldir returned %i, fpos 0x%llx, returning 0.",
1451 rc, fpos);
1452#endif
1453 filp->f_pos = fpos;
1454 return 0; 1425 return 0;
1455err_out: 1426err_out:
1456 if (bmp_page) { 1427 if (bmp_page) {
@@ -1471,7 +1442,6 @@ iput_err_out:
1471 if (!err) 1442 if (!err)
1472 err = -EIO; 1443 err = -EIO;
1473 ntfs_debug("Failed. Returning error code %i.", -err); 1444 ntfs_debug("Failed. Returning error code %i.", -err);
1474 filp->f_pos = fpos;
1475 return err; 1445 return err;
1476} 1446}
1477 1447
@@ -1571,7 +1541,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
1571const struct file_operations ntfs_dir_ops = { 1541const struct file_operations ntfs_dir_ops = {
1572 .llseek = generic_file_llseek, /* Seek inside directory. */ 1542 .llseek = generic_file_llseek, /* Seek inside directory. */
1573 .read = generic_read_dir, /* Return -EISDIR. */ 1543 .read = generic_read_dir, /* Return -EISDIR. */
1574 .readdir = ntfs_readdir, /* Read directory contents. */ 1544 .iterate = ntfs_readdir, /* Read directory contents. */
1575#ifdef NTFS_RW 1545#ifdef NTFS_RW
1576 .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ 1546 .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */
1577 /*.aio_fsync = ,*/ /* Sync all outstanding async 1547 /*.aio_fsync = ,*/ /* Sync all outstanding async
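
The conversion above is the generic readdir-to-iterate pattern: the directory position lives in actor->pos, "." and ".." are emitted by dir_emit_dots(), and dir_emit() returns false once the user buffer is full, at which point the method simply returns and the VFS preserves the position. A hedged sketch of the skeleton; struct myfs_dirent and my_next_entry() are assumed helpers, not part of this patch:

	static int myfs_readdir(struct file *file, struct dir_context *actor)
	{
		struct inode *dir = file_inode(file);
		struct myfs_dirent *de;

		if (!dir_emit_dots(file, actor))	/* emits "." and ".." */
			return 0;
		/* my_next_entry(): assumed to return the entry at or after
		 * actor->pos, or NULL at end of directory. */
		while ((de = my_next_entry(dir, actor->pos))) {
			if (!dir_emit(actor, de->name, de->name_len,
				      de->ino, de->type))
				return 0;	/* buffer full: stop here */
			actor->pos = de->next_pos;
		}
		return 0;
	}
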
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index b8a9d87231b1..17e6bdde96c5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5655,7 +5655,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5655 &ref_tree, NULL); 5655 &ref_tree, NULL);
5656 if (ret) { 5656 if (ret) {
5657 mlog_errno(ret); 5657 mlog_errno(ret);
5658 goto out; 5658 goto bail;
5659 } 5659 }
5660 5660
5661 ret = ocfs2_prepare_refcount_change_for_del(inode, 5661 ret = ocfs2_prepare_refcount_change_for_del(inode,
@@ -5666,7 +5666,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5666 &extra_blocks); 5666 &extra_blocks);
5667 if (ret < 0) { 5667 if (ret < 0) {
5668 mlog_errno(ret); 5668 mlog_errno(ret);
5669 goto out; 5669 goto bail;
5670 } 5670 }
5671 } 5671 }
5672 5672
@@ -5674,7 +5674,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5674 extra_blocks); 5674 extra_blocks);
5675 if (ret) { 5675 if (ret) {
5676 mlog_errno(ret); 5676 mlog_errno(ret);
5677 return ret; 5677 goto bail;
5678 } 5678 }
5679 5679
5680 mutex_lock(&tl_inode->i_mutex); 5680 mutex_lock(&tl_inode->i_mutex);
@@ -5734,7 +5734,7 @@ out_commit:
5734 ocfs2_commit_trans(osb, handle); 5734 ocfs2_commit_trans(osb, handle);
5735out: 5735out:
5736 mutex_unlock(&tl_inode->i_mutex); 5736 mutex_unlock(&tl_inode->i_mutex);
5737 5737bail:
5738 if (meta_ac) 5738 if (meta_ac)
5739 ocfs2_free_alloc_context(meta_ac); 5739 ocfs2_free_alloc_context(meta_ac);
5740 5740
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 20dfec72e903..2abf97b2a592 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -603,11 +603,12 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
603 * from ext3. PageChecked() bits have been removed as OCFS2 does not 603 * from ext3. PageChecked() bits have been removed as OCFS2 does not
604 * do journalled data. 604 * do journalled data.
605 */ 605 */
606static void ocfs2_invalidatepage(struct page *page, unsigned long offset) 606static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
607 unsigned int length)
607{ 608{
608 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 609 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
609 610
610 jbd2_journal_invalidatepage(journal, page, offset); 611 jbd2_journal_invalidatepage(journal, page, offset, length);
611} 612}
612 613
613static int ocfs2_releasepage(struct page *page, gfp_t wait) 614static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -1756,7 +1757,7 @@ try_again:
1756 goto out; 1757 goto out;
1757 } else if (ret == 1) { 1758 } else if (ret == 1) {
1758 clusters_need = wc->w_clen; 1759 clusters_need = wc->w_clen;
1759 ret = ocfs2_refcount_cow(inode, filp, di_bh, 1760 ret = ocfs2_refcount_cow(inode, di_bh,
1760 wc->w_cpos, wc->w_clen, UINT_MAX); 1761 wc->w_cpos, wc->w_clen, UINT_MAX);
1761 if (ret) { 1762 if (ret) {
1762 mlog_errno(ret); 1763 mlog_errno(ret);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 42252bf64b51..5c1c864e81cc 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -176,7 +176,7 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
176 } 176 }
177} 177}
178 178
179static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) 179static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode)
180{ 180{
181 int ret = -1; 181 int ret = -1;
182 182
@@ -500,7 +500,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg,
500 } 500 }
501 501
502 atomic_inc(&write_wc->wc_num_reqs); 502 atomic_inc(&write_wc->wc_num_reqs);
503 submit_bio(WRITE, bio); 503 submit_bio(WRITE_SYNC, bio);
504 504
505 status = 0; 505 status = 0;
506bail: 506bail:
@@ -2271,7 +2271,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
2271 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) 2271 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
2272 continue; 2272 continue;
2273 2273
2274 ret = o2hb_global_hearbeat_mode_set(i); 2274 ret = o2hb_global_heartbeat_mode_set(i);
2275 if (!ret) 2275 if (!ret)
2276 printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", 2276 printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
2277 o2hb_heartbeat_mode_desc[i]); 2277 o2hb_heartbeat_mode_desc[i]);
@@ -2304,7 +2304,7 @@ static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
2304 NULL, 2304 NULL,
2305}; 2305};
2306 2306
2307static struct configfs_item_operations o2hb_hearbeat_group_item_ops = { 2307static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
2308 .show_attribute = o2hb_heartbeat_group_show, 2308 .show_attribute = o2hb_heartbeat_group_show,
2309 .store_attribute = o2hb_heartbeat_group_store, 2309 .store_attribute = o2hb_heartbeat_group_store,
2310}; 2310};
@@ -2316,7 +2316,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
2316 2316
2317static struct config_item_type o2hb_heartbeat_group_type = { 2317static struct config_item_type o2hb_heartbeat_group_type = {
2318 .ct_group_ops = &o2hb_heartbeat_group_group_ops, 2318 .ct_group_ops = &o2hb_heartbeat_group_group_ops,
2319 .ct_item_ops = &o2hb_hearbeat_group_item_ops, 2319 .ct_item_ops = &o2hb_heartbeat_group_item_ops,
2320 .ct_attrs = o2hb_heartbeat_group_attrs, 2320 .ct_attrs = o2hb_heartbeat_group_attrs,
2321 .ct_owner = THIS_MODULE, 2321 .ct_owner = THIS_MODULE,
2322}; 2322};
@@ -2389,6 +2389,9 @@ static int o2hb_region_pin(const char *region_uuid)
2389 assert_spin_locked(&o2hb_live_lock); 2389 assert_spin_locked(&o2hb_live_lock);
2390 2390
2391 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { 2391 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2392 if (reg->hr_item_dropped)
2393 continue;
2394
2392 uuid = config_item_name(&reg->hr_item); 2395 uuid = config_item_name(&reg->hr_item);
2393 2396
2394 /* local heartbeat */ 2397 /* local heartbeat */
@@ -2439,6 +2442,9 @@ static void o2hb_region_unpin(const char *region_uuid)
2439 assert_spin_locked(&o2hb_live_lock); 2442 assert_spin_locked(&o2hb_live_lock);
2440 2443
2441 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { 2444 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2445 if (reg->hr_item_dropped)
2446 continue;
2447
2442 uuid = config_item_name(&reg->hr_item); 2448 uuid = config_item_name(&reg->hr_item);
2443 if (region_uuid) { 2449 if (region_uuid) {
2444 if (strcmp(region_uuid, uuid)) 2450 if (strcmp(region_uuid, uuid))
@@ -2654,6 +2660,9 @@ int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
2654 2660
2655 p = region_uuids; 2661 p = region_uuids;
2656 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { 2662 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
2663 if (reg->hr_item_dropped)
2664 continue;
2665
2657 mlog(0, "Region: %s\n", config_item_name(&reg->hr_item)); 2666 mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
2658 if (numregs < max_regions) { 2667 if (numregs < max_regions) {
2659 memcpy(p, config_item_name(&reg->hr_item), 2668 memcpy(p, config_item_name(&reg->hr_item),
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index c19897d0fe14..1ec141e758d7 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -264,7 +264,7 @@ void o2quo_hb_still_up(u8 node)
264/* This is analogous to hb_up. as a node's connection comes up we delay the 264/* This is analogous to hb_up. as a node's connection comes up we delay the
265 * quorum decision until we see it heartbeating. the hold will be dropped in 265 * quorum decision until we see it heartbeating. the hold will be dropped in
266 * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if 266 * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
267 * it's already heartbeating we we might be dropping a hold that conn_up got. 267 * it's already heartbeating we might be dropping a hold that conn_up got.
268 * */ 268 * */
269void o2quo_conn_up(u8 node) 269void o2quo_conn_up(u8 node)
270{ 270{
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa88bd8bcedc..d644dc611425 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -406,6 +406,9 @@ static void sc_kref_release(struct kref *kref)
406 sc->sc_node = NULL; 406 sc->sc_node = NULL;
407 407
408 o2net_debug_del_sc(sc); 408 o2net_debug_del_sc(sc);
409
410 if (sc->sc_page)
411 __free_page(sc->sc_page);
409 kfree(sc); 412 kfree(sc);
410} 413}
411 414
@@ -630,19 +633,19 @@ static void o2net_state_change(struct sock *sk)
630 state_change = sc->sc_state_change; 633 state_change = sc->sc_state_change;
631 634
632 switch(sk->sk_state) { 635 switch(sk->sk_state) {
633 /* ignore connecting sockets as they make progress */ 636 /* ignore connecting sockets as they make progress */
634 case TCP_SYN_SENT: 637 case TCP_SYN_SENT:
635 case TCP_SYN_RECV: 638 case TCP_SYN_RECV:
636 break; 639 break;
637 case TCP_ESTABLISHED: 640 case TCP_ESTABLISHED:
638 o2net_sc_queue_work(sc, &sc->sc_connect_work); 641 o2net_sc_queue_work(sc, &sc->sc_connect_work);
639 break; 642 break;
640 default: 643 default:
641 printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT 644 printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
642 " shutdown, state %d\n", 645 " shutdown, state %d\n",
643 SC_NODEF_ARGS(sc), sk->sk_state); 646 SC_NODEF_ARGS(sc), sk->sk_state);
644 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 647 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
645 break; 648 break;
646 } 649 }
647out: 650out:
648 read_unlock(&sk->sk_callback_lock); 651 read_unlock(&sk->sk_callback_lock);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f1e1aed8f638..30544ce8e9f7 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1761,11 +1761,10 @@ bail:
1761 1761
1762static int ocfs2_dir_foreach_blk_id(struct inode *inode, 1762static int ocfs2_dir_foreach_blk_id(struct inode *inode,
1763 u64 *f_version, 1763 u64 *f_version,
1764 loff_t *f_pos, void *priv, 1764 struct dir_context *ctx)
1765 filldir_t filldir, int *filldir_err)
1766{ 1765{
1767 int ret, i, filldir_ret; 1766 int ret, i;
1768 unsigned long offset = *f_pos; 1767 unsigned long offset = ctx->pos;
1769 struct buffer_head *di_bh = NULL; 1768 struct buffer_head *di_bh = NULL;
1770 struct ocfs2_dinode *di; 1769 struct ocfs2_dinode *di;
1771 struct ocfs2_inline_data *data; 1770 struct ocfs2_inline_data *data;
@@ -1781,8 +1780,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
1781 di = (struct ocfs2_dinode *)di_bh->b_data; 1780 di = (struct ocfs2_dinode *)di_bh->b_data;
1782 data = &di->id2.i_data; 1781 data = &di->id2.i_data;
1783 1782
1784 while (*f_pos < i_size_read(inode)) { 1783 while (ctx->pos < i_size_read(inode)) {
1785revalidate:
1786 /* If the dir block has changed since the last call to 1784 /* If the dir block has changed since the last call to
1787 * readdir(2), then we might be pointing to an invalid 1785 * readdir(2), then we might be pointing to an invalid
1788 * dirent right now. Scan from the start of the block 1786 * dirent right now. Scan from the start of the block
@@ -1802,50 +1800,31 @@ revalidate:
1802 break; 1800 break;
1803 i += le16_to_cpu(de->rec_len); 1801 i += le16_to_cpu(de->rec_len);
1804 } 1802 }
1805 *f_pos = offset = i; 1803 ctx->pos = offset = i;
1806 *f_version = inode->i_version; 1804 *f_version = inode->i_version;
1807 } 1805 }
1808 1806
1809 de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos); 1807 de = (struct ocfs2_dir_entry *) (data->id_data + ctx->pos);
1810 if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) { 1808 if (!ocfs2_check_dir_entry(inode, de, di_bh, ctx->pos)) {
1811 /* On error, skip the f_pos to the end. */ 1809 /* On error, skip the f_pos to the end. */
1812 *f_pos = i_size_read(inode); 1810 ctx->pos = i_size_read(inode);
1813 goto out; 1811 break;
1814 } 1812 }
1815 offset += le16_to_cpu(de->rec_len); 1813 offset += le16_to_cpu(de->rec_len);
1816 if (le64_to_cpu(de->inode)) { 1814 if (le64_to_cpu(de->inode)) {
1817 /* We might block in the next section
1818 * if the data destination is
1819 * currently swapped out. So, use a
1820 * version stamp to detect whether or
1821 * not the directory has been modified
1822 * during the copy operation.
1823 */
1824 u64 version = *f_version;
1825 unsigned char d_type = DT_UNKNOWN; 1815 unsigned char d_type = DT_UNKNOWN;
1826 1816
1827 if (de->file_type < OCFS2_FT_MAX) 1817 if (de->file_type < OCFS2_FT_MAX)
1828 d_type = ocfs2_filetype_table[de->file_type]; 1818 d_type = ocfs2_filetype_table[de->file_type];
1829 1819
1830 filldir_ret = filldir(priv, de->name, 1820 if (!dir_emit(ctx, de->name, de->name_len,
1831 de->name_len, 1821 le64_to_cpu(de->inode), d_type))
1832 *f_pos, 1822 goto out;
1833 le64_to_cpu(de->inode),
1834 d_type);
1835 if (filldir_ret) {
1836 if (filldir_err)
1837 *filldir_err = filldir_ret;
1838 break;
1839 }
1840 if (version != *f_version)
1841 goto revalidate;
1842 } 1823 }
1843 *f_pos += le16_to_cpu(de->rec_len); 1824 ctx->pos += le16_to_cpu(de->rec_len);
1844 } 1825 }
1845
1846out: 1826out:
1847 brelse(di_bh); 1827 brelse(di_bh);
1848
1849 return 0; 1828 return 0;
1850} 1829}
1851 1830
@@ -1855,27 +1834,26 @@ out:
1855 */ 1834 */
1856static int ocfs2_dir_foreach_blk_el(struct inode *inode, 1835static int ocfs2_dir_foreach_blk_el(struct inode *inode,
1857 u64 *f_version, 1836 u64 *f_version,
1858 loff_t *f_pos, void *priv, 1837 struct dir_context *ctx,
1859 filldir_t filldir, int *filldir_err) 1838 bool persist)
1860{ 1839{
1861 int error = 0;
1862 unsigned long offset, blk, last_ra_blk = 0; 1840 unsigned long offset, blk, last_ra_blk = 0;
1863 int i, stored; 1841 int i;
1864 struct buffer_head * bh, * tmp; 1842 struct buffer_head * bh, * tmp;
1865 struct ocfs2_dir_entry * de; 1843 struct ocfs2_dir_entry * de;
1866 struct super_block * sb = inode->i_sb; 1844 struct super_block * sb = inode->i_sb;
1867 unsigned int ra_sectors = 16; 1845 unsigned int ra_sectors = 16;
1846 int stored = 0;
1868 1847
1869 stored = 0;
1870 bh = NULL; 1848 bh = NULL;
1871 1849
1872 offset = (*f_pos) & (sb->s_blocksize - 1); 1850 offset = ctx->pos & (sb->s_blocksize - 1);
1873 1851
1874 while (!error && !stored && *f_pos < i_size_read(inode)) { 1852 while (ctx->pos < i_size_read(inode)) {
1875 blk = (*f_pos) >> sb->s_blocksize_bits; 1853 blk = ctx->pos >> sb->s_blocksize_bits;
1876 if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { 1854 if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
1877 /* Skip the corrupt dirblock and keep trying */ 1855 /* Skip the corrupt dirblock and keep trying */
1878 *f_pos += sb->s_blocksize - offset; 1856 ctx->pos += sb->s_blocksize - offset;
1879 continue; 1857 continue;
1880 } 1858 }
1881 1859
@@ -1897,7 +1875,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
1897 ra_sectors = 8; 1875 ra_sectors = 8;
1898 } 1876 }
1899 1877
1900revalidate:
1901 /* If the dir block has changed since the last call to 1878 /* If the dir block has changed since the last call to
1902 * readdir(2), then we might be pointing to an invalid 1879 * readdir(2), then we might be pointing to an invalid
1903 * dirent right now. Scan from the start of the block 1880 * dirent right now. Scan from the start of the block
@@ -1917,93 +1894,64 @@ revalidate:
1917 i += le16_to_cpu(de->rec_len); 1894 i += le16_to_cpu(de->rec_len);
1918 } 1895 }
1919 offset = i; 1896 offset = i;
1920 *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1)) 1897 ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
1921 | offset; 1898 | offset;
1922 *f_version = inode->i_version; 1899 *f_version = inode->i_version;
1923 } 1900 }
1924 1901
1925 while (!error && *f_pos < i_size_read(inode) 1902 while (ctx->pos < i_size_read(inode)
1926 && offset < sb->s_blocksize) { 1903 && offset < sb->s_blocksize) {
1927 de = (struct ocfs2_dir_entry *) (bh->b_data + offset); 1904 de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
1928 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { 1905 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
1929 /* On error, skip the f_pos to the 1906 /* On error, skip the f_pos to the
1930 next block. */ 1907 next block. */
1931 *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1; 1908 ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1;
1932 brelse(bh); 1909 brelse(bh);
1933 goto out; 1910 continue;
1934 } 1911 }
1935 offset += le16_to_cpu(de->rec_len);
1936 if (le64_to_cpu(de->inode)) { 1912 if (le64_to_cpu(de->inode)) {
1937 /* We might block in the next section
1938 * if the data destination is
1939 * currently swapped out. So, use a
1940 * version stamp to detect whether or
1941 * not the directory has been modified
1942 * during the copy operation.
1943 */
1944 unsigned long version = *f_version;
1945 unsigned char d_type = DT_UNKNOWN; 1913 unsigned char d_type = DT_UNKNOWN;
1946 1914
1947 if (de->file_type < OCFS2_FT_MAX) 1915 if (de->file_type < OCFS2_FT_MAX)
1948 d_type = ocfs2_filetype_table[de->file_type]; 1916 d_type = ocfs2_filetype_table[de->file_type];
1949 error = filldir(priv, de->name, 1917 if (!dir_emit(ctx, de->name,
1950 de->name_len, 1918 de->name_len,
1951 *f_pos,
1952 le64_to_cpu(de->inode), 1919 le64_to_cpu(de->inode),
1953 d_type); 1920 d_type)) {
1954 if (error) { 1921 brelse(bh);
1955 if (filldir_err) 1922 return 0;
1956 *filldir_err = error;
1957 break;
1958 } 1923 }
1959 if (version != *f_version) 1924 stored++;
1960 goto revalidate;
1961 stored ++;
1962 } 1925 }
1963 *f_pos += le16_to_cpu(de->rec_len); 1926 offset += le16_to_cpu(de->rec_len);
1927 ctx->pos += le16_to_cpu(de->rec_len);
1964 } 1928 }
1965 offset = 0; 1929 offset = 0;
1966 brelse(bh); 1930 brelse(bh);
1967 bh = NULL; 1931 bh = NULL;
1932 if (!persist && stored)
1933 break;
1968 } 1934 }
1969 1935 return 0;
1970 stored = 0;
1971out:
1972 return stored;
1973} 1936}
1974 1937
1975static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version, 1938static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
1976 loff_t *f_pos, void *priv, filldir_t filldir, 1939 struct dir_context *ctx,
1977 int *filldir_err) 1940 bool persist)
1978{ 1941{
1979 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1942 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1980 return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv, 1943 return ocfs2_dir_foreach_blk_id(inode, f_version, ctx);
1981 filldir, filldir_err); 1944 return ocfs2_dir_foreach_blk_el(inode, f_version, ctx, persist);
1982
1983 return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
1984 filldir_err);
1985} 1945}
1986 1946
1987/* 1947/*
1988 * This is intended to be called from inside other kernel functions, 1948 * This is intended to be called from inside other kernel functions,
1989 * so we fake some arguments. 1949 * so we fake some arguments.
1990 */ 1950 */
1991int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, 1951int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx)
1992 filldir_t filldir)
1993{ 1952{
1994 int ret = 0, filldir_err = 0;
1995 u64 version = inode->i_version; 1953 u64 version = inode->i_version;
1996 1954 ocfs2_dir_foreach_blk(inode, &version, ctx, true);
1997 while (*f_pos < i_size_read(inode)) {
1998 ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
1999 filldir, &filldir_err);
2000 if (ret || filldir_err)
2001 break;
2002 }
2003
2004 if (ret > 0)
2005 ret = -EIO;
2006
2007 return 0; 1955 return 0;
2008} 1956}
2009 1957
@@ -2011,15 +1959,15 @@ int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
2011 * ocfs2_readdir() 1959 * ocfs2_readdir()
2012 * 1960 *
2013 */ 1961 */
2014int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) 1962int ocfs2_readdir(struct file *file, struct dir_context *ctx)
2015{ 1963{
2016 int error = 0; 1964 int error = 0;
2017 struct inode *inode = file_inode(filp); 1965 struct inode *inode = file_inode(file);
2018 int lock_level = 0; 1966 int lock_level = 0;
2019 1967
2020 trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); 1968 trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
2021 1969
2022 error = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); 1970 error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
2023 if (lock_level && error >= 0) { 1971 if (lock_level && error >= 0) {
2024 /* We release EX lock which used to update atime 1972 /* We release EX lock which used to update atime
2025 * and get PR lock again to reduce contention 1973 * and get PR lock again to reduce contention
@@ -2035,8 +1983,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
2035 goto bail_nolock; 1983 goto bail_nolock;
2036 } 1984 }
2037 1985
2038 error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos, 1986 error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false);
2039 dirent, filldir, NULL);
2040 1987
2041 ocfs2_inode_unlock(inode, lock_level); 1988 ocfs2_inode_unlock(inode, lock_level);
2042 if (error) 1989 if (error)
@@ -2120,6 +2067,7 @@ bail:
2120} 2067}
2121 2068
2122struct ocfs2_empty_dir_priv { 2069struct ocfs2_empty_dir_priv {
2070 struct dir_context ctx;
2123 unsigned seen_dot; 2071 unsigned seen_dot;
2124 unsigned seen_dot_dot; 2072 unsigned seen_dot_dot;
2125 unsigned seen_other; 2073 unsigned seen_other;
@@ -2204,10 +2152,9 @@ out:
2204int ocfs2_empty_dir(struct inode *inode) 2152int ocfs2_empty_dir(struct inode *inode)
2205{ 2153{
2206 int ret; 2154 int ret;
2207 loff_t start = 0; 2155 struct ocfs2_empty_dir_priv priv = {
2208 struct ocfs2_empty_dir_priv priv; 2156 .ctx.actor = ocfs2_empty_dir_filldir,
2209 2157 };
2210 memset(&priv, 0, sizeof(priv));
2211 2158
2212 if (ocfs2_dir_indexed(inode)) { 2159 if (ocfs2_dir_indexed(inode)) {
2213 ret = ocfs2_empty_dir_dx(inode, &priv); 2160 ret = ocfs2_empty_dir_dx(inode, &priv);
@@ -2219,7 +2166,7 @@ int ocfs2_empty_dir(struct inode *inode)
2219 */ 2166 */
2220 } 2167 }
2221 2168
2222 ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir); 2169 ret = ocfs2_dir_foreach(inode, &priv.ctx);
2223 if (ret) 2170 if (ret)
2224 mlog_errno(ret); 2171 mlog_errno(ret);
2225 2172
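
The ocfs2_empty_dir() hunk above shows the companion pattern for internal directory scans: private state embeds struct dir_context as its first member, the actor receives that context back as its opaque first argument, and a designated initializer wires up the callback. A hedged sketch, under the assumption that the actor keeps the six-argument filldir_t shape visible in the removed lines:

	struct my_scan_priv {
		struct dir_context ctx;		/* must stay the first member */
		unsigned seen;			/* caller-private state       */
	};

	static int my_scan_actor(void *priv, const char *name, int name_len,
				 loff_t pos, u64 ino, unsigned type)
	{
		/* ctx is first, so the opaque pointer is also ours. */
		struct my_scan_priv *p = priv;

		p->seen++;
		return 0;			/* zero means keep iterating */
	}

	struct my_scan_priv priv = { .ctx.actor = my_scan_actor };
	/* then: ocfs2_dir_foreach(inode, &priv.ctx); */
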
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index e683f3deb645..f0344b75b14d 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -92,9 +92,8 @@ int ocfs2_find_files_on_disk(const char *name,
92 struct ocfs2_dir_lookup_result *res); 92 struct ocfs2_dir_lookup_result *res);
93int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name, 93int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
94 int namelen, u64 *blkno); 94 int namelen, u64 *blkno);
95int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); 95int ocfs2_readdir(struct file *file, struct dir_context *ctx);
96int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv, 96int ocfs2_dir_foreach(struct inode *inode, struct dir_context *ctx);
97 filldir_t filldir);
98int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, 97int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
99 struct inode *dir, 98 struct inode *dir,
100 struct buffer_head *parent_fe_bh, 99 struct buffer_head *parent_fe_bh,
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 975810b98492..47e67c2d228f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -178,6 +178,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
178 lock->ml.node); 178 lock->ml.node);
179 } 179 }
180 } else { 180 } else {
181 status = DLM_NORMAL;
181 dlm_lock_get(lock); 182 dlm_lock_get(lock);
182 list_add_tail(&lock->list, &res->blocked); 183 list_add_tail(&lock->list, &res->blocked);
183 kick_thread = 1; 184 kick_thread = 1;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index e68588e6b1e8..773bd32bfd8c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -55,9 +55,6 @@
55static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); 55static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
56 56
57static int dlm_recovery_thread(void *data); 57static int dlm_recovery_thread(void *data);
58void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
59int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
60void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
61static int dlm_do_recovery(struct dlm_ctxt *dlm); 58static int dlm_do_recovery(struct dlm_ctxt *dlm);
62 59
63static int dlm_pick_recovery_master(struct dlm_ctxt *dlm); 60static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
@@ -789,7 +786,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
789 u8 dead_node) 786 u8 dead_node)
790{ 787{
791 struct dlm_lock_request lr; 788 struct dlm_lock_request lr;
792 enum dlm_status ret; 789 int ret;
793 790
794 mlog(0, "\n"); 791 mlog(0, "\n");
795 792
@@ -802,7 +799,6 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
802 lr.dead_node = dead_node; 799 lr.dead_node = dead_node;
803 800
804 // send message 801 // send message
805 ret = DLM_NOLOCKMGR;
806 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, 802 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
807 &lr, sizeof(lr), request_from, NULL); 803 &lr, sizeof(lr), request_from, NULL);
808 804
@@ -2696,6 +2692,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2696 dlm->name, br->node_idx, br->dead_node, 2692 dlm->name, br->node_idx, br->dead_node,
2697 dlm->reco.dead_node, dlm->reco.new_master); 2693 dlm->reco.dead_node, dlm->reco.new_master);
2698 spin_unlock(&dlm->spinlock); 2694 spin_unlock(&dlm->spinlock);
2695 dlm_put(dlm);
2699 return -EAGAIN; 2696 return -EAGAIN;
2700 } 2697 }
2701 spin_unlock(&dlm->spinlock); 2698 spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ff54014a24ec..3261d71319ee 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -370,7 +370,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
370 if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) 370 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
371 goto out; 371 goto out;
372 372
373 return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); 373 return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
374 374
375out: 375out:
376 return status; 376 return status;
@@ -899,7 +899,7 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
899 zero_clusters = last_cpos - zero_cpos; 899 zero_clusters = last_cpos - zero_cpos;
900 900
901 if (needs_cow) { 901 if (needs_cow) {
902 rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos, 902 rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
903 zero_clusters, UINT_MAX); 903 zero_clusters, UINT_MAX);
904 if (rc) { 904 if (rc) {
905 mlog_errno(rc); 905 mlog_errno(rc);
@@ -2078,7 +2078,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2078 2078
2079 *meta_level = 1; 2079 *meta_level = 1;
2080 2080
2081 ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); 2081 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
2082 if (ret) 2082 if (ret)
2083 mlog_errno(ret); 2083 mlog_errno(ret);
2084out: 2084out:
@@ -2646,17 +2646,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2646 goto out; 2646 goto out;
2647 } 2647 }
2648 2648
2649 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 2649 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2650 ret = -EINVAL;
2651 if (!ret && offset > inode->i_sb->s_maxbytes)
2652 ret = -EINVAL;
2653 if (ret)
2654 goto out;
2655
2656 if (offset != file->f_pos) {
2657 file->f_pos = offset;
2658 file->f_version = 0;
2659 }
2660 2650
2661out: 2651out:
2662 mutex_unlock(&inode->i_mutex); 2652 mutex_unlock(&inode->i_mutex);
@@ -2712,7 +2702,7 @@ const struct file_operations ocfs2_fops = {
2712const struct file_operations ocfs2_dops = { 2702const struct file_operations ocfs2_dops = {
2713 .llseek = generic_file_llseek, 2703 .llseek = generic_file_llseek,
2714 .read = generic_read_dir, 2704 .read = generic_read_dir,
2715 .readdir = ocfs2_readdir, 2705 .iterate = ocfs2_readdir,
2716 .fsync = ocfs2_sync_file, 2706 .fsync = ocfs2_sync_file,
2717 .release = ocfs2_dir_release, 2707 .release = ocfs2_dir_release,
2718 .open = ocfs2_dir_open, 2708 .open = ocfs2_dir_open,
@@ -2759,7 +2749,7 @@ const struct file_operations ocfs2_fops_no_plocks = {
2759const struct file_operations ocfs2_dops_no_plocks = { 2749const struct file_operations ocfs2_dops_no_plocks = {
2760 .llseek = generic_file_llseek, 2750 .llseek = generic_file_llseek,
2761 .read = generic_read_dir, 2751 .read = generic_read_dir,
2762 .readdir = ocfs2_readdir, 2752 .iterate = ocfs2_readdir,
2763 .fsync = ocfs2_sync_file, 2753 .fsync = ocfs2_sync_file,
2764 .release = ocfs2_dir_release, 2754 .release = ocfs2_dir_release,
2765 .open = ocfs2_dir_open, 2755 .open = ocfs2_dir_open,
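
The llseek hunk above folds ten lines of hand-rolled bounds checking and f_pos/f_version bookkeeping into vfs_setpos(), which validates the offset against a maximum, updates file->f_pos, resets file->f_version on a move, and returns the offset or a negative errno. A hedged sketch of equivalent use inside a filesystem's llseek method:

	/* Sketch only: replaces the removed manual checks. */
	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
	if (offset < 0)
		return offset;	/* e.g. -EINVAL for out-of-range offsets */
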
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8eccfabcd12e..242170d83971 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1941,6 +1941,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
1941} 1941}
1942 1942
1943struct ocfs2_orphan_filldir_priv { 1943struct ocfs2_orphan_filldir_priv {
1944 struct dir_context ctx;
1944 struct inode *head; 1945 struct inode *head;
1945 struct ocfs2_super *osb; 1946 struct ocfs2_super *osb;
1946}; 1947};
@@ -1977,11 +1978,11 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1977{ 1978{
1978 int status; 1979 int status;
1979 struct inode *orphan_dir_inode = NULL; 1980 struct inode *orphan_dir_inode = NULL;
1980 struct ocfs2_orphan_filldir_priv priv; 1981 struct ocfs2_orphan_filldir_priv priv = {
1981 loff_t pos = 0; 1982 .ctx.actor = ocfs2_orphan_filldir,
1982 1983 .osb = osb,
1983 priv.osb = osb; 1984 .head = *head
1984 priv.head = *head; 1985 };
1985 1986
1986 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 1987 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1987 ORPHAN_DIR_SYSTEM_INODE, 1988 ORPHAN_DIR_SYSTEM_INODE,
@@ -1999,8 +2000,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
1999 goto out; 2000 goto out;
2000 } 2001 }
2001 2002
2002 status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv, 2003 status = ocfs2_dir_foreach(orphan_dir_inode, &priv.ctx);
2003 ocfs2_orphan_filldir);
2004 if (status) { 2004 if (status) {
2005 mlog_errno(status); 2005 mlog_errno(status);
2006 goto out_cluster; 2006 goto out_cluster;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index a3385b63ff5e..0a992737dcaf 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -200,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
200 200
201static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) 201static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
202{ 202{
203 atomic_set(&osb->needs_checkpoint, 1);
204 wake_up(&osb->checkpoint_event); 203 wake_up(&osb->checkpoint_event);
205} 204}
206 205
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index f1fc172175b6..452068b45749 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -69,7 +69,7 @@ static int __ocfs2_move_extent(handle_t *handle,
69 u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); 69 u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
70 u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); 70 u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
71 71
72 ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, 72 ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
73 p_cpos, new_p_cpos, len); 73 p_cpos, new_p_cpos, len);
74 if (ret) { 74 if (ret) {
75 mlog_errno(ret); 75 mlog_errno(ret);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b4a5cdf9dbc5..be3f8676a438 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -522,7 +522,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
522 522
523 fe->i_last_eb_blk = 0; 523 fe->i_last_eb_blk = 0;
524 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); 524 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
525 le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); 525 fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
526 fe->i_atime = fe->i_ctime = fe->i_mtime = 526 fe->i_atime = fe->i_ctime = fe->i_mtime =
527 cpu_to_le64(CURRENT_TIME.tv_sec); 527 cpu_to_le64(CURRENT_TIME.tv_sec);
528 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = 528 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
@@ -773,7 +773,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry)
773 return ret; 773 return ret;
774} 774}
775 775
776static inline int inode_is_unlinkable(struct inode *inode) 776static inline int ocfs2_inode_is_unlinkable(struct inode *inode)
777{ 777{
778 if (S_ISDIR(inode->i_mode)) { 778 if (S_ISDIR(inode->i_mode)) {
779 if (inode->i_nlink == 2) 779 if (inode->i_nlink == 2)
@@ -791,6 +791,7 @@ static int ocfs2_unlink(struct inode *dir,
791{ 791{
792 int status; 792 int status;
793 int child_locked = 0; 793 int child_locked = 0;
794 bool is_unlinkable = false;
794 struct inode *inode = dentry->d_inode; 795 struct inode *inode = dentry->d_inode;
795 struct inode *orphan_dir = NULL; 796 struct inode *orphan_dir = NULL;
796 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 797 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -865,7 +866,7 @@ static int ocfs2_unlink(struct inode *dir,
865 goto leave; 866 goto leave;
866 } 867 }
867 868
868 if (inode_is_unlinkable(inode)) { 869 if (ocfs2_inode_is_unlinkable(inode)) {
869 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 870 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
870 OCFS2_I(inode)->ip_blkno, 871 OCFS2_I(inode)->ip_blkno,
871 orphan_name, &orphan_insert); 872 orphan_name, &orphan_insert);
@@ -873,6 +874,7 @@ static int ocfs2_unlink(struct inode *dir,
873 mlog_errno(status); 874 mlog_errno(status);
874 goto leave; 875 goto leave;
875 } 876 }
877 is_unlinkable = true;
876 } 878 }
877 879
878 handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); 880 handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
@@ -892,15 +894,6 @@ static int ocfs2_unlink(struct inode *dir,
892 894
893 fe = (struct ocfs2_dinode *) fe_bh->b_data; 895 fe = (struct ocfs2_dinode *) fe_bh->b_data;
894 896
895 if (inode_is_unlinkable(inode)) {
896 status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
897 &orphan_insert, orphan_dir);
898 if (status < 0) {
899 mlog_errno(status);
900 goto leave;
901 }
902 }
903
904 /* delete the name from the parent dir */ 897 /* delete the name from the parent dir */
905 status = ocfs2_delete_entry(handle, dir, &lookup); 898 status = ocfs2_delete_entry(handle, dir, &lookup);
906 if (status < 0) { 899 if (status < 0) {
@@ -923,6 +916,14 @@ static int ocfs2_unlink(struct inode *dir,
923 mlog_errno(status); 916 mlog_errno(status);
924 if (S_ISDIR(inode->i_mode)) 917 if (S_ISDIR(inode->i_mode))
925 inc_nlink(dir); 918 inc_nlink(dir);
919 goto leave;
920 }
921
922 if (is_unlinkable) {
923 status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
924 orphan_name, &orphan_insert, orphan_dir);
925 if (status < 0)
926 mlog_errno(status);
926 } 927 }
927 928
928leave: 929leave:
@@ -2012,6 +2013,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2012 goto leave; 2013 goto leave;
2013 } 2014 }
2014 2015
2016 /*
2017 * We're going to journal the change of i_flags and i_orphaned_slot.
2018 * It's safe anyway, though some callers may duplicate the journaling.
2019 * Journaling within the func just make the logic look more
2020 * straightforward.
2021 */
2022 status = ocfs2_journal_access_di(handle,
2023 INODE_CACHE(inode),
2024 fe_bh,
2025 OCFS2_JOURNAL_ACCESS_WRITE);
2026 if (status < 0) {
2027 mlog_errno(status);
2028 goto leave;
2029 }
2030
2015 /* we're a cluster, and nlink can change on disk from 2031 /* we're a cluster, and nlink can change on disk from
2016 * underneath us... */ 2032 * underneath us... */
2017 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; 2033 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
@@ -2026,25 +2042,10 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2026 orphan_dir_bh, lookup); 2042 orphan_dir_bh, lookup);
2027 if (status < 0) { 2043 if (status < 0) {
2028 mlog_errno(status); 2044 mlog_errno(status);
2029 goto leave; 2045 goto rollback;
2030 }
2031
2032 /*
2033 * We're going to journal the change of i_flags and i_orphaned_slot.
2034 * It's safe anyway, though some callers may duplicate the journaling.
2035 * Journaling within the func just make the logic look more
2036 * straightforward.
2037 */
2038 status = ocfs2_journal_access_di(handle,
2039 INODE_CACHE(inode),
2040 fe_bh,
2041 OCFS2_JOURNAL_ACCESS_WRITE);
2042 if (status < 0) {
2043 mlog_errno(status);
2044 goto leave;
2045 } 2046 }
2046 2047
2047 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); 2048 fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
2048 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; 2049 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
2049 2050
2050 /* Record which orphan dir our inode now resides 2051 /* Record which orphan dir our inode now resides
@@ -2057,11 +2058,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2057 trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno, 2058 trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
2058 osb->slot_num); 2059 osb->slot_num);
2059 2060
2061rollback:
2062 if (status < 0) {
2063 if (S_ISDIR(inode->i_mode))
2064 ocfs2_add_links_count(orphan_fe, -1);
2065 set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe));
2066 }
2067
2060leave: 2068leave:
2061 brelse(orphan_dir_bh); 2069 brelse(orphan_dir_bh);
2062 2070
2063 if (status)
2064 mlog_errno(status);
2065 return status; 2071 return status;
2066} 2072}
2067 2073
@@ -2434,7 +2440,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2434 } 2440 }
2435 2441
2436 di = (struct ocfs2_dinode *)di_bh->b_data; 2442 di = (struct ocfs2_dinode *)di_bh->b_data;
2437 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); 2443 di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);
2438 di->i_orphaned_slot = 0; 2444 di->i_orphaned_slot = 0;
2439 set_nlink(inode, 1); 2445 set_nlink(inode, 1);
2440 ocfs2_set_links_count(di, inode->i_nlink); 2446 ocfs2_set_links_count(di, inode->i_nlink);
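
Alongside the orphan-dir reordering, the hunks above also swap le32_add_cpu() on i_flags for bitwise OR/AND-NOT. Arithmetic add only sets a flag correctly when the bit is known to be clear; the bitwise forms are idempotent. A small illustration (sketch, not from the patch):

	__le32 flags = cpu_to_le32(OCFS2_VALID_FL);

	/* Arithmetic add carries into the next bit if the flag is set: */
	le32_add_cpu(&flags, OCFS2_VALID_FL);		/* flag lost, wrong bit set */
	/* Bitwise ops are safe regardless of the prior state: */
	flags |= cpu_to_le32(OCFS2_VALID_FL);		/* set, idempotent   */
	flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL);	/* clear, idempotent */
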
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d355e6e36b36..3a903470c794 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -347,7 +347,6 @@ struct ocfs2_super
347 struct task_struct *recovery_thread_task; 347 struct task_struct *recovery_thread_task;
348 int disable_recovery; 348 int disable_recovery;
349 wait_queue_head_t checkpoint_event; 349 wait_queue_head_t checkpoint_event;
350 atomic_t needs_checkpoint;
351 struct ocfs2_journal *journal; 350 struct ocfs2_journal *journal;
352 unsigned long osb_commit_interval; 351 unsigned long osb_commit_interval;
353 352
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 998b17eda09d..a70d604593b6 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,7 +49,6 @@
49 49
50struct ocfs2_cow_context { 50struct ocfs2_cow_context {
51 struct inode *inode; 51 struct inode *inode;
52 struct file *file;
53 u32 cow_start; 52 u32 cow_start;
54 u32 cow_len; 53 u32 cow_len;
55 struct ocfs2_extent_tree data_et; 54 struct ocfs2_extent_tree data_et;
@@ -66,7 +65,7 @@ struct ocfs2_cow_context {
66 u32 *num_clusters, 65 u32 *num_clusters,
67 unsigned int *extent_flags); 66 unsigned int *extent_flags);
68 int (*cow_duplicate_clusters)(handle_t *handle, 67 int (*cow_duplicate_clusters)(handle_t *handle,
69 struct file *file, 68 struct inode *inode,
70 u32 cpos, u32 old_cluster, 69 u32 cpos, u32 old_cluster,
71 u32 new_cluster, u32 new_len); 70 u32 new_cluster, u32 new_len);
72}; 71};
@@ -2922,14 +2921,12 @@ static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2922} 2921}
2923 2922
2924int ocfs2_duplicate_clusters_by_page(handle_t *handle, 2923int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2925 struct file *file, 2924 struct inode *inode,
2926 u32 cpos, u32 old_cluster, 2925 u32 cpos, u32 old_cluster,
2927 u32 new_cluster, u32 new_len) 2926 u32 new_cluster, u32 new_len)
2928{ 2927{
2929 int ret = 0, partial; 2928 int ret = 0, partial;
2930 struct inode *inode = file_inode(file); 2929 struct super_block *sb = inode->i_sb;
2931 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
2932 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2930 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2934 struct page *page; 2931 struct page *page;
2935 pgoff_t page_index; 2932 pgoff_t page_index;
@@ -2965,6 +2962,11 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2965 to = map_end & (PAGE_CACHE_SIZE - 1); 2962 to = map_end & (PAGE_CACHE_SIZE - 1);
2966 2963
2967 page = find_or_create_page(mapping, page_index, GFP_NOFS); 2964 page = find_or_create_page(mapping, page_index, GFP_NOFS);
2965 if (!page) {
2966 ret = -ENOMEM;
2967 mlog_errno(ret);
2968 break;
2969 }
2968 2970
2969 /* 2971 /*
2970 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, this page 2972 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, this page
@@ -2973,13 +2975,6 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2973 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) 2975 if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2974 BUG_ON(PageDirty(page)); 2976 BUG_ON(PageDirty(page));
2975 2977
2976 if (PageReadahead(page)) {
2977 page_cache_async_readahead(mapping,
2978 &file->f_ra, file,
2979 page, page_index,
2980 readahead_pages);
2981 }
2982
2983 if (!PageUptodate(page)) { 2978 if (!PageUptodate(page)) {
2984 ret = block_read_full_page(page, ocfs2_get_block); 2979 ret = block_read_full_page(page, ocfs2_get_block);
2985 if (ret) { 2980 if (ret) {
@@ -2999,7 +2994,8 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2999 } 2994 }
3000 } 2995 }
3001 2996
3002 ocfs2_map_and_dirty_page(inode, handle, from, to, 2997 ocfs2_map_and_dirty_page(inode,
2998 handle, from, to,
3003 page, 0, &new_block); 2999 page, 0, &new_block);
3004 mark_page_accessed(page); 3000 mark_page_accessed(page);
3005unlock: 3001unlock:
@@ -3015,12 +3011,11 @@ unlock:
3015} 3011}
3016 3012
3017int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, 3013int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3018 struct file *file, 3014 struct inode *inode,
3019 u32 cpos, u32 old_cluster, 3015 u32 cpos, u32 old_cluster,
3020 u32 new_cluster, u32 new_len) 3016 u32 new_cluster, u32 new_len)
3021{ 3017{
3022 int ret = 0; 3018 int ret = 0;
3023 struct inode *inode = file_inode(file);
3024 struct super_block *sb = inode->i_sb; 3019 struct super_block *sb = inode->i_sb;
3025 struct ocfs2_caching_info *ci = INODE_CACHE(inode); 3020 struct ocfs2_caching_info *ci = INODE_CACHE(inode);
3026 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); 3021 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
@@ -3145,7 +3140,7 @@ static int ocfs2_replace_clusters(handle_t *handle,
3145 3140
3146 /*If the old clusters is unwritten, no need to duplicate. */ 3141 /*If the old clusters is unwritten, no need to duplicate. */
3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 3142 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3148 ret = context->cow_duplicate_clusters(handle, context->file, 3143 ret = context->cow_duplicate_clusters(handle, context->inode,
3149 cpos, old, new, len); 3144 cpos, old, new, len);
3150 if (ret) { 3145 if (ret) {
3151 mlog_errno(ret); 3146 mlog_errno(ret);
@@ -3423,35 +3418,12 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3423 return ret; 3418 return ret;
3424} 3419}
3425 3420
3426static void ocfs2_readahead_for_cow(struct inode *inode,
3427 struct file *file,
3428 u32 start, u32 len)
3429{
3430 struct address_space *mapping;
3431 pgoff_t index;
3432 unsigned long num_pages;
3433 int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
3434
3435 if (!file)
3436 return;
3437
3438 mapping = file->f_mapping;
3439 num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
3440 if (!num_pages)
3441 num_pages = 1;
3442
3443 index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
3444 page_cache_sync_readahead(mapping, &file->f_ra, file,
3445 index, num_pages);
3446}
3447
3448/* 3421/*
3449 * Starting at cpos, try to CoW write_len clusters. Don't CoW 3422 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3450 * past max_cpos. This will stop when it runs into a hole or an 3423 * past max_cpos. This will stop when it runs into a hole or an
3451 * unrefcounted extent. 3424 * unrefcounted extent.
3452 */ 3425 */
3453static int ocfs2_refcount_cow_hunk(struct inode *inode, 3426static int ocfs2_refcount_cow_hunk(struct inode *inode,
3454 struct file *file,
3455 struct buffer_head *di_bh, 3427 struct buffer_head *di_bh,
3456 u32 cpos, u32 write_len, u32 max_cpos) 3428 u32 cpos, u32 write_len, u32 max_cpos)
3457{ 3429{
@@ -3480,8 +3452,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3480 3452
3481 BUG_ON(cow_len == 0); 3453 BUG_ON(cow_len == 0);
3482 3454
3483 ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
3484
3485 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); 3455 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3486 if (!context) { 3456 if (!context) {
3487 ret = -ENOMEM; 3457 ret = -ENOMEM;
@@ -3503,7 +3473,6 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3503 context->ref_root_bh = ref_root_bh; 3473 context->ref_root_bh = ref_root_bh;
3504 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; 3474 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3505 context->get_clusters = ocfs2_di_get_clusters; 3475 context->get_clusters = ocfs2_di_get_clusters;
3506 context->file = file;
3507 3476
3508 ocfs2_init_dinode_extent_tree(&context->data_et, 3477 ocfs2_init_dinode_extent_tree(&context->data_et,
3509 INODE_CACHE(inode), di_bh); 3478 INODE_CACHE(inode), di_bh);
@@ -3532,7 +3501,6 @@ out:
3532 * clusters between cpos and cpos+write_len are safe to modify. 3501 * clusters between cpos and cpos+write_len are safe to modify.
3533 */ 3502 */
3534int ocfs2_refcount_cow(struct inode *inode, 3503int ocfs2_refcount_cow(struct inode *inode,
3535 struct file *file,
3536 struct buffer_head *di_bh, 3504 struct buffer_head *di_bh,
3537 u32 cpos, u32 write_len, u32 max_cpos) 3505 u32 cpos, u32 write_len, u32 max_cpos)
3538{ 3506{
@@ -3552,7 +3520,7 @@ int ocfs2_refcount_cow(struct inode *inode,
3552 num_clusters = write_len; 3520 num_clusters = write_len;
3553 3521
3554 if (ext_flags & OCFS2_EXT_REFCOUNTED) { 3522 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3555 ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, 3523 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
3556 num_clusters, max_cpos); 3524 num_clusters, max_cpos);
3557 if (ret) { 3525 if (ret) {
3558 mlog_errno(ret); 3526 mlog_errno(ret);
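
The refcounttree.c hunks above drop the struct file argument that was threaded through the CoW path only to feed readahead state (file->f_ra); with ocfs2_readahead_for_cow() gone, every function in the chain takes just the inode. A sketch of a call site after the change; the wrapper below is invented for illustration, only ocfs2_refcount_cow()'s narrowed signature comes from the diff:

/* Hypothetical caller: CoW a range before writing. No struct file is
 * needed any more; the inode and its dinode buffer_head suffice. */
static int cow_range_before_write(struct inode *inode,
				  struct buffer_head *di_bh,
				  u32 cpos, u32 clusters, u32 max_cpos)
{
	return ocfs2_refcount_cow(inode, di_bh, cpos, clusters, max_cpos);
}
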
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 7754608c83a4..6422bbcdb525 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -53,7 +53,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  int *credits,
 					  int *ref_blocks);
 int ocfs2_refcount_cow(struct inode *inode,
-		       struct file *filep, struct buffer_head *di_bh,
+		       struct buffer_head *di_bh,
 		       u32 cpos, u32 write_len, u32 max_cpos);
 
 typedef int (ocfs2_post_refcount_func)(struct inode *inode,
@@ -85,11 +85,11 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
 			     u32 cpos, u32 write_len,
 			     struct ocfs2_post_refcount *post);
 int ocfs2_duplicate_clusters_by_page(handle_t *handle,
-				     struct file *file,
+				     struct inode *inode,
 				     u32 cpos, u32 old_cluster,
 				     u32 new_cluster, u32 new_len);
 int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
-				    struct file *file,
+				    struct inode *inode,
 				    u32 cpos, u32 old_cluster,
 				    u32 new_cluster, u32 new_len);
 int ocfs2_cow_sync_writeback(struct super_block *sb,
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index b7e74b580c0f..5397c07ce608 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1422,7 +1422,7 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	int status;
 	/* there is a really tiny chance the journal calls could fail,
 	 * but we wouldn't want inconsistent blocks in *any* case. */
-	u64 fe_ptr, bg_ptr, prev_bg_ptr;
+	u64 bg_ptr, prev_bg_ptr;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
@@ -1437,51 +1437,44 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		 (unsigned long long)le64_to_cpu(bg->bg_blkno),
 		 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
 
-	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
 
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
 					 prev_bg_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_rollback;
-	}
+	if (status < 0)
+		goto out;
 
 	prev_bg->bg_next_group = bg->bg_next_group;
 	ocfs2_journal_dirty(handle, prev_bg_bh);
 
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
 					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_rollback;
-	}
+	if (status < 0)
+		goto out_rollback_prev_bg;
 
 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
 	ocfs2_journal_dirty(handle, bg_bh);
 
 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
 					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto out_rollback;
-	}
+	if (status < 0)
+		goto out_rollback_bg;
 
 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
 	ocfs2_journal_dirty(handle, fe_bh);
 
-out_rollback:
-	if (status < 0) {
-		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
-		bg->bg_next_group = cpu_to_le64(bg_ptr);
-		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
-	}
-
-	if (status)
+out:
+	if (status < 0)
 		mlog_errno(status);
 	return status;
+
+out_rollback_bg:
+	bg->bg_next_group = cpu_to_le64(bg_ptr);
+out_rollback_prev_bg:
+	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
+	goto out;
 }
 
 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
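
The relink hunk above replaces one catch-all out_rollback label, which restored all three saved pointers no matter how far the function had gotten, with targeted labels so each failure point undoes only the updates already made (the fe_ptr snapshot becomes unnecessary as a result). The idiom, reduced to a standalone sketch with invented names; reserve() stands in for the journal-access calls, which in real code can fail:

#include <stdio.h>

struct rec { int val; };

static int reserve(struct rec *r) { (void)r; return 0; }

/* Update a then b; on failure, unwind only what was already changed. */
static int update_pair(struct rec *a, struct rec *b, int new_a, int new_b)
{
	int err, a_old = a->val;

	err = reserve(a);
	if (err)
		goto out;		/* nothing modified yet */
	a->val = new_a;

	err = reserve(b);
	if (err)
		goto rollback_a;	/* undo only the first update */
	b->val = new_b;
	return 0;

rollback_a:
	a->val = a_old;
out:
	return err;
}

int main(void)
{
	struct rec a = { 1 }, b = { 2 };
	printf("update_pair: %d\n", update_pair(&a, &b, 10, 20));
	return 0;
}
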
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 01b85165552b..854d80955bf8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -286,10 +286,9 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
 	spin_unlock(&osb->osb_lock);
 
 	out += snprintf(buf + out, len - out,
-			"%10s => Pid: %d  Interval: %lu  Needs: %d\n", "Commit",
+			"%10s => Pid: %d  Interval: %lu\n", "Commit",
 			(osb->commit_task ? task_pid_nr(osb->commit_task) : -1),
-			osb->osb_commit_interval,
-			atomic_read(&osb->needs_checkpoint));
+			osb->osb_commit_interval);
 
 	out += snprintf(buf + out, len - out,
 			"%10s => State: %d  TxnId: %lu  NumTxns: %d\n",
@@ -2154,7 +2153,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 
 	init_waitqueue_head(&osb->checkpoint_event);
-	atomic_set(&osb->needs_checkpoint, 0);
 
 	osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2e3ea308c144..317ef0abccbb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2751,7 +2751,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 {
 	int ret;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	struct ocfs2_xa_loc loc;
 
 	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
@@ -2759,13 +2758,6 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 
 	down_write(&oi->ip_alloc_sem);
 	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
-		if (!ocfs2_xattr_has_space_inline(inode, di)) {
-			ret = -ENOSPC;
-			goto out;
-		}
-	}
-
-	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
 		ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt);
 		if (ret) {
 			if (ret != -ENOSPC)
@@ -6499,6 +6491,16 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
 	}
 
 	new_oi = OCFS2_I(args->new_inode);
+	/*
+	 * Adjust extent record count to reserve space for extended attribute.
+	 * Inline data count had been adjusted in ocfs2_duplicate_inline_data().
+	 */
+	if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) &&
+	    !(ocfs2_inode_is_fast_symlink(args->new_inode))) {
+		struct ocfs2_extent_list *el = &new_di->id2.i_list;
+		le16_add_cpu(&el->l_count, -(inline_size /
+					sizeof(struct ocfs2_extent_rec)));
+	}
 	spin_lock(&new_oi->ip_lock);
 	new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
 	new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
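
The xattr.c addition reserves in-inode space for the reflinked xattrs by shrinking the destination inode's extent list: l_count drops by however many ocfs2_extent_rec slots the inline xattr area displaces. The arithmetic as a tiny userspace illustration, with assumed sizes (the 16-byte record size matches the on-disk ocfs2_extent_rec as far as I know; the other numbers are made up):

#include <stdio.h>

int main(void)
{
	unsigned inline_size = 256;	/* hypothetical inline xattr area */
	unsigned rec_size = 16;		/* assumed sizeof(struct ocfs2_extent_rec) */
	unsigned l_count = 243;		/* hypothetical starting slot count */

	l_count -= inline_size / rec_size;	/* what le16_add_cpu() does above */
	printf("extent record slots left: %u\n", l_count);	/* prints 227 */
	return 0;
}
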
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index acbaebcad3a8..1b8e9e8405b2 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -327,26 +327,23 @@ int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
 	return is_bad;
 }
 
-static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
+static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx,
 		u64 fsblock, int hindex)
 {
-	struct inode *dir = file_inode(filp);
-	struct buffer_head *bh;
-	struct omfs_inode *oi;
-	u64 self;
-	int res = 0;
-	unsigned char d_type;
-
 	/* follow chain in this bucket */
 	while (fsblock != ~0) {
-		bh = omfs_bread(dir->i_sb, fsblock);
+		struct buffer_head *bh = omfs_bread(dir->i_sb, fsblock);
+		struct omfs_inode *oi;
+		u64 self;
+		unsigned char d_type;
+
 		if (!bh)
-			goto out;
+			return true;
 
 		oi = (struct omfs_inode *) bh->b_data;
 		if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) {
 			brelse(bh);
-			goto out;
+			return true;
 		}
 
 		self = fsblock;
@@ -361,15 +358,16 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
 
 		d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG;
 
-		res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
-				OMFS_NAMELEN), filp->f_pos, self, d_type);
+		if (!dir_emit(ctx, oi->i_name,
+			      strnlen(oi->i_name, OMFS_NAMELEN),
+			      self, d_type)) {
+			brelse(bh);
+			return false;
+		}
 		brelse(bh);
-		if (res < 0)
-			break;
-		filp->f_pos++;
+		ctx->pos++;
 	}
-out:
-	return res;
+	return true;
 }
 
 static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -403,60 +401,44 @@ out:
 	return err;
 }
 
-static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int omfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *dir = file_inode(filp);
+	struct inode *dir = file_inode(file);
 	struct buffer_head *bh;
-	loff_t offset, res;
+	__be64 *p;
 	unsigned int hchain, hindex;
 	int nbuckets;
-	u64 fsblock;
-	int ret = -EINVAL;
-
-	if (filp->f_pos >> 32)
-		goto success;
-
-	switch ((unsigned long) filp->f_pos) {
-	case 0:
-		if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
-			goto success;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
-		if (filldir(dirent, "..", 2, 1,
-				parent_ino(filp->f_dentry), DT_DIR) < 0)
-			goto success;
-		filp->f_pos = 1 << 20;
-		/* fall through */
+
+	if (ctx->pos >> 32)
+		return -EINVAL;
+
+	if (ctx->pos < 1 << 20) {
+		if (!dir_emit_dots(file, ctx))
+			return 0;
+		ctx->pos = 1 << 20;
 	}
 
 	nbuckets = (dir->i_size - OMFS_DIR_START) / 8;
 
 	/* high 12 bits store bucket + 1 and low 20 bits store hash index */
-	hchain = (filp->f_pos >> 20) - 1;
-	hindex = filp->f_pos & 0xfffff;
+	hchain = (ctx->pos >> 20) - 1;
+	hindex = ctx->pos & 0xfffff;
 
 	bh = omfs_bread(dir->i_sb, dir->i_ino);
 	if (!bh)
-		goto out;
+		return -EINVAL;
 
-	offset = OMFS_DIR_START + hchain * 8;
+	p = (__be64 *)(bh->b_data + OMFS_DIR_START) + hchain;
 
-	for (; hchain < nbuckets; hchain++, offset += 8) {
-		fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset]));
-
-		res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex);
-		hindex = 0;
-		if (res < 0)
+	for (; hchain < nbuckets; hchain++) {
+		__u64 fsblock = be64_to_cpu(*p++);
+		if (!omfs_fill_chain(dir, ctx, fsblock, hindex))
 			break;
-
-		filp->f_pos = (hchain+2) << 20;
+		hindex = 0;
+		ctx->pos = (hchain+2) << 20;
 	}
 	brelse(bh);
-success:
-	ret = 0;
-out:
-	return ret;
+	return 0;
 }
 
 const struct inode_operations omfs_dir_inops = {
@@ -470,6 +452,6 @@ const struct inode_operations omfs_dir_inops = {
 
 const struct file_operations omfs_dir_operations = {
 	.read = generic_read_dir,
-	.readdir = omfs_readdir,
+	.iterate = omfs_readdir,
 	.llseek = generic_file_llseek,
};
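
The omfs conversion is the template repeated by every filesystem in this merge: ->readdir/filldir becomes ->iterate with a dir_context, the "." and ".." special cases collapse into dir_emit_dots(), and f_pos bookkeeping moves to ctx->pos. A minimal sketch of the pattern, assuming kernel headers; "myfs" and its static name table are invented:

#include <linux/fs.h>
#include <linux/string.h>

/* Sketch only: the smallest useful ->iterate implementation. */
static int myfs_readdir(struct file *file, struct dir_context *ctx)
{
	static const char *names[] = { "alpha", "beta", "gamma" };

	/* emits "." and ".." as needed and leaves ctx->pos at 2 */
	if (!dir_emit_dots(file, ctx))
		return 0;

	while (ctx->pos - 2 < (loff_t)ARRAY_SIZE(names)) {
		const char *name = names[ctx->pos - 2];

		/* false means the user buffer is full: stop now;
		 * ctx->pos records where the next call resumes */
		if (!dir_emit(ctx, name, strlen(name),
			      100 + ctx->pos, DT_REG))
			return 0;
		ctx->pos++;
	}
	return 0;
}
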
diff --git a/fs/open.c b/fs/open.c
index 8c741002f947..7931f76acc2b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -823,7 +823,7 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 	int lookup_flags = 0;
 	int acc_mode;
 
-	if (flags & O_CREAT)
+	if (flags & (O_CREAT | __O_TMPFILE))
 		op->mode = (mode & S_IALLUGO) | S_IFREG;
 	else
 		op->mode = 0;
@@ -840,11 +840,17 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 	if (flags & __O_SYNC)
 		flags |= O_DSYNC;
 
-	/*
-	 * If we have O_PATH in the open flag. Then we
-	 * cannot have anything other than the below set of flags
-	 */
-	if (flags & O_PATH) {
+	if (flags & __O_TMPFILE) {
+		if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
+			return -EINVAL;
+		acc_mode = MAY_OPEN | ACC_MODE(flags);
+		if (!(acc_mode & MAY_WRITE))
+			return -EINVAL;
+	} else if (flags & O_PATH) {
+		/*
+		 * If we have O_PATH in the open flag. Then we
+		 * cannot have anything other than the below set of flags
+		 */
 		flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
 		acc_mode = 0;
 	} else {
@@ -876,7 +882,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 		lookup_flags |= LOOKUP_DIRECTORY;
 	if (!(flags & O_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
-	return lookup_flags;
+	op->lookup_flags = lookup_flags;
+	return 0;
 }
 
 /**
@@ -893,8 +900,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 struct file *file_open_name(struct filename *name, int flags, umode_t mode)
 {
 	struct open_flags op;
-	int lookup = build_open_flags(flags, mode, &op);
-	return do_filp_open(AT_FDCWD, name, &op, lookup);
+	int err = build_open_flags(flags, mode, &op);
+	return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
 }
 
 /**
@@ -919,37 +926,43 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 		       const char *filename, int flags)
 {
 	struct open_flags op;
-	int lookup = build_open_flags(flags, 0, &op);
+	int err = build_open_flags(flags, 0, &op);
+	if (err)
+		return ERR_PTR(err);
 	if (flags & O_CREAT)
 		return ERR_PTR(-EINVAL);
 	if (!filename && (flags & O_DIRECTORY))
 		if (!dentry->d_inode->i_op->lookup)
 			return ERR_PTR(-ENOTDIR);
-	return do_file_open_root(dentry, mnt, filename, &op, lookup);
+	return do_file_open_root(dentry, mnt, filename, &op);
 }
 EXPORT_SYMBOL(file_open_root);
 
 long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
 {
 	struct open_flags op;
-	int lookup = build_open_flags(flags, mode, &op);
-	struct filename *tmp = getname(filename);
-	int fd = PTR_ERR(tmp);
-
-	if (!IS_ERR(tmp)) {
-		fd = get_unused_fd_flags(flags);
-		if (fd >= 0) {
-			struct file *f = do_filp_open(dfd, tmp, &op, lookup);
-			if (IS_ERR(f)) {
-				put_unused_fd(fd);
-				fd = PTR_ERR(f);
-			} else {
-				fsnotify_open(f);
-				fd_install(fd, f);
-			}
+	int fd = build_open_flags(flags, mode, &op);
+	struct filename *tmp;
+
+	if (fd)
+		return fd;
+
+	tmp = getname(filename);
+	if (IS_ERR(tmp))
+		return PTR_ERR(tmp);
+
+	fd = get_unused_fd_flags(flags);
+	if (fd >= 0) {
+		struct file *f = do_filp_open(dfd, tmp, &op);
+		if (IS_ERR(f)) {
+			put_unused_fd(fd);
+			fd = PTR_ERR(f);
+		} else {
+			fsnotify_open(f);
+			fd_install(fd, f);
 		}
-		putname(tmp);
 	}
+	putname(tmp);
 	return fd;
 }
 
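
build_open_flags() now screens O_TMPFILE up front: the O_TMPFILE_MASK check rejects malformed flag combinations, and a tmpfile must be opened with write access, otherwise -EINVAL comes back before any path walk. From userspace the flag is used roughly as below (a hedged example; glibc exposes O_TMPFILE under _GNU_SOURCE, and error handling is trimmed to the essentials):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* the path names a directory; the file itself is never linked into
	 * it. A write mode is mandatory, so O_TMPFILE | O_RDONLY would fail
	 * with EINVAL per the hunk above. */
	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {
		perror("open(O_TMPFILE)");
		return 1;
	}
	if (write(fd, "scratch\n", 8) != 8)
		perror("write");
	close(fd);	/* the anonymous file disappears here */
	return 0;
}
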
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 75885ffde44e..8c0ceb8dd1f7 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -162,11 +162,11 @@ static const struct file_operations openpromfs_prop_ops = {
 	.release	= seq_release,
 };
 
-static int openpromfs_readdir(struct file *, void *, filldir_t);
+static int openpromfs_readdir(struct file *, struct dir_context *);
 
 static const struct file_operations openprom_operations = {
 	.read		= generic_read_dir,
-	.readdir	= openpromfs_readdir,
+	.iterate	= openpromfs_readdir,
 	.llseek		= generic_file_llseek,
 };
 
@@ -260,71 +260,64 @@ found:
 	return NULL;
 }
 
-static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int openpromfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct op_inode_info *oi = OP_I(inode);
 	struct device_node *dp = oi->u.node;
 	struct device_node *child;
 	struct property *prop;
-	unsigned int ino;
 	int i;
 
 	mutex_lock(&op_mutex);
 
-	ino = inode->i_ino;
-	i = filp->f_pos;
-	switch (i) {
-	case 0:
-		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+	if (ctx->pos == 0) {
+		if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
 			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall thru */
-	case 1:
-		if (filldir(dirent, "..", 2, i,
+		ctx->pos = 1;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit(ctx, "..", 2,
 			    (dp->parent == NULL ?
 			     OPENPROM_ROOT_INO :
-			     dp->parent->unique_id), DT_DIR) < 0)
+			     dp->parent->unique_id), DT_DIR))
 			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall thru */
-	default:
-		i -= 2;
-
-		/* First, the children nodes as directories. */
-		child = dp->child;
-		while (i && child) {
-			child = child->sibling;
-			i--;
-		}
-		while (child) {
-			if (filldir(dirent,
-				    child->path_component_name,
-				    strlen(child->path_component_name),
-				    filp->f_pos, child->unique_id, DT_DIR) < 0)
-				goto out;
-
-			filp->f_pos++;
-			child = child->sibling;
-		}
+		ctx->pos = 2;
+	}
+	i = ctx->pos - 2;
 
-		/* Next, the properties as files. */
-		prop = dp->properties;
-		while (i && prop) {
-			prop = prop->next;
-			i--;
-		}
-		while (prop) {
-			if (filldir(dirent, prop->name, strlen(prop->name),
-				    filp->f_pos, prop->unique_id, DT_REG) < 0)
-				goto out;
+	/* First, the children nodes as directories. */
+	child = dp->child;
+	while (i && child) {
+		child = child->sibling;
+		i--;
+	}
+	while (child) {
+		if (!dir_emit(ctx,
+			    child->path_component_name,
+			    strlen(child->path_component_name),
+			    child->unique_id, DT_DIR))
+			goto out;
 
-			filp->f_pos++;
-			prop = prop->next;
-		}
+		ctx->pos++;
+		child = child->sibling;
+	}
+
+	/* Next, the properties as files. */
+	prop = dp->properties;
+	while (i && prop) {
+		prop = prop->next;
+		i--;
 	}
+	while (prop) {
+		if (!dir_emit(ctx, prop->name, strlen(prop->name),
+			    prop->unique_id, DT_REG))
+			goto out;
+
+		ctx->pos++;
+		prop = prop->next;
+	}
+
 out:
 	mutex_unlock(&op_mutex);
 	return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c3834dad09b3..1485e38daaa3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1681,46 +1681,34 @@ const struct dentry_operations pid_dentry_operations =
1681 * reported by readdir in sync with the inode numbers reported 1681 * reported by readdir in sync with the inode numbers reported
1682 * by stat. 1682 * by stat.
1683 */ 1683 */
1684int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 1684bool proc_fill_cache(struct file *file, struct dir_context *ctx,
1685 const char *name, int len, 1685 const char *name, int len,
1686 instantiate_t instantiate, struct task_struct *task, const void *ptr) 1686 instantiate_t instantiate, struct task_struct *task, const void *ptr)
1687{ 1687{
1688 struct dentry *child, *dir = filp->f_path.dentry; 1688 struct dentry *child, *dir = file->f_path.dentry;
1689 struct qstr qname = QSTR_INIT(name, len);
1689 struct inode *inode; 1690 struct inode *inode;
1690 struct qstr qname; 1691 unsigned type;
1691 ino_t ino = 0; 1692 ino_t ino;
1692 unsigned type = DT_UNKNOWN;
1693
1694 qname.name = name;
1695 qname.len = len;
1696 qname.hash = full_name_hash(name, len);
1697 1693
1698 child = d_lookup(dir, &qname); 1694 child = d_hash_and_lookup(dir, &qname);
1699 if (!child) { 1695 if (!child) {
1700 struct dentry *new; 1696 child = d_alloc(dir, &qname);
1701 new = d_alloc(dir, &qname); 1697 if (!child)
1702 if (new) { 1698 goto end_instantiate;
1703 child = instantiate(dir->d_inode, new, task, ptr); 1699 if (instantiate(dir->d_inode, child, task, ptr) < 0) {
1704 if (child) 1700 dput(child);
1705 dput(new); 1701 goto end_instantiate;
1706 else
1707 child = new;
1708 } 1702 }
1709 } 1703 }
1710 if (!child || IS_ERR(child) || !child->d_inode)
1711 goto end_instantiate;
1712 inode = child->d_inode; 1704 inode = child->d_inode;
1713 if (inode) { 1705 ino = inode->i_ino;
1714 ino = inode->i_ino; 1706 type = inode->i_mode >> 12;
1715 type = inode->i_mode >> 12;
1716 }
1717 dput(child); 1707 dput(child);
1708 return dir_emit(ctx, name, len, ino, type);
1709
1718end_instantiate: 1710end_instantiate:
1719 if (!ino) 1711 return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
1720 ino = find_inode_number(dir, &qname);
1721 if (!ino)
1722 ino = 1;
1723 return filldir(dirent, name, len, filp->f_pos, ino, type);
1724} 1712}
1725 1713
1726#ifdef CONFIG_CHECKPOINT_RESTORE 1714#ifdef CONFIG_CHECKPOINT_RESTORE
@@ -1846,7 +1834,7 @@ struct map_files_info {
1846 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ 1834 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
1847}; 1835};
1848 1836
1849static struct dentry * 1837static int
1850proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, 1838proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1851 struct task_struct *task, const void *ptr) 1839 struct task_struct *task, const void *ptr)
1852{ 1840{
@@ -1856,7 +1844,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1856 1844
1857 inode = proc_pid_make_inode(dir->i_sb, task); 1845 inode = proc_pid_make_inode(dir->i_sb, task);
1858 if (!inode) 1846 if (!inode)
1859 return ERR_PTR(-ENOENT); 1847 return -ENOENT;
1860 1848
1861 ei = PROC_I(inode); 1849 ei = PROC_I(inode);
1862 ei->op.proc_get_link = proc_map_files_get_link; 1850 ei->op.proc_get_link = proc_map_files_get_link;
@@ -1873,7 +1861,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1873 d_set_d_op(dentry, &tid_map_files_dentry_operations); 1861 d_set_d_op(dentry, &tid_map_files_dentry_operations);
1874 d_add(dentry, inode); 1862 d_add(dentry, inode);
1875 1863
1876 return NULL; 1864 return 0;
1877} 1865}
1878 1866
1879static struct dentry *proc_map_files_lookup(struct inode *dir, 1867static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -1882,23 +1870,23 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
1882 unsigned long vm_start, vm_end; 1870 unsigned long vm_start, vm_end;
1883 struct vm_area_struct *vma; 1871 struct vm_area_struct *vma;
1884 struct task_struct *task; 1872 struct task_struct *task;
1885 struct dentry *result; 1873 int result;
1886 struct mm_struct *mm; 1874 struct mm_struct *mm;
1887 1875
1888 result = ERR_PTR(-EPERM); 1876 result = -EPERM;
1889 if (!capable(CAP_SYS_ADMIN)) 1877 if (!capable(CAP_SYS_ADMIN))
1890 goto out; 1878 goto out;
1891 1879
1892 result = ERR_PTR(-ENOENT); 1880 result = -ENOENT;
1893 task = get_proc_task(dir); 1881 task = get_proc_task(dir);
1894 if (!task) 1882 if (!task)
1895 goto out; 1883 goto out;
1896 1884
1897 result = ERR_PTR(-EACCES); 1885 result = -EACCES;
1898 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 1886 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1899 goto out_put_task; 1887 goto out_put_task;
1900 1888
1901 result = ERR_PTR(-ENOENT); 1889 result = -ENOENT;
1902 if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) 1890 if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
1903 goto out_put_task; 1891 goto out_put_task;
1904 1892
@@ -1921,7 +1909,7 @@ out_no_vma:
1921out_put_task: 1909out_put_task:
1922 put_task_struct(task); 1910 put_task_struct(task);
1923out: 1911out:
1924 return result; 1912 return ERR_PTR(result);
1925} 1913}
1926 1914
1927static const struct inode_operations proc_map_files_inode_operations = { 1915static const struct inode_operations proc_map_files_inode_operations = {
@@ -1931,14 +1919,15 @@ static const struct inode_operations proc_map_files_inode_operations = {
1931}; 1919};
1932 1920
1933static int 1921static int
1934proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) 1922proc_map_files_readdir(struct file *file, struct dir_context *ctx)
1935{ 1923{
1936 struct dentry *dentry = filp->f_path.dentry;
1937 struct inode *inode = dentry->d_inode;
1938 struct vm_area_struct *vma; 1924 struct vm_area_struct *vma;
1939 struct task_struct *task; 1925 struct task_struct *task;
1940 struct mm_struct *mm; 1926 struct mm_struct *mm;
1941 ino_t ino; 1927 unsigned long nr_files, pos, i;
1928 struct flex_array *fa = NULL;
1929 struct map_files_info info;
1930 struct map_files_info *p;
1942 int ret; 1931 int ret;
1943 1932
1944 ret = -EPERM; 1933 ret = -EPERM;
@@ -1946,7 +1935,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
1946 goto out; 1935 goto out;
1947 1936
1948 ret = -ENOENT; 1937 ret = -ENOENT;
1949 task = get_proc_task(inode); 1938 task = get_proc_task(file_inode(file));
1950 if (!task) 1939 if (!task)
1951 goto out; 1940 goto out;
1952 1941
@@ -1955,91 +1944,73 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
1955 goto out_put_task; 1944 goto out_put_task;
1956 1945
1957 ret = 0; 1946 ret = 0;
1958 switch (filp->f_pos) { 1947 if (!dir_emit_dots(file, ctx))
1959 case 0: 1948 goto out_put_task;
1960 ino = inode->i_ino;
1961 if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
1962 goto out_put_task;
1963 filp->f_pos++;
1964 case 1:
1965 ino = parent_ino(dentry);
1966 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
1967 goto out_put_task;
1968 filp->f_pos++;
1969 default:
1970 {
1971 unsigned long nr_files, pos, i;
1972 struct flex_array *fa = NULL;
1973 struct map_files_info info;
1974 struct map_files_info *p;
1975
1976 mm = get_task_mm(task);
1977 if (!mm)
1978 goto out_put_task;
1979 down_read(&mm->mmap_sem);
1980 1949
1981 nr_files = 0; 1950 mm = get_task_mm(task);
1951 if (!mm)
1952 goto out_put_task;
1953 down_read(&mm->mmap_sem);
1982 1954
1983 /* 1955 nr_files = 0;
1984 * We need two passes here:
1985 *
1986 * 1) Collect vmas of mapped files with mmap_sem taken
1987 * 2) Release mmap_sem and instantiate entries
1988 *
1989 * otherwise we get lockdep complained, since filldir()
1990 * routine might require mmap_sem taken in might_fault().
1991 */
1992 1956
1993 for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { 1957 /*
1994 if (vma->vm_file && ++pos > filp->f_pos) 1958 * We need two passes here:
1995 nr_files++; 1959 *
1996 } 1960 * 1) Collect vmas of mapped files with mmap_sem taken
1961 * 2) Release mmap_sem and instantiate entries
1962 *
1963 * otherwise we get lockdep complained, since filldir()
1964 * routine might require mmap_sem taken in might_fault().
1965 */
1997 1966
1998 if (nr_files) { 1967 for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
1999 fa = flex_array_alloc(sizeof(info), nr_files, 1968 if (vma->vm_file && ++pos > ctx->pos)
2000 GFP_KERNEL); 1969 nr_files++;
2001 if (!fa || flex_array_prealloc(fa, 0, nr_files, 1970 }
2002 GFP_KERNEL)) { 1971
2003 ret = -ENOMEM; 1972 if (nr_files) {
2004 if (fa) 1973 fa = flex_array_alloc(sizeof(info), nr_files,
2005 flex_array_free(fa); 1974 GFP_KERNEL);
2006 up_read(&mm->mmap_sem); 1975 if (!fa || flex_array_prealloc(fa, 0, nr_files,
2007 mmput(mm); 1976 GFP_KERNEL)) {
2008 goto out_put_task; 1977 ret = -ENOMEM;
2009 } 1978 if (fa)
2010 for (i = 0, vma = mm->mmap, pos = 2; vma; 1979 flex_array_free(fa);
2011 vma = vma->vm_next) { 1980 up_read(&mm->mmap_sem);
2012 if (!vma->vm_file) 1981 mmput(mm);
2013 continue; 1982 goto out_put_task;
2014 if (++pos <= filp->f_pos)
2015 continue;
2016
2017 info.mode = vma->vm_file->f_mode;
2018 info.len = snprintf(info.name,
2019 sizeof(info.name), "%lx-%lx",
2020 vma->vm_start, vma->vm_end);
2021 if (flex_array_put(fa, i++, &info, GFP_KERNEL))
2022 BUG();
2023 }
2024 } 1983 }
2025 up_read(&mm->mmap_sem); 1984 for (i = 0, vma = mm->mmap, pos = 2; vma;
2026 1985 vma = vma->vm_next) {
2027 for (i = 0; i < nr_files; i++) { 1986 if (!vma->vm_file)
2028 p = flex_array_get(fa, i); 1987 continue;
2029 ret = proc_fill_cache(filp, dirent, filldir, 1988 if (++pos <= ctx->pos)
2030 p->name, p->len, 1989 continue;
2031 proc_map_files_instantiate, 1990
2032 task, 1991 info.mode = vma->vm_file->f_mode;
2033 (void *)(unsigned long)p->mode); 1992 info.len = snprintf(info.name,
2034 if (ret) 1993 sizeof(info.name), "%lx-%lx",
2035 break; 1994 vma->vm_start, vma->vm_end);
2036 filp->f_pos++; 1995 if (flex_array_put(fa, i++, &info, GFP_KERNEL))
1996 BUG();
2037 } 1997 }
2038 if (fa)
2039 flex_array_free(fa);
2040 mmput(mm);
2041 } 1998 }
1999 up_read(&mm->mmap_sem);
2000
2001 for (i = 0; i < nr_files; i++) {
2002 p = flex_array_get(fa, i);
2003 if (!proc_fill_cache(file, ctx,
2004 p->name, p->len,
2005 proc_map_files_instantiate,
2006 task,
2007 (void *)(unsigned long)p->mode))
2008 break;
2009 ctx->pos++;
2042 } 2010 }
2011 if (fa)
2012 flex_array_free(fa);
2013 mmput(mm);
2043 2014
2044out_put_task: 2015out_put_task:
2045 put_task_struct(task); 2016 put_task_struct(task);
@@ -2049,7 +2020,7 @@ out:
2049 2020
2050static const struct file_operations proc_map_files_operations = { 2021static const struct file_operations proc_map_files_operations = {
2051 .read = generic_read_dir, 2022 .read = generic_read_dir,
2052 .readdir = proc_map_files_readdir, 2023 .iterate = proc_map_files_readdir,
2053 .llseek = default_llseek, 2024 .llseek = default_llseek,
2054}; 2025};
2055 2026
@@ -2152,13 +2123,12 @@ static const struct file_operations proc_timers_operations = {
2152}; 2123};
2153#endif /* CONFIG_CHECKPOINT_RESTORE */ 2124#endif /* CONFIG_CHECKPOINT_RESTORE */
2154 2125
2155static struct dentry *proc_pident_instantiate(struct inode *dir, 2126static int proc_pident_instantiate(struct inode *dir,
2156 struct dentry *dentry, struct task_struct *task, const void *ptr) 2127 struct dentry *dentry, struct task_struct *task, const void *ptr)
2157{ 2128{
2158 const struct pid_entry *p = ptr; 2129 const struct pid_entry *p = ptr;
2159 struct inode *inode; 2130 struct inode *inode;
2160 struct proc_inode *ei; 2131 struct proc_inode *ei;
2161 struct dentry *error = ERR_PTR(-ENOENT);
2162 2132
2163 inode = proc_pid_make_inode(dir->i_sb, task); 2133 inode = proc_pid_make_inode(dir->i_sb, task);
2164 if (!inode) 2134 if (!inode)
@@ -2177,9 +2147,9 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2177 d_add(dentry, inode); 2147 d_add(dentry, inode);
2178 /* Close the race of the process dying before we return the dentry */ 2148 /* Close the race of the process dying before we return the dentry */
2179 if (pid_revalidate(dentry, 0)) 2149 if (pid_revalidate(dentry, 0))
2180 error = NULL; 2150 return 0;
2181out: 2151out:
2182 return error; 2152 return -ENOENT;
2183} 2153}
2184 2154
2185static struct dentry *proc_pident_lookup(struct inode *dir, 2155static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -2187,11 +2157,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
2187 const struct pid_entry *ents, 2157 const struct pid_entry *ents,
2188 unsigned int nents) 2158 unsigned int nents)
2189{ 2159{
2190 struct dentry *error; 2160 int error;
2191 struct task_struct *task = get_proc_task(dir); 2161 struct task_struct *task = get_proc_task(dir);
2192 const struct pid_entry *p, *last; 2162 const struct pid_entry *p, *last;
2193 2163
2194 error = ERR_PTR(-ENOENT); 2164 error = -ENOENT;
2195 2165
2196 if (!task) 2166 if (!task)
2197 goto out_no_task; 2167 goto out_no_task;
@@ -2214,70 +2184,33 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
2214out: 2184out:
2215 put_task_struct(task); 2185 put_task_struct(task);
2216out_no_task: 2186out_no_task:
2217 return error; 2187 return ERR_PTR(error);
2218}
2219
2220static int proc_pident_fill_cache(struct file *filp, void *dirent,
2221 filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2222{
2223 return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2224 proc_pident_instantiate, task, p);
2225} 2188}
2226 2189
2227static int proc_pident_readdir(struct file *filp, 2190static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
2228 void *dirent, filldir_t filldir,
2229 const struct pid_entry *ents, unsigned int nents) 2191 const struct pid_entry *ents, unsigned int nents)
2230{ 2192{
2231 int i; 2193 struct task_struct *task = get_proc_task(file_inode(file));
2232 struct dentry *dentry = filp->f_path.dentry; 2194 const struct pid_entry *p;
2233 struct inode *inode = dentry->d_inode;
2234 struct task_struct *task = get_proc_task(inode);
2235 const struct pid_entry *p, *last;
2236 ino_t ino;
2237 int ret;
2238 2195
2239 ret = -ENOENT;
2240 if (!task) 2196 if (!task)
2241 goto out_no_task; 2197 return -ENOENT;
2242 2198
2243 ret = 0; 2199 if (!dir_emit_dots(file, ctx))
2244 i = filp->f_pos; 2200 goto out;
2245 switch (i) { 2201
2246 case 0: 2202 if (ctx->pos >= nents + 2)
2247 ino = inode->i_ino; 2203 goto out;
2248 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
2249 goto out;
2250 i++;
2251 filp->f_pos++;
2252 /* fall through */
2253 case 1:
2254 ino = parent_ino(dentry);
2255 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
2256 goto out;
2257 i++;
2258 filp->f_pos++;
2259 /* fall through */
2260 default:
2261 i -= 2;
2262 if (i >= nents) {
2263 ret = 1;
2264 goto out;
2265 }
2266 p = ents + i;
2267 last = &ents[nents - 1];
2268 while (p <= last) {
2269 if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
2270 goto out;
2271 filp->f_pos++;
2272 p++;
2273 }
2274 }
2275 2204
2276 ret = 1; 2205 for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
2206 if (!proc_fill_cache(file, ctx, p->name, p->len,
2207 proc_pident_instantiate, task, p))
2208 break;
2209 ctx->pos++;
2210 }
2277out: 2211out:
2278 put_task_struct(task); 2212 put_task_struct(task);
2279out_no_task: 2213 return 0;
2280 return ret;
2281} 2214}
2282 2215
2283#ifdef CONFIG_SECURITY 2216#ifdef CONFIG_SECURITY
@@ -2362,16 +2295,15 @@ static const struct pid_entry attr_dir_stuff[] = {
2362 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), 2295 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2363}; 2296};
2364 2297
2365static int proc_attr_dir_readdir(struct file * filp, 2298static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
2366 void * dirent, filldir_t filldir)
2367{ 2299{
2368 return proc_pident_readdir(filp,dirent,filldir, 2300 return proc_pident_readdir(file, ctx,
2369 attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); 2301 attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
2370} 2302}
2371 2303
2372static const struct file_operations proc_attr_dir_operations = { 2304static const struct file_operations proc_attr_dir_operations = {
2373 .read = generic_read_dir, 2305 .read = generic_read_dir,
2374 .readdir = proc_attr_dir_readdir, 2306 .iterate = proc_attr_dir_readdir,
2375 .llseek = default_llseek, 2307 .llseek = default_llseek,
2376}; 2308};
2377 2309
@@ -2725,16 +2657,15 @@ static const struct pid_entry tgid_base_stuff[] = {
2725#endif 2657#endif
2726}; 2658};
2727 2659
2728static int proc_tgid_base_readdir(struct file * filp, 2660static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
2729 void * dirent, filldir_t filldir)
2730{ 2661{
2731 return proc_pident_readdir(filp,dirent,filldir, 2662 return proc_pident_readdir(file, ctx,
2732 tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); 2663 tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
2733} 2664}
2734 2665
2735static const struct file_operations proc_tgid_base_operations = { 2666static const struct file_operations proc_tgid_base_operations = {
2736 .read = generic_read_dir, 2667 .read = generic_read_dir,
2737 .readdir = proc_tgid_base_readdir, 2668 .iterate = proc_tgid_base_readdir,
2738 .llseek = default_llseek, 2669 .llseek = default_llseek,
2739}; 2670};
2740 2671
@@ -2836,11 +2767,10 @@ void proc_flush_task(struct task_struct *task)
2836 } 2767 }
2837} 2768}
2838 2769
2839static struct dentry *proc_pid_instantiate(struct inode *dir, 2770static int proc_pid_instantiate(struct inode *dir,
2840 struct dentry * dentry, 2771 struct dentry * dentry,
2841 struct task_struct *task, const void *ptr) 2772 struct task_struct *task, const void *ptr)
2842{ 2773{
2843 struct dentry *error = ERR_PTR(-ENOENT);
2844 struct inode *inode; 2774 struct inode *inode;
2845 2775
2846 inode = proc_pid_make_inode(dir->i_sb, task); 2776 inode = proc_pid_make_inode(dir->i_sb, task);
@@ -2860,14 +2790,14 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
2860 d_add(dentry, inode); 2790 d_add(dentry, inode);
2861 /* Close the race of the process dying before we return the dentry */ 2791 /* Close the race of the process dying before we return the dentry */
2862 if (pid_revalidate(dentry, 0)) 2792 if (pid_revalidate(dentry, 0))
2863 error = NULL; 2793 return 0;
2864out: 2794out:
2865 return error; 2795 return -ENOENT;
2866} 2796}
2867 2797
2868struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2798struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
2869{ 2799{
2870 struct dentry *result = NULL; 2800 int result = 0;
2871 struct task_struct *task; 2801 struct task_struct *task;
2872 unsigned tgid; 2802 unsigned tgid;
2873 struct pid_namespace *ns; 2803 struct pid_namespace *ns;
@@ -2888,7 +2818,7 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsign
2888 result = proc_pid_instantiate(dir, dentry, task, NULL); 2818 result = proc_pid_instantiate(dir, dentry, task, NULL);
2889 put_task_struct(task); 2819 put_task_struct(task);
2890out: 2820out:
2891 return result; 2821 return ERR_PTR(result);
2892} 2822}
2893 2823
2894/* 2824/*
@@ -2936,58 +2866,42 @@ retry:
2936 2866
2937#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) 2867#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1)
2938 2868
2939static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2940 struct tgid_iter iter)
2941{
2942 char name[PROC_NUMBUF];
2943 int len = snprintf(name, sizeof(name), "%d", iter.tgid);
2944 return proc_fill_cache(filp, dirent, filldir, name, len,
2945 proc_pid_instantiate, iter.task, NULL);
2946}
2947
2948static int fake_filldir(void *buf, const char *name, int namelen,
2949 loff_t offset, u64 ino, unsigned d_type)
2950{
2951 return 0;
2952}
2953
2954/* for the /proc/ directory itself, after non-process stuff has been done */ 2869/* for the /proc/ directory itself, after non-process stuff has been done */
2955int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 2870int proc_pid_readdir(struct file *file, struct dir_context *ctx)
2956{ 2871{
2957 struct tgid_iter iter; 2872 struct tgid_iter iter;
2958 struct pid_namespace *ns; 2873 struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info;
2959 filldir_t __filldir; 2874 loff_t pos = ctx->pos;
2960 loff_t pos = filp->f_pos;
2961 2875
2962 if (pos >= PID_MAX_LIMIT + TGID_OFFSET) 2876 if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
2963 goto out; 2877 return 0;
2964 2878
2965 if (pos == TGID_OFFSET - 1) { 2879 if (pos == TGID_OFFSET - 1) {
2966 if (proc_fill_cache(filp, dirent, filldir, "self", 4, 2880 struct inode *inode = ns->proc_self->d_inode;
2967 NULL, NULL, NULL) < 0) 2881 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
2968 goto out; 2882 return 0;
2969 iter.tgid = 0; 2883 iter.tgid = 0;
2970 } else { 2884 } else {
2971 iter.tgid = pos - TGID_OFFSET; 2885 iter.tgid = pos - TGID_OFFSET;
2972 } 2886 }
2973 iter.task = NULL; 2887 iter.task = NULL;
2974 ns = filp->f_dentry->d_sb->s_fs_info;
2975 for (iter = next_tgid(ns, iter); 2888 for (iter = next_tgid(ns, iter);
2976 iter.task; 2889 iter.task;
2977 iter.tgid += 1, iter = next_tgid(ns, iter)) { 2890 iter.tgid += 1, iter = next_tgid(ns, iter)) {
2978 if (has_pid_permissions(ns, iter.task, 2)) 2891 char name[PROC_NUMBUF];
2979 __filldir = filldir; 2892 int len;
2980 else 2893 if (!has_pid_permissions(ns, iter.task, 2))
2981 __filldir = fake_filldir; 2894 continue;
2982 2895
2983 filp->f_pos = iter.tgid + TGID_OFFSET; 2896 len = snprintf(name, sizeof(name), "%d", iter.tgid);
2984 if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { 2897 ctx->pos = iter.tgid + TGID_OFFSET;
2898 if (!proc_fill_cache(file, ctx, name, len,
2899 proc_pid_instantiate, iter.task, NULL)) {
2985 put_task_struct(iter.task); 2900 put_task_struct(iter.task);
2986 goto out; 2901 return 0;
2987 } 2902 }
2988 } 2903 }
2989 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; 2904 ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
2990out:
2991 return 0; 2905 return 0;
2992} 2906}
2993 2907
@@ -3075,11 +2989,10 @@ static const struct pid_entry tid_base_stuff[] = {
3075#endif 2989#endif
3076}; 2990};
3077 2991
3078static int proc_tid_base_readdir(struct file * filp, 2992static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
3079 void * dirent, filldir_t filldir)
3080{ 2993{
3081 return proc_pident_readdir(filp,dirent,filldir, 2994 return proc_pident_readdir(file, ctx,
3082 tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); 2995 tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3083} 2996}
3084 2997
3085static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2998static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -3090,7 +3003,7 @@ static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *den
3090 3003
3091static const struct file_operations proc_tid_base_operations = { 3004static const struct file_operations proc_tid_base_operations = {
3092 .read = generic_read_dir, 3005 .read = generic_read_dir,
3093 .readdir = proc_tid_base_readdir, 3006 .iterate = proc_tid_base_readdir,
3094 .llseek = default_llseek, 3007 .llseek = default_llseek,
3095}; 3008};
3096 3009
@@ -3100,10 +3013,9 @@ static const struct inode_operations proc_tid_base_inode_operations = {
3100 .setattr = proc_setattr, 3013 .setattr = proc_setattr,
3101}; 3014};
3102 3015
3103static struct dentry *proc_task_instantiate(struct inode *dir, 3016static int proc_task_instantiate(struct inode *dir,
3104 struct dentry *dentry, struct task_struct *task, const void *ptr) 3017 struct dentry *dentry, struct task_struct *task, const void *ptr)
3105{ 3018{
3106 struct dentry *error = ERR_PTR(-ENOENT);
3107 struct inode *inode; 3019 struct inode *inode;
3108 inode = proc_pid_make_inode(dir->i_sb, task); 3020 inode = proc_pid_make_inode(dir->i_sb, task);
3109 3021
@@ -3122,14 +3034,14 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
3122 d_add(dentry, inode); 3034 d_add(dentry, inode);
3123 /* Close the race of the process dying before we return the dentry */ 3035 /* Close the race of the process dying before we return the dentry */
3124 if (pid_revalidate(dentry, 0)) 3036 if (pid_revalidate(dentry, 0))
3125 error = NULL; 3037 return 0;
3126out: 3038out:
3127 return error; 3039 return -ENOENT;
3128} 3040}
3129 3041
3130static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 3042static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
3131{ 3043{
3132 struct dentry *result = ERR_PTR(-ENOENT); 3044 int result = -ENOENT;
3133 struct task_struct *task; 3045 struct task_struct *task;
3134 struct task_struct *leader = get_proc_task(dir); 3046 struct task_struct *leader = get_proc_task(dir);
3135 unsigned tid; 3047 unsigned tid;
@@ -3159,7 +3071,7 @@ out_drop_task:
3159out: 3071out:
3160 put_task_struct(leader); 3072 put_task_struct(leader);
3161out_no_task: 3073out_no_task:
3162 return result; 3074 return ERR_PTR(result);
3163} 3075}
3164 3076
3165/* 3077/*
@@ -3231,30 +3143,16 @@ static struct task_struct *next_tid(struct task_struct *start)
3231 return pos; 3143 return pos;
3232} 3144}
3233 3145
3234static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
3235 struct task_struct *task, int tid)
3236{
3237 char name[PROC_NUMBUF];
3238 int len = snprintf(name, sizeof(name), "%d", tid);
3239 return proc_fill_cache(filp, dirent, filldir, name, len,
3240 proc_task_instantiate, task, NULL);
3241}
3242
3243/* for the /proc/TGID/task/ directories */ 3146/* for the /proc/TGID/task/ directories */
3244static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) 3147static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3245{ 3148{
3246 struct dentry *dentry = filp->f_path.dentry;
3247 struct inode *inode = dentry->d_inode;
3248 struct task_struct *leader = NULL; 3149 struct task_struct *leader = NULL;
3249 struct task_struct *task; 3150 struct task_struct *task = get_proc_task(file_inode(file));
3250 int retval = -ENOENT;
3251 ino_t ino;
3252 int tid;
3253 struct pid_namespace *ns; 3151 struct pid_namespace *ns;
3152 int tid;
3254 3153
3255 task = get_proc_task(inode);
3256 if (!task) 3154 if (!task)
3257 goto out_no_task; 3155 return -ENOENT;
3258 rcu_read_lock(); 3156 rcu_read_lock();
3259 if (pid_alive(task)) { 3157 if (pid_alive(task)) {
3260 leader = task->group_leader; 3158 leader = task->group_leader;
@@ -3263,46 +3161,36 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
3263 rcu_read_unlock(); 3161 rcu_read_unlock();
3264 put_task_struct(task); 3162 put_task_struct(task);
3265 if (!leader) 3163 if (!leader)
3266 goto out_no_task; 3164 return -ENOENT;
3267 retval = 0;
3268 3165
3269 switch ((unsigned long)filp->f_pos) { 3166 if (!dir_emit_dots(file, ctx))
3270 case 0: 3167 goto out;
3271 ino = inode->i_ino;
3272 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
3273 goto out;
3274 filp->f_pos++;
3275 /* fall through */
3276 case 1:
3277 ino = parent_ino(dentry);
3278 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
3279 goto out;
3280 filp->f_pos++;
3281 /* fall through */
3282 }
3283 3168
3284 /* f_version caches the tgid value that the last readdir call couldn't 3169 /* f_version caches the tgid value that the last readdir call couldn't
3285 * return. lseek aka telldir automagically resets f_version to 0. 3170 * return. lseek aka telldir automagically resets f_version to 0.
3286 */ 3171 */
3287 ns = filp->f_dentry->d_sb->s_fs_info; 3172 ns = file->f_dentry->d_sb->s_fs_info;
3288 tid = (int)filp->f_version; 3173 tid = (int)file->f_version;
3289 filp->f_version = 0; 3174 file->f_version = 0;
3290 for (task = first_tid(leader, tid, filp->f_pos - 2, ns); 3175 for (task = first_tid(leader, tid, ctx->pos - 2, ns);
3291 task; 3176 task;
3292 task = next_tid(task), filp->f_pos++) { 3177 task = next_tid(task), ctx->pos++) {
3178 char name[PROC_NUMBUF];
3179 int len;
3293 tid = task_pid_nr_ns(task, ns); 3180 tid = task_pid_nr_ns(task, ns);
3294 if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { 3181 len = snprintf(name, sizeof(name), "%d", tid);
3182 if (!proc_fill_cache(file, ctx, name, len,
3183 proc_task_instantiate, task, NULL)) {
3295 /* returning this tgid failed, save it as the first 3184 /* returning this tgid failed, save it as the first
3296 * pid for the next readir call */ 3185 * pid for the next readir call */
3297 filp->f_version = (u64)tid; 3186 file->f_version = (u64)tid;
3298 put_task_struct(task); 3187 put_task_struct(task);
3299 break; 3188 break;
3300 } 3189 }
3301 } 3190 }
3302out: 3191out:
3303 put_task_struct(leader); 3192 put_task_struct(leader);
3304out_no_task: 3193 return 0;
3305 return retval;
3306} 3194}
3307 3195
3308static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 3196static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
@@ -3328,6 +3216,6 @@ static const struct inode_operations proc_task_inode_operations = {
3328 3216
3329static const struct file_operations proc_task_operations = { 3217static const struct file_operations proc_task_operations = {
3330 .read = generic_read_dir, 3218 .read = generic_read_dir,
3331 .readdir = proc_task_readdir, 3219 .iterate = proc_task_readdir,
3332 .llseek = default_llseek, 3220 .llseek = default_llseek,
3333}; 3221};
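
The conversion pattern running through all the hunks above: the old readdir()/filldir callback pair becomes iterate() with a struct dir_context, dir_emit_dots() replaces the hand-rolled "." and ".." cases, dir_emit() returns false once the user buffer fills, and ctx->pos takes over from filp->f_pos as the resume cursor. A minimal sketch of the new shape, with hypothetical my_names and my_dir_iterate (not code from this patch):

    #include <linux/fs.h>
    #include <linux/kernel.h>
    #include <linux/string.h>

    static const char *my_names[] = { "alpha", "beta", "gamma" };

    static int my_dir_iterate(struct file *file, struct dir_context *ctx)
    {
            unsigned int i;

            if (!dir_emit_dots(file, ctx))          /* emits "." and ".." */
                    return 0;
            for (i = ctx->pos - 2; i < ARRAY_SIZE(my_names); i++) {
                    /* false means the buffer is full; ctx->pos is left at
                     * the unreturned entry so the next call resumes here */
                    if (!dir_emit(ctx, my_names[i], strlen(my_names[i]),
                                  100 + i /* ino */, DT_REG))
                            return 0;
                    ctx->pos++;
            }
            return 0;
    }
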
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index d7a4a28ef630..75f2890abbd8 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -167,11 +167,10 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
167 return ret; 167 return ret;
168} 168}
169 169
170static struct dentry * 170static int
171proc_fd_instantiate(struct inode *dir, struct dentry *dentry, 171proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
172 struct task_struct *task, const void *ptr) 172 struct task_struct *task, const void *ptr)
173{ 173{
174 struct dentry *error = ERR_PTR(-ENOENT);
175 unsigned fd = (unsigned long)ptr; 174 unsigned fd = (unsigned long)ptr;
176 struct proc_inode *ei; 175 struct proc_inode *ei;
177 struct inode *inode; 176 struct inode *inode;
@@ -194,9 +193,9 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
194 193
195 /* Close the race of the process dying before we return the dentry */ 194 /* Close the race of the process dying before we return the dentry */
196 if (tid_fd_revalidate(dentry, 0)) 195 if (tid_fd_revalidate(dentry, 0))
197 error = NULL; 196 return 0;
198 out: 197 out:
199 return error; 198 return -ENOENT;
200} 199}
201 200
202static struct dentry *proc_lookupfd_common(struct inode *dir, 201static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -204,7 +203,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
204 instantiate_t instantiate) 203 instantiate_t instantiate)
205{ 204{
206 struct task_struct *task = get_proc_task(dir); 205 struct task_struct *task = get_proc_task(dir);
207 struct dentry *result = ERR_PTR(-ENOENT); 206 int result = -ENOENT;
208 unsigned fd = name_to_int(dentry); 207 unsigned fd = name_to_int(dentry);
209 208
210 if (!task) 209 if (!task)
@@ -216,77 +215,61 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
216out: 215out:
217 put_task_struct(task); 216 put_task_struct(task);
218out_no_task: 217out_no_task:
219 return result; 218 return ERR_PTR(result);
220} 219}
221 220
222static int proc_readfd_common(struct file * filp, void * dirent, 221static int proc_readfd_common(struct file *file, struct dir_context *ctx,
223 filldir_t filldir, instantiate_t instantiate) 222 instantiate_t instantiate)
224{ 223{
225 struct dentry *dentry = filp->f_path.dentry; 224 struct task_struct *p = get_proc_task(file_inode(file));
226 struct inode *inode = dentry->d_inode;
227 struct task_struct *p = get_proc_task(inode);
228 struct files_struct *files; 225 struct files_struct *files;
229 unsigned int fd, ino; 226 unsigned int fd;
230 int retval;
231 227
232 retval = -ENOENT;
233 if (!p) 228 if (!p)
234 goto out_no_task; 229 return -ENOENT;
235 retval = 0;
236
237 fd = filp->f_pos;
238 switch (fd) {
239 case 0:
240 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
241 goto out;
242 filp->f_pos++;
243 case 1:
244 ino = parent_ino(dentry);
245 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
246 goto out;
247 filp->f_pos++;
248 default:
249 files = get_files_struct(p);
250 if (!files)
251 goto out;
252 rcu_read_lock();
253 for (fd = filp->f_pos - 2;
254 fd < files_fdtable(files)->max_fds;
255 fd++, filp->f_pos++) {
256 char name[PROC_NUMBUF];
257 int len;
258 int rv;
259
260 if (!fcheck_files(files, fd))
261 continue;
262 rcu_read_unlock();
263 230
264 len = snprintf(name, sizeof(name), "%d", fd); 231 if (!dir_emit_dots(file, ctx))
265 rv = proc_fill_cache(filp, dirent, filldir, 232 goto out;
266 name, len, instantiate, p, 233 if (!dir_emit_dots(file, ctx))
267 (void *)(unsigned long)fd); 234 goto out;
268 if (rv < 0) 235 files = get_files_struct(p);
269 goto out_fd_loop; 236 if (!files)
270 rcu_read_lock(); 237 goto out;
271 } 238
272 rcu_read_unlock(); 239 rcu_read_lock();
273out_fd_loop: 240 for (fd = ctx->pos - 2;
274 put_files_struct(files); 241 fd < files_fdtable(files)->max_fds;
242 fd++, ctx->pos++) {
243 char name[PROC_NUMBUF];
244 int len;
245
246 if (!fcheck_files(files, fd))
247 continue;
248 rcu_read_unlock();
249
250 len = snprintf(name, sizeof(name), "%d", fd);
251 if (!proc_fill_cache(file, ctx,
252 name, len, instantiate, p,
253 (void *)(unsigned long)fd))
254 goto out_fd_loop;
255 rcu_read_lock();
275 } 256 }
257 rcu_read_unlock();
258out_fd_loop:
259 put_files_struct(files);
276out: 260out:
277 put_task_struct(p); 261 put_task_struct(p);
278out_no_task: 262 return 0;
279 return retval;
280} 263}
281 264
282static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir) 265static int proc_readfd(struct file *file, struct dir_context *ctx)
283{ 266{
284 return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate); 267 return proc_readfd_common(file, ctx, proc_fd_instantiate);
285} 268}
286 269
287const struct file_operations proc_fd_operations = { 270const struct file_operations proc_fd_operations = {
288 .read = generic_read_dir, 271 .read = generic_read_dir,
289 .readdir = proc_readfd, 272 .iterate = proc_readfd,
290 .llseek = default_llseek, 273 .llseek = default_llseek,
291}; 274};
292 275
@@ -316,11 +299,10 @@ const struct inode_operations proc_fd_inode_operations = {
316 .setattr = proc_setattr, 299 .setattr = proc_setattr,
317}; 300};
318 301
319static struct dentry * 302static int
320proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, 303proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
321 struct task_struct *task, const void *ptr) 304 struct task_struct *task, const void *ptr)
322{ 305{
323 struct dentry *error = ERR_PTR(-ENOENT);
324 unsigned fd = (unsigned long)ptr; 306 unsigned fd = (unsigned long)ptr;
325 struct proc_inode *ei; 307 struct proc_inode *ei;
326 struct inode *inode; 308 struct inode *inode;
@@ -340,9 +322,9 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
340 322
341 /* Close the race of the process dying before we return the dentry */ 323 /* Close the race of the process dying before we return the dentry */
342 if (tid_fd_revalidate(dentry, 0)) 324 if (tid_fd_revalidate(dentry, 0))
343 error = NULL; 325 return 0;
344 out: 326 out:
345 return error; 327 return -ENOENT;
346} 328}
347 329
348static struct dentry * 330static struct dentry *
@@ -351,9 +333,9 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
351 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); 333 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
352} 334}
353 335
354static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir) 336static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
355{ 337{
356 return proc_readfd_common(filp, dirent, filldir, 338 return proc_readfd_common(file, ctx,
357 proc_fdinfo_instantiate); 339 proc_fdinfo_instantiate);
358} 340}
359 341
@@ -364,6 +346,6 @@ const struct inode_operations proc_fdinfo_inode_operations = {
364 346
365const struct file_operations proc_fdinfo_operations = { 347const struct file_operations proc_fdinfo_operations = {
366 .read = generic_read_dir, 348 .read = generic_read_dir,
367 .readdir = proc_readfdinfo, 349 .iterate = proc_readfdinfo,
368 .llseek = default_llseek, 350 .llseek = default_llseek,
369}; 351};
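
The instantiate callbacks change shape the same way across base.c, fd.c and namespaces.c: they used to return struct dentry * (NULL on success, ERR_PTR(-ENOENT) on failure) and now return a plain int, with proc_fill_cache() returning bool (false meaning stop iterating). A sketch of the new contract using the helpers visible in these hunks; demo_instantiate is a hypothetical name:

    static int demo_instantiate(struct inode *dir, struct dentry *dentry,
                                struct task_struct *task, const void *ptr)
    {
            struct inode *inode = proc_pid_make_inode(dir->i_sb, task);

            if (!inode)
                    return -ENOENT;
            /* ... set inode->i_mode, i_op, i_fop for the entry ... */
            d_add(dentry, inode);
            /* close the race of the process dying before we return */
            if (tid_fd_revalidate(dentry, 0))
                    return 0;
            return -ENOENT;
    }
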
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index a2596afffae6..94441a407337 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -233,76 +233,52 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
233 * value of the readdir() call, as long as it's non-negative 233 * value of the readdir() call, as long as it's non-negative
234 * for success. 234 * for success.
235 */ 235 */
236int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, 236int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
237 filldir_t filldir) 237 struct dir_context *ctx)
238{ 238{
239 unsigned int ino;
240 int i; 239 int i;
241 struct inode *inode = file_inode(filp);
242 int ret = 0;
243
244 ino = inode->i_ino;
245 i = filp->f_pos;
246 switch (i) {
247 case 0:
248 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
249 goto out;
250 i++;
251 filp->f_pos++;
252 /* fall through */
253 case 1:
254 if (filldir(dirent, "..", 2, i,
255 parent_ino(filp->f_path.dentry),
256 DT_DIR) < 0)
257 goto out;
258 i++;
259 filp->f_pos++;
260 /* fall through */
261 default:
262 spin_lock(&proc_subdir_lock);
263 de = de->subdir;
264 i -= 2;
265 for (;;) {
266 if (!de) {
267 ret = 1;
268 spin_unlock(&proc_subdir_lock);
269 goto out;
270 }
271 if (!i)
272 break;
273 de = de->next;
274 i--;
275 }
276 240
277 do { 241 if (!dir_emit_dots(file, ctx))
278 struct proc_dir_entry *next; 242 return 0;
279 243
280 /* filldir passes info to user space */ 244 spin_lock(&proc_subdir_lock);
281 pde_get(de); 245 de = de->subdir;
282 spin_unlock(&proc_subdir_lock); 246 i = ctx->pos - 2;
283 if (filldir(dirent, de->name, de->namelen, filp->f_pos, 247 for (;;) {
284 de->low_ino, de->mode >> 12) < 0) { 248 if (!de) {
285 pde_put(de);
286 goto out;
287 }
288 spin_lock(&proc_subdir_lock);
289 filp->f_pos++;
290 next = de->next;
291 pde_put(de);
292 de = next;
293 } while (de);
294 spin_unlock(&proc_subdir_lock); 249 spin_unlock(&proc_subdir_lock);
250 return 0;
251 }
252 if (!i)
253 break;
254 de = de->next;
255 i--;
295 } 256 }
296 ret = 1; 257
297out: 258 do {
298 return ret; 259 struct proc_dir_entry *next;
260 pde_get(de);
261 spin_unlock(&proc_subdir_lock);
262 if (!dir_emit(ctx, de->name, de->namelen,
263 de->low_ino, de->mode >> 12)) {
264 pde_put(de);
265 return 0;
266 }
267 spin_lock(&proc_subdir_lock);
268 ctx->pos++;
269 next = de->next;
270 pde_put(de);
271 de = next;
272 } while (de);
273 spin_unlock(&proc_subdir_lock);
274 return 0;
299} 275}
300 276
301int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) 277int proc_readdir(struct file *file, struct dir_context *ctx)
302{ 278{
303 struct inode *inode = file_inode(filp); 279 struct inode *inode = file_inode(file);
304 280
305 return proc_readdir_de(PDE(inode), filp, dirent, filldir); 281 return proc_readdir_de(PDE(inode), file, ctx);
306} 282}
307 283
308/* 284/*
@@ -313,7 +289,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
313static const struct file_operations proc_dir_operations = { 289static const struct file_operations proc_dir_operations = {
314 .llseek = generic_file_llseek, 290 .llseek = generic_file_llseek,
315 .read = generic_read_dir, 291 .read = generic_read_dir,
316 .readdir = proc_readdir, 292 .iterate = proc_readdir,
317}; 293};
318 294
319/* 295/*
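
One subtlety survives the rewrite of proc_readdir_de() above: dir_emit() copies to user space and may sleep, so it must never run under proc_subdir_lock; the current entry is pinned with pde_get() before the lock is dropped and released with pde_put() after it is retaken. The generic shape of that pattern, with hypothetical types and helpers:

    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct demo_entry {
            struct demo_entry *next;
            /* name, inode number, reference count, ... */
    };

    static void demo_walk(struct demo_entry *de, spinlock_t *lock,
                          bool (*emit)(struct demo_entry *),
                          void (*get)(struct demo_entry *),
                          void (*put)(struct demo_entry *))
    {
            spin_lock(lock);
            while (de) {
                    struct demo_entry *next;

                    get(de);                /* pin de across the unlock */
                    spin_unlock(lock);
                    if (!emit(de)) {        /* may sleep, copy to user */
                            put(de);
                            return;
                    }
                    spin_lock(lock);
                    next = de->next;        /* safe: de is still pinned */
                    put(de);
                    de = next;
            }
            spin_unlock(lock);
    }
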
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index d600fb098b6a..651d09a11dde 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -165,14 +165,14 @@ extern int proc_setattr(struct dentry *, struct iattr *);
165extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); 165extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *);
166extern int pid_revalidate(struct dentry *, unsigned int); 166extern int pid_revalidate(struct dentry *, unsigned int);
167extern int pid_delete_dentry(const struct dentry *); 167extern int pid_delete_dentry(const struct dentry *);
168extern int proc_pid_readdir(struct file *, void *, filldir_t); 168extern int proc_pid_readdir(struct file *, struct dir_context *);
169extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); 169extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int);
170extern loff_t mem_lseek(struct file *, loff_t, int); 170extern loff_t mem_lseek(struct file *, loff_t, int);
171 171
172/* Lookups */ 172/* Lookups */
173typedef struct dentry *instantiate_t(struct inode *, struct dentry *, 173typedef int instantiate_t(struct inode *, struct dentry *,
174 struct task_struct *, const void *); 174 struct task_struct *, const void *);
175extern int proc_fill_cache(struct file *, void *, filldir_t, const char *, int, 175extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
176 instantiate_t, struct task_struct *, const void *); 176 instantiate_t, struct task_struct *, const void *);
177 177
178/* 178/*
@@ -183,8 +183,8 @@ extern spinlock_t proc_subdir_lock;
183extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); 183extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
184extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, 184extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
185 struct dentry *); 185 struct dentry *);
186extern int proc_readdir(struct file *, void *, filldir_t); 186extern int proc_readdir(struct file *, struct dir_context *);
187extern int proc_readdir_de(struct proc_dir_entry *, struct file *, void *, filldir_t); 187extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
188 188
189static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) 189static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
190{ 190{
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 0a22194e5d58..06ea155e1a59 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -408,7 +408,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
408 prpsinfo.pr_zomb = 0; 408 prpsinfo.pr_zomb = 0;
409 409
410 strcpy(prpsinfo.pr_fname, "vmlinux"); 410 strcpy(prpsinfo.pr_fname, "vmlinux");
411 strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ); 411 strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs));
412 412
413 nhdr->p_filesz += notesize(&notes[1]); 413 nhdr->p_filesz += notesize(&notes[1]);
414 bufp = storenote(&notes[1], bufp); 414 bufp = storenote(&notes[1], bufp);
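
The kcore hunk above swaps strncpy() for strlcpy() because strncpy() leaves the destination unterminated whenever the source is at least as long as the buffer, while strlcpy() always NUL-terminates within the given size (here spelled sizeof(prpsinfo.pr_psargs)). A user-space demonstration with a local my_strlcpy(), since glibc does not provide strlcpy():

    #include <stdio.h>
    #include <string.h>

    static size_t my_strlcpy(char *dst, const char *src, size_t size)
    {
            size_t len = strlen(src);

            if (size) {
                    size_t n = len >= size ? size - 1 : len;

                    memcpy(dst, src, n);
                    dst[n] = '\0';          /* always terminated */
            }
            return len;     /* length it tried to create, as the kernel's does */
    }

    int main(void)
    {
            char a[8], b[8];

            strncpy(a, "0123456789", sizeof(a));    /* a is NOT terminated */
            my_strlcpy(b, "0123456789", sizeof(b)); /* b = "0123456" */
            printf("%.8s / %s\n", a, b);
            return 0;
    }
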
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 54bdc6701e9f..49a7fff2e83a 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -187,13 +187,12 @@ static const struct inode_operations proc_ns_link_inode_operations = {
187 .setattr = proc_setattr, 187 .setattr = proc_setattr,
188}; 188};
189 189
190static struct dentry *proc_ns_instantiate(struct inode *dir, 190static int proc_ns_instantiate(struct inode *dir,
191 struct dentry *dentry, struct task_struct *task, const void *ptr) 191 struct dentry *dentry, struct task_struct *task, const void *ptr)
192{ 192{
193 const struct proc_ns_operations *ns_ops = ptr; 193 const struct proc_ns_operations *ns_ops = ptr;
194 struct inode *inode; 194 struct inode *inode;
195 struct proc_inode *ei; 195 struct proc_inode *ei;
196 struct dentry *error = ERR_PTR(-ENOENT);
197 196
198 inode = proc_pid_make_inode(dir->i_sb, task); 197 inode = proc_pid_make_inode(dir->i_sb, task);
199 if (!inode) 198 if (!inode)
@@ -208,90 +207,52 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
208 d_add(dentry, inode); 207 d_add(dentry, inode);
209 /* Close the race of the process dying before we return the dentry */ 208 /* Close the race of the process dying before we return the dentry */
210 if (pid_revalidate(dentry, 0)) 209 if (pid_revalidate(dentry, 0))
211 error = NULL; 210 return 0;
212out: 211out:
213 return error; 212 return -ENOENT;
214}
215
216static int proc_ns_fill_cache(struct file *filp, void *dirent,
217 filldir_t filldir, struct task_struct *task,
218 const struct proc_ns_operations *ops)
219{
220 return proc_fill_cache(filp, dirent, filldir,
221 ops->name, strlen(ops->name),
222 proc_ns_instantiate, task, ops);
223} 213}
224 214
225static int proc_ns_dir_readdir(struct file *filp, void *dirent, 215static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
226 filldir_t filldir)
227{ 216{
228 int i; 217 struct task_struct *task = get_proc_task(file_inode(file));
229 struct dentry *dentry = filp->f_path.dentry;
230 struct inode *inode = dentry->d_inode;
231 struct task_struct *task = get_proc_task(inode);
232 const struct proc_ns_operations **entry, **last; 218 const struct proc_ns_operations **entry, **last;
233 ino_t ino;
234 int ret;
235 219
236 ret = -ENOENT;
237 if (!task) 220 if (!task)
238 goto out_no_task; 221 return -ENOENT;
239 222
240 ret = 0; 223 if (!dir_emit_dots(file, ctx))
241 i = filp->f_pos; 224 goto out;
242 switch (i) { 225 if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries))
243 case 0: 226 goto out;
244 ino = inode->i_ino; 227 entry = ns_entries + (ctx->pos - 2);
245 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) 228 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
246 goto out; 229 while (entry <= last) {
247 i++; 230 const struct proc_ns_operations *ops = *entry;
248 filp->f_pos++; 231 if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name),
249 /* fall through */ 232 proc_ns_instantiate, task, ops))
250 case 1: 233 break;
251 ino = parent_ino(dentry); 234 ctx->pos++;
252 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) 235 entry++;
253 goto out;
254 i++;
255 filp->f_pos++;
256 /* fall through */
257 default:
258 i -= 2;
259 if (i >= ARRAY_SIZE(ns_entries)) {
260 ret = 1;
261 goto out;
262 }
263 entry = ns_entries + i;
264 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
265 while (entry <= last) {
266 if (proc_ns_fill_cache(filp, dirent, filldir,
267 task, *entry) < 0)
268 goto out;
269 filp->f_pos++;
270 entry++;
271 }
272 } 236 }
273
274 ret = 1;
275out: 237out:
276 put_task_struct(task); 238 put_task_struct(task);
277out_no_task: 239 return 0;
278 return ret;
279} 240}
280 241
281const struct file_operations proc_ns_dir_operations = { 242const struct file_operations proc_ns_dir_operations = {
282 .read = generic_read_dir, 243 .read = generic_read_dir,
283 .readdir = proc_ns_dir_readdir, 244 .iterate = proc_ns_dir_readdir,
284}; 245};
285 246
286static struct dentry *proc_ns_dir_lookup(struct inode *dir, 247static struct dentry *proc_ns_dir_lookup(struct inode *dir,
287 struct dentry *dentry, unsigned int flags) 248 struct dentry *dentry, unsigned int flags)
288{ 249{
289 struct dentry *error; 250 int error;
290 struct task_struct *task = get_proc_task(dir); 251 struct task_struct *task = get_proc_task(dir);
291 const struct proc_ns_operations **entry, **last; 252 const struct proc_ns_operations **entry, **last;
292 unsigned int len = dentry->d_name.len; 253 unsigned int len = dentry->d_name.len;
293 254
294 error = ERR_PTR(-ENOENT); 255 error = -ENOENT;
295 256
296 if (!task) 257 if (!task)
297 goto out_no_task; 258 goto out_no_task;
@@ -310,7 +271,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
310out: 271out:
311 put_task_struct(task); 272 put_task_struct(task);
312out_no_task: 273out_no_task:
313 return error; 274 return ERR_PTR(error);
314} 275}
315 276
316const struct inode_operations proc_ns_dir_inode_operations = { 277const struct inode_operations proc_ns_dir_inode_operations = {
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 986e83220d56..4677bb7dc7c2 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -160,16 +160,15 @@ const struct inode_operations proc_net_inode_operations = {
160 .getattr = proc_tgid_net_getattr, 160 .getattr = proc_tgid_net_getattr,
161}; 161};
162 162
163static int proc_tgid_net_readdir(struct file *filp, void *dirent, 163static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
164 filldir_t filldir)
165{ 164{
166 int ret; 165 int ret;
167 struct net *net; 166 struct net *net;
168 167
169 ret = -EINVAL; 168 ret = -EINVAL;
170 net = get_proc_task_net(file_inode(filp)); 169 net = get_proc_task_net(file_inode(file));
171 if (net != NULL) { 170 if (net != NULL) {
172 ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); 171 ret = proc_readdir_de(net->proc_net, file, ctx);
173 put_net(net); 172 put_net(net);
174 } 173 }
175 return ret; 174 return ret;
@@ -178,7 +177,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
178const struct file_operations proc_net_operations = { 177const struct file_operations proc_net_operations = {
179 .llseek = generic_file_llseek, 178 .llseek = generic_file_llseek,
180 .read = generic_read_dir, 179 .read = generic_read_dir,
181 .readdir = proc_tgid_net_readdir, 180 .iterate = proc_tgid_net_readdir,
182}; 181};
183 182
184static __net_init int proc_net_ns_init(struct net *net) 183static __net_init int proc_net_ns_init(struct net *net)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ac05f33a0dde..71290463a1d3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -573,12 +573,12 @@ out:
573 return ret; 573 return ret;
574} 574}
575 575
576static int proc_sys_fill_cache(struct file *filp, void *dirent, 576static bool proc_sys_fill_cache(struct file *file,
577 filldir_t filldir, 577 struct dir_context *ctx,
578 struct ctl_table_header *head, 578 struct ctl_table_header *head,
579 struct ctl_table *table) 579 struct ctl_table *table)
580{ 580{
581 struct dentry *child, *dir = filp->f_path.dentry; 581 struct dentry *child, *dir = file->f_path.dentry;
582 struct inode *inode; 582 struct inode *inode;
583 struct qstr qname; 583 struct qstr qname;
584 ino_t ino = 0; 584 ino_t ino = 0;
@@ -595,38 +595,38 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
595 inode = proc_sys_make_inode(dir->d_sb, head, table); 595 inode = proc_sys_make_inode(dir->d_sb, head, table);
596 if (!inode) { 596 if (!inode) {
597 dput(child); 597 dput(child);
598 return -ENOMEM; 598 return false;
599 } else { 599 } else {
600 d_set_d_op(child, &proc_sys_dentry_operations); 600 d_set_d_op(child, &proc_sys_dentry_operations);
601 d_add(child, inode); 601 d_add(child, inode);
602 } 602 }
603 } else { 603 } else {
604 return -ENOMEM; 604 return false;
605 } 605 }
606 } 606 }
607 inode = child->d_inode; 607 inode = child->d_inode;
608 ino = inode->i_ino; 608 ino = inode->i_ino;
609 type = inode->i_mode >> 12; 609 type = inode->i_mode >> 12;
610 dput(child); 610 dput(child);
611 return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); 611 return dir_emit(ctx, qname.name, qname.len, ino, type);
612} 612}
613 613
614static int proc_sys_link_fill_cache(struct file *filp, void *dirent, 614static bool proc_sys_link_fill_cache(struct file *file,
615 filldir_t filldir, 615 struct dir_context *ctx,
616 struct ctl_table_header *head, 616 struct ctl_table_header *head,
617 struct ctl_table *table) 617 struct ctl_table *table)
618{ 618{
619 int err, ret = 0; 619 bool ret = true;
620 head = sysctl_head_grab(head); 620 head = sysctl_head_grab(head);
621 621
622 if (S_ISLNK(table->mode)) { 622 if (S_ISLNK(table->mode)) {
623 /* It is not an error if we cannot follow the link; ignore it */ 623 /* It is not an error if we cannot follow the link; ignore it */
624 err = sysctl_follow_link(&head, &table, current->nsproxy); 624 int err = sysctl_follow_link(&head, &table, current->nsproxy);
625 if (err) 625 if (err)
626 goto out; 626 goto out;
627 } 627 }
628 628
629 ret = proc_sys_fill_cache(filp, dirent, filldir, head, table); 629 ret = proc_sys_fill_cache(file, ctx, head, table);
630out: 630out:
631 sysctl_head_finish(head); 631 sysctl_head_finish(head);
632 return ret; 632 return ret;
@@ -634,67 +634,50 @@ out:
634 634
635static int scan(struct ctl_table_header *head, ctl_table *table, 635static int scan(struct ctl_table_header *head, ctl_table *table,
636 unsigned long *pos, struct file *file, 636 unsigned long *pos, struct file *file,
637 void *dirent, filldir_t filldir) 637 struct dir_context *ctx)
638{ 638{
639 int res; 639 bool res;
640 640
641 if ((*pos)++ < file->f_pos) 641 if ((*pos)++ < ctx->pos)
642 return 0; 642 return true;
643 643
644 if (unlikely(S_ISLNK(table->mode))) 644 if (unlikely(S_ISLNK(table->mode)))
645 res = proc_sys_link_fill_cache(file, dirent, filldir, head, table); 645 res = proc_sys_link_fill_cache(file, ctx, head, table);
646 else 646 else
647 res = proc_sys_fill_cache(file, dirent, filldir, head, table); 647 res = proc_sys_fill_cache(file, ctx, head, table);
648 648
649 if (res == 0) 649 if (res)
650 file->f_pos = *pos; 650 ctx->pos = *pos;
651 651
652 return res; 652 return res;
653} 653}
654 654
655static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) 655static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
656{ 656{
657 struct dentry *dentry = filp->f_path.dentry; 657 struct ctl_table_header *head = grab_header(file_inode(file));
658 struct inode *inode = dentry->d_inode;
659 struct ctl_table_header *head = grab_header(inode);
660 struct ctl_table_header *h = NULL; 658 struct ctl_table_header *h = NULL;
661 struct ctl_table *entry; 659 struct ctl_table *entry;
662 struct ctl_dir *ctl_dir; 660 struct ctl_dir *ctl_dir;
663 unsigned long pos; 661 unsigned long pos;
664 int ret = -EINVAL;
665 662
666 if (IS_ERR(head)) 663 if (IS_ERR(head))
667 return PTR_ERR(head); 664 return PTR_ERR(head);
668 665
669 ctl_dir = container_of(head, struct ctl_dir, header); 666 ctl_dir = container_of(head, struct ctl_dir, header);
670 667
671 ret = 0; 668 if (!dir_emit_dots(file, ctx))
672 /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ 669 return 0;
673 if (filp->f_pos == 0) { 670
674 if (filldir(dirent, ".", 1, filp->f_pos,
675 inode->i_ino, DT_DIR) < 0)
676 goto out;
677 filp->f_pos++;
678 }
679 if (filp->f_pos == 1) {
680 if (filldir(dirent, "..", 2, filp->f_pos,
681 parent_ino(dentry), DT_DIR) < 0)
682 goto out;
683 filp->f_pos++;
684 }
685 pos = 2; 671 pos = 2;
686 672
687 for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { 673 for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) {
688 ret = scan(h, entry, &pos, filp, dirent, filldir); 674 if (!scan(h, entry, &pos, file, ctx)) {
689 if (ret) {
690 sysctl_head_finish(h); 675 sysctl_head_finish(h);
691 break; 676 break;
692 } 677 }
693 } 678 }
694 ret = 1;
695out:
696 sysctl_head_finish(head); 679 sysctl_head_finish(head);
697 return ret; 680 return 0;
698} 681}
699 682
700static int proc_sys_permission(struct inode *inode, int mask) 683static int proc_sys_permission(struct inode *inode, int mask)
@@ -769,7 +752,7 @@ static const struct file_operations proc_sys_file_operations = {
769 752
770static const struct file_operations proc_sys_dir_file_operations = { 753static const struct file_operations proc_sys_dir_file_operations = {
771 .read = generic_read_dir, 754 .read = generic_read_dir,
772 .readdir = proc_sys_readdir, 755 .iterate = proc_sys_readdir,
773 .llseek = generic_file_llseek, 756 .llseek = generic_file_llseek,
774}; 757};
775 758
@@ -813,15 +796,16 @@ static int sysctl_is_seen(struct ctl_table_header *p)
813 return res; 796 return res;
814} 797}
815 798
816static int proc_sys_compare(const struct dentry *parent, 799static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry,
817 const struct inode *pinode,
818 const struct dentry *dentry, const struct inode *inode,
819 unsigned int len, const char *str, const struct qstr *name) 800 unsigned int len, const char *str, const struct qstr *name)
820{ 801{
821 struct ctl_table_header *head; 802 struct ctl_table_header *head;
803 struct inode *inode;
804
822 /* Although proc doesn't have negative dentries, rcu-walk means 805 /* Although proc doesn't have negative dentries, rcu-walk means
823 * that inode here can be NULL */ 806 * that inode here can be NULL */
824 /* AV: can it, indeed? */ 807 /* AV: can it, indeed? */
808 inode = ACCESS_ONCE(dentry->d_inode);
825 if (!inode) 809 if (!inode)
826 return 1; 810 return 1;
827 if (name->len != len) 811 if (name->len != len)
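
On the sysctl side, proc_sys_compare() above now reads d_inode exactly once through ACCESS_ONCE(): under RCU-walk the dentry is not pinned, so another CPU may change the inode pointer mid-comparison, and every subsequent check has to use a single snapshot. A sketch of the idiom; demo_compare is a hypothetical name:

    #include <linux/compiler.h>     /* ACCESS_ONCE(): one volatile read */
    #include <linux/dcache.h>

    static int demo_compare(const struct dentry *dentry)
    {
            /* one snapshot; rcu-walk may set d_inode to NULL under us,
             * so never re-read it during the comparison */
            struct inode *inode = ACCESS_ONCE(dentry->d_inode);

            if (!inode)
                    return 1;       /* mismatch; VFS retries in ref-walk */
            /* ... compare using this "inode" only ... */
            return 0;
    }
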
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 41a6ea93f486..229e366598da 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -202,21 +202,14 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr
202 return proc_pid_lookup(dir, dentry, flags); 202 return proc_pid_lookup(dir, dentry, flags);
203} 203}
204 204
205static int proc_root_readdir(struct file * filp, 205static int proc_root_readdir(struct file *file, struct dir_context *ctx)
206 void * dirent, filldir_t filldir)
207{ 206{
208 unsigned int nr = filp->f_pos; 207 if (ctx->pos < FIRST_PROCESS_ENTRY) {
209 int ret; 208 proc_readdir(file, ctx);
210 209 ctx->pos = FIRST_PROCESS_ENTRY;
211 if (nr < FIRST_PROCESS_ENTRY) {
212 int error = proc_readdir(filp, dirent, filldir);
213 if (error <= 0)
214 return error;
215 filp->f_pos = FIRST_PROCESS_ENTRY;
216 } 210 }
217 211
218 ret = proc_pid_readdir(filp, dirent, filldir); 212 return proc_pid_readdir(file, ctx);
219 return ret;
220} 213}
221 214
222/* 215/*
@@ -226,7 +219,7 @@ static int proc_root_readdir(struct file * filp,
226 */ 219 */
227static const struct file_operations proc_root_operations = { 220static const struct file_operations proc_root_operations = {
228 .read = generic_read_dir, 221 .read = generic_read_dir,
229 .readdir = proc_root_readdir, 222 .iterate = proc_root_readdir,
230 .llseek = default_llseek, 223 .llseek = default_llseek,
231}; 224};
232 225
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3e636d864d56..107d026f5d6e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -11,6 +11,7 @@
11#include <linux/rmap.h> 11#include <linux/rmap.h>
12#include <linux/swap.h> 12#include <linux/swap.h>
13#include <linux/swapops.h> 13#include <linux/swapops.h>
14#include <linux/mmu_notifier.h>
14 15
15#include <asm/elf.h> 16#include <asm/elf.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
@@ -688,10 +689,66 @@ const struct file_operations proc_tid_smaps_operations = {
688 .release = seq_release_private, 689 .release = seq_release_private,
689}; 690};
690 691
692/*
693 * We do not want to have constant page-shift bits sitting in
694 * pagemap entries and are about to reuse them some time soon.
695 *
696 * Here's the "migration strategy":
697 * 1. when the system boots, these bits remain what they are,
698 * but a warning about the future change is printed in the log;
699 * 2. once anyone clears soft-dirty bits via the clear_refs file,
700 * this flag is set to denote that the user is aware of the
701 * new API, and those page-shift bits change their meaning.
702 * The respective warning is printed in dmesg;
703 * 3. In a couple of releases we will remove all the mentions
704 * of page-shift in pagemap entries.
705 */
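
A user-space sketch of step 2 above: writing "4" (CLEAR_REFS_SOFT_DIRTY in the enum below) to /proc/PID/clear_refs write-protects the task's PTEs, clears their soft-dirty bits, and switches pagemap to the new layout in which bit 55 of each entry reports soft-dirty. clear_soft_dirty_bits() is a hypothetical helper, not part of this patch:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <unistd.h>

    static int clear_soft_dirty_bits(pid_t pid)
    {
            char path[64];
            int fd, ok;

            snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
            fd = open(path, O_WRONLY);
            if (fd < 0)
                    return -1;
            ok = write(fd, "4", 1) == 1;    /* 4 == CLEAR_REFS_SOFT_DIRTY */
            close(fd);
            return ok ? 0 : -1;
    }

    int main(void)
    {
            if (clear_soft_dirty_bits(getpid()))
                    perror("clear_refs");
            return 0;
    }
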
706
707static bool soft_dirty_cleared __read_mostly;
708
709enum clear_refs_types {
710 CLEAR_REFS_ALL = 1,
711 CLEAR_REFS_ANON,
712 CLEAR_REFS_MAPPED,
713 CLEAR_REFS_SOFT_DIRTY,
714 CLEAR_REFS_LAST,
715};
716
717struct clear_refs_private {
718 struct vm_area_struct *vma;
719 enum clear_refs_types type;
720};
721
722static inline void clear_soft_dirty(struct vm_area_struct *vma,
723 unsigned long addr, pte_t *pte)
724{
725#ifdef CONFIG_MEM_SOFT_DIRTY
726 /*
727 * The soft-dirty tracker uses #PF-s to catch writes
728 * to pages, so write-protect the pte as well. See the
729 * Documentation/vm/soft-dirty.txt for full description
730 * of how soft-dirty works.
731 */
732 pte_t ptent = *pte;
733
734 if (pte_present(ptent)) {
735 ptent = pte_wrprotect(ptent);
736 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
737 } else if (is_swap_pte(ptent)) {
738 ptent = pte_swp_clear_soft_dirty(ptent);
739 } else if (pte_file(ptent)) {
740 ptent = pte_file_clear_soft_dirty(ptent);
741 }
742
743 set_pte_at(vma->vm_mm, addr, pte, ptent);
744#endif
745}
746
691static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 747static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
692 unsigned long end, struct mm_walk *walk) 748 unsigned long end, struct mm_walk *walk)
693{ 749{
694 struct vm_area_struct *vma = walk->private; 750 struct clear_refs_private *cp = walk->private;
751 struct vm_area_struct *vma = cp->vma;
695 pte_t *pte, ptent; 752 pte_t *pte, ptent;
696 spinlock_t *ptl; 753 spinlock_t *ptl;
697 struct page *page; 754 struct page *page;
@@ -703,6 +760,12 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
703 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 760 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
704 for (; addr != end; pte++, addr += PAGE_SIZE) { 761 for (; addr != end; pte++, addr += PAGE_SIZE) {
705 ptent = *pte; 762 ptent = *pte;
763
764 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
765 clear_soft_dirty(vma, addr, pte);
766 continue;
767 }
768
706 if (!pte_present(ptent)) 769 if (!pte_present(ptent))
707 continue; 770 continue;
708 771
@@ -719,10 +782,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
719 return 0; 782 return 0;
720} 783}
721 784
722#define CLEAR_REFS_ALL 1
723#define CLEAR_REFS_ANON 2
724#define CLEAR_REFS_MAPPED 3
725
726static ssize_t clear_refs_write(struct file *file, const char __user *buf, 785static ssize_t clear_refs_write(struct file *file, const char __user *buf,
727 size_t count, loff_t *ppos) 786 size_t count, loff_t *ppos)
728{ 787{
@@ -730,7 +789,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
730 char buffer[PROC_NUMBUF]; 789 char buffer[PROC_NUMBUF];
731 struct mm_struct *mm; 790 struct mm_struct *mm;
732 struct vm_area_struct *vma; 791 struct vm_area_struct *vma;
733 int type; 792 enum clear_refs_types type;
793 int itype;
734 int rv; 794 int rv;
735 795
736 memset(buffer, 0, sizeof(buffer)); 796 memset(buffer, 0, sizeof(buffer));
@@ -738,23 +798,37 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
738 count = sizeof(buffer) - 1; 798 count = sizeof(buffer) - 1;
739 if (copy_from_user(buffer, buf, count)) 799 if (copy_from_user(buffer, buf, count))
740 return -EFAULT; 800 return -EFAULT;
741 rv = kstrtoint(strstrip(buffer), 10, &type); 801 rv = kstrtoint(strstrip(buffer), 10, &itype);
742 if (rv < 0) 802 if (rv < 0)
743 return rv; 803 return rv;
744 if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) 804 type = (enum clear_refs_types)itype;
805 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
745 return -EINVAL; 806 return -EINVAL;
807
808 if (type == CLEAR_REFS_SOFT_DIRTY) {
809 soft_dirty_cleared = true;
810 pr_warn_once("The pagemap bits 55-60 have changed their meaning! "
811 "See the linux/Documentation/vm/pagemap.txt for details.\n");
812 }
813
746 task = get_proc_task(file_inode(file)); 814 task = get_proc_task(file_inode(file));
747 if (!task) 815 if (!task)
748 return -ESRCH; 816 return -ESRCH;
749 mm = get_task_mm(task); 817 mm = get_task_mm(task);
750 if (mm) { 818 if (mm) {
819 struct clear_refs_private cp = {
820 .type = type,
821 };
751 struct mm_walk clear_refs_walk = { 822 struct mm_walk clear_refs_walk = {
752 .pmd_entry = clear_refs_pte_range, 823 .pmd_entry = clear_refs_pte_range,
753 .mm = mm, 824 .mm = mm,
825 .private = &cp,
754 }; 826 };
755 down_read(&mm->mmap_sem); 827 down_read(&mm->mmap_sem);
828 if (type == CLEAR_REFS_SOFT_DIRTY)
829 mmu_notifier_invalidate_range_start(mm, 0, -1);
756 for (vma = mm->mmap; vma; vma = vma->vm_next) { 830 for (vma = mm->mmap; vma; vma = vma->vm_next) {
757 clear_refs_walk.private = vma; 831 cp.vma = vma;
758 if (is_vm_hugetlb_page(vma)) 832 if (is_vm_hugetlb_page(vma))
759 continue; 833 continue;
760 /* 834 /*
@@ -773,6 +847,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
773 walk_page_range(vma->vm_start, vma->vm_end, 847 walk_page_range(vma->vm_start, vma->vm_end,
774 &clear_refs_walk); 848 &clear_refs_walk);
775 } 849 }
850 if (type == CLEAR_REFS_SOFT_DIRTY)
851 mmu_notifier_invalidate_range_end(mm, 0, -1);
776 flush_tlb_mm(mm); 852 flush_tlb_mm(mm);
777 up_read(&mm->mmap_sem); 853 up_read(&mm->mmap_sem);
778 mmput(mm); 854 mmput(mm);
@@ -792,14 +868,15 @@ typedef struct {
792} pagemap_entry_t; 868} pagemap_entry_t;
793 869
794struct pagemapread { 870struct pagemapread {
795 int pos, len; 871 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
796 pagemap_entry_t *buffer; 872 pagemap_entry_t *buffer;
873 bool v2;
797}; 874};
798 875
799#define PAGEMAP_WALK_SIZE (PMD_SIZE) 876#define PAGEMAP_WALK_SIZE (PMD_SIZE)
800#define PAGEMAP_WALK_MASK (PMD_MASK) 877#define PAGEMAP_WALK_MASK (PMD_MASK)
801 878
802#define PM_ENTRY_BYTES sizeof(u64) 879#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
803#define PM_STATUS_BITS 3 880#define PM_STATUS_BITS 3
804#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) 881#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
805#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) 882#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
@@ -807,14 +884,17 @@ struct pagemapread {
807#define PM_PSHIFT_BITS 6 884#define PM_PSHIFT_BITS 6
808#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) 885#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
809#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) 886#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
810#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) 887#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
811#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) 888#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
812#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) 889#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
890/* in "new" pagemap pshift bits are occupied with more status bits */
891#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
813 892
893#define __PM_SOFT_DIRTY (1LL)
814#define PM_PRESENT PM_STATUS(4LL) 894#define PM_PRESENT PM_STATUS(4LL)
815#define PM_SWAP PM_STATUS(2LL) 895#define PM_SWAP PM_STATUS(2LL)
816#define PM_FILE PM_STATUS(1LL) 896#define PM_FILE PM_STATUS(1LL)
817#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) 897#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
818#define PM_END_OF_BUFFER 1 898#define PM_END_OF_BUFFER 1
819 899
820static inline pagemap_entry_t make_pme(u64 val) 900static inline pagemap_entry_t make_pme(u64 val)
@@ -837,7 +917,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
837 struct pagemapread *pm = walk->private; 917 struct pagemapread *pm = walk->private;
838 unsigned long addr; 918 unsigned long addr;
839 int err = 0; 919 int err = 0;
840 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); 920 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
841 921
842 for (addr = start; addr < end; addr += PAGE_SIZE) { 922 for (addr = start; addr < end; addr += PAGE_SIZE) {
843 err = add_to_pagemap(addr, &pme, pm); 923 err = add_to_pagemap(addr, &pme, pm);
@@ -847,38 +927,43 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
847 return err; 927 return err;
848} 928}
849 929
850static void pte_to_pagemap_entry(pagemap_entry_t *pme, 930static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
851 struct vm_area_struct *vma, unsigned long addr, pte_t pte) 931 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
852{ 932{
853 u64 frame, flags; 933 u64 frame, flags;
854 struct page *page = NULL; 934 struct page *page = NULL;
935 int flags2 = 0;
855 936
856 if (pte_present(pte)) { 937 if (pte_present(pte)) {
857 frame = pte_pfn(pte); 938 frame = pte_pfn(pte);
858 flags = PM_PRESENT; 939 flags = PM_PRESENT;
859 page = vm_normal_page(vma, addr, pte); 940 page = vm_normal_page(vma, addr, pte);
860 } else if (is_swap_pte(pte)) { 941 } else if (is_swap_pte(pte)) {
861 swp_entry_t entry = pte_to_swp_entry(pte); 942 swp_entry_t entry;
862 943 if (pte_swp_soft_dirty(pte))
944 flags2 |= __PM_SOFT_DIRTY;
945 entry = pte_to_swp_entry(pte);
863 frame = swp_type(entry) | 946 frame = swp_type(entry) |
864 (swp_offset(entry) << MAX_SWAPFILES_SHIFT); 947 (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
865 flags = PM_SWAP; 948 flags = PM_SWAP;
866 if (is_migration_entry(entry)) 949 if (is_migration_entry(entry))
867 page = migration_entry_to_page(entry); 950 page = migration_entry_to_page(entry);
868 } else { 951 } else {
869 *pme = make_pme(PM_NOT_PRESENT); 952 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
870 return; 953 return;
871 } 954 }
872 955
873 if (page && !PageAnon(page)) 956 if (page && !PageAnon(page))
874 flags |= PM_FILE; 957 flags |= PM_FILE;
958 if (pte_soft_dirty(pte))
959 flags2 |= __PM_SOFT_DIRTY;
875 960
876 *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); 961 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
877} 962}
878 963
879#ifdef CONFIG_TRANSPARENT_HUGEPAGE 964#ifdef CONFIG_TRANSPARENT_HUGEPAGE
880static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, 965static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
881 pmd_t pmd, int offset) 966 pmd_t pmd, int offset, int pmd_flags2)
882{ 967{
883 /* 968 /*
884 * Currently pmd for thp is always present because thp can not be 969 * Currently pmd for thp is always present because thp can not be
@@ -887,13 +972,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
887 */ 972 */
888 if (pmd_present(pmd)) 973 if (pmd_present(pmd))
889 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) 974 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
890 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); 975 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
891 else 976 else
892 *pme = make_pme(PM_NOT_PRESENT); 977 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
893} 978}
894#else 979#else
895static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, 980static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
896 pmd_t pmd, int offset) 981 pmd_t pmd, int offset, int pmd_flags2)
897{ 982{
898} 983}
899#endif 984#endif
@@ -905,17 +990,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
905 struct pagemapread *pm = walk->private; 990 struct pagemapread *pm = walk->private;
906 pte_t *pte; 991 pte_t *pte;
907 int err = 0; 992 int err = 0;
908 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); 993 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
909 994
910 /* find the first VMA at or above 'addr' */ 995 /* find the first VMA at or above 'addr' */
911 vma = find_vma(walk->mm, addr); 996 vma = find_vma(walk->mm, addr);
912 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { 997 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
998 int pmd_flags2;
999
1000 pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
913 for (; addr != end; addr += PAGE_SIZE) { 1001 for (; addr != end; addr += PAGE_SIZE) {
914 unsigned long offset; 1002 unsigned long offset;
915 1003
916 offset = (addr & ~PAGEMAP_WALK_MASK) >> 1004 offset = (addr & ~PAGEMAP_WALK_MASK) >>
917 PAGE_SHIFT; 1005 PAGE_SHIFT;
918 thp_pmd_to_pagemap_entry(&pme, *pmd, offset); 1006 thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
919 err = add_to_pagemap(addr, &pme, pm); 1007 err = add_to_pagemap(addr, &pme, pm);
920 if (err) 1008 if (err)
921 break; 1009 break;
@@ -932,7 +1020,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
932 * and need a new, higher one */ 1020 * and need a new, higher one */
933 if (vma && (addr >= vma->vm_end)) { 1021 if (vma && (addr >= vma->vm_end)) {
934 vma = find_vma(walk->mm, addr); 1022 vma = find_vma(walk->mm, addr);
935 pme = make_pme(PM_NOT_PRESENT); 1023 pme = make_pme(PM_NOT_PRESENT(pm->v2));
936 } 1024 }
937 1025
938 /* check that 'vma' actually covers this address, 1026 /* check that 'vma' actually covers this address,
@@ -940,7 +1028,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
940 if (vma && (vma->vm_start <= addr) && 1028 if (vma && (vma->vm_start <= addr) &&
941 !is_vm_hugetlb_page(vma)) { 1029 !is_vm_hugetlb_page(vma)) {
942 pte = pte_offset_map(pmd, addr); 1030 pte = pte_offset_map(pmd, addr);
943 pte_to_pagemap_entry(&pme, vma, addr, *pte); 1031 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
944 /* unmap before userspace copy */ 1032 /* unmap before userspace copy */
945 pte_unmap(pte); 1033 pte_unmap(pte);
946 } 1034 }
@@ -955,14 +1043,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
955} 1043}
956 1044
957#ifdef CONFIG_HUGETLB_PAGE 1045#ifdef CONFIG_HUGETLB_PAGE
958static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, 1046static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
959 pte_t pte, int offset) 1047 pte_t pte, int offset)
960{ 1048{
961 if (pte_present(pte)) 1049 if (pte_present(pte))
962 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) 1050 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
963 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); 1051 | PM_STATUS2(pm->v2, 0) | PM_PRESENT);
964 else 1052 else
965 *pme = make_pme(PM_NOT_PRESENT); 1053 *pme = make_pme(PM_NOT_PRESENT(pm->v2));
966} 1054}
967 1055
968/* This function walks within one hugetlb entry in the single call */ 1056/* This function walks within one hugetlb entry in the single call */
@@ -976,7 +1064,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
976 1064
977 for (; addr != end; addr += PAGE_SIZE) { 1065 for (; addr != end; addr += PAGE_SIZE) {
978 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1066 int offset = (addr & ~hmask) >> PAGE_SHIFT;
979 huge_pte_to_pagemap_entry(&pme, *pte, offset); 1067 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
980 err = add_to_pagemap(addr, &pme, pm); 1068 err = add_to_pagemap(addr, &pme, pm);
981 if (err) 1069 if (err)
982 return err; 1070 return err;
@@ -1038,8 +1126,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1038 if (!count) 1126 if (!count)
1039 goto out_task; 1127 goto out_task;
1040 1128
1041 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 1129 pm.v2 = soft_dirty_cleared;
1042 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); 1130 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1131 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
1043 ret = -ENOMEM; 1132 ret = -ENOMEM;
1044 if (!pm.buffer) 1133 if (!pm.buffer)
1045 goto out_task; 1134 goto out_task;
@@ -1110,9 +1199,18 @@ out:
1110 return ret; 1199 return ret;
1111} 1200}
1112 1201
1202static int pagemap_open(struct inode *inode, struct file *file)
1203{
1204 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
1205 "to stop being page-shift some time soon. See the "
1206 "linux/Documentation/vm/pagemap.txt for details.\n");
1207 return 0;
1208}
1209
1113const struct file_operations proc_pagemap_operations = { 1210const struct file_operations proc_pagemap_operations = {
1114 .llseek = mem_lseek, /* borrow this */ 1211 .llseek = mem_lseek, /* borrow this */
1115 .read = pagemap_read, 1212 .read = pagemap_read,
1213 .open = pagemap_open,
1116}; 1214};
1117#endif /* CONFIG_PROC_PAGE_MONITOR */ 1215#endif /* CONFIG_PROC_PAGE_MONITOR */
1118 1216
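
Putting the macros above together: in the "v2" layout (active once soft-dirty has been cleared at least once), a pagemap entry carries bit 63 present, bit 62 swap, bit 61 file-backed or shared-anon, bit 55 soft-dirty, and the PFN (or swap type plus offset) in bits 0-54. A small user-space decoder, as a sketch:

    #include <stdint.h>
    #include <stdio.h>

    static void decode_pagemap_entry(uint64_t e)
    {
            printf("present=%d swap=%d file=%d soft-dirty=%d pfn=0x%llx\n",
                   (int)((e >> 63) & 1), (int)((e >> 62) & 1),
                   (int)((e >> 61) & 1), (int)((e >> 55) & 1),
                   (unsigned long long)(e & ((1ULL << 55) - 1)));
    }

    int main(void)
    {
            /* a present, soft-dirty page at PFN 0x1234 */
            decode_pagemap_entry((1ULL << 63) | (1ULL << 55) | 0x1234);
            return 0;
    }
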
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 9610ac772d7e..061894625903 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -20,8 +20,7 @@ static int uptime_proc_show(struct seq_file *m, void *v)
20 for_each_possible_cpu(i) 20 for_each_possible_cpu(i)
21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; 21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
22 22
23 do_posix_clock_monotonic_gettime(&uptime); 23 get_monotonic_boottime(&uptime);
24 monotonic_to_bootbased(&uptime);
25 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; 24 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
26 idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); 25 idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
27 idle.tv_nsec = rem; 26 idle.tv_nsec = rem;
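
get_monotonic_boottime() above folds the old two-step dance into one call: monotonic time that also keeps advancing while the machine is suspended. The user-space counterpart, handy for sanity-checking /proc/uptime, is clock_gettime() with the Linux-specific CLOCK_BOOTTIME (available since 2.6.39):

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts;

            if (clock_gettime(CLOCK_BOOTTIME, &ts))
                    perror("clock_gettime");        /* pre-2.6.39 kernel */
            else
                    printf("up %lld.%09ld s\n",
                           (long long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }
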
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 17f7e080d7ff..a1a16eb97c7b 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/crash_dump.h> 21#include <linux/crash_dump.h>
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/vmalloc.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24#include <asm/io.h> 25#include <asm/io.h>
25#include "internal.h" 26#include "internal.h"
@@ -32,6 +33,10 @@ static LIST_HEAD(vmcore_list);
32/* Stores the pointer to the buffer containing kernel elf core headers. */ 33/* Stores the pointer to the buffer containing kernel elf core headers. */
33static char *elfcorebuf; 34static char *elfcorebuf;
34static size_t elfcorebuf_sz; 35static size_t elfcorebuf_sz;
36static size_t elfcorebuf_sz_orig;
37
38static char *elfnotes_buf;
39static size_t elfnotes_sz;
35 40
36/* Total size of vmcore file. */ 41/* Total size of vmcore file. */
37static u64 vmcore_size; 42static u64 vmcore_size;
@@ -118,27 +123,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
118 return read; 123 return read;
119} 124}
120 125
121/* Maps a vmcore file offset to the respective physical address in memory. */
122static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
123 struct vmcore **m_ptr)
124{
125 struct vmcore *m;
126 u64 paddr;
127
128 list_for_each_entry(m, vc_list, list) {
129 u64 start, end;
130 start = m->offset;
131 end = m->offset + m->size - 1;
132 if (offset >= start && offset <= end) {
133 paddr = m->paddr + offset - start;
134 *m_ptr = m;
135 return paddr;
136 }
137 }
138 *m_ptr = NULL;
139 return 0;
140}
141
142/* Read from the ELF header and then the crash dump. On error, a negative value is 126/* Read from the ELF header and then the crash dump. On error, a negative value is
143 * returned; otherwise, the number of bytes read is returned. 127 * returned; otherwise, the number of bytes read is returned.
144 */ 128 */
@@ -147,8 +131,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
147{ 131{
148 ssize_t acc = 0, tmp; 132 ssize_t acc = 0, tmp;
149 size_t tsz; 133 size_t tsz;
150 u64 start, nr_bytes; 134 u64 start;
151 struct vmcore *curr_m = NULL; 135 struct vmcore *m = NULL;
152 136
153 if (buflen == 0 || *fpos >= vmcore_size) 137 if (buflen == 0 || *fpos >= vmcore_size)
154 return 0; 138 return 0;
@@ -159,9 +143,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
159 143
160 /* Read ELF core header */ 144 /* Read ELF core header */
161 if (*fpos < elfcorebuf_sz) { 145 if (*fpos < elfcorebuf_sz) {
162 tsz = elfcorebuf_sz - *fpos; 146 tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen);
163 if (buflen < tsz)
164 tsz = buflen;
165 if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) 147 if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
166 return -EFAULT; 148 return -EFAULT;
167 buflen -= tsz; 149 buflen -= tsz;
@@ -174,39 +156,161 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
174 return acc; 156 return acc;
175 } 157 }
176 158
177 start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); 159 /* Read ELF note segment */
178 if (!curr_m) 160 if (*fpos < elfcorebuf_sz + elfnotes_sz) {
179 return -EINVAL; 161 void *kaddr;
180
181 while (buflen) {
182 tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK));
183 162
184 /* Calculate left bytes in current memory segment. */ 163 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
185 nr_bytes = (curr_m->size - (start - curr_m->paddr)); 164 kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
186 if (tsz > nr_bytes) 165 if (copy_to_user(buffer, kaddr, tsz))
187 tsz = nr_bytes; 166 return -EFAULT;
188
189 tmp = read_from_oldmem(buffer, tsz, &start, 1);
190 if (tmp < 0)
191 return tmp;
192 buflen -= tsz; 167 buflen -= tsz;
193 *fpos += tsz; 168 *fpos += tsz;
194 buffer += tsz; 169 buffer += tsz;
195 acc += tsz; 170 acc += tsz;
196 if (start >= (curr_m->paddr + curr_m->size)) { 171
197 if (curr_m->list.next == &vmcore_list) 172 /* leave now if the buffer is already full */
198 return acc; /*EOF*/ 173 if (buflen == 0)
199 curr_m = list_entry(curr_m->list.next, 174 return acc;
200 struct vmcore, list); 175 }
201 start = curr_m->paddr; 176
177 list_for_each_entry(m, &vmcore_list, list) {
178 if (*fpos < m->offset + m->size) {
179 tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
180 start = m->paddr + *fpos - m->offset;
181 tmp = read_from_oldmem(buffer, tsz, &start, 1);
182 if (tmp < 0)
183 return tmp;
184 buflen -= tsz;
185 *fpos += tsz;
186 buffer += tsz;
187 acc += tsz;
188
189 /* leave now if the buffer is already full */
190 if (buflen == 0)
191 return acc;
202 } 192 }
203 } 193 }
194
204 return acc; 195 return acc;
205} 196}
206 197
198/**
199 * alloc_elfnotes_buf - allocate buffer for ELF note segment in
200 * vmalloc memory
201 *
202 * @notes_sz: size of buffer
203 *
204 * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
205 * the buffer to user-space by means of remap_vmalloc_range().
206 *
207 * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
208 * disabled and there's no need to allow users to mmap the buffer.
209 */
210static inline char *alloc_elfnotes_buf(size_t notes_sz)
211{
212#ifdef CONFIG_MMU
213 return vmalloc_user(notes_sz);
214#else
215 return vzalloc(notes_sz);
216#endif
217}
218
219/*
220 * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is
221 * essential for mmap_vmcore() in order to map physically
222 * non-contiguous objects (ELF header, ELF note segment and memory
223 * regions in the 1st kernel pointed to by PT_LOAD entries) into
224 * virtually contiguous user-space in ELF layout.
225 */
226#if defined(CONFIG_MMU) && !defined(CONFIG_S390)
227static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
228{
229 size_t size = vma->vm_end - vma->vm_start;
230 u64 start, end, len, tsz;
231 struct vmcore *m;
232
233 start = (u64)vma->vm_pgoff << PAGE_SHIFT;
234 end = start + size;
235
236 if (size > vmcore_size || end > vmcore_size)
237 return -EINVAL;
238
239 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
240 return -EPERM;
241
242 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
243 vma->vm_flags |= VM_MIXEDMAP;
244
245 len = 0;
246
247 if (start < elfcorebuf_sz) {
248 u64 pfn;
249
250 tsz = min(elfcorebuf_sz - (size_t)start, size);
251 pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT;
252 if (remap_pfn_range(vma, vma->vm_start, pfn, tsz,
253 vma->vm_page_prot))
254 return -EAGAIN;
255 size -= tsz;
256 start += tsz;
257 len += tsz;
258
259 if (size == 0)
260 return 0;
261 }
262
263 if (start < elfcorebuf_sz + elfnotes_sz) {
264 void *kaddr;
265
266 tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
267 kaddr = elfnotes_buf + start - elfcorebuf_sz;
268 if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
269 kaddr, tsz))
270 goto fail;
271 size -= tsz;
272 start += tsz;
273 len += tsz;
274
275 if (size == 0)
276 return 0;
277 }
278
279 list_for_each_entry(m, &vmcore_list, list) {
280 if (start < m->offset + m->size) {
281 u64 paddr = 0;
282
283 tsz = min_t(size_t, m->offset + m->size - start, size);
284 paddr = m->paddr + start - m->offset;
285 if (remap_pfn_range(vma, vma->vm_start + len,
286 paddr >> PAGE_SHIFT, tsz,
287 vma->vm_page_prot))
288 goto fail;
289 size -= tsz;
290 start += tsz;
291 len += tsz;
292
293 if (size == 0)
294 return 0;
295 }
296 }
297
298 return 0;
299fail:
300 do_munmap(vma->vm_mm, vma->vm_start, len);
301 return -EAGAIN;
302}
303#else
304static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
305{
306 return -ENOSYS;
307}
308#endif
309
207static const struct file_operations proc_vmcore_operations = { 310static const struct file_operations proc_vmcore_operations = {
208 .read = read_vmcore, 311 .read = read_vmcore,
209 .llseek = default_llseek, 312 .llseek = default_llseek,
313 .mmap = mmap_vmcore,
210}; 314};
211 315
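
With .mmap wired up above, a dump tool such as makedumpfile can map /proc/vmcore instead of read()ing it page by page; mmap_vmcore() stitches the ELF header, the merged note buffer and the old kernel's PT_LOAD regions into one read-only, virtually contiguous view. A minimal user-space sketch, meaningful only inside a capture (kdump) kernel, that maps the first page and checks the ELF magic:

    #include <elf.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/vmcore", O_RDONLY);
            void *p;

            if (fd < 0) {
                    perror("open");         /* not in a capture kernel? */
                    return 1;
            }
            p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);
            if (p == MAP_FAILED) {
                    perror("mmap");         /* kernel without mmap_vmcore */
                    close(fd);
                    return 1;
            }
            puts(memcmp(p, ELFMAG, SELFMAG) == 0 ? "ELF dump" : "not ELF");
            munmap(p, 4096);
            close(fd);
            return 0;
    }
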
212static struct vmcore* __init get_new_element(void) 316static struct vmcore* __init get_new_element(void)
@@ -214,61 +318,40 @@ static struct vmcore* __init get_new_element(void)
214 return kzalloc(sizeof(struct vmcore), GFP_KERNEL); 318 return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
215} 319}
216 320
217static u64 __init get_vmcore_size_elf64(char *elfptr) 321static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
322 struct list_head *vc_list)
218{ 323{
219 int i;
220 u64 size;
221 Elf64_Ehdr *ehdr_ptr;
222 Elf64_Phdr *phdr_ptr;
223
224 ehdr_ptr = (Elf64_Ehdr *)elfptr;
225 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
226 size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
227 for (i = 0; i < ehdr_ptr->e_phnum; i++) {
228 size += phdr_ptr->p_memsz;
229 phdr_ptr++;
230 }
231 return size;
232}
233
234static u64 __init get_vmcore_size_elf32(char *elfptr)
235{
236 int i;
237 u64 size; 324 u64 size;
238 Elf32_Ehdr *ehdr_ptr; 325 struct vmcore *m;
239 Elf32_Phdr *phdr_ptr;
240 326
241 ehdr_ptr = (Elf32_Ehdr *)elfptr; 327 size = elfsz + elfnotesegsz;
242 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); 328 list_for_each_entry(m, vc_list, list) {
243 size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); 329 size += m->size;
244 for (i = 0; i < ehdr_ptr->e_phnum; i++) {
245 size += phdr_ptr->p_memsz;
246 phdr_ptr++;
247 } 330 }
248 return size; 331 return size;
249} 332}
250 333
251/* Merges all the PT_NOTE headers into one. */ 334/**
252static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, 335 * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry
253 struct list_head *vc_list) 336 *
337 * @ehdr_ptr: ELF header
338 *
339 * This function updates the p_memsz member of each PT_NOTE entry in
340 * the program header table pointed to by @ehdr_ptr to the real size
341 * of the ELF note segment.
342 */
343static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
254{ 344{
255 int i, nr_ptnote=0, rc=0; 345 int i, rc=0;
256 char *tmp; 346 Elf64_Phdr *phdr_ptr;
257 Elf64_Ehdr *ehdr_ptr;
258 Elf64_Phdr phdr, *phdr_ptr;
259 Elf64_Nhdr *nhdr_ptr; 347 Elf64_Nhdr *nhdr_ptr;
260 u64 phdr_sz = 0, note_off;
261 348
262 ehdr_ptr = (Elf64_Ehdr *)elfptr; 349 phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
263 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
264 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 350 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
265 int j;
266 void *notes_section; 351 void *notes_section;
267 struct vmcore *new;
268 u64 offset, max_sz, sz, real_sz = 0; 352 u64 offset, max_sz, sz, real_sz = 0;
269 if (phdr_ptr->p_type != PT_NOTE) 353 if (phdr_ptr->p_type != PT_NOTE)
270 continue; 354 continue;
271 nr_ptnote++;
272 max_sz = phdr_ptr->p_memsz; 355 max_sz = phdr_ptr->p_memsz;
273 offset = phdr_ptr->p_offset; 356 offset = phdr_ptr->p_offset;
274 notes_section = kmalloc(max_sz, GFP_KERNEL); 357 notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -280,7 +363,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
280 return rc; 363 return rc;
281 } 364 }
282 nhdr_ptr = notes_section; 365 nhdr_ptr = notes_section;
283 for (j = 0; j < max_sz; j += sz) { 366 while (real_sz < max_sz) {
284 if (nhdr_ptr->n_namesz == 0) 367 if (nhdr_ptr->n_namesz == 0)
285 break; 368 break;
286 sz = sizeof(Elf64_Nhdr) + 369 sz = sizeof(Elf64_Nhdr) +
@@ -289,26 +372,122 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
289 real_sz += sz; 372 real_sz += sz;
290 nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); 373 nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
291 } 374 }
292
293 /* Add this contiguous chunk of notes section to vmcore list.*/
294 new = get_new_element();
295 if (!new) {
296 kfree(notes_section);
297 return -ENOMEM;
298 }
299 new->paddr = phdr_ptr->p_offset;
300 new->size = real_sz;
301 list_add_tail(&new->list, vc_list);
302 phdr_sz += real_sz;
303 kfree(notes_section); 375 kfree(notes_section);
376 phdr_ptr->p_memsz = real_sz;
377 }
378
379 return 0;
380}
381
382/**
383 * get_note_number_and_size_elf64 - get the number of PT_NOTE program
384 * headers and the total real size of their ELF note segment headers
385 * and data.
386 *
387 * @ehdr_ptr: ELF header
388 * @nr_ptnote: buffer for the number of PT_NOTE program headers
389 * @sz_ptnote: buffer for the size of the unique PT_NOTE program header
390 *
391 * This function is used to merge multiple PT_NOTE program headers
392 * into a single unique one. The resulting unique entry will have
393 * @sz_ptnote in its phdr->p_memsz.
394 *
395 * It is assumed that the program headers with PT_NOTE type pointed to
396 * by @ehdr_ptr have already been updated by update_note_header_size_elf64
397 * and that each PT_NOTE program header has the actual ELF note segment
398 * size in its p_memsz member.
399 */
400static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr,
401 int *nr_ptnote, u64 *sz_ptnote)
402{
403 int i;
404 Elf64_Phdr *phdr_ptr;
405
406 *nr_ptnote = *sz_ptnote = 0;
407
408 phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1);
409 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
410 if (phdr_ptr->p_type != PT_NOTE)
411 continue;
412 *nr_ptnote += 1;
413 *sz_ptnote += phdr_ptr->p_memsz;
414 }
415
416 return 0;
417}
418
419/**
420 * copy_notes_elf64 - copy ELF note segments into a given buffer
421 *
422 * @ehdr_ptr: ELF header
423 * @notes_buf: buffer into which ELF note segments are copied
424 *
425 * This function is used to copy the ELF note segment from the 1st
426 * kernel into the buffer @notes_buf in the 2nd kernel. It is assumed
427 * that the size of the buffer @notes_buf is equal to or larger than
428 * the sum of the real ELF note segment headers and data.
429 *
430 * It is assumed that the program headers with PT_NOTE type pointed to
431 * by @ehdr_ptr have already been updated by update_note_header_size_elf64
432 * and that each PT_NOTE program header has the actual ELF note segment
433 * size in its p_memsz member.
434 */
435static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf)
436{
437 int i, rc=0;
438 Elf64_Phdr *phdr_ptr;
439
440 phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1);
441
442 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
443 u64 offset;
444 if (phdr_ptr->p_type != PT_NOTE)
445 continue;
446 offset = phdr_ptr->p_offset;
447 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
448 if (rc < 0)
449 return rc;
450 notes_buf += phdr_ptr->p_memsz;
304 } 451 }
305 452
453 return 0;
454}
455
456/* Merges all the PT_NOTE headers into one. */
457static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
458 char **notes_buf, size_t *notes_sz)
459{
460 int i, nr_ptnote=0, rc=0;
461 char *tmp;
462 Elf64_Ehdr *ehdr_ptr;
463 Elf64_Phdr phdr;
464 u64 phdr_sz = 0, note_off;
465
466 ehdr_ptr = (Elf64_Ehdr *)elfptr;
467
468 rc = update_note_header_size_elf64(ehdr_ptr);
469 if (rc < 0)
470 return rc;
471
472 rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz);
473 if (rc < 0)
474 return rc;
475
476 *notes_sz = roundup(phdr_sz, PAGE_SIZE);
477 *notes_buf = alloc_elfnotes_buf(*notes_sz);
478 if (!*notes_buf)
479 return -ENOMEM;
480
481 rc = copy_notes_elf64(ehdr_ptr, *notes_buf);
482 if (rc < 0)
483 return rc;
484
306 /* Prepare merged PT_NOTE program header. */ 485 /* Prepare merged PT_NOTE program header. */
307 phdr.p_type = PT_NOTE; 486 phdr.p_type = PT_NOTE;
308 phdr.p_flags = 0; 487 phdr.p_flags = 0;
309 note_off = sizeof(Elf64_Ehdr) + 488 note_off = sizeof(Elf64_Ehdr) +
310 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); 489 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
311 phdr.p_offset = note_off; 490 phdr.p_offset = roundup(note_off, PAGE_SIZE);
312 phdr.p_vaddr = phdr.p_paddr = 0; 491 phdr.p_vaddr = phdr.p_paddr = 0;
313 phdr.p_filesz = phdr.p_memsz = phdr_sz; 492 phdr.p_filesz = phdr.p_memsz = phdr_sz;
314 phdr.p_align = 0; 493 phdr.p_align = 0;
@@ -322,6 +501,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
322 i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); 501 i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
323 *elfsz = *elfsz - i; 502 *elfsz = *elfsz - i;
324 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); 503 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
504 memset(elfptr + *elfsz, 0, i);
505 *elfsz = roundup(*elfsz, PAGE_SIZE);
325 506
326 /* Modify e_phnum to reflect merged headers. */ 507 /* Modify e_phnum to reflect merged headers. */
327 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; 508 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -329,27 +510,27 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
329 return 0; 510 return 0;
330} 511}
331 512
332/* Merges all the PT_NOTE headers into one. */ 513/**
333static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, 514 * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry
334 struct list_head *vc_list) 515 *
516 * @ehdr_ptr: ELF header
517 *
518 * This function updates the p_memsz member of each PT_NOTE entry in
519 * the program header table pointed to by @ehdr_ptr to the real size
520 * of the ELF note segment.
521 */
522static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
335{ 523{
336 int i, nr_ptnote=0, rc=0; 524 int i, rc=0;
337 char *tmp; 525 Elf32_Phdr *phdr_ptr;
338 Elf32_Ehdr *ehdr_ptr;
339 Elf32_Phdr phdr, *phdr_ptr;
340 Elf32_Nhdr *nhdr_ptr; 526 Elf32_Nhdr *nhdr_ptr;
341 u64 phdr_sz = 0, note_off;
342 527
343 ehdr_ptr = (Elf32_Ehdr *)elfptr; 528 phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
344 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr));
345 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 529 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
346 int j;
347 void *notes_section; 530 void *notes_section;
348 struct vmcore *new;
349 u64 offset, max_sz, sz, real_sz = 0; 531 u64 offset, max_sz, sz, real_sz = 0;
350 if (phdr_ptr->p_type != PT_NOTE) 532 if (phdr_ptr->p_type != PT_NOTE)
351 continue; 533 continue;
352 nr_ptnote++;
353 max_sz = phdr_ptr->p_memsz; 534 max_sz = phdr_ptr->p_memsz;
354 offset = phdr_ptr->p_offset; 535 offset = phdr_ptr->p_offset;
355 notes_section = kmalloc(max_sz, GFP_KERNEL); 536 notes_section = kmalloc(max_sz, GFP_KERNEL);
@@ -361,7 +542,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
361 return rc; 542 return rc;
362 } 543 }
363 nhdr_ptr = notes_section; 544 nhdr_ptr = notes_section;
364 for (j = 0; j < max_sz; j += sz) { 545 while (real_sz < max_sz) {
365 if (nhdr_ptr->n_namesz == 0) 546 if (nhdr_ptr->n_namesz == 0)
366 break; 547 break;
367 sz = sizeof(Elf32_Nhdr) + 548 sz = sizeof(Elf32_Nhdr) +
@@ -370,26 +551,122 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
370 real_sz += sz; 551 real_sz += sz;
371 nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); 552 nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
372 } 553 }
373
374 /* Add this contiguous chunk of notes section to vmcore list.*/
375 new = get_new_element();
376 if (!new) {
377 kfree(notes_section);
378 return -ENOMEM;
379 }
380 new->paddr = phdr_ptr->p_offset;
381 new->size = real_sz;
382 list_add_tail(&new->list, vc_list);
383 phdr_sz += real_sz;
384 kfree(notes_section); 554 kfree(notes_section);
555 phdr_ptr->p_memsz = real_sz;
556 }
557
558 return 0;
559}
560
561/**
562 * get_note_number_and_size_elf32 - get the number of PT_NOTE program
563 * headers and the total real size of their ELF note segment headers
564 * and data.
565 *
566 * @ehdr_ptr: ELF header
567 * @nr_ptnote: buffer for the number of PT_NOTE program headers
568 * @sz_ptnote: buffer for the size of the unique PT_NOTE program header
569 *
570 * This function is used to merge multiple PT_NOTE program headers
571 * into a single unique one. The resulting unique entry will have
572 * @sz_ptnote in its phdr->p_memsz.
573 *
574 * It is assumed that the program headers with PT_NOTE type pointed to
575 * by @ehdr_ptr have already been updated by update_note_header_size_elf32
576 * and that each PT_NOTE program header has the actual ELF note segment
577 * size in its p_memsz member.
578 */
579static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr,
580 int *nr_ptnote, u64 *sz_ptnote)
581{
582 int i;
583 Elf32_Phdr *phdr_ptr;
584
585 *nr_ptnote = *sz_ptnote = 0;
586
587 phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1);
588 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
589 if (phdr_ptr->p_type != PT_NOTE)
590 continue;
591 *nr_ptnote += 1;
592 *sz_ptnote += phdr_ptr->p_memsz;
593 }
594
595 return 0;
596}
597
598/**
599 * copy_notes_elf32 - copy ELF note segments into a given buffer
600 *
601 * @ehdr_ptr: ELF header
602 * @notes_buf: buffer into which ELF note segments are copied
603 *
604 * This function is used to copy the ELF note segment from the 1st
605 * kernel into the buffer @notes_buf in the 2nd kernel. It is assumed
606 * that the size of the buffer @notes_buf is equal to or larger than
607 * the sum of the real ELF note segment headers and data.
608 *
609 * It is assumed that the program headers with PT_NOTE type pointed to
610 * by @ehdr_ptr have already been updated by update_note_header_size_elf32
611 * and that each PT_NOTE program header has the actual ELF note segment
612 * size in its p_memsz member.
613 */
614static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf)
615{
616 int i, rc=0;
617 Elf32_Phdr *phdr_ptr;
618
619 phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1);
620
621 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
622 u64 offset;
623 if (phdr_ptr->p_type != PT_NOTE)
624 continue;
625 offset = phdr_ptr->p_offset;
626 rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0);
627 if (rc < 0)
628 return rc;
629 notes_buf += phdr_ptr->p_memsz;
385 } 630 }
386 631
632 return 0;
633}
634
635/* Merges all the PT_NOTE headers into one. */
636static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
637 char **notes_buf, size_t *notes_sz)
638{
639 int i, nr_ptnote=0, rc=0;
640 char *tmp;
641 Elf32_Ehdr *ehdr_ptr;
642 Elf32_Phdr phdr;
643 u64 phdr_sz = 0, note_off;
644
645 ehdr_ptr = (Elf32_Ehdr *)elfptr;
646
647 rc = update_note_header_size_elf32(ehdr_ptr);
648 if (rc < 0)
649 return rc;
650
651 rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz);
652 if (rc < 0)
653 return rc;
654
655 *notes_sz = roundup(phdr_sz, PAGE_SIZE);
656 *notes_buf = alloc_elfnotes_buf(*notes_sz);
657 if (!*notes_buf)
658 return -ENOMEM;
659
660 rc = copy_notes_elf32(ehdr_ptr, *notes_buf);
661 if (rc < 0)
662 return rc;
663
387 /* Prepare merged PT_NOTE program header. */ 664 /* Prepare merged PT_NOTE program header. */
388 phdr.p_type = PT_NOTE; 665 phdr.p_type = PT_NOTE;
389 phdr.p_flags = 0; 666 phdr.p_flags = 0;
390 note_off = sizeof(Elf32_Ehdr) + 667 note_off = sizeof(Elf32_Ehdr) +
391 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); 668 (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr);
392 phdr.p_offset = note_off; 669 phdr.p_offset = roundup(note_off, PAGE_SIZE);
393 phdr.p_vaddr = phdr.p_paddr = 0; 670 phdr.p_vaddr = phdr.p_paddr = 0;
394 phdr.p_filesz = phdr.p_memsz = phdr_sz; 671 phdr.p_filesz = phdr.p_memsz = phdr_sz;
395 phdr.p_align = 0; 672 phdr.p_align = 0;
@@ -403,6 +680,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
403 i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); 680 i = (nr_ptnote - 1) * sizeof(Elf32_Phdr);
404 *elfsz = *elfsz - i; 681 *elfsz = *elfsz - i;
405 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); 682 memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr)));
683 memset(elfptr + *elfsz, 0, i);
684 *elfsz = roundup(*elfsz, PAGE_SIZE);
406 685
407 /* Modify e_phnum to reflect merged headers. */ 686 /* Modify e_phnum to reflect merged headers. */
408 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; 687 ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
@@ -414,6 +693,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
414 * the new offset fields of exported program headers. */ 693 * the new offset fields of exported program headers. */
415static int __init process_ptload_program_headers_elf64(char *elfptr, 694static int __init process_ptload_program_headers_elf64(char *elfptr,
416 size_t elfsz, 695 size_t elfsz,
696 size_t elfnotes_sz,
417 struct list_head *vc_list) 697 struct list_head *vc_list)
418{ 698{
419 int i; 699 int i;
@@ -425,32 +705,38 @@ static int __init process_ptload_program_headers_elf64(char *elfptr,
425 ehdr_ptr = (Elf64_Ehdr *)elfptr; 705 ehdr_ptr = (Elf64_Ehdr *)elfptr;
426 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ 706 phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
427 707
428 /* First program header is PT_NOTE header. */ 708 /* Skip Elf header, program headers and Elf note segment. */
429 vmcore_off = sizeof(Elf64_Ehdr) + 709 vmcore_off = elfsz + elfnotes_sz;
430 (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
431 phdr_ptr->p_memsz; /* Note sections */
432 710
433 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 711 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
712 u64 paddr, start, end, size;
713
434 if (phdr_ptr->p_type != PT_LOAD) 714 if (phdr_ptr->p_type != PT_LOAD)
435 continue; 715 continue;
436 716
717 paddr = phdr_ptr->p_offset;
718 start = rounddown(paddr, PAGE_SIZE);
719 end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
720 size = end - start;
721
437 /* Add this contiguous chunk of memory to vmcore list.*/ 722 /* Add this contiguous chunk of memory to vmcore list.*/
438 new = get_new_element(); 723 new = get_new_element();
439 if (!new) 724 if (!new)
440 return -ENOMEM; 725 return -ENOMEM;
441 new->paddr = phdr_ptr->p_offset; 726 new->paddr = start;
442 new->size = phdr_ptr->p_memsz; 727 new->size = size;
443 list_add_tail(&new->list, vc_list); 728 list_add_tail(&new->list, vc_list);
444 729
445 /* Update the program header offset. */ 730 /* Update the program header offset. */
446 phdr_ptr->p_offset = vmcore_off; 731 phdr_ptr->p_offset = vmcore_off + (paddr - start);
447 vmcore_off = vmcore_off + phdr_ptr->p_memsz; 732 vmcore_off = vmcore_off + size;
448 } 733 }
449 return 0; 734 return 0;
450} 735}
451 736
452static int __init process_ptload_program_headers_elf32(char *elfptr, 737static int __init process_ptload_program_headers_elf32(char *elfptr,
453 size_t elfsz, 738 size_t elfsz,
739 size_t elfnotes_sz,
454 struct list_head *vc_list) 740 struct list_head *vc_list)
455{ 741{
456 int i; 742 int i;
@@ -462,43 +748,44 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
462 ehdr_ptr = (Elf32_Ehdr *)elfptr; 748 ehdr_ptr = (Elf32_Ehdr *)elfptr;
463 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ 749 phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */
464 750
465 /* First program header is PT_NOTE header. */ 751 /* Skip Elf header, program headers and Elf note segment. */
466 vmcore_off = sizeof(Elf32_Ehdr) + 752 vmcore_off = elfsz + elfnotes_sz;
467 (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) +
468 phdr_ptr->p_memsz; /* Note sections */
469 753
470 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { 754 for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
755 u64 paddr, start, end, size;
756
471 if (phdr_ptr->p_type != PT_LOAD) 757 if (phdr_ptr->p_type != PT_LOAD)
472 continue; 758 continue;
473 759
760 paddr = phdr_ptr->p_offset;
761 start = rounddown(paddr, PAGE_SIZE);
762 end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE);
763 size = end - start;
764
474 /* Add this contiguous chunk of memory to vmcore list.*/ 765 /* Add this contiguous chunk of memory to vmcore list.*/
475 new = get_new_element(); 766 new = get_new_element();
476 if (!new) 767 if (!new)
477 return -ENOMEM; 768 return -ENOMEM;
478 new->paddr = phdr_ptr->p_offset; 769 new->paddr = start;
479 new->size = phdr_ptr->p_memsz; 770 new->size = size;
480 list_add_tail(&new->list, vc_list); 771 list_add_tail(&new->list, vc_list);
481 772
482 /* Update the program header offset */ 773 /* Update the program header offset */
483 phdr_ptr->p_offset = vmcore_off; 774 phdr_ptr->p_offset = vmcore_off + (paddr - start);
484 vmcore_off = vmcore_off + phdr_ptr->p_memsz; 775 vmcore_off = vmcore_off + size;
485 } 776 }
486 return 0; 777 return 0;
487} 778}
488 779
489/* Sets offset fields of vmcore elements. */ 780/* Sets offset fields of vmcore elements. */
490static void __init set_vmcore_list_offsets_elf64(char *elfptr, 781static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
491 struct list_head *vc_list) 782 struct list_head *vc_list)
492{ 783{
493 loff_t vmcore_off; 784 loff_t vmcore_off;
494 Elf64_Ehdr *ehdr_ptr;
495 struct vmcore *m; 785 struct vmcore *m;
496 786
497 ehdr_ptr = (Elf64_Ehdr *)elfptr; 787 /* Skip Elf header, program headers and Elf note segment. */
498 788 vmcore_off = elfsz + elfnotes_sz;
499 /* Skip Elf header and program headers. */
500 vmcore_off = sizeof(Elf64_Ehdr) +
501 (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
502 789
503 list_for_each_entry(m, vc_list, list) { 790 list_for_each_entry(m, vc_list, list) {
504 m->offset = vmcore_off; 791 m->offset = vmcore_off;
@@ -506,24 +793,12 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr,
506 } 793 }
507} 794}
508 795
509/* Sets offset fields of vmcore elements. */ 796static void free_elfcorebuf(void)
510static void __init set_vmcore_list_offsets_elf32(char *elfptr,
511 struct list_head *vc_list)
512{ 797{
513 loff_t vmcore_off; 798 free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig));
514 Elf32_Ehdr *ehdr_ptr; 799 elfcorebuf = NULL;
515 struct vmcore *m; 800 vfree(elfnotes_buf);
516 801 elfnotes_buf = NULL;
517 ehdr_ptr = (Elf32_Ehdr *)elfptr;
518
519 /* Skip Elf header and program headers. */
520 vmcore_off = sizeof(Elf32_Ehdr) +
521 (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr);
522
523 list_for_each_entry(m, vc_list, list) {
524 m->offset = vmcore_off;
525 vmcore_off += m->size;
526 }
527} 802}
528 803
529static int __init parse_crash_elf64_headers(void) 804static int __init parse_crash_elf64_headers(void)
@@ -554,31 +829,32 @@ static int __init parse_crash_elf64_headers(void)
554 } 829 }
555 830
556 /* Read in all elf headers. */ 831 /* Read in all elf headers. */
557 elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); 832 elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) +
558 elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); 833 ehdr.e_phnum * sizeof(Elf64_Phdr);
834 elfcorebuf_sz = elfcorebuf_sz_orig;
835 elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
836 get_order(elfcorebuf_sz_orig));
559 if (!elfcorebuf) 837 if (!elfcorebuf)
560 return -ENOMEM; 838 return -ENOMEM;
561 addr = elfcorehdr_addr; 839 addr = elfcorehdr_addr;
562 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); 840 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
563 if (rc < 0) { 841 if (rc < 0)
564 kfree(elfcorebuf); 842 goto fail;
565 return rc;
566 }
567 843
568 /* Merge all PT_NOTE headers into one. */ 844 /* Merge all PT_NOTE headers into one. */
569 rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); 845 rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz,
570 if (rc) { 846 &elfnotes_buf, &elfnotes_sz);
571 kfree(elfcorebuf); 847 if (rc)
572 return rc; 848 goto fail;
573 }
574 rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, 849 rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
575 &vmcore_list); 850 elfnotes_sz, &vmcore_list);
576 if (rc) { 851 if (rc)
577 kfree(elfcorebuf); 852 goto fail;
578 return rc; 853 set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
579 }
580 set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list);
581 return 0; 854 return 0;
855fail:
856 free_elfcorebuf();
857 return rc;
582} 858}
583 859
584static int __init parse_crash_elf32_headers(void) 860static int __init parse_crash_elf32_headers(void)
@@ -609,31 +885,31 @@ static int __init parse_crash_elf32_headers(void)
609 } 885 }
610 886
611 /* Read in all elf headers. */ 887 /* Read in all elf headers. */
612 elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); 888 elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr);
613 elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); 889 elfcorebuf_sz = elfcorebuf_sz_orig;
890 elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
891 get_order(elfcorebuf_sz_orig));
614 if (!elfcorebuf) 892 if (!elfcorebuf)
615 return -ENOMEM; 893 return -ENOMEM;
616 addr = elfcorehdr_addr; 894 addr = elfcorehdr_addr;
617 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); 895 rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0);
618 if (rc < 0) { 896 if (rc < 0)
619 kfree(elfcorebuf); 897 goto fail;
620 return rc;
621 }
622 898
623 /* Merge all PT_NOTE headers into one. */ 899 /* Merge all PT_NOTE headers into one. */
624 rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); 900 rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz,
625 if (rc) { 901 &elfnotes_buf, &elfnotes_sz);
626 kfree(elfcorebuf); 902 if (rc)
627 return rc; 903 goto fail;
628 }
629 rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, 904 rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz,
630 &vmcore_list); 905 elfnotes_sz, &vmcore_list);
631 if (rc) { 906 if (rc)
632 kfree(elfcorebuf); 907 goto fail;
633 return rc; 908 set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
634 }
635 set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list);
636 return 0; 909 return 0;
910fail:
911 free_elfcorebuf();
912 return rc;
637} 913}
638 914
639static int __init parse_crash_elf_headers(void) 915static int __init parse_crash_elf_headers(void)
@@ -655,20 +931,19 @@ static int __init parse_crash_elf_headers(void)
655 rc = parse_crash_elf64_headers(); 931 rc = parse_crash_elf64_headers();
656 if (rc) 932 if (rc)
657 return rc; 933 return rc;
658
659 /* Determine vmcore size. */
660 vmcore_size = get_vmcore_size_elf64(elfcorebuf);
661 } else if (e_ident[EI_CLASS] == ELFCLASS32) { 934 } else if (e_ident[EI_CLASS] == ELFCLASS32) {
662 rc = parse_crash_elf32_headers(); 935 rc = parse_crash_elf32_headers();
663 if (rc) 936 if (rc)
664 return rc; 937 return rc;
665
666 /* Determine vmcore size. */
667 vmcore_size = get_vmcore_size_elf32(elfcorebuf);
668 } else { 938 } else {
669 pr_warn("Warning: Core image elf header is not sane\n"); 939 pr_warn("Warning: Core image elf header is not sane\n");
670 return -EINVAL; 940 return -EINVAL;
671 } 941 }
942
943 /* Determine vmcore size. */
944 vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
945 &vmcore_list);
946
672 return 0; 947 return 0;
673} 948}
674 949
@@ -711,7 +986,6 @@ void vmcore_cleanup(void)
711 list_del(&m->list); 986 list_del(&m->list);
712 kfree(m); 987 kfree(m);
713 } 988 }
714 kfree(elfcorebuf); 989 free_elfcorebuf();
715 elfcorebuf = NULL;
716} 990}
717EXPORT_SYMBOL_GPL(vmcore_cleanup); 991EXPORT_SYMBOL_GPL(vmcore_cleanup);
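
The vmcore rework above hinges on learning the real size of each PT_NOTE segment by walking raw note records before anything is copied or mapped. The following is a minimal userspace sketch of that size computation, not the kernel code: the function and buffer names are invented, it assumes glibc's <elf.h>, and it applies the standard 4-byte padding of ELF note name/descriptor fields (the padding arithmetic is elided in the hunk above).

	#include <elf.h>
	#include <stdio.h>
	#include <string.h>

	/* ELF note name/descriptor fields are padded to 4-byte boundaries. */
	static size_t align4(size_t v)
	{
		return (v + 3) & ~(size_t)3;
	}

	/*
	 * Walk Elf64_Nhdr records in buf and return the real size of the
	 * note segment, stopping at an empty (n_namesz == 0) terminator,
	 * like the while (real_sz < max_sz) loop in
	 * update_note_header_size_elf64().
	 */
	static size_t elf_notes_real_size(const char *buf, size_t max_sz)
	{
		size_t real_sz = 0;

		while (real_sz < max_sz) {
			const Elf64_Nhdr *nhdr =
				(const Elf64_Nhdr *)(buf + real_sz);

			if (nhdr->n_namesz == 0)
				break;
			real_sz += sizeof(Elf64_Nhdr) +
				align4(nhdr->n_namesz) + align4(nhdr->n_descsz);
		}
		return real_sz;
	}

	int main(void)
	{
		static char buf[64] __attribute__((aligned(4)));
		Elf64_Nhdr nhdr = { .n_namesz = 5, .n_descsz = 4, .n_type = 1 };

		memcpy(buf, &nhdr, sizeof(nhdr));
		memcpy(buf + sizeof(nhdr), "CORE", 5);	/* 4 desc bytes follow */

		/* 12-byte header + 8 (name padded to 4) + 4 (desc) = 24 */
		printf("real size: %zu\n", elf_notes_real_size(buf, sizeof(buf)));
		return 0;
	}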
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 43b12807a51d..76a4eeb92982 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -44,7 +44,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
44 rec.parent_ip = parent_ip; 44 rec.parent_ip = parent_ip;
45 pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); 45 pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
46 psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, 46 psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
47 sizeof(rec), psinfo); 47 0, sizeof(rec), psinfo);
48 48
49 local_irq_restore(flags); 49 local_irq_restore(flags);
50} 50}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index e4bcb2cf055a..71bf5f4ae84c 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -178,6 +178,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
178 if (p->psi->erase) 178 if (p->psi->erase)
179 p->psi->erase(p->type, p->id, p->count, 179 p->psi->erase(p->type, p->id, p->count,
180 dentry->d_inode->i_ctime, p->psi); 180 dentry->d_inode->i_ctime, p->psi);
181 else
182 return -EPERM;
181 183
182 return simple_unlink(dir, dentry); 184 return simple_unlink(dir, dentry);
183} 185}
@@ -324,6 +326,15 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
324 case PSTORE_TYPE_MCE: 326 case PSTORE_TYPE_MCE:
325 sprintf(name, "mce-%s-%lld", psname, id); 327 sprintf(name, "mce-%s-%lld", psname, id);
326 break; 328 break;
329 case PSTORE_TYPE_PPC_RTAS:
330 sprintf(name, "rtas-%s-%lld", psname, id);
331 break;
332 case PSTORE_TYPE_PPC_OF:
333 sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
334 break;
335 case PSTORE_TYPE_PPC_COMMON:
336 sprintf(name, "powerpc-common-%s-%lld", psname, id);
337 break;
327 case PSTORE_TYPE_UNKNOWN: 338 case PSTORE_TYPE_UNKNOWN:
328 sprintf(name, "unknown-%s-%lld", psname, id); 339 sprintf(name, "unknown-%s-%lld", psname, id);
329 break; 340 break;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 86d1038b5a12..422962ae9fc2 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -159,7 +159,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
159 break; 159 break;
160 160
161 ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, 161 ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
162 oopscount, hsize + len, psinfo); 162 oopscount, hsize, hsize + len, psinfo);
163 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) 163 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
164 pstore_new_entry = 1; 164 pstore_new_entry = 1;
165 165
@@ -196,7 +196,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
196 spin_lock_irqsave(&psinfo->buf_lock, flags); 196 spin_lock_irqsave(&psinfo->buf_lock, flags);
197 } 197 }
198 memcpy(psinfo->buf, s, c); 198 memcpy(psinfo->buf, s, c);
199 psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo); 199 psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo);
200 spin_unlock_irqrestore(&psinfo->buf_lock, flags); 200 spin_unlock_irqrestore(&psinfo->buf_lock, flags);
201 s += c; 201 s += c;
202 c = e - s; 202 c = e - s;
@@ -221,9 +221,11 @@ static void pstore_register_console(void) {}
221static int pstore_write_compat(enum pstore_type_id type, 221static int pstore_write_compat(enum pstore_type_id type,
222 enum kmsg_dump_reason reason, 222 enum kmsg_dump_reason reason,
223 u64 *id, unsigned int part, int count, 223 u64 *id, unsigned int part, int count,
224 size_t size, struct pstore_info *psi) 224 size_t hsize, size_t size,
225 struct pstore_info *psi)
225{ 226{
226 return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi); 227 return psi->write_buf(type, reason, id, part, psinfo->buf, hsize,
228 size, psi);
227} 229}
228 230
229/* 231/*
@@ -239,17 +241,15 @@ int pstore_register(struct pstore_info *psi)
239{ 241{
240 struct module *owner = psi->owner; 242 struct module *owner = psi->owner;
241 243
244 if (backend && strcmp(backend, psi->name))
245 return -EPERM;
246
242 spin_lock(&pstore_lock); 247 spin_lock(&pstore_lock);
243 if (psinfo) { 248 if (psinfo) {
244 spin_unlock(&pstore_lock); 249 spin_unlock(&pstore_lock);
245 return -EBUSY; 250 return -EBUSY;
246 } 251 }
247 252
248 if (backend && strcmp(backend, psi->name)) {
249 spin_unlock(&pstore_lock);
250 return -EINVAL;
251 }
252
253 if (!psi->write) 253 if (!psi->write)
254 psi->write = pstore_write_compat; 254 psi->write = pstore_write_compat;
255 psinfo = psi; 255 psinfo = psi;
@@ -274,6 +274,9 @@ int pstore_register(struct pstore_info *psi)
274 add_timer(&pstore_timer); 274 add_timer(&pstore_timer);
275 } 275 }
276 276
277 pr_info("pstore: Registered %s as persistent store backend\n",
278 psi->name);
279
277 return 0; 280 return 0;
278} 281}
279EXPORT_SYMBOL_GPL(pstore_register); 282EXPORT_SYMBOL_GPL(pstore_register);
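
Two behavioural changes ride along in this pstore hunk: a backend that does not match the backend= parameter is now rejected with -EPERM before pstore_lock is ever taken, and a successful registration is logged. A rough userspace analogue of that check-before-lock ordering (all names here are invented for illustration):

	#include <errno.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <string.h>

	static pthread_mutex_t store_lock = PTHREAD_MUTEX_INITIALIZER;
	static const char *registered;	/* currently registered backend */
	static const char *preferred;	/* like the pstore "backend=" param */

	/* Register a backend, mirroring the reordered checks above. */
	static int backend_register(const char *name)
	{
		/* Cheap parameter check first, before taking the lock. */
		if (preferred && strcmp(preferred, name))
			return -EPERM;

		pthread_mutex_lock(&store_lock);
		if (registered) {
			pthread_mutex_unlock(&store_lock);
			return -EBUSY;	/* only one backend at a time */
		}
		registered = name;
		pthread_mutex_unlock(&store_lock);

		printf("Registered %s as persistent store backend\n", name);
		return 0;
	}

	int main(void)
	{
		preferred = "ramoops";
		printf("%d\n", backend_register("efi"));	/* -EPERM */
		printf("%d\n", backend_register("ramoops"));	/* 0 */
		printf("%d\n", backend_register("ramoops"));	/* -EBUSY */
		return 0;
	}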
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 1376e5a8f0d6..a6119f9469e2 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -195,7 +195,8 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz)
195static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, 195static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
196 enum kmsg_dump_reason reason, 196 enum kmsg_dump_reason reason,
197 u64 *id, unsigned int part, 197 u64 *id, unsigned int part,
198 const char *buf, size_t size, 198 const char *buf,
199 size_t hsize, size_t size,
199 struct pstore_info *psi) 200 struct pstore_info *psi)
200{ 201{
201 struct ramoops_context *cxt = psi->data; 202 struct ramoops_context *cxt = psi->data;
@@ -399,8 +400,6 @@ static int ramoops_probe(struct platform_device *pdev)
399 goto fail_out; 400 goto fail_out;
400 } 401 }
401 402
402 if (!is_power_of_2(pdata->mem_size))
403 pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
404 if (!is_power_of_2(pdata->record_size)) 403 if (!is_power_of_2(pdata->record_size))
405 pdata->record_size = rounddown_pow_of_two(pdata->record_size); 404 pdata->record_size = rounddown_pow_of_two(pdata->record_size);
406 if (!is_power_of_2(pdata->console_size)) 405 if (!is_power_of_2(pdata->console_size))
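
The probe path stops forcing mem_size to a power of two but still rounds record_size and console_size down. For reference, rounding down to a power of two amounts to clearing low set bits until one bit remains; a tiny standalone sketch (the kernel's own rounddown_pow_of_two() is a log2-based macro, so this is only an illustration):

	#include <stdio.h>

	/* Clear the lowest set bit until a single bit (a power of two) remains. */
	static unsigned long down_pow2(unsigned long v)
	{
		while (v & (v - 1))
			v &= v - 1;
		return v;
	}

	int main(void)
	{
		printf("%lu\n", down_pow2(12345));	/* prints 8192 */
		return 0;
	}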
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 59337326e288..de272d426763 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -46,7 +46,7 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz)
46} 46}
47 47
48/* increase and wrap the start pointer, returning the old value */ 48/* increase and wrap the start pointer, returning the old value */
49static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) 49static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
50{ 50{
51 int old; 51 int old;
52 int new; 52 int new;
@@ -62,7 +62,7 @@ static inline size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a)
62} 62}
63 63
64/* increase the size counter until it hits the max size */ 64/* increase the size counter until it hits the max size */
65static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a) 65static void buffer_size_add_atomic(struct persistent_ram_zone *prz, size_t a)
66{ 66{
67 size_t old; 67 size_t old;
68 size_t new; 68 size_t new;
@@ -78,6 +78,53 @@ static inline void buffer_size_add(struct persistent_ram_zone *prz, size_t a)
78 } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old); 78 } while (atomic_cmpxchg(&prz->buffer->size, old, new) != old);
79} 79}
80 80
81static DEFINE_RAW_SPINLOCK(buffer_lock);
82
83/* increase and wrap the start pointer, returning the old value */
84static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
85{
86 int old;
87 int new;
88 unsigned long flags;
89
90 raw_spin_lock_irqsave(&buffer_lock, flags);
91
92 old = atomic_read(&prz->buffer->start);
93 new = old + a;
94 while (unlikely(new > prz->buffer_size))
95 new -= prz->buffer_size;
96 atomic_set(&prz->buffer->start, new);
97
98 raw_spin_unlock_irqrestore(&buffer_lock, flags);
99
100 return old;
101}
102
103/* increase the size counter until it hits the max size */
104static void buffer_size_add_locked(struct persistent_ram_zone *prz, size_t a)
105{
106 size_t old;
107 size_t new;
108 unsigned long flags;
109
110 raw_spin_lock_irqsave(&buffer_lock, flags);
111
112 old = atomic_read(&prz->buffer->size);
113 if (old == prz->buffer_size)
114 goto exit;
115
116 new = old + a;
117 if (new > prz->buffer_size)
118 new = prz->buffer_size;
119 atomic_set(&prz->buffer->size, new);
120
121exit:
122 raw_spin_unlock_irqrestore(&buffer_lock, flags);
123}
124
125static size_t (*buffer_start_add)(struct persistent_ram_zone *, size_t) = buffer_start_add_atomic;
126static void (*buffer_size_add)(struct persistent_ram_zone *, size_t) = buffer_size_add_atomic;
127
81static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, 128static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz,
82 uint8_t *data, size_t len, uint8_t *ecc) 129 uint8_t *data, size_t len, uint8_t *ecc)
83{ 130{
@@ -372,6 +419,9 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size)
372 return NULL; 419 return NULL;
373 } 420 }
374 421
422 buffer_start_add = buffer_start_add_locked;
423 buffer_size_add = buffer_size_add_locked;
424
375 return ioremap(start, size); 425 return ioremap(start, size);
376} 426}
377 427
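
The locked variants exist because the atomic cmpxchg loops cannot be relied on for ioremap()ed memory, so persistent_ram_iomap() flips the accessors over to spinlock-protected versions at init time via the two function pointers. A userspace sketch of that one-time function-pointer switch (the ring structure and all names are made up for the example):

	#include <pthread.h>
	#include <stddef.h>
	#include <stdio.h>

	struct ring {
		size_t start;
		size_t size;		/* capacity */
	};

	static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Plain variant: fine for ordinary cached RAM. */
	static size_t start_add_plain(struct ring *r, size_t a)
	{
		size_t old = r->start;

		r->start = (old + a) % r->size;
		return old;
	}

	/* Locked variant: for memory where atomic read-modify-write is unsafe. */
	static size_t start_add_locked(struct ring *r, size_t a)
	{
		size_t old;

		pthread_mutex_lock(&buf_lock);
		old = r->start;
		r->start = (old + a) % r->size;
		pthread_mutex_unlock(&buf_lock);
		return old;
	}

	/* Default to the cheap variant; switched once at init, as in ram_core.c. */
	static size_t (*start_add)(struct ring *, size_t) = start_add_plain;

	int main(void)
	{
		struct ring r = { .start = 0, .size = 16 };
		int region_is_iomem = 1;	/* pretend it was ioremap()ed */
		size_t old;

		if (region_is_iomem)
			start_add = start_add_locked;

		old = start_add(&r, 20);
		printf("old %zu, new %zu\n", old, r.start);	/* old 0, new 4 */
		return 0;
	}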
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 28ce014b3cef..b218f965817b 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -14,9 +14,9 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include "qnx4.h" 15#include "qnx4.h"
16 16
17static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) 17static int qnx4_readdir(struct file *file, struct dir_context *ctx)
18{ 18{
19 struct inode *inode = file_inode(filp); 19 struct inode *inode = file_inode(file);
20 unsigned int offset; 20 unsigned int offset;
21 struct buffer_head *bh; 21 struct buffer_head *bh;
22 struct qnx4_inode_entry *de; 22 struct qnx4_inode_entry *de;
@@ -26,48 +26,44 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
26 int size; 26 int size;
27 27
28 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); 28 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
29 QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos)); 29 QNX4DEBUG((KERN_INFO "pos = %ld\n", (long) ctx->pos));
30 30
31 while (filp->f_pos < inode->i_size) { 31 while (ctx->pos < inode->i_size) {
32 blknum = qnx4_block_map( inode, filp->f_pos >> QNX4_BLOCK_SIZE_BITS ); 32 blknum = qnx4_block_map(inode, ctx->pos >> QNX4_BLOCK_SIZE_BITS);
33 bh = sb_bread(inode->i_sb, blknum); 33 bh = sb_bread(inode->i_sb, blknum);
34 if(bh==NULL) { 34 if (bh == NULL) {
35 printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum); 35 printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", blknum);
36 break; 36 return 0;
37 } 37 }
38 ix = (int)(filp->f_pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; 38 ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
39 while (ix < QNX4_INODES_PER_BLOCK) { 39 for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) {
40 offset = ix * QNX4_DIR_ENTRY_SIZE; 40 offset = ix * QNX4_DIR_ENTRY_SIZE;
41 de = (struct qnx4_inode_entry *) (bh->b_data + offset); 41 de = (struct qnx4_inode_entry *) (bh->b_data + offset);
42 size = strlen(de->di_fname); 42 if (!de->di_fname[0])
43 if (size) { 43 continue;
44 if ( !( de->di_status & QNX4_FILE_LINK ) && size > QNX4_SHORT_NAME_MAX ) 44 if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
45 size = QNX4_SHORT_NAME_MAX; 45 continue;
46 else if ( size > QNX4_NAME_MAX ) 46 if (!(de->di_status & QNX4_FILE_LINK))
47 size = QNX4_NAME_MAX; 47 size = QNX4_SHORT_NAME_MAX;
48 48 else
49 if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) { 49 size = QNX4_NAME_MAX;
50 QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); 50 size = strnlen(de->di_fname, size);
51 if ( ( de->di_status & QNX4_FILE_LINK ) == 0 ) 51 QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
52 ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; 52 if (!(de->di_status & QNX4_FILE_LINK))
53 else { 53 ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
54 le = (struct qnx4_link_info*)de; 54 else {
55 ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * 55 le = (struct qnx4_link_info*)de;
56 QNX4_INODES_PER_BLOCK + 56 ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
57 le->dl_inode_ndx; 57 QNX4_INODES_PER_BLOCK +
58 } 58 le->dl_inode_ndx;
59 if (filldir(dirent, de->di_fname, size, filp->f_pos, ino, DT_UNKNOWN) < 0) { 59 }
60 brelse(bh); 60 if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) {
61 goto out; 61 brelse(bh);
62 } 62 return 0;
63 }
64 } 63 }
65 ix++;
66 filp->f_pos += QNX4_DIR_ENTRY_SIZE;
67 } 64 }
68 brelse(bh); 65 brelse(bh);
69 } 66 }
70out:
71 return 0; 67 return 0;
72} 68}
73 69
@@ -75,7 +71,7 @@ const struct file_operations qnx4_dir_operations =
75{ 71{
76 .llseek = generic_file_llseek, 72 .llseek = generic_file_llseek,
77 .read = generic_read_dir, 73 .read = generic_read_dir,
78 .readdir = qnx4_readdir, 74 .iterate = qnx4_readdir,
79 .fsync = generic_file_fsync, 75 .fsync = generic_file_fsync,
80}; 76};
81 77
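
This qnx4 change is one instance of the tree-wide readdir-to-iterate conversion: the filldir callback plus opaque dirent pointer become a struct dir_context whose ->pos the filesystem advances directly, and entries are handed over with dir_emit(). A toy userspace model of the pattern, with the struct layout simplified and all names chosen for the example (the kernel's dir_emit() also takes a d_type):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Simplified analogue of the kernel's struct dir_context. */
	struct dir_context;
	typedef bool (*dir_actor_t)(struct dir_context *ctx, const char *name,
				    int namelen, uint64_t ino);

	struct dir_context {
		dir_actor_t actor;
		long long pos;
	};

	/* Analogue of dir_emit(): feed one entry to the actor; false = stop. */
	static bool dir_emit(struct dir_context *ctx, const char *name,
			     int namelen, uint64_t ino)
	{
		return ctx->actor(ctx, name, namelen, ino);
	}

	/* A toy ->iterate(): walk a fixed table, advancing ctx->pos as it goes. */
	static int toy_iterate(struct dir_context *ctx)
	{
		static const char *names[] = { ".", "..", "hello.txt" };
		size_t n = sizeof(names) / sizeof(names[0]);

		for (; (size_t)ctx->pos < n; ctx->pos++) {
			const char *name = names[ctx->pos];

			if (!dir_emit(ctx, name, (int)strlen(name),
				      (uint64_t)ctx->pos + 1))
				return 0;	/* ctx->pos marks the resume point */
		}
		return 0;
	}

	static bool print_actor(struct dir_context *ctx, const char *name,
				int namelen, uint64_t ino)
	{
		(void)ctx;
		printf("ino %llu: %.*s\n", (unsigned long long)ino, namelen, name);
		return true;		/* keep going */
	}

	int main(void)
	{
		struct dir_context ctx = { .actor = print_actor, .pos = 0 };

		toy_iterate(&ctx);
		return 0;
	}

The payoff, visible in the hunk above, is that per-filesystem f_pos bookkeeping disappears; the core copies f_pos into ctx->pos and back around the ->iterate() call.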
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index afa6be6fc397..15b7d92ed60d 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -65,8 +65,8 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb,
65 65
66static int qnx6_dir_longfilename(struct inode *inode, 66static int qnx6_dir_longfilename(struct inode *inode,
67 struct qnx6_long_dir_entry *de, 67 struct qnx6_long_dir_entry *de,
68 void *dirent, loff_t pos, 68 struct dir_context *ctx,
69 unsigned de_inode, filldir_t filldir) 69 unsigned de_inode)
70{ 70{
71 struct qnx6_long_filename *lf; 71 struct qnx6_long_filename *lf;
72 struct super_block *s = inode->i_sb; 72 struct super_block *s = inode->i_sb;
@@ -104,8 +104,7 @@ static int qnx6_dir_longfilename(struct inode *inode,
104 104
105 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", 105 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n",
106 lf_size, lf->lf_fname, de_inode)); 106 lf_size, lf->lf_fname, de_inode));
107 if (filldir(dirent, lf->lf_fname, lf_size, pos, de_inode, 107 if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) {
108 DT_UNKNOWN) < 0) {
109 qnx6_put_page(page); 108 qnx6_put_page(page);
110 return 0; 109 return 0;
111 } 110 }
@@ -115,18 +114,19 @@ static int qnx6_dir_longfilename(struct inode *inode,
115 return 1; 114 return 1;
116} 115}
117 116
118static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) 117static int qnx6_readdir(struct file *file, struct dir_context *ctx)
119{ 118{
120 struct inode *inode = file_inode(filp); 119 struct inode *inode = file_inode(file);
121 struct super_block *s = inode->i_sb; 120 struct super_block *s = inode->i_sb;
122 struct qnx6_sb_info *sbi = QNX6_SB(s); 121 struct qnx6_sb_info *sbi = QNX6_SB(s);
123 loff_t pos = filp->f_pos & ~(QNX6_DIR_ENTRY_SIZE - 1); 122 loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
124 unsigned long npages = dir_pages(inode); 123 unsigned long npages = dir_pages(inode);
125 unsigned long n = pos >> PAGE_CACHE_SHIFT; 124 unsigned long n = pos >> PAGE_CACHE_SHIFT;
126 unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; 125 unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE;
127 bool done = false; 126 bool done = false;
128 127
129 if (filp->f_pos >= inode->i_size) 128 ctx->pos = pos;
129 if (ctx->pos >= inode->i_size)
130 return 0; 130 return 0;
131 131
132 for ( ; !done && n < npages; n++, start = 0) { 132 for ( ; !done && n < npages; n++, start = 0) {
@@ -137,11 +137,11 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
137 137
138 if (IS_ERR(page)) { 138 if (IS_ERR(page)) {
139 printk(KERN_ERR "qnx6_readdir: read failed\n"); 139 printk(KERN_ERR "qnx6_readdir: read failed\n");
140 filp->f_pos = (n + 1) << PAGE_CACHE_SHIFT; 140 ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
141 return PTR_ERR(page); 141 return PTR_ERR(page);
142 } 142 }
143 de = ((struct qnx6_dir_entry *)page_address(page)) + start; 143 de = ((struct qnx6_dir_entry *)page_address(page)) + start;
144 for (; i < limit; i++, de++, pos += QNX6_DIR_ENTRY_SIZE) { 144 for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) {
145 int size = de->de_size; 145 int size = de->de_size;
146 u32 no_inode = fs32_to_cpu(sbi, de->de_inode); 146 u32 no_inode = fs32_to_cpu(sbi, de->de_inode);
147 147
@@ -154,8 +154,7 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
154 structure / block */ 154 structure / block */
155 if (!qnx6_dir_longfilename(inode, 155 if (!qnx6_dir_longfilename(inode,
156 (struct qnx6_long_dir_entry *)de, 156 (struct qnx6_long_dir_entry *)de,
157 dirent, pos, no_inode, 157 ctx, no_inode)) {
158 filldir)) {
159 done = true; 158 done = true;
160 break; 159 break;
161 } 160 }
@@ -163,9 +162,8 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
163 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" 162 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s"
164 " inode:%u\n", size, de->de_fname, 163 " inode:%u\n", size, de->de_fname,
165 no_inode)); 164 no_inode));
166 if (filldir(dirent, de->de_fname, size, 165 if (!dir_emit(ctx, de->de_fname, size,
167 pos, no_inode, DT_UNKNOWN) 166 no_inode, DT_UNKNOWN)) {
168 < 0) {
169 done = true; 167 done = true;
170 break; 168 break;
171 } 169 }
@@ -173,7 +171,6 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir)
173 } 171 }
174 qnx6_put_page(page); 172 qnx6_put_page(page);
175 } 173 }
176 filp->f_pos = pos;
177 return 0; 174 return 0;
178} 175}
179 176
@@ -282,7 +279,7 @@ found:
282const struct file_operations qnx6_dir_operations = { 279const struct file_operations qnx6_dir_operations = {
283 .llseek = generic_file_llseek, 280 .llseek = generic_file_llseek,
284 .read = generic_read_dir, 281 .read = generic_read_dir,
285 .readdir = qnx6_readdir, 282 .iterate = qnx6_readdir,
286 .fsync = generic_file_fsync, 283 .fsync = generic_file_fsync,
287}; 284};
288 285
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3e64169ef527..fbad622841f9 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2585,7 +2585,7 @@ static int do_proc_dqstats(struct ctl_table *table, int write,
2585 return proc_dointvec(table, write, buffer, lenp, ppos); 2585 return proc_dointvec(table, write, buffer, lenp, ppos);
2586} 2586}
2587 2587
2588static ctl_table fs_dqstats_table[] = { 2588static struct ctl_table fs_dqstats_table[] = {
2589 { 2589 {
2590 .procname = "lookups", 2590 .procname = "lookups",
2591 .data = &dqstats.stat[DQST_LOOKUPS], 2591 .data = &dqstats.stat[DQST_LOOKUPS],
@@ -2654,7 +2654,7 @@ static ctl_table fs_dqstats_table[] = {
2654 { }, 2654 { },
2655}; 2655};
2656 2656
2657static ctl_table fs_table[] = { 2657static struct ctl_table fs_table[] = {
2658 { 2658 {
2659 .procname = "quota", 2659 .procname = "quota",
2660 .mode = 0555, 2660 .mode = 0555,
@@ -2663,7 +2663,7 @@ static ctl_table fs_table[] = {
2663 { }, 2663 { },
2664}; 2664};
2665 2665
2666static ctl_table sys_table[] = { 2666static struct ctl_table sys_table[] = {
2667 { 2667 {
2668 .procname = "fs", 2668 .procname = "fs",
2669 .mode = 0555, 2669 .mode = 0555,
diff --git a/fs/read_write.c b/fs/read_write.c
index 2cefa417be34..122a3846d9e1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -41,8 +41,19 @@ static inline int unsigned_offsets(struct file *file)
41 return file->f_mode & FMODE_UNSIGNED_OFFSET; 41 return file->f_mode & FMODE_UNSIGNED_OFFSET;
42} 42}
43 43
44static loff_t lseek_execute(struct file *file, struct inode *inode, 44/**
45 loff_t offset, loff_t maxsize) 45 * vfs_setpos - update the file offset for lseek
46 * @file: file structure in question
47 * @offset: file offset to seek to
48 * @maxsize: maximum file size
49 *
50 * This is a low-level filesystem helper for updating the file offset to
51 * the value specified by @offset if the given offset is valid and it is
52 * not equal to the current file offset.
53 *
54 * Return the specified offset on success and -EINVAL on invalid offset.
55 */
56loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
46{ 57{
47 if (offset < 0 && !unsigned_offsets(file)) 58 if (offset < 0 && !unsigned_offsets(file))
48 return -EINVAL; 59 return -EINVAL;
@@ -55,6 +66,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode,
55 } 66 }
56 return offset; 67 return offset;
57} 68}
69EXPORT_SYMBOL(vfs_setpos);
58 70
59/** 71/**
60 * generic_file_llseek_size - generic llseek implementation for regular files 72 * generic_file_llseek_size - generic llseek implementation for regular files
@@ -76,8 +88,6 @@ loff_t
76generic_file_llseek_size(struct file *file, loff_t offset, int whence, 88generic_file_llseek_size(struct file *file, loff_t offset, int whence,
77 loff_t maxsize, loff_t eof) 89 loff_t maxsize, loff_t eof)
78{ 90{
79 struct inode *inode = file->f_mapping->host;
80
81 switch (whence) { 91 switch (whence) {
82 case SEEK_END: 92 case SEEK_END:
83 offset += eof; 93 offset += eof;
@@ -97,8 +107,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
97 * like SEEK_SET. 107 * like SEEK_SET.
98 */ 108 */
99 spin_lock(&file->f_lock); 109 spin_lock(&file->f_lock);
100 offset = lseek_execute(file, inode, file->f_pos + offset, 110 offset = vfs_setpos(file, file->f_pos + offset, maxsize);
101 maxsize);
102 spin_unlock(&file->f_lock); 111 spin_unlock(&file->f_lock);
103 return offset; 112 return offset;
104 case SEEK_DATA: 113 case SEEK_DATA:
@@ -120,7 +129,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
120 break; 129 break;
121 } 130 }
122 131
123 return lseek_execute(file, inode, offset, maxsize); 132 return vfs_setpos(file, offset, maxsize);
124} 133}
125EXPORT_SYMBOL(generic_file_llseek_size); 134EXPORT_SYMBOL(generic_file_llseek_size);
126 135
@@ -145,6 +154,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
145EXPORT_SYMBOL(generic_file_llseek); 154EXPORT_SYMBOL(generic_file_llseek);
146 155
147/** 156/**
157 * fixed_size_llseek - llseek implementation for fixed-sized devices
158 * @file: file structure to seek on
159 * @offset: file offset to seek to
160 * @whence: type of seek
161 * @size: size of the file
162 *
163 */
164loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
165{
166 switch (whence) {
167 case SEEK_SET: case SEEK_CUR: case SEEK_END:
168 return generic_file_llseek_size(file, offset, whence,
169 size, size);
170 default:
171 return -EINVAL;
172 }
173}
174EXPORT_SYMBOL(fixed_size_llseek);
175
176/**
148 * noop_llseek - No Operation Performed llseek implementation 177 * noop_llseek - No Operation Performed llseek implementation
149 * @file: file structure to seek on 178 * @file: file structure to seek on
150 * @offset: file offset to seek to 179 * @offset: file offset to seek to
@@ -296,7 +325,7 @@ out_putf:
296 * them to something that fits in "int" so that others 325 * them to something that fits in "int" so that others
297 * won't have to do range checks all the time. 326 * won't have to do range checks all the time.
298 */ 327 */
299int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) 328int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
300{ 329{
301 struct inode *inode; 330 struct inode *inode;
302 loff_t pos; 331 loff_t pos;
@@ -477,7 +506,8 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
477 if (f.file) { 506 if (f.file) {
478 loff_t pos = file_pos_read(f.file); 507 loff_t pos = file_pos_read(f.file);
479 ret = vfs_read(f.file, buf, count, &pos); 508 ret = vfs_read(f.file, buf, count, &pos);
480 file_pos_write(f.file, pos); 509 if (ret >= 0)
510 file_pos_write(f.file, pos);
481 fdput(f); 511 fdput(f);
482 } 512 }
483 return ret; 513 return ret;
@@ -492,7 +522,8 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
492 if (f.file) { 522 if (f.file) {
493 loff_t pos = file_pos_read(f.file); 523 loff_t pos = file_pos_read(f.file);
494 ret = vfs_write(f.file, buf, count, &pos); 524 ret = vfs_write(f.file, buf, count, &pos);
495 file_pos_write(f.file, pos); 525 if (ret >= 0)
526 file_pos_write(f.file, pos);
496 fdput(f); 527 fdput(f);
497 } 528 }
498 529
@@ -780,7 +811,8 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
780 if (f.file) { 811 if (f.file) {
781 loff_t pos = file_pos_read(f.file); 812 loff_t pos = file_pos_read(f.file);
782 ret = vfs_readv(f.file, vec, vlen, &pos); 813 ret = vfs_readv(f.file, vec, vlen, &pos);
783 file_pos_write(f.file, pos); 814 if (ret >= 0)
815 file_pos_write(f.file, pos);
784 fdput(f); 816 fdput(f);
785 } 817 }
786 818
@@ -799,7 +831,8 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
799 if (f.file) { 831 if (f.file) {
800 loff_t pos = file_pos_read(f.file); 832 loff_t pos = file_pos_read(f.file);
801 ret = vfs_writev(f.file, vec, vlen, &pos); 833 ret = vfs_writev(f.file, vec, vlen, &pos);
802 file_pos_write(f.file, pos); 834 if (ret >= 0)
835 file_pos_write(f.file, pos);
803 fdput(f); 836 fdput(f);
804 } 837 }
805 838
@@ -959,7 +992,8 @@ COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
959 return -EBADF; 992 return -EBADF;
960 pos = f.file->f_pos; 993 pos = f.file->f_pos;
961 ret = compat_readv(f.file, vec, vlen, &pos); 994 ret = compat_readv(f.file, vec, vlen, &pos);
962 f.file->f_pos = pos; 995 if (ret >= 0)
996 f.file->f_pos = pos;
963 fdput(f); 997 fdput(f);
964 return ret; 998 return ret;
965} 999}
@@ -1025,7 +1059,8 @@ COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
1025 return -EBADF; 1059 return -EBADF;
1026 pos = f.file->f_pos; 1060 pos = f.file->f_pos;
1027 ret = compat_writev(f.file, vec, vlen, &pos); 1061 ret = compat_writev(f.file, vec, vlen, &pos);
1028 f.file->f_pos = pos; 1062 if (ret >= 0)
1063 f.file->f_pos = pos;
1029 fdput(f); 1064 fdput(f);
1030 return ret; 1065 return ret;
1031} 1066}
@@ -1129,7 +1164,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1129 if (in.file->f_flags & O_NONBLOCK) 1164 if (in.file->f_flags & O_NONBLOCK)
1130 fl = SPLICE_F_NONBLOCK; 1165 fl = SPLICE_F_NONBLOCK;
1131#endif 1166#endif
1167 file_start_write(out.file);
1132 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); 1168 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
1169 file_end_write(out.file);
1133 1170
1134 if (retval > 0) { 1171 if (retval > 0) {
1135 add_rchar(current, retval); 1172 add_rchar(current, retval);
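
In this read_write.c hunk, lseek_execute() becomes the exported helper vfs_setpos(), fixed_size_llseek() builds on it for devices with a hard size limit, and the syscall paths stop writing back f_pos on error. A userspace sketch of the seek validation (names are invented; the real helpers also skip the store when the offset is unchanged and honour FMODE_UNSIGNED_OFFSET):

	#include <errno.h>
	#include <stdio.h>

	/* Validate against a fixed maximum, then commit, like vfs_setpos(). */
	static long long setpos(long long *pos, long long offset, long long maxsize)
	{
		if (offset < 0 || offset > maxsize)
			return -EINVAL;
		*pos = offset;
		return offset;
	}

	/* Seek within a fixed-size device, like fixed_size_llseek(). */
	static long long fixed_lseek(long long *pos, long long offset,
				     int whence, long long size)
	{
		switch (whence) {
		case SEEK_SET:
			return setpos(pos, offset, size);
		case SEEK_CUR:
			return setpos(pos, *pos + offset, size);
		case SEEK_END:
			return setpos(pos, size + offset, size);
		default:
			return -EINVAL;
		}
	}

	int main(void)
	{
		long long pos = 0, size = 4096;

		printf("%lld\n", fixed_lseek(&pos, 100, SEEK_SET, size)); /* 100 */
		printf("%lld\n", fixed_lseek(&pos, -10, SEEK_END, size)); /* 4086 */
		printf("%lld\n", fixed_lseek(&pos, 1, SEEK_END, size));   /* -EINVAL */
		return 0;
	}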
diff --git a/fs/readdir.c b/fs/readdir.c
index fee38e04fae4..93d71e574310 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -20,11 +20,11 @@
20 20
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22 22
23int vfs_readdir(struct file *file, filldir_t filler, void *buf) 23int iterate_dir(struct file *file, struct dir_context *ctx)
24{ 24{
25 struct inode *inode = file_inode(file); 25 struct inode *inode = file_inode(file);
26 int res = -ENOTDIR; 26 int res = -ENOTDIR;
27 if (!file->f_op || !file->f_op->readdir) 27 if (!file->f_op || !file->f_op->iterate)
28 goto out; 28 goto out;
29 29
30 res = security_file_permission(file, MAY_READ); 30 res = security_file_permission(file, MAY_READ);
@@ -37,15 +37,16 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf)
37 37
38 res = -ENOENT; 38 res = -ENOENT;
39 if (!IS_DEADDIR(inode)) { 39 if (!IS_DEADDIR(inode)) {
40 res = file->f_op->readdir(file, buf, filler); 40 ctx->pos = file->f_pos;
41 res = file->f_op->iterate(file, ctx);
42 file->f_pos = ctx->pos;
41 file_accessed(file); 43 file_accessed(file);
42 } 44 }
 	mutex_unlock(&inode->i_mutex);
 out:
 	return res;
 }
-
-EXPORT_SYMBOL(vfs_readdir);
+EXPORT_SYMBOL(iterate_dir);
 
 /*
  * Traditional linux readdir() handling..
@@ -66,6 +67,7 @@ struct old_linux_dirent {
 };
 
 struct readdir_callback {
+	struct dir_context ctx;
 	struct old_linux_dirent __user * dirent;
 	int result;
 };
@@ -73,7 +75,7 @@ struct readdir_callback {
 static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset,
 		      u64 ino, unsigned int d_type)
 {
-	struct readdir_callback * buf = (struct readdir_callback *) __buf;
+	struct readdir_callback *buf = (struct readdir_callback *) __buf;
 	struct old_linux_dirent __user * dirent;
 	unsigned long d_ino;
 
@@ -107,15 +109,15 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
 {
 	int error;
 	struct fd f = fdget(fd);
-	struct readdir_callback buf;
+	struct readdir_callback buf = {
+		.ctx.actor = fillonedir,
+		.dirent = dirent
+	};
 
 	if (!f.file)
 		return -EBADF;
 
-	buf.result = 0;
-	buf.dirent = dirent;
-
-	error = vfs_readdir(f.file, fillonedir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (buf.result)
 		error = buf.result;
 
@@ -137,6 +139,7 @@ struct linux_dirent {
 };
 
 struct getdents_callback {
+	struct dir_context ctx;
 	struct linux_dirent __user * current_dir;
 	struct linux_dirent __user * previous;
 	int count;
@@ -191,7 +194,11 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 {
 	struct fd f;
 	struct linux_dirent __user * lastdirent;
-	struct getdents_callback buf;
+	struct getdents_callback buf = {
+		.ctx.actor = filldir,
+		.count = count,
+		.current_dir = dirent
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -201,17 +208,12 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, filldir, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		if (put_user(f.file->f_pos, &lastdirent->d_off))
+		if (put_user(buf.ctx.pos, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;
@@ -221,6 +223,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 }
 
 struct getdents_callback64 {
+	struct dir_context ctx;
 	struct linux_dirent64 __user * current_dir;
 	struct linux_dirent64 __user * previous;
 	int count;
@@ -271,7 +274,11 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 {
 	struct fd f;
 	struct linux_dirent64 __user * lastdirent;
-	struct getdents_callback64 buf;
+	struct getdents_callback64 buf = {
+		.ctx.actor = filldir64,
+		.count = count,
+		.current_dir = dirent
+	};
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
@@ -281,17 +288,12 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 	if (!f.file)
 		return -EBADF;
 
-	buf.current_dir = dirent;
-	buf.previous = NULL;
-	buf.count = count;
-	buf.error = 0;
-
-	error = vfs_readdir(f.file, filldir64, &buf);
+	error = iterate_dir(f.file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		typeof(lastdirent->d_off) d_off = f.file->f_pos;
+		typeof(lastdirent->d_off) d_off = buf.ctx.pos;
 		if (__put_user(d_off, &lastdirent->d_off))
 			error = -EFAULT;
 		else
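
The fs/readdir.c hunks above are the core of this conversion: vfs_readdir(file, filldir, buf) becomes iterate_dir(file, &buf.ctx), and the actor callback plus the directory position now travel in a struct dir_context embedded as the first member of each callback buffer instead of in a bare function pointer and file->f_pos. A minimal sketch of a caller under the new API (count_buf, fill_one and count_entries are illustrative names, not kernel functions; only the buffer layout mirrors the hunks above):

#include <linux/fs.h>

/* Sketch: the dir_context must be the first member, so the void *__buf
 * handed to the actor is also a pointer to the whole buffer. */
struct count_buf {
	struct dir_context ctx;		/* .actor and .pos live here */
	int entries;			/* private state for this caller */
};

static int fill_one(void *__buf, const char *name, int namlen,
		    loff_t offset, u64 ino, unsigned int d_type)
{
	struct count_buf *buf = __buf;	/* valid because ctx is first */

	buf->entries++;
	return 0;			/* non-zero stops the iteration */
}

static int count_entries(struct file *dir)
{
	struct count_buf buf = {
		.ctx.actor = fill_one,
	};
	int error = iterate_dir(dir, &buf.ctx);

	return error ?: buf.entries;
}

After iterate_dir() returns, the final position is available as buf.ctx.pos, which is exactly how the getdents hunks above recover the value they previously read from f.file->f_pos.
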
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 6c2d136561cb..03e4ca5624d6 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -13,14 +13,14 @@
 
 extern const struct reiserfs_key MIN_KEY;
 
-static int reiserfs_readdir(struct file *, void *, filldir_t);
+static int reiserfs_readdir(struct file *, struct dir_context *);
 static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
 			      int datasync);
 
 const struct file_operations reiserfs_dir_operations = {
 	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
-	.readdir = reiserfs_readdir,
+	.iterate = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
 	.unlocked_ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
@@ -50,18 +50,15 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
 
 #define store_ih(where,what) copy_item_head (where, what)
 
-static inline bool is_privroot_deh(struct dentry *dir,
-				   struct reiserfs_de_head *deh)
+static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh)
 {
-	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-	return (dir == dir->d_parent && privroot->d_inode &&
+	struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root;
+	return (privroot->d_inode &&
 		deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
 
-int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
-			    filldir_t filldir, loff_t *pos)
+int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
 {
-	struct inode *inode = dentry->d_inode;
 	struct cpu_key pos_key;	/* key of current position in the directory (key of directory entry) */
 	INITIALIZE_PATH(path_to_entry);
 	struct buffer_head *bh;
@@ -81,7 +78,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 
 	/* form key for search the next directory entry using f_pos field of
 	   file structure */
-	make_cpu_key(&pos_key, inode, *pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
+	make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
 	next_pos = cpu_key_k_offset(&pos_key);
 
 	path_to_entry.reada = PATH_READA;
@@ -126,7 +123,6 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 		     entry_num++, deh++) {
 			int d_reclen;
 			char *d_name;
-			off_t d_off;
 			ino_t d_ino;
 
 			if (!de_visible(deh))
@@ -155,11 +151,10 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 			}
 
 			/* Ignore the .reiserfs_priv entry */
-			if (is_privroot_deh(dentry, deh))
+			if (is_privroot_deh(inode, deh))
 				continue;
 
-			d_off = deh_offset(deh);
-			*pos = d_off;
+			ctx->pos = deh_offset(deh);
 			d_ino = deh_objectid(deh);
 			if (d_reclen <= 32) {
 				local_buf = small_buf;
@@ -187,9 +182,9 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 			 * the write lock here for other waiters
 			 */
 			reiserfs_write_unlock(inode->i_sb);
-			if (filldir
-			    (dirent, local_buf, d_reclen, d_off, d_ino,
-			     DT_UNKNOWN) < 0) {
+			if (!dir_emit
+			    (ctx, local_buf, d_reclen, d_ino,
+			     DT_UNKNOWN)) {
 				reiserfs_write_lock(inode->i_sb);
 				if (local_buf != small_buf) {
 					kfree(local_buf);
@@ -237,7 +232,7 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
 	}			/* while */
 
 end:
-	*pos = next_pos;
+	ctx->pos = next_pos;
 	pathrelse(&path_to_entry);
 	reiserfs_check_path(&path_to_entry);
 out:
@@ -245,10 +240,9 @@ out:
 	return ret;
 }
 
-static int reiserfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = file->f_path.dentry;
-	return reiserfs_readdir_dentry(dentry, dirent, filldir, &file->f_pos);
+	return reiserfs_readdir_inode(file_inode(file), ctx);
 }
 
 /* compose directory item containing "." and ".." entries (entries are
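
On the filesystem side of the same conversion, an ->iterate method reports entries through dir_emit() and keeps its cursor in ctx->pos: dir_emit() returns false once the user buffer is full, the method returns 0, and the VFS calls it again later starting from the saved ctx->pos. A hedged sketch of the shape, not tied to reiserfs internals (examplefs_entry_name()/examplefs_entry_ino() are hypothetical lookup helpers):

static int examplefs_iterate(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);

	/* Emit "." and ".." first; dir_emit_dots() advances ctx->pos to 2. */
	if (!dir_emit_dots(file, ctx))
		return 0;

	/* Walk the on-disk entries starting at the saved cursor. */
	for (; ctx->pos < inode->i_size; ctx->pos++) {
		const char *name = examplefs_entry_name(inode, ctx->pos);
		u64 ino = examplefs_entry_ino(inode, ctx->pos);

		if (!name)			/* hole in the directory */
			continue;
		if (!dir_emit(ctx, name, strlen(name), ino, DT_UNKNOWN))
			return 0;	/* buffer full; resume here later */
	}
	return 0;
}

The reiserfs hunk above follows the same contract: it stores deh_offset(deh) into ctx->pos before calling dir_emit(), so a short read resumes at the entry that did not fit.
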
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f844533792ee..0048cc16a6a8 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2975,16 +2975,19 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
 }
 
 /* clm -- taken from fs/buffer.c:block_invalidate_page */
-static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
+static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
+				    unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
 	struct inode *inode = page->mapping->host;
 	unsigned int curr_off = 0;
+	unsigned int stop = offset + length;
+	int partial_page = (offset || length < PAGE_CACHE_SIZE);
 	int ret = 1;
 
 	BUG_ON(!PageLocked(page));
 
-	if (offset == 0)
+	if (!partial_page)
 		ClearPageChecked(page);
 
 	if (!page_has_buffers(page))
@@ -2996,6 +2999,9 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
 		unsigned int next_off = curr_off + bh->b_size;
 		next = bh->b_this_page;
 
+		if (next_off > stop)
+			goto out;
+
 		/*
 		 * is this block fully invalidated?
 		 */
@@ -3014,7 +3020,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned long offset)
 	 * The get_block cached value has been unconditionally invalidated,
 	 * so real IO is not possible anymore.
 	 */
-	if (!offset && ret) {
+	if (!partial_page && ret) {
 		ret = try_to_release_page(page, 0);
 		/* maybe should BUG_ON(!ret); - neilb */
 	}
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 33532f79b4f7..a958444a75fc 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -19,12 +19,13 @@
 /*
  * LOCKING:
  *
- * We rely on new Alexander Viro's super-block locking.
+ * These guys are evicted from procfs as the very first step in ->kill_sb().
  *
  */
 
-static int show_version(struct seq_file *m, struct super_block *sb)
+static int show_version(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	char *format;
 
 	if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) {
@@ -66,8 +67,9 @@ static int show_version(struct seq_file *m, struct super_block *sb)
 #define DJP( x ) le32_to_cpu( jp -> x )
 #define JF( x ) ( r -> s_journal -> x )
 
-static int show_super(struct seq_file *m, struct super_block *sb)
+static int show_super(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
 
 	seq_printf(m, "state: \t%s\n"
@@ -128,8 +130,9 @@ static int show_super(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-static int show_per_level(struct seq_file *m, struct super_block *sb)
+static int show_per_level(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
 	int level;
 
@@ -186,8 +189,9 @@ static int show_per_level(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-static int show_bitmap(struct seq_file *m, struct super_block *sb)
+static int show_bitmap(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
 
 	seq_printf(m, "free_block: %lu\n"
@@ -218,8 +222,9 @@ static int show_bitmap(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-static int show_on_disk_super(struct seq_file *m, struct super_block *sb)
+static int show_on_disk_super(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
 	struct reiserfs_super_block *rs = sb_info->s_rs;
 	int hash_code = DFL(s_hash_function_code);
@@ -261,8 +266,9 @@ static int show_on_disk_super(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-static int show_oidmap(struct seq_file *m, struct super_block *sb)
+static int show_oidmap(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
 	struct reiserfs_super_block *rs = sb_info->s_rs;
 	unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize);
@@ -291,8 +297,9 @@ static int show_oidmap(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-static int show_journal(struct seq_file *m, struct super_block *sb)
+static int show_journal(struct seq_file *m, void *unused)
 {
+	struct super_block *sb = m->private;
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
 	struct reiserfs_super_block *rs = r->s_rs;
 	struct journal_params *jp = &rs->s_v1.s_journal;
@@ -383,92 +390,24 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-/* iterator */
-static int test_sb(struct super_block *sb, void *data)
-{
-	return data == sb;
-}
-
-static int set_sb(struct super_block *sb, void *data)
-{
-	return -ENOENT;
-}
-
-struct reiserfs_seq_private {
-	struct super_block *sb;
-	int (*show) (struct seq_file *, struct super_block *);
-};
-
-static void *r_start(struct seq_file *m, loff_t * pos)
-{
-	struct reiserfs_seq_private *priv = m->private;
-	loff_t l = *pos;
-
-	if (l)
-		return NULL;
-
-	if (IS_ERR(sget(&reiserfs_fs_type, test_sb, set_sb, 0, priv->sb)))
-		return NULL;
-
-	up_write(&priv->sb->s_umount);
-	return priv->sb;
-}
-
-static void *r_next(struct seq_file *m, void *v, loff_t * pos)
-{
-	++*pos;
-	if (v)
-		deactivate_super(v);
-	return NULL;
-}
-
-static void r_stop(struct seq_file *m, void *v)
-{
-	if (v)
-		deactivate_super(v);
-}
-
-static int r_show(struct seq_file *m, void *v)
-{
-	struct reiserfs_seq_private *priv = m->private;
-	return priv->show(m, v);
-}
-
-static const struct seq_operations r_ops = {
-	.start = r_start,
-	.next = r_next,
-	.stop = r_stop,
-	.show = r_show,
-};
-
 static int r_open(struct inode *inode, struct file *file)
 {
-	struct reiserfs_seq_private *priv;
-	int ret = seq_open_private(file, &r_ops,
-				   sizeof(struct reiserfs_seq_private));
-
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		priv = m->private;
-		priv->sb = proc_get_parent_data(inode);
-		priv->show = PDE_DATA(inode);
-	}
-	return ret;
+	return single_open(file, PDE_DATA(inode),
+			   proc_get_parent_data(inode));
 }
 
 static const struct file_operations r_file_operations = {
 	.open = r_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = seq_release_private,
-	.owner = THIS_MODULE,
+	.release = single_release,
 };
 
 static struct proc_dir_entry *proc_info_root = NULL;
 static const char proc_info_root_name[] = "fs/reiserfs";
 
 static void add_file(struct super_block *sb, char *name,
-		     int (*func) (struct seq_file *, struct super_block *))
+		     int (*func) (struct seq_file *, void *))
 {
 	proc_create_data(name, 0, REISERFS_SB(sb)->procdir,
 			 &r_file_operations, func);
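
The procfs.c rewrite replaces a hand-rolled seq_operations iterator, which took and dropped a superblock reference on every read via sget()/deactivate_super(), with single_open(): each file shows exactly one object, and single_open() puts its third argument into m->private. A minimal sketch of the same open/release pairing (example_show and its output are illustrative; in the reiserfs code above the show routine itself is stored in PDE_DATA() and the superblock in the parent's data):

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *unused)
{
	/* whatever was passed as the third argument to single_open() */
	seq_printf(m, "state:\t%s\n", (char *)m->private);
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, PDE_DATA(inode));
}

static const struct file_operations example_proc_fops = {
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

The matching reiserfs/super.c hunks below move reiserfs_proc_info_done() from ->put_super to the top of ->kill_sb, which is what makes the simpler life cycle safe: the proc entries are gone before the superblock starts going away, so a concurrent read can no longer race with teardown.
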
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 157e474ab303..3df5ce6c724d 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2709,7 +2709,7 @@ extern const struct inode_operations reiserfs_dir_inode_operations;
 extern const struct inode_operations reiserfs_symlink_inode_operations;
 extern const struct inode_operations reiserfs_special_inode_operations;
 extern const struct file_operations reiserfs_dir_operations;
-int reiserfs_readdir_dentry(struct dentry *, void *, filldir_t, loff_t *);
+int reiserfs_readdir_inode(struct inode *, struct dir_context *);
 
 /* tail_conversion.c */
 int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f8a23c3078f8..e2e202a07b31 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -499,6 +499,7 @@ int remove_save_link(struct inode *inode, int truncate)
 static void reiserfs_kill_sb(struct super_block *s)
 {
 	if (REISERFS_SB(s)) {
+		reiserfs_proc_info_done(s);
 		/*
 		 * Force any pending inode evictions to occur now. Any
 		 * inodes to be removed that have extended attributes
@@ -554,8 +555,6 @@ static void reiserfs_put_super(struct super_block *s)
 			 REISERFS_SB(s)->reserved_blocks);
 	}
 
-	reiserfs_proc_info_done(s);
-
 	reiserfs_write_unlock(s);
 	mutex_destroy(&REISERFS_SB(s)->lock);
 	kfree(s->s_fs_info);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 821bcf70e467..c69cdd749f09 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -171,6 +171,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
  * modifying extended attributes. This includes operations such as permissions
  * or ownership changes, object deletions, etc. */
 struct reiserfs_dentry_buf {
+	struct dir_context ctx;
 	struct dentry *xadir;
 	int count;
 	struct dentry *dentries[8];
@@ -223,9 +224,8 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 {
 	struct dentry *dir;
 	int i, err = 0;
-	loff_t pos = 0;
 	struct reiserfs_dentry_buf buf = {
-		.count = 0,
+		.ctx.actor = fill_with_dentries,
 	};
 
 	/* Skip out, an xattr has no xattrs associated with it */
@@ -249,29 +249,27 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 	reiserfs_write_lock(inode->i_sb);
 
 	buf.xadir = dir;
-	err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos);
-	while ((err == 0 || err == -ENOSPC) && buf.count) {
-		err = 0;
-
-		for (i = 0; i < buf.count && buf.dentries[i]; i++) {
-			int lerr = 0;
+	while (1) {
+		err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
+		if (err)
+			break;
+		if (!buf.count)
+			break;
+		for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) {
 			struct dentry *dentry = buf.dentries[i];
 
-			if (err == 0 && !S_ISDIR(dentry->d_inode->i_mode))
-				lerr = action(dentry, data);
+			if (!S_ISDIR(dentry->d_inode->i_mode))
+				err = action(dentry, data);
 
 			dput(dentry);
 			buf.dentries[i] = NULL;
-			err = lerr ?: err;
 		}
+		if (err)
+			break;
 		buf.count = 0;
-		if (!err)
-			err = reiserfs_readdir_dentry(dir, &buf,
-						      fill_with_dentries, &pos);
 	}
 	mutex_unlock(&dir->d_inode->i_mutex);
 
-	/* Clean up after a failed readdir */
 	cleanup_dentry_buf(&buf);
 
 	if (!err) {
@@ -800,6 +798,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
 }
 
 struct listxattr_buf {
+	struct dir_context ctx;
 	size_t size;
 	size_t pos;
 	char *buf;
@@ -845,8 +844,8 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 {
 	struct dentry *dir;
 	int err = 0;
-	loff_t pos = 0;
 	struct listxattr_buf buf = {
+		.ctx.actor = listxattr_filler,
 		.dentry = dentry,
 		.buf = buffer,
 		.size = buffer ? size : 0,
@@ -868,7 +867,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 	}
 
 	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
-	err = reiserfs_readdir_dentry(dir, &buf, listxattr_filler, &pos);
+	err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx);
 	mutex_unlock(&dir->d_inode->i_mutex);
 
 	if (!err)
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 15cbc41ee365..ff1d3d42e72a 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -145,19 +145,18 @@ static const struct address_space_operations romfs_aops = {
 /*
  * read the entries from a directory
  */
-static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int romfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct inode *i = file_inode(filp);
+	struct inode *i = file_inode(file);
 	struct romfs_inode ri;
 	unsigned long offset, maxoff;
 	int j, ino, nextfh;
-	int stored = 0;
 	char fsname[ROMFS_MAXFN];	/* XXX dynamic? */
 	int ret;
 
 	maxoff = romfs_maxsize(i->i_sb);
 
-	offset = filp->f_pos;
+	offset = ctx->pos;
 	if (!offset) {
 		offset = i->i_ino & ROMFH_MASK;
 		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -170,10 +169,10 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	for (;;) {
 		if (!offset || offset >= maxoff) {
 			offset = maxoff;
-			filp->f_pos = offset;
+			ctx->pos = offset;
 			goto out;
 		}
-		filp->f_pos = offset;
+		ctx->pos = offset;
 
 		/* Fetch inode info */
 		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
@@ -194,16 +193,14 @@ static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		nextfh = be32_to_cpu(ri.next);
 		if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
 			ino = be32_to_cpu(ri.spec);
-		if (filldir(dirent, fsname, j, offset, ino,
-			    romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
+		if (!dir_emit(ctx, fsname, j, ino,
+			    romfs_dtype_table[nextfh & ROMFH_TYPE]))
 			goto out;
 
-		stored++;
 		offset = nextfh & ROMFH_MASK;
 	}
-
 out:
-	return stored;
+	return 0;
 }
 
 /*
@@ -281,7 +278,7 @@ error:
 
 static const struct file_operations romfs_dir_operations = {
 	.read = generic_read_dir,
-	.readdir = romfs_readdir,
+	.iterate = romfs_readdir,
 	.llseek = default_llseek,
 };
 
diff --git a/fs/select.c b/fs/select.c
index 8c1c96c27062..35d4adc749d9 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -27,6 +27,8 @@
 #include <linux/rcupdate.h>
 #include <linux/hrtimer.h>
 #include <linux/sched/rt.h>
+#include <linux/freezer.h>
+#include <net/busy_poll.h>
 
 #include <asm/uaccess.h>
 
@@ -236,7 +238,8 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 
 	set_current_state(state);
 	if (!pwq->triggered)
-		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+		rc = freezable_schedule_hrtimeout_range(expires, slack,
+							HRTIMER_MODE_ABS);
 	__set_current_state(TASK_RUNNING);
 
 	/*
@@ -384,9 +387,10 @@ get_max:
 #define POLLEX_SET (POLLPRI)
 
 static inline void wait_key_set(poll_table *wait, unsigned long in,
-				unsigned long out, unsigned long bit)
+				unsigned long out, unsigned long bit,
+				unsigned int ll_flag)
 {
-	wait->_key = POLLEX_SET;
+	wait->_key = POLLEX_SET | ll_flag;
 	if (in & bit)
 		wait->_key |= POLLIN_SET;
 	if (out & bit)
@@ -400,6 +404,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	poll_table *wait;
 	int retval, i, timed_out = 0;
 	unsigned long slack = 0;
+	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+	unsigned long busy_end = 0;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -422,6 +428,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	retval = 0;
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
+		bool can_busy_loop = false;
 
 		inp = fds->in; outp = fds->out; exp = fds->ex;
 		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -449,7 +456,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 				f_op = f.file->f_op;
 				mask = DEFAULT_POLLMASK;
 				if (f_op && f_op->poll) {
-					wait_key_set(wait, in, out, bit);
+					wait_key_set(wait, in, out,
+						     bit, busy_flag);
 					mask = (*f_op->poll)(f.file, wait);
 				}
 				fdput(f);
@@ -468,6 +476,18 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 					retval++;
 					wait->_qproc = NULL;
 				}
+				/* got something, stop busy polling */
+				if (retval) {
+					can_busy_loop = false;
+					busy_flag = 0;
+
+				/*
+				 * only remember a returned
+				 * POLL_BUSY_LOOP if we asked for it
+				 */
+				} else if (busy_flag & mask)
+					can_busy_loop = true;
+
 			}
 		}
 		if (res_in)
@@ -486,6 +506,17 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 			break;
 		}
 
+		/* only if found POLL_BUSY_LOOP sockets && not out of time */
+		if (can_busy_loop && !need_resched()) {
+			if (!busy_end) {
+				busy_end = busy_loop_end_time();
+				continue;
+			}
+			if (!busy_loop_timeout(busy_end))
+				continue;
+		}
+		busy_flag = 0;
+
 		/*
 		 * If this is the first loop and we have a timeout
 		 * given, then we convert to ktime_t and set the to
@@ -717,7 +748,9 @@ struct poll_list {
  * pwait poll_table will be used by the fd-provided poll handler for waiting,
  * if pwait->_qproc is non-NULL.
  */
-static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
+				     bool *can_busy_poll,
+				     unsigned int busy_flag)
 {
 	unsigned int mask;
 	int fd;
@@ -731,7 +764,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 		mask = DEFAULT_POLLMASK;
 		if (f.file->f_op && f.file->f_op->poll) {
 			pwait->_key = pollfd->events|POLLERR|POLLHUP;
+			pwait->_key |= busy_flag;
 			mask = f.file->f_op->poll(f.file, pwait);
+			if (mask & busy_flag)
+				*can_busy_poll = true;
 		}
 		/* Mask out unneeded events. */
 		mask &= pollfd->events | POLLERR | POLLHUP;
@@ -750,6 +786,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 	ktime_t expire, *to = NULL;
 	int timed_out = 0, count = 0;
 	unsigned long slack = 0;
+	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
+	unsigned long busy_end = 0;
 
 	/* Optimise the no-wait case */
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -762,6 +800,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 
 	for (;;) {
 		struct poll_list *walk;
+		bool can_busy_loop = false;
 
 		for (walk = list; walk != NULL; walk = walk->next) {
 			struct pollfd * pfd, * pfd_end;
@@ -776,9 +815,13 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 				 * this. They'll get immediately deregistered
 				 * when we break out and return.
 				 */
-				if (do_pollfd(pfd, pt)) {
+				if (do_pollfd(pfd, pt, &can_busy_loop,
+					      busy_flag)) {
 					count++;
 					pt->_qproc = NULL;
+					/* found something, stop busy polling */
+					busy_flag = 0;
+					can_busy_loop = false;
 				}
 			}
 		}
@@ -795,6 +838,17 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 		if (count || timed_out)
 			break;
 
+		/* only if found POLL_BUSY_LOOP sockets && not out of time */
+		if (can_busy_loop && !need_resched()) {
+			if (!busy_end) {
+				busy_end = busy_loop_end_time();
+				continue;
+			}
+			if (!busy_loop_timeout(busy_end))
+				continue;
+		}
+		busy_flag = 0;
+
 		/*
 		 * If this is the first loop and we have a timeout
 		 * given, then we convert to ktime_t and set the to
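
Both do_select() and do_poll() gain the same low-latency pattern here: POLL_BUSY_LOOP is ORed into the poll key, a ->poll handler that supports busy polling echoes the flag back in its mask, and instead of sleeping immediately the loop spins on the socket until an event arrives, the busy budget expires, or the scheduler needs the CPU. Reduced to its control flow, the shared skeleton looks like this (a condensed sketch of the hunks above, not a verbatim excerpt; the per-fd scan is elided):

	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_end = 0;

	for (;;) {
		bool can_busy_loop = false;

		/* scan the fds, passing busy_flag in the poll key; a handler
		 * that returns POLL_BUSY_LOOP sets can_busy_loop = true,
		 * and any real event clears it and zeroes busy_flag */

		if (count || timed_out)
			break;

		if (can_busy_loop && !need_resched()) {
			if (!busy_end) {
				busy_end = busy_loop_end_time();
				continue;	/* start the busy budget */
			}
			if (!busy_loop_timeout(busy_end))
				continue;	/* budget not yet spent */
		}
		busy_flag = 0;	/* give up busy polling for this call */

		/* fall through to poll_schedule_timeout() and sleep */
	}

The freezable_schedule_hrtimeout_range() change in poll_schedule_timeout() is independent of this: it lets a task blocked in select/poll be frozen for suspend without the freeze counting as a wakeup.
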
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 774c1eb7f1c9..3135c2525c76 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -921,3 +921,57 @@ struct hlist_node *seq_hlist_next_rcu(void *v,
 	return rcu_dereference(node->next);
 }
 EXPORT_SYMBOL(seq_hlist_next_rcu);
+
+/**
+ * seq_hlist_start_percpu - start an iteration of a percpu hlist array
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu:  pointer to cpu "cursor"
+ * @pos:  start position of sequence
+ *
+ * Called at seq_file->op->start().
+ */
+struct hlist_node *
+seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos)
+{
+	struct hlist_node *node;
+
+	for_each_possible_cpu(*cpu) {
+		hlist_for_each(node, per_cpu_ptr(head, *cpu)) {
+			if (pos-- == 0)
+				return node;
+		}
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_start_percpu);
+
+/**
+ * seq_hlist_next_percpu - move to the next position of the percpu hlist array
+ * @v:    pointer to current hlist_node
+ * @head: pointer to percpu array of struct hlist_heads
+ * @cpu:  pointer to cpu "cursor"
+ * @pos:  start position of sequence
+ *
+ * Called at seq_file->op->next().
+ */
+struct hlist_node *
+seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
+		      int *cpu, loff_t *pos)
+{
+	struct hlist_node *node = v;
+
+	++*pos;
+
+	if (node->next)
+		return node->next;
+
+	for (*cpu = cpumask_next(*cpu, cpu_possible_mask); *cpu < nr_cpu_ids;
+	     *cpu = cpumask_next(*cpu, cpu_possible_mask)) {
+		struct hlist_head *bucket = per_cpu_ptr(head, *cpu);
+
+		if (!hlist_empty(bucket))
+			return bucket->first;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(seq_hlist_next_percpu);
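
The two new helpers flatten a percpu array of hlists into a single seq_file stream, with the current CPU kept in a cursor the caller owns (typically in the seq_file private data, since *pos alone cannot encode both the CPU and the position within its list). A sketch of the start/next pair that would sit on top (example_iter and example_list are hypothetical):

struct example_iter {
	int cpu;		/* cursor handed to the percpu helpers */
};

static DEFINE_PER_CPU(struct hlist_head, example_list);

static void *example_seq_start(struct seq_file *m, loff_t *pos)
{
	struct example_iter *iter = m->private;

	return seq_hlist_start_percpu(&example_list, &iter->cpu, *pos);
}

static void *example_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct example_iter *iter = m->private;

	return seq_hlist_next_percpu(v, &example_list, &iter->cpu, pos);
}

Note that seq_hlist_start_percpu() is O(pos) on every ->start(), so this suits short lists; a restarted read simply re-walks from the first possible CPU.
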
diff --git a/fs/splice.c b/fs/splice.c
index d37431dd60a1..3b7ee656f3aa 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1098,27 +1098,13 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 				loff_t *, size_t, unsigned int);
-	int ret;
-
-	if (unlikely(!(out->f_mode & FMODE_WRITE)))
-		return -EBADF;
-
-	if (unlikely(out->f_flags & O_APPEND))
-		return -EINVAL;
-
-	ret = rw_verify_area(WRITE, out, ppos, len);
-	if (unlikely(ret < 0))
-		return ret;
 
 	if (out->f_op && out->f_op->splice_write)
 		splice_write = out->f_op->splice_write;
 	else
 		splice_write = default_file_splice_write;
 
-	file_start_write(out);
-	ret = splice_write(pipe, out, ppos, len, flags);
-	file_end_write(out);
-	return ret;
+	return splice_write(pipe, out, ppos, len, flags);
 }
 
 /*
@@ -1307,6 +1293,16 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 	};
 	long ret;
 
+	if (unlikely(!(out->f_mode & FMODE_WRITE)))
+		return -EBADF;
+
+	if (unlikely(out->f_flags & O_APPEND))
+		return -EINVAL;
+
+	ret = rw_verify_area(WRITE, out, opos, len);
+	if (unlikely(ret < 0))
+		return ret;
+
 	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
 	if (ret > 0)
 		*ppos = sd.pos;
@@ -1362,7 +1358,19 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		offset = out->f_pos;
 	}
 
+	if (unlikely(!(out->f_mode & FMODE_WRITE)))
+		return -EBADF;
+
+	if (unlikely(out->f_flags & O_APPEND))
+		return -EINVAL;
+
+	ret = rw_verify_area(WRITE, out, &offset, len);
+	if (unlikely(ret < 0))
+		return ret;
+
+	file_start_write(out);
 	ret = do_splice_from(ipipe, out, &offset, len, flags);
+	file_end_write(out);
 
 	if (!off_out)
 		out->f_pos = offset;
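
The net effect of the three splice.c hunks is a relocation rather than a behavior change on the common path: the FMODE_WRITE/O_APPEND checks and rw_verify_area() move out of do_splice_from() into its callers, so do_splice_direct() (the sendfile path) validates the output file once before splice_direct_to_actor() starts looping instead of once per chunk, and the freeze-protection bracket moves with them. The caller-side shape, condensed from the do_splice() hunk above (sketch; error handling exactly as shown there):

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;
	if (unlikely(out->f_flags & O_APPEND))
		return -EINVAL;
	ret = rw_verify_area(WRITE, out, &offset, len);
	if (unlikely(ret < 0))
		return ret;

	file_start_write(out);	/* block freezing for the whole transfer */
	ret = do_splice_from(ipipe, out, &offset, len, flags);
	file_end_write(out);
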
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 57dc70ebbb19..f7f527bf8c10 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -100,7 +100,7 @@ static int get_dir_index_using_offset(struct super_block *sb,
 }
 
 
-static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+static int squashfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
 	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
@@ -127,11 +127,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 	 * It also means that the external f_pos is offset by 3 from the
 	 * on-disk directory f_pos.
 	 */
-	while (file->f_pos < 3) {
+	while (ctx->pos < 3) {
 		char *name;
 		int i_ino;
 
-		if (file->f_pos == 0) {
+		if (ctx->pos == 0) {
 			name = ".";
 			size = 1;
 			i_ino = inode->i_ino;
@@ -141,24 +141,18 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 			i_ino = squashfs_i(inode)->parent;
 		}
 
-		TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
-				dirent, name, size, file->f_pos, i_ino,
-				squashfs_filetype_table[1]);
-
-		if (filldir(dirent, name, size, file->f_pos, i_ino,
-				squashfs_filetype_table[1]) < 0) {
-			TRACE("Filldir returned less than 0\n");
+		if (!dir_emit(ctx, name, size, i_ino,
+				squashfs_filetype_table[1]))
 			goto finish;
-		}
 
-		file->f_pos += size;
+		ctx->pos += size;
 	}
 
 	length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
 				squashfs_i(inode)->dir_idx_start,
 				squashfs_i(inode)->dir_idx_offset,
 				squashfs_i(inode)->dir_idx_cnt,
-				file->f_pos);
+				ctx->pos);
 
 	while (length < i_size_read(inode)) {
 		/*
@@ -198,7 +192,7 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 
 		length += sizeof(*dire) + size;
 
-		if (file->f_pos >= length)
+		if (ctx->pos >= length)
 			continue;
 
 		dire->name[size] = '\0';
@@ -206,22 +200,12 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 				((short) le16_to_cpu(dire->inode_number));
 		type = le16_to_cpu(dire->type);
 
-		TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
-				"\n", dirent, dire->name, size,
-				file->f_pos,
-				le32_to_cpu(dirh.start_block),
-				le16_to_cpu(dire->offset),
-				inode_number,
-				squashfs_filetype_table[type]);
-
-		if (filldir(dirent, dire->name, size, file->f_pos,
+		if (!dir_emit(ctx, dire->name, size,
 				inode_number,
-				squashfs_filetype_table[type]) < 0) {
-			TRACE("Filldir returned less than 0\n");
+				squashfs_filetype_table[type]))
 			goto finish;
-		}
 
-		file->f_pos = length;
+		ctx->pos = length;
 		}
 	}
 
@@ -238,6 +222,6 @@ failed_read:
 
 const struct file_operations squashfs_dir_ops = {
 	.read = generic_read_dir,
-	.readdir = squashfs_readdir,
+	.iterate = squashfs_readdir,
 	.llseek = default_llseek,
 };
diff --git a/fs/super.c b/fs/super.c
index 7465d4364208..68307c029228 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -336,19 +336,19 @@ EXPORT_SYMBOL(deactivate_super);
  * and want to turn it into a full-blown active reference. grab_super()
  * is called with sb_lock held and drops it. Returns 1 in case of
  * success, 0 if we had failed (superblock contents was already dead or
- * dying when grab_super() had been called).
+ * dying when grab_super() had been called). Note that this is only
+ * called for superblocks not in rundown mode (== ones still on ->fs_supers
+ * of their type), so increment of ->s_count is OK here.
  */
 static int grab_super(struct super_block *s) __releases(sb_lock)
 {
-	if (atomic_inc_not_zero(&s->s_active)) {
-		spin_unlock(&sb_lock);
-		return 1;
-	}
-	/* it's going away */
 	s->s_count++;
 	spin_unlock(&sb_lock);
-	/* wait for it to die */
 	down_write(&s->s_umount);
+	if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) {
+		put_super(s);
+		return 1;
+	}
 	up_write(&s->s_umount);
 	put_super(s);
 	return 0;
@@ -463,11 +463,6 @@ retry:
 			destroy_super(s);
 			s = NULL;
 		}
-		down_write(&old->s_umount);
-		if (unlikely(!(old->s_flags & MS_BORN))) {
-			deactivate_locked_super(old);
-			goto retry;
-		}
 		return old;
 	}
 }
@@ -660,10 +655,10 @@ restart:
 		if (hlist_unhashed(&sb->s_instances))
 			continue;
 		if (sb->s_bdev == bdev) {
-			if (grab_super(sb)) /* drops sb_lock */
-				return sb;
-			else
+			if (!grab_super(sb))
 				goto restart;
+			up_write(&sb->s_umount);
+			return sb;
 		}
 	}
 	spin_unlock(&sb_lock);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e8e0e71b29d5..e068e744dbdd 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -74,7 +74,7 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left,
 }
 
 /**
- * sysfs_link_subling - link sysfs_dirent into sibling rbtree
+ * sysfs_link_sibling - link sysfs_dirent into sibling rbtree
  * @sd: sysfs_dirent of interest
  *
  * Link @sd into its sibling rbtree which starts from
@@ -998,68 +998,38 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
 	return pos;
 }
 
-static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int sysfs_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dentry *dentry = filp->f_path.dentry;
+	struct dentry *dentry = file->f_path.dentry;
 	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
-	struct sysfs_dirent *pos = filp->private_data;
+	struct sysfs_dirent *pos = file->private_data;
 	enum kobj_ns_type type;
 	const void *ns;
-	ino_t ino;
-	loff_t off;
 
 	type = sysfs_ns_type(parent_sd);
 	ns = sysfs_info(dentry->d_sb)->ns[type];
 
-	if (filp->f_pos == 0) {
-		ino = parent_sd->s_ino;
-		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
-			filp->f_pos++;
-		else
-			return 0;
-	}
-	if (filp->f_pos == 1) {
-		if (parent_sd->s_parent)
-			ino = parent_sd->s_parent->s_ino;
-		else
-			ino = parent_sd->s_ino;
-		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
-			filp->f_pos++;
-		else
-			return 0;
-	}
+	if (!dir_emit_dots(file, ctx))
+		return 0;
 	mutex_lock(&sysfs_mutex);
-	off = filp->f_pos;
-	for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
+	for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
 	     pos;
-	     pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
-		const char * name;
-		unsigned int type;
-		int len, ret;
-
-		name = pos->s_name;
-		len = strlen(name);
-		ino = pos->s_ino;
-		type = dt_type(pos);
-		off = filp->f_pos = pos->s_hash;
-		filp->private_data = sysfs_get(pos);
+	     pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
+		const char *name = pos->s_name;
+		unsigned int type = dt_type(pos);
+		int len = strlen(name);
+		ino_t ino = pos->s_ino;
+		ctx->pos = pos->s_hash;
+		file->private_data = sysfs_get(pos);
 
 		mutex_unlock(&sysfs_mutex);
-		ret = filldir(dirent, name, len, off, ino, type);
+		if (!dir_emit(ctx, name, len, ino, type))
+			return 0;
 		mutex_lock(&sysfs_mutex);
-		if (ret < 0)
-			break;
 	}
 	mutex_unlock(&sysfs_mutex);
-
-	/* don't reference last entry if its refcount is dropped */
-	if (!pos) {
-		filp->private_data = NULL;
-
-		/* EOF and not changed as 0 or 1 in read/write path */
-		if (off == filp->f_pos && off > 1)
-			filp->f_pos = INT_MAX;
-	}
+	file->private_data = NULL;
+	ctx->pos = INT_MAX;
 	return 0;
 }
 
@@ -1077,7 +1047,7 @@ static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
 
 const struct file_operations sysfs_dir_operations = {
 	.read = generic_read_dir,
-	.readdir = sysfs_readdir,
+	.iterate = sysfs_readdir,
 	.release = sysfs_dir_release,
 	.llseek = sysfs_dir_llseek,
 };
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 602f56db0442..d2bb7ed8fa74 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -449,10 +449,12 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
 
 	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
 
-	od = sd->s_attr.open;
-	if (od) {
-		atomic_inc(&od->event);
-		wake_up_interruptible(&od->poll);
+	if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
+		od = sd->s_attr.open;
+		if (od) {
+			atomic_inc(&od->event);
+			wake_up_interruptible(&od->poll);
+		}
 	}
 
 	spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index aec3d5c98c94..09a1a25cd145 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -20,38 +20,64 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			 const struct attribute_group *grp)
 {
 	struct attribute *const* attr;
-	int i;
+	struct bin_attribute *const* bin_attr;
 
-	for (i = 0, attr = grp->attrs; *attr; i++, attr++)
-		sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+	if (grp->attrs)
+		for (attr = grp->attrs; *attr; attr++)
+			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+	if (grp->bin_attrs)
+		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
+			sysfs_remove_bin_file(kobj, *bin_attr);
 }
 
 static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			const struct attribute_group *grp, int update)
 {
 	struct attribute *const* attr;
+	struct bin_attribute *const* bin_attr;
 	int error = 0, i;
 
-	for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
-		umode_t mode = 0;
+	if (grp->attrs) {
+		for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
+			umode_t mode = 0;
+
+			/*
+			 * In update mode, we're changing the permissions or
+			 * visibility. Do this by first removing then
+			 * re-adding (if required) the file.
+			 */
+			if (update)
+				sysfs_hash_and_remove(dir_sd, NULL,
+						      (*attr)->name);
+			if (grp->is_visible) {
+				mode = grp->is_visible(kobj, *attr, i);
+				if (!mode)
+					continue;
+			}
+			error = sysfs_add_file_mode(dir_sd, *attr,
+						    SYSFS_KOBJ_ATTR,
+						    (*attr)->mode | mode);
+			if (unlikely(error))
+				break;
+		}
+		if (error) {
+			remove_files(dir_sd, kobj, grp);
+			goto exit;
+		}
+	}
 
-		/* in update mode, we're changing the permissions or
-		 * visibility. Do this by first removing then
-		 * re-adding (if required) the file */
-		if (update)
-			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
-		if (grp->is_visible) {
-			mode = grp->is_visible(kobj, *attr, i);
-			if (!mode)
-				continue;
+	if (grp->bin_attrs) {
+		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+			if (update)
+				sysfs_remove_bin_file(kobj, *bin_attr);
+			error = sysfs_create_bin_file(kobj, *bin_attr);
+			if (error)
+				break;
 		}
-		error = sysfs_add_file_mode(dir_sd, *attr, SYSFS_KOBJ_ATTR,
-					    (*attr)->mode | mode);
-		if (unlikely(error))
-			break;
+		if (error)
+			remove_files(dir_sd, kobj, grp);
 	}
-	if (error)
-		remove_files(dir_sd, kobj, grp);
+exit:
 	return error;
 }
 
@@ -67,8 +93,8 @@ static int internal_create_group(struct kobject *kobj, int update,
 	/* Updates may happen before the object has been instantiated */
 	if (unlikely(update && !kobj->sd))
 		return -EINVAL;
-	if (!grp->attrs) {
-		WARN(1, "sysfs: attrs not set by subsystem for group: %s/%s\n",
+	if (!grp->attrs && !grp->bin_attrs) {
+		WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
 		     kobj->name, grp->name ? "" : grp->name);
 		return -EINVAL;
 	}
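
With bin_attrs in struct attribute_group, a driver can register normal and binary sysfs files in one step instead of pairing sysfs_create_group() with separate sysfs_create_bin_file() calls. A sketch of a group using the new field (the two attribute definitions are hypothetical placeholders):

static struct attribute *example_attrs[] = {
	&dev_attr_status.attr,		/* from a DEVICE_ATTR() somewhere */
	NULL,
};

static struct bin_attribute *example_bin_attrs[] = {
	&bin_attr_firmware,		/* from a BIN_ATTR() somewhere */
	NULL,
};

static const struct attribute_group example_group = {
	.attrs		= example_attrs,
	.bin_attrs	= example_bin_attrs,
};

/* sysfs_create_group(kobj, &example_group) now creates both kinds of
 * file, and sysfs_remove_group() tears both down. */

Either array may be omitted; per the internal_create_group() hunk above, the WARN fires only when both are missing.
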
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0ce3ccf7f401..3e2837a633ed 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -24,8 +24,6 @@
 #include <linux/security.h>
 #include "sysfs.h"
 
-extern struct super_block * sysfs_sb;
-
 static const struct address_space_operations sysfs_aops = {
 	.readpage = simple_readpage,
 	.write_begin = simple_write_begin,
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 3799e8dac3eb..d42291d08215 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -18,12 +18,12 @@
 #include <linux/swap.h>
 #include "sysv.h"
 
-static int sysv_readdir(struct file *, void *, filldir_t);
+static int sysv_readdir(struct file *, struct dir_context *);
 
 const struct file_operations sysv_dir_operations = {
 	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
-	.readdir = sysv_readdir,
+	.iterate = sysv_readdir,
 	.fsync = generic_file_fsync,
 };
 
@@ -65,18 +65,21 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
 	return page;
 }
 
-static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int sysv_readdir(struct file *file, struct dir_context *ctx)
 {
-	unsigned long pos = filp->f_pos;
-	struct inode *inode = file_inode(filp);
+	unsigned long pos = ctx->pos;
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	unsigned offset = pos & ~PAGE_CACHE_MASK;
-	unsigned long n = pos >> PAGE_CACHE_SHIFT;
 	unsigned long npages = dir_pages(inode);
+	unsigned offset;
+	unsigned long n;
 
-	pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
+	ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
 	if (pos >= inode->i_size)
-		goto done;
+		return 0;
+
+	offset = pos & ~PAGE_CACHE_MASK;
+	n = pos >> PAGE_CACHE_SHIFT;
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
@@ -88,29 +91,21 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		kaddr = (char *)page_address(page);
 		de = (struct sysv_dir_entry *)(kaddr+offset);
 		limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE;
-		for ( ;(char*)de <= limit; de++) {
+		for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
 			char *name = de->name;
-			int over;
 
 			if (!de->inode)
 				continue;
 
-			offset = (char *)de - kaddr;
-
-			over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN),
-					((loff_t)n<<PAGE_CACHE_SHIFT) | offset,
+			if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
 					fs16_to_cpu(SYSV_SB(sb), de->inode),
-					DT_UNKNOWN);
-			if (over) {
+					DT_UNKNOWN)) {
 				dir_put_page(page);
-				goto done;
+				return 0;
 			}
 		}
 		dir_put_page(page);
 	}
-
-done:
-	filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset;
 	return 0;
 }
 
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 1c0d5f264767..731b2bbcaab3 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,8 +27,7 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
 	return err;
 }
 
-static int sysv_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+static int sysv_hash(const struct dentry *dentry, struct qstr *qstr)
 {
 	/* Truncate the name in place, avoids having to define a compare
 	   function. */
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 32b644f03690..929312180dd0 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -8,6 +8,7 @@
  *
  */
 
+#include <linux/alarmtimer.h>
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/init.h>
@@ -26,7 +27,10 @@
 #include <linux/rcupdate.h>
 
 struct timerfd_ctx {
-	struct hrtimer tmr;
+	union {
+		struct hrtimer tmr;
+		struct alarm alarm;
+	} t;
 	ktime_t tintv;
 	ktime_t moffs;
 	wait_queue_head_t wqh;
@@ -41,14 +45,19 @@ struct timerfd_ctx {
 static LIST_HEAD(cancel_list);
 static DEFINE_SPINLOCK(cancel_lock);
 
+static inline bool isalarm(struct timerfd_ctx *ctx)
+{
+	return ctx->clockid == CLOCK_REALTIME_ALARM ||
+		ctx->clockid == CLOCK_BOOTTIME_ALARM;
+}
+
 /*
  * This gets called when the timer event triggers. We set the "expired"
  * flag, but we do not re-arm the timer (in case it's necessary,
  * tintv.tv64 != 0) until the timer is accessed.
  */
-static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+static void timerfd_triggered(struct timerfd_ctx *ctx)
 {
-	struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx, tmr);
 	unsigned long flags;
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
@@ -56,10 +65,25 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
 	ctx->ticks++;
 	wake_up_locked(&ctx->wqh);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+}
 
+static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
+{
+	struct timerfd_ctx *ctx = container_of(htmr, struct timerfd_ctx,
+					       t.tmr);
+	timerfd_triggered(ctx);
 	return HRTIMER_NORESTART;
 }
 
+static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
+						 ktime_t now)
+{
+	struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx,
+					       t.alarm);
+	timerfd_triggered(ctx);
+	return ALARMTIMER_NORESTART;
+}
+
 /*
  * Called when the clock was set to cancel the timers in the cancel
  * list. This will wake up processes waiting on these timers. The
@@ -107,8 +131,9 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx)
 
 static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
 {
-	if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) &&
-	    (flags & TFD_TIMER_CANCEL_ON_SET)) {
+	if ((ctx->clockid == CLOCK_REALTIME ||
+	     ctx->clockid == CLOCK_REALTIME_ALARM) &&
+	    (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
 		if (!ctx->might_cancel) {
 			ctx->might_cancel = true;
 			spin_lock(&cancel_lock);
@@ -124,7 +149,11 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
 {
 	ktime_t remaining;
 
-	remaining = hrtimer_expires_remaining(&ctx->tmr);
+	if (isalarm(ctx))
+		remaining = alarm_expires_remaining(&ctx->t.alarm);
+	else
+		remaining = hrtimer_expires_remaining(&ctx->t.tmr);
+
 	return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
 }
 
@@ -142,11 +171,28 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
 	ctx->expired = 0;
 	ctx->ticks = 0;
 	ctx->tintv = timespec_to_ktime(ktmr->it_interval);
-	hrtimer_init(&ctx->tmr, clockid, htmode);
-	hrtimer_set_expires(&ctx->tmr, texp);
-	ctx->tmr.function = timerfd_tmrproc;
+
+	if (isalarm(ctx)) {
+		alarm_init(&ctx->t.alarm,
+			   ctx->clockid == CLOCK_REALTIME_ALARM ?
+			   ALARM_REALTIME : ALARM_BOOTTIME,
+			   timerfd_alarmproc);
+	} else {
+		hrtimer_init(&ctx->t.tmr, clockid, htmode);
+		hrtimer_set_expires(&ctx->t.tmr, texp);
+		ctx->t.tmr.function = timerfd_tmrproc;
184 }
185
148 if (texp.tv64 != 0) { 186 if (texp.tv64 != 0) {
149 hrtimer_start(&ctx->tmr, texp, htmode); 187 if (isalarm(ctx)) {
188 if (flags & TFD_TIMER_ABSTIME)
189 alarm_start(&ctx->t.alarm, texp);
190 else
191 alarm_start_relative(&ctx->t.alarm, texp);
192 } else {
193 hrtimer_start(&ctx->t.tmr, texp, htmode);
194 }
195
150 if (timerfd_canceled(ctx)) 196 if (timerfd_canceled(ctx))
151 return -ECANCELED; 197 return -ECANCELED;
152 } 198 }
@@ -158,7 +204,11 @@ static int timerfd_release(struct inode *inode, struct file *file)
158 struct timerfd_ctx *ctx = file->private_data; 204 struct timerfd_ctx *ctx = file->private_data;
159 205
160 timerfd_remove_cancel(ctx); 206 timerfd_remove_cancel(ctx);
161 hrtimer_cancel(&ctx->tmr); 207
208 if (isalarm(ctx))
209 alarm_cancel(&ctx->t.alarm);
210 else
211 hrtimer_cancel(&ctx->t.tmr);
162 kfree_rcu(ctx, rcu); 212 kfree_rcu(ctx, rcu);
163 return 0; 213 return 0;
164} 214}
@@ -215,9 +265,15 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
215 * callback to avoid DoS attacks specifying a very 265 * callback to avoid DoS attacks specifying a very
216 * short timer period. 266 * short timer period.
217 */ 267 */
218 ticks += hrtimer_forward_now(&ctx->tmr, 268 if (isalarm(ctx)) {
219 ctx->tintv) - 1; 269 ticks += alarm_forward_now(
220 hrtimer_restart(&ctx->tmr); 270 &ctx->t.alarm, ctx->tintv) - 1;
271 alarm_restart(&ctx->t.alarm);
272 } else {
273 ticks += hrtimer_forward_now(&ctx->t.tmr,
274 ctx->tintv) - 1;
275 hrtimer_restart(&ctx->t.tmr);
276 }
221 } 277 }
222 ctx->expired = 0; 278 ctx->expired = 0;
223 ctx->ticks = 0; 279 ctx->ticks = 0;
@@ -259,7 +315,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
259 315
260 if ((flags & ~TFD_CREATE_FLAGS) || 316 if ((flags & ~TFD_CREATE_FLAGS) ||
261 (clockid != CLOCK_MONOTONIC && 317 (clockid != CLOCK_MONOTONIC &&
262 clockid != CLOCK_REALTIME)) 318 clockid != CLOCK_REALTIME &&
319 clockid != CLOCK_REALTIME_ALARM &&
320 clockid != CLOCK_BOOTTIME_ALARM))
263 return -EINVAL; 321 return -EINVAL;
264 322
265 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 323 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -268,7 +326,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
268 326
269 init_waitqueue_head(&ctx->wqh); 327 init_waitqueue_head(&ctx->wqh);
270 ctx->clockid = clockid; 328 ctx->clockid = clockid;
271 hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); 329
330 if (isalarm(ctx))
331 alarm_init(&ctx->t.alarm,
332 ctx->clockid == CLOCK_REALTIME_ALARM ?
333 ALARM_REALTIME : ALARM_BOOTTIME,
334 timerfd_alarmproc);
335 else
336 hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
337
272 ctx->moffs = ktime_get_monotonic_offset(); 338 ctx->moffs = ktime_get_monotonic_offset();
273 339
274 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, 340 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
@@ -305,8 +371,14 @@ static int do_timerfd_settime(int ufd, int flags,
305 */ 371 */
306 for (;;) { 372 for (;;) {
307 spin_lock_irq(&ctx->wqh.lock); 373 spin_lock_irq(&ctx->wqh.lock);
308 if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) 374
309 break; 375 if (isalarm(ctx)) {
376 if (alarm_try_to_cancel(&ctx->t.alarm) >= 0)
377 break;
378 } else {
379 if (hrtimer_try_to_cancel(&ctx->t.tmr) >= 0)
380 break;
381 }
310 spin_unlock_irq(&ctx->wqh.lock); 382 spin_unlock_irq(&ctx->wqh.lock);
311 cpu_relax(); 383 cpu_relax();
312 } 384 }
@@ -317,8 +389,12 @@ static int do_timerfd_settime(int ufd, int flags,
317 * We do not update "ticks" and "expired" since the timer will be 389 * We do not update "ticks" and "expired" since the timer will be
318 * re-programmed again in the following timerfd_setup() call. 390 * re-programmed again in the following timerfd_setup() call.
319 */ 391 */
320 if (ctx->expired && ctx->tintv.tv64) 392 if (ctx->expired && ctx->tintv.tv64) {
321 hrtimer_forward_now(&ctx->tmr, ctx->tintv); 393 if (isalarm(ctx))
394 alarm_forward_now(&ctx->t.alarm, ctx->tintv);
395 else
396 hrtimer_forward_now(&ctx->t.tmr, ctx->tintv);
397 }
322 398
323 old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); 399 old->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
324 old->it_interval = ktime_to_timespec(ctx->tintv); 400 old->it_interval = ktime_to_timespec(ctx->tintv);
@@ -345,9 +421,18 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t)
345 spin_lock_irq(&ctx->wqh.lock); 421 spin_lock_irq(&ctx->wqh.lock);
346 if (ctx->expired && ctx->tintv.tv64) { 422 if (ctx->expired && ctx->tintv.tv64) {
347 ctx->expired = 0; 423 ctx->expired = 0;
348 ctx->ticks += 424
349 hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; 425 if (isalarm(ctx)) {
350 hrtimer_restart(&ctx->tmr); 426 ctx->ticks +=
427 alarm_forward_now(
428 &ctx->t.alarm, ctx->tintv) - 1;
429 alarm_restart(&ctx->t.alarm);
430 } else {
431 ctx->ticks +=
432 hrtimer_forward_now(&ctx->t.tmr, ctx->tintv)
433 - 1;
434 hrtimer_restart(&ctx->t.tmr);
435 }
351 } 436 }
352 t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); 437 t->it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
353 t->it_interval = ktime_to_timespec(ctx->tintv); 438 t->it_interval = ktime_to_timespec(ctx->tintv);
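
Taken together, the timerfd hunks let a timerfd be backed by the alarmtimer subsystem instead of a plain hrtimer, so an armed CLOCK_REALTIME_ALARM or CLOCK_BOOTTIME_ALARM timer can wake the system from suspend. A hedged userspace sketch (assumes a libc that exposes the new clockids; depending on kernel version CAP_WAKE_ALARM may also be required):

#include <sys/timerfd.h>
#include <time.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct itimerspec its = { .it_value = { .tv_sec = 30 } };
	uint64_t expirations;
	int fd = timerfd_create(CLOCK_BOOTTIME_ALARM, 0);

	if (fd < 0) {	/* older kernels reject the alarm clockids */
		perror("timerfd_create");
		return 1;
	}
	/* relative 30s timer; may resume a suspended machine */
	if (timerfd_settime(fd, 0, &its, NULL) < 0)
		perror("timerfd_settime");
	/* blocks until expiry, then reports the expiration count */
	if (read(fd, &expirations, sizeof(expirations)) == sizeof(expirations))
		printf("expirations: %llu\n", (unsigned long long)expirations);
	close(fd);
	return 0;
}

Internally, the union in timerfd_ctx means only one of the two timer types ever exists per context, with isalarm() selecting the arm/cancel/forward path everywhere the old code touched ctx->tmr directly.
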
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 605af512aec2..6b4947f75af7 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -346,19 +346,18 @@ static unsigned int vfs_dent_type(uint8_t type)
346 * This means that UBIFS cannot support NFS which requires full 346 * This means that UBIFS cannot support NFS which requires full
347 * 'seekdir()'/'telldir()' support. 347 * 'seekdir()'/'telldir()' support.
348 */ 348 */
349static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) 349static int ubifs_readdir(struct file *file, struct dir_context *ctx)
350{ 350{
351 int err, over = 0; 351 int err;
352 loff_t pos = file->f_pos;
353 struct qstr nm; 352 struct qstr nm;
354 union ubifs_key key; 353 union ubifs_key key;
355 struct ubifs_dent_node *dent; 354 struct ubifs_dent_node *dent;
356 struct inode *dir = file_inode(file); 355 struct inode *dir = file_inode(file);
357 struct ubifs_info *c = dir->i_sb->s_fs_info; 356 struct ubifs_info *c = dir->i_sb->s_fs_info;
358 357
359 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, pos); 358 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
360 359
361 if (pos > UBIFS_S_KEY_HASH_MASK || pos == 2) 360 if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2)
362 /* 361 /*
363 * The directory was seek'ed to a senseless position or there 362 * The directory was seek'ed to a senseless position or there
364 * are no more entries. 363 * are no more entries.
@@ -384,19 +383,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
384 file->f_version = 1; 383 file->f_version = 1;
385 384
386 /* File positions 0 and 1 correspond to "." and ".." */ 385 /* File positions 0 and 1 correspond to "." and ".." */
387 if (pos == 0) { 386 if (ctx->pos < 2) {
388 ubifs_assert(!file->private_data);
389 over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
390 if (over)
391 return 0;
392 file->f_pos = pos = 1;
393 }
394
395 if (pos == 1) {
396 ubifs_assert(!file->private_data); 387 ubifs_assert(!file->private_data);
397 over = filldir(dirent, "..", 2, 1, 388 if (!dir_emit_dots(file, ctx))
398 parent_ino(file->f_path.dentry), DT_DIR);
399 if (over)
400 return 0; 389 return 0;
401 390
402 /* Find the first entry in TNC and save it */ 391 /* Find the first entry in TNC and save it */
@@ -408,7 +397,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
408 goto out; 397 goto out;
409 } 398 }
410 399
411 file->f_pos = pos = key_hash_flash(c, &dent->key); 400 ctx->pos = key_hash_flash(c, &dent->key);
412 file->private_data = dent; 401 file->private_data = dent;
413 } 402 }
414 403
@@ -416,16 +405,16 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
416 if (!dent) { 405 if (!dent) {
417 /* 406 /*
418 * The directory was seek'ed to and is now readdir'ed. 407 * The directory was seek'ed to and is now readdir'ed.
419 * Find the entry corresponding to @pos or the closest one. 408 * Find the entry corresponding to @ctx->pos or the closest one.
420 */ 409 */
421 dent_key_init_hash(c, &key, dir->i_ino, pos); 410 dent_key_init_hash(c, &key, dir->i_ino, ctx->pos);
422 nm.name = NULL; 411 nm.name = NULL;
423 dent = ubifs_tnc_next_ent(c, &key, &nm); 412 dent = ubifs_tnc_next_ent(c, &key, &nm);
424 if (IS_ERR(dent)) { 413 if (IS_ERR(dent)) {
425 err = PTR_ERR(dent); 414 err = PTR_ERR(dent);
426 goto out; 415 goto out;
427 } 416 }
428 file->f_pos = pos = key_hash_flash(c, &dent->key); 417 ctx->pos = key_hash_flash(c, &dent->key);
429 file->private_data = dent; 418 file->private_data = dent;
430 } 419 }
431 420
@@ -437,10 +426,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
437 ubifs_inode(dir)->creat_sqnum); 426 ubifs_inode(dir)->creat_sqnum);
438 427
439 nm.len = le16_to_cpu(dent->nlen); 428 nm.len = le16_to_cpu(dent->nlen);
440 over = filldir(dirent, dent->name, nm.len, pos, 429 if (!dir_emit(ctx, dent->name, nm.len,
441 le64_to_cpu(dent->inum), 430 le64_to_cpu(dent->inum),
442 vfs_dent_type(dent->type)); 431 vfs_dent_type(dent->type)))
443 if (over)
444 return 0; 432 return 0;
445 433
446 /* Switch to the next entry */ 434 /* Switch to the next entry */
@@ -453,17 +441,9 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
453 } 441 }
454 442
455 kfree(file->private_data); 443 kfree(file->private_data);
456 file->f_pos = pos = key_hash_flash(c, &dent->key); 444 ctx->pos = key_hash_flash(c, &dent->key);
457 file->private_data = dent; 445 file->private_data = dent;
458 cond_resched(); 446 cond_resched();
459
460 if (file->f_version == 0)
461 /*
462 * The file was seek'ed meanwhile, lets return and start
463 * reading direntries from the new position on the next
464 * invocation.
465 */
466 return 0;
467 } 447 }
468 448
469out: 449out:
@@ -475,15 +455,10 @@ out:
475 kfree(file->private_data); 455 kfree(file->private_data);
476 file->private_data = NULL; 456 file->private_data = NULL;
477 /* 2 is a special value indicating that there are no more direntries */ 457 /* 2 is a special value indicating that there are no more direntries */
478 file->f_pos = 2; 458 ctx->pos = 2;
479 return 0; 459 return 0;
480} 460}
481 461
482static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
483{
484 return generic_file_llseek(file, offset, whence);
485}
486
487/* Free saved readdir() state when the directory is closed */ 462/* Free saved readdir() state when the directory is closed */
488static int ubifs_dir_release(struct inode *dir, struct file *file) 463static int ubifs_dir_release(struct inode *dir, struct file *file)
489{ 464{
@@ -1201,10 +1176,10 @@ const struct inode_operations ubifs_dir_inode_operations = {
1201}; 1176};
1202 1177
1203const struct file_operations ubifs_dir_operations = { 1178const struct file_operations ubifs_dir_operations = {
1204 .llseek = ubifs_dir_llseek, 1179 .llseek = generic_file_llseek,
1205 .release = ubifs_dir_release, 1180 .release = ubifs_dir_release,
1206 .read = generic_read_dir, 1181 .read = generic_read_dir,
1207 .readdir = ubifs_readdir, 1182 .iterate = ubifs_readdir,
1208 .fsync = ubifs_fsync, 1183 .fsync = ubifs_fsync,
1209 .unlocked_ioctl = ubifs_ioctl, 1184 .unlocked_ioctl = ubifs_ioctl,
1210#ifdef CONFIG_COMPAT 1185#ifdef CONFIG_COMPAT
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 14374530784c..123c79b7261e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1277,13 +1277,14 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
1277 return err; 1277 return err;
1278} 1278}
1279 1279
1280static void ubifs_invalidatepage(struct page *page, unsigned long offset) 1280static void ubifs_invalidatepage(struct page *page, unsigned int offset,
1281 unsigned int length)
1281{ 1282{
1282 struct inode *inode = page->mapping->host; 1283 struct inode *inode = page->mapping->host;
1283 struct ubifs_info *c = inode->i_sb->s_fs_info; 1284 struct ubifs_info *c = inode->i_sb->s_fs_info;
1284 1285
1285 ubifs_assert(PagePrivate(page)); 1286 ubifs_assert(PagePrivate(page));
1286 if (offset) 1287 if (offset || length < PAGE_CACHE_SIZE)
1287 /* Partial page remains dirty */ 1288 /* Partial page remains dirty */
1288 return; 1289 return;
1289 1290
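
This hunk is part of another interface change in the merge: ->invalidatepage gains a length argument so callers can invalidate a sub-range of a page (e.g. for hole punching that does not span whole pages). "Invalidate the whole page" is therefore now offset == 0 && length == PAGE_CACHE_SIZE rather than just offset == 0. A sketch of the new shape (the example_* names are ours):

static void example_invalidatepage(struct page *page, unsigned int offset,
				   unsigned int length)
{
	if (offset == 0 && length == PAGE_CACHE_SIZE) {
		/* whole page gone: safe to drop per-page private state */
		example_release_private(page);	/* hypothetical teardown */
		return;
	}
	/* partial invalidate: the rest of the page stays dirty */
}
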
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f21acf0ef01f..879b9976c12b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1412,7 +1412,7 @@ static int mount_ubifs(struct ubifs_info *c)
1412 1412
1413 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s", 1413 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s",
1414 c->vi.ubi_num, c->vi.vol_id, c->vi.name, 1414 c->vi.ubi_num, c->vi.vol_id, c->vi.name,
1415 c->ro_mount ? ", R/O mode" : NULL); 1415 c->ro_mount ? ", R/O mode" : "");
1416 x = (long long)c->main_lebs * c->leb_size; 1416 x = (long long)c->main_lebs * c->leb_size;
1417 y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1417 y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1418 ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", 1418 ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index b3e93f5e17c3..a012c51caffd 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -35,14 +35,16 @@
35#include "udf_i.h" 35#include "udf_i.h"
36#include "udf_sb.h" 36#include "udf_sb.h"
37 37
38static int do_udf_readdir(struct inode *dir, struct file *filp, 38
39 filldir_t filldir, void *dirent) 39static int udf_readdir(struct file *file, struct dir_context *ctx)
40{ 40{
41 struct inode *dir = file_inode(file);
42 struct udf_inode_info *iinfo = UDF_I(dir);
41 struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; 43 struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL};
42 struct fileIdentDesc *fi = NULL; 44 struct fileIdentDesc *fi = NULL;
43 struct fileIdentDesc cfi; 45 struct fileIdentDesc cfi;
44 int block, iblock; 46 int block, iblock;
45 loff_t nf_pos = (filp->f_pos - 1) << 2; 47 loff_t nf_pos;
46 int flen; 48 int flen;
47 unsigned char *fname = NULL; 49 unsigned char *fname = NULL;
48 unsigned char *nameptr; 50 unsigned char *nameptr;
@@ -54,10 +56,14 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
54 uint32_t elen; 56 uint32_t elen;
55 sector_t offset; 57 sector_t offset;
56 int i, num, ret = 0; 58 int i, num, ret = 0;
57 unsigned int dt_type;
58 struct extent_position epos = { NULL, 0, {0, 0} }; 59 struct extent_position epos = { NULL, 0, {0, 0} };
59 struct udf_inode_info *iinfo;
60 60
61 if (ctx->pos == 0) {
62 if (!dir_emit_dot(file, ctx))
63 return 0;
64 ctx->pos = 1;
65 }
66 nf_pos = (ctx->pos - 1) << 2;
61 if (nf_pos >= size) 67 if (nf_pos >= size)
62 goto out; 68 goto out;
63 69
@@ -71,7 +77,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
71 nf_pos = udf_ext0_offset(dir); 77 nf_pos = udf_ext0_offset(dir);
72 78
73 fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); 79 fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1);
74 iinfo = UDF_I(dir);
75 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 80 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
76 if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, 81 if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
77 &epos, &eloc, &elen, &offset) 82 &epos, &eloc, &elen, &offset)
@@ -116,7 +121,9 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
116 } 121 }
117 122
118 while (nf_pos < size) { 123 while (nf_pos < size) {
119 filp->f_pos = (nf_pos >> 2) + 1; 124 struct kernel_lb_addr tloc;
125
126 ctx->pos = (nf_pos >> 2) + 1;
120 127
121 fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, 128 fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
122 &elen, &offset); 129 &elen, &offset);
@@ -155,24 +162,22 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
155 } 162 }
156 163
157 if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { 164 if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) {
158 iblock = parent_ino(filp->f_path.dentry); 165 if (!dir_emit_dotdot(file, ctx))
159 flen = 2; 166 goto out;
160 memcpy(fname, "..", flen); 167 continue;
161 dt_type = DT_DIR;
162 } else {
163 struct kernel_lb_addr tloc = lelb_to_cpu(cfi.icb.extLocation);
164
165 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
166 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
167 dt_type = DT_UNKNOWN;
168 } 168 }
169 169
170 if (flen && filldir(dirent, fname, flen, filp->f_pos, 170 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
171 iblock, dt_type) < 0) 171 if (!flen)
172 continue;
173
174 tloc = lelb_to_cpu(cfi.icb.extLocation);
175 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
176 if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN))
172 goto out; 177 goto out;
173 } /* end while */ 178 } /* end while */
174 179
175 filp->f_pos = (nf_pos >> 2) + 1; 180 ctx->pos = (nf_pos >> 2) + 1;
176 181
177out: 182out:
178 if (fibh.sbh != fibh.ebh) 183 if (fibh.sbh != fibh.ebh)
@@ -184,27 +189,11 @@ out:
184 return ret; 189 return ret;
185} 190}
186 191
187static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
188{
189 struct inode *dir = file_inode(filp);
190 int result;
191
192 if (filp->f_pos == 0) {
193 if (filldir(dirent, ".", 1, filp->f_pos, dir->i_ino, DT_DIR) < 0) {
194 return 0;
195 }
196 filp->f_pos++;
197 }
198
199 result = do_udf_readdir(dir, filp, filldir, dirent);
200 return result;
201}
202
203/* readdir and lookup functions */ 192/* readdir and lookup functions */
204const struct file_operations udf_dir_operations = { 193const struct file_operations udf_dir_operations = {
205 .llseek = generic_file_llseek, 194 .llseek = generic_file_llseek,
206 .read = generic_read_dir, 195 .read = generic_read_dir,
207 .readdir = udf_readdir, 196 .iterate = udf_readdir,
208 .unlocked_ioctl = udf_ioctl, 197 .unlocked_ioctl = udf_ioctl,
209 .fsync = generic_file_fsync, 198 .fsync = generic_file_fsync,
210}; 199};
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 102c072c6bbf..5f6fc17d6bc5 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -594,6 +594,29 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
594 return 0; 594 return 0;
595} 595}
596 596
597static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
598{
599 struct inode *inode;
600 struct udf_inode_info *iinfo;
601 int err;
602
603 inode = udf_new_inode(dir, mode, &err);
604 if (!inode)
605 return err;
606
607 iinfo = UDF_I(inode);
608 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
609 inode->i_data.a_ops = &udf_adinicb_aops;
610 else
611 inode->i_data.a_ops = &udf_aops;
612 inode->i_op = &udf_file_inode_operations;
613 inode->i_fop = &udf_file_operations;
614 mark_inode_dirty(inode);
615
616 d_tmpfile(dentry, inode);
617 return 0;
618}
619
597static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, 620static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
598 dev_t rdev) 621 dev_t rdev)
599{ 622{
@@ -1311,6 +1334,7 @@ const struct inode_operations udf_dir_inode_operations = {
1311 .rmdir = udf_rmdir, 1334 .rmdir = udf_rmdir,
1312 .mknod = udf_mknod, 1335 .mknod = udf_mknod,
1313 .rename = udf_rename, 1336 .rename = udf_rename,
1337 .tmpfile = udf_tmpfile,
1314}; 1338};
1315const struct inode_operations udf_symlink_inode_operations = { 1339const struct inode_operations udf_symlink_inode_operations = {
1316 .readlink = generic_readlink, 1340 .readlink = generic_readlink,
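
udf_tmpfile wires UDF into the new ->tmpfile inode operation added in this release: the inode is created and marked dirty but never linked into the directory, and d_tmpfile() associates it with an unhashed dentry. From userspace this is O_TMPFILE; a hedged sketch ("/mnt/udf" is a placeholder mount point):

#define _GNU_SOURCE	/* for O_TMPFILE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* an anonymous file: blocks allocated on the fs, but no name */
	int fd = open("/mnt/udf", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0) {	/* kernel or filesystem lacks O_TMPFILE support */
		perror("open(O_TMPFILE)");
		return 1;
	}
	if (write(fd, "scratch", 7) != 7)
		perror("write");
	close(fd);	/* no link was ever created, so the inode is freed */
	return 0;
}
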
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 3a75ca09c506..0ecc2cebed8f 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -430,16 +430,16 @@ ufs_validate_entry(struct super_block *sb, char *base,
430 * This is blatantly stolen from ext2fs 430 * This is blatantly stolen from ext2fs
431 */ 431 */
432static int 432static int
433ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) 433ufs_readdir(struct file *file, struct dir_context *ctx)
434{ 434{
435 loff_t pos = filp->f_pos; 435 loff_t pos = ctx->pos;
436 struct inode *inode = file_inode(filp); 436 struct inode *inode = file_inode(file);
437 struct super_block *sb = inode->i_sb; 437 struct super_block *sb = inode->i_sb;
438 unsigned int offset = pos & ~PAGE_CACHE_MASK; 438 unsigned int offset = pos & ~PAGE_CACHE_MASK;
439 unsigned long n = pos >> PAGE_CACHE_SHIFT; 439 unsigned long n = pos >> PAGE_CACHE_SHIFT;
440 unsigned long npages = ufs_dir_pages(inode); 440 unsigned long npages = ufs_dir_pages(inode);
441 unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); 441 unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
442 int need_revalidate = filp->f_version != inode->i_version; 442 int need_revalidate = file->f_version != inode->i_version;
443 unsigned flags = UFS_SB(sb)->s_flags; 443 unsigned flags = UFS_SB(sb)->s_flags;
444 444
445 UFSD("BEGIN\n"); 445 UFSD("BEGIN\n");
@@ -457,16 +457,16 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
457 ufs_error(sb, __func__, 457 ufs_error(sb, __func__,
458 "bad page in #%lu", 458 "bad page in #%lu",
459 inode->i_ino); 459 inode->i_ino);
460 filp->f_pos += PAGE_CACHE_SIZE - offset; 460 ctx->pos += PAGE_CACHE_SIZE - offset;
461 return -EIO; 461 return -EIO;
462 } 462 }
463 kaddr = page_address(page); 463 kaddr = page_address(page);
464 if (unlikely(need_revalidate)) { 464 if (unlikely(need_revalidate)) {
465 if (offset) { 465 if (offset) {
466 offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); 466 offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
467 filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; 467 ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
468 } 468 }
469 filp->f_version = inode->i_version; 469 file->f_version = inode->i_version;
470 need_revalidate = 0; 470 need_revalidate = 0;
471 } 471 }
472 de = (struct ufs_dir_entry *)(kaddr+offset); 472 de = (struct ufs_dir_entry *)(kaddr+offset);
@@ -479,11 +479,8 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
479 return -EIO; 479 return -EIO;
480 } 480 }
481 if (de->d_ino) { 481 if (de->d_ino) {
482 int over;
483 unsigned char d_type = DT_UNKNOWN; 482 unsigned char d_type = DT_UNKNOWN;
484 483
485 offset = (char *)de - kaddr;
486
487 UFSD("filldir(%s,%u)\n", de->d_name, 484 UFSD("filldir(%s,%u)\n", de->d_name,
488 fs32_to_cpu(sb, de->d_ino)); 485 fs32_to_cpu(sb, de->d_ino));
489 UFSD("namlen %u\n", ufs_get_de_namlen(sb, de)); 486 UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
@@ -491,16 +488,15 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
491 if ((flags & UFS_DE_MASK) == UFS_DE_44BSD) 488 if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
492 d_type = de->d_u.d_44.d_type; 489 d_type = de->d_u.d_44.d_type;
493 490
494 over = filldir(dirent, de->d_name, 491 if (!dir_emit(ctx, de->d_name,
495 ufs_get_de_namlen(sb, de), 492 ufs_get_de_namlen(sb, de),
496 (n<<PAGE_CACHE_SHIFT) | offset, 493 fs32_to_cpu(sb, de->d_ino),
497 fs32_to_cpu(sb, de->d_ino), d_type); 494 d_type)) {
498 if (over) {
499 ufs_put_page(page); 495 ufs_put_page(page);
500 return 0; 496 return 0;
501 } 497 }
502 } 498 }
503 filp->f_pos += fs16_to_cpu(sb, de->d_reclen); 499 ctx->pos += fs16_to_cpu(sb, de->d_reclen);
504 } 500 }
505 ufs_put_page(page); 501 ufs_put_page(page);
506 } 502 }
@@ -660,7 +656,7 @@ not_empty:
660 656
661const struct file_operations ufs_dir_operations = { 657const struct file_operations ufs_dir_operations = {
662 .read = generic_read_dir, 658 .read = generic_read_dir,
663 .readdir = ufs_readdir, 659 .iterate = ufs_readdir,
664 .fsync = generic_file_fsync, 660 .fsync = generic_file_fsync,
665 .llseek = generic_file_llseek, 661 .llseek = generic_file_llseek,
666}; 662};
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 6313b69b6644..4a4508023a3c 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -71,6 +71,7 @@ xfs-y += xfs_alloc.o \
71 xfs_dir2_sf.o \ 71 xfs_dir2_sf.o \
72 xfs_ialloc.o \ 72 xfs_ialloc.o \
73 xfs_ialloc_btree.o \ 73 xfs_ialloc_btree.o \
74 xfs_icreate_item.o \
74 xfs_inode.o \ 75 xfs_inode.o \
75 xfs_log_recover.o \ 76 xfs_log_recover.o \
76 xfs_mount.o \ 77 xfs_mount.o \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 5673bcfda2f0..71596e57283a 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -175,6 +175,7 @@ xfs_alloc_compute_diff(
175 xfs_agblock_t wantbno, /* target starting block */ 175 xfs_agblock_t wantbno, /* target starting block */
176 xfs_extlen_t wantlen, /* target length */ 176 xfs_extlen_t wantlen, /* target length */
177 xfs_extlen_t alignment, /* target alignment */ 177 xfs_extlen_t alignment, /* target alignment */
178 char userdata, /* are we allocating data? */
178 xfs_agblock_t freebno, /* freespace's starting block */ 179 xfs_agblock_t freebno, /* freespace's starting block */
179 xfs_extlen_t freelen, /* freespace's length */ 180 xfs_extlen_t freelen, /* freespace's length */
180 xfs_agblock_t *newbnop) /* result: best start block from free */ 181 xfs_agblock_t *newbnop) /* result: best start block from free */
@@ -189,7 +190,14 @@ xfs_alloc_compute_diff(
189 ASSERT(freelen >= wantlen); 190 ASSERT(freelen >= wantlen);
190 freeend = freebno + freelen; 191 freeend = freebno + freelen;
191 wantend = wantbno + wantlen; 192 wantend = wantbno + wantlen;
192 if (freebno >= wantbno) { 193 /*
194 * We want to allocate from the start of a free extent if it is past
195 * the desired block or if we are allocating user data and the free
196 * extent is before desired block. The second case is there to allow
197 * for contiguous allocation from the remaining free space if the file
198 * grows in the short term.
199 */
200 if (freebno >= wantbno || (userdata && freeend < wantend)) {
193 if ((newbno1 = roundup(freebno, alignment)) >= freeend) 201 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
194 newbno1 = NULLAGBLOCK; 202 newbno1 = NULLAGBLOCK;
195 } else if (freeend >= wantend && alignment > 1) { 203 } else if (freeend >= wantend && alignment > 1) {
@@ -805,7 +813,8 @@ xfs_alloc_find_best_extent(
805 xfs_alloc_fix_len(args); 813 xfs_alloc_fix_len(args);
806 814
807 sdiff = xfs_alloc_compute_diff(args->agbno, args->len, 815 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
808 args->alignment, *sbnoa, 816 args->alignment,
817 args->userdata, *sbnoa,
809 *slena, &new); 818 *slena, &new);
810 819
811 /* 820 /*
@@ -976,7 +985,8 @@ restart:
976 if (args->len < blen) 985 if (args->len < blen)
977 continue; 986 continue;
978 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 987 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
979 args->alignment, ltbnoa, ltlena, &ltnew); 988 args->alignment, args->userdata, ltbnoa,
989 ltlena, &ltnew);
980 if (ltnew != NULLAGBLOCK && 990 if (ltnew != NULLAGBLOCK &&
981 (args->len > blen || ltdiff < bdiff)) { 991 (args->len > blen || ltdiff < bdiff)) {
982 bdiff = ltdiff; 992 bdiff = ltdiff;
@@ -1128,7 +1138,8 @@ restart:
1128 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1138 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1129 xfs_alloc_fix_len(args); 1139 xfs_alloc_fix_len(args);
1130 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1140 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1131 args->alignment, ltbnoa, ltlena, &ltnew); 1141 args->alignment, args->userdata, ltbnoa,
1142 ltlena, &ltnew);
1132 1143
1133 error = xfs_alloc_find_best_extent(args, 1144 error = xfs_alloc_find_best_extent(args,
1134 &bno_cur_lt, &bno_cur_gt, 1145 &bno_cur_lt, &bno_cur_gt,
@@ -1144,7 +1155,8 @@ restart:
1144 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1155 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1145 xfs_alloc_fix_len(args); 1156 xfs_alloc_fix_len(args);
1146 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1157 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1147 args->alignment, gtbnoa, gtlena, &gtnew); 1158 args->alignment, args->userdata, gtbnoa,
1159 gtlena, &gtnew);
1148 1160
1149 error = xfs_alloc_find_best_extent(args, 1161 error = xfs_alloc_find_best_extent(args,
1150 &bno_cur_gt, &bno_cur_lt, 1162 &bno_cur_gt, &bno_cur_lt,
@@ -1203,7 +1215,7 @@ restart:
1203 } 1215 }
1204 rlen = args->len; 1216 rlen = args->len;
1205 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, 1217 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
1206 ltbnoa, ltlena, &ltnew); 1218 args->userdata, ltbnoa, ltlena, &ltnew);
1207 ASSERT(ltnew >= ltbno); 1219 ASSERT(ltnew >= ltbno);
1208 ASSERT(ltnew + rlen <= ltbnoa + ltlena); 1220 ASSERT(ltnew + rlen <= ltbnoa + ltlena);
1209 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1221 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
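
The new userdata case in xfs_alloc_compute_diff() is easiest to see with numbers (ours, for illustration): take wantbno = 100, wantlen = 10 (so wantend = 110) and a free extent at freebno = 40, freelen = 50 (freeend = 90). Previously, since freebno < wantbno, the allocation was carved from the tail of the free extent so that it ended at block 90, as close to the target as possible, leaving the remaining free space before the new blocks. With userdata set and freeend (90) < wantend (110), the allocation now starts at block 40 instead, leaving blocks 50-89 free immediately after it, so short-term growth of the file can still be allocated contiguously.
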
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 41a695048be7..596ec71da00e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -843,10 +843,12 @@ xfs_cluster_write(
843STATIC void 843STATIC void
844xfs_vm_invalidatepage( 844xfs_vm_invalidatepage(
845 struct page *page, 845 struct page *page,
846 unsigned long offset) 846 unsigned int offset,
847 unsigned int length)
847{ 848{
848 trace_xfs_invalidatepage(page->mapping->host, page, offset); 849 trace_xfs_invalidatepage(page->mapping->host, page, offset,
849 block_invalidatepage(page, offset); 850 length);
851 block_invalidatepage(page, offset, length);
850} 852}
851 853
852/* 854/*
@@ -910,7 +912,7 @@ next_buffer:
910 912
911 xfs_iunlock(ip, XFS_ILOCK_EXCL); 913 xfs_iunlock(ip, XFS_ILOCK_EXCL);
912out_invalidate: 914out_invalidate:
913 xfs_vm_invalidatepage(page, 0); 915 xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
914 return; 916 return;
915} 917}
916 918
@@ -940,7 +942,7 @@ xfs_vm_writepage(
940 int count = 0; 942 int count = 0;
941 int nonblocking = 0; 943 int nonblocking = 0;
942 944
943 trace_xfs_writepage(inode, page, 0); 945 trace_xfs_writepage(inode, page, 0, 0);
944 946
945 ASSERT(page_has_buffers(page)); 947 ASSERT(page_has_buffers(page));
946 948
@@ -1171,7 +1173,7 @@ xfs_vm_releasepage(
1171{ 1173{
1172 int delalloc, unwritten; 1174 int delalloc, unwritten;
1173 1175
1174 trace_xfs_releasepage(page->mapping->host, page, 0); 1176 trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1175 1177
1176 xfs_count_page_state(page, &delalloc, &unwritten); 1178 xfs_count_page_state(page, &delalloc, &unwritten);
1177 1179
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 31d3cd129269..b800fbcafc7f 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -690,6 +690,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
690 sf = (xfs_attr_shortform_t *)tmpbuffer; 690 sf = (xfs_attr_shortform_t *)tmpbuffer;
691 691
692 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); 692 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
693 xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
694
693 bp = NULL; 695 bp = NULL;
694 error = xfs_da_grow_inode(args, &blkno); 696 error = xfs_da_grow_inode(args, &blkno);
695 if (error) { 697 if (error) {
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 89042848f9ec..05c698ccb238 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1161,6 +1161,24 @@ xfs_bmap_extents_to_btree(
1161 * since the file data needs to get logged so things will stay consistent. 1161 * since the file data needs to get logged so things will stay consistent.
1162 * (The bmap-level manipulations are ok, though). 1162 * (The bmap-level manipulations are ok, though).
1163 */ 1163 */
1164void
1165xfs_bmap_local_to_extents_empty(
1166 struct xfs_inode *ip,
1167 int whichfork)
1168{
1169 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
1170
1171 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
1172 ASSERT(ifp->if_bytes == 0);
1173 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
1174
1175 xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
1176 ifp->if_flags &= ~XFS_IFINLINE;
1177 ifp->if_flags |= XFS_IFEXTENTS;
1178 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
1179}
1180
1181
1164STATIC int /* error */ 1182STATIC int /* error */
1165xfs_bmap_local_to_extents( 1183xfs_bmap_local_to_extents(
1166 xfs_trans_t *tp, /* transaction pointer */ 1184 xfs_trans_t *tp, /* transaction pointer */
@@ -1174,9 +1192,12 @@ xfs_bmap_local_to_extents(
1174 struct xfs_inode *ip, 1192 struct xfs_inode *ip,
1175 struct xfs_ifork *ifp)) 1193 struct xfs_ifork *ifp))
1176{ 1194{
1177 int error; /* error return value */ 1195 int error = 0;
1178 int flags; /* logging flags returned */ 1196 int flags; /* logging flags returned */
1179 xfs_ifork_t *ifp; /* inode fork pointer */ 1197 xfs_ifork_t *ifp; /* inode fork pointer */
1198 xfs_alloc_arg_t args; /* allocation arguments */
1199 xfs_buf_t *bp; /* buffer for extent block */
1200 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
1180 1201
1181 /* 1202 /*
1182 * We don't want to deal with the case of keeping inode data inline yet. 1203 * We don't want to deal with the case of keeping inode data inline yet.
@@ -1185,68 +1206,65 @@ xfs_bmap_local_to_extents(
1185 ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); 1206 ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
1186 ifp = XFS_IFORK_PTR(ip, whichfork); 1207 ifp = XFS_IFORK_PTR(ip, whichfork);
1187 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); 1208 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
1209
1210 if (!ifp->if_bytes) {
1211 xfs_bmap_local_to_extents_empty(ip, whichfork);
1212 flags = XFS_ILOG_CORE;
1213 goto done;
1214 }
1215
1188 flags = 0; 1216 flags = 0;
1189 error = 0; 1217 error = 0;
1190 if (ifp->if_bytes) { 1218 ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) ==
1191 xfs_alloc_arg_t args; /* allocation arguments */ 1219 XFS_IFINLINE);
1192 xfs_buf_t *bp; /* buffer for extent block */ 1220 memset(&args, 0, sizeof(args));
1193 xfs_bmbt_rec_host_t *ep;/* extent record pointer */ 1221 args.tp = tp;
1194 1222 args.mp = ip->i_mount;
1195 ASSERT((ifp->if_flags & 1223 args.firstblock = *firstblock;
1196 (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); 1224 /*
1197 memset(&args, 0, sizeof(args)); 1225 * Allocate a block. We know we need only one, since the
1198 args.tp = tp; 1226 * file currently fits in an inode.
1199 args.mp = ip->i_mount; 1227 */
1200 args.firstblock = *firstblock; 1228 if (*firstblock == NULLFSBLOCK) {
1201 /* 1229 args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
1202 * Allocate a block. We know we need only one, since the 1230 args.type = XFS_ALLOCTYPE_START_BNO;
1203 * file currently fits in an inode.
1204 */
1205 if (*firstblock == NULLFSBLOCK) {
1206 args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
1207 args.type = XFS_ALLOCTYPE_START_BNO;
1208 } else {
1209 args.fsbno = *firstblock;
1210 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1211 }
1212 args.total = total;
1213 args.minlen = args.maxlen = args.prod = 1;
1214 error = xfs_alloc_vextent(&args);
1215 if (error)
1216 goto done;
1217
1218 /* Can't fail, the space was reserved. */
1219 ASSERT(args.fsbno != NULLFSBLOCK);
1220 ASSERT(args.len == 1);
1221 *firstblock = args.fsbno;
1222 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
1223
1224 /* initialise the block and copy the data */
1225 init_fn(tp, bp, ip, ifp);
1226
1227 /* account for the change in fork size and log everything */
1228 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
1229 xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
1230 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
1231 xfs_iext_add(ifp, 0, 1);
1232 ep = xfs_iext_get_ext(ifp, 0);
1233 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
1234 trace_xfs_bmap_post_update(ip, 0,
1235 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
1236 _THIS_IP_);
1237 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
1238 ip->i_d.di_nblocks = 1;
1239 xfs_trans_mod_dquot_byino(tp, ip,
1240 XFS_TRANS_DQ_BCOUNT, 1L);
1241 flags |= xfs_ilog_fext(whichfork);
1242 } else { 1231 } else {
1243 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); 1232 args.fsbno = *firstblock;
1244 xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); 1233 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1245 } 1234 }
1246 ifp->if_flags &= ~XFS_IFINLINE; 1235 args.total = total;
1247 ifp->if_flags |= XFS_IFEXTENTS; 1236 args.minlen = args.maxlen = args.prod = 1;
1248 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); 1237 error = xfs_alloc_vextent(&args);
1238 if (error)
1239 goto done;
1240
1241 /* Can't fail, the space was reserved. */
1242 ASSERT(args.fsbno != NULLFSBLOCK);
1243 ASSERT(args.len == 1);
1244 *firstblock = args.fsbno;
1245 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
1246
1247 /* initialise the block and copy the data */
1248 init_fn(tp, bp, ip, ifp);
1249
1250 /* account for the change in fork size and log everything */
1251 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
1252 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
1253 xfs_bmap_local_to_extents_empty(ip, whichfork);
1249 flags |= XFS_ILOG_CORE; 1254 flags |= XFS_ILOG_CORE;
1255
1256 xfs_iext_add(ifp, 0, 1);
1257 ep = xfs_iext_get_ext(ifp, 0);
1258 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
1259 trace_xfs_bmap_post_update(ip, 0,
1260 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0,
1261 _THIS_IP_);
1262 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
1263 ip->i_d.di_nblocks = 1;
1264 xfs_trans_mod_dquot_byino(tp, ip,
1265 XFS_TRANS_DQ_BCOUNT, 1L);
1266 flags |= xfs_ilog_fext(whichfork);
1267
1250done: 1268done:
1251 *logflagsp = flags; 1269 *logflagsp = flags;
1252 return error; 1270 return error;
@@ -1323,25 +1341,6 @@ xfs_bmap_add_attrfork_extents(
1323} 1341}
1324 1342
1325/* 1343/*
1326 * Block initialisation function for local to extent format conversion.
1327 *
1328 * This shouldn't actually be called by anyone, so make sure debug kernels cause
1329 * a noticable failure.
1330 */
1331STATIC void
1332xfs_bmap_local_to_extents_init_fn(
1333 struct xfs_trans *tp,
1334 struct xfs_buf *bp,
1335 struct xfs_inode *ip,
1336 struct xfs_ifork *ifp)
1337{
1338 ASSERT(0);
1339 bp->b_ops = &xfs_bmbt_buf_ops;
1340 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
1341 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
1342}
1343
1344/*
1345 * Called from xfs_bmap_add_attrfork to handle local format files. Each 1344 * Called from xfs_bmap_add_attrfork to handle local format files. Each
1346 * different data fork content type needs a different callout to do the 1345 * different data fork content type needs a different callout to do the
1347 * conversion. Some are basic and only require special block initialisation 1346 * conversion. Some are basic and only require special block initialisation
@@ -1381,9 +1380,9 @@ xfs_bmap_add_attrfork_local(
1381 flags, XFS_DATA_FORK, 1380 flags, XFS_DATA_FORK,
1382 xfs_symlink_local_to_remote); 1381 xfs_symlink_local_to_remote);
1383 1382
1384 return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, 1383 /* should only be called for types that support local format data */
1385 XFS_DATA_FORK, 1384 ASSERT(0);
1386 xfs_bmap_local_to_extents_init_fn); 1385 return EFSCORRUPTED;
1387} 1386}
1388 1387
1389/* 1388/*
@@ -4907,20 +4906,19 @@ xfs_bmapi_write(
4907 orig_mval = mval; 4906 orig_mval = mval;
4908 orig_nmap = *nmap; 4907 orig_nmap = *nmap;
4909#endif 4908#endif
4909 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4910 XFS_ATTR_FORK : XFS_DATA_FORK;
4910 4911
4911 ASSERT(*nmap >= 1); 4912 ASSERT(*nmap >= 1);
4912 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); 4913 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4913 ASSERT(!(flags & XFS_BMAPI_IGSTATE)); 4914 ASSERT(!(flags & XFS_BMAPI_IGSTATE));
4914 ASSERT(tp != NULL); 4915 ASSERT(tp != NULL);
4915 ASSERT(len > 0); 4916 ASSERT(len > 0);
4916 4917 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
4917 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4918 XFS_ATTR_FORK : XFS_DATA_FORK;
4919 4918
4920 if (unlikely(XFS_TEST_ERROR( 4919 if (unlikely(XFS_TEST_ERROR(
4921 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 4920 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4922 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && 4921 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
4923 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
4924 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 4922 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4925 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); 4923 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
4926 return XFS_ERROR(EFSCORRUPTED); 4924 return XFS_ERROR(EFSCORRUPTED);
@@ -4933,37 +4931,6 @@ xfs_bmapi_write(
4933 4931
4934 XFS_STATS_INC(xs_blk_mapw); 4932 XFS_STATS_INC(xs_blk_mapw);
4935 4933
4936 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4937 /*
4938 * XXX (dgc): This assumes we are only called for inodes that
4939 * contain content neutral data in local format. Anything that
4940 * contains caller-specific data in local format that needs
4941 * transformation to move to a block format needs to do the
4942 * conversion to extent format itself.
4943 *
4944 * Directory data forks and attribute forks handle this
4945 * themselves, but with the addition of metadata verifiers every
4946 * data fork in local format now contains caller specific data
4947 * and as such conversion through this function is likely to be
4948 * broken.
4949 *
4950 * The only likely user of this branch is for remote symlinks,
4951 * but we cannot overwrite the data fork contents of the symlink
4952 * (EEXIST occurs higher up the stack) and so it will never go
4953 * from local format to extent format here. Hence I don't think
4954 * this branch is ever executed intentionally and we should
4955 * consider removing it and asserting that xfs_bmapi_write()
4956 * cannot be called directly on local format forks. i.e. callers
4957 * are completely responsible for local to extent format
4958 * conversion, not xfs_bmapi_write().
4959 */
4960 error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
4961 &bma.logflags, whichfork,
4962 xfs_bmap_local_to_extents_init_fn);
4963 if (error)
4964 goto error0;
4965 }
4966
4967 if (*firstblock == NULLFSBLOCK) { 4934 if (*firstblock == NULLFSBLOCK) {
4968 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) 4935 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
4969 bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; 4936 bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
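
Factoring out xfs_bmap_local_to_extents_empty() gives callers that have already emptied a local-format fork a way to flip it to extents format without touching the allocation path; the xfs_attr_shortform_to_leaf() hunk earlier in this merge uses exactly that pairing (lifted from the hunks above):

	xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);	    /* drop the inline data */
	xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK); /* fork: 0-extent extents format */

And xfs_bmapi_write() now asserts it is never handed a local-format fork at all: local-to-extents conversion is entirely the caller's job.
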
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 5f469c3516eb..1cf1292d29b7 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -172,6 +172,7 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
172#endif 172#endif
173 173
174int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); 174int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
175void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
175void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, 176void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
176 struct xfs_bmap_free *flist, struct xfs_mount *mp); 177 struct xfs_bmap_free *flist, struct xfs_mount *mp);
177void xfs_bmap_cancel(struct xfs_bmap_free *flist); 178void xfs_bmap_cancel(struct xfs_bmap_free *flist);
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 70c43d9f72c1..1b726d626941 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -196,6 +196,8 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
196#define XFS_BMDR_SPACE_CALC(nrecs) \ 196#define XFS_BMDR_SPACE_CALC(nrecs) \
197 (int)(sizeof(xfs_bmdr_block_t) + \ 197 (int)(sizeof(xfs_bmdr_block_t) + \
198 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) 198 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
199#define XFS_BMAP_BMDR_SPACE(bb) \
200 (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))
199 201
200/* 202/*
201 * Maximum number of bmap btree levels. 203 * Maximum number of bmap btree levels.
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 4ec431777048..bfc4e0c26fd3 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -140,6 +140,16 @@ xfs_buf_item_size(
140 140
141 ASSERT(bip->bli_flags & XFS_BLI_LOGGED); 141 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
142 142
143 if (bip->bli_flags & XFS_BLI_ORDERED) {
144 /*
145 * The buffer has been logged just to order it.
146 * It is not being included in the transaction
147 * commit, so no vectors are used at all.
148 */
149 trace_xfs_buf_item_size_ordered(bip);
150 return XFS_LOG_VEC_ORDERED;
151 }
152
143 /* 153 /*
144 * the vector count is based on the number of buffer vectors we have 154 * the vector count is based on the number of buffer vectors we have
145 * dirty bits in. This will only be greater than one when we have a 155 * dirty bits in. This will only be greater than one when we have a
@@ -212,6 +222,7 @@ xfs_buf_item_format_segment(
212 goto out; 222 goto out;
213 } 223 }
214 224
225
215 /* 226 /*
216 * Fill in an iovec for each set of contiguous chunks. 227 * Fill in an iovec for each set of contiguous chunks.
217 */ 228 */
@@ -299,18 +310,36 @@ xfs_buf_item_format(
299 310
300 /* 311 /*
301 * If it is an inode buffer, transfer the in-memory state to the 312 * If it is an inode buffer, transfer the in-memory state to the
302 * format flags and clear the in-memory state. We do not transfer 313 * format flags and clear the in-memory state.
314 *
315 * For buffer based inode allocation, we do not transfer
303 * this state if the inode buffer allocation has not yet been committed 316 * this state if the inode buffer allocation has not yet been committed
304 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent 317 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
305 * correct replay of the inode allocation. 318 * correct replay of the inode allocation.
319 *
320 * For icreate item based inode allocation, the buffers aren't written
321 * to the journal during allocation, and hence we should always tag the
322 * buffer as an inode buffer so that the correct unlinked list replay
323 * occurs during recovery.
306 */ 324 */
307 if (bip->bli_flags & XFS_BLI_INODE_BUF) { 325 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
308 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 326 if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
327 !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
309 xfs_log_item_in_current_chkpt(lip))) 328 xfs_log_item_in_current_chkpt(lip)))
310 bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; 329 bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
311 bip->bli_flags &= ~XFS_BLI_INODE_BUF; 330 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
312 } 331 }
313 332
333 if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
334 XFS_BLI_ORDERED) {
335 /*
336 * The buffer has been logged just to order it. It is not being
337 * included in the transaction commit, so don't format it.
338 */
339 trace_xfs_buf_item_format_ordered(bip);
340 return;
341 }
342
314 for (i = 0; i < bip->bli_format_count; i++) { 343 for (i = 0; i < bip->bli_format_count; i++) {
315 vecp = xfs_buf_item_format_segment(bip, vecp, offset, 344 vecp = xfs_buf_item_format_segment(bip, vecp, offset,
316 &bip->bli_formats[i]); 345 &bip->bli_formats[i]);
@@ -340,6 +369,7 @@ xfs_buf_item_pin(
340 369
341 ASSERT(atomic_read(&bip->bli_refcount) > 0); 370 ASSERT(atomic_read(&bip->bli_refcount) > 0);
342 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 371 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
372 (bip->bli_flags & XFS_BLI_ORDERED) ||
343 (bip->bli_flags & XFS_BLI_STALE)); 373 (bip->bli_flags & XFS_BLI_STALE));
344 374
345 trace_xfs_buf_item_pin(bip); 375 trace_xfs_buf_item_pin(bip);
@@ -512,8 +542,9 @@ xfs_buf_item_unlock(
512{ 542{
513 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 543 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
514 struct xfs_buf *bp = bip->bli_buf; 544 struct xfs_buf *bp = bip->bli_buf;
515 int aborted, clean, i; 545 bool clean;
516 uint hold; 546 bool aborted;
547 int flags;
517 548
518 /* Clear the buffer's association with this transaction. */ 549 /* Clear the buffer's association with this transaction. */
519 bp->b_transp = NULL; 550 bp->b_transp = NULL;
@@ -524,23 +555,21 @@ xfs_buf_item_unlock(
524 * (cancelled) buffers at unpin time, but we'll never go through the 555 * (cancelled) buffers at unpin time, but we'll never go through the
525 * pin/unpin cycle if we abort inside commit. 556 * pin/unpin cycle if we abort inside commit.
526 */ 557 */
527 aborted = (lip->li_flags & XFS_LI_ABORTED) != 0; 558 aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
528
529 /* 559 /*
530 * Before possibly freeing the buf item, determine if we should 560 * Before possibly freeing the buf item, copy the per-transaction state
531 * release the buffer at the end of this routine. 561 * so we can reference it safely later after clearing it from the
562 * buffer log item.
532 */ 563 */
533 hold = bip->bli_flags & XFS_BLI_HOLD; 564 flags = bip->bli_flags;
534 565 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
535 /* Clear the per transaction state. */
536 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
537 566
538 /* 567 /*
539 * If the buf item is marked stale, then don't do anything. We'll 568 * If the buf item is marked stale, then don't do anything. We'll
540 * unlock the buffer and free the buf item when the buffer is unpinned 569 * unlock the buffer and free the buf item when the buffer is unpinned
541 * for the last time. 570 * for the last time.
542 */ 571 */
543 if (bip->bli_flags & XFS_BLI_STALE) { 572 if (flags & XFS_BLI_STALE) {
544 trace_xfs_buf_item_unlock_stale(bip); 573 trace_xfs_buf_item_unlock_stale(bip);
545 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); 574 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
546 if (!aborted) { 575 if (!aborted) {
@@ -557,13 +586,19 @@ xfs_buf_item_unlock(
557 * be the only reference to the buf item, so we free it anyway 586 * be the only reference to the buf item, so we free it anyway
558 * regardless of whether it is dirty or not. A dirty abort implies a 587 * regardless of whether it is dirty or not. A dirty abort implies a
559 * shutdown, anyway. 588 * shutdown, anyway.
589 *
590 * Ordered buffers are dirty but may have no recorded changes, so ensure
591 * we only release clean items here.
560 */ 592 */
561 clean = 1; 593 clean = (flags & XFS_BLI_DIRTY) ? false : true;
562 for (i = 0; i < bip->bli_format_count; i++) { 594 if (clean) {
563 if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, 595 int i;
564 bip->bli_formats[i].blf_map_size)) { 596 for (i = 0; i < bip->bli_format_count; i++) {
565 clean = 0; 597 if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
566 break; 598 bip->bli_formats[i].blf_map_size)) {
599 clean = false;
600 break;
601 }
567 } 602 }
568 } 603 }
569 if (clean) 604 if (clean)
@@ -576,7 +611,7 @@ xfs_buf_item_unlock(
576 } else 611 } else
577 atomic_dec(&bip->bli_refcount); 612 atomic_dec(&bip->bli_refcount);
578 613
579 if (!hold) 614 if (!(flags & XFS_BLI_HOLD))
580 xfs_buf_relse(bp); 615 xfs_buf_relse(bp);
581} 616}
582 617
@@ -842,12 +877,6 @@ xfs_buf_item_log(
842 struct xfs_buf *bp = bip->bli_buf; 877 struct xfs_buf *bp = bip->bli_buf;
843 878
844 /* 879 /*
845 * Mark the item as having some dirty data for
846 * quick reference in xfs_buf_item_dirty.
847 */
848 bip->bli_flags |= XFS_BLI_DIRTY;
849
850 /*
851 * walk each buffer segment and mark them dirty appropriately. 880 * walk each buffer segment and mark them dirty appropriately.
852 */ 881 */
853 start = 0; 882 start = 0;
@@ -873,7 +902,7 @@ xfs_buf_item_log(
873 902
874 903
875/* 904/*
876 * Return 1 if the buffer has some data that has been logged (at any 905 * Return 1 if the buffer has been logged or ordered in a transaction (at any
877 * point, not just the current transaction) and 0 if not. 906 * point, not just the current transaction) and 0 if not.
878 */ 907 */
879uint 908uint
@@ -907,11 +936,11 @@ void
907xfs_buf_item_relse( 936xfs_buf_item_relse(
908 xfs_buf_t *bp) 937 xfs_buf_t *bp)
909{ 938{
910 xfs_buf_log_item_t *bip; 939 xfs_buf_log_item_t *bip = bp->b_fspriv;
911 940
912 trace_xfs_buf_item_relse(bp, _RET_IP_); 941 trace_xfs_buf_item_relse(bp, _RET_IP_);
942 ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
913 943
914 bip = bp->b_fspriv;
915 bp->b_fspriv = bip->bli_item.li_bio_list; 944 bp->b_fspriv = bip->bli_item.li_bio_list;
916 if (bp->b_fspriv == NULL) 945 if (bp->b_fspriv == NULL)
917 bp->b_iodone = NULL; 946 bp->b_iodone = NULL;
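
An ordered buffer (XFS_BLI_ORDERED) is logged only for ordering: it is pinned and unpinned with the transaction, but xfs_buf_item_size() reports XFS_LOG_VEC_ORDERED and xfs_buf_item_format() emits no vectors, so none of the buffer's contents travel through the log. A hedged consumer sketch (we assume xfs_trans_ordered_buf() here, the helper introduced alongside this flag in the same patch series):

	/*
	 * Join bp to tp and order it against the commit without
	 * dirtying any byte range in the log item.
	 */
	xfs_trans_ordered_buf(tp, bp);	/* sets XFS_BLI_ORDERED */

Because an ordered buffer is dirty in the transaction while its dirty bitmaps stay empty, xfs_buf_item_unlock() above now consults XFS_BLI_DIRTY before trusting the bitmap scan to decide that an item is clean and can be freed.
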
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 2573d2a75fc8..0f1c247dc680 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -120,6 +120,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
120#define XFS_BLI_INODE_ALLOC_BUF 0x10 120#define XFS_BLI_INODE_ALLOC_BUF 0x10
121#define XFS_BLI_STALE_INODE 0x20 121#define XFS_BLI_STALE_INODE 0x20
122#define XFS_BLI_INODE_BUF 0x40 122#define XFS_BLI_INODE_BUF 0x40
123#define XFS_BLI_ORDERED 0x80
123 124
124#define XFS_BLI_FLAGS \ 125#define XFS_BLI_FLAGS \
125 { XFS_BLI_HOLD, "HOLD" }, \ 126 { XFS_BLI_HOLD, "HOLD" }, \
@@ -128,7 +129,8 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
128 { XFS_BLI_LOGGED, "LOGGED" }, \ 129 { XFS_BLI_LOGGED, "LOGGED" }, \
129 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 130 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
130 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ 131 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
131 { XFS_BLI_INODE_BUF, "INODE_BUF" } 132 { XFS_BLI_INODE_BUF, "INODE_BUF" }, \
133 { XFS_BLI_ORDERED, "ORDERED" }
132 134
133 135
134#ifdef __KERNEL__ 136#ifdef __KERNEL__
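The { bit, name } pairs in XFS_BLI_FLAGS feed trace-event pretty-printing, and the new ORDERED entry keeps that table in step with the flag added above. A user-space sketch of the same table-driven decode, assuming the flag values defined in this header:

#include <stdio.h>
#include <stddef.h>

struct flag_name {
        unsigned int bit;
        const char *name;
};

/* Values mirror the XFS_BLI_* definitions in this header. */
static const struct flag_name bli_flags[] = {
        { 0x01, "HOLD" },        { 0x02, "DIRTY" },
        { 0x04, "STALE" },       { 0x08, "LOGGED" },
        { 0x10, "INODE_ALLOC" }, { 0x20, "STALE_INODE" },
        { 0x40, "INODE_BUF" },   { 0x80, "ORDERED" },
};

static void print_bli_flags(unsigned int flags)
{
        size_t i;

        for (i = 0; i < sizeof(bli_flags) / sizeof(bli_flags[0]); i++)
                if (flags & bli_flags[i].bit)
                        printf("%s ", bli_flags[i].name);
        putchar('\n');
}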
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index c407e1ccff43..e36445ceaf80 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,6 +24,9 @@
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_mount.h" 25#include "xfs_mount.h"
26#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
27#include "xfs_alloc_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
27#include "xfs_dinode.h" 30#include "xfs_dinode.h"
28#include "xfs_inode.h" 31#include "xfs_inode.h"
29#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
@@ -182,7 +185,7 @@ xfs_swap_extents_check_format(
182 */ 185 */
183 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 186 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
184 if (XFS_IFORK_BOFF(ip) && 187 if (XFS_IFORK_BOFF(ip) &&
185 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) 188 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
186 return EINVAL; 189 return EINVAL;
187 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= 190 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
188 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) 191 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
@@ -192,9 +195,8 @@ xfs_swap_extents_check_format(
192 /* Reciprocal target->temp btree format checks */ 195 /* Reciprocal target->temp btree format checks */
193 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 196 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
194 if (XFS_IFORK_BOFF(tip) && 197 if (XFS_IFORK_BOFF(tip) &&
195 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) 198 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
196 return EINVAL; 199 return EINVAL;
197
198 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= 200 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
199 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) 201 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
200 return EINVAL; 202 return EINVAL;
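The swap-format check above now sizes the btree root by the space it needs in the on-disk (bmdr) format, computed from its record count via XFS_BMAP_BMDR_SPACE(), instead of comparing the in-core broot byte count. A simplified sketch of that comparison, with assumed per-record sizes standing in for the real bmbt geometry:

/* Illustrative sizes only; the real values come from the bmbt geometry. */
#define BMDR_HDR_BYTES  4       /* assumed on-disk root block header */
#define BMDR_REC_BYTES  16      /* assumed key + pointer bytes per record */

static unsigned int bmdr_space(unsigned int numrecs)
{
        return BMDR_HDR_BYTES + numrecs * BMDR_REC_BYTES;
}

/* The data fork btree root must fit below the attribute fork offset. */
static int root_fits_fork(unsigned int numrecs, unsigned int fork_boff)
{
        return bmdr_space(numrecs) <= fork_boff;
}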
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index f7a0e95d197a..e5869b50dc41 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -39,6 +39,9 @@ typedef struct xfs_timestamp {
39 * There is a very similar struct icdinode in xfs_inode which matches the 39 * There is a very similar struct icdinode in xfs_inode which matches the
40 * layout of the first 96 bytes of this structure, but is kept in native 40 * layout of the first 96 bytes of this structure, but is kept in native
41 * format instead of big endian. 41 * format instead of big endian.
42 *
43 * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
44 * padding field for v3 inodes.
42 */ 45 */
43typedef struct xfs_dinode { 46typedef struct xfs_dinode {
44 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 47 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
@@ -132,9 +135,6 @@ typedef enum xfs_dinode_fmt {
132#define XFS_LITINO(mp, version) \ 135#define XFS_LITINO(mp, version) \
133 ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) 136 ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
134 137
135#define XFS_BROOT_SIZE_ADJ(ip) \
136 (XFS_BMBT_BLOCK_LEN((ip)->i_mount) - sizeof(xfs_bmdr_block_t))
137
138/* 138/*
139 * Inode data & attribute fork sizes, per inode. 139 * Inode data & attribute fork sizes, per inode.
140 */ 140 */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index b26a50f9921d..8f023dee404d 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -368,10 +368,8 @@ xfs_dir_removename(
368int 368int
369xfs_readdir( 369xfs_readdir(
370 xfs_inode_t *dp, 370 xfs_inode_t *dp,
371 void *dirent, 371 struct dir_context *ctx,
372 size_t bufsize, 372 size_t bufsize)
373 xfs_off_t *offset,
374 filldir_t filldir)
375{ 373{
376 int rval; /* return value */ 374 int rval; /* return value */
377 int v; /* type-checking value */ 375 int v; /* type-checking value */
@@ -385,14 +383,13 @@ xfs_readdir(
385 XFS_STATS_INC(xs_dir_getdents); 383 XFS_STATS_INC(xs_dir_getdents);
386 384
387 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 385 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
388 rval = xfs_dir2_sf_getdents(dp, dirent, offset, filldir); 386 rval = xfs_dir2_sf_getdents(dp, ctx);
389 else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) 387 else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
390 ; 388 ;
391 else if (v) 389 else if (v)
392 rval = xfs_dir2_block_getdents(dp, dirent, offset, filldir); 390 rval = xfs_dir2_block_getdents(dp, ctx);
393 else 391 else
394 rval = xfs_dir2_leaf_getdents(dp, dirent, bufsize, offset, 392 rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
395 filldir);
396 return rval; 393 return rval;
397} 394}
398 395
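With the readdir rework visible here, the position and the fill callback travel together in struct dir_context: ctx->pos replaces the xfs_off_t pointer and dir_emit() replaces the filldir_t call. A minimal sketch of an iterator in the new style, emitting one hypothetical entry (not XFS code):

#include <linux/fs.h>

static int example_iterate(struct file *file, struct dir_context *ctx)
{
        if (ctx->pos > 0)
                return 0;               /* entry already returned */

        /* ctx->pos names the entry about to be emitted */
        if (!dir_emit(ctx, "example", 7, 42 /* ino */, DT_REG))
                return 0;               /* buffer full; resume here later */

        ctx->pos = 1;                   /* advance past the entry */
        return 0;
}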
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e59f5fc816fe..5e7fbd72cf52 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -29,6 +29,7 @@
29#include "xfs_dinode.h" 29#include "xfs_dinode.h"
30#include "xfs_inode.h" 30#include "xfs_inode.h"
31#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
32#include "xfs_bmap.h"
32#include "xfs_buf_item.h" 33#include "xfs_buf_item.h"
33#include "xfs_dir2.h" 34#include "xfs_dir2.h"
34#include "xfs_dir2_format.h" 35#include "xfs_dir2_format.h"
@@ -569,9 +570,7 @@ xfs_dir2_block_addname(
569int /* error */ 570int /* error */
570xfs_dir2_block_getdents( 571xfs_dir2_block_getdents(
571 xfs_inode_t *dp, /* incore inode */ 572 xfs_inode_t *dp, /* incore inode */
572 void *dirent, 573 struct dir_context *ctx)
573 xfs_off_t *offset,
574 filldir_t filldir)
575{ 574{
576 xfs_dir2_data_hdr_t *hdr; /* block header */ 575 xfs_dir2_data_hdr_t *hdr; /* block header */
577 struct xfs_buf *bp; /* buffer for block */ 576 struct xfs_buf *bp; /* buffer for block */
@@ -589,7 +588,7 @@ xfs_dir2_block_getdents(
589 /* 588 /*
590 * If the block number in the offset is out of range, we're done. 589 * If the block number in the offset is out of range, we're done.
591 */ 590 */
592 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) 591 if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
593 return 0; 592 return 0;
594 593
595 error = xfs_dir3_block_read(NULL, dp, &bp); 594 error = xfs_dir3_block_read(NULL, dp, &bp);
@@ -600,7 +599,7 @@ xfs_dir2_block_getdents(
600 * Extract the byte offset we start at from the seek pointer. 599 * Extract the byte offset we start at from the seek pointer.
601 * We'll skip entries before this. 600 * We'll skip entries before this.
602 */ 601 */
603 wantoff = xfs_dir2_dataptr_to_off(mp, *offset); 602 wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
604 hdr = bp->b_addr; 603 hdr = bp->b_addr;
605 xfs_dir3_data_check(dp, bp); 604 xfs_dir3_data_check(dp, bp);
606 /* 605 /*
@@ -639,13 +638,12 @@ xfs_dir2_block_getdents(
639 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, 638 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
640 (char *)dep - (char *)hdr); 639 (char *)dep - (char *)hdr);
641 640
641 ctx->pos = cook & 0x7fffffff;
642 /* 642 /*
643 * If it didn't fit, set the final offset to here & return. 643 * If it didn't fit, set the final offset to here & return.
644 */ 644 */
645 if (filldir(dirent, (char *)dep->name, dep->namelen, 645 if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
646 cook & 0x7fffffff, be64_to_cpu(dep->inumber), 646 be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
647 DT_UNKNOWN)) {
648 *offset = cook & 0x7fffffff;
649 xfs_trans_brelse(NULL, bp); 647 xfs_trans_brelse(NULL, bp);
650 return 0; 648 return 0;
651 } 649 }
@@ -655,7 +653,7 @@ xfs_dir2_block_getdents(
655 * Reached the end of the block. 653 * Reached the end of the block.
656 * Set the offset to a non-existent block 1 and return. 654 * Set the offset to a non-existent block 1 and return.
657 */ 655 */
658 *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 656 ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
659 0x7fffffff; 657 0x7fffffff;
660 xfs_trans_brelse(NULL, bp); 658 xfs_trans_brelse(NULL, bp);
661 return 0; 659 return 0;
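The cookies stored in ctx->pos pack a directory block number together with an in-block offset, and are masked with 0x7fffffff so the readdir offset stays a positive 31-bit value. A simplified sketch of the packing, assuming a fixed 12-bit in-block offset in place of the mount-dependent geometry:

#include <stdint.h>

#define OFF_BITS        12              /* assumed in-block offset width */
#define COOKIE_MASK     0x7fffffffULL   /* keep readdir offsets positive */

static uint64_t db_off_to_cookie(uint32_t db, uint32_t off)
{
        return (((uint64_t)db << OFF_BITS) | off) & COOKIE_MASK;
}

static uint32_t cookie_to_db(uint64_t cookie)
{
        return (uint32_t)(cookie >> OFF_BITS);
}

static uint32_t cookie_to_off(uint64_t cookie)
{
        return (uint32_t)(cookie & ((1u << OFF_BITS) - 1));
}

Setting ctx->pos to a cookie past the last block (m_dirdatablk + 1 above) is what makes a subsequent call fail the range check and terminate the listing.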
@@ -1167,13 +1165,15 @@ xfs_dir2_sf_to_block(
1167 __be16 *tagp; /* end of data entry */ 1165 __be16 *tagp; /* end of data entry */
1168 xfs_trans_t *tp; /* transaction pointer */ 1166 xfs_trans_t *tp; /* transaction pointer */
1169 struct xfs_name name; 1167 struct xfs_name name;
1168 struct xfs_ifork *ifp;
1170 1169
1171 trace_xfs_dir2_sf_to_block(args); 1170 trace_xfs_dir2_sf_to_block(args);
1172 1171
1173 dp = args->dp; 1172 dp = args->dp;
1174 tp = args->trans; 1173 tp = args->trans;
1175 mp = dp->i_mount; 1174 mp = dp->i_mount;
1176 ASSERT(dp->i_df.if_flags & XFS_IFINLINE); 1175 ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
1176 ASSERT(ifp->if_flags & XFS_IFINLINE);
1177 /* 1177 /*
1178 * Bomb out if the shortform directory is way too short. 1178 * Bomb out if the shortform directory is way too short.
1179 */ 1179 */
@@ -1182,22 +1182,23 @@ xfs_dir2_sf_to_block(
1182 return XFS_ERROR(EIO); 1182 return XFS_ERROR(EIO);
1183 } 1183 }
1184 1184
1185 oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 1185 oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
1186 1186
1187 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 1187 ASSERT(ifp->if_bytes == dp->i_d.di_size);
1188 ASSERT(dp->i_df.if_u1.if_data != NULL); 1188 ASSERT(ifp->if_u1.if_data != NULL);
1189 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); 1189 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
1190 ASSERT(dp->i_d.di_nextents == 0);
1190 1191
1191 /* 1192 /*
1192 * Copy the directory into a temporary buffer. 1193 * Copy the directory into a temporary buffer.
1193 * Then pitch the incore inode data so we can make extents. 1194 * Then pitch the incore inode data so we can make extents.
1194 */ 1195 */
1195 sfp = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP); 1196 sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
1196 memcpy(sfp, oldsfp, dp->i_df.if_bytes); 1197 memcpy(sfp, oldsfp, ifp->if_bytes);
1197 1198
1198 xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK); 1199 xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
1200 xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
1199 dp->i_d.di_size = 0; 1201 dp->i_d.di_size = 0;
1200 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1201 1202
1202 /* 1203 /*
1203 * Add block 0 to the inode. 1204 * Add block 0 to the inode.
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index da71a1819d78..2aed25cae04d 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1108,6 +1108,7 @@ xfs_dir2_leaf_readbuf(
1108 struct xfs_mount *mp = dp->i_mount; 1108 struct xfs_mount *mp = dp->i_mount;
1109 struct xfs_buf *bp = *bpp; 1109 struct xfs_buf *bp = *bpp;
1110 struct xfs_bmbt_irec *map = mip->map; 1110 struct xfs_bmbt_irec *map = mip->map;
1111 struct blk_plug plug;
1111 int error = 0; 1112 int error = 0;
1112 int length; 1113 int length;
1113 int i; 1114 int i;
@@ -1236,6 +1237,7 @@ xfs_dir2_leaf_readbuf(
1236 /* 1237 /*
1237 * Do we need more readahead? 1238 * Do we need more readahead?
1238 */ 1239 */
1240 blk_start_plug(&plug);
1239 for (mip->ra_index = mip->ra_offset = i = 0; 1241 for (mip->ra_index = mip->ra_offset = i = 0;
1240 mip->ra_want > mip->ra_current && i < mip->map_blocks; 1242 mip->ra_want > mip->ra_current && i < mip->map_blocks;
1241 i += mp->m_dirblkfsbs) { 1243 i += mp->m_dirblkfsbs) {
@@ -1287,6 +1289,7 @@ xfs_dir2_leaf_readbuf(
1287 } 1289 }
1288 } 1290 }
1289 } 1291 }
1292 blk_finish_plug(&plug);
1290 1293
1291out: 1294out:
1292 *bpp = bp; 1295 *bpp = bp;
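The plug added here wraps the whole readahead loop so the block layer can merge and batch the queued requests before dispatching them. The pattern in isolation, sketched with a hypothetical submission callback:

#include <linux/blkdev.h>

static void readahead_batch(void (*issue_one)(int arg), int count)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);
        for (i = 0; i < count; i++)
                issue_one(i);           /* requests accumulate on the plug */
        blk_finish_plug(&plug);         /* flush the whole batch down */
}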
@@ -1300,10 +1303,8 @@ out:
1300int /* error */ 1303int /* error */
1301xfs_dir2_leaf_getdents( 1304xfs_dir2_leaf_getdents(
1302 xfs_inode_t *dp, /* incore directory inode */ 1305 xfs_inode_t *dp, /* incore directory inode */
1303 void *dirent, 1306 struct dir_context *ctx,
1304 size_t bufsize, 1307 size_t bufsize)
1305 xfs_off_t *offset,
1306 filldir_t filldir)
1307{ 1308{
1308 struct xfs_buf *bp = NULL; /* data block buffer */ 1309 struct xfs_buf *bp = NULL; /* data block buffer */
1309 xfs_dir2_data_hdr_t *hdr; /* data block header */ 1310 xfs_dir2_data_hdr_t *hdr; /* data block header */
@@ -1322,7 +1323,7 @@ xfs_dir2_leaf_getdents(
1322 * If the offset is at or past the largest allowed value, 1323 * If the offset is at or past the largest allowed value,
1323 * give up right away. 1324 * give up right away.
1324 */ 1325 */
1325 if (*offset >= XFS_DIR2_MAX_DATAPTR) 1326 if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
1326 return 0; 1327 return 0;
1327 1328
1328 mp = dp->i_mount; 1329 mp = dp->i_mount;
@@ -1343,7 +1344,7 @@ xfs_dir2_leaf_getdents(
1343 * Inside the loop we keep the main offset value as a byte offset 1344 * Inside the loop we keep the main offset value as a byte offset
1344 * in the directory file. 1345 * in the directory file.
1345 */ 1346 */
1346 curoff = xfs_dir2_dataptr_to_byte(mp, *offset); 1347 curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
1347 1348
1348 /* 1349 /*
1349 * Force this conversion through db so we truncate the offset 1350 * Force this conversion through db so we truncate the offset
@@ -1444,8 +1445,8 @@ xfs_dir2_leaf_getdents(
1444 dep = (xfs_dir2_data_entry_t *)ptr; 1445 dep = (xfs_dir2_data_entry_t *)ptr;
1445 length = xfs_dir2_data_entsize(dep->namelen); 1446 length = xfs_dir2_data_entsize(dep->namelen);
1446 1447
1447 if (filldir(dirent, (char *)dep->name, dep->namelen, 1448 ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
1448 xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff, 1449 if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
1449 be64_to_cpu(dep->inumber), DT_UNKNOWN)) 1450 be64_to_cpu(dep->inumber), DT_UNKNOWN))
1450 break; 1451 break;
1451 1452
@@ -1462,9 +1463,9 @@ xfs_dir2_leaf_getdents(
1462 * All done. Set output offset value to current offset. 1463 * All done. Set output offset value to current offset.
1463 */ 1464 */
1464 if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR)) 1465 if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
1465 *offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; 1466 ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
1466 else 1467 else
1467 *offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff; 1468 ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
1468 kmem_free(map_info); 1469 kmem_free(map_info);
1469 if (bp) 1470 if (bp)
1470 xfs_trans_brelse(NULL, bp); 1471 xfs_trans_brelse(NULL, bp);
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 7cf573c88aad..0511cda4a712 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -33,8 +33,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
33extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; 33extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
34 34
35extern int xfs_dir2_block_addname(struct xfs_da_args *args); 35extern int xfs_dir2_block_addname(struct xfs_da_args *args);
36extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, 36extern int xfs_dir2_block_getdents(struct xfs_inode *dp,
37 xfs_off_t *offset, filldir_t filldir); 37 struct dir_context *ctx);
38extern int xfs_dir2_block_lookup(struct xfs_da_args *args); 38extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
39extern int xfs_dir2_block_removename(struct xfs_da_args *args); 39extern int xfs_dir2_block_removename(struct xfs_da_args *args);
40extern int xfs_dir2_block_replace(struct xfs_da_args *args); 40extern int xfs_dir2_block_replace(struct xfs_da_args *args);
@@ -91,8 +91,8 @@ extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
91extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, 91extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
92 struct xfs_dir2_leaf_entry *ents, int *indexp, 92 struct xfs_dir2_leaf_entry *ents, int *indexp,
93 int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); 93 int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
94extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, void *dirent, 94extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx,
95 size_t bufsize, xfs_off_t *offset, filldir_t filldir); 95 size_t bufsize);
96extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, 96extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
97 struct xfs_buf **bpp, __uint16_t magic); 97 struct xfs_buf **bpp, __uint16_t magic);
98extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, 98extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
@@ -153,8 +153,7 @@ extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
153 int size, xfs_dir2_sf_hdr_t *sfhp); 153 int size, xfs_dir2_sf_hdr_t *sfhp);
154extern int xfs_dir2_sf_addname(struct xfs_da_args *args); 154extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
155extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); 155extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
156extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, void *dirent, 156extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx);
157 xfs_off_t *offset, filldir_t filldir);
158extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); 157extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
159extern int xfs_dir2_sf_removename(struct xfs_da_args *args); 158extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
160extern int xfs_dir2_sf_replace(struct xfs_da_args *args); 159extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 6157424dbf8f..97676a347da1 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -768,9 +768,7 @@ xfs_dir2_sf_create(
768int /* error */ 768int /* error */
769xfs_dir2_sf_getdents( 769xfs_dir2_sf_getdents(
770 xfs_inode_t *dp, /* incore directory inode */ 770 xfs_inode_t *dp, /* incore directory inode */
771 void *dirent, 771 struct dir_context *ctx)
772 xfs_off_t *offset,
773 filldir_t filldir)
774{ 772{
775 int i; /* shortform entry number */ 773 int i; /* shortform entry number */
776 xfs_mount_t *mp; /* filesystem mount point */ 774 xfs_mount_t *mp; /* filesystem mount point */
@@ -802,7 +800,7 @@ xfs_dir2_sf_getdents(
802 /* 800 /*
803 * If the block number in the offset is out of range, we're done. 801 * If the block number in the offset is out of range, we're done.
804 */ 802 */
805 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) 803 if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
806 return 0; 804 return 0;
807 805
808 /* 806 /*
@@ -819,22 +817,20 @@ xfs_dir2_sf_getdents(
819 /* 817 /*
820 * Put . entry unless we're starting past it. 818 * Put . entry unless we're starting past it.
821 */ 819 */
822 if (*offset <= dot_offset) { 820 if (ctx->pos <= dot_offset) {
823 if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, dp->i_ino, DT_DIR)) { 821 ctx->pos = dot_offset & 0x7fffffff;
824 *offset = dot_offset & 0x7fffffff; 822 if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
825 return 0; 823 return 0;
826 }
827 } 824 }
828 825
829 /* 826 /*
830 * Put .. entry unless we're starting past it. 827 * Put .. entry unless we're starting past it.
831 */ 828 */
832 if (*offset <= dotdot_offset) { 829 if (ctx->pos <= dotdot_offset) {
833 ino = xfs_dir2_sf_get_parent_ino(sfp); 830 ino = xfs_dir2_sf_get_parent_ino(sfp);
834 if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) { 831 ctx->pos = dotdot_offset & 0x7fffffff;
835 *offset = dotdot_offset & 0x7fffffff; 832 if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
836 return 0; 833 return 0;
837 }
838 } 834 }
839 835
840 /* 836 /*
@@ -845,21 +841,20 @@ xfs_dir2_sf_getdents(
845 off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, 841 off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
846 xfs_dir2_sf_get_offset(sfep)); 842 xfs_dir2_sf_get_offset(sfep));
847 843
848 if (*offset > off) { 844 if (ctx->pos > off) {
849 sfep = xfs_dir2_sf_nextentry(sfp, sfep); 845 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
850 continue; 846 continue;
851 } 847 }
852 848
853 ino = xfs_dir2_sfe_get_ino(sfp, sfep); 849 ino = xfs_dir2_sfe_get_ino(sfp, sfep);
854 if (filldir(dirent, (char *)sfep->name, sfep->namelen, 850 ctx->pos = off & 0x7fffffff;
855 off & 0x7fffffff, ino, DT_UNKNOWN)) { 851 if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen,
856 *offset = off & 0x7fffffff; 852 ino, DT_UNKNOWN))
857 return 0; 853 return 0;
858 }
859 sfep = xfs_dir2_sf_nextentry(sfp, sfep); 854 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
860 } 855 }
861 856
862 *offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) & 857 ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
863 0x7fffffff; 858 0x7fffffff;
864 return 0; 859 return 0;
865} 860}
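All three getdents paths (shortform, block, leaf) now follow the same resume invariant: ctx->pos is set to the entry's cookie before dir_emit(), so a full user buffer leaves the position naming the entry that did not fit and the next call retries it. The shape as a hypothetical helper, not part of the patch:

#include <linux/fs.h>

static bool emit_at(struct dir_context *ctx, const char *name, int namelen,
                    u64 ino, loff_t cookie)
{
        ctx->pos = cookie & 0x7fffffff; /* position first ... */
        return dir_emit(ctx, name, namelen, ino, DT_UNKNOWN);
                                        /* ... so a false return resumes here */
}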
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 044e97a33c8d..0adf27ecf3f1 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -570,13 +570,13 @@ xfs_qm_dqtobp(
570 xfs_buf_t **O_bpp, 570 xfs_buf_t **O_bpp,
571 uint flags) 571 uint flags)
572{ 572{
573 xfs_bmbt_irec_t map; 573 struct xfs_bmbt_irec map;
574 int nmaps = 1, error; 574 int nmaps = 1, error;
575 xfs_buf_t *bp; 575 struct xfs_buf *bp;
576 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); 576 struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp);
577 xfs_mount_t *mp = dqp->q_mount; 577 struct xfs_mount *mp = dqp->q_mount;
578 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); 578 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
579 xfs_trans_t *tp = (tpp ? *tpp : NULL); 579 struct xfs_trans *tp = (tpp ? *tpp : NULL);
580 580
581 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; 581 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
582 582
@@ -804,7 +804,7 @@ xfs_qm_dqget(
804 xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ 804 xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
805{ 805{
806 struct xfs_quotainfo *qi = mp->m_quotainfo; 806 struct xfs_quotainfo *qi = mp->m_quotainfo;
807 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 807 struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
808 struct xfs_dquot *dqp; 808 struct xfs_dquot *dqp;
809 int error; 809 int error;
810 810
@@ -936,6 +936,7 @@ xfs_qm_dqput_final(
936{ 936{
937 struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; 937 struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
938 struct xfs_dquot *gdqp; 938 struct xfs_dquot *gdqp;
939 struct xfs_dquot *pdqp;
939 940
940 trace_xfs_dqput_free(dqp); 941 trace_xfs_dqput_free(dqp);
941 942
@@ -949,21 +950,29 @@ xfs_qm_dqput_final(
949 950
950 /* 951 /*
951 * If we just added a udquot to the freelist, then we want to release 952 * If we just added a udquot to the freelist, then we want to release
952 * the gdquot reference that it (probably) has. Otherwise it'll keep 953 * the gdquot/pdquot reference that it (probably) has. Otherwise it'll
953 * the gdquot from getting reclaimed. 954 * keep the gdquot/pdquot from getting reclaimed.
954 */ 955 */
955 gdqp = dqp->q_gdquot; 956 gdqp = dqp->q_gdquot;
956 if (gdqp) { 957 if (gdqp) {
957 xfs_dqlock(gdqp); 958 xfs_dqlock(gdqp);
958 dqp->q_gdquot = NULL; 959 dqp->q_gdquot = NULL;
959 } 960 }
961
962 pdqp = dqp->q_pdquot;
963 if (pdqp) {
964 xfs_dqlock(pdqp);
965 dqp->q_pdquot = NULL;
966 }
960 xfs_dqunlock(dqp); 967 xfs_dqunlock(dqp);
961 968
962 /* 969 /*
963 * If we had a group quota hint, release it now. 970 * If we had a group/project quota hint, release it now.
964 */ 971 */
965 if (gdqp) 972 if (gdqp)
966 xfs_qm_dqput(gdqp); 973 xfs_qm_dqput(gdqp);
974 if (pdqp)
975 xfs_qm_dqput(pdqp);
967} 976}
968 977
969/* 978/*
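The release path above detaches both hint pointers while the locks are held and only drops the references after unlocking, keeping xfs_qm_dqput() out of the locked region. A user-space sketch of the detach-then-put shape, simplified to a single lock and illustrative types:

#include <pthread.h>
#include <stddef.h>

struct dq {
        pthread_mutex_t lock;
        struct dq *gd_hint;             /* group quota hint */
        struct dq *pd_hint;             /* project quota hint */
};

static void dq_put(struct dq *dqp)
{
        /* refcount release elided in this sketch */
}

static void dq_put_final(struct dq *dqp)
{
        struct dq *gd, *pd;

        pthread_mutex_lock(&dqp->lock);
        gd = dqp->gd_hint;
        dqp->gd_hint = NULL;
        pd = dqp->pd_hint;
        dqp->pd_hint = NULL;
        pthread_mutex_unlock(&dqp->lock);

        if (gd)
                dq_put(gd);
        if (pd)
                dq_put(pd);
}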
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 4f0ebfc43cc9..55abbca2883d 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -53,6 +53,7 @@ typedef struct xfs_dquot {
53 xfs_fileoff_t q_fileoffset; /* offset in quotas file */ 53 xfs_fileoff_t q_fileoffset; /* offset in quotas file */
54 54
55 struct xfs_dquot*q_gdquot; /* group dquot, hint only */ 55 struct xfs_dquot*q_gdquot; /* group dquot, hint only */
56 struct xfs_dquot*q_pdquot; /* project dquot, hint only */
56 xfs_disk_dquot_t q_core; /* actual usage & quotas */ 57 xfs_disk_dquot_t q_core; /* actual usage & quotas */
57 xfs_dq_logitem_t q_logitem; /* dquot log item */ 58 xfs_dq_logitem_t q_logitem; /* dquot log item */
58 xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ 59 xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
@@ -118,8 +119,9 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
118 case XFS_DQ_USER: 119 case XFS_DQ_USER:
119 return XFS_IS_UQUOTA_ON(mp); 120 return XFS_IS_UQUOTA_ON(mp);
120 case XFS_DQ_GROUP: 121 case XFS_DQ_GROUP:
122 return XFS_IS_GQUOTA_ON(mp);
121 case XFS_DQ_PROJ: 123 case XFS_DQ_PROJ:
122 return XFS_IS_OQUOTA_ON(mp); 124 return XFS_IS_PQUOTA_ON(mp);
123 default: 125 default:
124 return 0; 126 return 0;
125 } 127 }
@@ -131,8 +133,9 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
131 case XFS_DQ_USER: 133 case XFS_DQ_USER:
132 return ip->i_udquot; 134 return ip->i_udquot;
133 case XFS_DQ_GROUP: 135 case XFS_DQ_GROUP:
134 case XFS_DQ_PROJ:
135 return ip->i_gdquot; 136 return ip->i_gdquot;
137 case XFS_DQ_PROJ:
138 return ip->i_pdquot;
136 default: 139 default:
137 return NULL; 140 return NULL;
138 } 141 }
@@ -143,10 +146,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
143#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 146#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
144#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) 147#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
145#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) 148#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
146#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo)
147#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \
148 XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
149 XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
150 149
151extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, 150extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
152 uint, struct xfs_dquot **); 151 uint, struct xfs_dquot **);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a5f2042aec8b..de3dc98f4e8f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -906,11 +906,10 @@ xfs_file_release(
906 906
907STATIC int 907STATIC int
908xfs_file_readdir( 908xfs_file_readdir(
909 struct file *filp, 909 struct file *file,
910 void *dirent, 910 struct dir_context *ctx)
911 filldir_t filldir)
912{ 911{
913 struct inode *inode = file_inode(filp); 912 struct inode *inode = file_inode(file);
914 xfs_inode_t *ip = XFS_I(inode); 913 xfs_inode_t *ip = XFS_I(inode);
915 int error; 914 int error;
916 size_t bufsize; 915 size_t bufsize;
@@ -929,8 +928,7 @@ xfs_file_readdir(
929 */ 928 */
930 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); 929 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
931 930
932 error = xfs_readdir(ip, dirent, bufsize, 931 error = xfs_readdir(ip, ctx, bufsize);
933 (xfs_off_t *)&filp->f_pos, filldir);
934 if (error) 932 if (error)
935 return -error; 933 return -error;
936 return 0; 934 return 0;
@@ -1270,8 +1268,7 @@ xfs_seek_data(
1270 } 1268 }
1271 1269
1272out: 1270out:
1273 if (offset != file->f_pos) 1271 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1274 file->f_pos = offset;
1275 1272
1276out_unlock: 1273out_unlock:
1277 xfs_iunlock_map_shared(ip, lock); 1274 xfs_iunlock_map_shared(ip, lock);
@@ -1379,8 +1376,7 @@ out:
1379 * situation in particular. 1376 * situation in particular.
1380 */ 1377 */
1381 offset = min_t(loff_t, offset, isize); 1378 offset = min_t(loff_t, offset, isize);
1382 if (offset != file->f_pos) 1379 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1383 file->f_pos = offset;
1384 1380
1385out_unlock: 1381out_unlock:
1386 xfs_iunlock_map_shared(ip, lock); 1382 xfs_iunlock_map_shared(ip, lock);
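vfs_setpos() folds the open-coded compare-and-assign into a helper that also validates the offset against the filesystem maximum. Roughly what it does, sketched in user space (the kernel helper additionally resets the cached f_version when the position changes):

#include <sys/types.h>

static off_t setpos(off_t *f_pos, off_t offset, off_t maxsize)
{
        if (offset < 0 || offset > maxsize)
                return -1;              /* kernel returns -EINVAL here */
        if (offset != *f_pos)
                *f_pos = offset;
        return offset;
}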
@@ -1432,7 +1428,7 @@ const struct file_operations xfs_file_operations = {
1432const struct file_operations xfs_dir_file_operations = { 1428const struct file_operations xfs_dir_file_operations = {
1433 .open = xfs_dir_open, 1429 .open = xfs_dir_open,
1434 .read = generic_read_dir, 1430 .read = generic_read_dir,
1435 .readdir = xfs_file_readdir, 1431 .iterate = xfs_file_readdir,
1436 .llseek = generic_file_llseek, 1432 .llseek = generic_file_llseek,
1437 .unlocked_ioctl = xfs_file_ioctl, 1433 .unlocked_ioctl = xfs_file_ioctl,
1438#ifdef CONFIG_COMPAT 1434#ifdef CONFIG_COMPAT
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 3c3644ea825b..614eb0cc3608 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -176,7 +176,7 @@ xfs_growfs_data_private(
176 if (!bp) 176 if (!bp)
177 return EIO; 177 return EIO;
178 if (bp->b_error) { 178 if (bp->b_error) {
179 int error = bp->b_error; 179 error = bp->b_error;
180 xfs_buf_relse(bp); 180 xfs_buf_relse(bp);
181 return error; 181 return error;
182 } 182 }
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c8f5ae1debf2..7a0c17d7ec09 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -38,6 +38,7 @@
38#include "xfs_bmap.h" 38#include "xfs_bmap.h"
39#include "xfs_cksum.h" 39#include "xfs_cksum.h"
40#include "xfs_buf_item.h" 40#include "xfs_buf_item.h"
41#include "xfs_icreate_item.h"
41 42
42 43
43/* 44/*
@@ -150,12 +151,16 @@ xfs_check_agi_freecount(
150#endif 151#endif
151 152
152/* 153/*
153 * Initialise a new set of inodes. 154 * Initialise a new set of inodes. When called without a transaction context
155 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
156 * than logging them (which in a transaction context puts them into the AIL
157 * for writeback rather than the xfsbufd queue).
154 */ 158 */
155STATIC int 159int
156xfs_ialloc_inode_init( 160xfs_ialloc_inode_init(
157 struct xfs_mount *mp, 161 struct xfs_mount *mp,
158 struct xfs_trans *tp, 162 struct xfs_trans *tp,
163 struct list_head *buffer_list,
159 xfs_agnumber_t agno, 164 xfs_agnumber_t agno,
160 xfs_agblock_t agbno, 165 xfs_agblock_t agbno,
161 xfs_agblock_t length, 166 xfs_agblock_t length,
@@ -208,6 +213,18 @@ xfs_ialloc_inode_init(
208 version = 3; 213 version = 3;
209 ino = XFS_AGINO_TO_INO(mp, agno, 214 ino = XFS_AGINO_TO_INO(mp, agno,
210 XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); 215 XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
216
217 /*
 218 * log the initialisation that is about to take place as a
 219 * logical operation. This means the transaction does not
220 * need to log the physical changes to the inode buffers as log
221 * recovery will know what initialisation is actually needed.
222 * Hence we only need to log the buffers as "ordered" buffers so
223 * they track in the AIL as if they were physically logged.
224 */
225 if (tp)
226 xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp),
227 mp->m_sb.sb_inodesize, length, gen);
211 } else if (xfs_sb_version_hasnlink(&mp->m_sb)) 228 } else if (xfs_sb_version_hasnlink(&mp->m_sb))
212 version = 2; 229 version = 2;
213 else 230 else
@@ -223,13 +240,8 @@ xfs_ialloc_inode_init(
223 XBF_UNMAPPED); 240 XBF_UNMAPPED);
224 if (!fbuf) 241 if (!fbuf)
225 return ENOMEM; 242 return ENOMEM;
226 /* 243
227 * Initialize all inodes in this buffer and then log them. 244 /* Initialize the inode buffers and log them appropriately. */
228 *
229 * XXX: It would be much better if we had just one transaction
230 * to log a whole cluster of inodes instead of all the
231 * individual transactions causing a lot of log traffic.
232 */
233 fbuf->b_ops = &xfs_inode_buf_ops; 245 fbuf->b_ops = &xfs_inode_buf_ops;
234 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); 246 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
235 for (i = 0; i < ninodes; i++) { 247 for (i = 0; i < ninodes; i++) {
@@ -247,18 +259,39 @@ xfs_ialloc_inode_init(
247 ino++; 259 ino++;
248 uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); 260 uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
249 xfs_dinode_calc_crc(mp, free); 261 xfs_dinode_calc_crc(mp, free);
250 } else { 262 } else if (tp) {
251 /* just log the inode core */ 263 /* just log the inode core */
252 xfs_trans_log_buf(tp, fbuf, ioffset, 264 xfs_trans_log_buf(tp, fbuf, ioffset,
253 ioffset + isize - 1); 265 ioffset + isize - 1);
254 } 266 }
255 } 267 }
256 if (version == 3) { 268
257 /* need to log the entire buffer */ 269 if (tp) {
258 xfs_trans_log_buf(tp, fbuf, 0, 270 /*
259 BBTOB(fbuf->b_length) - 1); 271 * Mark the buffer as an inode allocation buffer so it
 272 * sticks in the AIL at the point of this allocation
 273 * transaction. This ensures that it is on disk before
274 * the tail of the log can be moved past this
275 * transaction (i.e. by preventing relogging from moving
276 * it forward in the log).
277 */
278 xfs_trans_inode_alloc_buf(tp, fbuf);
279 if (version == 3) {
280 /*
 281 * Mark the buffer as ordered so that it is
 282 * not physically logged in the transaction but
 283 * is still tracked in the AIL as part of the
 284 * transaction and still pins the log appropriately.
285 */
286 xfs_trans_ordered_buf(tp, fbuf);
287 xfs_trans_log_buf(tp, fbuf, 0,
288 BBTOB(fbuf->b_length) - 1);
289 }
290 } else {
291 fbuf->b_flags |= XBF_DONE;
292 xfs_buf_delwri_queue(fbuf, buffer_list);
293 xfs_buf_relse(fbuf);
260 } 294 }
261 xfs_trans_inode_alloc_buf(tp, fbuf);
262 } 295 }
263 return 0; 296 return 0;
264} 297}
@@ -303,7 +336,7 @@ xfs_ialloc_ag_alloc(
303 * First try to allocate inodes contiguous with the last-allocated 336 * First try to allocate inodes contiguous with the last-allocated
304 * chunk of inodes. If the filesystem is striped, this will fill 337 * chunk of inodes. If the filesystem is striped, this will fill
305 * an entire stripe unit with inodes. 338 * an entire stripe unit with inodes.
306 */ 339 */
307 agi = XFS_BUF_TO_AGI(agbp); 340 agi = XFS_BUF_TO_AGI(agbp);
308 newino = be32_to_cpu(agi->agi_newino); 341 newino = be32_to_cpu(agi->agi_newino);
309 agno = be32_to_cpu(agi->agi_seqno); 342 agno = be32_to_cpu(agi->agi_seqno);
@@ -402,7 +435,7 @@ xfs_ialloc_ag_alloc(
402 * rather than a linear progression to prevent the next generation 435 * rather than a linear progression to prevent the next generation
403 * number from being easily guessable. 436 * number from being easily guessable.
404 */ 437 */
405 error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, 438 error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
406 args.len, prandom_u32()); 439 args.len, prandom_u32());
407 440
408 if (error) 441 if (error)
@@ -615,8 +648,7 @@ xfs_ialloc_get_rec(
615 struct xfs_btree_cur *cur, 648 struct xfs_btree_cur *cur,
616 xfs_agino_t agino, 649 xfs_agino_t agino,
617 xfs_inobt_rec_incore_t *rec, 650 xfs_inobt_rec_incore_t *rec,
618 int *done, 651 int *done)
619 int left)
620{ 652{
621 int error; 653 int error;
622 int i; 654 int i;
@@ -724,12 +756,12 @@ xfs_dialloc_ag(
724 pag->pagl_leftrec != NULLAGINO && 756 pag->pagl_leftrec != NULLAGINO &&
725 pag->pagl_rightrec != NULLAGINO) { 757 pag->pagl_rightrec != NULLAGINO) {
726 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, 758 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
727 &trec, &doneleft, 1); 759 &trec, &doneleft);
728 if (error) 760 if (error)
729 goto error1; 761 goto error1;
730 762
731 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, 763 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
732 &rec, &doneright, 0); 764 &rec, &doneright);
733 if (error) 765 if (error)
734 goto error1; 766 goto error1;
735 } else { 767 } else {
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index c8da3df271e6..68c07320f096 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,6 +150,14 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
150int xfs_inobt_get_rec(struct xfs_btree_cur *cur, 150int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
151 xfs_inobt_rec_incore_t *rec, int *stat); 151 xfs_inobt_rec_incore_t *rec, int *stat);
152 152
153/*
154 * Inode chunk initialisation routine
155 */
156int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
157 struct list_head *buffer_list,
158 xfs_agnumber_t agno, xfs_agblock_t agbno,
159 xfs_agblock_t length, unsigned int gen);
160
153extern const struct xfs_buf_ops xfs_agi_buf_ops; 161extern const struct xfs_buf_ops xfs_agi_buf_ops;
154 162
155#endif /* __XFS_IALLOC_H__ */ 163#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e3e927..3f90e1ceb8d6 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -335,7 +335,9 @@ xfs_iget_cache_miss(
335 iflags = XFS_INEW; 335 iflags = XFS_INEW;
336 if (flags & XFS_IGET_DONTCACHE) 336 if (flags & XFS_IGET_DONTCACHE)
337 iflags |= XFS_IDONTCACHE; 337 iflags |= XFS_IDONTCACHE;
338 ip->i_udquot = ip->i_gdquot = NULL; 338 ip->i_udquot = NULL;
339 ip->i_gdquot = NULL;
340 ip->i_pdquot = NULL;
339 xfs_iflags_set(ip, iflags); 341 xfs_iflags_set(ip, iflags);
340 342
341 /* insert the new inode */ 343 /* insert the new inode */
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index e0f138c70a2f..a01afbb3909a 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -40,7 +40,6 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); 40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
41void xfs_eofblocks_worker(struct work_struct *); 41void xfs_eofblocks_worker(struct work_struct *);
42 42
43int xfs_sync_inode_grab(struct xfs_inode *ip);
44int xfs_inode_ag_iterator(struct xfs_mount *mp, 43int xfs_inode_ag_iterator(struct xfs_mount *mp,
45 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, 44 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
46 int flags, void *args), 45 int flags, void *args),
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
new file mode 100644
index 000000000000..7716a4e7375e
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.c
@@ -0,0 +1,195 @@
1/*
2 * Copyright (c) 2008-2010, 2013 Dave Chinner
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_buf_item.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_mount.h"
30#include "xfs_trans_priv.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h"
36#include "xfs_inode.h"
37#include "xfs_inode_item.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h"
40#include "xfs_error.h"
41#include "xfs_icreate_item.h"
42
43kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
44
45static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
46{
47 return container_of(lip, struct xfs_icreate_item, ic_item);
48}
49
50/*
 51 * This returns the number of iovecs needed to log the given inode create item.
52 *
53 * We only need one iovec for the icreate log structure.
54 */
55STATIC uint
56xfs_icreate_item_size(
57 struct xfs_log_item *lip)
58{
59 return 1;
60}
61
62/*
63 * This is called to fill in the vector of log iovecs for the
64 * given inode create log item.
65 */
66STATIC void
67xfs_icreate_item_format(
68 struct xfs_log_item *lip,
69 struct xfs_log_iovec *log_vector)
70{
71 struct xfs_icreate_item *icp = ICR_ITEM(lip);
72
73 log_vector->i_addr = (xfs_caddr_t)&icp->ic_format;
74 log_vector->i_len = sizeof(struct xfs_icreate_log);
75 log_vector->i_type = XLOG_REG_TYPE_ICREATE;
76}
77
78
79/* Pinning has no meaning for the create item, so just return. */
80STATIC void
81xfs_icreate_item_pin(
82 struct xfs_log_item *lip)
83{
84}
85
86
 87/* Unpinning has no meaning for the create item, so just return. */
88STATIC void
89xfs_icreate_item_unpin(
90 struct xfs_log_item *lip,
91 int remove)
92{
93}
94
95STATIC void
96xfs_icreate_item_unlock(
97 struct xfs_log_item *lip)
98{
99 struct xfs_icreate_item *icp = ICR_ITEM(lip);
100
101 if (icp->ic_item.li_flags & XFS_LI_ABORTED)
102 kmem_zone_free(xfs_icreate_zone, icp);
103 return;
104}
105
106/*
107 * Because we have ordered buffers being tracked in the AIL for the inode
108 * creation, we don't need the create item after this. Hence we can free
109 * the log item and return -1 to tell the caller we're done with the item.
110 */
111STATIC xfs_lsn_t
112xfs_icreate_item_committed(
113 struct xfs_log_item *lip,
114 xfs_lsn_t lsn)
115{
116 struct xfs_icreate_item *icp = ICR_ITEM(lip);
117
118 kmem_zone_free(xfs_icreate_zone, icp);
119 return (xfs_lsn_t)-1;
120}
121
122/* item can never get into the AIL */
123STATIC uint
124xfs_icreate_item_push(
125 struct xfs_log_item *lip,
126 struct list_head *buffer_list)
127{
128 ASSERT(0);
129 return XFS_ITEM_SUCCESS;
130}
131
132/* Ordered buffers do the dependency tracking here, so this does nothing. */
133STATIC void
134xfs_icreate_item_committing(
135 struct xfs_log_item *lip,
136 xfs_lsn_t lsn)
137{
138}
139
140/*
 141 * This is the ops vector shared by all icreate log items.
142 */
143static struct xfs_item_ops xfs_icreate_item_ops = {
144 .iop_size = xfs_icreate_item_size,
145 .iop_format = xfs_icreate_item_format,
146 .iop_pin = xfs_icreate_item_pin,
147 .iop_unpin = xfs_icreate_item_unpin,
148 .iop_push = xfs_icreate_item_push,
149 .iop_unlock = xfs_icreate_item_unlock,
150 .iop_committed = xfs_icreate_item_committed,
151 .iop_committing = xfs_icreate_item_committing,
152};
153
154
155/*
 156 * Initialize the inode create log item for a newly allocated inode chunk.
157 *
158 * Inode extents can only reside within an AG. Hence specify the starting
159 * block for the inode chunk by offset within an AG as well as the
160 * length of the allocated extent.
161 *
162 * This joins the item to the transaction and marks it dirty so
163 * that we don't need a separate call to do this, nor does the
164 * caller need to know anything about the icreate item.
165 */
166void
167xfs_icreate_log(
168 struct xfs_trans *tp,
169 xfs_agnumber_t agno,
170 xfs_agblock_t agbno,
171 unsigned int count,
172 unsigned int inode_size,
173 xfs_agblock_t length,
174 unsigned int generation)
175{
176 struct xfs_icreate_item *icp;
177
178 icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP);
179
180 xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE,
181 &xfs_icreate_item_ops);
182
183 icp->ic_format.icl_type = XFS_LI_ICREATE;
184 icp->ic_format.icl_size = 1; /* single vector */
185 icp->ic_format.icl_ag = cpu_to_be32(agno);
186 icp->ic_format.icl_agbno = cpu_to_be32(agbno);
187 icp->ic_format.icl_count = cpu_to_be32(count);
188 icp->ic_format.icl_isize = cpu_to_be32(inode_size);
189 icp->ic_format.icl_length = cpu_to_be32(length);
190 icp->ic_format.icl_gen = cpu_to_be32(generation);
191
192 xfs_trans_add_item(tp, &icp->ic_item);
193 tp->t_flags |= XFS_TRANS_DIRTY;
194 icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
195}
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
new file mode 100644
index 000000000000..88ba8aa0bc41
--- /dev/null
+++ b/fs/xfs/xfs_icreate_item.h
@@ -0,0 +1,52 @@
1/*
2 * Copyright (c) 2008-2010, Dave Chinner
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef XFS_ICREATE_ITEM_H
19#define XFS_ICREATE_ITEM_H 1
20
21/*
22 * on disk log item structure
23 *
 24 * Log recovery assumes the first two entries are the type and size and that
 25 * together they fit in 32 bits. They are also in host order (ugh), so they
 26 * have to be 32-bit aligned for decoding to be done correctly.
27 */
28struct xfs_icreate_log {
29 __uint16_t icl_type; /* type of log format structure */
30 __uint16_t icl_size; /* size of log format structure */
31 __be32 icl_ag; /* ag being allocated in */
32 __be32 icl_agbno; /* start block of inode range */
33 __be32 icl_count; /* number of inodes to initialise */
34 __be32 icl_isize; /* size of inodes */
35 __be32 icl_length; /* length of extent to initialise */
36 __be32 icl_gen; /* inode generation number to use */
37};
38
39/* in memory log item structure */
40struct xfs_icreate_item {
41 struct xfs_log_item ic_item;
42 struct xfs_icreate_log ic_format;
43};
44
45extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
46
47void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno,
48 xfs_agblock_t agbno, unsigned int count,
49 unsigned int inode_size, xfs_agblock_t length,
50 unsigned int generation);
51
52#endif /* XFS_ICREATE_ITEM_H */
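Per the layout comment above, the record begins with two host-order 16-bit fields (type and size) followed by six big-endian 32-bit fields. A user-space sketch of a decoder under exactly those assumptions, with ntohl() standing in for be32_to_cpu():

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>  /* ntohl() */

struct icreate_rec {
        uint16_t type, size;                            /* host order */
        uint32_t ag, agbno, count, isize, length, gen;  /* decoded */
};

static void icreate_decode(const void *buf, struct icreate_rec *r)
{
        uint16_t u16[2];
        uint32_t be[6];

        memcpy(u16, buf, sizeof(u16));                  /* host order */
        memcpy(be, (const char *)buf + 4, sizeof(be));  /* big endian */

        r->type = u16[0];
        r->size = u16[1];
        r->ag = ntohl(be[0]);
        r->agbno = ntohl(be[1]);
        r->count = ntohl(be[2]);
        r->isize = ntohl(be[3]);
        r->length = ntohl(be[4]);
        r->gen = ntohl(be[5]);
}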
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7f7be5f98f52..bb262c25c8de 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -896,7 +896,6 @@ xfs_dinode_to_disk(
896 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 896 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
897 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 897 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
898 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); 898 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
899 to->di_flushiter = cpu_to_be16(from->di_flushiter);
900 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); 899 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
901 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); 900 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
902 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); 901 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
@@ -924,6 +923,9 @@ xfs_dinode_to_disk(
924 to->di_lsn = cpu_to_be64(from->di_lsn); 923 to->di_lsn = cpu_to_be64(from->di_lsn);
925 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); 924 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
926 uuid_copy(&to->di_uuid, &from->di_uuid); 925 uuid_copy(&to->di_uuid, &from->di_uuid);
926 to->di_flushiter = 0;
927 } else {
928 to->di_flushiter = cpu_to_be16(from->di_flushiter);
927 } 929 }
928} 930}
929 931
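di_flushiter is now written out only for v1/v2 inodes and zeroed for v3 inodes, where the field is just padding under the CRC. The version split, sketched in isolation with htons() standing in for cpu_to_be16() and illustrative structures:

#include <stdint.h>
#include <arpa/inet.h>  /* htons() */

struct incore_inode { int version; uint16_t flushiter; };
struct ondisk_inode { uint16_t flushiter_be; };

static void flushiter_to_disk(const struct incore_inode *from,
                              struct ondisk_inode *to)
{
        if (from->version >= 3)
                to->flushiter_be = 0;   /* zeroed padding on v3 */
        else
                to->flushiter_be = htons(from->flushiter);
}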
@@ -1028,6 +1030,15 @@ xfs_dinode_calc_crc(
1028 1030
1029/* 1031/*
1030 * Read the disk inode attributes into the in-core inode structure. 1032 * Read the disk inode attributes into the in-core inode structure.
1033 *
1034 * For version 5 superblocks, if we are initialising a new inode and we are not
 1035 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simply build the new
1036 * inode core with a random generation number. If we are keeping inodes around,
1037 * we need to read the inode cluster to get the existing generation number off
1038 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
1039 * format) then log recovery is dependent on the di_flushiter field being
1040 * initialised from the current on-disk value and hence we must also read the
1041 * inode off disk.
1031 */ 1042 */
1032int 1043int
1033xfs_iread( 1044xfs_iread(
@@ -1047,6 +1058,23 @@ xfs_iread(
1047 if (error) 1058 if (error)
1048 return error; 1059 return error;
1049 1060
1061 /* shortcut IO on inode allocation if possible */
1062 if ((iget_flags & XFS_IGET_CREATE) &&
1063 xfs_sb_version_hascrc(&mp->m_sb) &&
1064 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
1065 /* initialise the on-disk inode core */
1066 memset(&ip->i_d, 0, sizeof(ip->i_d));
1067 ip->i_d.di_magic = XFS_DINODE_MAGIC;
1068 ip->i_d.di_gen = prandom_u32();
1069 if (xfs_sb_version_hascrc(&mp->m_sb)) {
1070 ip->i_d.di_version = 3;
1071 ip->i_d.di_ino = ip->i_ino;
1072 uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
1073 } else
1074 ip->i_d.di_version = 2;
1075 return 0;
1076 }
1077
1050 /* 1078 /*
1051 * Get pointers to the on-disk inode and the buffer containing it. 1079 * Get pointers to the on-disk inode and the buffer containing it.
1052 */ 1080 */
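The shortcut above skips the disk read entirely when a brand-new inode is created on a CRC-enabled filesystem without XFS_MOUNT_IKEEP, since no old generation number needs preserving. The decision, sketched with simplified, illustrative fields:

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct icore { unsigned int magic, gen, version; };

/* Returns true when the core was built in memory and no read is needed. */
static bool iread_shortcut(struct icore *core, bool creating,
                           bool has_crc, bool ikeep)
{
        if (!creating || !has_crc || ikeep)
                return false;           /* must read the cluster off disk */

        memset(core, 0, sizeof(*core));
        core->magic = 0x494e;           /* XFS_DINODE_MAGIC ("IN") */
        core->gen = (unsigned int)rand();       /* as prandom_u32() */
        core->version = 3;              /* hascrc implies v3 inodes */
        return true;
}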
@@ -1133,17 +1161,16 @@ xfs_iread(
1133 xfs_buf_set_ref(bp, XFS_INO_REF); 1161 xfs_buf_set_ref(bp, XFS_INO_REF);
1134 1162
1135 /* 1163 /*
1136 * Use xfs_trans_brelse() to release the buffer containing the 1164 * Use xfs_trans_brelse() to release the buffer containing the on-disk
1137 * on-disk inode, because it was acquired with xfs_trans_read_buf() 1165 * inode, because it was acquired with xfs_trans_read_buf() in
1138 * in xfs_imap_to_bp() above. If tp is NULL, this is just a normal 1166 * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
1139 * brelse(). If we're within a transaction, then xfs_trans_brelse() 1167 * brelse(). If we're within a transaction, then xfs_trans_brelse()
1140 * will only release the buffer if it is not dirty within the 1168 * will only release the buffer if it is not dirty within the
1141 * transaction. It will be OK to release the buffer in this case, 1169 * transaction. It will be OK to release the buffer in this case,
1142 * because inodes on disk are never destroyed and we will be 1170 * because inodes on disk are never destroyed and we will be locking the
1143 * locking the new in-core inode before putting it in the hash 1171 * new in-core inode before putting it in the cache where other
1144 * table where other processes can find it. Thus we don't have 1172 * processes can find it. Thus we don't have to worry about the inode
1145 * to worry about the inode being changed just because we released 1173 * being changed just because we released the buffer.
1146 * the buffer.
1147 */ 1174 */
1148 out_brelse: 1175 out_brelse:
1149 xfs_trans_brelse(tp, bp); 1176 xfs_trans_brelse(tp, bp);
@@ -2028,8 +2055,6 @@ xfs_ifree(
2028 int error; 2055 int error;
2029 int delete; 2056 int delete;
2030 xfs_ino_t first_ino; 2057 xfs_ino_t first_ino;
2031 xfs_dinode_t *dip;
2032 xfs_buf_t *ibp;
2033 2058
2034 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 2059 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2035 ASSERT(ip->i_d.di_nlink == 0); 2060 ASSERT(ip->i_d.di_nlink == 0);
@@ -2042,14 +2067,13 @@ xfs_ifree(
2042 * Pull the on-disk inode from the AGI unlinked list. 2067 * Pull the on-disk inode from the AGI unlinked list.
2043 */ 2068 */
2044 error = xfs_iunlink_remove(tp, ip); 2069 error = xfs_iunlink_remove(tp, ip);
2045 if (error != 0) { 2070 if (error)
2046 return error; 2071 return error;
2047 }
2048 2072
2049 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); 2073 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
2050 if (error != 0) { 2074 if (error)
2051 return error; 2075 return error;
2052 } 2076
2053 ip->i_d.di_mode = 0; /* mark incore inode as free */ 2077 ip->i_d.di_mode = 0; /* mark incore inode as free */
2054 ip->i_d.di_flags = 0; 2078 ip->i_d.di_flags = 0;
2055 ip->i_d.di_dmevmask = 0; 2079 ip->i_d.di_dmevmask = 0;
@@ -2061,31 +2085,10 @@ xfs_ifree(
2061 * by reincarnations of this inode. 2085 * by reincarnations of this inode.
2062 */ 2086 */
2063 ip->i_d.di_gen++; 2087 ip->i_d.di_gen++;
2064
2065 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2088 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2066 2089
2067 error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp, 2090 if (delete)
2068 0, 0);
2069 if (error)
2070 return error;
2071
2072 /*
2073 * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
2074 * from picking up this inode when it is reclaimed (its incore state
2075 * initialzed but not flushed to disk yet). The in-core di_mode is
2076 * already cleared and a corresponding transaction logged.
2077 * The hack here just synchronizes the in-core to on-disk
2078 * di_mode value in advance before the actual inode sync to disk.
2079 * This is OK because the inode is already unlinked and would never
2080 * change its di_mode again for this inode generation.
2081 * This is a temporary hack that would require a proper fix
2082 * in the future.
2083 */
2084 dip->di_mode = 0;
2085
2086 if (delete) {
2087 error = xfs_ifree_cluster(ip, tp, first_ino); 2091 error = xfs_ifree_cluster(ip, tp, first_ino);
2088 }
2089 2092
2090 return error; 2093 return error;
2091} 2094}
@@ -2160,8 +2163,8 @@ xfs_iroot_realloc(
2160 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, 2163 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2161 (int)new_size); 2164 (int)new_size);
2162 ifp->if_broot_bytes = (int)new_size; 2165 ifp->if_broot_bytes = (int)new_size;
2163 ASSERT(ifp->if_broot_bytes <= 2166 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2164 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); 2167 XFS_IFORK_SIZE(ip, whichfork));
2165 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); 2168 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2166 return; 2169 return;
2167 } 2170 }
@@ -2214,8 +2217,9 @@ xfs_iroot_realloc(
2214 kmem_free(ifp->if_broot); 2217 kmem_free(ifp->if_broot);
2215 ifp->if_broot = new_broot; 2218 ifp->if_broot = new_broot;
2216 ifp->if_broot_bytes = (int)new_size; 2219 ifp->if_broot_bytes = (int)new_size;
2217 ASSERT(ifp->if_broot_bytes <= 2220 if (ifp->if_broot)
2218 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ(ip)); 2221 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2222 XFS_IFORK_SIZE(ip, whichfork));
2219 return; 2223 return;
2220} 2224}
2221 2225
@@ -2526,9 +2530,8 @@ xfs_iflush_fork(
2526 if ((iip->ili_fields & brootflag[whichfork]) && 2530 if ((iip->ili_fields & brootflag[whichfork]) &&
2527 (ifp->if_broot_bytes > 0)) { 2531 (ifp->if_broot_bytes > 0)) {
2528 ASSERT(ifp->if_broot != NULL); 2532 ASSERT(ifp->if_broot != NULL);
2529 ASSERT(ifp->if_broot_bytes <= 2533 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2530 (XFS_IFORK_SIZE(ip, whichfork) + 2534 XFS_IFORK_SIZE(ip, whichfork));
2531 XFS_BROOT_SIZE_ADJ(ip)));
2532 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, 2535 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2533 (xfs_bmdr_block_t *)cp, 2536 (xfs_bmdr_block_t *)cp,
2534 XFS_DFORK_SIZE(dip, mp, whichfork)); 2537 XFS_DFORK_SIZE(dip, mp, whichfork));
@@ -2886,12 +2889,18 @@ xfs_iflush_int(
2886 __func__, ip->i_ino, ip->i_d.di_forkoff, ip); 2889 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2887 goto corrupt_out; 2890 goto corrupt_out;
2888 } 2891 }
2892
2889 /* 2893 /*
2890 * bump the flush iteration count, used to detect flushes which 2894 * Inode item log recovery for v1/v2 inodes are dependent on the
2891 * postdate a log record during recovery. This is redundant as we now 2895 * di_flushiter count for correct sequencing. We bump the flush
2892 * log every change and hence this can't happen. Still, it doesn't hurt. 2896 * iteration count so we can detect flushes which postdate a log record
2897 * during recovery. This is redundant as we now log every change and
2898 * hence this can't happen but we need to still do it to ensure
2899 * backwards compatibility with old kernels that predate logging all
2900 * inode changes.
2893 */ 2901 */
2894 ip->i_d.di_flushiter++; 2902 if (ip->i_d.di_version < 3)
2903 ip->i_d.di_flushiter++;
2895 2904
2896 /* 2905 /*
2897 * Copy the dirty parts of the inode into the on-disk 2906 * Copy the dirty parts of the inode into the on-disk
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 91129794aaec..b55fd347ab5b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -250,6 +250,7 @@ typedef struct xfs_inode {
250 struct xfs_mount *i_mount; /* fs mount struct ptr */ 250 struct xfs_mount *i_mount; /* fs mount struct ptr */
251 struct xfs_dquot *i_udquot; /* user dquot */ 251 struct xfs_dquot *i_udquot; /* user dquot */
252 struct xfs_dquot *i_gdquot; /* group dquot */ 252 struct xfs_dquot *i_gdquot; /* group dquot */
253 struct xfs_dquot *i_pdquot; /* project dquot */
253 254
254 /* Inode location stuff */ 255 /* Inode location stuff */
255 xfs_ino_t i_ino; /* inode number (agno/agino)*/ 256 xfs_ino_t i_ino; /* inode number (agno/agino)*/
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5e999680094a..6e2bca5d44d6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -248,7 +248,7 @@ xfs_open_by_handle(
248 goto out_dput; 248 goto out_dput;
249 } 249 }
250 250
251 fd = get_unused_fd(); 251 fd = get_unused_fd_flags(0);
252 if (fd < 0) { 252 if (fd < 0) {
253 error = fd; 253 error = fd;
254 goto out_dput; 254 goto out_dput;
@@ -928,7 +928,7 @@ xfs_ioctl_setattr(
928 struct xfs_trans *tp; 928 struct xfs_trans *tp;
929 unsigned int lock_flags = 0; 929 unsigned int lock_flags = 0;
930 struct xfs_dquot *udqp = NULL; 930 struct xfs_dquot *udqp = NULL;
931 struct xfs_dquot *gdqp = NULL; 931 struct xfs_dquot *pdqp = NULL;
932 struct xfs_dquot *olddquot = NULL; 932 struct xfs_dquot *olddquot = NULL;
933 int code; 933 int code;
934 934
@@ -957,7 +957,7 @@ xfs_ioctl_setattr(
957 if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { 957 if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
958 code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, 958 code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
959 ip->i_d.di_gid, fa->fsx_projid, 959 ip->i_d.di_gid, fa->fsx_projid,
960 XFS_QMOPT_PQUOTA, &udqp, &gdqp); 960 XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
961 if (code) 961 if (code)
962 return code; 962 return code;
963 } 963 }
@@ -994,8 +994,8 @@ xfs_ioctl_setattr(
994 XFS_IS_PQUOTA_ON(mp) && 994 XFS_IS_PQUOTA_ON(mp) &&
995 xfs_get_projid(ip) != fa->fsx_projid) { 995 xfs_get_projid(ip) != fa->fsx_projid) {
996 ASSERT(tp); 996 ASSERT(tp);
997 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 997 code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL,
998 capable(CAP_FOWNER) ? 998 pdqp, capable(CAP_FOWNER) ?
999 XFS_QMOPT_FORCE_RES : 0); 999 XFS_QMOPT_FORCE_RES : 0);
1000 if (code) /* out of quota */ 1000 if (code) /* out of quota */
1001 goto error_return; 1001 goto error_return;
@@ -1113,7 +1113,7 @@ xfs_ioctl_setattr(
1113 if (xfs_get_projid(ip) != fa->fsx_projid) { 1113 if (xfs_get_projid(ip) != fa->fsx_projid) {
1114 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { 1114 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
1115 olddquot = xfs_qm_vop_chown(tp, ip, 1115 olddquot = xfs_qm_vop_chown(tp, ip,
1116 &ip->i_gdquot, gdqp); 1116 &ip->i_pdquot, pdqp);
1117 } 1117 }
1118 xfs_set_projid(ip, fa->fsx_projid); 1118 xfs_set_projid(ip, fa->fsx_projid);
1119 1119
@@ -1160,13 +1160,13 @@ xfs_ioctl_setattr(
1160 */ 1160 */
1161 xfs_qm_dqrele(olddquot); 1161 xfs_qm_dqrele(olddquot);
1162 xfs_qm_dqrele(udqp); 1162 xfs_qm_dqrele(udqp);
1163 xfs_qm_dqrele(gdqp); 1163 xfs_qm_dqrele(pdqp);
1164 1164
1165 return code; 1165 return code;
1166 1166
1167 error_return: 1167 error_return:
1168 xfs_qm_dqrele(udqp); 1168 xfs_qm_dqrele(udqp);
1169 xfs_qm_dqrele(gdqp); 1169 xfs_qm_dqrele(pdqp);
1170 xfs_trans_cancel(tp, 0); 1170 xfs_trans_cancel(tp, 0);
1171 if (lock_flags) 1171 if (lock_flags)
1172 xfs_iunlock(ip, lock_flags); 1172 xfs_iunlock(ip, lock_flags);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8f8aaee7f379..6a7096422295 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -284,6 +284,15 @@ xfs_iomap_eof_want_preallocate(
284 return 0; 284 return 0;
285 285
286 /* 286 /*
287 * If the file is smaller than the minimum prealloc and we are using
288 * dynamic preallocation, don't do any preallocation at all as it is
289 * likely this is the only write to the file that is going to be done.
290 */
291 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
292 XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
293 return 0;
294
295 /*
287 * If there are any real blocks past eof, then don't 296 * If there are any real blocks past eof, then don't
288 * do any speculative allocation. 297 * do any speculative allocation.
289 */ 298 */
@@ -345,6 +354,10 @@ xfs_iomap_eof_prealloc_initial_size(
345 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) 354 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
346 return 0; 355 return 0;
347 356
357 /* If the file is small, then use the minimum prealloc */
358 if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
359 return 0;
360
348 /* 361 /*
349 * As we write multiple pages, the offset will always align to the 362 * As we write multiple pages, the offset will always align to the
350 * start of a page and hence point to a hole at EOF. i.e. if the size is 363 * start of a page and hence point to a hole at EOF. i.e. if the size is
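Both iomap hunks above add the same early-out: while a file is still smaller than the minimum preallocation size and no fixed allocsize was given at mount, speculative EOF preallocation is skipped entirely. A hedged sketch of that policy check (the flag value and all names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdint.h>

#define MOUNT_DFLT_IOSIZE (1u << 0)  /* admin pinned the prealloc size */

/*
 * With dynamic preallocation, a file below the minimum write size is
 * probably getting a single small write; don't preallocate for it.
 */
static bool want_eof_prealloc(uint32_t mount_flags, uint64_t isize,
			      uint64_t min_write_bytes)
{
	if (!(mount_flags & MOUNT_DFLT_IOSIZE) && isize < min_write_bytes)
		return false;
	return true;
}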
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ca9ecaa81112..96dda62d497b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -467,9 +467,6 @@ xfs_setattr_mode(
467 ASSERT(tp); 467 ASSERT(tp);
468 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 468 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
469 469
470 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
471 mode &= ~S_ISGID;
472
473 ip->i_d.di_mode &= S_IFMT; 470 ip->i_d.di_mode &= S_IFMT;
474 ip->i_d.di_mode |= mode & ~S_IFMT; 471 ip->i_d.di_mode |= mode & ~S_IFMT;
475 472
@@ -495,15 +492,18 @@ xfs_setattr_nonsize(
495 492
496 trace_xfs_setattr(ip); 493 trace_xfs_setattr(ip);
497 494
498 	if (mp->m_flags & XFS_MOUNT_RDONLY)	495 	/* If ACLs are being inherited, we already have this checked */
499 return XFS_ERROR(EROFS); 496 if (!(flags & XFS_ATTR_NOACL)) {
497 if (mp->m_flags & XFS_MOUNT_RDONLY)
498 return XFS_ERROR(EROFS);
500 499
501 if (XFS_FORCED_SHUTDOWN(mp)) 500 if (XFS_FORCED_SHUTDOWN(mp))
502 return XFS_ERROR(EIO); 501 return XFS_ERROR(EIO);
503 502
504 error = -inode_change_ok(inode, iattr); 503 error = -inode_change_ok(inode, iattr);
505 if (error) 504 if (error)
506 return XFS_ERROR(error); 505 return XFS_ERROR(error);
506 }
507 507
508 ASSERT((mask & ATTR_SIZE) == 0); 508 ASSERT((mask & ATTR_SIZE) == 0);
509 509
@@ -539,7 +539,7 @@ xfs_setattr_nonsize(
539 ASSERT(udqp == NULL); 539 ASSERT(udqp == NULL);
540 ASSERT(gdqp == NULL); 540 ASSERT(gdqp == NULL);
541 error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), 541 error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
542 qflags, &udqp, &gdqp); 542 qflags, &udqp, &gdqp, NULL);
543 if (error) 543 if (error)
544 return error; 544 return error;
545 } 545 }
@@ -575,7 +575,7 @@ xfs_setattr_nonsize(
575 (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { 575 (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
576 ASSERT(tp); 576 ASSERT(tp);
577 error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 577 error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
578 capable(CAP_FOWNER) ? 578 NULL, capable(CAP_FOWNER) ?
579 XFS_QMOPT_FORCE_RES : 0); 579 XFS_QMOPT_FORCE_RES : 0);
580 if (error) /* out of quota */ 580 if (error) /* out of quota */
581 goto out_trans_cancel; 581 goto out_trans_cancel;
@@ -987,7 +987,8 @@ xfs_fiemap_format(
987 if (bmv->bmv_oflags & BMV_OF_PREALLOC) 987 if (bmv->bmv_oflags & BMV_OF_PREALLOC)
988 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; 988 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
989 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { 989 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
990 fiemap_flags |= FIEMAP_EXTENT_DELALLOC; 990 fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
991 FIEMAP_EXTENT_UNKNOWN);
991 physical = 0; /* no block yet */ 992 physical = 0; /* no block yet */
992 } 993 }
993 if (bmv->bmv_oflags & BMV_OF_LAST) 994 if (bmv->bmv_oflags & BMV_OF_LAST)
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2ea7d402188d..b93e14b86754 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -43,7 +43,7 @@ xfs_internal_inum(
43{ 43{
44 return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || 44 return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
45 (xfs_sb_version_hasquota(&mp->m_sb) && 45 (xfs_sb_version_hasquota(&mp->m_sb) &&
46 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); 46 xfs_is_quota_inode(&mp->m_sb, ino)));
47} 47}
48 48
49/* 49/*
@@ -221,7 +221,6 @@ xfs_bulkstat(
221 char __user *ubufp; /* pointer into user's buffer */ 221 char __user *ubufp; /* pointer into user's buffer */
222 int ubelem; /* spaces used in user's buffer */ 222 int ubelem; /* spaces used in user's buffer */
223 int ubused; /* bytes used by formatter */ 223 int ubused; /* bytes used by formatter */
224 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
225 224
226 /* 225 /*
227 * Get the last inode value, see if there's nothing to do. 226 * Get the last inode value, see if there's nothing to do.
@@ -263,7 +262,6 @@ xfs_bulkstat(
263 rval = 0; 262 rval = 0;
264 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { 263 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
265 cond_resched(); 264 cond_resched();
266 bp = NULL;
267 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 265 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
268 if (error) { 266 if (error) {
269 /* 267 /*
@@ -383,11 +381,13 @@ xfs_bulkstat(
383 * Also start read-ahead now for this chunk. 381 * Also start read-ahead now for this chunk.
384 */ 382 */
385 if (r.ir_freecount < XFS_INODES_PER_CHUNK) { 383 if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
384 struct blk_plug plug;
386 /* 385 /*
387 * Loop over all clusters in the next chunk. 386 * Loop over all clusters in the next chunk.
388 * Do a readahead if there are any allocated 387 * Do a readahead if there are any allocated
389 * inodes in that cluster. 388 * inodes in that cluster.
390 */ 389 */
390 blk_start_plug(&plug);
391 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); 391 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
392 for (chunkidx = 0; 392 for (chunkidx = 0;
393 chunkidx < XFS_INODES_PER_CHUNK; 393 chunkidx < XFS_INODES_PER_CHUNK;
@@ -399,6 +399,7 @@ xfs_bulkstat(
399 agbno, nbcluster, 399 agbno, nbcluster,
400 &xfs_inode_buf_ops); 400 &xfs_inode_buf_ops);
401 } 401 }
402 blk_finish_plug(&plug);
402 irbp->ir_startino = r.ir_startino; 403 irbp->ir_startino = r.ir_startino;
403 irbp->ir_freecount = r.ir_freecount; 404 irbp->ir_freecount = r.ir_freecount;
404 irbp->ir_free = r.ir_free; 405 irbp->ir_free = r.ir_free;
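blk_start_plug()/blk_finish_plug() bracket the readahead loop so the per-cluster submissions queue up on the task's plug and reach the block layer as merged, larger requests. A userspace analogue of the batch-then-flush shape (all names here are hypothetical; only the two plug calls named in the comments are the real kernel API):

#include <stddef.h>
#include <stdio.h>

struct batch { size_t queued; };

static void batch_start(struct batch *b)        { b->queued = 0; }
static void batch_add(struct batch *b, int bno) { b->queued++; (void)bno; }
static void batch_flush(struct batch *b)
{
	/* one submission for everything queued, like blk_finish_plug() */
	printf("submitting %zu readaheads in one go\n", b->queued);
	b->queued = 0;
}

int main(void)
{
	struct batch b;

	batch_start(&b);                 /* blk_start_plug(&plug);  */
	for (int bno = 0; bno < 8; bno++)
		batch_add(&b, bno);      /* per-cluster readahead   */
	batch_flush(&b);                 /* blk_finish_plug(&plug); */
	return 0;
}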
@@ -433,27 +434,7 @@ xfs_bulkstat(
433 irbp->ir_freecount < XFS_INODES_PER_CHUNK; 434 irbp->ir_freecount < XFS_INODES_PER_CHUNK;
434 chunkidx++, clustidx++, agino++) { 435 chunkidx++, clustidx++, agino++) {
435 ASSERT(chunkidx < XFS_INODES_PER_CHUNK); 436 ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
436 /* 437
437 * Recompute agbno if this is the
438 * first inode of the cluster.
439 *
440 * Careful with clustidx. There can be
441 * multiple clusters per chunk, a single
442 * cluster per chunk or a cluster that has
443 * inodes represented from several different
444 * chunks (if blocksize is large).
445 *
446 * Because of this, the starting clustidx is
447 * initialized to zero in this loop but must
448 * later be reset after reading in the cluster
449 * buffer.
450 */
451 if ((chunkidx & (nicluster - 1)) == 0) {
452 agbno = XFS_AGINO_TO_AGBNO(mp,
453 irbp->ir_startino) +
454 ((chunkidx & nimask) >>
455 mp->m_sb.sb_inopblog);
456 }
457 ino = XFS_AGINO_TO_INO(mp, agno, agino); 438 ino = XFS_AGINO_TO_INO(mp, agno, agino);
458 /* 439 /*
459 * Skip if this inode is free. 440 * Skip if this inode is free.
@@ -499,10 +480,6 @@ xfs_bulkstat(
499 480
500 cond_resched(); 481 cond_resched();
501 } 482 }
502
503 if (bp)
504 xfs_buf_relse(bp);
505
506 /* 483 /*
507 * Set up for the next loop iteration. 484 * Set up for the next loop iteration.
508 */ 485 */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b345a7c85153..d852a2b3e1fd 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1963,6 +1963,10 @@ xlog_write_calc_vec_length(
1963 headers++; 1963 headers++;
1964 1964
1965 for (lv = log_vector; lv; lv = lv->lv_next) { 1965 for (lv = log_vector; lv; lv = lv->lv_next) {
1966 /* we don't write ordered log vectors */
1967 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
1968 continue;
1969
1966 headers += lv->lv_niovecs; 1970 headers += lv->lv_niovecs;
1967 1971
1968 for (i = 0; i < lv->lv_niovecs; i++) { 1972 for (i = 0; i < lv->lv_niovecs; i++) {
@@ -2216,7 +2220,7 @@ xlog_write(
2216 index = 0; 2220 index = 0;
2217 lv = log_vector; 2221 lv = log_vector;
2218 vecp = lv->lv_iovecp; 2222 vecp = lv->lv_iovecp;
2219 while (lv && index < lv->lv_niovecs) { 2223 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
2220 void *ptr; 2224 void *ptr;
2221 int log_offset; 2225 int log_offset;
2222 2226
@@ -2236,13 +2240,22 @@ xlog_write(
2236 * This loop writes out as many regions as can fit in the amount 2240 * This loop writes out as many regions as can fit in the amount
2237 * of space which was allocated by xlog_state_get_iclog_space(). 2241 * of space which was allocated by xlog_state_get_iclog_space().
2238 */ 2242 */
2239 while (lv && index < lv->lv_niovecs) { 2243 while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
2240 struct xfs_log_iovec *reg = &vecp[index]; 2244 struct xfs_log_iovec *reg;
2241 struct xlog_op_header *ophdr; 2245 struct xlog_op_header *ophdr;
2242 int start_rec_copy; 2246 int start_rec_copy;
2243 int copy_len; 2247 int copy_len;
2244 int copy_off; 2248 int copy_off;
2249 bool ordered = false;
2250
2251 /* ordered log vectors have no regions to write */
2252 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
2253 ASSERT(lv->lv_niovecs == 0);
2254 ordered = true;
2255 goto next_lv;
2256 }
2245 2257
2258 reg = &vecp[index];
2246 ASSERT(reg->i_len % sizeof(__int32_t) == 0); 2259 ASSERT(reg->i_len % sizeof(__int32_t) == 0);
2247 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); 2260 ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0);
2248 2261
@@ -2302,12 +2315,13 @@ xlog_write(
2302 break; 2315 break;
2303 2316
2304 if (++index == lv->lv_niovecs) { 2317 if (++index == lv->lv_niovecs) {
2318next_lv:
2305 lv = lv->lv_next; 2319 lv = lv->lv_next;
2306 index = 0; 2320 index = 0;
2307 if (lv) 2321 if (lv)
2308 vecp = lv->lv_iovecp; 2322 vecp = lv->lv_iovecp;
2309 } 2323 }
2310 if (record_cnt == 0) { 2324 if (record_cnt == 0 && ordered == false) {
2311 if (!lv) 2325 if (!lv)
2312 return 0; 2326 return 0;
2313 break; 2327 break;
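xlog_write() must never feed an ordered vector into the byte accounting: its lv_buf_len holds the XFS_LOG_VEC_ORDERED sentinel (-1) rather than a size, and there are no iovecs to copy. A standalone model of the skip in the header-count loop, with the struct pared down to the two fields the diff uses:

#include <stdio.h>

#define LOG_VEC_ORDERED (-1)	/* sentinel, mirrors XFS_LOG_VEC_ORDERED */

struct log_vec {
	struct log_vec *next;
	int buf_len;		/* LOG_VEC_ORDERED => nothing to write */
	int niovecs;
};

static int count_headers(const struct log_vec *head)
{
	int headers = 0;

	for (const struct log_vec *lv = head; lv; lv = lv->next) {
		if (lv->buf_len == LOG_VEC_ORDERED)
			continue;	/* tracked in the CIL, never written */
		headers += lv->niovecs;
	}
	return headers;
}

int main(void)
{
	struct log_vec ordered = { NULL, LOG_VEC_ORDERED, 0 };
	struct log_vec normal  = { &ordered, 128, 3 };

	printf("%d\n", count_headers(&normal));	/* prints 3 */
	return 0;
}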
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 5caee96059df..fb630e496c12 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -88,7 +88,8 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
88#define XLOG_REG_TYPE_UNMOUNT 17 88#define XLOG_REG_TYPE_UNMOUNT 17
89#define XLOG_REG_TYPE_COMMIT 18 89#define XLOG_REG_TYPE_COMMIT 18
90#define XLOG_REG_TYPE_TRANSHDR 19 90#define XLOG_REG_TYPE_TRANSHDR 19
91#define XLOG_REG_TYPE_MAX 19 91#define XLOG_REG_TYPE_ICREATE 20
92#define XLOG_REG_TYPE_MAX 20
92 93
93typedef struct xfs_log_iovec { 94typedef struct xfs_log_iovec {
94 void *i_addr; /* beginning address of region */ 95 void *i_addr; /* beginning address of region */
@@ -105,6 +106,8 @@ struct xfs_log_vec {
105 int lv_buf_len; /* size of formatted buffer */ 106 int lv_buf_len; /* size of formatted buffer */
106}; 107};
107 108
109#define XFS_LOG_VEC_ORDERED (-1)
110
108/* 111/*
109 * Structure used to pass callback function and the function's argument 112 * Structure used to pass callback function and the function's argument
110 * to the log manager. 113 * to the log manager.
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index d0833b54e55d..02b9cf3f8252 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -127,6 +127,7 @@ xlog_cil_prepare_log_vecs(
127 int index; 127 int index;
128 int len = 0; 128 int len = 0;
129 uint niovecs; 129 uint niovecs;
130 bool ordered = false;
130 131
131 /* Skip items which aren't dirty in this transaction. */ 132 /* Skip items which aren't dirty in this transaction. */
132 if (!(lidp->lid_flags & XFS_LID_DIRTY)) 133 if (!(lidp->lid_flags & XFS_LID_DIRTY))
@@ -137,14 +138,30 @@ xlog_cil_prepare_log_vecs(
137 if (!niovecs) 138 if (!niovecs)
138 continue; 139 continue;
139 140
141 /*
142 * Ordered items need to be tracked but we do not wish to write
143 * them. We need a logvec to track the object, but we do not
144 * need an iovec or buffer to be allocated for copying data.
145 */
146 if (niovecs == XFS_LOG_VEC_ORDERED) {
147 ordered = true;
148 niovecs = 0;
149 }
150
140 new_lv = kmem_zalloc(sizeof(*new_lv) + 151 new_lv = kmem_zalloc(sizeof(*new_lv) +
141 niovecs * sizeof(struct xfs_log_iovec), 152 niovecs * sizeof(struct xfs_log_iovec),
142 KM_SLEEP|KM_NOFS); 153 KM_SLEEP|KM_NOFS);
143 154
155 new_lv->lv_item = lidp->lid_item;
156 new_lv->lv_niovecs = niovecs;
157 if (ordered) {
158 /* track as an ordered logvec */
159 new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
160 goto next;
161 }
162
144 /* The allocated iovec region lies beyond the log vector. */ 163 /* The allocated iovec region lies beyond the log vector. */
145 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; 164 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
146 new_lv->lv_niovecs = niovecs;
147 new_lv->lv_item = lidp->lid_item;
148 165
149 	/* build the vector array and calculate its length */	166
150 IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); 167 IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
@@ -165,6 +182,7 @@ xlog_cil_prepare_log_vecs(
165 } 182 }
166 ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); 183 ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);
167 184
185next:
168 if (!ret_lv) 186 if (!ret_lv)
169 ret_lv = new_lv; 187 ret_lv = new_lv;
170 else 188 else
@@ -191,8 +209,18 @@ xfs_cil_prepare_item(
191 209
192 if (old) { 210 if (old) {
193 /* existing lv on log item, space used is a delta */ 211 /* existing lv on log item, space used is a delta */
194 ASSERT(!list_empty(&lv->lv_item->li_cil)); 212 ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) ||
195 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); 213 old->lv_buf_len == XFS_LOG_VEC_ORDERED);
214
215 /*
216 * If the new item is ordered, keep the old one that is already
	217 	 * tracking dirty or ordered regions.
218 */
219 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
220 ASSERT(!lv->lv_buf);
221 kmem_free(lv);
222 return;
223 }
196 224
197 *len += lv->lv_buf_len - old->lv_buf_len; 225 *len += lv->lv_buf_len - old->lv_buf_len;
198 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; 226 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
@@ -201,10 +229,11 @@ xfs_cil_prepare_item(
201 } else { 229 } else {
202 /* new lv, must pin the log item */ 230 /* new lv, must pin the log item */
203 ASSERT(!lv->lv_item->li_lv); 231 ASSERT(!lv->lv_item->li_lv);
204 ASSERT(list_empty(&lv->lv_item->li_cil));
205 232
206 *len += lv->lv_buf_len; 233 if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
207 *diff_iovecs += lv->lv_niovecs; 234 *len += lv->lv_buf_len;
235 *diff_iovecs += lv->lv_niovecs;
236 }
208 IOP_PIN(lv->lv_item); 237 IOP_PIN(lv->lv_item);
209 238
210 } 239 }
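xfs_cil_prepare_item() accounts space as a delta when an item is relogged, and an ordered vector must contribute neither bytes nor iovecs. A simplified, lock-free sketch of just the accounting rule (parameter names are mine, not the kernel's):

#define LOG_VEC_ORDERED (-1)

/* Accumulate the space delta a (re)logged item adds to the checkpoint. */
static void account_item(int new_len, int old_len,
			 int new_iovecs, int old_iovecs,
			 int *len, int *diff_iovecs)
{
	if (new_len == LOG_VEC_ORDERED)
		return;                    /* ordered: no data, no delta */
	*len += new_len - old_len;         /* old_len is 0 for a new item */
	*diff_iovecs += new_iovecs - old_iovecs;
}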
@@ -259,18 +288,24 @@ xlog_cil_insert_items(
259 * We can do this safely because the context can't checkpoint until we 288 * We can do this safely because the context can't checkpoint until we
260 * are done so it doesn't matter exactly how we update the CIL. 289 * are done so it doesn't matter exactly how we update the CIL.
261 */ 290 */
262 for (lv = log_vector; lv; lv = lv->lv_next)
263 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
264
265 /* account for space used by new iovec headers */
266 len += diff_iovecs * sizeof(xlog_op_header_t);
267
268 spin_lock(&cil->xc_cil_lock); 291 spin_lock(&cil->xc_cil_lock);
292 for (lv = log_vector; lv; ) {
293 struct xfs_log_vec *next = lv->lv_next;
269 294
270 /* move the items to the tail of the CIL */ 295 ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil));
271 for (lv = log_vector; lv; lv = lv->lv_next) 296 lv->lv_next = NULL;
297
298 /*
299 * xfs_cil_prepare_item() may free the lv, so move the item on
300 * the CIL first.
301 */
272 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil); 302 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
303 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
304 lv = next;
305 }
273 306
307 /* account for space used by new iovec headers */
308 len += diff_iovecs * sizeof(xlog_op_header_t);
274 ctx->nvecs += diff_iovecs; 309 ctx->nvecs += diff_iovecs;
275 310
276 /* 311 /*
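Because xfs_cil_prepare_item() may now free the vector it is handed (when an ordered vector supersedes an existing one), the insertion loop snapshots lv_next and moves the item onto the CIL before making the call. The underlying safe-walk idiom, standalone:

#include <stdlib.h>

struct node { struct node *next; int ordered; };

/* Stand-in for a callee that may destroy the element it is given. */
static void maybe_free(struct node *n)
{
	if (n->ordered)
		free(n);
}

static void walk(struct node *head)
{
	for (struct node *n = head, *next; n; n = next) {
		next = n->next;    /* grab the link before it can vanish */
		n->next = NULL;    /* detach, then let the callee decide */
		maybe_free(n);     /* non-ordered nodes stay caller-owned */
	}
}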
@@ -381,9 +416,7 @@ xlog_cil_push(
381 struct xfs_cil_ctx *new_ctx; 416 struct xfs_cil_ctx *new_ctx;
382 struct xlog_in_core *commit_iclog; 417 struct xlog_in_core *commit_iclog;
383 struct xlog_ticket *tic; 418 struct xlog_ticket *tic;
384 int num_lv;
385 int num_iovecs; 419 int num_iovecs;
386 int len;
387 int error = 0; 420 int error = 0;
388 struct xfs_trans_header thdr; 421 struct xfs_trans_header thdr;
389 struct xfs_log_iovec lhdr; 422 struct xfs_log_iovec lhdr;
@@ -428,12 +461,9 @@ xlog_cil_push(
428 * side which is currently locked out by the flush lock. 461 * side which is currently locked out by the flush lock.
429 */ 462 */
430 lv = NULL; 463 lv = NULL;
431 num_lv = 0;
432 num_iovecs = 0; 464 num_iovecs = 0;
433 len = 0;
434 while (!list_empty(&cil->xc_cil)) { 465 while (!list_empty(&cil->xc_cil)) {
435 struct xfs_log_item *item; 466 struct xfs_log_item *item;
436 int i;
437 467
438 item = list_first_entry(&cil->xc_cil, 468 item = list_first_entry(&cil->xc_cil,
439 struct xfs_log_item, li_cil); 469 struct xfs_log_item, li_cil);
@@ -444,11 +474,7 @@ xlog_cil_push(
444 lv->lv_next = item->li_lv; 474 lv->lv_next = item->li_lv;
445 lv = item->li_lv; 475 lv = item->li_lv;
446 item->li_lv = NULL; 476 item->li_lv = NULL;
447
448 num_lv++;
449 num_iovecs += lv->lv_niovecs; 477 num_iovecs += lv->lv_niovecs;
450 for (i = 0; i < lv->lv_niovecs; i++)
451 len += lv->lv_iovecp[i].i_len;
452 } 478 }
453 479
454 /* 480 /*
@@ -701,6 +727,7 @@ xfs_log_commit_cil(
701 if (commit_lsn) 727 if (commit_lsn)
702 *commit_lsn = log->l_cilp->xc_ctx->sequence; 728 *commit_lsn = log->l_cilp->xc_ctx->sequence;
703 729
730 /* xlog_cil_insert_items() destroys log_vector list */
704 xlog_cil_insert_items(log, log_vector, tp->t_ticket); 731 xlog_cil_insert_items(log, log_vector, tp->t_ticket);
705 732
706 /* check we didn't blow the reservation */ 733 /* check we didn't blow the reservation */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7cf5e4eafe28..7681b19aa5dc 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -45,6 +45,7 @@
45#include "xfs_cksum.h" 45#include "xfs_cksum.h"
46#include "xfs_trace.h" 46#include "xfs_trace.h"
47#include "xfs_icache.h" 47#include "xfs_icache.h"
48#include "xfs_icreate_item.h"
48 49
49/* Need all the magic numbers and buffer ops structures from these headers */ 50/* Need all the magic numbers and buffer ops structures from these headers */
50#include "xfs_symlink.h" 51#include "xfs_symlink.h"
@@ -1617,7 +1618,10 @@ xlog_recover_add_to_trans(
1617 	 * from the cancelled buffer table. Hence they have to be done last. 	1618 	 *
1618 * 1619 *
1619 * 3. Inode allocation buffers must be replayed before inode items that 1620 * 3. Inode allocation buffers must be replayed before inode items that
1620 * read the buffer and replay changes into it. 1621 * read the buffer and replay changes into it. For filesystems using the
1622 * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1623 * treated the same as inode allocation buffers as they create and
1624 * initialise the buffers directly.
1621 * 1625 *
1622 * 4. Inode unlink buffers must be replayed after inode items are replayed. 1626 * 4. Inode unlink buffers must be replayed after inode items are replayed.
1623 * This ensures that inodes are completely flushed to the inode buffer 1627 * This ensures that inodes are completely flushed to the inode buffer
@@ -1632,10 +1636,17 @@ xlog_recover_add_to_trans(
1632 * from all the other buffers and move them to last. 1636 * from all the other buffers and move them to last.
1633 * 1637 *
1634 * Hence, 4 lists, in order from head to tail: 1638 * Hence, 4 lists, in order from head to tail:
1635 * - buffer_list for all buffers except cancelled/inode unlink buffers 1639 * - buffer_list for all buffers except cancelled/inode unlink buffers
1636 * - item_list for all non-buffer items 1640 * - item_list for all non-buffer items
1637 * - inode_buffer_list for inode unlink buffers 1641 * - inode_buffer_list for inode unlink buffers
1638 * - cancel_list for the cancelled buffers 1642 * - cancel_list for the cancelled buffers
1643 *
1644 * Note that we add objects to the tail of the lists so that first-to-last
1645 * ordering is preserved within the lists. Adding objects to the head of the
1646 * list means when we traverse from the head we walk them in last-to-first
1647 * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1648 * but for all other items there may be specific ordering that we need to
1649 * preserve.
1639 */ 1650 */
1640STATIC int 1651STATIC int
1641xlog_recover_reorder_trans( 1652xlog_recover_reorder_trans(
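As noted in the expanded comment above, dispatch is by tail insertion so that each of the four lists preserves the original log order; XFS_LI_ICREATE items simply join the buffer list because they initialise buffers just like inode allocation buffers do (this forward-looking note is mine; the reordering code itself follows in the next hunk).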
@@ -1655,6 +1666,9 @@ xlog_recover_reorder_trans(
1655 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 1666 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
1656 1667
1657 switch (ITEM_TYPE(item)) { 1668 switch (ITEM_TYPE(item)) {
1669 case XFS_LI_ICREATE:
1670 list_move_tail(&item->ri_list, &buffer_list);
1671 break;
1658 case XFS_LI_BUF: 1672 case XFS_LI_BUF:
1659 if (buf_f->blf_flags & XFS_BLF_CANCEL) { 1673 if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1660 trace_xfs_log_recover_item_reorder_head(log, 1674 trace_xfs_log_recover_item_reorder_head(log,
@@ -2578,8 +2592,16 @@ xlog_recover_inode_pass2(
2578 goto error; 2592 goto error;
2579 } 2593 }
2580 2594
2581 /* Skip replay when the on disk inode is newer than the log one */ 2595 /*
2582 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { 2596 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
2597 * are transactional and if ordering is necessary we can determine that
2598 * more accurately by the LSN field in the V3 inode core. Don't trust
	2599 	 * the inode versions as we might be changing them here - use the
2600 * superblock flag to determine whether we need to look at di_flushiter
	2601 	 * to skip replay when the on-disk inode is newer than the log one.
2602 */
2603 if (!xfs_sb_version_hascrc(&mp->m_sb) &&
2604 dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2583 /* 2605 /*
2584 * Deal with the wrap case, DI_MAX_FLUSH is less 2606 * Deal with the wrap case, DI_MAX_FLUSH is less
2585 * than smaller numbers 2607 * than smaller numbers
@@ -2594,6 +2616,7 @@ xlog_recover_inode_pass2(
2594 goto error; 2616 goto error;
2595 } 2617 }
2596 } 2618 }
2619
2597 /* Take the opportunity to reset the flush iteration count */ 2620 /* Take the opportunity to reset the flush iteration count */
2598 dicp->di_flushiter = 0; 2621 dicp->di_flushiter = 0;
2599 2622
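On pre-CRC (v4 and older) superblocks, recovery still uses di_flushiter to decide whether the on-disk inode already postdates the logged copy, with one exception where the counter has just wrapped. A hedged standalone model; the half-range wrap threshold here is my illustration, not necessarily the kernel's exact test:

#include <stdbool.h>
#include <stdint.h>

#define DI_MAX_FLUSH 0xffff

/*
 * Skip replay when the on-disk counter is ahead of the logged one,
 * except when the on-disk value sits at the wrap point and the logged
 * value has already wrapped back to a small number.
 */
static bool skip_replay(bool has_crc, uint16_t log_iter, uint16_t disk_iter)
{
	if (has_crc)
		return false;            /* v5: ordering comes from LSNs */
	if (log_iter >= disk_iter)
		return false;            /* log copy is not older */
	if (disk_iter == DI_MAX_FLUSH && log_iter < (DI_MAX_FLUSH >> 1))
		return false;            /* wrap case: log really is newer */
	return true;
}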
@@ -2982,6 +3005,93 @@ xlog_recover_efd_pass2(
2982} 3005}
2983 3006
2984/* 3007/*
3008 * This routine is called when an inode create format structure is found in a
	3009 * committed transaction in the log. Its purpose is to initialise the inodes
3010 * being allocated on disk. This requires us to get inode cluster buffers that
	3011 * match the range to be initialised, stamped with inode templates and written
3012 * by delayed write so that subsequent modifications will hit the cached buffer
3013 * and only need writing out at the end of recovery.
3014 */
3015STATIC int
3016xlog_recover_do_icreate_pass2(
3017 struct xlog *log,
3018 struct list_head *buffer_list,
3019 xlog_recover_item_t *item)
3020{
3021 struct xfs_mount *mp = log->l_mp;
3022 struct xfs_icreate_log *icl;
3023 xfs_agnumber_t agno;
3024 xfs_agblock_t agbno;
3025 unsigned int count;
3026 unsigned int isize;
3027 xfs_agblock_t length;
3028
3029 icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3030 if (icl->icl_type != XFS_LI_ICREATE) {
3031 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3032 return EINVAL;
3033 }
3034
3035 if (icl->icl_size != 1) {
3036 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3037 return EINVAL;
3038 }
3039
3040 agno = be32_to_cpu(icl->icl_ag);
3041 if (agno >= mp->m_sb.sb_agcount) {
3042 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3043 return EINVAL;
3044 }
3045 agbno = be32_to_cpu(icl->icl_agbno);
3046 if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3047 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3048 return EINVAL;
3049 }
3050 isize = be32_to_cpu(icl->icl_isize);
3051 if (isize != mp->m_sb.sb_inodesize) {
3052 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3053 return EINVAL;
3054 }
3055 count = be32_to_cpu(icl->icl_count);
3056 if (!count) {
3057 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3058 return EINVAL;
3059 }
3060 length = be32_to_cpu(icl->icl_length);
3061 if (!length || length >= mp->m_sb.sb_agblocks) {
3062 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3063 return EINVAL;
3064 }
3065
3066 /* existing allocation is fixed value */
3067 ASSERT(count == XFS_IALLOC_INODES(mp));
3068 ASSERT(length == XFS_IALLOC_BLOCKS(mp));
3069 if (count != XFS_IALLOC_INODES(mp) ||
3070 length != XFS_IALLOC_BLOCKS(mp)) {
3071 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
3072 return EINVAL;
3073 }
3074
3075 /*
3076 * Inode buffers can be freed. Do not replay the inode initialisation as
3077 * we could be overwriting something written after this inode buffer was
3078 * cancelled.
3079 *
3080 * XXX: we need to iterate all buffers and only init those that are not
	3081 * cancelled. I think that a more fine-grained factoring of
3082 * xfs_ialloc_inode_init may be appropriate here to enable this to be
3083 * done easily.
3084 */
3085 if (xlog_check_buffer_cancelled(log,
3086 XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
3087 return 0;
3088
3089 xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
3090 be32_to_cpu(icl->icl_gen));
3091 return 0;
3092}
3093
3094/*
2985 * Free up any resources allocated by the transaction 3095 * Free up any resources allocated by the transaction
2986 * 3096 *
2987 * Remember that EFIs, EFDs, and IUNLINKs are handled later. 3097 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
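Everything xlog_recover_do_icreate_pass2() reads out of the log record is bounds-checked before any buffer is touched, and a positive EINVAL is returned on the first bad field (XFS's internal errno convention). The same validate-before-replay shape in a standalone sketch, with invented types and limits:

#include <errno.h>
#include <stdint.h>

struct icreate_rec {                 /* toy on-log record */
	uint32_t agno, agbno, count, length;
};

struct geom {                        /* toy filesystem geometry */
	uint32_t agcount, agblocks, chunk_inodes, chunk_blocks;
};

/* Reject a record unless every field is in range for this filesystem. */
static int validate_icreate(const struct icreate_rec *r, const struct geom *g)
{
	if (r->agno >= g->agcount)
		return EINVAL;                     /* bad AG number */
	if (!r->agbno || r->agbno >= g->agblocks)
		return EINVAL;                     /* bad AG block */
	if (r->count != g->chunk_inodes || r->length != g->chunk_blocks)
		return EINVAL;   /* allocation geometry is a fixed value */
	return 0;
}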
@@ -3023,6 +3133,7 @@ xlog_recover_commit_pass1(
3023 case XFS_LI_EFI: 3133 case XFS_LI_EFI:
3024 case XFS_LI_EFD: 3134 case XFS_LI_EFD:
3025 case XFS_LI_DQUOT: 3135 case XFS_LI_DQUOT:
3136 case XFS_LI_ICREATE:
3026 /* nothing to do in pass 1 */ 3137 /* nothing to do in pass 1 */
3027 return 0; 3138 return 0;
3028 default: 3139 default:
@@ -3053,6 +3164,8 @@ xlog_recover_commit_pass2(
3053 return xlog_recover_efd_pass2(log, item); 3164 return xlog_recover_efd_pass2(log, item);
3054 case XFS_LI_DQUOT: 3165 case XFS_LI_DQUOT:
3055 return xlog_recover_dquot_pass2(log, buffer_list, item); 3166 return xlog_recover_dquot_pass2(log, buffer_list, item);
3167 case XFS_LI_ICREATE:
3168 return xlog_recover_do_icreate_pass2(log, buffer_list, item);
3056 case XFS_LI_QUOTAOFF: 3169 case XFS_LI_QUOTAOFF:
3057 /* nothing to do in pass2 */ 3170 /* nothing to do in pass2 */
3058 return 0; 3171 return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e8e310c05097..2b0ba3581656 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -336,6 +336,14 @@ xfs_mount_validate_sb(
336 return XFS_ERROR(EWRONGFS); 336 return XFS_ERROR(EWRONGFS);
337 } 337 }
338 338
339 if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) &&
340 (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
341 XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) {
342 xfs_notice(mp,
343"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n");
344 return XFS_ERROR(EFSCORRUPTED);
345 }
346
339 /* 347 /*
340 * Version 5 superblock feature mask validation. Reject combinations the 348 * Version 5 superblock feature mask validation. Reject combinations the
341 * kernel cannot support up front before checking anything else. For 349 * kernel cannot support up front before checking anything else. For
@@ -561,6 +569,18 @@ out_unwind:
561 return error; 569 return error;
562} 570}
563 571
572static void
573xfs_sb_quota_from_disk(struct xfs_sb *sbp)
574{
575 if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
576 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
577 XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
578 if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
579 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
580 XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
581 sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
582}
583
564void 584void
565xfs_sb_from_disk( 585xfs_sb_from_disk(
566 struct xfs_sb *to, 586 struct xfs_sb *to,
@@ -622,6 +642,35 @@ xfs_sb_from_disk(
622 to->sb_lsn = be64_to_cpu(from->sb_lsn); 642 to->sb_lsn = be64_to_cpu(from->sb_lsn);
623} 643}
624 644
645static inline void
646xfs_sb_quota_to_disk(
647 xfs_dsb_t *to,
648 xfs_sb_t *from,
649 __int64_t *fields)
650{
651 __uint16_t qflags = from->sb_qflags;
652
653 if (*fields & XFS_SB_QFLAGS) {
654 /*
	655 	 * The in-core version of sb_qflags does not have
656 * XFS_OQUOTA_* flags, whereas the on-disk version
657 * does. So, convert incore XFS_{PG}QUOTA_* flags
658 * to on-disk XFS_OQUOTA_* flags.
659 */
660 qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
661 XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
662
663 if (from->sb_qflags &
664 (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
665 qflags |= XFS_OQUOTA_ENFD;
666 if (from->sb_qflags &
667 (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
668 qflags |= XFS_OQUOTA_CHKD;
669 to->sb_qflags = cpu_to_be16(qflags);
670 *fields &= ~XFS_SB_QFLAGS;
671 }
672}
673
625/* 674/*
626 * Copy in core superblock to ondisk one. 675 * Copy in core superblock to ondisk one.
627 * 676 *
@@ -643,6 +692,7 @@ xfs_sb_to_disk(
643 if (!fields) 692 if (!fields)
644 return; 693 return;
645 694
695 xfs_sb_quota_to_disk(to, from, &fields);
646 while (fields) { 696 while (fields) {
647 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); 697 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
648 first = xfs_sb_info[f].offset; 698 first = xfs_sb_info[f].offset;
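xfs_sb_quota_from_disk()/xfs_sb_quota_to_disk() fold the split in-core group/project enforcement bits into the single on-disk OQUOTA pair and back, using PQUOTA_ACCT to disambiguate. A round-trip sketch with made-up bit values (the CHKD bits convert the same way and are omitted for brevity):

#include <assert.h>
#include <stdint.h>

#define PQUOTA_ACCT 0x01   /* illustrative values, not the on-disk ones */
#define OQUOTA_ENFD 0x02
#define PQUOTA_ENFD 0x04
#define GQUOTA_ENFD 0x08

static uint16_t qflags_from_disk(uint16_t f)
{
	if (f & OQUOTA_ENFD)   /* project if accounting says so, else group */
		f |= (f & PQUOTA_ACCT) ? PQUOTA_ENFD : GQUOTA_ENFD;
	return f & ~OQUOTA_ENFD;
}

static uint16_t qflags_to_disk(uint16_t f)
{
	if (f & (PQUOTA_ENFD | GQUOTA_ENFD))
		f |= OQUOTA_ENFD;
	return f & ~(PQUOTA_ENFD | GQUOTA_ENFD);
}

int main(void)
{
	uint16_t disk = PQUOTA_ACCT | OQUOTA_ENFD;
	uint16_t core = qflags_from_disk(disk);

	assert(core & PQUOTA_ENFD);            /* OQUOTA became PQUOTA */
	assert(qflags_to_disk(core) == disk);  /* and converts back    */
	return 0;
}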
@@ -835,6 +885,7 @@ reread:
835 */ 885 */
836 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 886 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
837 887
888 xfs_sb_quota_from_disk(&mp->m_sb);
838 /* 889 /*
839 * We must be able to do sector-sized and sector-aligned IO. 890 * We must be able to do sector-sized and sector-aligned IO.
840 */ 891 */
@@ -987,42 +1038,27 @@ xfs_update_alignment(xfs_mount_t *mp)
987 */ 1038 */
988 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || 1039 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
989 (BBTOB(mp->m_swidth) & mp->m_blockmask)) { 1040 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
990 if (mp->m_flags & XFS_MOUNT_RETERR) { 1041 xfs_warn(mp,
991 xfs_warn(mp, "alignment check failed: " 1042 "alignment check failed: sunit/swidth vs. blocksize(%d)",
992 "(sunit/swidth vs. blocksize)"); 1043 sbp->sb_blocksize);
993 return XFS_ERROR(EINVAL); 1044 return XFS_ERROR(EINVAL);
994 }
995 mp->m_dalign = mp->m_swidth = 0;
996 } else { 1045 } else {
997 /* 1046 /*
998 * Convert the stripe unit and width to FSBs. 1047 * Convert the stripe unit and width to FSBs.
999 */ 1048 */
1000 mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); 1049 mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
1001 if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { 1050 if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
1002 if (mp->m_flags & XFS_MOUNT_RETERR) {
1003 xfs_warn(mp, "alignment check failed: "
1004 "(sunit/swidth vs. ag size)");
1005 return XFS_ERROR(EINVAL);
1006 }
1007 xfs_warn(mp, 1051 xfs_warn(mp,
1008 "stripe alignment turned off: sunit(%d)/swidth(%d) " 1052 "alignment check failed: sunit/swidth vs. agsize(%d)",
1009 "incompatible with agsize(%d)", 1053 sbp->sb_agblocks);
1010 mp->m_dalign, mp->m_swidth, 1054 return XFS_ERROR(EINVAL);
1011 sbp->sb_agblocks);
1012
1013 mp->m_dalign = 0;
1014 mp->m_swidth = 0;
1015 } else if (mp->m_dalign) { 1055 } else if (mp->m_dalign) {
1016 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 1056 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
1017 } else { 1057 } else {
1018 if (mp->m_flags & XFS_MOUNT_RETERR) { 1058 xfs_warn(mp,
1019 xfs_warn(mp, "alignment check failed: " 1059 "alignment check failed: sunit(%d) less than bsize(%d)",
1020 "sunit(%d) less than bsize(%d)", 1060 mp->m_dalign, sbp->sb_blocksize);
1021 mp->m_dalign, 1061 return XFS_ERROR(EINVAL);
1022 mp->m_blockmask +1);
1023 return XFS_ERROR(EINVAL);
1024 }
1025 mp->m_swidth = 0;
1026 } 1062 }
1027 } 1063 }
1028 1064
@@ -1039,6 +1075,10 @@ xfs_update_alignment(xfs_mount_t *mp)
1039 sbp->sb_width = mp->m_swidth; 1075 sbp->sb_width = mp->m_swidth;
1040 mp->m_update_flags |= XFS_SB_WIDTH; 1076 mp->m_update_flags |= XFS_SB_WIDTH;
1041 } 1077 }
1078 } else {
1079 xfs_warn(mp,
1080 "cannot change alignment: superblock does not support data alignment");
1081 return XFS_ERROR(EINVAL);
1042 } 1082 }
1043 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && 1083 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
1044 xfs_sb_version_hasdalign(&mp->m_sb)) { 1084 xfs_sb_version_hasdalign(&mp->m_sb)) {
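With XFS_MOUNT_RETERR gone, a stripe geometry that fails validation is now always a hard EINVAL at mount time instead of being quietly zeroed. A reduced model of the checks, with everything in plain byte units rather than the kernel's BBs and FSBs:

#include <errno.h>
#include <stdint.h>

/*
 * Stripe unit and width must be non-zero multiples of the block size,
 * and the stripe unit must divide the AG size; otherwise fail the mount.
 */
static int check_alignment(uint32_t sunit, uint32_t swidth,
			   uint32_t blocksize, uint32_t agblocks)
{
	if (!sunit)
		return 0;                        /* no alignment requested */
	if (sunit % blocksize || swidth % blocksize)
		return EINVAL;                   /* vs. blocksize */
	if (agblocks % (sunit / blocksize))
		return EINVAL;                   /* vs. AG size */
	return 0;
}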
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b004cecdfb04..4e374d4a9189 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -192,8 +192,6 @@ typedef struct xfs_mount {
192 xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ 192 xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
193 xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ 193 xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
194 uint m_chsize; /* size of next field */ 194 uint m_chsize; /* size of next field */
195 struct xfs_chash *m_chash; /* fs private inode per-cluster
196 * hash table */
197 atomic_t m_active_trans; /* number trans frozen */ 195 atomic_t m_active_trans; /* number trans frozen */
198#ifdef HAVE_PERCPU_SB 196#ifdef HAVE_PERCPU_SB
199 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ 197 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
@@ -229,8 +227,6 @@ typedef struct xfs_mount {
229 operations, typically for 227 operations, typically for
230 disk errors in metadata */ 228 disk errors in metadata */
231#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ 229#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
232#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
233 user */
234#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment 230#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
235 allocations */ 231 allocations */
236#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ 232#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b75c9bb6e71e..d320794d03ce 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -70,7 +70,7 @@ xfs_qm_dquot_walk(
70 void *data) 70 void *data)
71{ 71{
72 struct xfs_quotainfo *qi = mp->m_quotainfo; 72 struct xfs_quotainfo *qi = mp->m_quotainfo;
73 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 73 struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
74 uint32_t next_index; 74 uint32_t next_index;
75 int last_error = 0; 75 int last_error = 0;
76 int skipped; 76 int skipped;
@@ -137,6 +137,7 @@ xfs_qm_dqpurge(
137 struct xfs_mount *mp = dqp->q_mount; 137 struct xfs_mount *mp = dqp->q_mount;
138 struct xfs_quotainfo *qi = mp->m_quotainfo; 138 struct xfs_quotainfo *qi = mp->m_quotainfo;
139 struct xfs_dquot *gdqp = NULL; 139 struct xfs_dquot *gdqp = NULL;
140 struct xfs_dquot *pdqp = NULL;
140 141
141 xfs_dqlock(dqp); 142 xfs_dqlock(dqp);
142 if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { 143 if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
@@ -145,8 +146,7 @@ xfs_qm_dqpurge(
145 } 146 }
146 147
147 /* 148 /*
148 * If this quota has a group hint attached, prepare for releasing it 149 * If this quota has a hint attached, prepare for releasing it now.
149 * now.
150 */ 150 */
151 gdqp = dqp->q_gdquot; 151 gdqp = dqp->q_gdquot;
152 if (gdqp) { 152 if (gdqp) {
@@ -154,6 +154,12 @@ xfs_qm_dqpurge(
154 dqp->q_gdquot = NULL; 154 dqp->q_gdquot = NULL;
155 } 155 }
156 156
157 pdqp = dqp->q_pdquot;
158 if (pdqp) {
159 xfs_dqlock(pdqp);
160 dqp->q_pdquot = NULL;
161 }
162
157 dqp->dq_flags |= XFS_DQ_FREEING; 163 dqp->dq_flags |= XFS_DQ_FREEING;
158 164
159 xfs_dqflock(dqp); 165 xfs_dqflock(dqp);
@@ -189,7 +195,7 @@ xfs_qm_dqpurge(
189 xfs_dqfunlock(dqp); 195 xfs_dqfunlock(dqp);
190 xfs_dqunlock(dqp); 196 xfs_dqunlock(dqp);
191 197
192 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), 198 radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
193 be32_to_cpu(dqp->q_core.d_id)); 199 be32_to_cpu(dqp->q_core.d_id));
194 qi->qi_dquots--; 200 qi->qi_dquots--;
195 201
@@ -208,6 +214,8 @@ xfs_qm_dqpurge(
208 214
209 if (gdqp) 215 if (gdqp)
210 xfs_qm_dqput(gdqp); 216 xfs_qm_dqput(gdqp);
217 if (pdqp)
218 xfs_qm_dqput(pdqp);
211 return 0; 219 return 0;
212} 220}
213 221
@@ -299,8 +307,10 @@ xfs_qm_mount_quotas(
299 */ 307 */
300 if (!XFS_IS_UQUOTA_ON(mp)) 308 if (!XFS_IS_UQUOTA_ON(mp))
301 mp->m_qflags &= ~XFS_UQUOTA_CHKD; 309 mp->m_qflags &= ~XFS_UQUOTA_CHKD;
302 if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) 310 if (!XFS_IS_GQUOTA_ON(mp))
303 mp->m_qflags &= ~XFS_OQUOTA_CHKD; 311 mp->m_qflags &= ~XFS_GQUOTA_CHKD;
312 if (!XFS_IS_PQUOTA_ON(mp))
313 mp->m_qflags &= ~XFS_PQUOTA_CHKD;
304 314
305 write_changes: 315 write_changes:
306 /* 316 /*
@@ -362,6 +372,10 @@ xfs_qm_unmount_quotas(
362 IRELE(mp->m_quotainfo->qi_gquotaip); 372 IRELE(mp->m_quotainfo->qi_gquotaip);
363 mp->m_quotainfo->qi_gquotaip = NULL; 373 mp->m_quotainfo->qi_gquotaip = NULL;
364 } 374 }
375 if (mp->m_quotainfo->qi_pquotaip) {
376 IRELE(mp->m_quotainfo->qi_pquotaip);
377 mp->m_quotainfo->qi_pquotaip = NULL;
378 }
365 } 379 }
366} 380}
367 381
@@ -408,7 +422,10 @@ xfs_qm_dqattach_one(
408 * be reclaimed as long as we have a ref from inode and we 422 * be reclaimed as long as we have a ref from inode and we
409 * hold the ilock. 423 * hold the ilock.
410 */ 424 */
411 dqp = udqhint->q_gdquot; 425 if (type == XFS_DQ_GROUP)
426 dqp = udqhint->q_gdquot;
427 else
428 dqp = udqhint->q_pdquot;
412 if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { 429 if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
413 ASSERT(*IO_idqpp == NULL); 430 ASSERT(*IO_idqpp == NULL);
414 431
@@ -451,28 +468,42 @@ xfs_qm_dqattach_one(
451 468
452 469
453/* 470/*
454 * Given a udquot and gdquot, attach a ptr to the group dquot in the 471 * Given a udquot and group/project type, attach the group/project
455 * udquot as a hint for future lookups. 472 * dquot pointer to the udquot as a hint for future lookups.
456 */ 473 */
457STATIC void 474STATIC void
458xfs_qm_dqattach_grouphint( 475xfs_qm_dqattach_hint(
459 xfs_dquot_t *udq, 476 struct xfs_inode *ip,
460 xfs_dquot_t *gdq) 477 int type)
461{ 478{
462 xfs_dquot_t *tmp; 479 struct xfs_dquot **dqhintp;
480 struct xfs_dquot *dqp;
481 struct xfs_dquot *udq = ip->i_udquot;
482
483 ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
463 484
464 xfs_dqlock(udq); 485 xfs_dqlock(udq);
465 486
466 tmp = udq->q_gdquot; 487 if (type == XFS_DQ_GROUP) {
467 if (tmp) { 488 dqp = ip->i_gdquot;
468 if (tmp == gdq) 489 dqhintp = &udq->q_gdquot;
490 } else {
491 dqp = ip->i_pdquot;
492 dqhintp = &udq->q_pdquot;
493 }
494
495 if (*dqhintp) {
496 struct xfs_dquot *tmp;
497
498 if (*dqhintp == dqp)
469 goto done; 499 goto done;
470 500
471 udq->q_gdquot = NULL; 501 tmp = *dqhintp;
502 *dqhintp = NULL;
472 xfs_qm_dqrele(tmp); 503 xfs_qm_dqrele(tmp);
473 } 504 }
474 505
475 udq->q_gdquot = xfs_qm_dqhold(gdq); 506 *dqhintp = xfs_qm_dqhold(dqp);
476done: 507done:
477 xfs_dqunlock(udq); 508 xfs_dqunlock(udq);
478} 509}
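xfs_qm_dqattach_hint() folds the old group-only helper and the new project case into one routine by selecting the hint slot with a pointer-to-pointer. A standalone sketch of that selection idiom (locking and reference counting elided; the real code holds the user dquot's lock and uses xfs_qm_dqhold/xfs_qm_dqrele on the swap):

struct dquot { int id; };

struct udquot_hints {
	struct dquot *group;     /* analogue of q_gdquot */
	struct dquot *project;   /* analogue of q_pdquot */
};

enum dqtype { DQ_GROUP, DQ_PROJ };

/* Pick the hint slot by type, drop a stale hint, install the new one. */
static void attach_hint(struct udquot_hints *u, enum dqtype type,
			struct dquot *dqp)
{
	struct dquot **slot =
		(type == DQ_GROUP) ? &u->group : &u->project;

	if (*slot == dqp)
		return;          /* hint already current */
	*slot = dqp;             /* real code releases the old reference */
}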
@@ -489,8 +520,7 @@ xfs_qm_need_dqattach(
489 return false; 520 return false;
490 if (!XFS_NOT_DQATTACHED(mp, ip)) 521 if (!XFS_NOT_DQATTACHED(mp, ip))
491 return false; 522 return false;
492 if (ip->i_ino == mp->m_sb.sb_uquotino || 523 if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
493 ip->i_ino == mp->m_sb.sb_gquotino)
494 return false; 524 return false;
495 return true; 525 return true;
496} 526}
@@ -526,12 +556,8 @@ xfs_qm_dqattach_locked(
526 } 556 }
527 557
528 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 558 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
529 if (XFS_IS_OQUOTA_ON(mp)) { 559 if (XFS_IS_GQUOTA_ON(mp)) {
530 error = XFS_IS_GQUOTA_ON(mp) ? 560 error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
531 xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
532 flags & XFS_QMOPT_DQALLOC,
533 ip->i_udquot, &ip->i_gdquot) :
534 xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
535 flags & XFS_QMOPT_DQALLOC, 561 flags & XFS_QMOPT_DQALLOC,
536 ip->i_udquot, &ip->i_gdquot); 562 ip->i_udquot, &ip->i_gdquot);
537 /* 563 /*
@@ -543,14 +569,28 @@ xfs_qm_dqattach_locked(
543 nquotas++; 569 nquotas++;
544 } 570 }
545 571
572 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
573 if (XFS_IS_PQUOTA_ON(mp)) {
574 error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
575 flags & XFS_QMOPT_DQALLOC,
576 ip->i_udquot, &ip->i_pdquot);
577 /*
578 * Don't worry about the udquot that we may have
579 * attached above. It'll get detached, if not already.
580 */
581 if (error)
582 goto done;
583 nquotas++;
584 }
585
546 /* 586 /*
547 * Attach this group quota to the user quota as a hint. 587 * Attach this group/project quota to the user quota as a hint.
548 	 * This WON'T, in general, result in thrashing.	588 	 * This WON'T, in general, result in thrashing.
549 */ 589 */
550 if (nquotas == 2) { 590 if (nquotas > 1 && ip->i_udquot) {
551 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 591 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
552 ASSERT(ip->i_udquot); 592 ASSERT(ip->i_gdquot || !XFS_IS_GQUOTA_ON(mp));
553 ASSERT(ip->i_gdquot); 593 ASSERT(ip->i_pdquot || !XFS_IS_PQUOTA_ON(mp));
554 594
555 /* 595 /*
556 * We do not have i_udquot locked at this point, but this check 596 * We do not have i_udquot locked at this point, but this check
@@ -559,7 +599,10 @@ xfs_qm_dqattach_locked(
559 * succeed in general. 599 * succeed in general.
560 */ 600 */
561 if (ip->i_udquot->q_gdquot != ip->i_gdquot) 601 if (ip->i_udquot->q_gdquot != ip->i_gdquot)
562 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); 602 xfs_qm_dqattach_hint(ip, XFS_DQ_GROUP);
603
604 if (ip->i_udquot->q_pdquot != ip->i_pdquot)
605 xfs_qm_dqattach_hint(ip, XFS_DQ_PROJ);
563 } 606 }
564 607
565 done: 608 done:
@@ -567,8 +610,10 @@ xfs_qm_dqattach_locked(
567 if (!error) { 610 if (!error) {
568 if (XFS_IS_UQUOTA_ON(mp)) 611 if (XFS_IS_UQUOTA_ON(mp))
569 ASSERT(ip->i_udquot); 612 ASSERT(ip->i_udquot);
570 if (XFS_IS_OQUOTA_ON(mp)) 613 if (XFS_IS_GQUOTA_ON(mp))
571 ASSERT(ip->i_gdquot); 614 ASSERT(ip->i_gdquot);
615 if (XFS_IS_PQUOTA_ON(mp))
616 ASSERT(ip->i_pdquot);
572 } 617 }
573 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 618 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
574#endif 619#endif
@@ -601,13 +646,12 @@ void
601xfs_qm_dqdetach( 646xfs_qm_dqdetach(
602 xfs_inode_t *ip) 647 xfs_inode_t *ip)
603{ 648{
604 if (!(ip->i_udquot || ip->i_gdquot)) 649 if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot))
605 return; 650 return;
606 651
607 trace_xfs_dquot_dqdetach(ip); 652 trace_xfs_dquot_dqdetach(ip);
608 653
609 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); 654 ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino));
610 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
611 if (ip->i_udquot) { 655 if (ip->i_udquot) {
612 xfs_qm_dqrele(ip->i_udquot); 656 xfs_qm_dqrele(ip->i_udquot);
613 ip->i_udquot = NULL; 657 ip->i_udquot = NULL;
@@ -616,6 +660,10 @@ xfs_qm_dqdetach(
616 xfs_qm_dqrele(ip->i_gdquot); 660 xfs_qm_dqrele(ip->i_gdquot);
617 ip->i_gdquot = NULL; 661 ip->i_gdquot = NULL;
618 } 662 }
663 if (ip->i_pdquot) {
664 xfs_qm_dqrele(ip->i_pdquot);
665 ip->i_pdquot = NULL;
666 }
619} 667}
620 668
621int 669int
@@ -660,6 +708,7 @@ xfs_qm_init_quotainfo(
660 708
661 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); 709 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
662 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); 710 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
711 INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
663 mutex_init(&qinf->qi_tree_lock); 712 mutex_init(&qinf->qi_tree_lock);
664 713
665 INIT_LIST_HEAD(&qinf->qi_lru_list); 714 INIT_LIST_HEAD(&qinf->qi_lru_list);
@@ -761,6 +810,10 @@ xfs_qm_destroy_quotainfo(
761 IRELE(qi->qi_gquotaip); 810 IRELE(qi->qi_gquotaip);
762 qi->qi_gquotaip = NULL; 811 qi->qi_gquotaip = NULL;
763 } 812 }
813 if (qi->qi_pquotaip) {
814 IRELE(qi->qi_pquotaip);
815 qi->qi_pquotaip = NULL;
816 }
764 mutex_destroy(&qi->qi_quotaofflock); 817 mutex_destroy(&qi->qi_quotaofflock);
765 kmem_free(qi); 818 kmem_free(qi);
766 mp->m_quotainfo = NULL; 819 mp->m_quotainfo = NULL;
@@ -1152,7 +1205,7 @@ xfs_qm_dqusage_adjust(
1152 * rootino must have its resources accounted for, not so with the quota 1205 * rootino must have its resources accounted for, not so with the quota
1153 * inodes. 1206 * inodes.
1154 */ 1207 */
1155 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { 1208 if (xfs_is_quota_inode(&mp->m_sb, ino)) {
1156 *res = BULKSTAT_RV_NOTHING; 1209 *res = BULKSTAT_RV_NOTHING;
1157 return XFS_ERROR(EINVAL); 1210 return XFS_ERROR(EINVAL);
1158 } 1211 }
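The repeated two-inode comparisons throughout this commit collapse into xfs_is_quota_inode(). Its body is not shown in this diff; a plausible minimal definition consistent with the call sites (and with project quota still sharing sb_gquotino at this point) would be:

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t xfs_ino_t;

struct sb_min {                      /* only the fields the check needs */
	xfs_ino_t sb_uquotino;
	xfs_ino_t sb_gquotino;
};

/* Likely shape of the helper: does this inode back any quota file? */
static inline bool is_quota_inode(const struct sb_min *sbp, xfs_ino_t ino)
{
	return ino == sbp->sb_uquotino || ino == sbp->sb_gquotino;
}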
@@ -1262,19 +1315,21 @@ int
1262xfs_qm_quotacheck( 1315xfs_qm_quotacheck(
1263 xfs_mount_t *mp) 1316 xfs_mount_t *mp)
1264{ 1317{
1265 int done, count, error, error2; 1318 int done, count, error, error2;
1266 xfs_ino_t lastino; 1319 xfs_ino_t lastino;
1267 size_t structsz; 1320 size_t structsz;
1268 xfs_inode_t *uip, *gip; 1321 uint flags;
1269 uint flags; 1322 LIST_HEAD (buffer_list);
1270 LIST_HEAD (buffer_list); 1323 struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip;
1324 struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip;
1325 struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip;
1271 1326
1272 count = INT_MAX; 1327 count = INT_MAX;
1273 structsz = 1; 1328 structsz = 1;
1274 lastino = 0; 1329 lastino = 0;
1275 flags = 0; 1330 flags = 0;
1276 1331
1277 ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); 1332 ASSERT(uip || gip || pip);
1278 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1333 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1279 1334
1280 xfs_notice(mp, "Quotacheck needed: Please wait."); 1335 xfs_notice(mp, "Quotacheck needed: Please wait.");
@@ -1284,7 +1339,6 @@ xfs_qm_quotacheck(
1284 * their counters to zero. We need a clean slate. 1339 * their counters to zero. We need a clean slate.
1285 * We don't log our changes till later. 1340 * We don't log our changes till later.
1286 */ 1341 */
1287 uip = mp->m_quotainfo->qi_uquotaip;
1288 if (uip) { 1342 if (uip) {
1289 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, 1343 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
1290 &buffer_list); 1344 &buffer_list);
@@ -1293,14 +1347,20 @@ xfs_qm_quotacheck(
1293 flags |= XFS_UQUOTA_CHKD; 1347 flags |= XFS_UQUOTA_CHKD;
1294 } 1348 }
1295 1349
1296 gip = mp->m_quotainfo->qi_gquotaip;
1297 if (gip) { 1350 if (gip) {
1298 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1351 error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA,
1299 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
1300 &buffer_list); 1352 &buffer_list);
1301 if (error) 1353 if (error)
1302 goto error_return; 1354 goto error_return;
1303 flags |= XFS_OQUOTA_CHKD; 1355 flags |= XFS_GQUOTA_CHKD;
1356 }
1357
1358 if (pip) {
1359 error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA,
1360 &buffer_list);
1361 if (error)
1362 goto error_return;
1363 flags |= XFS_PQUOTA_CHKD;
1304 } 1364 }
1305 1365
1306 do { 1366 do {
@@ -1395,15 +1455,14 @@ STATIC int
1395xfs_qm_init_quotainos( 1455xfs_qm_init_quotainos(
1396 xfs_mount_t *mp) 1456 xfs_mount_t *mp)
1397{ 1457{
1398 xfs_inode_t *uip, *gip; 1458 struct xfs_inode *uip = NULL;
1399 int error; 1459 struct xfs_inode *gip = NULL;
1400 __int64_t sbflags; 1460 struct xfs_inode *pip = NULL;
1401 uint flags; 1461 int error;
1462 __int64_t sbflags = 0;
1463 uint flags = 0;
1402 1464
1403 ASSERT(mp->m_quotainfo); 1465 ASSERT(mp->m_quotainfo);
1404 uip = gip = NULL;
1405 sbflags = 0;
1406 flags = 0;
1407 1466
1408 /* 1467 /*
1409 * Get the uquota and gquota inodes 1468 * Get the uquota and gquota inodes
@@ -1412,19 +1471,27 @@ xfs_qm_init_quotainos(
1412 if (XFS_IS_UQUOTA_ON(mp) && 1471 if (XFS_IS_UQUOTA_ON(mp) &&
1413 mp->m_sb.sb_uquotino != NULLFSINO) { 1472 mp->m_sb.sb_uquotino != NULLFSINO) {
1414 ASSERT(mp->m_sb.sb_uquotino > 0); 1473 ASSERT(mp->m_sb.sb_uquotino > 0);
1415 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 1474 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
1416 0, 0, &uip))) 1475 0, 0, &uip);
1476 if (error)
1417 return XFS_ERROR(error); 1477 return XFS_ERROR(error);
1418 } 1478 }
1419 if (XFS_IS_OQUOTA_ON(mp) && 1479 if (XFS_IS_GQUOTA_ON(mp) &&
1420 mp->m_sb.sb_gquotino != NULLFSINO) { 1480 mp->m_sb.sb_gquotino != NULLFSINO) {
1421 ASSERT(mp->m_sb.sb_gquotino > 0); 1481 ASSERT(mp->m_sb.sb_gquotino > 0);
1422 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 1482 error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
1423 0, 0, &gip))) { 1483 0, 0, &gip);
1424 if (uip) 1484 if (error)
1425 IRELE(uip); 1485 goto error_rele;
1426 return XFS_ERROR(error); 1486 }
1427 } 1487 /* XXX: Use gquotino for now */
1488 if (XFS_IS_PQUOTA_ON(mp) &&
1489 mp->m_sb.sb_gquotino != NULLFSINO) {
1490 ASSERT(mp->m_sb.sb_gquotino > 0);
1491 error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
1492 0, 0, &pip);
1493 if (error)
1494 goto error_rele;
1428 } 1495 }
1429 } else { 1496 } else {
1430 flags |= XFS_QMOPT_SBVERSION; 1497 flags |= XFS_QMOPT_SBVERSION;
@@ -1433,36 +1500,52 @@ xfs_qm_init_quotainos(
1433 } 1500 }
1434 1501
1435 /* 1502 /*
1436 * Create the two inodes, if they don't exist already. The changes 1503 * Create the three inodes, if they don't exist already. The changes
1437 * made above will get added to a transaction and logged in one of 1504 * made above will get added to a transaction and logged in one of
1438 * the qino_alloc calls below. If the device is readonly, 1505 * the qino_alloc calls below. If the device is readonly,
1439 * temporarily switch to read-write to do this. 1506 * temporarily switch to read-write to do this.
1440 */ 1507 */
1441 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { 1508 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
1442 if ((error = xfs_qm_qino_alloc(mp, &uip, 1509 error = xfs_qm_qino_alloc(mp, &uip,
1443 sbflags | XFS_SB_UQUOTINO, 1510 sbflags | XFS_SB_UQUOTINO,
1444 flags | XFS_QMOPT_UQUOTA))) 1511 flags | XFS_QMOPT_UQUOTA);
1445 return XFS_ERROR(error); 1512 if (error)
1513 goto error_rele;
1446 1514
1447 flags &= ~XFS_QMOPT_SBVERSION; 1515 flags &= ~XFS_QMOPT_SBVERSION;
1448 } 1516 }
1449 if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) { 1517 if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
1450 flags |= (XFS_IS_GQUOTA_ON(mp) ?
1451 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
1452 error = xfs_qm_qino_alloc(mp, &gip, 1518 error = xfs_qm_qino_alloc(mp, &gip,
1453 sbflags | XFS_SB_GQUOTINO, flags); 1519 sbflags | XFS_SB_GQUOTINO,
1454 if (error) { 1520 flags | XFS_QMOPT_GQUOTA);
1455 if (uip) 1521 if (error)
1456 IRELE(uip); 1522 goto error_rele;
1457 1523
1458 return XFS_ERROR(error); 1524 flags &= ~XFS_QMOPT_SBVERSION;
1459 } 1525 }
1526 if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
1527 /* XXX: Use XFS_SB_GQUOTINO for now */
1528 error = xfs_qm_qino_alloc(mp, &pip,
1529 sbflags | XFS_SB_GQUOTINO,
1530 flags | XFS_QMOPT_PQUOTA);
1531 if (error)
1532 goto error_rele;
1460 } 1533 }
1461 1534
1462 mp->m_quotainfo->qi_uquotaip = uip; 1535 mp->m_quotainfo->qi_uquotaip = uip;
1463 mp->m_quotainfo->qi_gquotaip = gip; 1536 mp->m_quotainfo->qi_gquotaip = gip;
1537 mp->m_quotainfo->qi_pquotaip = pip;
1464 1538
1465 return 0; 1539 return 0;
1540
1541error_rele:
1542 if (uip)
1543 IRELE(uip);
1544 if (gip)
1545 IRELE(gip);
1546 if (pip)
1547 IRELE(pip);
1548 return XFS_ERROR(error);
1466} 1549}
1467 1550
1468STATIC void 1551STATIC void
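The rewritten xfs_qm_init_quotainos() funnels every failure through a single error_rele label that releases whichever quota inodes were already grabbed, the standard goto-unwind idiom. A self-contained sketch of the shape (malloc/free standing in for xfs_iget/IRELE):

#include <errno.h>
#include <stdlib.h>

struct inode { int ino; };

static int iget(int ino, struct inode **out)
{
	*out = malloc(sizeof(**out));
	return *out ? 0 : ENOMEM;
}

static int init_quotainos(struct inode **u, struct inode **g, struct inode **p)
{
	struct inode *uip = NULL, *gip = NULL, *pip = NULL;
	int error;

	if ((error = iget(1, &uip)) || (error = iget(2, &gip)) ||
	    (error = iget(3, &pip)))
		goto error_rele;

	*u = uip; *g = gip; *p = pip;
	return 0;

error_rele:                     /* free whichever lookups succeeded */
	free(uip);
	free(gip);
	free(pip);
	return error;
}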
@@ -1473,7 +1556,7 @@ xfs_qm_dqfree_one(
1473 struct xfs_quotainfo *qi = mp->m_quotainfo; 1556 struct xfs_quotainfo *qi = mp->m_quotainfo;
1474 1557
1475 mutex_lock(&qi->qi_tree_lock); 1558 mutex_lock(&qi->qi_tree_lock);
1476 radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags), 1559 radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags),
1477 be32_to_cpu(dqp->q_core.d_id)); 1560 be32_to_cpu(dqp->q_core.d_id));
1478 1561
1479 qi->qi_dquots--; 1562 qi->qi_dquots--;
@@ -1656,10 +1739,13 @@ xfs_qm_vop_dqalloc(
1656 prid_t prid, 1739 prid_t prid,
1657 uint flags, 1740 uint flags,
1658 struct xfs_dquot **O_udqpp, 1741 struct xfs_dquot **O_udqpp,
1659 struct xfs_dquot **O_gdqpp) 1742 struct xfs_dquot **O_gdqpp,
1743 struct xfs_dquot **O_pdqpp)
1660{ 1744{
1661 struct xfs_mount *mp = ip->i_mount; 1745 struct xfs_mount *mp = ip->i_mount;
1662 struct xfs_dquot *uq, *gq; 1746 struct xfs_dquot *uq = NULL;
1747 struct xfs_dquot *gq = NULL;
1748 struct xfs_dquot *pq = NULL;
1663 int error; 1749 int error;
1664 uint lockflags; 1750 uint lockflags;
1665 1751
@@ -1684,7 +1770,6 @@ xfs_qm_vop_dqalloc(
1684 } 1770 }
1685 } 1771 }
1686 1772
1687 uq = gq = NULL;
1688 if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { 1773 if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
1689 if (ip->i_d.di_uid != uid) { 1774 if (ip->i_d.di_uid != uid) {
1690 /* 1775 /*
@@ -1697,11 +1782,12 @@ xfs_qm_vop_dqalloc(
1697 * holding ilock. 1782 * holding ilock.
1698 */ 1783 */
1699 xfs_iunlock(ip, lockflags); 1784 xfs_iunlock(ip, lockflags);
1700 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, 1785 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
1701 XFS_DQ_USER, 1786 XFS_DQ_USER,
1702 XFS_QMOPT_DQALLOC | 1787 XFS_QMOPT_DQALLOC |
1703 XFS_QMOPT_DOWARN, 1788 XFS_QMOPT_DOWARN,
1704 &uq))) { 1789 &uq);
1790 if (error) {
1705 ASSERT(error != ENOENT); 1791 ASSERT(error != ENOENT);
1706 return error; 1792 return error;
1707 } 1793 }
@@ -1723,15 +1809,14 @@ xfs_qm_vop_dqalloc(
1723 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { 1809 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
1724 if (ip->i_d.di_gid != gid) { 1810 if (ip->i_d.di_gid != gid) {
1725 xfs_iunlock(ip, lockflags); 1811 xfs_iunlock(ip, lockflags);
1726 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, 1812 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
1727 XFS_DQ_GROUP, 1813 XFS_DQ_GROUP,
1728 XFS_QMOPT_DQALLOC | 1814 XFS_QMOPT_DQALLOC |
1729 XFS_QMOPT_DOWARN, 1815 XFS_QMOPT_DOWARN,
1730 &gq))) { 1816 &gq);
1731 if (uq) 1817 if (error) {
1732 xfs_qm_dqrele(uq);
1733 ASSERT(error != ENOENT); 1818 ASSERT(error != ENOENT);
1734 return error; 1819 goto error_rele;
1735 } 1820 }
1736 xfs_dqunlock(gq); 1821 xfs_dqunlock(gq);
1737 lockflags = XFS_ILOCK_SHARED; 1822 lockflags = XFS_ILOCK_SHARED;
@@ -1740,25 +1825,25 @@ xfs_qm_vop_dqalloc(
1740 ASSERT(ip->i_gdquot); 1825 ASSERT(ip->i_gdquot);
1741 gq = xfs_qm_dqhold(ip->i_gdquot); 1826 gq = xfs_qm_dqhold(ip->i_gdquot);
1742 } 1827 }
1743 } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { 1828 }
1829 if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
1744 if (xfs_get_projid(ip) != prid) { 1830 if (xfs_get_projid(ip) != prid) {
1745 xfs_iunlock(ip, lockflags); 1831 xfs_iunlock(ip, lockflags);
1746 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, 1832 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
1747 XFS_DQ_PROJ, 1833 XFS_DQ_PROJ,
1748 XFS_QMOPT_DQALLOC | 1834 XFS_QMOPT_DQALLOC |
1749 XFS_QMOPT_DOWARN, 1835 XFS_QMOPT_DOWARN,
1750 &gq))) { 1836 &pq);
1751 if (uq) 1837 if (error) {
1752 xfs_qm_dqrele(uq);
1753 ASSERT(error != ENOENT); 1838 ASSERT(error != ENOENT);
1754 return (error); 1839 goto error_rele;
1755 } 1840 }
1756 xfs_dqunlock(gq); 1841 xfs_dqunlock(pq);
1757 lockflags = XFS_ILOCK_SHARED; 1842 lockflags = XFS_ILOCK_SHARED;
1758 xfs_ilock(ip, lockflags); 1843 xfs_ilock(ip, lockflags);
1759 } else { 1844 } else {
1760 ASSERT(ip->i_gdquot); 1845 ASSERT(ip->i_pdquot);
1761 gq = xfs_qm_dqhold(ip->i_gdquot); 1846 pq = xfs_qm_dqhold(ip->i_pdquot);
1762 } 1847 }
1763 } 1848 }
1764 if (uq) 1849 if (uq)
@@ -1773,7 +1858,18 @@ xfs_qm_vop_dqalloc(
1773 *O_gdqpp = gq; 1858 *O_gdqpp = gq;
1774 else if (gq) 1859 else if (gq)
1775 xfs_qm_dqrele(gq); 1860 xfs_qm_dqrele(gq);
1861 if (O_pdqpp)
1862 *O_pdqpp = pq;
1863 else if (pq)
1864 xfs_qm_dqrele(pq);
1776 return 0; 1865 return 0;
1866
1867error_rele:
1868 if (gq)
1869 xfs_qm_dqrele(gq);
1870 if (uq)
1871 xfs_qm_dqrele(uq);
1872 return error;
1777} 1873}
1778 1874
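Note the structural change in xfs_qm_vop_dqalloc(): the project branch used to be an else-if hanging off the group branch, so group and project dquots were mutually exclusive, whereas now they are independent if blocks and all three dquots can come back from one call. A small sketch of the behavioral difference follows; the flag values and function names are made up for illustration.

#include <stdbool.h>
#include <stdio.h>

#define QMOPT_UQUOTA	0x1	/* illustrative flags, not the XFS values */
#define QMOPT_GQUOTA	0x2
#define QMOPT_PQUOTA	0x4

/*
 * Before: group and project were mutually exclusive (else-if), so a
 * caller asking for both only ever got the group dquot back.
 */
static void dqalloc_old(unsigned flags, bool *gq, bool *pq)
{
	*gq = *pq = false;
	if (flags & QMOPT_GQUOTA)
		*gq = true;
	else if (flags & QMOPT_PQUOTA)
		*pq = true;
}

/*
 * After: independent checks, so user, group and project dquots can
 * all be allocated in a single call.
 */
static void dqalloc_new(unsigned flags, bool *gq, bool *pq)
{
	*gq = *pq = false;
	if (flags & QMOPT_GQUOTA)
		*gq = true;
	if (flags & QMOPT_PQUOTA)
		*pq = true;
}

int main(void)
{
	bool gq, pq;

	dqalloc_old(QMOPT_GQUOTA | QMOPT_PQUOTA, &gq, &pq);
	printf("old: gq=%d pq=%d\n", gq, pq);	/* old: gq=1 pq=0 */
	dqalloc_new(QMOPT_GQUOTA | QMOPT_PQUOTA, &gq, &pq);
	printf("new: gq=%d pq=%d\n", gq, pq);	/* new: gq=1 pq=1 */
	return 0;
}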
1779/* 1875/*
@@ -1821,29 +1917,34 @@ xfs_qm_vop_chown(
1821 */ 1917 */
1822int 1918int
1823xfs_qm_vop_chown_reserve( 1919xfs_qm_vop_chown_reserve(
1824 xfs_trans_t *tp, 1920 struct xfs_trans *tp,
1825 xfs_inode_t *ip, 1921 struct xfs_inode *ip,
1826 xfs_dquot_t *udqp, 1922 struct xfs_dquot *udqp,
1827 xfs_dquot_t *gdqp, 1923 struct xfs_dquot *gdqp,
1828 uint flags) 1924 struct xfs_dquot *pdqp,
1925 uint flags)
1829{ 1926{
1830 xfs_mount_t *mp = ip->i_mount; 1927 struct xfs_mount *mp = ip->i_mount;
1831 uint delblks, blkflags, prjflags = 0; 1928 uint delblks, blkflags, prjflags = 0;
1832 xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; 1929 struct xfs_dquot *udq_unres = NULL;
1833 int error; 1930 struct xfs_dquot *gdq_unres = NULL;
1931 struct xfs_dquot *pdq_unres = NULL;
1932 struct xfs_dquot *udq_delblks = NULL;
1933 struct xfs_dquot *gdq_delblks = NULL;
1934 struct xfs_dquot *pdq_delblks = NULL;
1935 int error;
1834 1936
1835 1937
1836 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 1938 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
1837 ASSERT(XFS_IS_QUOTA_RUNNING(mp)); 1939 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1838 1940
1839 delblks = ip->i_delayed_blks; 1941 delblks = ip->i_delayed_blks;
1840 delblksudq = delblksgdq = unresudq = unresgdq = NULL;
1841 blkflags = XFS_IS_REALTIME_INODE(ip) ? 1942 blkflags = XFS_IS_REALTIME_INODE(ip) ?
1842 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; 1943 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
1843 1944
1844 if (XFS_IS_UQUOTA_ON(mp) && udqp && 1945 if (XFS_IS_UQUOTA_ON(mp) && udqp &&
1845 ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { 1946 ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
1846 delblksudq = udqp; 1947 udq_delblks = udqp;
1847 /* 1948 /*
1848 * If there are delayed allocation blocks, then we have to 1949 * If there are delayed allocation blocks, then we have to
1849 * unreserve those from the old dquot, and add them to the 1950 * unreserve those from the old dquot, and add them to the
@@ -1851,29 +1952,34 @@ xfs_qm_vop_chown_reserve(
1851 */ 1952 */
1852 if (delblks) { 1953 if (delblks) {
1853 ASSERT(ip->i_udquot); 1954 ASSERT(ip->i_udquot);
1854 unresudq = ip->i_udquot; 1955 udq_unres = ip->i_udquot;
1855 } 1956 }
1856 } 1957 }
1857 if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { 1958 if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
1858 if (XFS_IS_PQUOTA_ON(ip->i_mount) && 1959 ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) {
1859 xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id)) 1960 gdq_delblks = gdqp;
1860 prjflags = XFS_QMOPT_ENOSPC; 1961 if (delblks) {
1861 1962 ASSERT(ip->i_gdquot);
1862 if (prjflags || 1963 gdq_unres = ip->i_gdquot;
1863 (XFS_IS_GQUOTA_ON(ip->i_mount) &&
1864 ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
1865 delblksgdq = gdqp;
1866 if (delblks) {
1867 ASSERT(ip->i_gdquot);
1868 unresgdq = ip->i_gdquot;
1869 }
1870 } 1964 }
1871 } 1965 }
1872 1966
1873 if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, 1967 if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
1874 delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, 1968 xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) {
1875 flags | blkflags | prjflags))) 1969 prjflags = XFS_QMOPT_ENOSPC;
1876 return (error); 1970 pdq_delblks = pdqp;
1971 if (delblks) {
1972 ASSERT(ip->i_pdquot);
1973 pdq_unres = ip->i_pdquot;
1974 }
1975 }
1976
1977 error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
1978 udq_delblks, gdq_delblks, pdq_delblks,
1979 ip->i_d.di_nblocks, 1,
1980 flags | blkflags | prjflags);
1981 if (error)
1982 return error;
1877 1983
1878 /* 1984 /*
1879 * Do the delayed blks reservations/unreservations now. Since these 1985
@@ -1885,15 +1991,17 @@ xfs_qm_vop_chown_reserve(
1885 /* 1991 /*
1886 * Do the reservations first. Unreservation can't fail. 1992 * Do the reservations first. Unreservation can't fail.
1887 */ 1993 */
1888 ASSERT(delblksudq || delblksgdq); 1994 ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
1889 ASSERT(unresudq || unresgdq); 1995 ASSERT(udq_unres || gdq_unres || pdq_unres);
1890 if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, 1996 error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
1891 delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, 1997 udq_delblks, gdq_delblks, pdq_delblks,
1892 flags | blkflags | prjflags))) 1998 (xfs_qcnt_t)delblks, 0,
1893 return (error); 1999 flags | blkflags | prjflags);
2000 if (error)
2001 return error;
1894 xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, 2002 xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
1895 unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, 2003 udq_unres, gdq_unres, pdq_unres,
1896 blkflags); 2004 -((xfs_qcnt_t)delblks), 0, blkflags);
1897 } 2005 }
1898 2006
1899 return (0); 2007 return (0);
@@ -1932,7 +2040,8 @@ xfs_qm_vop_create_dqattach(
1932 struct xfs_trans *tp, 2040 struct xfs_trans *tp,
1933 struct xfs_inode *ip, 2041 struct xfs_inode *ip,
1934 struct xfs_dquot *udqp, 2042 struct xfs_dquot *udqp,
1935 struct xfs_dquot *gdqp) 2043 struct xfs_dquot *gdqp,
2044 struct xfs_dquot *pdqp)
1936{ 2045{
1937 struct xfs_mount *mp = tp->t_mountp; 2046 struct xfs_mount *mp = tp->t_mountp;
1938 2047
@@ -1952,13 +2061,18 @@ xfs_qm_vop_create_dqattach(
1952 } 2061 }
1953 if (gdqp) { 2062 if (gdqp) {
1954 ASSERT(ip->i_gdquot == NULL); 2063 ASSERT(ip->i_gdquot == NULL);
1955 ASSERT(XFS_IS_OQUOTA_ON(mp)); 2064 ASSERT(XFS_IS_GQUOTA_ON(mp));
1956 ASSERT((XFS_IS_GQUOTA_ON(mp) ? 2065 ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
1957 ip->i_d.di_gid : xfs_get_projid(ip)) ==
1958 be32_to_cpu(gdqp->q_core.d_id));
1959
1960 ip->i_gdquot = xfs_qm_dqhold(gdqp); 2066 ip->i_gdquot = xfs_qm_dqhold(gdqp);
1961 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); 2067 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
1962 } 2068 }
2069 if (pdqp) {
2070 ASSERT(ip->i_pdquot == NULL);
2071 ASSERT(XFS_IS_PQUOTA_ON(mp));
2072 ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
2073
2074 ip->i_pdquot = xfs_qm_dqhold(pdqp);
2075 xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
2076 }
1963} 2077}
1964 2078
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 5d16a6e6900f..579d6a02a5b6 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -44,9 +44,11 @@ extern struct kmem_zone *xfs_qm_dqtrxzone;
44typedef struct xfs_quotainfo { 44typedef struct xfs_quotainfo {
45 struct radix_tree_root qi_uquota_tree; 45 struct radix_tree_root qi_uquota_tree;
46 struct radix_tree_root qi_gquota_tree; 46 struct radix_tree_root qi_gquota_tree;
47 struct radix_tree_root qi_pquota_tree;
47 struct mutex qi_tree_lock; 48 struct mutex qi_tree_lock;
48 xfs_inode_t *qi_uquotaip; /* user quota inode */ 49 struct xfs_inode *qi_uquotaip; /* user quota inode */
49 xfs_inode_t *qi_gquotaip; /* group quota inode */ 50 struct xfs_inode *qi_gquotaip; /* group quota inode */
51 struct xfs_inode *qi_pquotaip; /* project quota inode */
50 struct list_head qi_lru_list; 52 struct list_head qi_lru_list;
51 struct mutex qi_lru_lock; 53 struct mutex qi_lru_lock;
52 int qi_lru_count; 54 int qi_lru_count;
@@ -69,30 +71,66 @@ typedef struct xfs_quotainfo {
69 struct shrinker qi_shrinker; 71 struct shrinker qi_shrinker;
70} xfs_quotainfo_t; 72} xfs_quotainfo_t;
71 73
72#define XFS_DQUOT_TREE(qi, type) \ 74static inline struct radix_tree_root *
73 ((type & XFS_DQ_USER) ? \ 75xfs_dquot_tree(
74 &((qi)->qi_uquota_tree) : \ 76 struct xfs_quotainfo *qi,
75 &((qi)->qi_gquota_tree)) 77 int type)
78{
79 switch (type) {
80 case XFS_DQ_USER:
81 return &qi->qi_uquota_tree;
82 case XFS_DQ_GROUP:
83 return &qi->qi_gquota_tree;
84 case XFS_DQ_PROJ:
85 return &qi->qi_pquota_tree;
86 default:
87 ASSERT(0);
88 }
89 return NULL;
90}
76 91
92static inline struct xfs_inode *
93xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
94{
95 switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
96 case XFS_DQ_USER:
97 return dqp->q_mount->m_quotainfo->qi_uquotaip;
98 case XFS_DQ_GROUP:
99 return dqp->q_mount->m_quotainfo->qi_gquotaip;
100 case XFS_DQ_PROJ:
101 return dqp->q_mount->m_quotainfo->qi_pquotaip;
102 default:
103 ASSERT(0);
104 }
105 return NULL;
106}
77 107
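The old XFS_DQUOT_TREE() macro tested only the XFS_DQ_USER bit, so a project dquot would silently land in the group tree; the switch-based helpers above give each type its own radix tree and assert on anything else. A tiny sketch of that difference, with flag values assumed to mirror XFS_DQ_USER/PROJ/GROUP:

#include <assert.h>
#include <stdio.h>

#define DQ_USER		0x1	/* assumed values, for illustration */
#define DQ_PROJ		0x2
#define DQ_GROUP	0x4

static const char *tree_old(int type)
{
	/* Old macro logic: anything without DQ_USER set fell into the
	 * group tree, including project dquots. */
	return (type & DQ_USER) ? "uquota_tree" : "gquota_tree";
}

static const char *tree_new(int type)
{
	switch (type) {
	case DQ_USER:	return "uquota_tree";
	case DQ_GROUP:	return "gquota_tree";
	case DQ_PROJ:	return "pquota_tree";
	default:
		assert(0);	/* unknown dquot type */
		return NULL;
	}
}

int main(void)
{
	printf("old: PROJ -> %s\n", tree_old(DQ_PROJ));	/* gquota_tree */
	printf("new: PROJ -> %s\n", tree_new(DQ_PROJ));	/* pquota_tree */
	return 0;
}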
78extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp, 108extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp,
79 unsigned int nbblks); 109 unsigned int nbblks);
80extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); 110extern void xfs_trans_mod_dquot(struct xfs_trans *,
81extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, 111 struct xfs_dquot *, uint, long);
82 xfs_dquot_t *, xfs_dquot_t *, long, long, uint); 112extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
83extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *); 113 struct xfs_mount *, struct xfs_dquot *,
84extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *); 114 struct xfs_dquot *, struct xfs_dquot *,
115 long, long, uint);
116extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
117extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
85 118
86/* 119/*
87 * We keep the usr and grp dquots separately so that locking will be easier 120 * We keep the usr, grp, and prj dquots separately so that locking will be
88 * to do at commit time. All transactions that we know of at this point 121 * easier to do at commit time. All transactions that we know of at this point
89 * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. 122 * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
90 */ 123 */
124enum {
125 XFS_QM_TRANS_USR = 0,
126 XFS_QM_TRANS_GRP,
127 XFS_QM_TRANS_PRJ,
128 XFS_QM_TRANS_DQTYPES
129};
91#define XFS_QM_TRANS_MAXDQS 2 130#define XFS_QM_TRANS_MAXDQS 2
92typedef struct xfs_dquot_acct { 131struct xfs_dquot_acct {
93 xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS]; 132 struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS];
94 xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS]; 133};
95} xfs_dquot_acct_t;
96 134
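Replacing the two named arrays with a single array indexed by the XFS_QM_TRANS_* enum means transaction dquot code can loop over quota types instead of special-casing user versus group. A minimal sketch, where struct dqtrx is only a placeholder for struct xfs_dqtrx:

#include <stdio.h>

enum {
	QM_TRANS_USR = 0,
	QM_TRANS_GRP,
	QM_TRANS_PRJ,
	QM_TRANS_DQTYPES
};
#define QM_TRANS_MAXDQS	2

struct dqtrx { long delta; };	/* placeholder for struct xfs_dqtrx */

struct dquot_acct {
	struct dqtrx dqs[QM_TRANS_DQTYPES][QM_TRANS_MAXDQS];
};

int main(void)
{
	struct dquot_acct acct = {0};
	int type, i;

	/* With one 2-D array, code that used to special-case
	 * dqa_usrdquots vs dqa_grpdquots becomes a plain loop. */
	for (type = 0; type < QM_TRANS_DQTYPES; type++)
		for (i = 0; i < QM_TRANS_MAXDQS; i++)
			acct.dqs[type][i].delta = 0;

	acct.dqs[QM_TRANS_PRJ][0].delta = 42;
	printf("prj slot 0 delta = %ld\n", acct.dqs[QM_TRANS_PRJ][0].delta);
	return 0;
}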
97/* 135/*
98 * Users are allowed to have a usage exceeding their softlimit for 136 * Users are allowed to have a usage exceeding their softlimit for
@@ -106,22 +144,23 @@ typedef struct xfs_dquot_acct {
106#define XFS_QM_IWARNLIMIT 5 144#define XFS_QM_IWARNLIMIT 5
107#define XFS_QM_RTBWARNLIMIT 5 145#define XFS_QM_RTBWARNLIMIT 5
108 146
109extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); 147extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
110extern int xfs_qm_quotacheck(xfs_mount_t *); 148extern int xfs_qm_quotacheck(struct xfs_mount *);
111extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 149extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
112 150
113/* dquot stuff */ 151/* dquot stuff */
114extern void xfs_qm_dqpurge_all(xfs_mount_t *, uint); 152extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
115extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); 153extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
116 154
117/* quota ops */ 155/* quota ops */
118extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); 156extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
119extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, 157extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
120 fs_disk_quota_t *); 158 uint, struct fs_disk_quota *);
121extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, 159extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
122 fs_disk_quota_t *); 160 struct fs_disk_quota *);
123extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); 161extern int xfs_qm_scall_getqstat(struct xfs_mount *,
124extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); 162 struct fs_quota_stat *);
125extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); 163extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
164extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
126 165
127#endif /* __XFS_QM_H__ */ 166#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 2d02eac1c9a8..437a52d91f6d 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -112,16 +112,16 @@ xfs_qm_newmount(
112 112
113 if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || 113 if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
114 (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || 114 (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
115 (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
116 (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) ||
117 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || 115 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
118 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && 116 (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) ||
117 (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
118 (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) &&
119 xfs_dev_is_read_only(mp, "changing quota state")) { 119 xfs_dev_is_read_only(mp, "changing quota state")) {
120 xfs_warn(mp, "please mount with%s%s%s%s.", 120 xfs_warn(mp, "please mount with%s%s%s%s.",
121 (!quotaondisk ? "out quota" : ""), 121 (!quotaondisk ? "out quota" : ""),
122 (uquotaondisk ? " usrquota" : ""), 122 (uquotaondisk ? " usrquota" : ""),
123 (pquotaondisk ? " prjquota" : ""), 123 (gquotaondisk ? " grpquota" : ""),
124 (gquotaondisk ? " grpquota" : "")); 124 (pquotaondisk ? " prjquota" : ""));
125 return XFS_ERROR(EPERM); 125 return XFS_ERROR(EPERM);
126 } 126 }
127 127
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 6cdf6ffc36a1..e4f8b2d6f38b 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -117,11 +117,12 @@ xfs_qm_scall_quotaoff(
117 } 117 }
118 if (flags & XFS_GQUOTA_ACCT) { 118 if (flags & XFS_GQUOTA_ACCT) {
119 dqtype |= XFS_QMOPT_GQUOTA; 119 dqtype |= XFS_QMOPT_GQUOTA;
120 flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); 120 flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
121 inactivate_flags |= XFS_GQUOTA_ACTIVE; 121 inactivate_flags |= XFS_GQUOTA_ACTIVE;
122 } else if (flags & XFS_PQUOTA_ACCT) { 122 }
123 if (flags & XFS_PQUOTA_ACCT) {
123 dqtype |= XFS_QMOPT_PQUOTA; 124 dqtype |= XFS_QMOPT_PQUOTA;
124 flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); 125 flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD);
125 inactivate_flags |= XFS_PQUOTA_ACTIVE; 126 inactivate_flags |= XFS_PQUOTA_ACTIVE;
126 } 127 }
127 128
@@ -198,10 +199,9 @@ xfs_qm_scall_quotaoff(
198 } 199 }
199 200
200 /* 201 /*
201 * If quotas is completely disabled, close shop. 202 * If all quotas are completely turned off, close shop.
202 */ 203 */
203 if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || 204 if (mp->m_qflags == 0) {
204 ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
205 mutex_unlock(&q->qi_quotaofflock); 205 mutex_unlock(&q->qi_quotaofflock);
206 xfs_qm_destroy_quotainfo(mp); 206 xfs_qm_destroy_quotainfo(mp);
207 return (0); 207 return (0);
@@ -214,10 +214,14 @@ xfs_qm_scall_quotaoff(
214 IRELE(q->qi_uquotaip); 214 IRELE(q->qi_uquotaip);
215 q->qi_uquotaip = NULL; 215 q->qi_uquotaip = NULL;
216 } 216 }
217 if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) { 217 if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) {
218 IRELE(q->qi_gquotaip); 218 IRELE(q->qi_gquotaip);
219 q->qi_gquotaip = NULL; 219 q->qi_gquotaip = NULL;
220 } 220 }
221 if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) {
222 IRELE(q->qi_pquotaip);
223 q->qi_pquotaip = NULL;
224 }
221 225
222out_unlock: 226out_unlock:
223 mutex_unlock(&q->qi_quotaofflock); 227 mutex_unlock(&q->qi_quotaofflock);
@@ -335,14 +339,14 @@ xfs_qm_scall_quotaon(
335 * quota acct on ondisk without m_qflags' knowing. 339 * quota acct on ondisk without m_qflags' knowing.
336 */ 340 */
337 if (((flags & XFS_UQUOTA_ACCT) == 0 && 341 if (((flags & XFS_UQUOTA_ACCT) == 0 &&
338 (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && 342 (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
339 (flags & XFS_UQUOTA_ENFD)) 343 (flags & XFS_UQUOTA_ENFD)) ||
340 || 344 ((flags & XFS_GQUOTA_ACCT) == 0 &&
345 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
346 (flags & XFS_GQUOTA_ENFD)) ||
341 ((flags & XFS_PQUOTA_ACCT) == 0 && 347 ((flags & XFS_PQUOTA_ACCT) == 0 &&
342 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && 348 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
343 (flags & XFS_GQUOTA_ACCT) == 0 && 349 (flags & XFS_PQUOTA_ENFD))) {
344 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
345 (flags & XFS_OQUOTA_ENFD))) {
346 xfs_debug(mp, 350 xfs_debug(mp,
347 "%s: Can't enforce without acct, flags=%x sbflags=%x\n", 351 "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
348 __func__, flags, mp->m_sb.sb_qflags); 352 __func__, flags, mp->m_sb.sb_qflags);
@@ -407,11 +411,11 @@ xfs_qm_scall_getqstat(
407 struct fs_quota_stat *out) 411 struct fs_quota_stat *out)
408{ 412{
409 struct xfs_quotainfo *q = mp->m_quotainfo; 413 struct xfs_quotainfo *q = mp->m_quotainfo;
410 struct xfs_inode *uip, *gip; 414 struct xfs_inode *uip = NULL;
411 bool tempuqip, tempgqip; 415 struct xfs_inode *gip = NULL;
416 bool tempuqip = false;
417 bool tempgqip = false;
412 418
413 uip = gip = NULL;
414 tempuqip = tempgqip = false;
415 memset(out, 0, sizeof(fs_quota_stat_t)); 419 memset(out, 0, sizeof(fs_quota_stat_t));
416 420
417 out->qs_version = FS_QSTAT_VERSION; 421 out->qs_version = FS_QSTAT_VERSION;
@@ -776,9 +780,12 @@ xfs_qm_scall_getquota(
776 * gets turned off. No need to confuse the user level code, 780 * gets turned off. No need to confuse the user level code,
777 * so return zeroes in that case. 781 * so return zeroes in that case.
778 */ 782 */
779 if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) || 783 if ((!XFS_IS_UQUOTA_ENFORCED(mp) &&
780 (!XFS_IS_OQUOTA_ENFORCED(mp) && 784 dqp->q_core.d_flags == XFS_DQ_USER) ||
781 (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) { 785 (!XFS_IS_GQUOTA_ENFORCED(mp) &&
786 dqp->q_core.d_flags == XFS_DQ_GROUP) ||
787 (!XFS_IS_PQUOTA_ENFORCED(mp) &&
788 dqp->q_core.d_flags == XFS_DQ_PROJ)) {
782 dst->d_btimer = 0; 789 dst->d_btimer = 0;
783 dst->d_itimer = 0; 790 dst->d_itimer = 0;
784 dst->d_rtbtimer = 0; 791 dst->d_rtbtimer = 0;
@@ -786,8 +793,8 @@ xfs_qm_scall_getquota(
786 793
787#ifdef DEBUG 794#ifdef DEBUG
788 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || 795 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
789 (XFS_IS_OQUOTA_ENFORCED(mp) && 796 (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) ||
790 (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && 797 (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) &&
791 dst->d_id != 0) { 798 dst->d_id != 0) {
792 if ((dst->d_bcount > dst->d_blk_softlimit) && 799 if ((dst->d_bcount > dst->d_blk_softlimit) &&
793 (dst->d_blk_softlimit > 0)) { 800 (dst->d_blk_softlimit > 0)) {
@@ -833,16 +840,16 @@ xfs_qm_export_flags(
833 uflags = 0; 840 uflags = 0;
834 if (flags & XFS_UQUOTA_ACCT) 841 if (flags & XFS_UQUOTA_ACCT)
835 uflags |= FS_QUOTA_UDQ_ACCT; 842 uflags |= FS_QUOTA_UDQ_ACCT;
836 if (flags & XFS_PQUOTA_ACCT)
837 uflags |= FS_QUOTA_PDQ_ACCT;
838 if (flags & XFS_GQUOTA_ACCT) 843 if (flags & XFS_GQUOTA_ACCT)
839 uflags |= FS_QUOTA_GDQ_ACCT; 844 uflags |= FS_QUOTA_GDQ_ACCT;
845 if (flags & XFS_PQUOTA_ACCT)
846 uflags |= FS_QUOTA_PDQ_ACCT;
840 if (flags & XFS_UQUOTA_ENFD) 847 if (flags & XFS_UQUOTA_ENFD)
841 uflags |= FS_QUOTA_UDQ_ENFD; 848 uflags |= FS_QUOTA_UDQ_ENFD;
842 if (flags & (XFS_OQUOTA_ENFD)) { 849 if (flags & XFS_GQUOTA_ENFD)
843 uflags |= (flags & XFS_GQUOTA_ACCT) ? 850 uflags |= FS_QUOTA_GDQ_ENFD;
844 FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD; 851 if (flags & XFS_PQUOTA_ENFD)
845 } 852 uflags |= FS_QUOTA_PDQ_ENFD;
846 return (uflags); 853 return (uflags);
847} 854}
848 855
@@ -856,9 +863,11 @@ xfs_dqrele_inode(
856{ 863{
857 /* skip quota inodes */ 864 /* skip quota inodes */
858 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 865 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
859 ip == ip->i_mount->m_quotainfo->qi_gquotaip) { 866 ip == ip->i_mount->m_quotainfo->qi_gquotaip ||
867 ip == ip->i_mount->m_quotainfo->qi_pquotaip) {
860 ASSERT(ip->i_udquot == NULL); 868 ASSERT(ip->i_udquot == NULL);
861 ASSERT(ip->i_gdquot == NULL); 869 ASSERT(ip->i_gdquot == NULL);
870 ASSERT(ip->i_pdquot == NULL);
862 return 0; 871 return 0;
863 } 872 }
864 873
@@ -867,10 +876,14 @@ xfs_dqrele_inode(
867 xfs_qm_dqrele(ip->i_udquot); 876 xfs_qm_dqrele(ip->i_udquot);
868 ip->i_udquot = NULL; 877 ip->i_udquot = NULL;
869 } 878 }
870 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { 879 if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
871 xfs_qm_dqrele(ip->i_gdquot); 880 xfs_qm_dqrele(ip->i_gdquot);
872 ip->i_gdquot = NULL; 881 ip->i_gdquot = NULL;
873 } 882 }
883 if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) {
884 xfs_qm_dqrele(ip->i_pdquot);
885 ip->i_pdquot = NULL;
886 }
874 xfs_iunlock(ip, XFS_ILOCK_EXCL); 887 xfs_iunlock(ip, XFS_ILOCK_EXCL);
875 return 0; 888 return 0;
876} 889}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index c38068f26c55..b14f42c714b6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -108,11 +108,28 @@ typedef struct xfs_dqblk {
108 { XFS_DQ_FREEING, "FREEING" } 108 { XFS_DQ_FREEING, "FREEING" }
109 109
110/* 110/*
111 * In the worst case, when both user and group quotas are on, 111 * We have the possibility of all three quota types being active at once, and
112 * we can have a max of three dquots changing in a single transaction. 112 * hence free space modification requires modification of all three current
113 * dquots in a single transaction. For this case we need to have a reservation
114 * of at least 3 dquots.
115 *
116 * However, a chmod operation can change both UID and GID in a single
117 * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
118 * modified. Hence for this case we need to reserve space for at least 4 dquots.
119 *
120 * And in the worst case, there's a rename operation that can be modifying up to
121 * 4 inodes with dquots attached to them. In reality, the only inodes that can
122 * have their dquots modified are the source and destination directory inodes
123 * due to directory name creation and removal. That can require space allocation
124 * and/or freeing on both directory inodes, and hence all three dquots on each
125 * inode can be modified. And if the directories are world writeable, all the
126 * dquots can be unique and so 6 dquots can be modified....
127 *
128 * And, of course, we also need to take into account the dquot log format item
129 * used to describe each dquot.
113 */ 130 */
114#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3) 131#define XFS_DQUOT_LOGRES(mp) \
115 132 ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
116 133
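As a worked example of the new worst case described above: a rename across two world-writable directories can dirty 3 quota types times 2 inodes, i.e. 6 unique dquots, and each one is logged as a format header plus the dquot core. The sizes below are assumptions (104 bytes for struct xfs_disk_dquot, 24 for struct xfs_dq_logformat); check the headers for the authoritative values.

#include <stdio.h>

#define SIZEOF_DQ_LOGFORMAT	24	/* assumed sizeof(struct xfs_dq_logformat) */
#define SIZEOF_DISK_DQUOT	104	/* assumed sizeof(struct xfs_disk_dquot) */

int main(void)
{
	int ndquots = 3 * 2;	/* 3 quota types x 2 directory inodes */
	int logres = ndquots * (SIZEOF_DQ_LOGFORMAT + SIZEOF_DISK_DQUOT);

	printf("XFS_DQUOT_LOGRES worst case: %d x (%d + %d) = %d bytes\n",
	       ndquots, SIZEOF_DQ_LOGFORMAT, SIZEOF_DISK_DQUOT, logres);
	return 0;
}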
117/* 134/*
118 * These are the structures used to lay out dquots and quotaoff 135 * These are the structures used to lay out dquots and quotaoff
@@ -161,30 +178,42 @@ typedef struct xfs_qoff_logformat {
161#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ 178#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
162 179
163/* 180/*
181 * Conversion to and from the combined OQUOTA flag (if necessary)
182 * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
183 */
184#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
185#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
186#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
187#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
188
189/*
164 * Quota Accounting/Enforcement flags 190 * Quota Accounting/Enforcement flags
165 */ 191 */
166#define XFS_ALL_QUOTA_ACCT \ 192#define XFS_ALL_QUOTA_ACCT \
167 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) 193 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
168#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD) 194#define XFS_ALL_QUOTA_ENFD \
169#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) 195 (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
196#define XFS_ALL_QUOTA_CHKD \
197 (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
170 198
171#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) 199#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
172#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) 200#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
173#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) 201#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
174#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) 202#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
175#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) 203#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
176#define XFS_IS_OQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_OQUOTA_ENFD) 204#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
205#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
177 206
178/* 207/*
179 * Incore only flags for quotaoff - these bits get cleared when quota(s) 208 * Incore only flags for quotaoff - these bits get cleared when quota(s)
180 * are in the process of getting turned off. These flags are in m_qflags but 209 * are in the process of getting turned off. These flags are in m_qflags but
181 * never in sb_qflags. 210 * never in sb_qflags.
182 */ 211 */
183#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ 212#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
184#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ 213#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
185#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ 214#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
186#define XFS_ALL_QUOTA_ACTIVE \ 215#define XFS_ALL_QUOTA_ACTIVE \
187 (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE) 216 (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
188 217
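The comment above notes that the combined OQUOTA flag is translated only at the superblock read/write boundary, in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk(). A hedged sketch of what the from-disk direction of that translation could look like; the OQUOTA_* values are assumptions (historically 0x0010/0x0020), while the GQUOTA_*/PQUOTA_* values match the definitions in this hunk:

#include <stdio.h>

#define XFS_PQUOTA_ACCT	0x0008
#define XFS_OQUOTA_ENFD	0x0010	/* old combined on-disk flag (assumed) */
#define XFS_OQUOTA_CHKD	0x0020	/* old combined on-disk flag (assumed) */
#define XFS_GQUOTA_ENFD	0x0080
#define XFS_GQUOTA_CHKD	0x0100
#define XFS_PQUOTA_ENFD	0x0200
#define XFS_PQUOTA_CHKD	0x0400

/* Sketch: the combined OQUOTA bit becomes a group or project bit
 * depending on which accounting mode the superblock carries. */
static unsigned int qflags_from_disk(unsigned int sbq)
{
	if (sbq & XFS_OQUOTA_ENFD) {
		sbq &= ~XFS_OQUOTA_ENFD;
		sbq |= (sbq & XFS_PQUOTA_ACCT) ? XFS_PQUOTA_ENFD
					       : XFS_GQUOTA_ENFD;
	}
	if (sbq & XFS_OQUOTA_CHKD) {
		sbq &= ~XFS_OQUOTA_CHKD;
		sbq |= (sbq & XFS_PQUOTA_ACCT) ? XFS_PQUOTA_CHKD
					       : XFS_GQUOTA_CHKD;
	}
	return sbq;
}

int main(void)
{
	unsigned int disk = XFS_PQUOTA_ACCT | XFS_OQUOTA_ENFD;

	/* -> PQUOTA_ACCT | PQUOTA_ENFD (0x0208) */
	printf("incore qflags: 0x%x\n", qflags_from_disk(disk));
	return 0;
}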
189/* 218/*
190 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees 219 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
@@ -259,33 +288,24 @@ typedef struct xfs_qoff_logformat {
259 * we didn't have the inode locked, the appropriate dquot(s) will be 288 * we didn't have the inode locked, the appropriate dquot(s) will be
260 * attached atomically. 289 * attached atomically.
261 */ 290 */
262#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\ 291#define XFS_NOT_DQATTACHED(mp, ip) \
263 (ip)->i_udquot == NULL) || \ 292 ((XFS_IS_UQUOTA_ON(mp) && (ip)->i_udquot == NULL) || \
264 (XFS_IS_OQUOTA_ON(mp) && \ 293 (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \
265 (ip)->i_gdquot == NULL)) 294 (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL))
266 295
267#define XFS_QM_NEED_QUOTACHECK(mp) \ 296#define XFS_QM_NEED_QUOTACHECK(mp) \
268 ((XFS_IS_UQUOTA_ON(mp) && \ 297 ((XFS_IS_UQUOTA_ON(mp) && \
269 (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ 298 (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \
270 (XFS_IS_GQUOTA_ON(mp) && \ 299 (XFS_IS_GQUOTA_ON(mp) && \
271 ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ 300 (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \
272 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \
273 (XFS_IS_PQUOTA_ON(mp) && \ 301 (XFS_IS_PQUOTA_ON(mp) && \
274 ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ 302 (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
275 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT))))
276
277#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
278 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
279 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
280
281#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
282 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
283 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD)
284 303
285#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ 304#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
286 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ 305 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
287 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ 306 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
288 XFS_GQUOTA_ACCT) 307 XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
308 XFS_PQUOTA_CHKD)
289 309
290 310
291/* 311/*
@@ -318,17 +338,18 @@ extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
318 struct xfs_inode *, long, long, uint); 338 struct xfs_inode *, long, long, uint);
319extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, 339extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
320 struct xfs_mount *, struct xfs_dquot *, 340 struct xfs_mount *, struct xfs_dquot *,
321 struct xfs_dquot *, long, long, uint); 341 struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
322 342
323extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint, 343extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
324 struct xfs_dquot **, struct xfs_dquot **); 344 struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **);
325extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, 345extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
326 struct xfs_dquot *, struct xfs_dquot *); 346 struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *);
327extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); 347extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
328extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, 348extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
329 struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); 349 struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
330extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, 350extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
331 struct xfs_dquot *, struct xfs_dquot *, uint); 351 struct xfs_dquot *, struct xfs_dquot *,
352 struct xfs_dquot *, uint);
332extern int xfs_qm_dqattach(struct xfs_inode *, uint); 353extern int xfs_qm_dqattach(struct xfs_inode *, uint);
333extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); 354extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
334extern void xfs_qm_dqdetach(struct xfs_inode *); 355extern void xfs_qm_dqdetach(struct xfs_inode *);
@@ -342,10 +363,12 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
342#else 363#else
343static inline int 364static inline int
344xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, 365xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
345 uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp) 366 uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp,
367 struct xfs_dquot **pdqp)
346{ 368{
347 *udqp = NULL; 369 *udqp = NULL;
348 *gdqp = NULL; 370 *gdqp = NULL;
371 *pdqp = NULL;
349 return 0; 372 return 0;
350} 373}
351#define xfs_trans_dup_dqinfo(tp, tp2) 374#define xfs_trans_dup_dqinfo(tp, tp2)
@@ -360,14 +383,15 @@ static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
360} 383}
361static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, 384static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
362 struct xfs_mount *mp, struct xfs_dquot *udqp, 385 struct xfs_mount *mp, struct xfs_dquot *udqp,
363 struct xfs_dquot *gdqp, long nblks, long nions, uint flags) 386 struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
387 long nblks, long nions, uint flags)
364{ 388{
365 return 0; 389 return 0;
366} 390}
367#define xfs_qm_vop_create_dqattach(tp, ip, u, g) 391#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p)
368#define xfs_qm_vop_rename_dqattach(it) (0) 392#define xfs_qm_vop_rename_dqattach(it) (0)
369#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) 393#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
370#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl) (0) 394#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0)
371#define xfs_qm_dqattach(ip, fl) (0) 395#define xfs_qm_dqattach(ip, fl) (0)
372#define xfs_qm_dqattach_locked(ip, fl) (0) 396#define xfs_qm_dqattach_locked(ip, fl) (0)
373#define xfs_qm_dqdetach(ip) 397#define xfs_qm_dqdetach(ip)
@@ -381,8 +405,8 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
381 405
382#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ 406#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
383 xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) 407 xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
384#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \ 408#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \
385 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ 409 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \
386 f | XFS_QMOPT_RES_REGBLKS) 410 f | XFS_QMOPT_RES_REGBLKS)
387 411
388extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *, 412extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 71926d630527..20e30f93b0c7 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -75,8 +75,10 @@ xfs_fs_set_xstate(
75 flags |= XFS_GQUOTA_ACCT; 75 flags |= XFS_GQUOTA_ACCT;
76 if (uflags & FS_QUOTA_UDQ_ENFD) 76 if (uflags & FS_QUOTA_UDQ_ENFD)
77 flags |= XFS_UQUOTA_ENFD; 77 flags |= XFS_UQUOTA_ENFD;
78 if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD)) 78 if (uflags & FS_QUOTA_GDQ_ENFD)
79 flags |= XFS_OQUOTA_ENFD; 79 flags |= XFS_GQUOTA_ENFD;
80 if (uflags & FS_QUOTA_PDQ_ENFD)
81 flags |= XFS_PQUOTA_ENFD;
80 82
81 switch (op) { 83 switch (op) {
82 case Q_XQUOTAON: 84 case Q_XQUOTAON:
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 2de58a85833c..78f9e70b80c7 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -618,6 +618,12 @@ xfs_sb_has_incompat_log_feature(
618 return (sbp->sb_features_log_incompat & feature) != 0; 618 return (sbp->sb_features_log_incompat & feature) != 0;
619} 619}
620 620
621static inline bool
622xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
623{
624 return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino);
625}
626
621/* 627/*
622 * end of superblock version macros 628 * end of superblock version macros
623 */ 629 */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3033ba5e9762..1d68ffcdeaa7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -51,6 +51,7 @@
51#include "xfs_inode_item.h" 51#include "xfs_inode_item.h"
52#include "xfs_icache.h" 52#include "xfs_icache.h"
53#include "xfs_trace.h" 53#include "xfs_trace.h"
54#include "xfs_icreate_item.h"
54 55
55#include <linux/namei.h> 56#include <linux/namei.h>
56#include <linux/init.h> 57#include <linux/init.h>
@@ -359,17 +360,17 @@ xfs_parseargs(
359 } else if (!strcmp(this_char, MNTOPT_PQUOTA) || 360 } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
360 !strcmp(this_char, MNTOPT_PRJQUOTA)) { 361 !strcmp(this_char, MNTOPT_PRJQUOTA)) {
361 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | 362 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
362 XFS_OQUOTA_ENFD); 363 XFS_PQUOTA_ENFD);
363 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { 364 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
364 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); 365 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
365 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 366 mp->m_qflags &= ~XFS_PQUOTA_ENFD;
366 } else if (!strcmp(this_char, MNTOPT_GQUOTA) || 367 } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
367 !strcmp(this_char, MNTOPT_GRPQUOTA)) { 368 !strcmp(this_char, MNTOPT_GRPQUOTA)) {
368 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | 369 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
369 XFS_OQUOTA_ENFD); 370 XFS_GQUOTA_ENFD);
370 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 371 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
371 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); 372 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
372 mp->m_qflags &= ~XFS_OQUOTA_ENFD; 373 mp->m_qflags &= ~XFS_GQUOTA_ENFD;
373 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { 374 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
374 xfs_warn(mp, 375 xfs_warn(mp,
375 "delaylog is the default now, option is deprecated."); 376 "delaylog is the default now, option is deprecated.");
@@ -439,20 +440,15 @@ xfs_parseargs(
439 } 440 }
440 441
441done: 442done:
442 if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) { 443 if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) {
443 /* 444 /*
444 * At this point the superblock has not been read 445 * At this point the superblock has not been read
445 * in, therefore we do not know the block size. 446 * in, therefore we do not know the block size.
446 * Before the mount call ends we will convert 447 * Before the mount call ends we will convert
447 * these to FSBs. 448 * these to FSBs.
448 */ 449 */
449 if (dsunit) { 450 mp->m_dalign = dsunit;
450 mp->m_dalign = dsunit; 451 mp->m_swidth = dswidth;
451 mp->m_flags |= XFS_MOUNT_RETERR;
452 }
453
454 if (dswidth)
455 mp->m_swidth = dswidth;
456 } 452 }
457 453
458 if (mp->m_logbufs != -1 && 454 if (mp->m_logbufs != -1 &&
@@ -563,12 +559,12 @@ xfs_showargs(
563 /* Either project or group quotas can be active, not both */ 559 /* Either project or group quotas can be active, not both */
564 560
565 if (mp->m_qflags & XFS_PQUOTA_ACCT) { 561 if (mp->m_qflags & XFS_PQUOTA_ACCT) {
566 if (mp->m_qflags & XFS_OQUOTA_ENFD) 562 if (mp->m_qflags & XFS_PQUOTA_ENFD)
567 seq_puts(m, "," MNTOPT_PRJQUOTA); 563 seq_puts(m, "," MNTOPT_PRJQUOTA);
568 else 564 else
569 seq_puts(m, "," MNTOPT_PQUOTANOENF); 565 seq_puts(m, "," MNTOPT_PQUOTANOENF);
570 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { 566 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
571 if (mp->m_qflags & XFS_OQUOTA_ENFD) 567 if (mp->m_qflags & XFS_GQUOTA_ENFD)
572 seq_puts(m, "," MNTOPT_GRPQUOTA); 568 seq_puts(m, "," MNTOPT_GRPQUOTA);
573 else 569 else
574 seq_puts(m, "," MNTOPT_GQUOTANOENF); 570 seq_puts(m, "," MNTOPT_GQUOTANOENF);
@@ -1136,8 +1132,8 @@ xfs_fs_statfs(
1136 spin_unlock(&mp->m_sb_lock); 1132 spin_unlock(&mp->m_sb_lock);
1137 1133
1138 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 1134 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1139 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) == 1135 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
1140 (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) 1136 (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))
1141 xfs_qm_statvfs(ip, statp); 1137 xfs_qm_statvfs(ip, statp);
1142 return 0; 1138 return 0;
1143} 1139}
@@ -1481,6 +1477,10 @@ xfs_fs_fill_super(
1481 sb->s_time_gran = 1; 1477 sb->s_time_gran = 1;
1482 set_posix_acl_flag(sb); 1478 set_posix_acl_flag(sb);
1483 1479
1480 /* version 5 superblocks support inode version counters. */
1481 if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
1482 sb->s_flags |= MS_I_VERSION;
1483
1484 error = xfs_mountfs(mp); 1484 error = xfs_mountfs(mp);
1485 if (error) 1485 if (error)
1486 goto out_filestream_unmount; 1486 goto out_filestream_unmount;
@@ -1655,9 +1655,15 @@ xfs_init_zones(void)
1655 KM_ZONE_SPREAD, NULL); 1655 KM_ZONE_SPREAD, NULL);
1656 if (!xfs_ili_zone) 1656 if (!xfs_ili_zone)
1657 goto out_destroy_inode_zone; 1657 goto out_destroy_inode_zone;
1658 xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
1659 "xfs_icr");
1660 if (!xfs_icreate_zone)
1661 goto out_destroy_ili_zone;
1658 1662
1659 return 0; 1663 return 0;
1660 1664
1665 out_destroy_ili_zone:
1666 kmem_zone_destroy(xfs_ili_zone);
1661 out_destroy_inode_zone: 1667 out_destroy_inode_zone:
1662 kmem_zone_destroy(xfs_inode_zone); 1668 kmem_zone_destroy(xfs_inode_zone);
1663 out_destroy_efi_zone: 1669 out_destroy_efi_zone:
@@ -1696,6 +1702,7 @@ xfs_destroy_zones(void)
1696 * destroy caches. 1702 * destroy caches.
1697 */ 1703 */
1698 rcu_barrier(); 1704 rcu_barrier();
1705 kmem_zone_destroy(xfs_icreate_zone);
1699 kmem_zone_destroy(xfs_ili_zone); 1706 kmem_zone_destroy(xfs_ili_zone);
1700 kmem_zone_destroy(xfs_inode_zone); 1707 kmem_zone_destroy(xfs_inode_zone);
1701 kmem_zone_destroy(xfs_efi_zone); 1708 kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 195a403e1522..f4895b662fcb 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -358,7 +358,9 @@ xfs_symlink(
358 int n; 358 int n;
359 xfs_buf_t *bp; 359 xfs_buf_t *bp;
360 prid_t prid; 360 prid_t prid;
361 struct xfs_dquot *udqp, *gdqp; 361 struct xfs_dquot *udqp = NULL;
362 struct xfs_dquot *gdqp = NULL;
363 struct xfs_dquot *pdqp = NULL;
362 uint resblks; 364 uint resblks;
363 365
364 *ipp = NULL; 366 *ipp = NULL;
@@ -385,7 +387,7 @@ xfs_symlink(
385 * Make sure that we have allocated dquot(s) on disk. 387 * Make sure that we have allocated dquot(s) on disk.
386 */ 388 */
387 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 389 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
388 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 390 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp);
389 if (error) 391 if (error)
390 goto std_return; 392 goto std_return;
391 393
@@ -426,7 +428,8 @@ xfs_symlink(
426 /* 428 /*
427 * Reserve disk quota : blocks and inode. 429 * Reserve disk quota : blocks and inode.
428 */ 430 */
429 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0); 431 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
432 pdqp, resblks, 1, 0);
430 if (error) 433 if (error)
431 goto error_return; 434 goto error_return;
432 435
@@ -464,7 +467,7 @@ xfs_symlink(
464 /* 467 /*
465 * Also attach the dquot(s) to it, if applicable. 468 * Also attach the dquot(s) to it, if applicable.
466 */ 469 */
467 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); 470 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
468 471
469 if (resblks) 472 if (resblks)
470 resblks -= XFS_IALLOC_SPACE_RES(mp); 473 resblks -= XFS_IALLOC_SPACE_RES(mp);
@@ -562,6 +565,7 @@ xfs_symlink(
562 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 565 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
563 xfs_qm_dqrele(udqp); 566 xfs_qm_dqrele(udqp);
564 xfs_qm_dqrele(gdqp); 567 xfs_qm_dqrele(gdqp);
568 xfs_qm_dqrele(pdqp);
565 569
566 *ipp = ip; 570 *ipp = ip;
567 return 0; 571 return 0;
@@ -575,6 +579,7 @@ xfs_symlink(
575 xfs_trans_cancel(tp, cancel_flags); 579 xfs_trans_cancel(tp, cancel_flags);
576 xfs_qm_dqrele(udqp); 580 xfs_qm_dqrele(udqp);
577 xfs_qm_dqrele(gdqp); 581 xfs_qm_dqrele(gdqp);
582 xfs_qm_dqrele(pdqp);
578 583
579 if (unlock_dp_on_error) 584 if (unlock_dp_on_error)
580 xfs_iunlock(dp, XFS_ILOCK_EXCL); 585 xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -585,7 +590,7 @@ xfs_symlink(
585/* 590/*
586 * Free a symlink that has blocks associated with it. 591 * Free a symlink that has blocks associated with it.
587 */ 592 */
588int 593STATIC int
589xfs_inactive_symlink_rmt( 594xfs_inactive_symlink_rmt(
590 xfs_inode_t *ip, 595 xfs_inode_t *ip,
591 xfs_trans_t **tpp) 596 xfs_trans_t **tpp)
@@ -606,7 +611,7 @@ xfs_inactive_symlink_rmt(
606 611
607 tp = *tpp; 612 tp = *tpp;
608 mp = ip->i_mount; 613 mp = ip->i_mount;
609 ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); 614 ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
610 /* 615 /*
611 * We're freeing a symlink that has some 616 * We're freeing a symlink that has some
612 * blocks allocated to it. Free the 617 * blocks allocated to it. Free the
@@ -720,3 +725,47 @@ xfs_inactive_symlink_rmt(
720 error0: 725 error0:
721 return error; 726 return error;
722} 727}
728
729/*
730 * xfs_inactive_symlink - free a symlink
731 */
732int
733xfs_inactive_symlink(
734 struct xfs_inode *ip,
735 struct xfs_trans **tp)
736{
737 struct xfs_mount *mp = ip->i_mount;
738 int pathlen;
739
740 trace_xfs_inactive_symlink(ip);
741
742 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
743
744 if (XFS_FORCED_SHUTDOWN(mp))
745 return XFS_ERROR(EIO);
746
747 /*
748 * Zero length symlinks _can_ exist.
749 */
750 pathlen = (int)ip->i_d.di_size;
751 if (!pathlen)
752 return 0;
753
754 if (pathlen < 0 || pathlen > MAXPATHLEN) {
755 xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
756 __func__, (unsigned long long)ip->i_ino, pathlen);
757 ASSERT(0);
758 return XFS_ERROR(EFSCORRUPTED);
759 }
760
761 if (ip->i_df.if_flags & XFS_IFINLINE) {
762 if (ip->i_df.if_bytes > 0)
763 xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
764 XFS_DATA_FORK);
765 ASSERT(ip->i_df.if_bytes == 0);
766 return 0;
767 }
768
769 /* remove the remote symlink */
770 return xfs_inactive_symlink_rmt(ip, tp);
771}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index b39398d2097c..374394880c01 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -60,7 +60,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
60int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, 60int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
61 const char *target_path, umode_t mode, struct xfs_inode **ipp); 61 const char *target_path, umode_t mode, struct xfs_inode **ipp);
62int xfs_readlink(struct xfs_inode *ip, char *link); 62int xfs_readlink(struct xfs_inode *ip, char *link);
63int xfs_inactive_symlink_rmt(struct xfs_inode *ip, struct xfs_trans **tpp); 63int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
64 64
65#endif /* __KERNEL__ */ 65#endif /* __KERNEL__ */
66#endif /* __XFS_SYMLINK_H */ 66#endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 2801b5ce6cdb..1743b9f8e23d 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -25,11 +25,11 @@ static struct ctl_table_header *xfs_table_header;
25#ifdef CONFIG_PROC_FS 25#ifdef CONFIG_PROC_FS
26STATIC int 26STATIC int
27xfs_stats_clear_proc_handler( 27xfs_stats_clear_proc_handler(
28 ctl_table *ctl, 28 struct ctl_table *ctl,
29 int write, 29 int write,
30 void __user *buffer, 30 void __user *buffer,
31 size_t *lenp, 31 size_t *lenp,
32 loff_t *ppos) 32 loff_t *ppos)
33{ 33{
34 int c, ret, *valp = ctl->data; 34 int c, ret, *valp = ctl->data;
35 __uint32_t vn_active; 35 __uint32_t vn_active;
@@ -55,11 +55,11 @@ xfs_stats_clear_proc_handler(
55 55
56STATIC int 56STATIC int
57xfs_panic_mask_proc_handler( 57xfs_panic_mask_proc_handler(
58 ctl_table *ctl, 58 struct ctl_table *ctl,
59 int write, 59 int write,
60 void __user *buffer, 60 void __user *buffer,
61 size_t *lenp, 61 size_t *lenp,
62 loff_t *ppos) 62 loff_t *ppos)
63{ 63{
64 int ret, *valp = ctl->data; 64 int ret, *valp = ctl->data;
65 65
@@ -74,7 +74,7 @@ xfs_panic_mask_proc_handler(
74} 74}
75#endif /* CONFIG_PROC_FS */ 75#endif /* CONFIG_PROC_FS */
76 76
77static ctl_table xfs_table[] = { 77static struct ctl_table xfs_table[] = {
78 { 78 {
79 .procname = "irix_sgid_inherit", 79 .procname = "irix_sgid_inherit",
80 .data = &xfs_params.sgid_inherit.val, 80 .data = &xfs_params.sgid_inherit.val,
@@ -227,7 +227,7 @@ static ctl_table xfs_table[] = {
227 {} 227 {}
228}; 228};
229 229
230static ctl_table xfs_dir_table[] = { 230static struct ctl_table xfs_dir_table[] = {
231 { 231 {
232 .procname = "xfs", 232 .procname = "xfs",
233 .mode = 0555, 233 .mode = 0555,
@@ -236,7 +236,7 @@ static ctl_table xfs_dir_table[] = {
236 {} 236 {}
237}; 237};
238 238
239static ctl_table xfs_root_table[] = { 239static struct ctl_table xfs_root_table[] = {
240 { 240 {
241 .procname = "fs", 241 .procname = "fs",
242 .mode = 0555, 242 .mode = 0555,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index aa4db3307d36..47910e638c18 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -486,9 +486,12 @@ DEFINE_EVENT(xfs_buf_item_class, name, \
486 TP_PROTO(struct xfs_buf_log_item *bip), \ 486 TP_PROTO(struct xfs_buf_log_item *bip), \
487 TP_ARGS(bip)) 487 TP_ARGS(bip))
488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); 488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); 490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); 491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); 493DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
494DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); 495DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
493DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); 496DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
494DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); 497DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
@@ -508,6 +511,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
508DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); 511DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
509DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); 512DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
510DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); 513DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
514DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
511 515
512DECLARE_EVENT_CLASS(xfs_lock_class, 516DECLARE_EVENT_CLASS(xfs_lock_class,
513 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, 517 TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
@@ -571,6 +575,7 @@ DEFINE_INODE_EVENT(xfs_iget_miss);
571DEFINE_INODE_EVENT(xfs_getattr); 575DEFINE_INODE_EVENT(xfs_getattr);
572DEFINE_INODE_EVENT(xfs_setattr); 576DEFINE_INODE_EVENT(xfs_setattr);
573DEFINE_INODE_EVENT(xfs_readlink); 577DEFINE_INODE_EVENT(xfs_readlink);
578DEFINE_INODE_EVENT(xfs_inactive_symlink);
574DEFINE_INODE_EVENT(xfs_alloc_file_space); 579DEFINE_INODE_EVENT(xfs_alloc_file_space);
575DEFINE_INODE_EVENT(xfs_free_file_space); 580DEFINE_INODE_EVENT(xfs_free_file_space);
576DEFINE_INODE_EVENT(xfs_readdir); 581DEFINE_INODE_EVENT(xfs_readdir);
@@ -974,14 +979,16 @@ DEFINE_RW_EVENT(xfs_file_splice_read);
974DEFINE_RW_EVENT(xfs_file_splice_write); 979DEFINE_RW_EVENT(xfs_file_splice_write);
975 980
976DECLARE_EVENT_CLASS(xfs_page_class, 981DECLARE_EVENT_CLASS(xfs_page_class,
977 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), 982 TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
978 TP_ARGS(inode, page, off), 983 unsigned int len),
984 TP_ARGS(inode, page, off, len),
979 TP_STRUCT__entry( 985 TP_STRUCT__entry(
980 __field(dev_t, dev) 986 __field(dev_t, dev)
981 __field(xfs_ino_t, ino) 987 __field(xfs_ino_t, ino)
982 __field(pgoff_t, pgoff) 988 __field(pgoff_t, pgoff)
983 __field(loff_t, size) 989 __field(loff_t, size)
984 __field(unsigned long, offset) 990 __field(unsigned long, offset)
991 __field(unsigned int, length)
985 __field(int, delalloc) 992 __field(int, delalloc)
986 __field(int, unwritten) 993 __field(int, unwritten)
987 ), 994 ),
@@ -995,24 +1002,27 @@ DECLARE_EVENT_CLASS(xfs_page_class,
995 __entry->pgoff = page_offset(page); 1002 __entry->pgoff = page_offset(page);
996 __entry->size = i_size_read(inode); 1003 __entry->size = i_size_read(inode);
997 __entry->offset = off; 1004 __entry->offset = off;
1005 __entry->length = len;
998 __entry->delalloc = delalloc; 1006 __entry->delalloc = delalloc;
999 __entry->unwritten = unwritten; 1007 __entry->unwritten = unwritten;
1000 ), 1008 ),
1001 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " 1009 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
1002 "delalloc %d unwritten %d", 1010 "length %x delalloc %d unwritten %d",
1003 MAJOR(__entry->dev), MINOR(__entry->dev), 1011 MAJOR(__entry->dev), MINOR(__entry->dev),
1004 __entry->ino, 1012 __entry->ino,
1005 __entry->pgoff, 1013 __entry->pgoff,
1006 __entry->size, 1014 __entry->size,
1007 __entry->offset, 1015 __entry->offset,
1016 __entry->length,
1008 __entry->delalloc, 1017 __entry->delalloc,
1009 __entry->unwritten) 1018 __entry->unwritten)
1010) 1019)
1011 1020
1012#define DEFINE_PAGE_EVENT(name) \ 1021#define DEFINE_PAGE_EVENT(name) \
1013DEFINE_EVENT(xfs_page_class, name, \ 1022DEFINE_EVENT(xfs_page_class, name, \
1014 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ 1023 TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
1015 TP_ARGS(inode, page, off)) 1024 unsigned int len), \
1025 TP_ARGS(inode, page, off, len))
1016DEFINE_PAGE_EVENT(xfs_writepage); 1026DEFINE_PAGE_EVENT(xfs_writepage);
1017DEFINE_PAGE_EVENT(xfs_releasepage); 1027DEFINE_PAGE_EVENT(xfs_releasepage);
1018DEFINE_PAGE_EVENT(xfs_invalidatepage); 1028DEFINE_PAGE_EVENT(xfs_invalidatepage);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 2fd7c1ff1d21..35a229981354 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -234,71 +234,93 @@ xfs_calc_remove_reservation(
234} 234}
235 235
236/* 236/*
237 * For symlink we can modify: 237 * For create, break it into the two cases that the transaction
238 * covers. We start with the modify case - allocation done by modification
239 * of the state of existing inodes - and the allocation case.
240 */
241
242/*
243 * For create we can modify:
238 * the parent directory inode: inode size 244 * the parent directory inode: inode size
239 * the new inode: inode size 245 * the new inode: inode size
240 * the inode btree entry: 1 block 246 * the inode btree entry: block size
247 * the superblock for the nlink flag: sector size
241 * the directory btree: (max depth + v2) * dir block size 248 * the directory btree: (max depth + v2) * dir block size
242 * the directory inode's bmap btree: (max depth + v2) * block size 249 * the directory inode's bmap btree: (max depth + v2) * block size
243 * the blocks for the symlink: 1 kB 250 */
244 * Or in the first xact we allocate some inodes giving: 251STATIC uint
252xfs_calc_create_resv_modify(
253 struct xfs_mount *mp)
254{
255 return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
256 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
257 (uint)XFS_FSB_TO_B(mp, 1) +
258 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
259}
260
261/*
262 * For create we can allocate some inodes giving:
245 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 263 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
264 * the superblock for the nlink flag: sector size
246 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize 265 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
247 * the inode btree: max depth * blocksize 266 * the inode btree: max depth * blocksize
248 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size 267 * the allocation btrees: 2 trees * (max depth - 1) * block size
249 */ 268 */
250STATIC uint 269STATIC uint
251xfs_calc_symlink_reservation( 270xfs_calc_create_resv_alloc(
271 struct xfs_mount *mp)
272{
273 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
274 mp->m_sb.sb_sectsize +
275 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
276 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
277 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
278 XFS_FSB_TO_B(mp, 1));
279}
280
281STATIC uint
282__xfs_calc_create_reservation(
252 struct xfs_mount *mp) 283 struct xfs_mount *mp)
253{ 284{
254 return XFS_DQUOT_LOGRES(mp) + 285 return XFS_DQUOT_LOGRES(mp) +
255 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + 286 MAX(xfs_calc_create_resv_alloc(mp),
256 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + 287 xfs_calc_create_resv_modify(mp));
257 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
258 XFS_FSB_TO_B(mp, 1)) +
259 xfs_calc_buf_res(1, 1024)),
260 (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
261 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp),
262 XFS_FSB_TO_B(mp, 1)) +
263 xfs_calc_buf_res(mp->m_in_maxlevels,
264 XFS_FSB_TO_B(mp, 1)) +
265 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
266 XFS_FSB_TO_B(mp, 1))));
267} 288}
268 289
269/* 290/*
270 * For create we can modify: 291 * For icreate we can allocate some inodes giving:
271 * the parent directory inode: inode size
272 * the new inode: inode size
273 * the inode btree entry: block size
274 * the superblock for the nlink flag: sector size
275 * the directory btree: (max depth + v2) * dir block size
276 * the directory inode's bmap btree: (max depth + v2) * block size
277 * Or in the first xact we allocate some inodes giving:
278 * the agi and agf of the ag getting the new inodes: 2 * sectorsize 292 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
279 * the superblock for the nlink flag: sector size 293 * the superblock for the nlink flag: sector size
280 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
281 * the inode btree: max depth * blocksize 294 * the inode btree: max depth * blocksize
282 * the allocation btrees: 2 trees * (max depth - 1) * block size 295 * the allocation btrees: 2 trees * (max depth - 1) * block size
283 */ 296 */
284STATIC uint 297STATIC uint
285xfs_calc_create_reservation( 298xfs_calc_icreate_resv_alloc(
286 struct xfs_mount *mp) 299 struct xfs_mount *mp)
287{ 300{
301 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
302 mp->m_sb.sb_sectsize +
303 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
304 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
305 XFS_FSB_TO_B(mp, 1));
306}
307
308STATIC uint
309xfs_calc_icreate_reservation(xfs_mount_t *mp)
310{
288 return XFS_DQUOT_LOGRES(mp) + 311 return XFS_DQUOT_LOGRES(mp) +
289 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + 312 MAX(xfs_calc_icreate_resv_alloc(mp),
290 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + 313 xfs_calc_create_resv_modify(mp));
291 (uint)XFS_FSB_TO_B(mp, 1) + 314}
292 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), 315
293 XFS_FSB_TO_B(mp, 1))), 316STATIC uint
294 (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + 317xfs_calc_create_reservation(
295 mp->m_sb.sb_sectsize + 318 struct xfs_mount *mp)
296 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), 319{
297 XFS_FSB_TO_B(mp, 1)) + 320 if (xfs_sb_version_hascrc(&mp->m_sb))
298 xfs_calc_buf_res(mp->m_in_maxlevels, 321 return xfs_calc_icreate_reservation(mp);
299 XFS_FSB_TO_B(mp, 1)) + 322 return __xfs_calc_create_reservation(mp);
300 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), 323
301 XFS_FSB_TO_B(mp, 1))));
302} 324}
303 325
304/* 326/*
@@ -311,6 +333,20 @@ xfs_calc_mkdir_reservation(
311 return xfs_calc_create_reservation(mp); 333 return xfs_calc_create_reservation(mp);
312} 334}
313 335
336
337/*
338 * Making a new symplink is the same as creating a new file, but
339 * with the added blocks for remote symlink data which can be up to 1kB in
340 * length (MAXPATHLEN).
341 */
342STATIC uint
343xfs_calc_symlink_reservation(
344 struct xfs_mount *mp)
345{
346 return xfs_calc_create_reservation(mp) +
347 xfs_calc_buf_res(1, MAXPATHLEN);
348}
349
314/* 350/*
315 * In freeing an inode we can modify: 351 * In freeing an inode we can modify:
316 * the inode being freed: inode size 352 * the inode being freed: inode size
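
The xfs_trans.c refactor above replaces one large MAX() expression with named helpers for the two worst cases a create transaction can hit: dirtying existing metadata (the modify case) versus allocating a new inode chunk (the alloc case), with the reservation sized for whichever is larger. Below is a standalone model of that split; the geometry numbers are invented for illustration and buf_res() is only a crude stand-in for xfs_calc_buf_res().

	#include <stdio.h>

	#define MAX(a, b)	((a) >= (b) ? (a) : (b))

	/* stand-in for xfs_calc_buf_res(): nbufs * (payload + log overhead) */
	static unsigned int buf_res(unsigned int nbufs, unsigned int size)
	{
		const unsigned int log_header = 128;	/* assumed overhead */
		return nbufs * (size + log_header);
	}

	int main(void)
	{
		const unsigned int inodesize = 512, sectsize = 512, blocksize = 4096;
		const unsigned int dirop_bufs = 10, ialloc_blocks = 16;
		const unsigned int in_maxlevels = 3, allocfree_bufs = 18;

		/* modify case: two inodes, superblock, directory blocks */
		unsigned int modify = buf_res(2, inodesize) +
				      buf_res(1, sectsize) +
				      blocksize +
				      buf_res(dirop_bufs, blocksize);

		/* alloc case: agi/agf, superblock, inode chunk, btrees */
		unsigned int alloc = buf_res(2, sectsize) +
				     sectsize +
				     buf_res(ialloc_blocks, blocksize) +
				     buf_res(in_maxlevels, blocksize) +
				     buf_res(allocfree_bufs, blocksize);

		printf("modify=%u alloc=%u reservation=%u\n",
		       modify, alloc, MAX(modify, alloc));
		return 0;
	}

Splitting the cases out is what lets the CRC path reuse xfs_calc_create_resv_modify() while swapping in the smaller icreate allocation case.
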
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a44dba5b2cdb..2b4946393e30 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -48,6 +48,7 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_BUF		0x123c	/* v2 bufs, variable sized inode bufs */
 #define	XFS_LI_DQUOT		0x123d
 #define	XFS_LI_QUOTAOFF		0x123e
+#define	XFS_LI_ICREATE		0x123f
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -107,7 +108,8 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_SWAPEXT		40
 #define XFS_TRANS_SB_COUNT		41
 #define XFS_TRANS_CHECKPOINT		42
-#define XFS_TRANS_TYPE_MAX		42
+#define XFS_TRANS_ICREATE		43
+#define XFS_TRANS_TYPE_MAX		43
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -210,23 +212,18 @@ struct xfs_log_item_desc {
 /*
  * Per-extent log reservation for the allocation btree changes
  * involved in freeing or allocating an extent.
- * 2 trees * (2 blocks/level * max depth - 1) * block size
+ * 2 trees * (2 blocks/level * max depth - 1)
  */
-#define	XFS_ALLOCFREE_LOG_RES(mp,nx) \
-	((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
 #define	XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
 	((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
 
 /*
  * Per-directory log reservation for any directory change.
- * dir blocks: (1 btree block per level + data block + free block) * dblock size
- * bmap btree: (levels + 2) * max depth * block size
+ * dir blocks: (1 btree block per level + data block + free block)
+ * bmap btree: (levels + 2) * max depth
 * v2 directory blocks can be fragmented below the dirblksize down to the fsb
  * size, so account for that in the DAENTER macros.
  */
-#define	XFS_DIROP_LOG_RES(mp)	\
-	(XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
-	 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
 #define	XFS_DIROP_LOG_COUNT(mp)	\
 	(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
 	 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
@@ -503,6 +500,7 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
 void		xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
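
With the byte-based *_LOG_RES macros deleted, reservations are expressed purely as buffer counts that callers feed through xfs_calc_buf_res(). A tiny model of the surviving XFS_ALLOCFREE_LOG_COUNT arithmetic, with an assumed allocation-group btree depth:

	#include <stdio.h>

	/* 2 trees * (2 blocks/level * max depth - 1), per extent */
	static unsigned int allocfree_log_count(unsigned int ag_maxlevels,
						unsigned int nextents)
	{
		return nextents * (2 * (2 * ag_maxlevels - 1));
	}

	int main(void)
	{
		printf("1 extent, depth 5: %u buffers\n", allocfree_log_count(5, 1));
		printf("2 extents, depth 5: %u buffers\n", allocfree_log_count(5, 2));
		return 0;
	}
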
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 73a5fa457e16..aa5a04b844d6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -397,7 +397,6 @@ shutdown_abort:
 	return XFS_ERROR(EIO);
 }
 
-
 /*
  * Release the buffer bp which was previously acquired with one of the
  * xfs_trans_... buffer allocation routines if the buffer has not
@@ -603,8 +602,14 @@ xfs_trans_log_buf(xfs_trans_t *tp,
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
-	bip->bli_flags |= XFS_BLI_LOGGED;
-	xfs_buf_item_log(bip, first, last);
+
+	/*
+	 * If we have an ordered buffer we are not logging any dirty range but
+	 * we still need to mark the buffer dirty and record that it was logged.
+	 */
+	bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+	if (!(bip->bli_flags & XFS_BLI_ORDERED))
+		xfs_buf_item_log(bip, first, last);
 }
 
 
@@ -757,6 +762,29 @@ xfs_trans_inode_alloc_buf(
 }
 
 /*
+ * Mark the buffer as ordered for this transaction. This means
+ * that the contents of the buffer are not recorded in the transaction
+ * but it is tracked in the AIL as though it were. This allows us
+ * to record logical changes in transactions rather than the physical
+ * changes we make to the buffer without changing writeback ordering
+ * constraints of metadata buffers.
+ */
+void
+xfs_trans_ordered_buf(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bip->bli_flags |= XFS_BLI_ORDERED;
+	trace_xfs_buf_item_ordered(bip);
+}
+
+/*
  * Set the type of the buffer for log recovery so that it can correctly identify
  * and hence attach the correct buffer ops to the buffer after replay.
  */
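
A user-space model of the ordered-buffer behaviour added above -- not kernel code; the flag names only mirror the XFS_BLI_* flags. An ordered buffer joins the transaction and is marked dirty and logged, but no byte range is captured, so nothing from its contents ever reaches the log:

	#include <stdio.h>

	#define BLI_DIRTY	0x1
	#define BLI_LOGGED	0x2
	#define BLI_ORDERED	0x4

	struct buf_item {
		unsigned int	flags;
		unsigned int	first, last;	/* logged byte range */
	};

	static void log_buf(struct buf_item *bip, unsigned int first,
			    unsigned int last)
	{
		bip->flags |= BLI_DIRTY | BLI_LOGGED;
		if (!(bip->flags & BLI_ORDERED)) {	/* mirrors the patched test */
			bip->first = first;
			bip->last = last;
		}
	}

	int main(void)
	{
		struct buf_item normal = { 0 }, ordered = { .flags = BLI_ORDERED };

		log_buf(&normal, 0, 511);
		log_buf(&ordered, 0, 511);
		printf("normal:  logged range [%u,%u]\n", normal.first, normal.last);
		printf("ordered: logged range [%u,%u] (contents never logged)\n",
		       ordered.first, ordered.last);
		return 0;
	}
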
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index fec75d023703..61407a847b86 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -103,8 +103,6 @@ xfs_trans_dup_dqinfo(
 		return;
 
 	xfs_trans_alloc_dqinfo(ntp);
-	oqa = otp->t_dqinfo->dqa_usrdquots;
-	nqa = ntp->t_dqinfo->dqa_usrdquots;
 
 	/*
 	 * Because the quota blk reservation is carried forward,
@@ -113,7 +111,9 @@ xfs_trans_dup_dqinfo(
 	if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
 		ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
 
-	for (j = 0; j < 2; j++) {
+	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+		oqa = otp->t_dqinfo->dqs[j];
+		nqa = ntp->t_dqinfo->dqs[j];
 		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
 			if (oqa[i].qt_dquot == NULL)
 				break;
@@ -138,8 +138,6 @@ xfs_trans_dup_dqinfo(
 			oq->qt_ino_res = oq->qt_ino_res_used;
 
 		}
-		oqa = otp->t_dqinfo->dqa_grpdquots;
-		nqa = ntp->t_dqinfo->dqa_grpdquots;
 	}
 }
 
@@ -157,8 +155,7 @@ xfs_trans_mod_dquot_byino(
 
 	if (!XFS_IS_QUOTA_RUNNING(mp) ||
 	    !XFS_IS_QUOTA_ON(mp) ||
-	    ip->i_ino == mp->m_sb.sb_uquotino ||
-	    ip->i_ino == mp->m_sb.sb_gquotino)
+	    xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
 		return;
 
 	if (tp->t_dqinfo == NULL)
@@ -166,20 +163,28 @@ xfs_trans_mod_dquot_byino(
 
 	if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
 		(void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
-	if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot)
+	if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
 		(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+	if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
+		(void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
 }
 
-STATIC xfs_dqtrx_t *
+STATIC struct xfs_dqtrx *
 xfs_trans_get_dqtrx(
-	xfs_trans_t	*tp,
-	xfs_dquot_t	*dqp)
+	struct xfs_trans	*tp,
+	struct xfs_dquot	*dqp)
 {
-	int		i;
-	xfs_dqtrx_t	*qa;
+	int			i;
+	struct xfs_dqtrx	*qa;
 
-	qa = XFS_QM_ISUDQ(dqp) ?
-		tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
+	if (XFS_QM_ISUDQ(dqp))
+		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR];
+	else if (XFS_QM_ISGDQ(dqp))
+		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP];
+	else if (XFS_QM_ISPDQ(dqp))
+		qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ];
+	else
+		return NULL;
 
 	for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
 		if (qa[i].qt_dquot == NULL ||
@@ -292,11 +297,10 @@ xfs_trans_mod_dquot(
 
 
 /*
- * Given an array of dqtrx structures, lock all the dquots associated
- * and join them to the transaction, provided they have been modified.
- * We know that the highest number of dquots (of one type - usr OR grp),
- * involved in a transaction is 2 and that both usr and grp combined - 3.
- * So, we don't attempt to make this very generic.
+ * Given an array of dqtrx structures, lock all the dquots associated and join
+ * them to the transaction, provided they have been modified.  We know that the
+ * highest number of dquots of one type - usr, grp OR prj - involved in a
+ * transaction is 2, so we don't need to make this very generic.
  */
 STATIC void
 xfs_trans_dqlockedjoin(
@@ -339,12 +343,10 @@ xfs_trans_apply_dquot_deltas(
 		return;
 
 	ASSERT(tp->t_dqinfo);
-	qa = tp->t_dqinfo->dqa_usrdquots;
-	for (j = 0; j < 2; j++) {
-		if (qa[0].qt_dquot == NULL) {
-			qa = tp->t_dqinfo->dqa_grpdquots;
+	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+		qa = tp->t_dqinfo->dqs[j];
+		if (qa[0].qt_dquot == NULL)
 			continue;
-		}
 
 		/*
 		 * Lock all of the dquots and join them to the transaction.
@@ -495,10 +497,6 @@ xfs_trans_apply_dquot_deltas(
 			ASSERT(dqp->q_res_rtbcount >=
 			       be64_to_cpu(dqp->q_core.d_rtbcount));
 		}
-		/*
-		 * Do the group quotas next
-		 */
-		qa = tp->t_dqinfo->dqa_grpdquots;
 	}
 }
 
@@ -521,9 +519,9 @@ xfs_trans_unreserve_and_mod_dquots(
 	if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
 		return;
 
-	qa = tp->t_dqinfo->dqa_usrdquots;
+	for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) {
+		qa = tp->t_dqinfo->dqs[j];
 
-	for (j = 0; j < 2; j++) {
 		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
 			qtrx = &qa[i];
 			/*
@@ -565,7 +563,6 @@ xfs_trans_unreserve_and_mod_dquots(
 				xfs_dqunlock(dqp);
 
 		}
-		qa = tp->t_dqinfo->dqa_grpdquots;
 	}
 }
 
@@ -640,8 +637,8 @@ xfs_trans_dqresv(
 	if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
 	    dqp->q_core.d_id &&
 	    ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
-	     (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
-	      (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
+	     (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) ||
+	     (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) {
 		if (nblks > 0) {
 			/*
 			 * dquot is locked already. See if we'd go over the
@@ -736,8 +733,8 @@ error_return:
 
 /*
  * Given dquot(s), make disk block and/or inode reservations against them.
- * The fact that this does the reservation against both the usr and
- * grp/prj quotas is important, because this follows a both-or-nothing
+ * The fact that this does the reservation against user, group and
+ * project quotas is important, because this follows an all-or-nothing
 * approach.
 *
 * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
@@ -748,15 +745,16 @@ error_return:
 */
 int
 xfs_trans_reserve_quota_bydquots(
-	xfs_trans_t	*tp,
-	xfs_mount_t	*mp,
-	xfs_dquot_t	*udqp,
-	xfs_dquot_t	*gdqp,
-	long		nblks,
-	long		ninos,
-	uint		flags)
+	struct xfs_trans	*tp,
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*udqp,
+	struct xfs_dquot	*gdqp,
+	struct xfs_dquot	*pdqp,
+	long			nblks,
+	long			ninos,
+	uint			flags)
 {
-	int		resvd = 0, error;
+	int		error;
 
 	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
 		return 0;
@@ -771,28 +769,34 @@ xfs_trans_reserve_quota_bydquots(
 				(flags & ~XFS_QMOPT_ENOSPC));
 		if (error)
 			return error;
-		resvd = 1;
 	}
 
 	if (gdqp) {
 		error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
-		if (error) {
-			/*
-			 * can't do it, so backout previous reservation
-			 */
-			if (resvd) {
-				flags |= XFS_QMOPT_FORCE_RES;
-				xfs_trans_dqresv(tp, mp, udqp,
-						 -nblks, -ninos, flags);
-			}
-			return error;
-		}
+		if (error)
+			goto unwind_usr;
+	}
+
+	if (pdqp) {
+		error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags);
+		if (error)
+			goto unwind_grp;
 	}
 
 	/*
 	 * Didn't change anything critical, so, no need to log
 	 */
 	return 0;
+
+unwind_grp:
+	flags |= XFS_QMOPT_FORCE_RES;
+	if (gdqp)
+		xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags);
+unwind_usr:
+	flags |= XFS_QMOPT_FORCE_RES;
+	if (udqp)
+		xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
+	return error;
 }
 
 
@@ -816,8 +820,7 @@ xfs_trans_reserve_quota_nblks(
 	if (XFS_IS_PQUOTA_ON(mp))
 		flags |= XFS_QMOPT_ENOSPC;
 
-	ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
-	ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
+	ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
@@ -830,6 +833,7 @@ xfs_trans_reserve_quota_nblks(
 	 */
 	return xfs_trans_reserve_quota_bydquots(tp, mp,
 						ip->i_udquot, ip->i_gdquot,
+						ip->i_pdquot,
 						nblks, ninos, flags);
 }
 
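
The rewritten error path in xfs_trans_reserve_quota_bydquots() is a classic goto-unwind ladder: reservations are taken in user, group, project order and backed out in reverse on failure. Below is a runnable model of the pattern; reserve() is only a stand-in for xfs_trans_dqresv(), and the forced back-out mirrors XFS_QMOPT_FORCE_RES bypassing limit checks on the way down.

	#include <stdio.h>

	static int reserve(const char *q, long nblks, int fail)
	{
		if (fail) {
			printf("reserve %-4s: FAILED\n", q);
			return -1;
		}
		printf("reserve %-4s: %+ld blocks\n", q, nblks);
		return 0;
	}

	static int reserve_bydquots(int has_u, int has_g, int has_p,
				    long nblks, int fail_prj)
	{
		int error;

		if (has_u && (error = reserve("usr", nblks, 0)))
			return error;
		if (has_g && (error = reserve("grp", nblks, 0)))
			goto unwind_usr;
		if (has_p && (error = reserve("prj", nblks, fail_prj)))
			goto unwind_grp;
		return 0;

	unwind_grp:
		if (has_g)
			reserve("grp", -nblks, 0);	/* forced back-out */
	unwind_usr:
		if (has_u)
			reserve("usr", -nblks, 0);
		return error;
	}

	int main(void)
	{
		int error = reserve_bydquots(1, 1, 1, 8, 1);

		printf("result: %d\n", error);
		return 0;
	}

The labelled unwind replaces the old resvd flag and scales cleanly from two quota types to three, which is exactly why the project-quota plumbing forced this rewrite.
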
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ac6d567704db..53dfe46f3680 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -112,6 +112,17 @@ xfs_trans_log_inode(
 	ASSERT(ip->i_itemp != NULL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
+	/*
+	 * The first time we log the inode in a transaction, bump the inode
+	 * change counter if it is configured for this to occur.
+	 */
+	if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+	    IS_I_VERSION(VFS_I(ip))) {
+		inode_inc_iversion(VFS_I(ip));
+		ip->i_d.di_changecount = VFS_I(ip)->i_version;
+		flags |= XFS_ILOG_CORE;
+	}
+
 	tp->t_flags |= XFS_TRANS_DIRTY;
 	ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
 
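
The xfs_trans_log_inode() hunk keys the i_version bump off XFS_LID_DIRTY so the counter moves once per transaction, not once per relogging of the same inode. A minimal model of that first-dirty gate (field names only loosely mirror the XFS ones):

	#include <stdio.h>

	struct inode_item {
		int			dirty_in_tp;	/* stands in for XFS_LID_DIRTY */
		unsigned long long	i_version;
		unsigned long long	di_changecount;	/* on-disk copy */
	};

	static void log_inode(struct inode_item *ip)
	{
		if (!ip->dirty_in_tp) {	/* first log in this transaction */
			ip->i_version++;
			ip->di_changecount = ip->i_version;
		}
		ip->dirty_in_tp = 1;
	}

	int main(void)
	{
		struct inode_item ip = { 0, 41, 41 };

		log_inode(&ip);		/* bumps: 41 -> 42 */
		log_inode(&ip);		/* relogged, no further bump */
		printf("i_version=%llu changecount=%llu\n",
		       ip.i_version, ip.di_changecount);
		return 0;
	}
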
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0176bb21f09a..dc730ac272be 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -322,18 +322,9 @@ xfs_inactive(
 	xfs_trans_ijoin(tp, ip, 0);
 
 	if (S_ISLNK(ip->i_d.di_mode)) {
-		/*
-		 * Zero length symlinks _can_ exist.
-		 */
-		if (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) {
-			error = xfs_inactive_symlink_rmt(ip, &tp);
-			if (error)
-				goto out_cancel;
-		} else if (ip->i_df.if_bytes > 0) {
-			xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
-					  XFS_DATA_FORK);
-			ASSERT(ip->i_df.if_bytes == 0);
-		}
+		error = xfs_inactive_symlink(ip, &tp);
+		if (error)
+			goto out_cancel;
 	} else if (truncate) {
 		ip->i_d.di_size = 0;
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -498,6 +489,7 @@ xfs_create(
 	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
+	struct xfs_dquot	*pdqp = NULL;
 	uint			resblks;
 	uint			log_res;
 	uint			log_count;
@@ -516,7 +508,8 @@ xfs_create(
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
 	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
+			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+			&udqp, &gdqp, &pdqp);
 	if (error)
 		return error;
 
@@ -568,7 +561,8 @@ xfs_create(
 	/*
 	 * Reserve disk quota and the inode.
 	 */
-	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
+	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+					pdqp, resblks, 1, 0);
 	if (error)
 		goto out_trans_cancel;
 
@@ -632,7 +626,7 @@ xfs_create(
 	 * These ids of the inode couldn't have changed since the new
 	 * inode has been locked ever since it was created.
 	 */
-	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
+	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
 
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
 	if (error)
@@ -644,6 +638,7 @@ xfs_create(
 
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
 
 	*ipp = ip;
 	return 0;
@@ -665,6 +660,7 @@ xfs_create(
 
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
+	xfs_qm_dqrele(pdqp);
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -1577,7 +1573,7 @@ xfs_free_file_space(
 	}
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	error = xfs_trans_reserve_quota(tp, mp,
-			ip->i_udquot, ip->i_gdquot,
+			ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
 			resblks, 0, XFS_QMOPT_RES_REGBLKS);
 	if (error)
 		goto error1;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 5163022d9808..38c67c34d73f 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -31,8 +31,7 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
 		struct xfs_inode *ip);
 int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 		struct xfs_name *target_name);
-int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
-		       xfs_off_t *offset, filldir_t filldir);
+int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize);
 int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
 		const char *target_path, umode_t mode, struct xfs_inode **ipp);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
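
The xfs_readdir() prototype change reflects the VFS readdir rework in this merge: the opaque dirent buffer, offset pointer and filldir_t callback collapse into a single struct dir_context that bundles the actor callback with the current position. A user-space model of the shape of that interface follows; all names here are hypothetical and only loosely mirror the kernel's dir_emit() usage.

	#include <stdio.h>
	#include <stdbool.h>

	struct dir_context {
		bool (*actor)(struct dir_context *ctx, const char *name,
			      unsigned long long ino);
		long long pos;		/* cursor lives in the context now */
	};

	/* consume one entry; returning false stops iteration */
	static bool demo_actor(struct dir_context *ctx, const char *name,
			       unsigned long long ino)
	{
		printf("pos %lld: %s (ino %llu)\n", ctx->pos, name, ino);
		return true;
	}

	/* filesystem side: walk entries from ctx->pos, emitting via the actor */
	static int demo_readdir(struct dir_context *ctx)
	{
		const char *names[] = { ".", "..", "a.txt" };

		for (; ctx->pos < 3; ctx->pos++)
			if (!ctx->actor(ctx, names[ctx->pos],
					(unsigned long long)(0x85 + ctx->pos)))
				return 0;
		return 0;
	}

	int main(void)
	{
		struct dir_context ctx = { demo_actor, 0 };
		return demo_readdir(&ctx);
	}
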