author    James Morris <james.l.morris@oracle.com>    2014-04-13 21:23:14 -0400
committer James Morris <james.l.morris@oracle.com>    2014-04-13 21:23:14 -0400
commit    ecd740c6f2f092b90b95fa35f757973589eaaca2 (patch)
tree      ce02b1e18c4fc5729699251460cd8be7604d8401 /fs
parent    f64410ec665479d7b4b77b7519e814253ed0f686 (diff)
parent    455c6fdbd219161bd09b1165f11699d6d73de11c (diff)
Merge commit 'v3.14' into next
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/acl.c | 4
-rw-r--r--  fs/9p/cache.c | 3
-rw-r--r--  fs/9p/v9fs.c | 9
-rw-r--r--  fs/9p/v9fs.h | 1
-rw-r--r--  fs/9p/v9fs_vfs.h | 2
-rw-r--r--  fs/9p/vfs_addr.c | 7
-rw-r--r--  fs/9p/vfs_file.c | 142
-rw-r--r--  fs/9p/vfs_inode.c | 26
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 17
-rw-r--r--  fs/9p/vfs_super.c | 8
-rw-r--r--  fs/9p/xattr.c | 10
-rw-r--r--  fs/Kconfig | 6
-rw-r--r--  fs/Makefile | 5
-rw-r--r--  fs/affs/super.c | 57
-rw-r--r--  fs/afs/internal.h | 1
-rw-r--r--  fs/afs/proc.c | 122
-rw-r--r--  fs/anon_inodes.c | 34
-rw-r--r--  fs/attr.c | 5
-rw-r--r--  fs/autofs4/autofs_i.h | 4
-rw-r--r--  fs/autofs4/dev-ioctl.c | 16
-rw-r--r--  fs/autofs4/expire.c | 14
-rw-r--r--  fs/autofs4/inode.c | 49
-rw-r--r--  fs/autofs4/root.c | 6
-rw-r--r--  fs/autofs4/symlink.c | 4
-rw-r--r--  fs/autofs4/waitq.c | 16
-rw-r--r--  fs/befs/linuxvfs.c | 4
-rw-r--r--  fs/binfmt_elf.c | 3
-rw-r--r--  fs/bio-integrity.c | 184
-rw-r--r--  fs/bio.c | 507
-rw-r--r--  fs/btrfs/Kconfig | 3
-rw-r--r--  fs/btrfs/Makefile | 2
-rw-r--r--  fs/btrfs/acl.c | 142
-rw-r--r--  fs/btrfs/backref.c | 195
-rw-r--r--  fs/btrfs/btrfs_inode.h | 4
-rw-r--r--  fs/btrfs/check-integrity.c | 24
-rw-r--r--  fs/btrfs/compression.c | 41
-rw-r--r--  fs/btrfs/ctree.c | 552
-rw-r--r--  fs/btrfs/ctree.h | 141
-rw-r--r--  fs/btrfs/delayed-inode.c | 208
-rw-r--r--  fs/btrfs/delayed-inode.h | 8
-rw-r--r--  fs/btrfs/delayed-ref.c | 300
-rw-r--r--  fs/btrfs/delayed-ref.h | 26
-rw-r--r--  fs/btrfs/dev-replace.c | 56
-rw-r--r--  fs/btrfs/dir-item.c | 8
-rw-r--r--  fs/btrfs/disk-io.c | 269
-rw-r--r--  fs/btrfs/extent-tree.c | 618
-rw-r--r--  fs/btrfs/extent_io.c | 277
-rw-r--r--  fs/btrfs/extent_io.h | 9
-rw-r--r--  fs/btrfs/extent_map.c | 74
-rw-r--r--  fs/btrfs/file-item.c | 23
-rw-r--r--  fs/btrfs/file.c | 216
-rw-r--r--  fs/btrfs/free-space-cache.c | 23
-rw-r--r--  fs/btrfs/hash.c | 50
-rw-r--r--  fs/btrfs/hash.h | 11
-rw-r--r--  fs/btrfs/inode-item.c | 65
-rw-r--r--  fs/btrfs/inode.c | 500
-rw-r--r--  fs/btrfs/ioctl.c | 404
-rw-r--r--  fs/btrfs/lzo.c | 6
-rw-r--r--  fs/btrfs/ordered-data.c | 15
-rw-r--r--  fs/btrfs/orphan.c | 20
-rw-r--r--  fs/btrfs/print-tree.c | 4
-rw-r--r--  fs/btrfs/props.c | 427
-rw-r--r--  fs/btrfs/props.h | 42
-rw-r--r--  fs/btrfs/qgroup.c | 57
-rw-r--r--  fs/btrfs/raid56.c | 22
-rw-r--r--  fs/btrfs/reada.c | 9
-rw-r--r--  fs/btrfs/relocation.c | 105
-rw-r--r--  fs/btrfs/root-tree.c | 19
-rw-r--r--  fs/btrfs/scrub.c | 146
-rw-r--r--  fs/btrfs/send.c | 973
-rw-r--r--  fs/btrfs/super.c | 254
-rw-r--r--  fs/btrfs/sysfs.c | 623
-rw-r--r--  fs/btrfs/sysfs.h | 64
-rw-r--r--  fs/btrfs/tests/btrfs-tests.h | 2
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 4
-rw-r--r--  fs/btrfs/transaction.c | 55
-rw-r--r--  fs/btrfs/transaction.h | 3
-rw-r--r--  fs/btrfs/tree-log.c | 209
-rw-r--r--  fs/btrfs/ulist.c | 117
-rw-r--r--  fs/btrfs/ulist.h | 39
-rw-r--r--  fs/btrfs/uuid-tree.c | 13
-rw-r--r--  fs/btrfs/volumes.c | 108
-rw-r--r--  fs/btrfs/xattr.c | 17
-rw-r--r--  fs/btrfs/xattr.h | 2
-rw-r--r--  fs/btrfs/zlib.c | 8
-rw-r--r--  fs/buffer.c | 20
-rw-r--r--  fs/ceph/Kconfig | 13
-rw-r--r--  fs/ceph/Makefile | 1
-rw-r--r--  fs/ceph/acl.c | 200
-rw-r--r--  fs/ceph/addr.c | 93
-rw-r--r--  fs/ceph/cache.h | 13
-rw-r--r--  fs/ceph/caps.c | 338
-rw-r--r--  fs/ceph/dir.c | 36
-rw-r--r--  fs/ceph/file.c | 438
-rw-r--r--  fs/ceph/inode.c | 36
-rw-r--r--  fs/ceph/ioctl.c | 8
-rw-r--r--  fs/ceph/mds_client.c | 132
-rw-r--r--  fs/ceph/mds_client.h | 2
-rw-r--r--  fs/ceph/strings.c | 2
-rw-r--r--  fs/ceph/super.c | 35
-rw-r--r--  fs/ceph/super.h | 49
-rw-r--r--  fs/ceph/xattr.c | 115
-rw-r--r--  fs/cifs/cifsacl.c | 101
-rw-r--r--  fs/cifs/cifsglob.h | 27
-rw-r--r--  fs/cifs/cifsproto.h | 31
-rw-r--r--  fs/cifs/cifssmb.c | 173
-rw-r--r--  fs/cifs/dir.c | 61
-rw-r--r--  fs/cifs/file.c | 96
-rw-r--r--  fs/cifs/inode.c | 193
-rw-r--r--  fs/cifs/link.c | 323
-rw-r--r--  fs/cifs/readdir.c | 2
-rw-r--r--  fs/cifs/smb1ops.c | 135
-rw-r--r--  fs/cifs/smb2glob.h | 3
-rw-r--r--  fs/cifs/smb2ops.c | 14
-rw-r--r--  fs/cifs/smb2pdu.c | 9
-rw-r--r--  fs/cifs/smb2proto.h | 3
-rw-r--r--  fs/cifs/transport.c | 29
-rw-r--r--  fs/cifs/xattr.c | 64
-rw-r--r--  fs/compat_ioctl.c | 3
-rw-r--r--  fs/coredump.c | 1
-rw-r--r--  fs/coredump.h | 6
-rw-r--r--  fs/cramfs/inode.c | 50
-rw-r--r--  fs/cramfs/internal.h | 4
-rw-r--r--  fs/cramfs/uncompress.c | 2
-rw-r--r--  fs/dcache.c | 21
-rw-r--r--  fs/dcookies.c | 2
-rw-r--r--  fs/direct-io.c | 4
-rw-r--r--  fs/dlm/lowcomms.c | 12
-rw-r--r--  fs/ecryptfs/inode.c | 29
-rw-r--r--  fs/efs/super.c | 39
-rw-r--r--  fs/eventfd.c | 13
-rw-r--r--  fs/exec.c | 165
-rw-r--r--  fs/exofs/inode.c | 31
-rw-r--r--  fs/exofs/ore.c | 45
-rw-r--r--  fs/ext2/acl.c | 188
-rw-r--r--  fs/ext2/acl.h | 8
-rw-r--r--  fs/ext2/file.c | 1
-rw-r--r--  fs/ext2/inode.c | 2
-rw-r--r--  fs/ext2/namei.c | 2
-rw-r--r--  fs/ext2/xattr.c | 8
-rw-r--r--  fs/ext2/xattr.h | 2
-rw-r--r--  fs/ext3/acl.c | 223
-rw-r--r--  fs/ext3/acl.h | 9
-rw-r--r--  fs/ext3/dir.c | 44
-rw-r--r--  fs/ext3/file.c | 1
-rw-r--r--  fs/ext3/inode.c | 2
-rw-r--r--  fs/ext3/namei.c | 2
-rw-r--r--  fs/ext3/xattr.c | 8
-rw-r--r--  fs/ext3/xattr.h | 2
-rw-r--r--  fs/ext4/acl.c | 223
-rw-r--r--  fs/ext4/acl.h | 9
-rw-r--r--  fs/ext4/block_validity.c | 33
-rw-r--r--  fs/ext4/dir.c | 35
-rw-r--r--  fs/ext4/ext4.h | 2
-rw-r--r--  fs/ext4/extents.c | 5
-rw-r--r--  fs/ext4/file.c | 3
-rw-r--r--  fs/ext4/inline.c | 26
-rw-r--r--  fs/ext4/inode.c | 32
-rw-r--r--  fs/ext4/ioctl.c | 9
-rw-r--r--  fs/ext4/namei.c | 7
-rw-r--r--  fs/ext4/page-io.c | 8
-rw-r--r--  fs/ext4/resize.c | 34
-rw-r--r--  fs/ext4/super.c | 20
-rw-r--r--  fs/ext4/xattr.c | 8
-rw-r--r--  fs/ext4/xattr.h | 2
-rw-r--r--  fs/f2fs/Makefile | 2
-rw-r--r--  fs/f2fs/acl.c | 174
-rw-r--r--  fs/f2fs/acl.h | 7
-rw-r--r--  fs/f2fs/checkpoint.c | 195
-rw-r--r--  fs/f2fs/data.c | 614
-rw-r--r--  fs/f2fs/debug.c | 53
-rw-r--r--  fs/f2fs/dir.c | 47
-rw-r--r--  fs/f2fs/f2fs.h | 199
-rw-r--r--  fs/f2fs/file.c | 87
-rw-r--r--  fs/f2fs/gc.c | 22
-rw-r--r--  fs/f2fs/gc.h | 2
-rw-r--r--  fs/f2fs/inline.c | 222
-rw-r--r--  fs/f2fs/inode.c | 23
-rw-r--r--  fs/f2fs/namei.c | 7
-rw-r--r--  fs/f2fs/node.c | 272
-rw-r--r--  fs/f2fs/node.h | 8
-rw-r--r--  fs/f2fs/recovery.c | 49
-rw-r--r--  fs/f2fs/segment.c | 584
-rw-r--r--  fs/f2fs/segment.h | 81
-rw-r--r--  fs/f2fs/super.c | 72
-rw-r--r--  fs/f2fs/xattr.c | 11
-rw-r--r--  fs/f2fs/xattr.h | 2
-rw-r--r--  fs/file.c | 123
-rw-r--r--  fs/file_table.c | 1
-rw-r--r--  fs/fs-writeback.c | 48
-rw-r--r--  fs/fscache/object-list.c | 5
-rw-r--r--  fs/fscache/object.c | 3
-rw-r--r--  fs/fuse/dev.c | 25
-rw-r--r--  fs/fuse/dir.c | 14
-rw-r--r--  fs/fuse/file.c | 44
-rw-r--r--  fs/fuse/fuse_i.h | 5
-rw-r--r--  fs/generic_acl.c | 184
-rw-r--r--  fs/gfs2/acl.c | 234
-rw-r--r--  fs/gfs2/acl.h | 4
-rw-r--r--  fs/gfs2/aops.c | 23
-rw-r--r--  fs/gfs2/dir.c | 90
-rw-r--r--  fs/gfs2/dir.h | 19
-rw-r--r--  fs/gfs2/glock.c | 29
-rw-r--r--  fs/gfs2/glock.h | 2
-rw-r--r--  fs/gfs2/glops.c | 26
-rw-r--r--  fs/gfs2/incore.h | 23
-rw-r--r--  fs/gfs2/inode.c | 152
-rw-r--r--  fs/gfs2/lops.c | 7
-rw-r--r--  fs/gfs2/main.c | 1
-rw-r--r--  fs/gfs2/meta_io.c | 3
-rw-r--r--  fs/gfs2/ops_fstype.c | 60
-rw-r--r--  fs/gfs2/quota.c | 342
-rw-r--r--  fs/gfs2/quota.h | 1
-rw-r--r--  fs/gfs2/rgrp.c | 113
-rw-r--r--  fs/gfs2/rgrp.h | 2
-rw-r--r--  fs/gfs2/super.c | 43
-rw-r--r--  fs/gfs2/xattr.c | 4
-rw-r--r--  fs/hfsplus/acl.h | 9
-rw-r--r--  fs/hfsplus/catalog.c | 41
-rw-r--r--  fs/hfsplus/dir.c | 3
-rw-r--r--  fs/hfsplus/hfsplus_fs.h | 1
-rw-r--r--  fs/hfsplus/hfsplus_raw.h | 6
-rw-r--r--  fs/hfsplus/inode.c | 73
-rw-r--r--  fs/hfsplus/options.c | 2
-rw-r--r--  fs/hfsplus/posix_acl.c | 168
-rw-r--r--  fs/hfsplus/wrapper.c | 2
-rw-r--r--  fs/hfsplus/xattr.c | 150
-rw-r--r--  fs/hfsplus/xattr.h | 4
-rw-r--r--  fs/hostfs/hostfs_kern.c | 53
-rw-r--r--  fs/hpfs/alloc.c | 66
-rw-r--r--  fs/hpfs/buffer.c | 96
-rw-r--r--  fs/hpfs/hpfs_fn.h | 2
-rw-r--r--  fs/hpfs/super.c | 29
-rw-r--r--  fs/jbd/journal.c | 8
-rw-r--r--  fs/jbd/transaction.c | 4
-rw-r--r--  fs/jbd2/transaction.c | 6
-rw-r--r--  fs/jffs2/acl.c | 141
-rw-r--r--  fs/jffs2/acl.h | 7
-rw-r--r--  fs/jffs2/dir.c | 1
-rw-r--r--  fs/jffs2/file.c | 1
-rw-r--r--  fs/jffs2/fs.c | 7
-rw-r--r--  fs/jffs2/malloc.c | 4
-rw-r--r--  fs/jffs2/nodelist.c | 28
-rw-r--r--  fs/jffs2/readinode.c | 26
-rw-r--r--  fs/jffs2/symlink.c | 1
-rw-r--r--  fs/jffs2/xattr.c | 9
-rw-r--r--  fs/jfs/acl.c | 107
-rw-r--r--  fs/jfs/file.c | 4
-rw-r--r--  fs/jfs/jfs_acl.h | 7
-rw-r--r--  fs/jfs/jfs_logmgr.c | 12
-rw-r--r--  fs/jfs/jfs_metapage.c | 9
-rw-r--r--  fs/jfs/jfs_xattr.h | 2
-rw-r--r--  fs/jfs/namei.c | 1
-rw-r--r--  fs/jfs/super.c | 2
-rw-r--r--  fs/jfs/xattr.c | 123
-rw-r--r--  fs/kernfs/Makefile | 5
-rw-r--r--  fs/kernfs/dir.c | 1077
-rw-r--r--  fs/kernfs/file.c | 867
-rw-r--r--  fs/kernfs/inode.c | 377
-rw-r--r--  fs/kernfs/kernfs-internal.h | 122
-rw-r--r--  fs/kernfs/mount.c | 171
-rw-r--r--  fs/kernfs/symlink.c | 151
-rw-r--r--  fs/lockd/svclock.c | 8
-rw-r--r--  fs/logfs/dev_bdev.c | 38
-rw-r--r--  fs/logfs/segment.c | 3
-rw-r--r--  fs/mount.h | 6
-rw-r--r--  fs/mpage.c | 19
-rw-r--r--  fs/namei.c | 91
-rw-r--r--  fs/namespace.c | 179
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 43
-rw-r--r--  fs/nfs/delegation.c | 11
-rw-r--r--  fs/nfs/dir.c | 36
-rw-r--r--  fs/nfs/direct.c | 279
-rw-r--r--  fs/nfs/file.c | 6
-rw-r--r--  fs/nfs/inode.c | 117
-rw-r--r--  fs/nfs/internal.h | 12
-rw-r--r--  fs/nfs/nfs3acl.c | 295
-rw-r--r--  fs/nfs/nfs3proc.c | 77
-rw-r--r--  fs/nfs/nfs3super.c | 3
-rw-r--r--  fs/nfs/nfs4_fs.h | 1
-rw-r--r--  fs/nfs/nfs4client.c | 29
-rw-r--r--  fs/nfs/nfs4filelayout.c | 34
-rw-r--r--  fs/nfs/nfs4filelayoutdev.c | 2
-rw-r--r--  fs/nfs/nfs4namespace.c | 12
-rw-r--r--  fs/nfs/nfs4proc.c | 106
-rw-r--r--  fs/nfs/nfs4session.c | 25
-rw-r--r--  fs/nfs/nfs4session.h | 2
-rw-r--r--  fs/nfs/nfs4state.c | 23
-rw-r--r--  fs/nfs/nfs4super.c | 14
-rw-r--r--  fs/nfs/nfs4xdr.c | 49
-rw-r--r--  fs/nfs/nfstrace.h | 1
-rw-r--r--  fs/nfs/pnfs.c | 67
-rw-r--r--  fs/nfs/pnfs.h | 16
-rw-r--r--  fs/nfs/read.c | 12
-rw-r--r--  fs/nfs/write.c | 26
-rw-r--r--  fs/nfsd/acl.h | 18
-rw-r--r--  fs/nfsd/cache.h | 8
-rw-r--r--  fs/nfsd/idmap.h | 4
-rw-r--r--  fs/nfsd/netns.h | 1
-rw-r--r--  fs/nfsd/nfs2acl.c | 72
-rw-r--r--  fs/nfsd/nfs3acl.c | 62
-rw-r--r--  fs/nfsd/nfs3xdr.c | 14
-rw-r--r--  fs/nfsd/nfs4acl.c | 139
-rw-r--r--  fs/nfsd/nfs4idmap.c | 50
-rw-r--r--  fs/nfsd/nfs4proc.c | 58
-rw-r--r--  fs/nfsd/nfs4state.c | 40
-rw-r--r--  fs/nfsd/nfs4xdr.c | 178
-rw-r--r--  fs/nfsd/nfscache.c | 36
-rw-r--r--  fs/nfsd/nfssvc.c | 30
-rw-r--r--  fs/nfsd/nfsxdr.c | 2
-rw-r--r--  fs/nfsd/vfs.c | 282
-rw-r--r--  fs/nfsd/vfs.h | 10
-rw-r--r--  fs/nfsd/xdr3.h | 3
-rw-r--r--  fs/nfsd/xdr4.h | 4
-rw-r--r--  fs/nilfs2/ioctl.c | 371
-rw-r--r--  fs/nilfs2/segbuf.c | 3
-rw-r--r--  fs/nilfs2/segment.c | 10
-rw-r--r--  fs/nls/mac-celtic.c | 1
-rw-r--r--  fs/nls/mac-centeuro.c | 1
-rw-r--r--  fs/nls/mac-croatian.c | 1
-rw-r--r--  fs/nls/mac-cyrillic.c | 1
-rw-r--r--  fs/nls/mac-gaelic.c | 1
-rw-r--r--  fs/nls/mac-greek.c | 1
-rw-r--r--  fs/nls/mac-iceland.c | 1
-rw-r--r--  fs/nls/mac-inuit.c | 1
-rw-r--r--  fs/nls/mac-roman.c | 1
-rw-r--r--  fs/nls/mac-romanian.c | 1
-rw-r--r--  fs/nls/mac-turkish.c | 1
-rw-r--r--  fs/nls/nls_ascii.c | 1
-rw-r--r--  fs/nls/nls_base.c | 5
-rw-r--r--  fs/nls/nls_cp1250.c | 1
-rw-r--r--  fs/nls/nls_cp1251.c | 1
-rw-r--r--  fs/nls/nls_cp1255.c | 1
-rw-r--r--  fs/nls/nls_cp437.c | 1
-rw-r--r--  fs/nls/nls_cp737.c | 1
-rw-r--r--  fs/nls/nls_cp775.c | 1
-rw-r--r--  fs/nls/nls_cp850.c | 1
-rw-r--r--  fs/nls/nls_cp852.c | 1
-rw-r--r--  fs/nls/nls_cp855.c | 1
-rw-r--r--  fs/nls/nls_cp857.c | 1
-rw-r--r--  fs/nls/nls_cp860.c | 1
-rw-r--r--  fs/nls/nls_cp861.c | 1
-rw-r--r--  fs/nls/nls_cp862.c | 1
-rw-r--r--  fs/nls/nls_cp863.c | 1
-rw-r--r--  fs/nls/nls_cp864.c | 1
-rw-r--r--  fs/nls/nls_cp865.c | 1
-rw-r--r--  fs/nls/nls_cp866.c | 1
-rw-r--r--  fs/nls/nls_cp869.c | 1
-rw-r--r--  fs/nls/nls_cp874.c | 1
-rw-r--r--  fs/nls/nls_cp932.c | 1
-rw-r--r--  fs/nls/nls_cp936.c | 1
-rw-r--r--  fs/nls/nls_cp949.c | 1
-rw-r--r--  fs/nls/nls_cp950.c | 1
-rw-r--r--  fs/nls/nls_euc-jp.c | 1
-rw-r--r--  fs/nls/nls_iso8859-1.c | 1
-rw-r--r--  fs/nls/nls_iso8859-13.c | 1
-rw-r--r--  fs/nls/nls_iso8859-14.c | 1
-rw-r--r--  fs/nls/nls_iso8859-15.c | 1
-rw-r--r--  fs/nls/nls_iso8859-2.c | 1
-rw-r--r--  fs/nls/nls_iso8859-3.c | 1
-rw-r--r--  fs/nls/nls_iso8859-4.c | 1
-rw-r--r--  fs/nls/nls_iso8859-5.c | 1
-rw-r--r--  fs/nls/nls_iso8859-6.c | 1
-rw-r--r--  fs/nls/nls_iso8859-7.c | 1
-rw-r--r--  fs/nls/nls_iso8859-9.c | 1
-rw-r--r--  fs/nls/nls_koi8-r.c | 1
-rw-r--r--  fs/nls/nls_koi8-ru.c | 1
-rw-r--r--  fs/nls/nls_koi8-u.c | 1
-rw-r--r--  fs/nls/nls_utf8.c | 1
-rw-r--r--  fs/notify/dnotify/dnotify.c | 34
-rw-r--r--  fs/notify/fanotify/fanotify.c | 234
-rw-r--r--  fs/notify/fanotify/fanotify.h | 30
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 63
-rw-r--r--  fs/notify/fsnotify.c | 42
-rw-r--r--  fs/notify/group.c | 7
-rw-r--r--  fs/notify/inotify/inotify.h | 21
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 161
-rw-r--r--  fs/notify/inotify/inotify_user.c | 131
-rw-r--r--  fs/notify/notification.c | 358
-rw-r--r--  fs/ntfs/file.c | 2
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/acl.c | 234
-rw-r--r--  fs/ocfs2/acl.h | 13
-rw-r--r--  fs/ocfs2/alloc.c | 50
-rw-r--r--  fs/ocfs2/cluster/Makefile | 2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 4
-rw-r--r--  fs/ocfs2/cluster/ver.c | 42
-rw-r--r--  fs/ocfs2/cluster/ver.h | 31
-rw-r--r--  fs/ocfs2/dlm/Makefile | 2
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 5
-rw-r--r--  fs/ocfs2/dlm/dlmver.c | 42
-rw-r--r--  fs/ocfs2/dlm/dlmver.h | 31
-rw-r--r--  fs/ocfs2/dlmfs/Makefile | 2
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 4
-rw-r--r--  fs/ocfs2/dlmfs/dlmfsver.c | 42
-rw-r--r--  fs/ocfs2/dlmfs/dlmfsver.h | 31
-rw-r--r--  fs/ocfs2/dlmglue.c | 4
-rw-r--r--  fs/ocfs2/file.c | 67
-rw-r--r--  fs/ocfs2/ioctl.c | 7
-rw-r--r--  fs/ocfs2/localalloc.c | 42
-rw-r--r--  fs/ocfs2/localalloc.h | 6
-rw-r--r--  fs/ocfs2/move_extents.c | 77
-rw-r--r--  fs/ocfs2/namei.c | 44
-rw-r--r--  fs/ocfs2/ocfs2.h | 1
-rw-r--r--  fs/ocfs2/quota_global.c | 27
-rw-r--r--  fs/ocfs2/quota_local.c | 4
-rw-r--r--  fs/ocfs2/refcounttree.c | 19
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 3
-rw-r--r--  fs/ocfs2/stack_user.c | 308
-rw-r--r--  fs/ocfs2/stackglue.c | 18
-rw-r--r--  fs/ocfs2/stackglue.h | 15
-rw-r--r--  fs/ocfs2/suballoc.c | 12
-rw-r--r--  fs/ocfs2/suballoc.h | 12
-rw-r--r--  fs/ocfs2/super.c | 20
-rw-r--r--  fs/ocfs2/ver.c | 43
-rw-r--r--  fs/ocfs2/ver.h | 31
-rw-r--r--  fs/ocfs2/xattr.c | 21
-rw-r--r--  fs/ocfs2/xattr.h | 6
-rw-r--r--  fs/open.c | 4
-rw-r--r--  fs/pipe.c | 3
-rw-r--r--  fs/pnode.c | 26
-rw-r--r--  fs/pnode.h | 4
-rw-r--r--  fs/posix_acl.c | 526
-rw-r--r--  fs/proc/array.c | 18
-rw-r--r--  fs/proc/base.c | 70
-rw-r--r--  fs/proc/cmdline.c | 2
-rw-r--r--  fs/proc/consoles.c | 2
-rw-r--r--  fs/proc/cpuinfo.c | 2
-rw-r--r--  fs/proc/devices.c | 2
-rw-r--r--  fs/proc/generic.c | 3
-rw-r--r--  fs/proc/interrupts.c | 2
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/proc/kmsg.c | 2
-rw-r--r--  fs/proc/loadavg.c | 2
-rw-r--r--  fs/proc/meminfo.c | 39
-rw-r--r--  fs/proc/nommu.c | 2
-rw-r--r--  fs/proc/page.c | 9
-rw-r--r--  fs/proc/proc_devtree.c | 5
-rw-r--r--  fs/proc/softirqs.c | 2
-rw-r--r--  fs/proc/stat.c | 2
-rw-r--r--  fs/proc/uptime.c | 2
-rw-r--r--  fs/proc/version.c | 2
-rw-r--r--  fs/proc/vmcore.c | 28
-rw-r--r--  fs/proc_namespace.c | 7
-rw-r--r--  fs/qnx4/inode.c | 63
-rw-r--r--  fs/qnx4/qnx4.h | 2
-rw-r--r--  fs/quota/dquot.c | 14
-rw-r--r--  fs/ramfs/file-mmu.c | 7
-rw-r--r--  fs/ramfs/file-nommu.c | 17
-rw-r--r--  fs/ramfs/inode.c | 9
-rw-r--r--  fs/ramfs/internal.h | 1
-rw-r--r--  fs/read_write.c | 64
-rw-r--r--  fs/reiserfs/acl.h | 4
-rw-r--r--  fs/reiserfs/do_balan.c | 895
-rw-r--r--  fs/reiserfs/file.c | 1
-rw-r--r--  fs/reiserfs/namei.c | 4
-rw-r--r--  fs/reiserfs/procfs.c | 4
-rw-r--r--  fs/reiserfs/reiserfs.h | 10
-rw-r--r--  fs/reiserfs/super.c | 8
-rw-r--r--  fs/reiserfs/xattr.c | 5
-rw-r--r--  fs/reiserfs/xattr_acl.c | 190
-rw-r--r--  fs/romfs/super.c | 6
-rw-r--r--  fs/splice.c | 18
-rw-r--r--  fs/super.c | 6
-rw-r--r--  fs/sync.c | 32
-rw-r--r--  fs/sysfs/Makefile | 2
-rw-r--r--  fs/sysfs/dir.c | 1075
-rw-r--r--  fs/sysfs/file.c | 961
-rw-r--r--  fs/sysfs/group.c | 102
-rw-r--r--  fs/sysfs/inode.c | 331
-rw-r--r--  fs/sysfs/mount.c | 185
-rw-r--r--  fs/sysfs/symlink.c | 219
-rw-r--r--  fs/sysfs/sysfs.h | 236
-rw-r--r--  fs/ubifs/debug.c | 22
-rw-r--r--  fs/ubifs/log.c | 21
-rw-r--r--  fs/ubifs/orphan.c | 21
-rw-r--r--  fs/ubifs/recovery.c | 21
-rw-r--r--  fs/ubifs/super.c | 24
-rw-r--r--  fs/ubifs/tnc.c | 22
-rw-r--r--  fs/udf/file.c | 14
-rw-r--r--  fs/udf/inode.c | 1
-rw-r--r--  fs/udf/namei.c | 2
-rw-r--r--  fs/xattr_acl.c | 180
-rw-r--r--  fs/xfs/xfs_acl.c | 151
-rw-r--r--  fs/xfs/xfs_acl.h | 9
-rw-r--r--  fs/xfs/xfs_aops.c | 4
-rw-r--r--  fs/xfs/xfs_attr.c | 5
-rw-r--r--  fs/xfs/xfs_attr_list.c | 8
-rw-r--r--  fs/xfs/xfs_attr_remote.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.c | 4
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 36
-rw-r--r--  fs/xfs/xfs_buf.c | 44
-rw-r--r--  fs/xfs/xfs_buf.h | 20
-rw-r--r--  fs/xfs/xfs_buf_item.c | 103
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 4
-rw-r--r--  fs/xfs/xfs_dir2_sf.c | 58
-rw-r--r--  fs/xfs/xfs_dquot.c | 7
-rw-r--r--  fs/xfs/xfs_dquot_item.c | 67
-rw-r--r--  fs/xfs/xfs_dquot_item.h | 3
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 21
-rw-r--r--  fs/xfs/xfs_file.c | 19
-rw-r--r--  fs/xfs/xfs_ialloc.c | 53
-rw-r--r--  fs/xfs/xfs_ialloc.h | 21
-rw-r--r--  fs/xfs/xfs_icreate_item.c | 10
-rw-r--r--  fs/xfs/xfs_inode.c | 85
-rw-r--r--  fs/xfs/xfs_inode.h | 4
-rw-r--r--  fs/xfs/xfs_inode_fork.c | 17
-rw-r--r--  fs/xfs/xfs_inode_item.c | 400
-rw-r--r--  fs/xfs/xfs_inode_item.h | 5
-rw-r--r--  fs/xfs/xfs_ioctl.c | 6
-rw-r--r--  fs/xfs/xfs_iops.c | 145
-rw-r--r--  fs/xfs/xfs_iops.h | 2
-rw-r--r--  fs/xfs/xfs_itable.c | 22
-rw-r--r--  fs/xfs/xfs_log.h | 46
-rw-r--r--  fs/xfs/xfs_log_cil.c | 89
-rw-r--r--  fs/xfs/xfs_log_recover.c | 33
-rw-r--r--  fs/xfs/xfs_mount.c | 24
-rw-r--r--  fs/xfs/xfs_qm.c | 6
-rw-r--r--  fs/xfs/xfs_qm.h | 18
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 18
-rw-r--r--  fs/xfs/xfs_quota_priv.h | 42
-rw-r--r--  fs/xfs/xfs_sb.c | 10
-rw-r--r--  fs/xfs/xfs_super.c | 2
-rw-r--r--  fs/xfs/xfs_trans.h | 2
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 4
-rw-r--r--  fs/xfs/xfs_trans_resv.c | 10
-rw-r--r--  fs/xfs/xfs_trans_space.h | 2
-rw-r--r--  fs/xfs/xfs_vnode.h | 9
-rw-r--r--  fs/xfs/xfs_xattr.c | 4
530 files changed, 18623 insertions, 15505 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 7af425f53bee..8482f2d11606 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -156,7 +156,7 @@ int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)
 		return -EOPNOTSUPP;
 	acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
 	if (acl) {
-		retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+		retval = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
 		if (retval)
 			return retval;
 		set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
@@ -200,7 +200,7 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 	if (acl) {
 		if (S_ISDIR(mode))
 			*dpacl = posix_acl_dup(acl);
-		retval = posix_acl_create(&acl, GFP_NOFS, &mode);
+		retval = __posix_acl_create(&acl, GFP_NOFS, &mode);
 		if (retval < 0)
 			return retval;
 		if (retval > 0)
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 2b7a032c37bc..a69260f27555 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -239,13 +239,12 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode)
 void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
 {
 	struct v9fs_inode *v9inode = V9FS_I(inode);
-	struct p9_fid *fid;
 
 	if (!v9inode->fscache)
 		return;
 
 	spin_lock(&v9inode->fscache_lock);
-	fid = filp->private_data;
+
 	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
 		v9fs_cache_inode_flush_cookie(inode);
 	else
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 08f2e1e9a7e6..14da82564f4e 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -56,7 +56,7 @@ enum {
 	/* Options that take no arguments */
 	Opt_nodevmap,
 	/* Cache options */
-	Opt_cache_loose, Opt_fscache,
+	Opt_cache_loose, Opt_fscache, Opt_mmap,
 	/* Access options */
 	Opt_access, Opt_posixacl,
 	/* Error token */
@@ -74,6 +74,7 @@ static const match_table_t tokens = {
 	{Opt_cache, "cache=%s"},
 	{Opt_cache_loose, "loose"},
 	{Opt_fscache, "fscache"},
+	{Opt_mmap, "mmap"},
 	{Opt_cachetag, "cachetag=%s"},
 	{Opt_access, "access=%s"},
 	{Opt_posixacl, "posixacl"},
@@ -91,6 +92,9 @@ static int get_cache_mode(char *s)
 	} else if (!strcmp(s, "fscache")) {
 		version = CACHE_FSCACHE;
 		p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n");
+	} else if (!strcmp(s, "mmap")) {
+		version = CACHE_MMAP;
+		p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n");
 	} else if (!strcmp(s, "none")) {
 		version = CACHE_NONE;
 		p9_debug(P9_DEBUG_9P, "Cache mode: none\n");
@@ -220,6 +224,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_fscache:
 			v9ses->cache = CACHE_FSCACHE;
 			break;
+		case Opt_mmap:
+			v9ses->cache = CACHE_MMAP;
+			break;
 		case Opt_cachetag:
 #ifdef CONFIG_9P_FSCACHE
 			v9ses->cachetag = match_strdup(&args[0]);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a8e127c89627..099c7712631c 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -64,6 +64,7 @@ enum p9_session_flags {
 
 enum p9_cache_modes {
 	CACHE_NONE,
+	CACHE_MMAP,
 	CACHE_LOOSE,
 	CACHE_FSCACHE,
 };
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index dc95a252523d..b83ebfbf3fdc 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -50,6 +50,8 @@ extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 extern const struct file_operations v9fs_cached_file_operations;
 extern const struct file_operations v9fs_cached_file_operations_dotl;
+extern const struct file_operations v9fs_mmap_file_operations;
+extern const struct file_operations v9fs_mmap_file_operations_dotl;
 extern struct kmem_cache *v9fs_inode_cache;
 
 struct inode *v9fs_alloc_inode(struct super_block *sb);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 9ff073f4090a..c71e88602ff4 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -202,6 +202,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int retval;
 
+	p9_debug(P9_DEBUG_VFS, "page %p\n", page);
+
 	retval = v9fs_vfs_writepage_locked(page);
 	if (retval < 0) {
 		if (retval == -EAGAIN) {
@@ -282,6 +284,9 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = mapping->host;
 
+
+	p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
+
 	v9inode = V9FS_I(inode);
start:
 	page = grab_cache_page_write_begin(mapping, index, flags);
@@ -312,6 +317,8 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
 	loff_t last_pos = pos + copied;
 	struct inode *inode = page->mapping->host;
 
+	p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
+
 	if (unlikely(copied < len)) {
 		/*
 		 * zero out the rest of the area
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index a0df3e73c2b1..a16b0ff497ca 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -45,6 +45,7 @@
 #include "cache.h"
 
 static const struct vm_operations_struct v9fs_file_vm_ops;
+static const struct vm_operations_struct v9fs_mmap_file_vm_ops;
 
 /**
  * v9fs_file_open - open a file (or directory)
@@ -87,7 +88,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 
 	file->private_data = fid;
 	mutex_lock(&v9inode->v_mutex);
-	if (v9ses->cache && !v9inode->writeback_fid &&
+	if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+	    !v9inode->writeback_fid &&
 	    ((file->f_flags & O_ACCMODE) != O_RDONLY)) {
 		/*
 		 * clone a fid and add it to writeback_fid
@@ -105,7 +107,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		v9inode->writeback_fid = (void *) fid;
 	}
 	mutex_unlock(&v9inode->v_mutex);
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		v9fs_cache_inode_set_cookie(inode, file);
 	return 0;
 out_error:
@@ -461,14 +463,12 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
 	int n;
 	loff_t i_size;
 	size_t total = 0;
-	struct p9_client *clnt;
 	loff_t origin = *offset;
 	unsigned long pg_start, pg_end;
 
 	p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n",
 		 data, (int)count, (int)*offset);
 
-	clnt = fid->clnt;
 	do {
 		n = p9_client_write(fid, NULL, data+total, origin+total, count);
 		if (n <= 0)
@@ -581,11 +581,12 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
 }
 
 static int
-v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
+v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	int retval;
 
-	retval = generic_file_mmap(file, vma);
+
+	retval = generic_file_mmap(filp, vma);
 	if (!retval)
 		vma->vm_ops = &v9fs_file_vm_ops;
 
@@ -593,6 +594,43 @@ v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 }
 
 static int
+v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	int retval;
+	struct inode *inode;
+	struct v9fs_inode *v9inode;
+	struct p9_fid *fid;
+
+	inode = file_inode(filp);
+	v9inode = V9FS_I(inode);
+	mutex_lock(&v9inode->v_mutex);
+	if (!v9inode->writeback_fid &&
+	    (vma->vm_flags & VM_WRITE)) {
+		/*
+		 * clone a fid and add it to writeback_fid
+		 * we do it during mmap instead of
+		 * page dirty time via write_begin/page_mkwrite
+		 * because we want write after unlink usecase
+		 * to work.
+		 */
+		fid = v9fs_writeback_fid(filp->f_path.dentry);
+		if (IS_ERR(fid)) {
+			retval = PTR_ERR(fid);
+			mutex_unlock(&v9inode->v_mutex);
+			return retval;
+		}
+		v9inode->writeback_fid = (void *) fid;
+	}
+	mutex_unlock(&v9inode->v_mutex);
+
+	retval = generic_file_mmap(filp, vma);
+	if (!retval)
+		vma->vm_ops = &v9fs_mmap_file_vm_ops;
+
+	return retval;
+}
+
+static int
 v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct v9fs_inode *v9inode;
@@ -660,6 +698,22 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
 	return do_sync_read(filp, data, count, offset);
 }
 
+/**
+ * v9fs_mmap_file_read - read from a file
+ * @filp: file pointer to read
+ * @udata: user data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+static ssize_t
+v9fs_mmap_file_read(struct file *filp, char __user *data, size_t count,
+		    loff_t *offset)
+{
+	/* TODO: Check if there are dirty pages */
+	return v9fs_file_read(filp, data, count, offset);
+}
+
 static ssize_t
 v9fs_direct_write(struct file *filp, const char __user * data,
 		  size_t count, loff_t *offsetp)
@@ -730,12 +784,65 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
 	return do_sync_write(filp, data, count, offset);
 }
 
+
+/**
+ * v9fs_mmap_file_write - write to a file
+ * @filp: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ */
+static ssize_t
+v9fs_mmap_file_write(struct file *filp, const char __user *data,
+		     size_t count, loff_t *offset)
+{
+	/*
+	 * TODO: invalidate mmaps on filp's inode between
+	 * offset and offset+count
+	 */
+	return v9fs_file_write(filp, data, count, offset);
+}
+
+static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
+{
+	struct inode *inode;
+
+	struct writeback_control wbc = {
+		.nr_to_write = LONG_MAX,
+		.sync_mode = WB_SYNC_ALL,
+		.range_start = vma->vm_pgoff * PAGE_SIZE,
+		 /* absolute end, byte at end included */
+		.range_end = vma->vm_pgoff * PAGE_SIZE +
+			(vma->vm_end - vma->vm_start - 1),
+	};
+
+
+	p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
+
+	inode = file_inode(vma->vm_file);
+
+	if (!mapping_cap_writeback_dirty(inode->i_mapping))
+		wbc.nr_to_write = 0;
+
+	might_sleep();
+	sync_inode(inode, &wbc);
+}
+
+
 static const struct vm_operations_struct v9fs_file_vm_ops = {
 	.fault = filemap_fault,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
 
+static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
+	.close = v9fs_mmap_vm_close,
+	.fault = filemap_fault,
+	.page_mkwrite = v9fs_vm_page_mkwrite,
+	.remap_pages = generic_file_remap_pages,
+};
+
 
 const struct file_operations v9fs_cached_file_operations = {
 	.llseek = generic_file_llseek,
@@ -786,3 +893,26 @@ const struct file_operations v9fs_file_operations_dotl = {
 	.mmap = generic_file_readonly_mmap,
 	.fsync = v9fs_file_fsync_dotl,
 };
+
+const struct file_operations v9fs_mmap_file_operations = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_mmap_file_read,
+	.write = v9fs_mmap_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock,
+	.mmap = v9fs_mmap_file_mmap,
+	.fsync = v9fs_file_fsync,
+};
+
+const struct file_operations v9fs_mmap_file_operations_dotl = {
+	.llseek = generic_file_llseek,
+	.read = v9fs_mmap_file_read,
+	.write = v9fs_mmap_file_write,
+	.open = v9fs_file_open,
+	.release = v9fs_dir_release,
+	.lock = v9fs_file_lock_dotl,
+	.flock = v9fs_file_flock_dotl,
+	.mmap = v9fs_mmap_file_mmap,
+	.fsync = v9fs_file_fsync_dotl,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4e65aa903345..bb7991c7e5c7 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -299,15 +299,22 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 	case S_IFREG:
 		if (v9fs_proto_dotl(v9ses)) {
 			inode->i_op = &v9fs_file_inode_operations_dotl;
-			if (v9ses->cache)
+			if (v9ses->cache == CACHE_LOOSE ||
+			    v9ses->cache == CACHE_FSCACHE)
 				inode->i_fop =
 					&v9fs_cached_file_operations_dotl;
+			else if (v9ses->cache == CACHE_MMAP)
+				inode->i_fop = &v9fs_mmap_file_operations_dotl;
 			else
 				inode->i_fop = &v9fs_file_operations_dotl;
 		} else {
 			inode->i_op = &v9fs_file_inode_operations;
-			if (v9ses->cache)
-				inode->i_fop = &v9fs_cached_file_operations;
+			if (v9ses->cache == CACHE_LOOSE ||
+			    v9ses->cache == CACHE_FSCACHE)
+				inode->i_fop =
+					&v9fs_cached_file_operations;
+			else if (v9ses->cache == CACHE_MMAP)
+				inode->i_fop = &v9fs_mmap_file_operations;
 			else
 				inode->i_fop = &v9fs_file_operations;
 		}
@@ -779,7 +786,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 			      unsigned int flags)
 {
 	struct dentry *res;
-	struct super_block *sb;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *dfid, *fid;
 	struct inode *inode;
@@ -791,7 +797,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
 	/* We can walk d_parent because we hold the dir->i_mutex */
 	dfid = v9fs_fid_lookup(dentry->d_parent);
@@ -812,7 +817,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	 * unlink. For cached mode create calls request for new
 	 * inode. But with cache disabled, lookup should do this.
 	 */
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 	else
 		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -863,7 +868,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 		return finish_no_open(file, res);
 
 	err = 0;
-	fid = NULL;
+
 	v9ses = v9fs_inode2v9ses(dir);
 	perm = unixmode2p9mode(v9ses, mode);
 	fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
@@ -878,7 +883,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	v9fs_invalidate_inode_attr(dir);
 	v9inode = V9FS_I(dentry->d_inode);
 	mutex_lock(&v9inode->v_mutex);
-	if (v9ses->cache && !v9inode->writeback_fid &&
+	if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+	    !v9inode->writeback_fid &&
 	    ((flags & O_ACCMODE) != O_RDONLY)) {
 		/*
 		 * clone a fid and add it to writeback_fid
@@ -901,7 +907,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 		goto error;
 
 	file->private_data = fid;
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		v9fs_cache_inode_set_cookie(dentry->d_inode, file);
 
 	*opened |= FILE_CREATED;
@@ -1479,7 +1485,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	 */
 	i_size = inode->i_size;
 	v9fs_stat2inode(st, inode, inode->i_sb);
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		inode->i_size = i_size;
 	spin_unlock(&inode->i_lock);
 out:
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 4c10edec26a0..59dc8e87647f 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -330,7 +330,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 
 	v9inode = V9FS_I(inode);
 	mutex_lock(&v9inode->v_mutex);
-	if (v9ses->cache && !v9inode->writeback_fid &&
+	if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+	    !v9inode->writeback_fid &&
 	    ((flags & O_ACCMODE) != O_RDONLY)) {
 		/*
 		 * clone a fid and add it to writeback_fid
@@ -353,7 +354,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	if (err)
 		goto err_clunk_old_fid;
 	file->private_data = ofid;
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		v9fs_cache_inode_set_cookie(inode, file);
 	*opened |= FILE_CREATED;
 out:
@@ -473,13 +474,11 @@ static int
 v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
 		      struct kstat *stat)
 {
-	int err;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
 	struct p9_stat_dotl *st;
 
 	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
-	err = -EPERM;
 	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
 		generic_fillattr(dentry->d_inode, stat);
@@ -556,7 +555,6 @@ static int v9fs_mapped_iattr_valid(int iattr_valid)
 int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 {
 	int retval;
-	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
 	struct p9_iattr_dotl p9attr;
 	struct inode *inode = dentry->d_inode;
@@ -577,8 +575,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 	p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
 	p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
 
-	retval = -EPERM;
-	v9ses = v9fs_dentry2v9ses(dentry);
 	fid = v9fs_fid_lookup(dentry);
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
@@ -715,7 +711,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 	}
 
 	v9fs_invalidate_inode_attr(dir);
-	if (v9ses->cache) {
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
 		/* Now walk from the parent so we can get an unopened fid. */
 		fid = p9_client_walk(dfid, 1, &name, 1);
 		if (IS_ERR(fid)) {
@@ -768,7 +764,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 		   struct dentry *dentry)
 {
 	int err;
-	char *name;
 	struct dentry *dir_dentry;
 	struct p9_fid *dfid, *oldfid;
 	struct v9fs_session_info *v9ses;
@@ -786,8 +781,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 	if (IS_ERR(oldfid))
 		return PTR_ERR(oldfid);
 
-	name = (char *) dentry->d_name.name;
-
 	err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
 
 	if (err < 0) {
@@ -973,7 +966,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	 */
 	i_size = inode->i_size;
 	v9fs_stat2inode_dotl(st, inode);
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		inode->i_size = i_size;
 	spin_unlock(&inode->i_lock);
 out:
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 2756dcd5de6e..0afd0382822b 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -144,7 +144,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	}
 	v9fs_fill_super(sb, v9ses, flags, data);
 
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		sb->s_d_op = &v9fs_cached_dentry_operations;
 	else
 		sb->s_d_op = &v9fs_dentry_operations;
@@ -282,7 +282,7 @@ static int v9fs_drop_inode(struct inode *inode)
 {
 	struct v9fs_session_info *v9ses;
 	v9ses = v9fs_inode2v9ses(inode);
-	if (v9ses->cache)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		return generic_drop_inode(inode);
 	/*
 	 * in case of non cached mode always drop the
@@ -325,10 +325,12 @@ static int v9fs_write_inode_dotl(struct inode *inode,
 	 * send an fsync request to server irrespective of
 	 * wbc->sync_mode.
 	 */
-	p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
 	v9inode = V9FS_I(inode);
+	p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n",
+		 __func__, inode, v9inode->writeback_fid);
 	if (!v9inode->writeback_fid)
 		return 0;
+
 	ret = p9_client_fsync(v9inode->writeback_fid, 0);
 	if (ret < 0) {
 		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 3c28cdfb8c47..04133a1fd9cb 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -138,8 +138,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
 	if (retval < 0) {
 		p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
 			 retval);
-		p9_client_clunk(fid);
-		return retval;
+		goto err;
 	}
 	msize = fid->clnt->msize;
 	while (value_len) {
@@ -152,12 +151,15 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
 		if (write_count < 0) {
 			/* error in xattr write */
 			retval = write_count;
-			break;
+			goto err;
 		}
 		offset += write_count;
 		value_len -= write_count;
 	}
-	return p9_client_clunk(fid);
+	retval = offset;
+err:
+	p9_client_clunk(fid);
+	return retval;
 }
 
 ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
diff --git a/fs/Kconfig b/fs/Kconfig
index c229f828eb01..7385e54be4b9 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -68,10 +68,6 @@ source "fs/quota/Kconfig"
68source "fs/autofs4/Kconfig" 68source "fs/autofs4/Kconfig"
69source "fs/fuse/Kconfig" 69source "fs/fuse/Kconfig"
70 70
71config GENERIC_ACL
72 bool
73 select FS_POSIX_ACL
74
75menu "Caches" 71menu "Caches"
76 72
77source "fs/fscache/Kconfig" 73source "fs/fscache/Kconfig"
@@ -119,7 +115,7 @@ config TMPFS_POSIX_ACL
119 bool "Tmpfs POSIX Access Control Lists" 115 bool "Tmpfs POSIX Access Control Lists"
120 depends on TMPFS 116 depends on TMPFS
121 select TMPFS_XATTR 117 select TMPFS_XATTR
122 select GENERIC_ACL 118 select FS_POSIX_ACL
123 help 119 help
124 POSIX Access Control Lists (ACLs) support additional access rights 120 POSIX Access Control Lists (ACLs) support additional access rights
125 for users and groups beyond the standard owner/group/world scheme, 121 for users and groups beyond the standard owner/group/world scheme,
diff --git a/fs/Makefile b/fs/Makefile
index 4fe6df3ec28f..47ac07bb4acc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -42,9 +42,8 @@ obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
 obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
 
 obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
-obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o xattr_acl.o
+obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
-obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 obj-$(CONFIG_COREDUMP)		+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o
 
@@ -53,7 +52,7 @@ obj-$(CONFIG_FHANDLE) += fhandle.o
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
-obj-$(CONFIG_SYSFS)		+= sysfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/ kernfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 45161a832bbc..d098731b82ff 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -49,11 +49,6 @@ affs_put_super(struct super_block *sb)
49 pr_debug("AFFS: put_super()\n"); 49 pr_debug("AFFS: put_super()\n");
50 50
51 cancel_delayed_work_sync(&sbi->sb_work); 51 cancel_delayed_work_sync(&sbi->sb_work);
52 kfree(sbi->s_prefix);
53 affs_free_bitmap(sb);
54 affs_brelse(sbi->s_root_bh);
55 kfree(sbi);
56 sb->s_fs_info = NULL;
57} 52}
58 53
59static int 54static int
@@ -316,7 +311,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned long		 mount_flags;
 	int			 tmp_flags;	/* fix remount prototype... */
 	u8			 sig[4];
-	int			 ret = -EINVAL;
+	int			 ret;
 
 	save_mount_options(sb, data);
 
@@ -412,17 +407,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 		if (!silent)
 			printk(KERN_ERR "AFFS: No valid root block on device %s\n",
 				sb->s_id);
-		goto out_error;
+		return -EINVAL;
 
 	/* N.B. after this point bh must be released */
 got_root:
+	/* Keep super block in cache */
+	sbi->s_root_bh = root_bh;
 	root_block = sbi->s_root_block;
 
 	/* Find out which kind of FS we have */
 	boot_bh = sb_bread(sb, 0);
 	if (!boot_bh) {
 		printk(KERN_ERR "AFFS: Cannot read boot block\n");
-		goto out_error;
+		return -EINVAL;
 	}
 	memcpy(sig, boot_bh->b_data, 4);
 	brelse(boot_bh);
@@ -471,7 +468,7 @@ got_root:
 	default:
 		printk(KERN_ERR "AFFS: Unknown filesystem on device %s: %08X\n",
 			sb->s_id, chksum);
-		goto out_error;
+		return -EINVAL;
 	}
 
 	if (mount_flags & SF_VERBOSE) {
@@ -488,22 +485,17 @@ got_root:
488 if (sbi->s_flags & SF_OFS) 485 if (sbi->s_flags & SF_OFS)
489 sbi->s_data_blksize -= 24; 486 sbi->s_data_blksize -= 24;
490 487
491 /* Keep super block in cache */
492 sbi->s_root_bh = root_bh;
493 /* N.B. after this point s_root_bh must be released */
494
495 tmp_flags = sb->s_flags; 488 tmp_flags = sb->s_flags;
496 if (affs_init_bitmap(sb, &tmp_flags)) 489 ret = affs_init_bitmap(sb, &tmp_flags);
497 goto out_error; 490 if (ret)
491 return ret;
498 sb->s_flags = tmp_flags; 492 sb->s_flags = tmp_flags;
499 493
500 /* set up enough so that it can read an inode */ 494 /* set up enough so that it can read an inode */
501 495
502 root_inode = affs_iget(sb, root_block); 496 root_inode = affs_iget(sb, root_block);
503 if (IS_ERR(root_inode)) { 497 if (IS_ERR(root_inode))
504 ret = PTR_ERR(root_inode); 498 return PTR_ERR(root_inode);
505 goto out_error;
506 }
507 499
508 if (AFFS_SB(sb)->s_flags & SF_INTL) 500 if (AFFS_SB(sb)->s_flags & SF_INTL)
509 sb->s_d_op = &affs_intl_dentry_operations; 501 sb->s_d_op = &affs_intl_dentry_operations;
@@ -513,22 +505,11 @@ got_root:
 	sb->s_root = d_make_root(root_inode);
 	if (!sb->s_root) {
 		printk(KERN_ERR "AFFS: Get root inode failed\n");
-		goto out_error;
+		return -ENOMEM;
 	}
 
 	pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
 	return 0;
-
-	/*
-	 * Begin the cascaded cleanup ...
-	 */
-out_error:
-	kfree(sbi->s_bitmap);
-	affs_brelse(root_bh);
-	kfree(sbi->s_prefix);
-	kfree(sbi);
-	sb->s_fs_info = NULL;
-	return ret;
 }
 
 static int
@@ -615,11 +596,23 @@ static struct dentry *affs_mount(struct file_system_type *fs_type,
 	return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
 }
 
+static void affs_kill_sb(struct super_block *sb)
+{
+	struct affs_sb_info *sbi = AFFS_SB(sb);
+	kill_block_super(sb);
+	if (sbi) {
+		affs_free_bitmap(sb);
+		affs_brelse(sbi->s_root_bh);
+		kfree(sbi->s_prefix);
+		kfree(sbi);
+	}
+}
+
 static struct file_system_type affs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "affs",
 	.mount		= affs_mount,
-	.kill_sb	= kill_block_super,
+	.kill_sb	= affs_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 MODULE_ALIAS_FS("affs");
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a306bb6d88d9..6621f8008122 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -195,7 +195,6 @@ struct afs_cell {
 	struct list_head	link;		/* main cell list link */
 	struct key		*anonymous_key;	/* anonymous user key for this cell */
 	struct list_head	proc_link;	/* /proc cell list link */
-	struct proc_dir_entry	*proc_dir;	/* /proc dir for this cell */
 #ifdef CONFIG_AFS_FSCACHE
 	struct fscache_cookie	*cache;		/* caching cookie */
 #endif
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 526e4bbbde59..24a905b076fd 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -41,11 +41,8 @@ static const struct file_operations afs_proc_cells_fops = {
 	.write		= afs_proc_cells_write,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
-	.owner		= THIS_MODULE,
 };
 
-static int afs_proc_rootcell_open(struct inode *inode, struct file *file);
-static int afs_proc_rootcell_release(struct inode *inode, struct file *file);
 static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
 				      size_t size, loff_t *_pos);
 static ssize_t afs_proc_rootcell_write(struct file *file,
@@ -53,17 +50,12 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
 				       size_t size, loff_t *_pos);
 
 static const struct file_operations afs_proc_rootcell_fops = {
-	.open		= afs_proc_rootcell_open,
 	.read		= afs_proc_rootcell_read,
 	.write		= afs_proc_rootcell_write,
 	.llseek		= no_llseek,
-	.release	= afs_proc_rootcell_release,
-	.owner		= THIS_MODULE,
 };
 
 static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file);
-static int afs_proc_cell_volumes_release(struct inode *inode,
-					 struct file *file);
 static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos);
 static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
 					loff_t *pos);
@@ -81,14 +73,11 @@ static const struct file_operations afs_proc_cell_volumes_fops = {
 	.open		= afs_proc_cell_volumes_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= afs_proc_cell_volumes_release,
-	.owner		= THIS_MODULE,
+	.release	= seq_release,
 };
 
 static int afs_proc_cell_vlservers_open(struct inode *inode,
 					struct file *file);
-static int afs_proc_cell_vlservers_release(struct inode *inode,
-					   struct file *file);
 static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos);
 static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
 					  loff_t *pos);
@@ -106,13 +95,10 @@ static const struct file_operations afs_proc_cell_vlservers_fops = {
 	.open		= afs_proc_cell_vlservers_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= afs_proc_cell_vlservers_release,
-	.owner		= THIS_MODULE,
+	.release	= seq_release,
 };
 
 static int afs_proc_cell_servers_open(struct inode *inode, struct file *file);
-static int afs_proc_cell_servers_release(struct inode *inode,
-					 struct file *file);
 static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos);
 static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
 					loff_t *pos);
@@ -130,8 +116,7 @@ static const struct file_operations afs_proc_cell_servers_fops = {
130 .open = afs_proc_cell_servers_open, 116 .open = afs_proc_cell_servers_open,
131 .read = seq_read, 117 .read = seq_read,
132 .llseek = seq_lseek, 118 .llseek = seq_lseek,
133 .release = afs_proc_cell_servers_release, 119 .release = seq_release,
134 .owner = THIS_MODULE,
135}; 120};
136 121
137/* 122/*
@@ -139,29 +124,21 @@ static const struct file_operations afs_proc_cell_servers_fops = {
139 */ 124 */
140int afs_proc_init(void) 125int afs_proc_init(void)
141{ 126{
142 struct proc_dir_entry *p;
143
144 _enter(""); 127 _enter("");
145 128
146 proc_afs = proc_mkdir("fs/afs", NULL); 129 proc_afs = proc_mkdir("fs/afs", NULL);
147 if (!proc_afs) 130 if (!proc_afs)
148 goto error_dir; 131 goto error_dir;
149 132
150 p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); 133 if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) ||
151 if (!p) 134 !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops))
152 goto error_cells; 135 goto error_tree;
153
154 p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops);
155 if (!p)
156 goto error_rootcell;
157 136
158 _leave(" = 0"); 137 _leave(" = 0");
159 return 0; 138 return 0;
160 139
161error_rootcell: 140error_tree:
162 remove_proc_entry("cells", proc_afs); 141 remove_proc_subtree("fs/afs", NULL);
163error_cells:
164 remove_proc_entry("fs/afs", NULL);
165error_dir: 142error_dir:
166 _leave(" = -ENOMEM"); 143 _leave(" = -ENOMEM");
167 return -ENOMEM; 144 return -ENOMEM;
@@ -172,9 +149,7 @@ error_dir:
172 */ 149 */
173void afs_proc_cleanup(void) 150void afs_proc_cleanup(void)
174{ 151{
175 remove_proc_entry("rootcell", proc_afs); 152 remove_proc_subtree("fs/afs", NULL);
176 remove_proc_entry("cells", proc_afs);
177 remove_proc_entry("fs/afs", NULL);
178} 153}
179 154
180/* 155/*
@@ -319,19 +294,6 @@ inval:
319 goto done; 294 goto done;
320} 295}
321 296
322/*
323 * Stubs for /proc/fs/afs/rootcell
324 */
325static int afs_proc_rootcell_open(struct inode *inode, struct file *file)
326{
327 return 0;
328}
329
330static int afs_proc_rootcell_release(struct inode *inode, struct file *file)
331{
332 return 0;
333}
334
335static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, 297static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
336 size_t size, loff_t *_pos) 298 size_t size, loff_t *_pos)
337{ 299{
@@ -387,38 +349,27 @@ nomem:
387 */ 349 */
388int afs_proc_cell_setup(struct afs_cell *cell) 350int afs_proc_cell_setup(struct afs_cell *cell)
389{ 351{
390 struct proc_dir_entry *p; 352 struct proc_dir_entry *dir;
391 353
392 _enter("%p{%s}", cell, cell->name); 354 _enter("%p{%s}", cell, cell->name);
393 355
394 cell->proc_dir = proc_mkdir(cell->name, proc_afs); 356 dir = proc_mkdir(cell->name, proc_afs);
395 if (!cell->proc_dir) 357 if (!dir)
396 goto error_dir; 358 goto error_dir;
397 359
398 p = proc_create_data("servers", 0, cell->proc_dir, 360 if (!proc_create_data("servers", 0, dir,
399 &afs_proc_cell_servers_fops, cell); 361 &afs_proc_cell_servers_fops, cell) ||
400 if (!p) 362 !proc_create_data("vlservers", 0, dir,
401 goto error_servers; 363 &afs_proc_cell_vlservers_fops, cell) ||
402 364 !proc_create_data("volumes", 0, dir,
403 p = proc_create_data("vlservers", 0, cell->proc_dir, 365 &afs_proc_cell_volumes_fops, cell))
404 &afs_proc_cell_vlservers_fops, cell); 366 goto error_tree;
405 if (!p)
406 goto error_vlservers;
407
408 p = proc_create_data("volumes", 0, cell->proc_dir,
409 &afs_proc_cell_volumes_fops, cell);
410 if (!p)
411 goto error_volumes;
412 367
413 _leave(" = 0"); 368 _leave(" = 0");
414 return 0; 369 return 0;
415 370
416error_volumes: 371error_tree:
417 remove_proc_entry("vlservers", cell->proc_dir); 372 remove_proc_subtree(cell->name, proc_afs);
418error_vlservers:
419 remove_proc_entry("servers", cell->proc_dir);
420error_servers:
421 remove_proc_entry(cell->name, proc_afs);
422error_dir: 373error_dir:
423 _leave(" = -ENOMEM"); 374 _leave(" = -ENOMEM");
424 return -ENOMEM; 375 return -ENOMEM;
@@ -431,10 +382,7 @@ void afs_proc_cell_remove(struct afs_cell *cell)
431{ 382{
432 _enter(""); 383 _enter("");
433 384
434 remove_proc_entry("volumes", cell->proc_dir); 385 remove_proc_subtree(cell->name, proc_afs);
435 remove_proc_entry("vlservers", cell->proc_dir);
436 remove_proc_entry("servers", cell->proc_dir);
437 remove_proc_entry(cell->name, proc_afs);
438 386
439 _leave(""); 387 _leave("");
440} 388}
@@ -463,14 +411,6 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
463} 411}
464 412
465/* 413/*
466 * close the file and release the ref to the cell
467 */
468static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file)
469{
470 return seq_release(inode, file);
471}
472
473/*
474 * set up the iterator to start reading from the cells list and return the 414 * set up the iterator to start reading from the cells list and return the
475 * first item 415 * first item
476 */ 416 */
@@ -569,15 +509,6 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
569} 509}
570 510
571/* 511/*
572 * close the file and release the ref to the cell
573 */
574static int afs_proc_cell_vlservers_release(struct inode *inode,
575 struct file *file)
576{
577 return seq_release(inode, file);
578}
579
580/*
581 * set up the iterator to start reading from the cells list and return the 512 * set up the iterator to start reading from the cells list and return the
582 * first item 513 * first item
583 */ 514 */
@@ -673,15 +604,6 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
673} 604}
674 605
675/* 606/*
676 * close the file and release the ref to the cell
677 */
678static int afs_proc_cell_servers_release(struct inode *inode,
679 struct file *file)
680{
681 return seq_release(inode, file);
682}
683
684/*
685 * set up the iterator to start reading from the cells list and return the 607 * set up the iterator to start reading from the cells list and return the
686 * first item 608 * first item
687 */ 609 */
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 24084732b1d0..80ef38c73e5a 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -41,19 +41,8 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {
 static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
 				int flags, const char *dev_name, void *data)
 {
-	struct dentry *root;
-	root = mount_pseudo(fs_type, "anon_inode:", NULL,
+	return mount_pseudo(fs_type, "anon_inode:", NULL,
 			&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
-	if (!IS_ERR(root)) {
-		struct super_block *s = root->d_sb;
-		anon_inode_inode = alloc_anon_inode(s);
-		if (IS_ERR(anon_inode_inode)) {
-			dput(root);
-			deactivate_locked_super(s);
-			root = ERR_CAST(anon_inode_inode);
-		}
-	}
-	return root;
 }
 
 static struct file_system_type anon_inode_fs_type = {
@@ -175,22 +164,15 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd);
 
 static int __init anon_inode_init(void)
 {
-	int error;
-
-	error = register_filesystem(&anon_inode_fs_type);
-	if (error)
-		goto err_exit;
 	anon_inode_mnt = kern_mount(&anon_inode_fs_type);
-	if (IS_ERR(anon_inode_mnt)) {
-		error = PTR_ERR(anon_inode_mnt);
-		goto err_unregister_filesystem;
-	}
-	return 0;
+	if (IS_ERR(anon_inode_mnt))
+		panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt));
 
-err_unregister_filesystem:
-	unregister_filesystem(&anon_inode_fs_type);
-err_exit:
-	panic(KERN_ERR "anon_inode_init() failed (%d)\n", error);
+	anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+	if (IS_ERR(anon_inode_inode))
+		panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
+
+	return 0;
 }
 
 fs_initcall(anon_inode_init);
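Two things change together here: anon_inodefs no longer allocates a per-mount inode, because a single anon_inode_inode set up at boot is shared by every anon-inode file, and the init path now panics on failure, which is defensible for an fs_initcall that core in-kernel users depend on. For orientation, a consumer obtains such a file through anon_inode_getfd(); a sketch under assumed names (mydev is hypothetical):

#include <linux/anon_inodes.h>
#include <linux/fs.h>

/* Hand out a descriptor backed by the shared anonymous inode. Each
 * call still gets a private struct file; only the inode is shared. */
static int mydev_create_handle(void *priv, const struct file_operations *fops)
{
	return anon_inode_getfd("[mydev]", fops, priv, O_RDWR | O_CLOEXEC);
}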
diff --git a/fs/attr.c b/fs/attr.c
index 267968d94673..5d4e59d56e85 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -202,11 +202,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 		return -EPERM;
 	}
 
-	if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) {
-		if (attr->ia_size != inode->i_size)
-			inode_inc_iversion(inode);
-	}
-
 	if ((ia_valid & ATTR_MODE)) {
 		umode_t amode = attr->ia_mode;
 		/* Flag setting protected by i_mutex */
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 4218e26df916..acf32054edd8 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -104,7 +104,7 @@ struct autofs_sb_info {
 	u32 magic;
 	int pipefd;
 	struct file *pipe;
-	pid_t oz_pgrp;
+	struct pid *oz_pgrp;
 	int catatonic;
 	int version;
 	int sub_version;
@@ -140,7 +140,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
    filesystem without "magic".) */
 
 static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
-	return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp;
+	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
 }
 
 /* Does a dentry have some pending activity? */
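Switching oz_pgrp from a numeric pid_t to a struct pid * is what makes the rest of this series namespace-correct: a struct pid is a stable, reference-counted object, so the comparison in autofs4_oz_mode() holds no matter which pid namespace the observer lives in. A sketch of the reference discipline the autofs4 changes follow (example_ctx is hypothetical):

#include <linux/pid.h>
#include <linux/sched.h>

struct example_ctx {
	struct pid *oz_pgrp;
};

static void example_store_pgrp(struct example_ctx *ctx)
{
	struct pid *pgrp = get_task_pid(current, PIDTYPE_PGID); /* +1 ref */

	put_pid(ctx->oz_pgrp);	/* drop the previous reference, if any */
	ctx->oz_pgrp = pgrp;	/* later compared via task_pgrp(current) */
}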
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 1818ce7f5a06..3182c0e68b42 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -346,6 +346,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 {
 	int pipefd;
 	int err = 0;
+	struct pid *new_pid = NULL;
 
 	if (param->setpipefd.pipefd == -1)
 		return -EINVAL;
@@ -357,7 +358,17 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 		mutex_unlock(&sbi->wq_mutex);
 		return -EBUSY;
 	} else {
-		struct file *pipe = fget(pipefd);
+		struct file *pipe;
+
+		new_pid = get_task_pid(current, PIDTYPE_PGID);
+
+		if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
+			AUTOFS_WARN("Not allowed to change PID namespace");
+			err = -EINVAL;
+			goto out;
+		}
+
+		pipe = fget(pipefd);
 		if (!pipe) {
 			err = -EBADF;
 			goto out;
@@ -367,12 +378,13 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 			fput(pipe);
 			goto out;
 		}
-		sbi->oz_pgrp = task_pgrp_nr(current);
+		swap(sbi->oz_pgrp, new_pid);
 		sbi->pipefd = pipefd;
 		sbi->pipe = pipe;
 		sbi->catatonic = 0;
 	}
 out:
+	put_pid(new_pid);
 	mutex_unlock(&sbi->wq_mutex);
 	return err;
 }
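The swap()/put_pid() pairing above is the subtle part: after the swap, new_pid holds whatever reference was previously published in sbi->oz_pgrp, so the single put_pid() at the out label drops the old group's reference on success and the never-published reference on the early-error paths (put_pid(NULL) is a no-op). The flow in isolation, as a sketch with a hypothetical checks_ok condition:

static int example_set_pgrp(struct autofs_sb_info *sbi, bool checks_ok)
{
	struct pid *new_pid = get_task_pid(current, PIDTYPE_PGID);
	int err = 0;

	if (checks_ok)
		swap(sbi->oz_pgrp, new_pid);	/* publish new, capture old */
	else
		err = -EINVAL;

	put_pid(new_pid);	/* old ref on success, unused ref on failure */
	return err;
}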
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3d9d3f5d5dda..394e90b02c5e 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -402,6 +402,20 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			goto next;
 		}
 
+		if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+			DPRINTK("checking symlink %p %.*s",
+				dentry, (int)dentry->d_name.len, dentry->d_name.name);
+			/*
+			 * A symlink can't be "busy" in the usual sense so
+			 * just check last used for expire timeout.
+			 */
+			if (autofs4_can_expire(dentry, timeout, do_now)) {
+				expired = dentry;
+				goto found;
+			}
+			goto next;
+		}
+
 		if (simple_empty(dentry))
 			goto next;
 
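Symlinks now take part in expiry purely on age, since no mount or open file can pin them the way it pins a directory. The timeout test that autofs4_can_expire() applies is roughly the following (a sketch, not the verbatim helper):

#include <linux/jiffies.h>

static int example_can_expire(struct autofs_info *ino,
			      unsigned long timeout, int do_now)
{
	/* immediate expire requests skip the age check */
	if (do_now)
		return 1;

	/* otherwise expire only when last_used is older than the timeout */
	return time_after(jiffies, ino->last_used + timeout);
}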
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 3b9cc9b973c2..d7bd395ab586 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -56,8 +56,11 @@ void autofs4_kill_sb(struct super_block *sb)
 	 * just call kill_anon_super when we are called from
 	 * deactivate_super.
 	 */
-	if (sbi) /* Free wait queues, close pipe */
+	if (sbi) {
+		/* Free wait queues, close pipe */
 		autofs4_catatonic_mode(sbi);
+		put_pid(sbi->oz_pgrp);
+	}
 
 	DPRINTK("shutting down");
 	kill_litter_super(sb);
@@ -80,7 +83,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
 	if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
 		seq_printf(m, ",gid=%u",
 			from_kgid_munged(&init_user_ns, root_inode->i_gid));
-	seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
+	seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp));
 	seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
 	seq_printf(m, ",minproto=%d", sbi->min_proto);
 	seq_printf(m, ",maxproto=%d", sbi->max_proto);
@@ -124,7 +127,8 @@ static const match_table_t tokens = {
 };
 
 static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
-		pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
+			 int *pgrp, bool *pgrp_set, unsigned int *type,
+			 int *minproto, int *maxproto)
 {
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
@@ -132,7 +136,6 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
 
 	*uid = current_uid();
 	*gid = current_gid();
-	*pgrp = task_pgrp_nr(current);
 
 	*minproto = AUTOFS_MIN_PROTO_VERSION;
 	*maxproto = AUTOFS_MAX_PROTO_VERSION;
@@ -171,6 +174,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
 			if (match_int(args, &option))
 				return 1;
 			*pgrp = option;
+			*pgrp_set = true;
 			break;
 		case Opt_minproto:
 			if (match_int(args, &option))
@@ -206,10 +210,13 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	int pipefd;
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
+	int pgrp;
+	bool pgrp_set = false;
+	int ret = -EINVAL;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
-		goto fail_unlock;
+		return -ENOMEM;
 	DPRINTK("starting up, sbi = %p",sbi);
 
 	s->s_fs_info = sbi;
@@ -218,7 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->pipe = NULL;
 	sbi->catatonic = 1;
 	sbi->exp_timeout = 0;
-	sbi->oz_pgrp = task_pgrp_nr(current);
+	sbi->oz_pgrp = NULL;
 	sbi->sb = s;
 	sbi->version = 0;
 	sbi->sub_version = 0;
@@ -243,8 +250,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	 * Get the root inode and dentry, but defer checking for errors.
 	 */
 	ino = autofs4_new_ino(sbi);
-	if (!ino)
+	if (!ino) {
+		ret = -ENOMEM;
 		goto fail_free;
+	}
 	root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
 	root = d_make_root(root_inode);
 	if (!root)
@@ -255,12 +264,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 
 	/* Can this call block? */
 	if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
-			  &sbi->oz_pgrp, &sbi->type, &sbi->min_proto,
+			  &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
 			  &sbi->max_proto)) {
 		printk("autofs: called with bogus options\n");
 		goto fail_dput;
 	}
 
+	if (pgrp_set) {
+		sbi->oz_pgrp = find_get_pid(pgrp);
+		if (!sbi->oz_pgrp) {
+			pr_warn("autofs: could not find process group %d\n",
+				pgrp);
+			goto fail_dput;
+		}
+	} else {
+		sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
+	}
+
 	if (autofs_type_trigger(sbi->type))
 		__managed_dentry_set_managed(root);
 
@@ -284,14 +304,15 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->version = sbi->max_proto;
 	sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
 
-	DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp);
+	DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
 	pipe = fget(pipefd);
 
 	if (!pipe) {
 		printk("autofs: could not open pipe file descriptor\n");
 		goto fail_dput;
 	}
-	if (autofs_prepare_pipe(pipe) < 0)
+	ret = autofs_prepare_pipe(pipe);
+	if (ret < 0)
 		goto fail_fput;
 	sbi->pipe = pipe;
 	sbi->pipefd = pipefd;
@@ -316,10 +337,10 @@ fail_dput:
 fail_ino:
 	kfree(ino);
 fail_free:
+	put_pid(sbi->oz_pgrp);
 	kfree(sbi);
 	s->s_fs_info = NULL;
-fail_unlock:
-	return -EINVAL;
+	return ret;
 }
 
 struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode)
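Two behavioural points in this fill_super rework are easy to miss: an explicit pgrp= option is now resolved through find_get_pid(), which looks the number up in the mounter's pid namespace and returns a referenced struct pid or NULL, and the function finally propagates a real error code in ret instead of a blanket -EINVAL. The lookup in isolation (a sketch; the -ENOENT choice is illustrative):

#include <linux/err.h>
#include <linux/pid.h>

static struct pid *example_resolve_pgrp(int pgrp)
{
	struct pid *pid = find_get_pid(pgrp);	/* takes a reference */

	if (!pid)
		return ERR_PTR(-ENOENT);
	return pid;	/* caller must eventually put_pid() */
}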
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 92ef341ba0cf..2caf36ac3e93 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -558,7 +558,7 @@ static int autofs4_dir_symlink(struct inode *dir,
 	dget(dentry);
 	atomic_inc(&ino->count);
 	p_ino = autofs4_dentry_ino(dentry->d_parent);
-	if (p_ino && dentry->d_parent != dentry)
+	if (p_ino && !IS_ROOT(dentry))
 		atomic_inc(&p_ino->count);
 
 	dir->i_mtime = CURRENT_TIME;
@@ -593,7 +593,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 
 	if (atomic_dec_and_test(&ino->count)) {
 		p_ino = autofs4_dentry_ino(dentry->d_parent);
-		if (p_ino && dentry->d_parent != dentry)
+		if (p_ino && !IS_ROOT(dentry))
 			atomic_dec(&p_ino->count);
 	}
 	dput(ino->dentry);
@@ -732,7 +732,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
 	dget(dentry);
 	atomic_inc(&ino->count);
 	p_ino = autofs4_dentry_ino(dentry->d_parent);
-	if (p_ino && dentry->d_parent != dentry)
+	if (p_ino && !IS_ROOT(dentry))
 		atomic_inc(&p_ino->count);
 	inc_nlink(dir);
 	dir->i_mtime = CURRENT_TIME;
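All three hunks are the same purely cosmetic substitution: IS_ROOT() is the dcache helper for exactly this self-parent test.

/* include/linux/dcache.h */
#define IS_ROOT(x) ((x) == (x)->d_parent)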
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index f27c094a1919..1e8ea192be2b 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,6 +14,10 @@
 
 static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	if (ino && !autofs4_oz_mode(sbi))
+		ino->last_used = jiffies;
 	nd_set_link(nd, dentry->d_inode->i_private);
 	return NULL;
 }
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 689e40d983ad..116fd38ee472 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -347,11 +347,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 	struct qstr qstr;
 	char *name;
 	int status, ret, type;
+	pid_t pid;
+	pid_t tgid;
 
 	/* In catatonic mode, we don't wait for nobody */
 	if (sbi->catatonic)
 		return -ENOENT;
 
+	/*
+	 * Try translating pids to the namespace of the daemon.
+	 *
+	 * Zero means failure: we are in an unrelated pid namespace.
+	 */
+	pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+	tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+	if (pid == 0 || tgid == 0)
+		return -ENOENT;
+
 	if (!dentry->d_inode) {
 		/*
 		 * A wait for a negative dentry is invalid for certain
@@ -417,8 +429,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		wq->ino = autofs4_get_ino(sbi);
 		wq->uid = current_uid();
 		wq->gid = current_gid();
-		wq->pid = current->pid;
-		wq->tgid = current->tgid;
+		wq->pid = pid;
+		wq->tgid = tgid;
 		wq->status = -EINTR; /* Status return if interrupted */
 		wq->wait_ctr = 2;
 
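The point of the waitq change is that the pid and tgid handed to the automount daemon are now expressed in the daemon's own pid namespace rather than the kernel's global one. The translation idiom in isolation (a sketch):

#include <linux/pid.h>
#include <linux/sched.h>

/* Report the current task's pid as the daemon would see it; 0 means
 * the task is not visible from the daemon's pid namespace at all. */
static pid_t example_pid_as_seen_by(struct pid *daemon_pgrp)
{
	return task_pid_nr_ns(current, ns_of_pid(daemon_pgrp));
}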
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index daa15d6ba450..845d2d690ce2 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -324,8 +324,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
 	befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino);
 
 	inode = iget_locked(sb, ino);
-	if (IS_ERR(inode))
-		return inode;
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
 	if (!(inode->i_state & I_NEW))
 		return inode;
 
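This corrects a plain API misuse: iget_locked() signals allocation failure with NULL, never with an ERR_PTR. The canonical calling pattern, for reference (a sketch; the fill step is filesystem-specific):

static struct inode *example_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)			/* NULL on failure, not an ERR_PTR */
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;		/* found fully set up in the cache */

	/* ... read the on-disk inode and populate the fields here ... */

	unlock_new_inode(inode);
	return inode;
}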
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 571a42326908..67be2951b98a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -543,9 +543,6 @@ out:
 * libraries. There is no binary dependent code anywhere else.
 */
 
-#define INTERPRETER_NONE 0
-#define INTERPRETER_ELF 2
-
 #ifndef STACK_RND_MASK
 #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))	/* 8MB of VA */
 #endif
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index fc60b31453ee..4f70f383132c 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -114,6 +114,14 @@ void bio_integrity_free(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_integrity_free);
 
+static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
+{
+	if (bip->bip_slab == BIO_POOL_NONE)
+		return BIP_INLINE_VECS;
+
+	return bvec_nr_vecs(bip->bip_slab);
+}
+
 /**
  * bio_integrity_add_page - Attach integrity metadata
  * @bio:	bio to update
@@ -129,13 +137,12 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct bio_vec *iv;
 
-	if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
+	if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
 		printk(KERN_ERR "%s: bip_vec full\n", __func__);
 		return 0;
 	}
 
-	iv = bip_vec_idx(bip, bip->bip_vcnt);
-	BUG_ON(iv == NULL);
+	iv = bip->bip_vec + bip->bip_vcnt;
 
 	iv->bv_page = page;
 	iv->bv_len = len;
@@ -203,6 +210,12 @@ static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
 	return sectors;
 }
 
+static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
+					       unsigned int sectors)
+{
+	return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size;
+}
+
 /**
  * bio_integrity_tag_size - Retrieve integrity tag space
  * @bio:	bio to inspect
@@ -215,13 +228,14 @@ unsigned int bio_integrity_tag_size(struct bio *bio)
 {
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 
-	BUG_ON(bio->bi_size == 0);
+	BUG_ON(bio->bi_iter.bi_size == 0);
 
-	return bi->tag_size * (bio->bi_size / bi->sector_size);
+	return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size);
 }
 EXPORT_SYMBOL(bio_integrity_tag_size);
 
-int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
+static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len,
+			     int set)
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
@@ -235,9 +249,9 @@ int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
 	nr_sectors = bio_integrity_hw_sectors(bi,
 					DIV_ROUND_UP(len, bi->tag_size));
 
-	if (nr_sectors * bi->tuple_size > bip->bip_size) {
-		printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
-		       __func__, nr_sectors * bi->tuple_size, bip->bip_size);
+	if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) {
+		printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__,
+		       nr_sectors * bi->tuple_size, bip->bip_iter.bi_size);
 		return -1;
 	}
 
@@ -299,29 +313,30 @@ static void bio_integrity_generate(struct bio *bio)
 {
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 	struct blk_integrity_exchg bix;
-	struct bio_vec *bv;
-	sector_t sector = bio->bi_sector;
-	unsigned int i, sectors, total;
+	struct bio_vec bv;
+	struct bvec_iter iter;
+	sector_t sector = bio->bi_iter.bi_sector;
+	unsigned int sectors, total;
 	void *prot_buf = bio->bi_integrity->bip_buf;
 
 	total = 0;
 	bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
 	bix.sector_size = bi->sector_size;
 
-	bio_for_each_segment(bv, bio, i) {
-		void *kaddr = kmap_atomic(bv->bv_page);
-		bix.data_buf = kaddr + bv->bv_offset;
-		bix.data_size = bv->bv_len;
+	bio_for_each_segment(bv, bio, iter) {
+		void *kaddr = kmap_atomic(bv.bv_page);
+		bix.data_buf = kaddr + bv.bv_offset;
+		bix.data_size = bv.bv_len;
 		bix.prot_buf = prot_buf;
 		bix.sector = sector;
 
 		bi->generate_fn(&bix);
 
-		sectors = bv->bv_len / bi->sector_size;
+		sectors = bv.bv_len / bi->sector_size;
 		sector += sectors;
 		prot_buf += sectors * bi->tuple_size;
 		total += sectors * bi->tuple_size;
-		BUG_ON(total > bio->bi_integrity->bip_size);
+		BUG_ON(total > bio->bi_integrity->bip_iter.bi_size);
 
 		kunmap_atomic(kaddr);
 	}
@@ -386,8 +401,8 @@ int bio_integrity_prep(struct bio *bio)
 
 	bip->bip_owns_buf = 1;
 	bip->bip_buf = buf;
-	bip->bip_size = len;
-	bip->bip_sector = bio->bi_sector;
+	bip->bip_iter.bi_size = len;
+	bip->bip_iter.bi_sector = bio->bi_iter.bi_sector;
 
 	/* Map it */
 	offset = offset_in_page(buf);
@@ -442,16 +457,17 @@ static int bio_integrity_verify(struct bio *bio)
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 	struct blk_integrity_exchg bix;
 	struct bio_vec *bv;
-	sector_t sector = bio->bi_integrity->bip_sector;
-	unsigned int i, sectors, total, ret;
+	sector_t sector = bio->bi_integrity->bip_iter.bi_sector;
+	unsigned int sectors, ret = 0;
 	void *prot_buf = bio->bi_integrity->bip_buf;
+	int i;
 
-	ret = total = 0;
 	bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
 	bix.sector_size = bi->sector_size;
 
-	bio_for_each_segment(bv, bio, i) {
+	bio_for_each_segment_all(bv, bio, i) {
 		void *kaddr = kmap_atomic(bv->bv_page);
+
 		bix.data_buf = kaddr + bv->bv_offset;
 		bix.data_size = bv->bv_len;
 		bix.prot_buf = prot_buf;
@@ -467,8 +483,6 @@ static int bio_integrity_verify(struct bio *bio)
 		sectors = bv->bv_len / bi->sector_size;
 		sector += sectors;
 		prot_buf += sectors * bi->tuple_size;
-		total += sectors * bi->tuple_size;
-		BUG_ON(total > bio->bi_integrity->bip_size);
 
 		kunmap_atomic(kaddr);
 	}
@@ -495,7 +509,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
-	bio_endio(bio, error);
+	bio_endio_nodec(bio, error);
 }
 
 /**
@@ -533,56 +547,6 @@ void bio_integrity_endio(struct bio *bio, int error)
 EXPORT_SYMBOL(bio_integrity_endio);
 
 /**
- * bio_integrity_mark_head - Advance bip_vec skip bytes
- * @bip:	Integrity vector to advance
- * @skip:	Number of bytes to advance it
- */
-void bio_integrity_mark_head(struct bio_integrity_payload *bip,
-			     unsigned int skip)
-{
-	struct bio_vec *iv;
-	unsigned int i;
-
-	bip_for_each_vec(iv, bip, i) {
-		if (skip == 0) {
-			bip->bip_idx = i;
-			return;
-		} else if (skip >= iv->bv_len) {
-			skip -= iv->bv_len;
-		} else { /* skip < iv->bv_len) */
-			iv->bv_offset += skip;
-			iv->bv_len -= skip;
-			bip->bip_idx = i;
-			return;
-		}
-	}
-}
-
-/**
- * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
- * @bip:	Integrity vector to truncate
- * @len:	New length of integrity vector
- */
-void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
-			     unsigned int len)
-{
-	struct bio_vec *iv;
-	unsigned int i;
-
-	bip_for_each_vec(iv, bip, i) {
-		if (len == 0) {
-			bip->bip_vcnt = i;
-			return;
-		} else if (len >= iv->bv_len) {
-			len -= iv->bv_len;
-		} else { /* len < iv->bv_len) */
-			iv->bv_len = len;
-			len = 0;
-		}
-	}
-}
-
-/**
 * bio_integrity_advance - Advance integrity vector
 * @bio:	bio whose integrity vector to update
 * @bytes_done:	number of data bytes that have been completed
@@ -595,13 +559,9 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-	unsigned int nr_sectors;
-
-	BUG_ON(bip == NULL);
-	BUG_ON(bi == NULL);
+	unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
 
-	nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9);
-	bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size);
+	bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
 }
 EXPORT_SYMBOL(bio_integrity_advance);
 
@@ -621,64 +581,13 @@ void bio_integrity_trim(struct bio *bio, unsigned int offset,
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-	unsigned int nr_sectors;
 
-	BUG_ON(bip == NULL);
-	BUG_ON(bi == NULL);
-	BUG_ON(!bio_flagged(bio, BIO_CLONED));
-
-	nr_sectors = bio_integrity_hw_sectors(bi, sectors);
-	bip->bip_sector = bip->bip_sector + offset;
-	bio_integrity_mark_head(bip, offset * bi->tuple_size);
-	bio_integrity_mark_tail(bip, sectors * bi->tuple_size);
+	bio_integrity_advance(bio, offset << 9);
+	bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors);
 }
 EXPORT_SYMBOL(bio_integrity_trim);
 
 /**
- * bio_integrity_split - Split integrity metadata
- * @bio:	Protected bio
- * @bp:		Resulting bio_pair
- * @sectors:	Offset
- *
- * Description: Splits an integrity page into a bio_pair.
- */
-void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
-{
-	struct blk_integrity *bi;
-	struct bio_integrity_payload *bip = bio->bi_integrity;
-	unsigned int nr_sectors;
-
-	if (bio_integrity(bio) == 0)
-		return;
-
-	bi = bdev_get_integrity(bio->bi_bdev);
-	BUG_ON(bi == NULL);
-	BUG_ON(bip->bip_vcnt != 1);
-
-	nr_sectors = bio_integrity_hw_sectors(bi, sectors);
-
-	bp->bio1.bi_integrity = &bp->bip1;
-	bp->bio2.bi_integrity = &bp->bip2;
-
-	bp->iv1 = bip->bip_vec[bip->bip_idx];
-	bp->iv2 = bip->bip_vec[bip->bip_idx];
-
-	bp->bip1.bip_vec = &bp->iv1;
-	bp->bip2.bip_vec = &bp->iv2;
-
-	bp->iv1.bv_len = sectors * bi->tuple_size;
-	bp->iv2.bv_offset += sectors * bi->tuple_size;
-	bp->iv2.bv_len -= sectors * bi->tuple_size;
-
-	bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
-	bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
-
-	bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
-	bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
-}
-EXPORT_SYMBOL(bio_integrity_split);
-
-/**
 * bio_integrity_clone - Callback for cloning bios with integrity metadata
 * @bio:	New bio
 * @bio_src:	Original bio
@@ -702,9 +611,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
 	memcpy(bip->bip_vec, bip_src->bip_vec,
 	       bip_src->bip_vcnt * sizeof(struct bio_vec));
 
-	bip->bip_sector = bip_src->bip_sector;
 	bip->bip_vcnt = bip_src->bip_vcnt;
-	bip->bip_idx = bip_src->bip_idx;
+	bip->bip_iter = bip_src->bip_iter;
 
 	return 0;
 }
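Most of the churn in this file is the immutable-biovec conversion: the position state that used to live in separate fields (bip_sector, bip_size, bip_idx) is folded into a struct bvec_iter at bip->bip_iter, and iteration yields struct bio_vec by value, so walking a payload never modifies it and clones can safely share the parent's vector. The new-style loop in isolation (a sketch):

/* Sum the bytes a bio covers without mutating the bio itself. */
static unsigned example_bio_bytes(struct bio *bio)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	unsigned bytes = 0;

	bio_for_each_segment(bv, bio, iter)	/* iterates over a copy */
		bytes += bv.bv_len;

	return bytes;
}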
diff --git a/fs/bio.c b/fs/bio.c
index 33d79a4eb92d..8754e7b6eb49 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -38,8 +38,6 @@
38 */ 38 */
39#define BIO_INLINE_VECS 4 39#define BIO_INLINE_VECS 4
40 40
41static mempool_t *bio_split_pool __read_mostly;
42
43/* 41/*
44 * if you change this list, also change bvec_alloc or things will 42 * if you change this list, also change bvec_alloc or things will
45 * break badly! cannot be bigger than what you can fit into an 43 * break badly! cannot be bigger than what you can fit into an
@@ -273,6 +271,7 @@ void bio_init(struct bio *bio)
273{ 271{
274 memset(bio, 0, sizeof(*bio)); 272 memset(bio, 0, sizeof(*bio));
275 bio->bi_flags = 1 << BIO_UPTODATE; 273 bio->bi_flags = 1 << BIO_UPTODATE;
274 atomic_set(&bio->bi_remaining, 1);
276 atomic_set(&bio->bi_cnt, 1); 275 atomic_set(&bio->bi_cnt, 1);
277} 276}
278EXPORT_SYMBOL(bio_init); 277EXPORT_SYMBOL(bio_init);
@@ -295,9 +294,35 @@ void bio_reset(struct bio *bio)
295 294
296 memset(bio, 0, BIO_RESET_BYTES); 295 memset(bio, 0, BIO_RESET_BYTES);
297 bio->bi_flags = flags|(1 << BIO_UPTODATE); 296 bio->bi_flags = flags|(1 << BIO_UPTODATE);
297 atomic_set(&bio->bi_remaining, 1);
298} 298}
299EXPORT_SYMBOL(bio_reset); 299EXPORT_SYMBOL(bio_reset);
300 300
301static void bio_chain_endio(struct bio *bio, int error)
302{
303 bio_endio(bio->bi_private, error);
304 bio_put(bio);
305}
306
307/**
308 * bio_chain - chain bio completions
309 *
310 * The caller won't have a bi_end_io called when @bio completes - instead,
311 * @parent's bi_end_io won't be called until both @parent and @bio have
312 * completed; the chained bio will also be freed when it completes.
313 *
314 * The caller must not set bi_private or bi_end_io in @bio.
315 */
316void bio_chain(struct bio *bio, struct bio *parent)
317{
318 BUG_ON(bio->bi_private || bio->bi_end_io);
319
320 bio->bi_private = parent;
321 bio->bi_end_io = bio_chain_endio;
322 atomic_inc(&parent->bi_remaining);
323}
324EXPORT_SYMBOL(bio_chain);
325
301static void bio_alloc_rescue(struct work_struct *work) 326static void bio_alloc_rescue(struct work_struct *work)
302{ 327{
303 struct bio_set *bs = container_of(work, struct bio_set, rescue_work); 328 struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
@@ -473,13 +498,13 @@ EXPORT_SYMBOL(bio_alloc_bioset);
473void zero_fill_bio(struct bio *bio) 498void zero_fill_bio(struct bio *bio)
474{ 499{
475 unsigned long flags; 500 unsigned long flags;
476 struct bio_vec *bv; 501 struct bio_vec bv;
477 int i; 502 struct bvec_iter iter;
478 503
479 bio_for_each_segment(bv, bio, i) { 504 bio_for_each_segment(bv, bio, iter) {
480 char *data = bvec_kmap_irq(bv, &flags); 505 char *data = bvec_kmap_irq(&bv, &flags);
481 memset(data, 0, bv->bv_len); 506 memset(data, 0, bv.bv_len);
482 flush_dcache_page(bv->bv_page); 507 flush_dcache_page(bv.bv_page);
483 bvec_kunmap_irq(data, &flags); 508 bvec_kunmap_irq(data, &flags);
484 } 509 }
485} 510}
@@ -515,51 +540,49 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
515EXPORT_SYMBOL(bio_phys_segments); 540EXPORT_SYMBOL(bio_phys_segments);
516 541
517/** 542/**
518 * __bio_clone - clone a bio 543 * __bio_clone_fast - clone a bio that shares the original bio's biovec
519 * @bio: destination bio 544 * @bio: destination bio
520 * @bio_src: bio to clone 545 * @bio_src: bio to clone
521 * 546 *
522 * Clone a &bio. Caller will own the returned bio, but not 547 * Clone a &bio. Caller will own the returned bio, but not
523 * the actual data it points to. Reference count of returned 548 * the actual data it points to. Reference count of returned
524 * bio will be one. 549 * bio will be one.
550 *
551 * Caller must ensure that @bio_src is not freed before @bio.
525 */ 552 */
526void __bio_clone(struct bio *bio, struct bio *bio_src) 553void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
527{ 554{
528 memcpy(bio->bi_io_vec, bio_src->bi_io_vec, 555 BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE);
529 bio_src->bi_max_vecs * sizeof(struct bio_vec));
530 556
531 /* 557 /*
532 * most users will be overriding ->bi_bdev with a new target, 558 * most users will be overriding ->bi_bdev with a new target,
533 * so we don't set nor calculate new physical/hw segment counts here 559 * so we don't set nor calculate new physical/hw segment counts here
534 */ 560 */
535 bio->bi_sector = bio_src->bi_sector;
536 bio->bi_bdev = bio_src->bi_bdev; 561 bio->bi_bdev = bio_src->bi_bdev;
537 bio->bi_flags |= 1 << BIO_CLONED; 562 bio->bi_flags |= 1 << BIO_CLONED;
538 bio->bi_rw = bio_src->bi_rw; 563 bio->bi_rw = bio_src->bi_rw;
539 bio->bi_vcnt = bio_src->bi_vcnt; 564 bio->bi_iter = bio_src->bi_iter;
540 bio->bi_size = bio_src->bi_size; 565 bio->bi_io_vec = bio_src->bi_io_vec;
541 bio->bi_idx = bio_src->bi_idx;
542} 566}
543EXPORT_SYMBOL(__bio_clone); 567EXPORT_SYMBOL(__bio_clone_fast);
544 568
545/** 569/**
546 * bio_clone_bioset - clone a bio 570 * bio_clone_fast - clone a bio that shares the original bio's biovec
547 * @bio: bio to clone 571 * @bio: bio to clone
548 * @gfp_mask: allocation priority 572 * @gfp_mask: allocation priority
549 * @bs: bio_set to allocate from 573 * @bs: bio_set to allocate from
550 * 574 *
551 * Like __bio_clone, only also allocates the returned bio 575 * Like __bio_clone_fast, only also allocates the returned bio
552 */ 576 */
553struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask, 577struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
554 struct bio_set *bs)
555{ 578{
556 struct bio *b; 579 struct bio *b;
557 580
558 b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs); 581 b = bio_alloc_bioset(gfp_mask, 0, bs);
559 if (!b) 582 if (!b)
560 return NULL; 583 return NULL;
561 584
562 __bio_clone(b, bio); 585 __bio_clone_fast(b, bio);
563 586
564 if (bio_integrity(bio)) { 587 if (bio_integrity(bio)) {
565 int ret; 588 int ret;
@@ -574,6 +597,79 @@ struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
574 597
575 return b; 598 return b;
576} 599}
600EXPORT_SYMBOL(bio_clone_fast);
601
602/**
603 * bio_clone_bioset - clone a bio
604 * @bio_src: bio to clone
605 * @gfp_mask: allocation priority
606 * @bs: bio_set to allocate from
607 *
608 * Clone bio. Caller will own the returned bio, but not the actual data it
609 * points to. Reference count of returned bio will be one.
610 */
611struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
612 struct bio_set *bs)
613{
614 struct bvec_iter iter;
615 struct bio_vec bv;
616 struct bio *bio;
617
618 /*
619 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
620 * bio_src->bi_io_vec to bio->bi_io_vec.
621 *
622 * We can't do that anymore, because:
623 *
624 * - The point of cloning the biovec is to produce a bio with a biovec
625 * the caller can modify: bi_idx and bi_bvec_done should be 0.
626 *
627 * - The original bio could've had more than BIO_MAX_PAGES biovecs; if
628 * we tried to clone the whole thing bio_alloc_bioset() would fail.
629 * But the clone should succeed as long as the number of biovecs we
630 * actually need to allocate is fewer than BIO_MAX_PAGES.
631 *
632 * - Lastly, bi_vcnt should not be looked at or relied upon by code
633 * that does not own the bio - reason being drivers don't use it for
634 * iterating over the biovec anymore, so expecting it to be kept up
635 * to date (i.e. for clones that share the parent biovec) is just
636 * asking for trouble and would force extra work on
637 * __bio_clone_fast() anyways.
638 */
639
640 bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
641 if (!bio)
642 return NULL;
643
644 bio->bi_bdev = bio_src->bi_bdev;
645 bio->bi_rw = bio_src->bi_rw;
646 bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
647 bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
648
649 if (bio->bi_rw & REQ_DISCARD)
650 goto integrity_clone;
651
652 if (bio->bi_rw & REQ_WRITE_SAME) {
653 bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
654 goto integrity_clone;
655 }
656
657 bio_for_each_segment(bv, bio_src, iter)
658 bio->bi_io_vec[bio->bi_vcnt++] = bv;
659
660integrity_clone:
661 if (bio_integrity(bio_src)) {
662 int ret;
663
664 ret = bio_integrity_clone(bio, bio_src, gfp_mask);
665 if (ret < 0) {
666 bio_put(bio);
667 return NULL;
668 }
669 }
670
671 return bio;
672}
577EXPORT_SYMBOL(bio_clone_bioset); 673EXPORT_SYMBOL(bio_clone_bioset);
578 674
579/** 675/**
@@ -612,7 +708,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
612 if (unlikely(bio_flagged(bio, BIO_CLONED))) 708 if (unlikely(bio_flagged(bio, BIO_CLONED)))
613 return 0; 709 return 0;
614 710
615 if (((bio->bi_size + len) >> 9) > max_sectors) 711 if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
616 return 0; 712 return 0;
617 713
618 /* 714 /*
@@ -635,8 +731,9 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
635 simulate merging updated prev_bvec 731 simulate merging updated prev_bvec
636 as new bvec. */ 732 as new bvec. */
637 .bi_bdev = bio->bi_bdev, 733 .bi_bdev = bio->bi_bdev,
638 .bi_sector = bio->bi_sector, 734 .bi_sector = bio->bi_iter.bi_sector,
639 .bi_size = bio->bi_size - prev_bv_len, 735 .bi_size = bio->bi_iter.bi_size -
736 prev_bv_len,
640 .bi_rw = bio->bi_rw, 737 .bi_rw = bio->bi_rw,
641 }; 738 };
642 739
@@ -684,8 +781,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
684 if (q->merge_bvec_fn) { 781 if (q->merge_bvec_fn) {
685 struct bvec_merge_data bvm = { 782 struct bvec_merge_data bvm = {
686 .bi_bdev = bio->bi_bdev, 783 .bi_bdev = bio->bi_bdev,
687 .bi_sector = bio->bi_sector, 784 .bi_sector = bio->bi_iter.bi_sector,
688 .bi_size = bio->bi_size, 785 .bi_size = bio->bi_iter.bi_size,
689 .bi_rw = bio->bi_rw, 786 .bi_rw = bio->bi_rw,
690 }; 787 };
691 788
@@ -708,7 +805,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
708 bio->bi_vcnt++; 805 bio->bi_vcnt++;
709 bio->bi_phys_segments++; 806 bio->bi_phys_segments++;
710 done: 807 done:
711 bio->bi_size += len; 808 bio->bi_iter.bi_size += len;
712 return len; 809 return len;
713} 810}
714 811
@@ -807,28 +904,7 @@ void bio_advance(struct bio *bio, unsigned bytes)
807 if (bio_integrity(bio)) 904 if (bio_integrity(bio))
808 bio_integrity_advance(bio, bytes); 905 bio_integrity_advance(bio, bytes);
809 906
810 bio->bi_sector += bytes >> 9; 907 bio_advance_iter(bio, &bio->bi_iter, bytes);
811 bio->bi_size -= bytes;
812
813 if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK)
814 return;
815
816 while (bytes) {
817 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
818 WARN_ONCE(1, "bio idx %d >= vcnt %d\n",
819 bio->bi_idx, bio->bi_vcnt);
820 break;
821 }
822
823 if (bytes >= bio_iovec(bio)->bv_len) {
824 bytes -= bio_iovec(bio)->bv_len;
825 bio->bi_idx++;
826 } else {
827 bio_iovec(bio)->bv_len -= bytes;
828 bio_iovec(bio)->bv_offset += bytes;
829 bytes = 0;
830 }
831 }
832} 908}
833EXPORT_SYMBOL(bio_advance); 909EXPORT_SYMBOL(bio_advance);
834 910
@@ -874,117 +950,80 @@ EXPORT_SYMBOL(bio_alloc_pages);
874 */ 950 */
875void bio_copy_data(struct bio *dst, struct bio *src) 951void bio_copy_data(struct bio *dst, struct bio *src)
876{ 952{
877 struct bio_vec *src_bv, *dst_bv; 953 struct bvec_iter src_iter, dst_iter;
878 unsigned src_offset, dst_offset, bytes; 954 struct bio_vec src_bv, dst_bv;
879 void *src_p, *dst_p; 955 void *src_p, *dst_p;
956 unsigned bytes;
880 957
881 src_bv = bio_iovec(src); 958 src_iter = src->bi_iter;
882 dst_bv = bio_iovec(dst); 959 dst_iter = dst->bi_iter;
883
884 src_offset = src_bv->bv_offset;
885 dst_offset = dst_bv->bv_offset;
886 960
887 while (1) { 961 while (1) {
888 if (src_offset == src_bv->bv_offset + src_bv->bv_len) { 962 if (!src_iter.bi_size) {
889 src_bv++; 963 src = src->bi_next;
890 if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) { 964 if (!src)
891 src = src->bi_next; 965 break;
892 if (!src)
893 break;
894
895 src_bv = bio_iovec(src);
896 }
897 966
898 src_offset = src_bv->bv_offset; 967 src_iter = src->bi_iter;
899 } 968 }
900 969
901 if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) { 970 if (!dst_iter.bi_size) {
902 dst_bv++; 971 dst = dst->bi_next;
903 if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) { 972 if (!dst)
904 dst = dst->bi_next; 973 break;
905 if (!dst)
906 break;
907
908 dst_bv = bio_iovec(dst);
909 }
910 974
911 dst_offset = dst_bv->bv_offset; 975 dst_iter = dst->bi_iter;
912 } 976 }
913 977
914 bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset, 978 src_bv = bio_iter_iovec(src, src_iter);
915 src_bv->bv_offset + src_bv->bv_len - src_offset); 979 dst_bv = bio_iter_iovec(dst, dst_iter);
916 980
917 src_p = kmap_atomic(src_bv->bv_page); 981 bytes = min(src_bv.bv_len, dst_bv.bv_len);
918 dst_p = kmap_atomic(dst_bv->bv_page);
919 982
920 memcpy(dst_p + dst_offset, 983 src_p = kmap_atomic(src_bv.bv_page);
921 src_p + src_offset, 984 dst_p = kmap_atomic(dst_bv.bv_page);
985
986 memcpy(dst_p + dst_bv.bv_offset,
987 src_p + src_bv.bv_offset,
922 bytes); 988 bytes);
923 989
924 kunmap_atomic(dst_p); 990 kunmap_atomic(dst_p);
925 kunmap_atomic(src_p); 991 kunmap_atomic(src_p);
926 992
927 src_offset += bytes; 993 bio_advance_iter(src, &src_iter, bytes);
928 dst_offset += bytes; 994 bio_advance_iter(dst, &dst_iter, bytes);
929 } 995 }
930} 996}
931EXPORT_SYMBOL(bio_copy_data); 997EXPORT_SYMBOL(bio_copy_data);
932 998
933struct bio_map_data { 999struct bio_map_data {
934 struct bio_vec *iovecs;
935 struct sg_iovec *sgvecs;
936 int nr_sgvecs; 1000 int nr_sgvecs;
937 int is_our_pages; 1001 int is_our_pages;
1002 struct sg_iovec sgvecs[];
938}; 1003};
939 1004
940static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, 1005static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
941 struct sg_iovec *iov, int iov_count, 1006 struct sg_iovec *iov, int iov_count,
942 int is_our_pages) 1007 int is_our_pages)
943{ 1008{
944 memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
945 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); 1009 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
946 bmd->nr_sgvecs = iov_count; 1010 bmd->nr_sgvecs = iov_count;
947 bmd->is_our_pages = is_our_pages; 1011 bmd->is_our_pages = is_our_pages;
948 bio->bi_private = bmd; 1012 bio->bi_private = bmd;
949} 1013}
950 1014
951static void bio_free_map_data(struct bio_map_data *bmd)
952{
953 kfree(bmd->iovecs);
954 kfree(bmd->sgvecs);
955 kfree(bmd);
956}
957
958static struct bio_map_data *bio_alloc_map_data(int nr_segs, 1015static struct bio_map_data *bio_alloc_map_data(int nr_segs,
959 unsigned int iov_count, 1016 unsigned int iov_count,
960 gfp_t gfp_mask) 1017 gfp_t gfp_mask)
961{ 1018{
962 struct bio_map_data *bmd;
963
964 if (iov_count > UIO_MAXIOV) 1019 if (iov_count > UIO_MAXIOV)
965 return NULL; 1020 return NULL;
966 1021
967 bmd = kmalloc(sizeof(*bmd), gfp_mask); 1022 return kmalloc(sizeof(struct bio_map_data) +
968 if (!bmd) 1023 sizeof(struct sg_iovec) * iov_count, gfp_mask);
969 return NULL;
970
971 bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
972 if (!bmd->iovecs) {
973 kfree(bmd);
974 return NULL;
975 }
976
977 bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
978 if (bmd->sgvecs)
979 return bmd;
980
981 kfree(bmd->iovecs);
982 kfree(bmd);
983 return NULL;
984} 1024}
985 1025
986static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, 1026static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
987 struct sg_iovec *iov, int iov_count,
988 int to_user, int from_user, int do_free_page) 1027 int to_user, int from_user, int do_free_page)
989{ 1028{
990 int ret = 0, i; 1029 int ret = 0, i;
@@ -994,7 +1033,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
994 1033
995 bio_for_each_segment_all(bvec, bio, i) { 1034 bio_for_each_segment_all(bvec, bio, i) {
996 char *bv_addr = page_address(bvec->bv_page); 1035 char *bv_addr = page_address(bvec->bv_page);
997 unsigned int bv_len = iovecs[i].bv_len; 1036 unsigned int bv_len = bvec->bv_len;
998 1037
999 while (bv_len && iov_idx < iov_count) { 1038 while (bv_len && iov_idx < iov_count) {
1000 unsigned int bytes; 1039 unsigned int bytes;
@@ -1054,14 +1093,14 @@ int bio_uncopy_user(struct bio *bio)
1054 * don't copy into a random user address space, just free. 1093 * don't copy into a random user address space, just free.
1055 */ 1094 */
1056 if (current->mm) 1095 if (current->mm)
1057 ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, 1096 ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs,
1058 bmd->nr_sgvecs, bio_data_dir(bio) == READ, 1097 bio_data_dir(bio) == READ,
1059 0, bmd->is_our_pages); 1098 0, bmd->is_our_pages);
1060 else if (bmd->is_our_pages) 1099 else if (bmd->is_our_pages)
1061 bio_for_each_segment_all(bvec, bio, i) 1100 bio_for_each_segment_all(bvec, bio, i)
1062 __free_page(bvec->bv_page); 1101 __free_page(bvec->bv_page);
1063 } 1102 }
1064 bio_free_map_data(bmd); 1103 kfree(bmd);
1065 bio_put(bio); 1104 bio_put(bio);
1066 return ret; 1105 return ret;
1067} 1106}
@@ -1175,7 +1214,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
1175 */ 1214 */
1176 if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || 1215 if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
1177 (map_data && map_data->from_user)) { 1216 (map_data && map_data->from_user)) {
1178 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0); 1217 ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0);
1179 if (ret) 1218 if (ret)
1180 goto cleanup; 1219 goto cleanup;
1181 } 1220 }
@@ -1189,7 +1228,7 @@ cleanup:
1189 1228
1190 bio_put(bio); 1229 bio_put(bio);
1191out_bmd: 1230out_bmd:
1192 bio_free_map_data(bmd); 1231 kfree(bmd);
1193 return ERR_PTR(ret); 1232 return ERR_PTR(ret);
1194} 1233}
1195 1234
@@ -1485,7 +1524,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
1485 if (IS_ERR(bio)) 1524 if (IS_ERR(bio))
1486 return bio; 1525 return bio;
1487 1526
1488 if (bio->bi_size == len) 1527 if (bio->bi_iter.bi_size == len)
1489 return bio; 1528 return bio;
1490 1529
1491 /* 1530 /*
@@ -1506,16 +1545,15 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
1506 1545
1507 bio_for_each_segment_all(bvec, bio, i) { 1546 bio_for_each_segment_all(bvec, bio, i) {
1508 char *addr = page_address(bvec->bv_page); 1547 char *addr = page_address(bvec->bv_page);
1509 int len = bmd->iovecs[i].bv_len;
1510 1548
1511 if (read) 1549 if (read)
1512 memcpy(p, addr, len); 1550 memcpy(p, addr, bvec->bv_len);
1513 1551
1514 __free_page(bvec->bv_page); 1552 __free_page(bvec->bv_page);
1515 p += len; 1553 p += bvec->bv_len;
1516 } 1554 }
1517 1555
1518 bio_free_map_data(bmd); 1556 kfree(bmd);
1519 bio_put(bio); 1557 bio_put(bio);
1520} 1558}
1521 1559
@@ -1686,11 +1724,11 @@ void bio_check_pages_dirty(struct bio *bio)
1686#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1724#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
1687void bio_flush_dcache_pages(struct bio *bi) 1725void bio_flush_dcache_pages(struct bio *bi)
1688{ 1726{
1689 int i; 1727 struct bio_vec bvec;
1690 struct bio_vec *bvec; 1728 struct bvec_iter iter;
1691 1729
1692 bio_for_each_segment(bvec, bi, i) 1730 bio_for_each_segment(bvec, bi, iter)
1693 flush_dcache_page(bvec->bv_page); 1731 flush_dcache_page(bvec.bv_page);
1694} 1732}
1695EXPORT_SYMBOL(bio_flush_dcache_pages); 1733EXPORT_SYMBOL(bio_flush_dcache_pages);
1696#endif 1734#endif
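
The conversion above shows the immutable-biovec iteration style this series introduces: bio_for_each_segment now yields a struct bio_vec by value and keeps its position in a separate struct bvec_iter instead of an integer index. A minimal sketch written against that style, assuming nothing beyond <linux/bio.h> (the helper name is invented):

#include <linux/bio.h>

/* Sum the payload bytes of a bio via the bvec_iter-based iterator.
 * Illustrative only; the same number is available as bio->bi_iter.bi_size. */
static unsigned int example_bio_bytes(struct bio *bio)
{
	struct bio_vec bvec;	/* copied out per segment, not a pointer */
	struct bvec_iter iter;	/* cursor, replaces the old integer index */
	unsigned int bytes = 0;

	bio_for_each_segment(bvec, bio, iter)
		bytes += bvec.bv_len;

	return bytes;
}
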
@@ -1711,96 +1749,86 @@ EXPORT_SYMBOL(bio_flush_dcache_pages);
1711 **/ 1749 **/
1712void bio_endio(struct bio *bio, int error) 1750void bio_endio(struct bio *bio, int error)
1713{ 1751{
1714 if (error) 1752 while (bio) {
1715 clear_bit(BIO_UPTODATE, &bio->bi_flags); 1753 BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
1716 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1717 error = -EIO;
1718 1754
1719 if (bio->bi_end_io) 1755 if (error)
1720 bio->bi_end_io(bio, error); 1756 clear_bit(BIO_UPTODATE, &bio->bi_flags);
1721} 1757 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1722EXPORT_SYMBOL(bio_endio); 1758 error = -EIO;
1723 1759
1724void bio_pair_release(struct bio_pair *bp) 1760 if (!atomic_dec_and_test(&bio->bi_remaining))
1725{ 1761 return;
1726 if (atomic_dec_and_test(&bp->cnt)) {
1727 struct bio *master = bp->bio1.bi_private;
1728 1762
1729 bio_endio(master, bp->error); 1763 /*
1730 mempool_free(bp, bp->bio2.bi_private); 1764 * Need to have a real endio function for chained bios,
1765 * otherwise various corner cases will break (like stacking
1766 * block devices that save/restore bi_end_io) - however, we want
1767 * to avoid unbounded recursion and blowing the stack. Tail call
1768 * optimization would handle this, but compiling with frame
1769 * pointers also disables gcc's sibling call optimization.
1770 */
1771 if (bio->bi_end_io == bio_chain_endio) {
1772 struct bio *parent = bio->bi_private;
1773 bio_put(bio);
1774 bio = parent;
1775 } else {
1776 if (bio->bi_end_io)
1777 bio->bi_end_io(bio, error);
1778 bio = NULL;
1779 }
1731 } 1780 }
1732} 1781}
1733EXPORT_SYMBOL(bio_pair_release); 1782EXPORT_SYMBOL(bio_endio);
1734
1735static void bio_pair_end_1(struct bio *bi, int err)
1736{
1737 struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
1738
1739 if (err)
1740 bp->error = err;
1741
1742 bio_pair_release(bp);
1743}
1744 1783
1745static void bio_pair_end_2(struct bio *bi, int err) 1784/**
1785 * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
1786 * @bio: bio
1787 * @error: error, if any
1788 *
 1789 * For code that has saved and restored bi_end_io; think hard before using this
 1790 * function, you probably should have cloned the entire bio.
1791 **/
1792void bio_endio_nodec(struct bio *bio, int error)
1746{ 1793{
1747 struct bio_pair *bp = container_of(bi, struct bio_pair, bio2); 1794 atomic_inc(&bio->bi_remaining);
1748 1795 bio_endio(bio, error);
1749 if (err)
1750 bp->error = err;
1751
1752 bio_pair_release(bp);
1753} 1796}
1797EXPORT_SYMBOL(bio_endio_nodec);
1754 1798
1755/* 1799/**
1756 * split a bio - only worry about a bio with a single page in its iovec 1800 * bio_split - split a bio
1801 * @bio: bio to split
1802 * @sectors: number of sectors to split from the front of @bio
1803 * @gfp: gfp mask
1804 * @bs: bio set to allocate from
1805 *
1806 * Allocates and returns a new bio which represents @sectors from the start of
1807 * @bio, and updates @bio to represent the remaining sectors.
1808 *
1809 * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's
1810 * responsibility to ensure that @bio is not freed before the split.
1757 */ 1811 */
1758struct bio_pair *bio_split(struct bio *bi, int first_sectors) 1812struct bio *bio_split(struct bio *bio, int sectors,
1813 gfp_t gfp, struct bio_set *bs)
1759{ 1814{
1760 struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); 1815 struct bio *split = NULL;
1761
1762 if (!bp)
1763 return bp;
1764
1765 trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
1766 bi->bi_sector + first_sectors);
1767
1768 BUG_ON(bio_segments(bi) > 1);
1769 atomic_set(&bp->cnt, 3);
1770 bp->error = 0;
1771 bp->bio1 = *bi;
1772 bp->bio2 = *bi;
1773 bp->bio2.bi_sector += first_sectors;
1774 bp->bio2.bi_size -= first_sectors << 9;
1775 bp->bio1.bi_size = first_sectors << 9;
1776
1777 if (bi->bi_vcnt != 0) {
1778 bp->bv1 = *bio_iovec(bi);
1779 bp->bv2 = *bio_iovec(bi);
1780
1781 if (bio_is_rw(bi)) {
1782 bp->bv2.bv_offset += first_sectors << 9;
1783 bp->bv2.bv_len -= first_sectors << 9;
1784 bp->bv1.bv_len = first_sectors << 9;
1785 }
1786 1816
1787 bp->bio1.bi_io_vec = &bp->bv1; 1817 BUG_ON(sectors <= 0);
1788 bp->bio2.bi_io_vec = &bp->bv2; 1818 BUG_ON(sectors >= bio_sectors(bio));
1789 1819
1790 bp->bio1.bi_max_vecs = 1; 1820 split = bio_clone_fast(bio, gfp, bs);
1791 bp->bio2.bi_max_vecs = 1; 1821 if (!split)
1792 } 1822 return NULL;
1793 1823
1794 bp->bio1.bi_end_io = bio_pair_end_1; 1824 split->bi_iter.bi_size = sectors << 9;
1795 bp->bio2.bi_end_io = bio_pair_end_2;
1796 1825
1797 bp->bio1.bi_private = bi; 1826 if (bio_integrity(split))
1798 bp->bio2.bi_private = bio_split_pool; 1827 bio_integrity_trim(split, 0, sectors);
1799 1828
1800 if (bio_integrity(bi)) 1829 bio_advance(bio, split->bi_iter.bi_size);
1801 bio_integrity_split(bi, bp, first_sectors);
1802 1830
1803 return bp; 1831 return split;
1804} 1832}
1805EXPORT_SYMBOL(bio_split); 1833EXPORT_SYMBOL(bio_split);
1806 1834
@@ -1814,80 +1842,20 @@ void bio_trim(struct bio *bio, int offset, int size)
1814{ 1842{
1815 /* 'bio' is a cloned bio which we need to trim to match 1843 /* 'bio' is a cloned bio which we need to trim to match
1816 * the given offset and size. 1844 * the given offset and size.
1817 * This requires adjusting bi_sector, bi_size, and bi_io_vec
1818 */ 1845 */
1819 int i;
1820 struct bio_vec *bvec;
1821 int sofar = 0;
1822 1846
1823 size <<= 9; 1847 size <<= 9;
1824 if (offset == 0 && size == bio->bi_size) 1848 if (offset == 0 && size == bio->bi_iter.bi_size)
1825 return; 1849 return;
1826 1850
1827 clear_bit(BIO_SEG_VALID, &bio->bi_flags); 1851 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1828 1852
1829 bio_advance(bio, offset << 9); 1853 bio_advance(bio, offset << 9);
1830 1854
1831 bio->bi_size = size; 1855 bio->bi_iter.bi_size = size;
1832
 1833 /* avoid any complications with bi_idx being non-zero */
1834 if (bio->bi_idx) {
1835 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
1836 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
1837 bio->bi_vcnt -= bio->bi_idx;
1838 bio->bi_idx = 0;
1839 }
1840 /* Make sure vcnt and last bv are not too big */
1841 bio_for_each_segment(bvec, bio, i) {
1842 if (sofar + bvec->bv_len > size)
1843 bvec->bv_len = size - sofar;
1844 if (bvec->bv_len == 0) {
1845 bio->bi_vcnt = i;
1846 break;
1847 }
1848 sofar += bvec->bv_len;
1849 }
1850} 1856}
1851EXPORT_SYMBOL_GPL(bio_trim); 1857EXPORT_SYMBOL_GPL(bio_trim);
1852 1858
1853/**
1854 * bio_sector_offset - Find hardware sector offset in bio
1855 * @bio: bio to inspect
1856 * @index: bio_vec index
1857 * @offset: offset in bv_page
1858 *
1859 * Return the number of hardware sectors between beginning of bio
1860 * and an end point indicated by a bio_vec index and an offset
1861 * within that vector's page.
1862 */
1863sector_t bio_sector_offset(struct bio *bio, unsigned short index,
1864 unsigned int offset)
1865{
1866 unsigned int sector_sz;
1867 struct bio_vec *bv;
1868 sector_t sectors;
1869 int i;
1870
1871 sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
1872 sectors = 0;
1873
1874 if (index >= bio->bi_idx)
1875 index = bio->bi_vcnt - 1;
1876
1877 bio_for_each_segment_all(bv, bio, i) {
1878 if (i == index) {
1879 if (offset > bv->bv_offset)
1880 sectors += (offset - bv->bv_offset) / sector_sz;
1881 break;
1882 }
1883
1884 sectors += bv->bv_len / sector_sz;
1885 }
1886
1887 return sectors;
1888}
1889EXPORT_SYMBOL(bio_sector_offset);
1890
1891/* 1859/*
1892 * create memory pools for biovec's in a bio_set. 1860 * create memory pools for biovec's in a bio_set.
1893 * use the global biovec slabs created for general use. 1861 * use the global biovec slabs created for general use.
@@ -2065,11 +2033,6 @@ static int __init init_bio(void)
2065 if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE)) 2033 if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
2066 panic("bio: can't create integrity pool\n"); 2034 panic("bio: can't create integrity pool\n");
2067 2035
2068 bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
2069 sizeof(struct bio_pair));
2070 if (!bio_split_pool)
2071 panic("bio: can't create split pool\n");
2072
2073 return 0; 2036 return 0;
2074} 2037}
2075subsys_initcall(init_bio); 2038subsys_initcall(init_bio);
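
The bio_endio_nodec comment above warns about code that saves and restores bi_end_io. A hedged sketch of the situation it exists for, with the context structure and all names invented for illustration:

#include <linux/bio.h>
#include <linux/slab.h>

/* Hypothetical per-bio state a stacking driver might attach. */
struct example_ctx {
	bio_end_io_t	*saved_end_io;
	void		*saved_private;
};

static void example_end_io(struct bio *bio, int error)
{
	struct example_ctx *ctx = bio->bi_private;

	/* Put back what we hijacked before completing the bio. */
	bio->bi_end_io = ctx->saved_end_io;
	bio->bi_private = ctx->saved_private;
	kfree(ctx);

	/*
	 * bio_endio() already decremented bi_remaining on the way into
	 * this handler, so complete without a second decrement.
	 */
	bio_endio_nodec(bio, error);
}
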
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index aa976eced2d2..a66768ebc8d1 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,6 +1,7 @@
1config BTRFS_FS 1config BTRFS_FS
2 tristate "Btrfs filesystem support" 2 tristate "Btrfs filesystem support"
3 select LIBCRC32C 3 select CRYPTO
4 select CRYPTO_CRC32C
4 select ZLIB_INFLATE 5 select ZLIB_INFLATE
5 select ZLIB_DEFLATE 6 select ZLIB_DEFLATE
6 select LZO_COMPRESS 7 select LZO_COMPRESS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 1a44e42d602a..f341a98031d2 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ 11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
12 uuid-tree.o 12 uuid-tree.o props.o hash.o
13 13
14btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 14btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
15btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o 15btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0890c83643e9..ff9b3995d453 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -35,13 +35,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
35 char *value = NULL; 35 char *value = NULL;
36 struct posix_acl *acl; 36 struct posix_acl *acl;
37 37
38 if (!IS_POSIXACL(inode))
39 return NULL;
40
41 acl = get_cached_acl(inode, type);
42 if (acl != ACL_NOT_CACHED)
43 return acl;
44
45 switch (type) { 38 switch (type) {
46 case ACL_TYPE_ACCESS: 39 case ACL_TYPE_ACCESS:
47 name = POSIX_ACL_XATTR_ACCESS; 40 name = POSIX_ACL_XATTR_ACCESS;
@@ -76,31 +69,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
76 return acl; 69 return acl;
77} 70}
78 71
79static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
80 void *value, size_t size, int type)
81{
82 struct posix_acl *acl;
83 int ret = 0;
84
85 if (!IS_POSIXACL(dentry->d_inode))
86 return -EOPNOTSUPP;
87
88 acl = btrfs_get_acl(dentry->d_inode, type);
89
90 if (IS_ERR(acl))
91 return PTR_ERR(acl);
92 if (acl == NULL)
93 return -ENODATA;
94 ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
95 posix_acl_release(acl);
96
97 return ret;
98}
99
100/* 72/*
101 * Needs to be called with fs_mutex held 73 * Needs to be called with fs_mutex held
102 */ 74 */
103static int btrfs_set_acl(struct btrfs_trans_handle *trans, 75static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
104 struct inode *inode, struct posix_acl *acl, int type) 76 struct inode *inode, struct posix_acl *acl, int type)
105{ 77{
106 int ret, size = 0; 78 int ret, size = 0;
@@ -158,35 +130,9 @@ out:
158 return ret; 130 return ret;
159} 131}
160 132
161static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, 133int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
162 const void *value, size_t size, int flags, int type)
163{ 134{
164 int ret; 135 return __btrfs_set_acl(NULL, inode, acl, type);
165 struct posix_acl *acl = NULL;
166
167 if (!inode_owner_or_capable(dentry->d_inode))
168 return -EPERM;
169
170 if (!IS_POSIXACL(dentry->d_inode))
171 return -EOPNOTSUPP;
172
173 if (value) {
174 acl = posix_acl_from_xattr(&init_user_ns, value, size);
175 if (IS_ERR(acl))
176 return PTR_ERR(acl);
177
178 if (acl) {
179 ret = posix_acl_valid(acl);
180 if (ret)
181 goto out;
182 }
183 }
184
185 ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
186out:
187 posix_acl_release(acl);
188
189 return ret;
190} 136}
191 137
192/* 138/*
@@ -197,83 +143,31 @@ out:
197int btrfs_init_acl(struct btrfs_trans_handle *trans, 143int btrfs_init_acl(struct btrfs_trans_handle *trans,
198 struct inode *inode, struct inode *dir) 144 struct inode *inode, struct inode *dir)
199{ 145{
200 struct posix_acl *acl = NULL; 146 struct posix_acl *default_acl, *acl;
201 int ret = 0; 147 int ret = 0;
202 148
203 /* this happens with subvols */ 149 /* this happens with subvols */
204 if (!dir) 150 if (!dir)
205 return 0; 151 return 0;
206 152
207 if (!S_ISLNK(inode->i_mode)) { 153 ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
208 if (IS_POSIXACL(dir)) { 154 if (ret)
209 acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); 155 return ret;
210 if (IS_ERR(acl))
211 return PTR_ERR(acl);
212 }
213 156
214 if (!acl) 157 if (default_acl) {
215 inode->i_mode &= ~current_umask(); 158 ret = __btrfs_set_acl(trans, inode, default_acl,
159 ACL_TYPE_DEFAULT);
160 posix_acl_release(default_acl);
216 } 161 }
217 162
218 if (IS_POSIXACL(dir) && acl) { 163 if (acl) {
219 if (S_ISDIR(inode->i_mode)) { 164 if (!ret)
220 ret = btrfs_set_acl(trans, inode, acl, 165 ret = __btrfs_set_acl(trans, inode, acl,
221 ACL_TYPE_DEFAULT); 166 ACL_TYPE_ACCESS);
222 if (ret) 167 posix_acl_release(acl);
223 goto failed;
224 }
225 ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
226 if (ret < 0)
227 return ret;
228
229 if (ret > 0) {
230 /* we need an acl */
231 ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
232 } else if (ret < 0) {
233 cache_no_acl(inode);
234 }
235 } else {
236 cache_no_acl(inode);
237 } 168 }
238failed:
239 posix_acl_release(acl);
240
241 return ret;
242}
243 169
244int btrfs_acl_chmod(struct inode *inode) 170 if (!default_acl && !acl)
245{ 171 cache_no_acl(inode);
246 struct posix_acl *acl;
247 int ret = 0;
248
249 if (S_ISLNK(inode->i_mode))
250 return -EOPNOTSUPP;
251
252 if (!IS_POSIXACL(inode))
253 return 0;
254
255 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
256 if (IS_ERR_OR_NULL(acl))
257 return PTR_ERR(acl);
258
259 ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
260 if (ret)
261 return ret;
262 ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS);
263 posix_acl_release(acl);
264 return ret; 172 return ret;
265} 173}
266
267const struct xattr_handler btrfs_xattr_acl_default_handler = {
268 .prefix = POSIX_ACL_XATTR_DEFAULT,
269 .flags = ACL_TYPE_DEFAULT,
270 .get = btrfs_xattr_acl_get,
271 .set = btrfs_xattr_acl_set,
272};
273
274const struct xattr_handler btrfs_xattr_acl_access_handler = {
275 .prefix = POSIX_ACL_XATTR_ACCESS,
276 .flags = ACL_TYPE_ACCESS,
277 .get = btrfs_xattr_acl_get,
278 .set = btrfs_xattr_acl_set,
279};
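
This file shrinks because v3.14 moves POSIX ACL handling into the VFS: the hand-rolled xattr handlers above are deleted and btrfs_set_acl takes the prototype the generic layer calls. A sketch of how a filesystem wires into that layer; treat the exact field and handler names as assumptions from the generic ACL rework rather than something this diff shows:

#include <linux/fs.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>

/* Reuse the VFS-provided ACL xattr handlers instead of private ones. */
static const struct xattr_handler *example_xattr_handlers[] = {
	&posix_acl_access_xattr_handler,
	&posix_acl_default_xattr_handler,
	NULL,
};

/* get/set route through the generic code, which calls ->set_acl. */
static const struct inode_operations example_dir_iops = {
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
};
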
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 3775947429b2..aded3ef3d3d4 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -66,6 +66,16 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
66 return 0; 66 return 0;
67} 67}
68 68
69static void free_inode_elem_list(struct extent_inode_elem *eie)
70{
71 struct extent_inode_elem *eie_next;
72
73 for (; eie; eie = eie_next) {
74 eie_next = eie->next;
75 kfree(eie);
76 }
77}
78
69static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, 79static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
70 u64 extent_item_pos, 80 u64 extent_item_pos,
71 struct extent_inode_elem **eie) 81 struct extent_inode_elem **eie)
@@ -209,18 +219,19 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
209} 219}
210 220
211static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, 221static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
212 struct ulist *parents, int level, 222 struct ulist *parents, struct __prelim_ref *ref,
213 struct btrfs_key *key_for_search, u64 time_seq, 223 int level, u64 time_seq, const u64 *extent_item_pos)
214 u64 wanted_disk_byte,
215 const u64 *extent_item_pos)
216{ 224{
217 int ret = 0; 225 int ret = 0;
218 int slot; 226 int slot;
219 struct extent_buffer *eb; 227 struct extent_buffer *eb;
220 struct btrfs_key key; 228 struct btrfs_key key;
229 struct btrfs_key *key_for_search = &ref->key_for_search;
221 struct btrfs_file_extent_item *fi; 230 struct btrfs_file_extent_item *fi;
222 struct extent_inode_elem *eie = NULL, *old = NULL; 231 struct extent_inode_elem *eie = NULL, *old = NULL;
223 u64 disk_byte; 232 u64 disk_byte;
233 u64 wanted_disk_byte = ref->wanted_disk_byte;
234 u64 count = 0;
224 235
225 if (level != 0) { 236 if (level != 0) {
226 eb = path->nodes[level]; 237 eb = path->nodes[level];
@@ -238,7 +249,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
238 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) 249 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
239 ret = btrfs_next_old_leaf(root, path, time_seq); 250 ret = btrfs_next_old_leaf(root, path, time_seq);
240 251
241 while (!ret) { 252 while (!ret && count < ref->count) {
242 eb = path->nodes[0]; 253 eb = path->nodes[0];
243 slot = path->slots[0]; 254 slot = path->slots[0];
244 255
@@ -254,6 +265,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
254 if (disk_byte == wanted_disk_byte) { 265 if (disk_byte == wanted_disk_byte) {
255 eie = NULL; 266 eie = NULL;
256 old = NULL; 267 old = NULL;
268 count++;
257 if (extent_item_pos) { 269 if (extent_item_pos) {
258 ret = check_extent_in_eb(&key, eb, fi, 270 ret = check_extent_in_eb(&key, eb, fi,
259 *extent_item_pos, 271 *extent_item_pos,
@@ -273,6 +285,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
273 old = old->next; 285 old = old->next;
274 old->next = eie; 286 old->next = eie;
275 } 287 }
288 eie = NULL;
276 } 289 }
277next: 290next:
278 ret = btrfs_next_old_item(root, path, time_seq); 291 ret = btrfs_next_old_item(root, path, time_seq);
@@ -280,6 +293,8 @@ next:
280 293
281 if (ret > 0) 294 if (ret > 0)
282 ret = 0; 295 ret = 0;
296 else if (ret < 0)
297 free_inode_elem_list(eie);
283 return ret; 298 return ret;
284} 299}
285 300
@@ -299,23 +314,34 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
299 int ret = 0; 314 int ret = 0;
300 int root_level; 315 int root_level;
301 int level = ref->level; 316 int level = ref->level;
317 int index;
302 318
303 root_key.objectid = ref->root_id; 319 root_key.objectid = ref->root_id;
304 root_key.type = BTRFS_ROOT_ITEM_KEY; 320 root_key.type = BTRFS_ROOT_ITEM_KEY;
305 root_key.offset = (u64)-1; 321 root_key.offset = (u64)-1;
322
323 index = srcu_read_lock(&fs_info->subvol_srcu);
324
306 root = btrfs_read_fs_root_no_name(fs_info, &root_key); 325 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
307 if (IS_ERR(root)) { 326 if (IS_ERR(root)) {
327 srcu_read_unlock(&fs_info->subvol_srcu, index);
308 ret = PTR_ERR(root); 328 ret = PTR_ERR(root);
309 goto out; 329 goto out;
310 } 330 }
311 331
312 root_level = btrfs_old_root_level(root, time_seq); 332 root_level = btrfs_old_root_level(root, time_seq);
313 333
314 if (root_level + 1 == level) 334 if (root_level + 1 == level) {
335 srcu_read_unlock(&fs_info->subvol_srcu, index);
315 goto out; 336 goto out;
337 }
316 338
317 path->lowest_level = level; 339 path->lowest_level = level;
318 ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); 340 ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
341
342 /* root node has been locked, we can release @subvol_srcu safely here */
343 srcu_read_unlock(&fs_info->subvol_srcu, index);
344
319 pr_debug("search slot in root %llu (level %d, ref count %d) returned " 345 pr_debug("search slot in root %llu (level %d, ref count %d) returned "
320 "%d for key (%llu %u %llu)\n", 346 "%d for key (%llu %u %llu)\n",
321 ref->root_id, level, ref->count, ret, 347 ref->root_id, level, ref->count, ret,
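
The srcu_read_lock/srcu_read_unlock bracketing added here keeps the subvolume root from being freed while it is looked up and its node locked. A minimal read-side sketch of that pattern (everything except the SRCU API itself is a placeholder):

#include <linux/srcu.h>

static struct srcu_struct example_srcu;	/* set up with init_srcu_struct() */

extern int do_lookup(void);	/* hypothetical work done under SRCU */

static int example_lookup(void)
{
	int idx, ret;

	idx = srcu_read_lock(&example_srcu);
	ret = do_lookup();	/* the looked-up object cannot vanish here */
	srcu_read_unlock(&example_srcu, idx);

	return ret;
}
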
@@ -334,9 +360,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
334 eb = path->nodes[level]; 360 eb = path->nodes[level];
335 } 361 }
336 362
337 ret = add_all_parents(root, path, parents, level, &ref->key_for_search, 363 ret = add_all_parents(root, path, parents, ref, level, time_seq,
338 time_seq, ref->wanted_disk_byte, 364 extent_item_pos);
339 extent_item_pos);
340out: 365out:
341 path->lowest_level = 0; 366 path->lowest_level = 0;
342 btrfs_release_path(path); 367 btrfs_release_path(path);
@@ -376,10 +401,16 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
376 continue; 401 continue;
377 err = __resolve_indirect_ref(fs_info, path, time_seq, ref, 402 err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
378 parents, extent_item_pos); 403 parents, extent_item_pos);
379 if (err == -ENOMEM) 404 /*
 380 goto out; 405 * we can only tolerate ENOENT; otherwise we should catch the error
381 if (err) 406 * and return directly.
407 */
408 if (err == -ENOENT) {
382 continue; 409 continue;
410 } else if (err) {
411 ret = err;
412 goto out;
413 }
383 414
384 /* we put the first parent into the ref at hand */ 415 /* we put the first parent into the ref at hand */
385 ULIST_ITER_INIT(&uiter); 416 ULIST_ITER_INIT(&uiter);
@@ -538,14 +569,13 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
538 if (extent_op && extent_op->update_key) 569 if (extent_op && extent_op->update_key)
539 btrfs_disk_key_to_cpu(&op_key, &extent_op->key); 570 btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
540 571
541 while ((n = rb_prev(n))) { 572 spin_lock(&head->lock);
573 n = rb_first(&head->ref_root);
574 while (n) {
542 struct btrfs_delayed_ref_node *node; 575 struct btrfs_delayed_ref_node *node;
543 node = rb_entry(n, struct btrfs_delayed_ref_node, 576 node = rb_entry(n, struct btrfs_delayed_ref_node,
544 rb_node); 577 rb_node);
545 if (node->bytenr != head->node.bytenr) 578 n = rb_next(n);
546 break;
547 WARN_ON(node->is_head);
548
549 if (node->seq > seq) 579 if (node->seq > seq)
550 continue; 580 continue;
551 581
@@ -612,10 +642,10 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
612 WARN_ON(1); 642 WARN_ON(1);
613 } 643 }
614 if (ret) 644 if (ret)
615 return ret; 645 break;
616 } 646 }
617 647 spin_unlock(&head->lock);
618 return 0; 648 return ret;
619} 649}
620 650
621/* 651/*
@@ -828,6 +858,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
828 struct list_head prefs_delayed; 858 struct list_head prefs_delayed;
829 struct list_head prefs; 859 struct list_head prefs;
830 struct __prelim_ref *ref; 860 struct __prelim_ref *ref;
861 struct extent_inode_elem *eie = NULL;
831 862
832 INIT_LIST_HEAD(&prefs); 863 INIT_LIST_HEAD(&prefs);
833 INIT_LIST_HEAD(&prefs_delayed); 864 INIT_LIST_HEAD(&prefs_delayed);
@@ -882,15 +913,15 @@ again:
882 btrfs_put_delayed_ref(&head->node); 913 btrfs_put_delayed_ref(&head->node);
883 goto again; 914 goto again;
884 } 915 }
916 spin_unlock(&delayed_refs->lock);
885 ret = __add_delayed_refs(head, time_seq, 917 ret = __add_delayed_refs(head, time_seq,
886 &prefs_delayed); 918 &prefs_delayed);
887 mutex_unlock(&head->mutex); 919 mutex_unlock(&head->mutex);
888 if (ret) { 920 if (ret)
889 spin_unlock(&delayed_refs->lock);
890 goto out; 921 goto out;
891 } 922 } else {
923 spin_unlock(&delayed_refs->lock);
892 } 924 }
893 spin_unlock(&delayed_refs->lock);
894 } 925 }
895 926
896 if (path->slots[0]) { 927 if (path->slots[0]) {
@@ -941,7 +972,6 @@ again:
941 goto out; 972 goto out;
942 } 973 }
943 if (ref->count && ref->parent) { 974 if (ref->count && ref->parent) {
944 struct extent_inode_elem *eie = NULL;
945 if (extent_item_pos && !ref->inode_list) { 975 if (extent_item_pos && !ref->inode_list) {
946 u32 bsz; 976 u32 bsz;
947 struct extent_buffer *eb; 977 struct extent_buffer *eb;
@@ -976,6 +1006,7 @@ again:
976 eie = eie->next; 1006 eie = eie->next;
977 eie->next = ref->inode_list; 1007 eie->next = ref->inode_list;
978 } 1008 }
1009 eie = NULL;
979 } 1010 }
980 list_del(&ref->list); 1011 list_del(&ref->list);
981 kmem_cache_free(btrfs_prelim_ref_cache, ref); 1012 kmem_cache_free(btrfs_prelim_ref_cache, ref);
@@ -994,7 +1025,8 @@ out:
994 list_del(&ref->list); 1025 list_del(&ref->list);
995 kmem_cache_free(btrfs_prelim_ref_cache, ref); 1026 kmem_cache_free(btrfs_prelim_ref_cache, ref);
996 } 1027 }
997 1028 if (ret < 0)
1029 free_inode_elem_list(eie);
998 return ret; 1030 return ret;
999} 1031}
1000 1032
@@ -1002,7 +1034,6 @@ static void free_leaf_list(struct ulist *blocks)
1002{ 1034{
1003 struct ulist_node *node = NULL; 1035 struct ulist_node *node = NULL;
1004 struct extent_inode_elem *eie; 1036 struct extent_inode_elem *eie;
1005 struct extent_inode_elem *eie_next;
1006 struct ulist_iterator uiter; 1037 struct ulist_iterator uiter;
1007 1038
1008 ULIST_ITER_INIT(&uiter); 1039 ULIST_ITER_INIT(&uiter);
@@ -1010,10 +1041,7 @@ static void free_leaf_list(struct ulist *blocks)
1010 if (!node->aux) 1041 if (!node->aux)
1011 continue; 1042 continue;
1012 eie = (struct extent_inode_elem *)(uintptr_t)node->aux; 1043 eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
1013 for (; eie; eie = eie_next) { 1044 free_inode_elem_list(eie);
1014 eie_next = eie->next;
1015 kfree(eie);
1016 }
1017 node->aux = 0; 1045 node->aux = 0;
1018 } 1046 }
1019 1047
@@ -1101,44 +1129,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1101 if (!node) 1129 if (!node)
1102 break; 1130 break;
1103 bytenr = node->val; 1131 bytenr = node->val;
1132 cond_resched();
1104 } 1133 }
1105 1134
1106 ulist_free(tmp); 1135 ulist_free(tmp);
1107 return 0; 1136 return 0;
1108} 1137}
1109 1138
1110
1111static int __inode_info(u64 inum, u64 ioff, u8 key_type,
1112 struct btrfs_root *fs_root, struct btrfs_path *path,
1113 struct btrfs_key *found_key)
1114{
1115 int ret;
1116 struct btrfs_key key;
1117 struct extent_buffer *eb;
1118
1119 key.type = key_type;
1120 key.objectid = inum;
1121 key.offset = ioff;
1122
1123 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1124 if (ret < 0)
1125 return ret;
1126
1127 eb = path->nodes[0];
1128 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1129 ret = btrfs_next_leaf(fs_root, path);
1130 if (ret)
1131 return ret;
1132 eb = path->nodes[0];
1133 }
1134
1135 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1136 if (found_key->type != key.type || found_key->objectid != key.objectid)
1137 return 1;
1138
1139 return 0;
1140}
1141
1142/* 1139/*
1143 * this makes the path point to (inum INODE_ITEM ioff) 1140 * this makes the path point to (inum INODE_ITEM ioff)
1144 */ 1141 */
@@ -1146,16 +1143,16 @@ int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
1146 struct btrfs_path *path) 1143 struct btrfs_path *path)
1147{ 1144{
1148 struct btrfs_key key; 1145 struct btrfs_key key;
1149 return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, 1146 return btrfs_find_item(fs_root, path, inum, ioff,
1150 &key); 1147 BTRFS_INODE_ITEM_KEY, &key);
1151} 1148}
1152 1149
1153static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, 1150static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
1154 struct btrfs_path *path, 1151 struct btrfs_path *path,
1155 struct btrfs_key *found_key) 1152 struct btrfs_key *found_key)
1156{ 1153{
1157 return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, 1154 return btrfs_find_item(fs_root, path, inum, ioff,
1158 found_key); 1155 BTRFS_INODE_REF_KEY, found_key);
1159} 1156}
1160 1157
1161int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, 1158int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
@@ -1335,20 +1332,45 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1335 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); 1332 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
1336 if (ret < 0) 1333 if (ret < 0)
1337 return ret; 1334 return ret;
1338 ret = btrfs_previous_item(fs_info->extent_root, path,
1339 0, BTRFS_EXTENT_ITEM_KEY);
1340 if (ret < 0)
1341 return ret;
1342 1335
1343 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); 1336 while (1) {
1337 u32 nritems;
1338 if (path->slots[0] == 0) {
1339 btrfs_set_path_blocking(path);
1340 ret = btrfs_prev_leaf(fs_info->extent_root, path);
1341 if (ret != 0) {
1342 if (ret > 0) {
1343 pr_debug("logical %llu is not within "
1344 "any extent\n", logical);
1345 ret = -ENOENT;
1346 }
1347 return ret;
1348 }
1349 } else {
1350 path->slots[0]--;
1351 }
1352 nritems = btrfs_header_nritems(path->nodes[0]);
1353 if (nritems == 0) {
1354 pr_debug("logical %llu is not within any extent\n",
1355 logical);
1356 return -ENOENT;
1357 }
1358 if (path->slots[0] == nritems)
1359 path->slots[0]--;
1360
1361 btrfs_item_key_to_cpu(path->nodes[0], found_key,
1362 path->slots[0]);
1363 if (found_key->type == BTRFS_EXTENT_ITEM_KEY ||
1364 found_key->type == BTRFS_METADATA_ITEM_KEY)
1365 break;
1366 }
1367
1344 if (found_key->type == BTRFS_METADATA_ITEM_KEY) 1368 if (found_key->type == BTRFS_METADATA_ITEM_KEY)
1345 size = fs_info->extent_root->leafsize; 1369 size = fs_info->extent_root->leafsize;
1346 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) 1370 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
1347 size = found_key->offset; 1371 size = found_key->offset;
1348 1372
1349 if ((found_key->type != BTRFS_EXTENT_ITEM_KEY && 1373 if (found_key->objectid > logical ||
1350 found_key->type != BTRFS_METADATA_ITEM_KEY) ||
1351 found_key->objectid > logical ||
1352 found_key->objectid + size <= logical) { 1374 found_key->objectid + size <= logical) {
1353 pr_debug("logical %llu is not within any extent\n", logical); 1375 pr_debug("logical %llu is not within any extent\n", logical);
1354 return -ENOENT; 1376 return -ENOENT;
@@ -1601,7 +1623,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
1601 struct btrfs_key found_key; 1623 struct btrfs_key found_key;
1602 1624
1603 while (!ret) { 1625 while (!ret) {
1604 path->leave_spinning = 1;
1605 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, 1626 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
1606 &found_key); 1627 &found_key);
1607 if (ret < 0) 1628 if (ret < 0)
@@ -1614,9 +1635,12 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
1614 1635
1615 parent = found_key.offset; 1636 parent = found_key.offset;
1616 slot = path->slots[0]; 1637 slot = path->slots[0];
1617 eb = path->nodes[0]; 1638 eb = btrfs_clone_extent_buffer(path->nodes[0]);
1618 /* make sure we can use eb after releasing the path */ 1639 if (!eb) {
1619 atomic_inc(&eb->refs); 1640 ret = -ENOMEM;
1641 break;
1642 }
1643 extent_buffer_get(eb);
1620 btrfs_tree_read_lock(eb); 1644 btrfs_tree_read_lock(eb);
1621 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1645 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1622 btrfs_release_path(path); 1646 btrfs_release_path(path);
@@ -1674,17 +1698,20 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
1674 ++found; 1698 ++found;
1675 1699
1676 slot = path->slots[0]; 1700 slot = path->slots[0];
1677 eb = path->nodes[0]; 1701 eb = btrfs_clone_extent_buffer(path->nodes[0]);
1678 /* make sure we can use eb after releasing the path */ 1702 if (!eb) {
1679 atomic_inc(&eb->refs); 1703 ret = -ENOMEM;
1704 break;
1705 }
1706 extent_buffer_get(eb);
1680 1707
1681 btrfs_tree_read_lock(eb); 1708 btrfs_tree_read_lock(eb);
1682 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 1709 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1683 btrfs_release_path(path); 1710 btrfs_release_path(path);
1684 1711
1685 leaf = path->nodes[0]; 1712 leaf = path->nodes[0];
1686 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1713 item_size = btrfs_item_size_nr(leaf, slot);
1687 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1714 ptr = btrfs_item_ptr_offset(leaf, slot);
1688 cur_offset = 0; 1715 cur_offset = 0;
1689 1716
1690 while (cur_offset < item_size) { 1717 while (cur_offset < item_size) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ac0b39db27d1..8fed2125689e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -43,6 +43,7 @@
43#define BTRFS_INODE_COPY_EVERYTHING 8 43#define BTRFS_INODE_COPY_EVERYTHING 8
44#define BTRFS_INODE_IN_DELALLOC_LIST 9 44#define BTRFS_INODE_IN_DELALLOC_LIST 9
45#define BTRFS_INODE_READDIO_NEED_LOCK 10 45#define BTRFS_INODE_READDIO_NEED_LOCK 10
46#define BTRFS_INODE_HAS_PROPS 11
46 47
47/* in memory btrfs inode */ 48/* in memory btrfs inode */
48struct btrfs_inode { 49struct btrfs_inode {
@@ -135,6 +136,9 @@ struct btrfs_inode {
135 */ 136 */
136 u64 index_cnt; 137 u64 index_cnt;
137 138
 139 /* Cache the directory index number to speed up dir/file removal */
140 u64 dir_index;
141
138 /* the fsync log has some corner cases that mean we have to check 142 /* the fsync log has some corner cases that mean we have to check
139 * directories to see if any unlinks have been done before 143 * directories to see if any unlinks have been done before
140 * the directory was logged. See tree-log.c for all the 144 * the directory was logged. See tree-log.c for all the
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 131d82800b3a..0e8388e72d8d 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -92,11 +92,11 @@
92#include <linux/slab.h> 92#include <linux/slab.h>
93#include <linux/buffer_head.h> 93#include <linux/buffer_head.h>
94#include <linux/mutex.h> 94#include <linux/mutex.h>
95#include <linux/crc32c.h>
96#include <linux/genhd.h> 95#include <linux/genhd.h>
97#include <linux/blkdev.h> 96#include <linux/blkdev.h>
98#include "ctree.h" 97#include "ctree.h"
99#include "disk-io.h" 98#include "disk-io.h"
99#include "hash.h"
100#include "transaction.h" 100#include "transaction.h"
101#include "extent_io.h" 101#include "extent_io.h"
102#include "volumes.h" 102#include "volumes.h"
@@ -1456,10 +1456,14 @@ static int btrfsic_handle_extent_data(
1456 btrfsic_read_from_block_data(block_ctx, &file_extent_item, 1456 btrfsic_read_from_block_data(block_ctx, &file_extent_item,
1457 file_extent_item_offset, 1457 file_extent_item_offset,
1458 sizeof(struct btrfs_file_extent_item)); 1458 sizeof(struct btrfs_file_extent_item));
1459 next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item) + 1459 next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item);
1460 btrfs_stack_file_extent_offset(&file_extent_item); 1460 if (btrfs_stack_file_extent_compression(&file_extent_item) ==
1461 generation = btrfs_stack_file_extent_generation(&file_extent_item); 1461 BTRFS_COMPRESS_NONE) {
1462 num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); 1462 next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item);
1463 num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
1464 } else {
1465 num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item);
1466 }
1463 generation = btrfs_stack_file_extent_generation(&file_extent_item); 1467 generation = btrfs_stack_file_extent_generation(&file_extent_item);
1464 1468
1465 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) 1469 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
@@ -1695,7 +1699,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
1695 return -1; 1699 return -1;
1696 } 1700 }
1697 bio->bi_bdev = block_ctx->dev->bdev; 1701 bio->bi_bdev = block_ctx->dev->bdev;
1698 bio->bi_sector = dev_bytenr >> 9; 1702 bio->bi_iter.bi_sector = dev_bytenr >> 9;
1699 1703
1700 for (j = i; j < num_pages; j++) { 1704 for (j = i; j < num_pages; j++) {
1701 ret = bio_add_page(bio, block_ctx->pagev[j], 1705 ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -1819,7 +1823,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1819 size_t sublen = i ? PAGE_CACHE_SIZE : 1823 size_t sublen = i ? PAGE_CACHE_SIZE :
1820 (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE); 1824 (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
1821 1825
1822 crc = crc32c(crc, data, sublen); 1826 crc = btrfs_crc32c(crc, data, sublen);
1823 } 1827 }
1824 btrfs_csum_final(crc, csum); 1828 btrfs_csum_final(crc, csum);
1825 if (memcmp(csum, h->csum, state->csum_size)) 1829 if (memcmp(csum, h->csum, state->csum_size))
@@ -3013,7 +3017,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
3013 int bio_is_patched; 3017 int bio_is_patched;
3014 char **mapped_datav; 3018 char **mapped_datav;
3015 3019
3016 dev_bytenr = 512 * bio->bi_sector; 3020 dev_bytenr = 512 * bio->bi_iter.bi_sector;
3017 bio_is_patched = 0; 3021 bio_is_patched = 0;
3018 if (dev_state->state->print_mask & 3022 if (dev_state->state->print_mask &
3019 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) 3023 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
@@ -3021,8 +3025,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
3021 "submit_bio(rw=0x%x, bi_vcnt=%u," 3025 "submit_bio(rw=0x%x, bi_vcnt=%u,"
3022 " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", 3026 " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
3023 rw, bio->bi_vcnt, 3027 rw, bio->bi_vcnt,
3024 (unsigned long long)bio->bi_sector, dev_bytenr, 3028 (unsigned long long)bio->bi_iter.bi_sector,
3025 bio->bi_bdev); 3029 dev_bytenr, bio->bi_bdev);
3026 3030
3027 mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, 3031 mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
3028 GFP_NOFS); 3032 GFP_NOFS);
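
check-integrity.c now calls btrfs_crc32c() from the new hash.h instead of crc32c() directly, matching the Kconfig move from LIBCRC32C to CRYPTO_CRC32C above. The wrapper's body is not part of this diff, so the following is only a plausible sketch of a crc32c helper built on the crypto shash API:

#include <crypto/hash.h>
#include <linux/err.h>

static struct crypto_shash *tfm;	/* crc32c transform, allocated once */

int example_hash_init(void)
{
	tfm = crypto_alloc_shash("crc32c", 0, 0);
	return PTR_ERR_OR_ZERO(tfm);
}

u32 example_crc32c(u32 crc, const void *address, unsigned int length)
{
	struct {
		struct shash_desc shash;
		char ctx[4];	/* crc32c's state is just the 32-bit crc */
	} desc;

	desc.shash.tfm = tfm;
	desc.shash.flags = 0;
	*(u32 *)desc.ctx = crc;	/* seed with the running crc */

	if (crypto_shash_update(&desc.shash, address, length))
		return crc;	/* crc32c update cannot realistically fail */

	return *(u32 *)desc.ctx;
}
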
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1499b27b4186..b01fb6c527e3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -128,11 +128,10 @@ static int check_compressed_csum(struct inode *inode,
128 kunmap_atomic(kaddr); 128 kunmap_atomic(kaddr);
129 129
130 if (csum != *cb_sum) { 130 if (csum != *cb_sum) {
131 printk(KERN_INFO "btrfs csum failed ino %llu " 131 btrfs_info(BTRFS_I(inode)->root->fs_info,
132 "extent %llu csum %u " 132 "csum failed ino %llu extent %llu csum %u wanted %u mirror %d",
133 "wanted %u mirror %d\n", 133 btrfs_ino(inode), disk_start, csum, *cb_sum,
134 btrfs_ino(inode), disk_start, csum, *cb_sum, 134 cb->mirror_num);
135 cb->mirror_num);
136 ret = -EIO; 135 ret = -EIO;
137 goto fail; 136 goto fail;
138 } 137 }
@@ -172,7 +171,8 @@ static void end_compressed_bio_read(struct bio *bio, int err)
172 goto out; 171 goto out;
173 172
174 inode = cb->inode; 173 inode = cb->inode;
175 ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); 174 ret = check_compressed_csum(inode, cb,
175 (u64)bio->bi_iter.bi_sector << 9);
176 if (ret) 176 if (ret)
177 goto csum_failed; 177 goto csum_failed;
178 178
@@ -201,18 +201,16 @@ csum_failed:
201 if (cb->errors) { 201 if (cb->errors) {
202 bio_io_error(cb->orig_bio); 202 bio_io_error(cb->orig_bio);
203 } else { 203 } else {
204 int bio_index = 0; 204 int i;
205 struct bio_vec *bvec = cb->orig_bio->bi_io_vec; 205 struct bio_vec *bvec;
206 206
207 /* 207 /*
208 * we have verified the checksum already, set page 208 * we have verified the checksum already, set page
209 * checked so the end_io handlers know about it 209 * checked so the end_io handlers know about it
210 */ 210 */
211 while (bio_index < cb->orig_bio->bi_vcnt) { 211 bio_for_each_segment_all(bvec, cb->orig_bio, i)
212 SetPageChecked(bvec->bv_page); 212 SetPageChecked(bvec->bv_page);
213 bvec++; 213
214 bio_index++;
215 }
216 bio_endio(cb->orig_bio, 0); 214 bio_endio(cb->orig_bio, 0);
217 } 215 }
218 216
@@ -372,7 +370,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
372 for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { 370 for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
373 page = compressed_pages[pg_index]; 371 page = compressed_pages[pg_index];
374 page->mapping = inode->i_mapping; 372 page->mapping = inode->i_mapping;
375 if (bio->bi_size) 373 if (bio->bi_iter.bi_size)
376 ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, 374 ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
377 PAGE_CACHE_SIZE, 375 PAGE_CACHE_SIZE,
378 bio, 0); 376 bio, 0);
@@ -412,7 +410,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
412 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 410 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
413 } 411 }
414 if (bytes_left < PAGE_CACHE_SIZE) { 412 if (bytes_left < PAGE_CACHE_SIZE) {
415 printk("bytes left %lu compress len %lu nr %lu\n", 413 btrfs_info(BTRFS_I(inode)->root->fs_info,
414 "bytes left %lu compress len %lu nr %lu",
416 bytes_left, cb->compressed_len, cb->nr_pages); 415 bytes_left, cb->compressed_len, cb->nr_pages);
417 } 416 }
418 bytes_left -= PAGE_CACHE_SIZE; 417 bytes_left -= PAGE_CACHE_SIZE;
@@ -506,7 +505,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
506 505
507 if (!em || last_offset < em->start || 506 if (!em || last_offset < em->start ||
508 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || 507 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
509 (em->block_start >> 9) != cb->orig_bio->bi_sector) { 508 (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
510 free_extent_map(em); 509 free_extent_map(em);
511 unlock_extent(tree, last_offset, end); 510 unlock_extent(tree, last_offset, end);
512 unlock_page(page); 511 unlock_page(page);
@@ -552,7 +551,7 @@ next:
552 * in it. We don't actually do IO on those pages but allocate new ones 551 * in it. We don't actually do IO on those pages but allocate new ones
553 * to hold the compressed pages on disk. 552 * to hold the compressed pages on disk.
554 * 553 *
555 * bio->bi_sector points to the compressed extent on disk 554 * bio->bi_iter.bi_sector points to the compressed extent on disk
556 * bio->bi_io_vec points to all of the inode pages 555 * bio->bi_io_vec points to all of the inode pages
557 * bio->bi_vcnt is a count of pages 556 * bio->bi_vcnt is a count of pages
558 * 557 *
@@ -573,7 +572,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
573 struct page *page; 572 struct page *page;
574 struct block_device *bdev; 573 struct block_device *bdev;
575 struct bio *comp_bio; 574 struct bio *comp_bio;
576 u64 cur_disk_byte = (u64)bio->bi_sector << 9; 575 u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
577 u64 em_len; 576 u64 em_len;
578 u64 em_start; 577 u64 em_start;
579 struct extent_map *em; 578 struct extent_map *em;
@@ -659,7 +658,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
659 page->mapping = inode->i_mapping; 658 page->mapping = inode->i_mapping;
660 page->index = em_start >> PAGE_CACHE_SHIFT; 659 page->index = em_start >> PAGE_CACHE_SHIFT;
661 660
662 if (comp_bio->bi_size) 661 if (comp_bio->bi_iter.bi_size)
663 ret = tree->ops->merge_bio_hook(READ, page, 0, 662 ret = tree->ops->merge_bio_hook(READ, page, 0,
664 PAGE_CACHE_SIZE, 663 PAGE_CACHE_SIZE,
665 comp_bio, 0); 664 comp_bio, 0);
@@ -687,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
687 comp_bio, sums); 686 comp_bio, sums);
688 BUG_ON(ret); /* -ENOMEM */ 687 BUG_ON(ret); /* -ENOMEM */
689 } 688 }
690 sums += (comp_bio->bi_size + root->sectorsize - 1) / 689 sums += (comp_bio->bi_iter.bi_size +
691 root->sectorsize; 690 root->sectorsize - 1) / root->sectorsize;
692 691
693 ret = btrfs_map_bio(root, READ, comp_bio, 692 ret = btrfs_map_bio(root, READ, comp_bio,
694 mirror_num, 0); 693 mirror_num, 0);
@@ -1011,6 +1010,8 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
1011 bytes = min(bytes, working_bytes); 1010 bytes = min(bytes, working_bytes);
1012 kaddr = kmap_atomic(page_out); 1011 kaddr = kmap_atomic(page_out);
1013 memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); 1012 memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
1013 if (*pg_index == (vcnt - 1) && *pg_offset == 0)
1014 memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
1014 kunmap_atomic(kaddr); 1015 kunmap_atomic(kaddr);
1015 flush_dcache_page(page_out); 1016 flush_dcache_page(page_out);
1016 1017
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 316136bd6dd7..cbd3a7d6fa68 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -39,9 +39,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
39 struct extent_buffer *src_buf); 39 struct extent_buffer *src_buf);
40static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, 40static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
41 int level, int slot); 41 int level, int slot);
42static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 42static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
43 struct extent_buffer *eb); 43 struct extent_buffer *eb);
44static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
45 44
46struct btrfs_path *btrfs_alloc_path(void) 45struct btrfs_path *btrfs_alloc_path(void)
47{ 46{
@@ -475,6 +474,8 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
475 * the index is the shifted logical of the *new* root node for root replace 474 * the index is the shifted logical of the *new* root node for root replace
476 * operations, or the shifted logical of the affected block for all other 475 * operations, or the shifted logical of the affected block for all other
477 * operations. 476 * operations.
477 *
478 * Note: must be called with write lock (tree_mod_log_write_lock).
478 */ 479 */
479static noinline int 480static noinline int
480__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) 481__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
@@ -483,24 +484,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
483 struct rb_node **new; 484 struct rb_node **new;
484 struct rb_node *parent = NULL; 485 struct rb_node *parent = NULL;
485 struct tree_mod_elem *cur; 486 struct tree_mod_elem *cur;
486 int ret = 0;
487 487
488 BUG_ON(!tm); 488 BUG_ON(!tm);
489 489
490 tree_mod_log_write_lock(fs_info);
491 if (list_empty(&fs_info->tree_mod_seq_list)) {
492 tree_mod_log_write_unlock(fs_info);
493 /*
494 * Ok we no longer care about logging modifications, free up tm
495 * and return 0. Any callers shouldn't be using tm after
496 * calling tree_mod_log_insert, but if they do we can just
497 * change this to return a special error code to let the callers
498 * do their own thing.
499 */
500 kfree(tm);
501 return 0;
502 }
503
504 spin_lock(&fs_info->tree_mod_seq_lock); 490 spin_lock(&fs_info->tree_mod_seq_lock);
505 tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); 491 tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
506 spin_unlock(&fs_info->tree_mod_seq_lock); 492 spin_unlock(&fs_info->tree_mod_seq_lock);
@@ -518,18 +504,13 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
518 new = &((*new)->rb_left); 504 new = &((*new)->rb_left);
519 else if (cur->seq > tm->seq) 505 else if (cur->seq > tm->seq)
520 new = &((*new)->rb_right); 506 new = &((*new)->rb_right);
521 else { 507 else
522 ret = -EEXIST; 508 return -EEXIST;
523 kfree(tm);
524 goto out;
525 }
526 } 509 }
527 510
528 rb_link_node(&tm->node, parent, new); 511 rb_link_node(&tm->node, parent, new);
529 rb_insert_color(&tm->node, tm_root); 512 rb_insert_color(&tm->node, tm_root);
530out: 513 return 0;
531 tree_mod_log_write_unlock(fs_info);
532 return ret;
533} 514}
534 515
535/* 516/*
@@ -545,19 +526,38 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
545 return 1; 526 return 1;
546 if (eb && btrfs_header_level(eb) == 0) 527 if (eb && btrfs_header_level(eb) == 0)
547 return 1; 528 return 1;
529
530 tree_mod_log_write_lock(fs_info);
531 if (list_empty(&(fs_info)->tree_mod_seq_list)) {
532 tree_mod_log_write_unlock(fs_info);
533 return 1;
534 }
535
548 return 0; 536 return 0;
549} 537}
550 538
551static inline int 539/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
552__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, 540static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
553 struct extent_buffer *eb, int slot, 541 struct extent_buffer *eb)
554 enum mod_log_op op, gfp_t flags) 542{
543 smp_mb();
544 if (list_empty(&(fs_info)->tree_mod_seq_list))
545 return 0;
546 if (eb && btrfs_header_level(eb) == 0)
547 return 0;
548
549 return 1;
550}
551
552static struct tree_mod_elem *
553alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
554 enum mod_log_op op, gfp_t flags)
555{ 555{
556 struct tree_mod_elem *tm; 556 struct tree_mod_elem *tm;
557 557
558 tm = kzalloc(sizeof(*tm), flags); 558 tm = kzalloc(sizeof(*tm), flags);
559 if (!tm) 559 if (!tm)
560 return -ENOMEM; 560 return NULL;
561 561
562 tm->index = eb->start >> PAGE_CACHE_SHIFT; 562 tm->index = eb->start >> PAGE_CACHE_SHIFT;
563 if (op != MOD_LOG_KEY_ADD) { 563 if (op != MOD_LOG_KEY_ADD) {
@@ -567,8 +567,9 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
567 tm->op = op; 567 tm->op = op;
568 tm->slot = slot; 568 tm->slot = slot;
569 tm->generation = btrfs_node_ptr_generation(eb, slot); 569 tm->generation = btrfs_node_ptr_generation(eb, slot);
570 RB_CLEAR_NODE(&tm->node);
570 571
571 return __tree_mod_log_insert(fs_info, tm); 572 return tm;
572} 573}
573 574
574static noinline int 575static noinline int
@@ -576,10 +577,27 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
576 struct extent_buffer *eb, int slot, 577 struct extent_buffer *eb, int slot,
577 enum mod_log_op op, gfp_t flags) 578 enum mod_log_op op, gfp_t flags)
578{ 579{
579 if (tree_mod_dont_log(fs_info, eb)) 580 struct tree_mod_elem *tm;
581 int ret;
582
583 if (!tree_mod_need_log(fs_info, eb))
580 return 0; 584 return 0;
581 585
582 return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); 586 tm = alloc_tree_mod_elem(eb, slot, op, flags);
587 if (!tm)
588 return -ENOMEM;
589
590 if (tree_mod_dont_log(fs_info, eb)) {
591 kfree(tm);
592 return 0;
593 }
594
595 ret = __tree_mod_log_insert(fs_info, tm);
596 tree_mod_log_write_unlock(fs_info);
597 if (ret)
598 kfree(tm);
599
600 return ret;
583} 601}
584 602
585static noinline int 603static noinline int
@@ -587,53 +605,95 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
587 struct extent_buffer *eb, int dst_slot, int src_slot, 605 struct extent_buffer *eb, int dst_slot, int src_slot,
588 int nr_items, gfp_t flags) 606 int nr_items, gfp_t flags)
589{ 607{
590 struct tree_mod_elem *tm; 608 struct tree_mod_elem *tm = NULL;
591 int ret; 609 struct tree_mod_elem **tm_list = NULL;
610 int ret = 0;
592 int i; 611 int i;
612 int locked = 0;
593 613
594 if (tree_mod_dont_log(fs_info, eb)) 614 if (!tree_mod_need_log(fs_info, eb))
595 return 0; 615 return 0;
596 616
617 tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
618 if (!tm_list)
619 return -ENOMEM;
620
621 tm = kzalloc(sizeof(*tm), flags);
622 if (!tm) {
623 ret = -ENOMEM;
624 goto free_tms;
625 }
626
627 tm->index = eb->start >> PAGE_CACHE_SHIFT;
628 tm->slot = src_slot;
629 tm->move.dst_slot = dst_slot;
630 tm->move.nr_items = nr_items;
631 tm->op = MOD_LOG_MOVE_KEYS;
632
633 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
634 tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
635 MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
636 if (!tm_list[i]) {
637 ret = -ENOMEM;
638 goto free_tms;
639 }
640 }
641
642 if (tree_mod_dont_log(fs_info, eb))
643 goto free_tms;
644 locked = 1;
645
597 /* 646 /*
598 * When we override something during the move, we log these removals. 647 * When we override something during the move, we log these removals.
599 * This can only happen when we move towards the beginning of the 648 * This can only happen when we move towards the beginning of the
600 * buffer, i.e. dst_slot < src_slot. 649 * buffer, i.e. dst_slot < src_slot.
601 */ 650 */
602 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { 651 for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
603 ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot, 652 ret = __tree_mod_log_insert(fs_info, tm_list[i]);
604 MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); 653 if (ret)
605 BUG_ON(ret < 0); 654 goto free_tms;
606 } 655 }
607 656
608 tm = kzalloc(sizeof(*tm), flags); 657 ret = __tree_mod_log_insert(fs_info, tm);
609 if (!tm) 658 if (ret)
610 return -ENOMEM; 659 goto free_tms;
660 tree_mod_log_write_unlock(fs_info);
661 kfree(tm_list);
611 662
612 tm->index = eb->start >> PAGE_CACHE_SHIFT; 663 return 0;
613 tm->slot = src_slot; 664free_tms:
614 tm->move.dst_slot = dst_slot; 665 for (i = 0; i < nr_items; i++) {
615 tm->move.nr_items = nr_items; 666 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
616 tm->op = MOD_LOG_MOVE_KEYS; 667 rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
668 kfree(tm_list[i]);
669 }
670 if (locked)
671 tree_mod_log_write_unlock(fs_info);
672 kfree(tm_list);
673 kfree(tm);
617 674
618 return __tree_mod_log_insert(fs_info, tm); 675 return ret;
619} 676}
620 677
621static inline void 678static inline int
622__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) 679__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
680 struct tree_mod_elem **tm_list,
681 int nritems)
623{ 682{
624 int i; 683 int i, j;
625 u32 nritems;
626 int ret; 684 int ret;
627 685
628 if (btrfs_header_level(eb) == 0)
629 return;
630
631 nritems = btrfs_header_nritems(eb);
632 for (i = nritems - 1; i >= 0; i--) { 686 for (i = nritems - 1; i >= 0; i--) {
633 ret = __tree_mod_log_insert_key(fs_info, eb, i, 687 ret = __tree_mod_log_insert(fs_info, tm_list[i]);
634 MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); 688 if (ret) {
635 BUG_ON(ret < 0); 689 for (j = nritems - 1; j > i; j--)
690 rb_erase(&tm_list[j]->node,
691 &fs_info->tree_mod_log);
692 return ret;
693 }
636 } 694 }
695
696 return 0;
637} 697}
638 698
639static noinline int 699static noinline int
@@ -642,17 +702,38 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
642 struct extent_buffer *new_root, gfp_t flags, 702 struct extent_buffer *new_root, gfp_t flags,
643 int log_removal) 703 int log_removal)
644{ 704{
645 struct tree_mod_elem *tm; 705 struct tree_mod_elem *tm = NULL;
706 struct tree_mod_elem **tm_list = NULL;
707 int nritems = 0;
708 int ret = 0;
709 int i;
646 710
647 if (tree_mod_dont_log(fs_info, NULL)) 711 if (!tree_mod_need_log(fs_info, NULL))
648 return 0; 712 return 0;
649 713
650 if (log_removal) 714 if (log_removal && btrfs_header_level(old_root) > 0) {
651 __tree_mod_log_free_eb(fs_info, old_root); 715 nritems = btrfs_header_nritems(old_root);
716 tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
717 flags);
718 if (!tm_list) {
719 ret = -ENOMEM;
720 goto free_tms;
721 }
722 for (i = 0; i < nritems; i++) {
723 tm_list[i] = alloc_tree_mod_elem(old_root, i,
724 MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
725 if (!tm_list[i]) {
726 ret = -ENOMEM;
727 goto free_tms;
728 }
729 }
730 }
652 731
653 tm = kzalloc(sizeof(*tm), flags); 732 tm = kzalloc(sizeof(*tm), flags);
654 if (!tm) 733 if (!tm) {
655 return -ENOMEM; 734 ret = -ENOMEM;
735 goto free_tms;
736 }
656 737
657 tm->index = new_root->start >> PAGE_CACHE_SHIFT; 738 tm->index = new_root->start >> PAGE_CACHE_SHIFT;
658 tm->old_root.logical = old_root->start; 739 tm->old_root.logical = old_root->start;
@@ -660,7 +741,30 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
660 tm->generation = btrfs_header_generation(old_root); 741 tm->generation = btrfs_header_generation(old_root);
661 tm->op = MOD_LOG_ROOT_REPLACE; 742 tm->op = MOD_LOG_ROOT_REPLACE;
662 743
663 return __tree_mod_log_insert(fs_info, tm); 744 if (tree_mod_dont_log(fs_info, NULL))
745 goto free_tms;
746
747 if (tm_list)
748 ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
749 if (!ret)
750 ret = __tree_mod_log_insert(fs_info, tm);
751
752 tree_mod_log_write_unlock(fs_info);
753 if (ret)
754 goto free_tms;
755 kfree(tm_list);
756
757 return ret;
758
759free_tms:
760 if (tm_list) {
761 for (i = 0; i < nritems; i++)
762 kfree(tm_list[i]);
763 kfree(tm_list);
764 }
765 kfree(tm);
766
767 return ret;
664} 768}
665 769
666static struct tree_mod_elem * 770static struct tree_mod_elem *
@@ -729,31 +833,75 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
729 return __tree_mod_log_search(fs_info, start, min_seq, 0); 833 return __tree_mod_log_search(fs_info, start, min_seq, 0);
730} 834}
731 835
732static noinline void 836static noinline int
733tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, 837tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
734 struct extent_buffer *src, unsigned long dst_offset, 838 struct extent_buffer *src, unsigned long dst_offset,
735 unsigned long src_offset, int nr_items) 839 unsigned long src_offset, int nr_items)
736{ 840{
737 int ret; 841 int ret = 0;
842 struct tree_mod_elem **tm_list = NULL;
843 struct tree_mod_elem **tm_list_add, **tm_list_rem;
738 int i; 844 int i;
845 int locked = 0;
739 846
740 if (tree_mod_dont_log(fs_info, NULL)) 847 if (!tree_mod_need_log(fs_info, NULL))
741 return; 848 return 0;
742 849
743 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) 850 if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
744 return; 851 return 0;
852
853 tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
854 GFP_NOFS);
855 if (!tm_list)
856 return -ENOMEM;
745 857
858 tm_list_add = tm_list;
859 tm_list_rem = tm_list + nr_items;
746 for (i = 0; i < nr_items; i++) { 860 for (i = 0; i < nr_items; i++) {
747 ret = __tree_mod_log_insert_key(fs_info, src, 861 tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
748 i + src_offset, 862 MOD_LOG_KEY_REMOVE, GFP_NOFS);
749 MOD_LOG_KEY_REMOVE, GFP_NOFS); 863 if (!tm_list_rem[i]) {
750 BUG_ON(ret < 0); 864 ret = -ENOMEM;
751 ret = __tree_mod_log_insert_key(fs_info, dst, 865 goto free_tms;
752 i + dst_offset, 866 }
753 MOD_LOG_KEY_ADD, 867
754 GFP_NOFS); 868 tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
755 BUG_ON(ret < 0); 869 MOD_LOG_KEY_ADD, GFP_NOFS);
870 if (!tm_list_add[i]) {
871 ret = -ENOMEM;
872 goto free_tms;
873 }
756 } 874 }
875
876 if (tree_mod_dont_log(fs_info, NULL))
877 goto free_tms;
878 locked = 1;
879
880 for (i = 0; i < nr_items; i++) {
881 ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
882 if (ret)
883 goto free_tms;
884 ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
885 if (ret)
886 goto free_tms;
887 }
888
889 tree_mod_log_write_unlock(fs_info);
890 kfree(tm_list);
891
892 return 0;
893
894free_tms:
895 for (i = 0; i < nr_items * 2; i++) {
896 if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
897 rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
898 kfree(tm_list[i]);
899 }
900 if (locked)
901 tree_mod_log_write_unlock(fs_info);
902 kfree(tm_list);
903
904 return ret;
757} 905}
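
The reworked tree-mod-log helpers above all share one shape: every tree_mod_elem is allocated before the log lock is taken, the whole batch is inserted under the lock, and a failed insertion unwinds whatever was already published. A minimal userspace sketch of that allocate-then-publish pattern, assuming illustrative names (elem, log_insert, log_erase) rather than the kernel API:

#include <stdlib.h>
#include <pthread.h>

struct elem { int key; };

static pthread_mutex_t log_lock = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for __tree_mod_log_insert(); always succeeds in this model */
static int log_insert(struct elem *e) { (void)e; return 0; }
/* stand-in for rb_erase() of an already published element */
static void log_erase(struct elem *e) { (void)e; }

static int log_batch(int nr)
{
	struct elem **list;
	int i, j, ret = -1;

	list = calloc(nr, sizeof(*list));
	if (!list)
		return -1;

	/* phase 1: allocate with no lock held, so allocation stalls
	 * never lengthen the critical section */
	for (i = 0; i < nr; i++) {
		list[i] = malloc(sizeof(*list[i]));
		if (!list[i])
			goto free_all;
		list[i]->key = i;
	}

	/* phase 2: publish under the lock; on failure erase what was
	 * already inserted so the log never holds a partial batch */
	pthread_mutex_lock(&log_lock);
	for (i = 0; i < nr; i++) {
		ret = log_insert(list[i]);
		if (ret) {
			for (j = 0; j < i; j++)
				log_erase(list[j]);
			pthread_mutex_unlock(&log_lock);
			goto free_all;
		}
	}
	pthread_mutex_unlock(&log_lock);
	free(list);		/* elements now live in the log */
	return 0;

free_all:
	for (i = 0; i < nr; i++)
		free(list[i]);	/* free(NULL) is a no-op */
	free(list);
	return ret;
}

tree_mod_log_eb_copy() above follows exactly this flow: build tm_list_add/tm_list_rem first, take the lock via tree_mod_dont_log(), and roll back with rb_erase() on failure.
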
758 906
759static inline void 907static inline void
@@ -772,18 +920,58 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
772{ 920{
773 int ret; 921 int ret;
774 922
775 ret = __tree_mod_log_insert_key(fs_info, eb, slot, 923 ret = tree_mod_log_insert_key(fs_info, eb, slot,
776 MOD_LOG_KEY_REPLACE, 924 MOD_LOG_KEY_REPLACE,
777 atomic ? GFP_ATOMIC : GFP_NOFS); 925 atomic ? GFP_ATOMIC : GFP_NOFS);
778 BUG_ON(ret < 0); 926 BUG_ON(ret < 0);
779} 927}
780 928
781static noinline void 929static noinline int
782tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) 930tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
783{ 931{
932 struct tree_mod_elem **tm_list = NULL;
933 int nritems = 0;
934 int i;
935 int ret = 0;
936
937 if (btrfs_header_level(eb) == 0)
938 return 0;
939
940 if (!tree_mod_need_log(fs_info, NULL))
941 return 0;
942
943 nritems = btrfs_header_nritems(eb);
944 tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
945 GFP_NOFS);
946 if (!tm_list)
947 return -ENOMEM;
948
949 for (i = 0; i < nritems; i++) {
950 tm_list[i] = alloc_tree_mod_elem(eb, i,
951 MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
952 if (!tm_list[i]) {
953 ret = -ENOMEM;
954 goto free_tms;
955 }
956 }
957
784 if (tree_mod_dont_log(fs_info, eb)) 958 if (tree_mod_dont_log(fs_info, eb))
785 return; 959 goto free_tms;
786 __tree_mod_log_free_eb(fs_info, eb); 960
961 ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
962 tree_mod_log_write_unlock(fs_info);
963 if (ret)
964 goto free_tms;
965 kfree(tm_list);
966
967 return 0;
968
969free_tms:
970 for (i = 0; i < nritems; i++)
971 kfree(tm_list[i]);
972 kfree(tm_list);
973
974 return ret;
787} 975}
788 976
789static noinline void 977static noinline void
@@ -1041,8 +1229,13 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1041 btrfs_set_node_ptr_generation(parent, parent_slot, 1229 btrfs_set_node_ptr_generation(parent, parent_slot,
1042 trans->transid); 1230 trans->transid);
1043 btrfs_mark_buffer_dirty(parent); 1231 btrfs_mark_buffer_dirty(parent);
1044 if (last_ref) 1232 if (last_ref) {
1045 tree_mod_log_free_eb(root->fs_info, buf); 1233 ret = tree_mod_log_free_eb(root->fs_info, buf);
1234 if (ret) {
1235 btrfs_abort_transaction(trans, root, ret);
1236 return ret;
1237 }
1238 }
1046 btrfs_free_tree_block(trans, root, buf, parent_start, 1239 btrfs_free_tree_block(trans, root, buf, parent_start,
1047 last_ref); 1240 last_ref);
1048 } 1241 }
@@ -1287,8 +1480,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1287 old = read_tree_block(root, logical, blocksize, 0); 1480 old = read_tree_block(root, logical, blocksize, 0);
1288 if (WARN_ON(!old || !extent_buffer_uptodate(old))) { 1481 if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
1289 free_extent_buffer(old); 1482 free_extent_buffer(old);
1290 pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", 1483 btrfs_warn(root->fs_info,
1291 logical); 1484 "failed to read tree block %llu from get_old_root", logical);
1292 } else { 1485 } else {
1293 eb = btrfs_clone_extent_buffer(old); 1486 eb = btrfs_clone_extent_buffer(old);
1294 free_extent_buffer(old); 1487 free_extent_buffer(old);
@@ -2462,6 +2655,49 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2462 return 0; 2655 return 0;
2463} 2656}
2464 2657
2658int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
2659 u64 iobjectid, u64 ioff, u8 key_type,
2660 struct btrfs_key *found_key)
2661{
2662 int ret;
2663 struct btrfs_key key;
2664 struct extent_buffer *eb;
2665 struct btrfs_path *path;
2666
2667 key.type = key_type;
2668 key.objectid = iobjectid;
2669 key.offset = ioff;
2670
2671 if (found_path == NULL) {
2672 path = btrfs_alloc_path();
2673 if (!path)
2674 return -ENOMEM;
2675 } else
2676 path = found_path;
2677
2678 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2679 if ((ret < 0) || (found_key == NULL)) {
2680 if (path != found_path)
2681 btrfs_free_path(path);
2682 return ret;
2683 }
2684
2685 eb = path->nodes[0];
2686 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2687 ret = btrfs_next_leaf(fs_root, path);
2688 if (ret)
2689 return ret;
2690 eb = path->nodes[0];
2691 }
2692
2693 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2694 if (found_key->type != key.type ||
2695 found_key->objectid != key.objectid)
2696 return 1;
2697
2698 return 0;
2699}
2700
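
btrfs_find_item() above packages a common three-step lookup: search for the key, step to the next leaf if the slot lands one past the end, then verify that the key found actually matches. A small userspace model of that control flow, using sorted arrays as stand-in leaves (none of this is kernel API):

#include <stdio.h>

struct key { unsigned long long objectid; unsigned char type; };

static int key_cmp(const struct key *a, const struct key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	return 0;
}

/* returns 0 and fills *found on an exact match, 1 if not found */
static int find_item(const struct key leaves[][4], int nleaves,
		     const struct key *target, struct key *found)
{
	for (int l = 0; l < nleaves; l++) {
		for (int slot = 0; slot < 4; slot++) {
			if (key_cmp(&leaves[l][slot], target) < 0)
				continue;
			/* first key >= target: verify it matches,
			 * as btrfs_find_item() does after the search */
			*found = leaves[l][slot];
			return key_cmp(found, target) ? 1 : 0;
		}
		/* ran off the end of this leaf: continue in the next
		 * one, the btrfs_next_leaf() step */
	}
	return 1;
}

int main(void)
{
	const struct key leaves[2][4] = {
		{ {1, 1}, {1, 84}, {2, 1}, {2, 84} },
		{ {3, 1}, {3, 84}, {4, 1}, {4, 84} },
	};
	struct key want = { 3, 1 }, got;

	if (!find_item(leaves, 2, &want, &got))
		printf("found %llu type %u\n", got.objectid,
		       (unsigned)got.type);
	return 0;
}
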
2465/* 2701/*
2466 * look for key in the tree. path is filled in with nodes along the way 2702 * look for key in the tree. path is filled in with nodes along the way
2467 * if key is found, we return zero and you can find the item in the leaf 2703 * if key is found, we return zero and you can find the item in the leaf
@@ -2495,6 +2731,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2495 lowest_level = p->lowest_level; 2731 lowest_level = p->lowest_level;
2496 WARN_ON(lowest_level && ins_len > 0); 2732 WARN_ON(lowest_level && ins_len > 0);
2497 WARN_ON(p->nodes[0] != NULL); 2733 WARN_ON(p->nodes[0] != NULL);
2734 BUG_ON(!cow && ins_len);
2498 2735
2499 if (ins_len < 0) { 2736 if (ins_len < 0) {
2500 lowest_unlock = 2; 2737 lowest_unlock = 2;
@@ -2603,8 +2840,6 @@ again:
2603 } 2840 }
2604 } 2841 }
2605cow_done: 2842cow_done:
2606 BUG_ON(!cow && ins_len);
2607
2608 p->nodes[level] = b; 2843 p->nodes[level] = b;
2609 btrfs_clear_path_blocking(p, NULL, 0); 2844 btrfs_clear_path_blocking(p, NULL, 0);
2610 2845
@@ -2614,13 +2849,19 @@ cow_done:
2614 * It is safe to drop the lock on our parent before we 2849 * It is safe to drop the lock on our parent before we
2615 * go through the expensive btree search on b. 2850 * go through the expensive btree search on b.
2616 * 2851 *
2617 * If cow is true, then we might be changing slot zero, 2852 * If we're inserting or deleting (ins_len != 0), then we might
2618 * which may require changing the parent. So, we can't 2853 * be changing slot zero, which may require changing the parent.
2619 * drop the lock until after we know which slot we're 2854 * So, we can't drop the lock until after we know which slot
2620 * operating on. 2855 * we're operating on.
2621 */ 2856 */
2622 if (!cow) 2857 if (!ins_len && !p->keep_locks) {
2623 btrfs_unlock_up_safe(p, level + 1); 2858 int u = level + 1;
2859
2860 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2861 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2862 p->locks[u] = 0;
2863 }
2864 }
2624 2865
2625 ret = key_search(b, key, level, &prev_cmp, &slot); 2866 ret = key_search(b, key, level, &prev_cmp, &slot);
2626 2867
@@ -2648,7 +2889,7 @@ cow_done:
2648 * which means we must have a write lock 2889 * which means we must have a write lock
2649 * on the parent 2890 * on the parent
2650 */ 2891 */
2651 if (slot == 0 && cow && 2892 if (slot == 0 && ins_len &&
2652 write_lock_level < level + 1) { 2893 write_lock_level < level + 1) {
2653 write_lock_level = level + 1; 2894 write_lock_level = level + 1;
2654 btrfs_release_path(p); 2895 btrfs_release_path(p);
@@ -2901,7 +3142,9 @@ again:
2901 if (ret < 0) 3142 if (ret < 0)
2902 return ret; 3143 return ret;
2903 if (!ret) { 3144 if (!ret) {
2904 p->slots[0] = btrfs_header_nritems(leaf) - 1; 3145 leaf = p->nodes[0];
3146 if (p->slots[0] == btrfs_header_nritems(leaf))
3147 p->slots[0]--;
2905 return 0; 3148 return 0;
2906 } 3149 }
2907 if (!return_any) 3150 if (!return_any)
@@ -3022,8 +3265,12 @@ static int push_node_left(struct btrfs_trans_handle *trans,
3022 } else 3265 } else
3023 push_items = min(src_nritems - 8, push_items); 3266 push_items = min(src_nritems - 8, push_items);
3024 3267
3025 tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, 3268 ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3026 push_items); 3269 push_items);
3270 if (ret) {
3271 btrfs_abort_transaction(trans, root, ret);
3272 return ret;
3273 }
3027 copy_extent_buffer(dst, src, 3274 copy_extent_buffer(dst, src,
3028 btrfs_node_key_ptr_offset(dst_nritems), 3275 btrfs_node_key_ptr_offset(dst_nritems),
3029 btrfs_node_key_ptr_offset(0), 3276 btrfs_node_key_ptr_offset(0),
@@ -3093,8 +3340,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
3093 (dst_nritems) * 3340 (dst_nritems) *
3094 sizeof(struct btrfs_key_ptr)); 3341 sizeof(struct btrfs_key_ptr));
3095 3342
3096 tree_mod_log_eb_copy(root->fs_info, dst, src, 0, 3343 ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3097 src_nritems - push_items, push_items); 3344 src_nritems - push_items, push_items);
3345 if (ret) {
3346 btrfs_abort_transaction(trans, root, ret);
3347 return ret;
3348 }
3098 copy_extent_buffer(dst, src, 3349 copy_extent_buffer(dst, src,
3099 btrfs_node_key_ptr_offset(0), 3350 btrfs_node_key_ptr_offset(0),
3100 btrfs_node_key_ptr_offset(src_nritems - push_items), 3351 btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -3295,7 +3546,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3295 btrfs_header_chunk_tree_uuid(split), 3546 btrfs_header_chunk_tree_uuid(split),
3296 BTRFS_UUID_SIZE); 3547 BTRFS_UUID_SIZE);
3297 3548
3298 tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); 3549 ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
3550 mid, c_nritems - mid);
3551 if (ret) {
3552 btrfs_abort_transaction(trans, root, ret);
3553 return ret;
3554 }
3299 copy_extent_buffer(split, c, 3555 copy_extent_buffer(split, c,
3300 btrfs_node_key_ptr_offset(0), 3556 btrfs_node_key_ptr_offset(0),
3301 btrfs_node_key_ptr_offset(mid), 3557 btrfs_node_key_ptr_offset(mid),
@@ -3362,8 +3618,8 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3362 int ret; 3618 int ret;
3363 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); 3619 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3364 if (ret < 0) { 3620 if (ret < 0) {
3365 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " 3621 btrfs_crit(root->fs_info,
3366 "used %d nritems %d\n", 3622 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3367 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), 3623 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3368 leaf_space_used(leaf, 0, nritems), nritems); 3624 leaf_space_used(leaf, 0, nritems), nritems);
3369 } 3625 }
@@ -3571,6 +3827,19 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3571 if (left_nritems == 0) 3827 if (left_nritems == 0)
3572 goto out_unlock; 3828 goto out_unlock;
3573 3829
3830 if (path->slots[0] == left_nritems && !empty) {
3831 /* Key greater than all keys in the leaf, right neighbor has
3832 * enough room for it and we're not emptying our leaf to delete
3833 * it, therefore use right neighbor to insert the new item and
 3834 * no need to touch/dirty our left leaf. */
3835 btrfs_tree_unlock(left);
3836 free_extent_buffer(left);
3837 path->nodes[0] = right;
3838 path->slots[0] = 0;
3839 path->slots[1]++;
3840 return 0;
3841 }
3842
3574 return __push_leaf_right(trans, root, path, min_data_size, empty, 3843 return __push_leaf_right(trans, root, path, min_data_size, empty,
3575 right, free_space, left_nritems, min_slot); 3844 right, free_space, left_nritems, min_slot);
3576out_unlock: 3845out_unlock:
@@ -3887,14 +4156,17 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3887 int progress = 0; 4156 int progress = 0;
3888 int slot; 4157 int slot;
3889 u32 nritems; 4158 u32 nritems;
4159 int space_needed = data_size;
3890 4160
3891 slot = path->slots[0]; 4161 slot = path->slots[0];
4162 if (slot < btrfs_header_nritems(path->nodes[0]))
4163 space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
3892 4164
3893 /* 4165 /*
3894 * try to push all the items after our slot into the 4166 * try to push all the items after our slot into the
3895 * right leaf 4167 * right leaf
3896 */ 4168 */
3897 ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); 4169 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3898 if (ret < 0) 4170 if (ret < 0)
3899 return ret; 4171 return ret;
3900 4172
@@ -3914,7 +4186,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3914 4186
3915 /* try to push all the items before our slot into the next leaf */ 4187 /* try to push all the items before our slot into the next leaf */
3916 slot = path->slots[0]; 4188 slot = path->slots[0];
3917 ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); 4189 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3918 if (ret < 0) 4190 if (ret < 0)
3919 return ret; 4191 return ret;
3920 4192
@@ -3958,13 +4230,18 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
3958 4230
3959 /* first try to make some room by pushing left and right */ 4231 /* first try to make some room by pushing left and right */
3960 if (data_size && path->nodes[1]) { 4232 if (data_size && path->nodes[1]) {
3961 wret = push_leaf_right(trans, root, path, data_size, 4233 int space_needed = data_size;
3962 data_size, 0, 0); 4234
4235 if (slot < btrfs_header_nritems(l))
4236 space_needed -= btrfs_leaf_free_space(root, l);
4237
4238 wret = push_leaf_right(trans, root, path, space_needed,
4239 space_needed, 0, 0);
3963 if (wret < 0) 4240 if (wret < 0)
3964 return wret; 4241 return wret;
3965 if (wret) { 4242 if (wret) {
3966 wret = push_leaf_left(trans, root, path, data_size, 4243 wret = push_leaf_left(trans, root, path, space_needed,
3967 data_size, 0, (u32)-1); 4244 space_needed, 0, (u32)-1);
3968 if (wret < 0) 4245 if (wret < 0)
3969 return wret; 4246 return wret;
3970 } 4247 }
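
Both hunks above apply the same accounting fix: when the new item will land in the existing leaf (slot < nritems), the leaf's current free space already counts toward the requirement, so only the shortfall has to be pushed to a neighbor. A tiny sketch of that arithmetic:

/* sketch: compute how much space must actually be pushed away before
 * an insertion of data_size bytes can succeed */
static int space_needed(int data_size, int leaf_free_space,
			int slot, int nritems)
{
	int need = data_size;

	/* inserting before the end: the item lands in this leaf, so
	 * existing free space offsets the requirement */
	if (slot < nritems)
		need -= leaf_free_space;
	return need;
}
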
@@ -4432,7 +4709,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4432 BUG_ON(slot < 0); 4709 BUG_ON(slot < 0);
4433 if (slot >= nritems) { 4710 if (slot >= nritems) {
4434 btrfs_print_leaf(root, leaf); 4711 btrfs_print_leaf(root, leaf);
4435 printk(KERN_CRIT "slot %d too large, nritems %d\n", 4712 btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
4436 slot, nritems); 4713 slot, nritems);
4437 BUG_ON(1); 4714 BUG_ON(1);
4438 } 4715 }
@@ -4495,7 +4772,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4495 4772
4496 if (btrfs_leaf_free_space(root, leaf) < total_size) { 4773 if (btrfs_leaf_free_space(root, leaf) < total_size) {
4497 btrfs_print_leaf(root, leaf); 4774 btrfs_print_leaf(root, leaf);
4498 printk(KERN_CRIT "not enough freespace need %u have %d\n", 4775 btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
4499 total_size, btrfs_leaf_free_space(root, leaf)); 4776 total_size, btrfs_leaf_free_space(root, leaf));
4500 BUG(); 4777 BUG();
4501 } 4778 }
@@ -4505,7 +4782,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4505 4782
4506 if (old_data < data_end) { 4783 if (old_data < data_end) {
4507 btrfs_print_leaf(root, leaf); 4784 btrfs_print_leaf(root, leaf);
4508 printk(KERN_CRIT "slot %d old_data %d data_end %d\n", 4785 btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
4509 slot, old_data, data_end); 4786 slot, old_data, data_end);
4510 BUG_ON(1); 4787 BUG_ON(1);
4511 } 4788 }
@@ -4817,7 +5094,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4817 * This may release the path, and so you may lose any locks held at the 5094 * This may release the path, and so you may lose any locks held at the
4818 * time you call it. 5095 * time you call it.
4819 */ 5096 */
4820static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) 5097int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4821{ 5098{
4822 struct btrfs_key key; 5099 struct btrfs_key key;
4823 struct btrfs_disk_key found_key; 5100 struct btrfs_disk_key found_key;
@@ -5240,7 +5517,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5240 5517
5241 if (!left_start_ctransid || !right_start_ctransid) { 5518 if (!left_start_ctransid || !right_start_ctransid) {
5242 WARN(1, KERN_WARNING 5519 WARN(1, KERN_WARNING
5243 "btrfs: btrfs_compare_tree detected " 5520 "BTRFS: btrfs_compare_tree detected "
5244 "a change in one of the trees while " 5521 "a change in one of the trees while "
5245 "iterating. This is probably a " 5522 "iterating. This is probably a "
5246 "bug.\n"); 5523 "bug.\n");
@@ -5680,3 +5957,46 @@ int btrfs_previous_item(struct btrfs_root *root,
5680 } 5957 }
5681 return 1; 5958 return 1;
5682} 5959}
5960
5961/*
5963 * search in the extent tree to find a previous Metadata/Data extent item
5964 * with min objectid.
5964 *
5965 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5966 */
5967int btrfs_previous_extent_item(struct btrfs_root *root,
5968 struct btrfs_path *path, u64 min_objectid)
5969{
5970 struct btrfs_key found_key;
5971 struct extent_buffer *leaf;
5972 u32 nritems;
5973 int ret;
5974
5975 while (1) {
5976 if (path->slots[0] == 0) {
5977 btrfs_set_path_blocking(path);
5978 ret = btrfs_prev_leaf(root, path);
5979 if (ret != 0)
5980 return ret;
5981 } else {
5982 path->slots[0]--;
5983 }
5984 leaf = path->nodes[0];
5985 nritems = btrfs_header_nritems(leaf);
5986 if (nritems == 0)
5987 return 1;
5988 if (path->slots[0] == nritems)
5989 path->slots[0]--;
5990
5991 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5992 if (found_key.objectid < min_objectid)
5993 break;
5994 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5995 found_key.type == BTRFS_METADATA_ITEM_KEY)
5996 return 0;
5997 if (found_key.objectid == min_objectid &&
5998 found_key.type < BTRFS_EXTENT_ITEM_KEY)
5999 break;
6000 }
6001 return 1;
6002}
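
The loop above reduces to a simple backwards walk: move to the previous slot, fall back to the previous leaf when the current one is exhausted, and stop as soon as the objectid drops below the minimum. A compact userspace sketch of the same walk over fixed three-item leaves (the key-type constants are illustrative):

#define EXTENT_ITEM	168	/* illustrative stand-ins for the key types */
#define METADATA_ITEM	169

struct ekey { unsigned long long objectid; unsigned char type; };

/* walk backwards through fixed 3-item leaves; returns 0 when a
 * Metadata/Data extent item is found, 1 when the walk must stop */
static int previous_extent_item(const struct ekey leaves[][3], int leaf,
				int slot, unsigned long long min_objectid,
				struct ekey *out)
{
	while (1) {
		if (slot == 0) {
			if (--leaf < 0)		/* btrfs_prev_leaf() failed */
				return 1;
			slot = 3;		/* one past the last slot */
		}
		slot--;
		*out = leaves[leaf][slot];
		if (out->objectid < min_objectid)
			return 1;
		if (out->type == EXTENT_ITEM || out->type == METADATA_ITEM)
			return 0;
		if (out->objectid == min_objectid && out->type < EXTENT_ITEM)
			return 1;
	}
}
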
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 54ab86127f7a..2c1a42ca519f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -521,9 +521,15 @@ struct btrfs_super_block {
521#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) 521#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
522#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) 522#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
523#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) 523#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8)
524#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9)
524 525
525#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 526#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
527#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
528#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
526#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 529#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
530#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
531#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
532
527#define BTRFS_FEATURE_INCOMPAT_SUPP \ 533#define BTRFS_FEATURE_INCOMPAT_SUPP \
528 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ 534 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
529 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ 535 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
@@ -532,7 +538,12 @@ struct btrfs_super_block {
532 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ 538 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
533 BTRFS_FEATURE_INCOMPAT_RAID56 | \ 539 BTRFS_FEATURE_INCOMPAT_RAID56 | \
534 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ 540 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
535 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) 541 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
542 BTRFS_FEATURE_INCOMPAT_NO_HOLES)
543
544#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
545 (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
546#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
536 547
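
The new SAFE_SET/SAFE_CLEAR masks presumably tell the sysfs feature handlers which bits may be flipped while the filesystem is mounted; here only EXTENDED_IREF is safe to set online. A hypothetical check of that form (an illustration under that assumption, not the sysfs.c code):

/* a flag may be toggled online only if every bit of it is both
 * supported and listed in the corresponding safe mask */
static int feature_change_allowed(unsigned long long flag,
				  unsigned long long supported,
				  unsigned long long safe_mask)
{
	if (flag & ~supported)
		return 0;	/* unknown or unsupported feature bit */
	return (flag & ~safe_mask) == 0;
}
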
537/* 548/*
538 * A leaf is full of items. offset and size tell us where to find 549 * A leaf is full of items. offset and size tell us where to find
@@ -1094,7 +1105,7 @@ struct btrfs_qgroup_limit_item {
1094} __attribute__ ((__packed__)); 1105} __attribute__ ((__packed__));
1095 1106
1096struct btrfs_space_info { 1107struct btrfs_space_info {
1097 u64 flags; 1108 spinlock_t lock;
1098 1109
1099 u64 total_bytes; /* total bytes in the space, 1110 u64 total_bytes; /* total bytes in the space,
1100 this doesn't take mirrors into account */ 1111 this doesn't take mirrors into account */
@@ -1104,14 +1115,25 @@ struct btrfs_space_info {
1104 transaction finishes */ 1115 transaction finishes */
1105 u64 bytes_reserved; /* total bytes the allocator has reserved for 1116 u64 bytes_reserved; /* total bytes the allocator has reserved for
1106 current allocations */ 1117 current allocations */
1107 u64 bytes_readonly; /* total bytes that are read only */
1108
1109 u64 bytes_may_use; /* number of bytes that may be used for 1118 u64 bytes_may_use; /* number of bytes that may be used for
1110 delalloc/allocations */ 1119 delalloc/allocations */
1120 u64 bytes_readonly; /* total bytes that are read only */
1121
1122 unsigned int full:1; /* indicates that we cannot allocate any more
1123 chunks for this space */
1124 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
1125
1126 unsigned int flush:1; /* set if we are trying to make space */
1127
1128 unsigned int force_alloc; /* set if we need to force a chunk
1129 alloc for this space */
1130
1111 u64 disk_used; /* total bytes used on disk */ 1131 u64 disk_used; /* total bytes used on disk */
1112 u64 disk_total; /* total bytes on disk, takes mirrors into 1132 u64 disk_total; /* total bytes on disk, takes mirrors into
1113 account */ 1133 account */
1114 1134
1135 u64 flags;
1136
1115 /* 1137 /*
1116 * bytes_pinned is kept in line with what is actually pinned, as in 1138 * bytes_pinned is kept in line with what is actually pinned, as in
1117 * we've called update_block_group and dropped the bytes_used counter 1139 * we've called update_block_group and dropped the bytes_used counter
@@ -1124,22 +1146,15 @@ struct btrfs_space_info {
1124 */ 1146 */
1125 struct percpu_counter total_bytes_pinned; 1147 struct percpu_counter total_bytes_pinned;
1126 1148
1127 unsigned int full:1; /* indicates that we cannot allocate any more
1128 chunks for this space */
1129 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
1130
1131 unsigned int flush:1; /* set if we are trying to make space */
1132
1133 unsigned int force_alloc; /* set if we need to force a chunk
1134 alloc for this space */
1135
1136 struct list_head list; 1149 struct list_head list;
1137 1150
1151 struct rw_semaphore groups_sem;
1138 /* for block groups in our same type */ 1152 /* for block groups in our same type */
1139 struct list_head block_groups[BTRFS_NR_RAID_TYPES]; 1153 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
1140 spinlock_t lock;
1141 struct rw_semaphore groups_sem;
1142 wait_queue_head_t wait; 1154 wait_queue_head_t wait;
1155
1156 struct kobject kobj;
1157 struct kobject block_group_kobjs[BTRFS_NR_RAID_TYPES];
1143}; 1158};
1144 1159
1145#define BTRFS_BLOCK_RSV_GLOBAL 1 1160#define BTRFS_BLOCK_RSV_GLOBAL 1
@@ -1346,6 +1361,7 @@ struct btrfs_fs_info {
1346 1361
1347 u64 generation; 1362 u64 generation;
1348 u64 last_trans_committed; 1363 u64 last_trans_committed;
1364 u64 avg_delayed_ref_runtime;
1349 1365
1350 /* 1366 /*
1351 * this is updated to the current trans every time a full commit 1367 * this is updated to the current trans every time a full commit
@@ -1448,7 +1464,6 @@ struct btrfs_fs_info {
1448 spinlock_t tree_mod_seq_lock; 1464 spinlock_t tree_mod_seq_lock;
1449 atomic64_t tree_mod_seq; 1465 atomic64_t tree_mod_seq;
1450 struct list_head tree_mod_seq_list; 1466 struct list_head tree_mod_seq_list;
1451 struct seq_list tree_mod_seq_elem;
1452 1467
1453 /* this protects tree_mod_log */ 1468 /* this protects tree_mod_log */
1454 rwlock_t tree_mod_log_lock; 1469 rwlock_t tree_mod_log_lock;
@@ -1515,6 +1530,8 @@ struct btrfs_fs_info {
1515 int thread_pool_size; 1530 int thread_pool_size;
1516 1531
1517 struct kobject super_kobj; 1532 struct kobject super_kobj;
1533 struct kobject *space_info_kobj;
1534 struct kobject *device_dir_kobj;
1518 struct completion kobj_unregister; 1535 struct completion kobj_unregister;
1519 int do_barriers; 1536 int do_barriers;
1520 int closing; 1537 int closing;
@@ -1643,6 +1660,10 @@ struct btrfs_fs_info {
1643 spinlock_t reada_lock; 1660 spinlock_t reada_lock;
1644 struct radix_tree_root reada_tree; 1661 struct radix_tree_root reada_tree;
1645 1662
1663 /* Extent buffer radix tree */
1664 spinlock_t buffer_lock;
1665 struct radix_tree_root buffer_radix;
1666
1646 /* next backup root to be overwritten */ 1667 /* next backup root to be overwritten */
1647 int backup_root_index; 1668 int backup_root_index;
1648 1669
@@ -1795,6 +1816,12 @@ struct btrfs_root {
1795 struct list_head ordered_extents; 1816 struct list_head ordered_extents;
1796 struct list_head ordered_root; 1817 struct list_head ordered_root;
1797 u64 nr_ordered_extents; 1818 u64 nr_ordered_extents;
1819
1820 /*
1821 * Number of currently running SEND ioctls to prevent
1822 * manipulation with the read-only status via SUBVOL_SETFLAGS
1823 */
1824 int send_in_progress;
1798}; 1825};
1799 1826
1800struct btrfs_ioctl_defrag_range_args { 1827struct btrfs_ioctl_defrag_range_args {
@@ -1997,6 +2024,7 @@ struct btrfs_ioctl_defrag_range_args {
1997#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) 2024#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
1998#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) 2025#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
1999#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) 2026#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
2027#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
2000 2028
2001#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 2029#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
2002 2030
@@ -2925,6 +2953,10 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
2925 struct btrfs_file_extent_item, generation, 64); 2953 struct btrfs_file_extent_item, generation, 64);
2926BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, 2954BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
2927 struct btrfs_file_extent_item, num_bytes, 64); 2955 struct btrfs_file_extent_item, num_bytes, 64);
2956BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
2957 struct btrfs_file_extent_item, disk_num_bytes, 64);
2958BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
2959 struct btrfs_file_extent_item, compression, 8);
2928 2960
2929static inline unsigned long 2961static inline unsigned long
2930btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) 2962btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
@@ -2958,15 +2990,6 @@ BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
2958BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, 2990BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
2959 other_encoding, 16); 2991 other_encoding, 16);
2960 2992
2961/* this returns the number of file bytes represented by the inline item.
2962 * If an item is compressed, this is the uncompressed size
2963 */
2964static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
2965 struct btrfs_file_extent_item *e)
2966{
2967 return btrfs_file_extent_ram_bytes(eb, e);
2968}
2969
2970/* 2993/*
2971 * this returns the number of bytes used by the item on disk, minus the 2994 * this returns the number of bytes used by the item on disk, minus the
2972 * size of any extent headers. If a file is compressed on disk, this is 2995 * size of any extent headers. If a file is compressed on disk, this is
@@ -2980,6 +3003,32 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
2980 return btrfs_item_size(eb, e) - offset; 3003 return btrfs_item_size(eb, e) - offset;
2981} 3004}
2982 3005
3006/* this returns the number of file bytes represented by the inline item.
3007 * If an item is compressed, this is the uncompressed size
3008 */
3009static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
3010 int slot,
3011 struct btrfs_file_extent_item *fi)
3012{
3013 struct btrfs_map_token token;
3014
3015 btrfs_init_map_token(&token);
3016 /*
3017 * return the space used on disk if this item isn't
3018 * compressed or encoded
3019 */
3020 if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 &&
3021 btrfs_token_file_extent_encryption(eb, fi, &token) == 0 &&
3022 btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) {
3023 return btrfs_file_extent_inline_item_len(eb,
3024 btrfs_item_nr(slot));
3025 }
3026
3027 /* otherwise use the ram bytes field */
3028 return btrfs_token_file_extent_ram_bytes(eb, fi, &token);
3029}
3030
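
The new btrfs_file_extent_inline_len() encodes a single decision: for a plain inline extent the length is the item payload minus the extent header, while a compressed or otherwise encoded extent must report ram_bytes, the uncompressed size. The same decision as a self-contained sketch, with a simplified stand-in for the on-disk item:

struct inline_extent {
	unsigned char compression;	/* 0 = uncompressed */
	unsigned char encryption;	/* 0 = unencrypted */
	unsigned short other_encoding;	/* 0 = no other encoding */
	unsigned int item_len;		/* total item payload size */
	unsigned int header_len;	/* bytes before the inline data */
	unsigned int ram_bytes;		/* uncompressed byte count */
};

static unsigned int inline_len(const struct inline_extent *e)
{
	/* plain data: the length is what is physically stored */
	if (e->compression == 0 && e->encryption == 0 &&
	    e->other_encoding == 0)
		return e->item_len - e->header_len;
	/* compressed/encoded: report the uncompressed size */
	return e->ram_bytes;
}
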
3031
2983/* btrfs_dev_stats_item */ 3032/* btrfs_dev_stats_item */
2984static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, 3033static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
2985 struct btrfs_dev_stats_item *ptr, 3034 struct btrfs_dev_stats_item *ptr,
@@ -3143,6 +3192,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
3143 3192
3144int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 3193int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
3145 struct btrfs_root *root); 3194 struct btrfs_root *root);
3195int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
3196 struct btrfs_root *root);
3146void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3197void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3147int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 3198int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3148 struct btrfs_root *root, unsigned long count); 3199 struct btrfs_root *root, unsigned long count);
@@ -3163,6 +3214,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
3163 struct btrfs_fs_info *info, 3214 struct btrfs_fs_info *info,
3164 u64 bytenr); 3215 u64 bytenr);
3165void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3216void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3217int get_block_group_index(struct btrfs_block_group_cache *cache);
3166struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 3218struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3167 struct btrfs_root *root, u32 blocksize, 3219 struct btrfs_root *root, u32 blocksize,
3168 u64 parent, u64 root_objectid, 3220 u64 parent, u64 root_objectid,
@@ -3301,6 +3353,8 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
3301int btrfs_previous_item(struct btrfs_root *root, 3353int btrfs_previous_item(struct btrfs_root *root,
3302 struct btrfs_path *path, u64 min_objectid, 3354 struct btrfs_path *path, u64 min_objectid,
3303 int type); 3355 int type);
3356int btrfs_previous_extent_item(struct btrfs_root *root,
3357 struct btrfs_path *path, u64 min_objectid);
3304void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, 3358void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
3305 struct btrfs_key *new_key); 3359 struct btrfs_key *new_key);
3306struct extent_buffer *btrfs_root_node(struct btrfs_root *root); 3360struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
@@ -3350,6 +3404,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3350 struct btrfs_root *root, 3404 struct btrfs_root *root,
3351 struct btrfs_path *path, 3405 struct btrfs_path *path,
3352 struct btrfs_key *new_key); 3406 struct btrfs_key *new_key);
3407int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
3408 u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key);
3353int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root 3409int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
3354 *root, struct btrfs_key *key, struct btrfs_path *p, int 3410 *root, struct btrfs_key *key, struct btrfs_path *p, int
3355 ins_len, int cow); 3411 ins_len, int cow);
@@ -3399,6 +3455,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
3399} 3455}
3400 3456
3401int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 3457int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
3458int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
3402int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 3459int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
3403 u64 time_seq); 3460 u64 time_seq);
3404static inline int btrfs_next_old_item(struct btrfs_root *root, 3461static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3563,12 +3620,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
3563 struct btrfs_root *root, 3620 struct btrfs_root *root,
3564 const char *name, int name_len, 3621 const char *name, int name_len,
3565 u64 inode_objectid, u64 ref_objectid, u64 *index); 3622 u64 inode_objectid, u64 ref_objectid, u64 *index);
3566int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
3567 struct btrfs_root *root,
3568 struct btrfs_path *path,
3569 const char *name, int name_len,
3570 u64 inode_objectid, u64 ref_objectid, int mod,
3571 u64 *ret_index);
3572int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 3623int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
3573 struct btrfs_root *root, 3624 struct btrfs_root *root,
3574 struct btrfs_path *path, u64 objectid); 3625 struct btrfs_path *path, u64 objectid);
@@ -3676,7 +3727,9 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput);
3676int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 3727int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
3677 struct extent_state **cached_state); 3728 struct extent_state **cached_state);
3678int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3729int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
3679 struct btrfs_root *new_root, u64 new_dirid); 3730 struct btrfs_root *new_root,
3731 struct btrfs_root *parent_root,
3732 u64 new_dirid);
3680int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, 3733int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
3681 size_t size, struct bio *bio, 3734 size_t size, struct bio *bio,
3682 unsigned long bio_flags); 3735 unsigned long bio_flags);
@@ -3745,7 +3798,10 @@ extern const struct file_operations btrfs_file_operations;
3745int __btrfs_drop_extents(struct btrfs_trans_handle *trans, 3798int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
3746 struct btrfs_root *root, struct inode *inode, 3799 struct btrfs_root *root, struct inode *inode,
3747 struct btrfs_path *path, u64 start, u64 end, 3800 struct btrfs_path *path, u64 start, u64 end,
3748 u64 *drop_end, int drop_cache); 3801 u64 *drop_end, int drop_cache,
3802 int replace_extent,
3803 u32 extent_item_size,
3804 int *key_inserted);
3749int btrfs_drop_extents(struct btrfs_trans_handle *trans, 3805int btrfs_drop_extents(struct btrfs_trans_handle *trans,
3750 struct btrfs_root *root, struct inode *inode, u64 start, 3806 struct btrfs_root *root, struct inode *inode, u64 start,
3751 u64 end, int drop_cache); 3807 u64 end, int drop_cache);
@@ -3764,6 +3820,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
3764/* sysfs.c */ 3820/* sysfs.c */
3765int btrfs_init_sysfs(void); 3821int btrfs_init_sysfs(void);
3766void btrfs_exit_sysfs(void); 3822void btrfs_exit_sysfs(void);
3823int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
3824void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
3767 3825
3768/* xattr.c */ 3826/* xattr.c */
3769ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 3827ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -3796,14 +3854,20 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
3796 btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) 3854 btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
3797#define btrfs_info(fs_info, fmt, args...) \ 3855#define btrfs_info(fs_info, fmt, args...) \
3798 btrfs_printk(fs_info, KERN_INFO fmt, ##args) 3856 btrfs_printk(fs_info, KERN_INFO fmt, ##args)
3857
3858#ifdef DEBUG
3799#define btrfs_debug(fs_info, fmt, args...) \ 3859#define btrfs_debug(fs_info, fmt, args...) \
3800 btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) 3860 btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
3861#else
3862#define btrfs_debug(fs_info, fmt, args...) \
3863 no_printk(KERN_DEBUG fmt, ##args)
3864#endif
3801 3865
3802#ifdef CONFIG_BTRFS_ASSERT 3866#ifdef CONFIG_BTRFS_ASSERT
3803 3867
3804static inline void assfail(char *expr, char *file, int line) 3868static inline void assfail(char *expr, char *file, int line)
3805{ 3869{
3806 printk(KERN_ERR "BTRFS assertion failed: %s, file: %s, line: %d", 3870 pr_err("BTRFS: assertion failed: %s, file: %s, line: %d",
3807 expr, file, line); 3871 expr, file, line);
3808 BUG(); 3872 BUG();
3809} 3873}
@@ -3841,7 +3905,7 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
3841 if (!(features & flag)) { 3905 if (!(features & flag)) {
3842 features |= flag; 3906 features |= flag;
3843 btrfs_set_super_incompat_flags(disk_super, features); 3907 btrfs_set_super_incompat_flags(disk_super, features);
3844 printk(KERN_INFO "btrfs: setting %llu feature flag\n", 3908 btrfs_info(fs_info, "setting %llu feature flag",
3845 flag); 3909 flag);
3846 } 3910 }
3847 spin_unlock(&fs_info->super_lock); 3911 spin_unlock(&fs_info->super_lock);
@@ -3899,20 +3963,17 @@ do { \
3899/* acl.c */ 3963/* acl.c */
3900#ifdef CONFIG_BTRFS_FS_POSIX_ACL 3964#ifdef CONFIG_BTRFS_FS_POSIX_ACL
3901struct posix_acl *btrfs_get_acl(struct inode *inode, int type); 3965struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
3966int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
3902int btrfs_init_acl(struct btrfs_trans_handle *trans, 3967int btrfs_init_acl(struct btrfs_trans_handle *trans,
3903 struct inode *inode, struct inode *dir); 3968 struct inode *inode, struct inode *dir);
3904int btrfs_acl_chmod(struct inode *inode);
3905#else 3969#else
3906#define btrfs_get_acl NULL 3970#define btrfs_get_acl NULL
3971#define btrfs_set_acl NULL
3907static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, 3972static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
3908 struct inode *inode, struct inode *dir) 3973 struct inode *inode, struct inode *dir)
3909{ 3974{
3910 return 0; 3975 return 0;
3911} 3976}
3912static inline int btrfs_acl_chmod(struct inode *inode)
3913{
3914 return 0;
3915}
3916#endif 3977#endif
3917 3978
3918/* relocation.c */ 3979/* relocation.c */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 8d292fbae659..451b00c86f6c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -55,8 +55,7 @@ static inline void btrfs_init_delayed_node(
55 delayed_node->inode_id = inode_id; 55 delayed_node->inode_id = inode_id;
56 atomic_set(&delayed_node->refs, 0); 56 atomic_set(&delayed_node->refs, 0);
57 delayed_node->count = 0; 57 delayed_node->count = 0;
58 delayed_node->in_list = 0; 58 delayed_node->flags = 0;
59 delayed_node->inode_dirty = 0;
60 delayed_node->ins_root = RB_ROOT; 59 delayed_node->ins_root = RB_ROOT;
61 delayed_node->del_root = RB_ROOT; 60 delayed_node->del_root = RB_ROOT;
62 mutex_init(&delayed_node->mutex); 61 mutex_init(&delayed_node->mutex);
@@ -172,7 +171,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
172 int mod) 171 int mod)
173{ 172{
174 spin_lock(&root->lock); 173 spin_lock(&root->lock);
175 if (node->in_list) { 174 if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
176 if (!list_empty(&node->p_list)) 175 if (!list_empty(&node->p_list))
177 list_move_tail(&node->p_list, &root->prepare_list); 176 list_move_tail(&node->p_list, &root->prepare_list);
178 else if (mod) 177 else if (mod)
@@ -182,7 +181,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
182 list_add_tail(&node->p_list, &root->prepare_list); 181 list_add_tail(&node->p_list, &root->prepare_list);
183 atomic_inc(&node->refs); /* inserted into list */ 182 atomic_inc(&node->refs); /* inserted into list */
184 root->nodes++; 183 root->nodes++;
185 node->in_list = 1; 184 set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
186 } 185 }
187 spin_unlock(&root->lock); 186 spin_unlock(&root->lock);
188} 187}
@@ -192,13 +191,13 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
192 struct btrfs_delayed_node *node) 191 struct btrfs_delayed_node *node)
193{ 192{
194 spin_lock(&root->lock); 193 spin_lock(&root->lock);
195 if (node->in_list) { 194 if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
196 root->nodes--; 195 root->nodes--;
197 atomic_dec(&node->refs); /* not in the list */ 196 atomic_dec(&node->refs); /* not in the list */
198 list_del_init(&node->n_list); 197 list_del_init(&node->n_list);
199 if (!list_empty(&node->p_list)) 198 if (!list_empty(&node->p_list))
200 list_del_init(&node->p_list); 199 list_del_init(&node->p_list);
201 node->in_list = 0; 200 clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
202 } 201 }
203 spin_unlock(&root->lock); 202 spin_unlock(&root->lock);
204} 203}
@@ -231,7 +230,8 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
231 230
232 delayed_root = node->root->fs_info->delayed_root; 231 delayed_root = node->root->fs_info->delayed_root;
233 spin_lock(&delayed_root->lock); 232 spin_lock(&delayed_root->lock);
234 if (!node->in_list) { /* not in the list */ 233 if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
234 /* not in the list */
235 if (list_empty(&delayed_root->node_list)) 235 if (list_empty(&delayed_root->node_list))
236 goto out; 236 goto out;
237 p = delayed_root->node_list.next; 237 p = delayed_root->node_list.next;
@@ -1004,9 +1004,10 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
1004{ 1004{
1005 struct btrfs_delayed_root *delayed_root; 1005 struct btrfs_delayed_root *delayed_root;
1006 1006
1007 if (delayed_node && delayed_node->inode_dirty) { 1007 if (delayed_node &&
1008 test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1008 BUG_ON(!delayed_node->root); 1009 BUG_ON(!delayed_node->root);
1009 delayed_node->inode_dirty = 0; 1010 clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
1010 delayed_node->count--; 1011 delayed_node->count--;
1011 1012
1012 delayed_root = delayed_node->root->fs_info->delayed_root; 1013 delayed_root = delayed_node->root->fs_info->delayed_root;
@@ -1014,6 +1015,18 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
1014 } 1015 }
1015} 1016}
1016 1017
1018static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
1019{
1020 struct btrfs_delayed_root *delayed_root;
1021
1022 ASSERT(delayed_node->root);
1023 clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
1024 delayed_node->count--;
1025
1026 delayed_root = delayed_node->root->fs_info->delayed_root;
1027 finish_one_item(delayed_root);
1028}
1029
1017static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, 1030static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1018 struct btrfs_root *root, 1031 struct btrfs_root *root,
1019 struct btrfs_path *path, 1032 struct btrfs_path *path,
@@ -1022,13 +1035,19 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1022 struct btrfs_key key; 1035 struct btrfs_key key;
1023 struct btrfs_inode_item *inode_item; 1036 struct btrfs_inode_item *inode_item;
1024 struct extent_buffer *leaf; 1037 struct extent_buffer *leaf;
1038 int mod;
1025 int ret; 1039 int ret;
1026 1040
1027 key.objectid = node->inode_id; 1041 key.objectid = node->inode_id;
1028 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 1042 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
1029 key.offset = 0; 1043 key.offset = 0;
1030 1044
1031 ret = btrfs_lookup_inode(trans, root, path, &key, 1); 1045 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
1046 mod = -1;
1047 else
1048 mod = 1;
1049
1050 ret = btrfs_lookup_inode(trans, root, path, &key, mod);
1032 if (ret > 0) { 1051 if (ret > 0) {
1033 btrfs_release_path(path); 1052 btrfs_release_path(path);
1034 return -ENOENT; 1053 return -ENOENT;
@@ -1036,19 +1055,58 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1036 return ret; 1055 return ret;
1037 } 1056 }
1038 1057
1039 btrfs_unlock_up_safe(path, 1);
1040 leaf = path->nodes[0]; 1058 leaf = path->nodes[0];
1041 inode_item = btrfs_item_ptr(leaf, path->slots[0], 1059 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1042 struct btrfs_inode_item); 1060 struct btrfs_inode_item);
1043 write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, 1061 write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
1044 sizeof(struct btrfs_inode_item)); 1062 sizeof(struct btrfs_inode_item));
1045 btrfs_mark_buffer_dirty(leaf); 1063 btrfs_mark_buffer_dirty(leaf);
1046 btrfs_release_path(path);
1047 1064
1065 if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
1066 goto no_iref;
1067
1068 path->slots[0]++;
1069 if (path->slots[0] >= btrfs_header_nritems(leaf))
1070 goto search;
1071again:
1072 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1073 if (key.objectid != node->inode_id)
1074 goto out;
1075
1076 if (key.type != BTRFS_INODE_REF_KEY &&
1077 key.type != BTRFS_INODE_EXTREF_KEY)
1078 goto out;
1079
1080 /*
 1081 * Delayed iref deletion is only done for an inode that has a
 1082 * single link, so there is exactly one iref; the case of several
 1083 * irefs sharing the same item doesn't exist.
1084 */
1085 btrfs_del_item(trans, root, path);
1086out:
1087 btrfs_release_delayed_iref(node);
1088no_iref:
1089 btrfs_release_path(path);
1090err_out:
1048 btrfs_delayed_inode_release_metadata(root, node); 1091 btrfs_delayed_inode_release_metadata(root, node);
1049 btrfs_release_delayed_inode(node); 1092 btrfs_release_delayed_inode(node);
1050 1093
1051 return 0; 1094 return ret;
1095
1096search:
1097 btrfs_release_path(path);
1098
1099 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
1100 key.offset = -1;
1101 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1102 if (ret < 0)
1103 goto err_out;
1104 ASSERT(ret);
1105
1106 ret = 0;
1107 leaf = path->nodes[0];
1108 path->slots[0]--;
1109 goto again;
1052} 1110}
1053 1111
1054static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, 1112static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -1059,7 +1117,7 @@ static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1059 int ret; 1117 int ret;
1060 1118
1061 mutex_lock(&node->mutex); 1119 mutex_lock(&node->mutex);
1062 if (!node->inode_dirty) { 1120 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
1063 mutex_unlock(&node->mutex); 1121 mutex_unlock(&node->mutex);
1064 return 0; 1122 return 0;
1065 } 1123 }
@@ -1203,7 +1261,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode)
1203 return 0; 1261 return 0;
1204 1262
1205 mutex_lock(&delayed_node->mutex); 1263 mutex_lock(&delayed_node->mutex);
1206 if (!delayed_node->inode_dirty) { 1264 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1207 mutex_unlock(&delayed_node->mutex); 1265 mutex_unlock(&delayed_node->mutex);
1208 btrfs_release_delayed_node(delayed_node); 1266 btrfs_release_delayed_node(delayed_node);
1209 return 0; 1267 return 0;
@@ -1227,7 +1285,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode)
1227 trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; 1285 trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
1228 1286
1229 mutex_lock(&delayed_node->mutex); 1287 mutex_lock(&delayed_node->mutex);
1230 if (delayed_node->inode_dirty) 1288 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
1231 ret = __btrfs_update_delayed_inode(trans, delayed_node->root, 1289 ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
1232 path, delayed_node); 1290 path, delayed_node);
1233 else 1291 else
@@ -1300,36 +1358,9 @@ again:
1300 trans->block_rsv = &root->fs_info->delayed_block_rsv; 1358 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1301 1359
1302 __btrfs_commit_inode_delayed_items(trans, path, delayed_node); 1360 __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
1303 /*
1304 * Maybe new delayed items have been inserted, so we need requeue
1305 * the work. Besides that, we must dequeue the empty delayed nodes
1306 * to avoid the race between delayed items balance and the worker.
1307 * The race like this:
1308 * Task1 Worker thread
1309 * count == 0, needn't requeue
1310 * also needn't insert the
1311 * delayed node into prepare
1312 * list again.
1313 * add lots of delayed items
1314 * queue the delayed node
1315 * already in the list,
1316 * and not in the prepare
1317 * list, it means the delayed
1318 * node is being dealt with
1319 * by the worker.
1320 * do delayed items balance
1321 * the delayed node is being
1322 * dealt with by the worker
1323 * now, just wait.
1324 * the worker goto idle.
1325 * Task1 will sleep until the transaction is commited.
1326 */
1327 mutex_lock(&delayed_node->mutex);
1328 btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node);
1329 mutex_unlock(&delayed_node->mutex);
1330 1361
1331 trans->block_rsv = block_rsv; 1362 trans->block_rsv = block_rsv;
1332 btrfs_end_transaction_dmeta(trans, root); 1363 btrfs_end_transaction(trans, root);
1333 btrfs_btree_balance_dirty_nodelay(root); 1364 btrfs_btree_balance_dirty_nodelay(root);
1334 1365
1335release_path: 1366release_path:
@@ -1376,52 +1407,41 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
1376 WARN_ON(btrfs_first_delayed_node(delayed_root)); 1407 WARN_ON(btrfs_first_delayed_node(delayed_root));
1377} 1408}
1378 1409
1379static int refs_newer(struct btrfs_delayed_root *delayed_root, 1410static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
1380 int seq, int count)
1381{ 1411{
1382 int val = atomic_read(&delayed_root->items_seq); 1412 int val = atomic_read(&delayed_root->items_seq);
1383 1413
1384 if (val < seq || val >= seq + count) 1414 if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
1415 return 1;
1416
1417 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
1385 return 1; 1418 return 1;
1419
1386 return 0; 1420 return 0;
1387} 1421}
1388 1422
1389void btrfs_balance_delayed_items(struct btrfs_root *root) 1423void btrfs_balance_delayed_items(struct btrfs_root *root)
1390{ 1424{
1391 struct btrfs_delayed_root *delayed_root; 1425 struct btrfs_delayed_root *delayed_root;
1392 int seq;
1393 1426
1394 delayed_root = btrfs_get_delayed_root(root); 1427 delayed_root = btrfs_get_delayed_root(root);
1395 1428
1396 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) 1429 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
1397 return; 1430 return;
1398 1431
1399 seq = atomic_read(&delayed_root->items_seq);
1400
1401 if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { 1432 if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
1433 int seq;
1402 int ret; 1434 int ret;
1403 DEFINE_WAIT(__wait); 1435
1436 seq = atomic_read(&delayed_root->items_seq);
1404 1437
1405 ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); 1438 ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);
1406 if (ret) 1439 if (ret)
1407 return; 1440 return;
1408 1441
1409 while (1) { 1442 wait_event_interruptible(delayed_root->wait,
1410 prepare_to_wait(&delayed_root->wait, &__wait, 1443 could_end_wait(delayed_root, seq));
1411 TASK_INTERRUPTIBLE); 1444 return;
1412
1413 if (refs_newer(delayed_root, seq,
1414 BTRFS_DELAYED_BATCH) ||
1415 atomic_read(&delayed_root->items) <
1416 BTRFS_DELAYED_BACKGROUND) {
1417 break;
1418 }
1419 if (!signal_pending(current))
1420 schedule();
1421 else
1422 break;
1423 }
1424 finish_wait(&delayed_root->wait, &__wait);
1425 } 1445 }
1426 1446
1427 btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); 1447 btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);
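
The rewrite above replaces an open-coded prepare_to_wait()/schedule() loop with wait_event_interruptible() driven by one predicate, could_end_wait(). A pthread analogue of the same shape, with the predicate checking the two counters the kernel version reads (the constants are illustrative):

#include <pthread.h>

#define DELAYED_BATCH		16
#define DELAYED_BACKGROUND	128

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int items_seq;	/* bumped each time an item completes */
static int items;	/* items still pending */

/* same predicate as could_end_wait(): enough progress since 'seq',
 * or the backlog has dropped below the background threshold */
static int could_end_wait(int seq)
{
	return items_seq < seq || items_seq >= seq + DELAYED_BATCH ||
	       items < DELAYED_BACKGROUND;
}

static void wait_for_progress(int seq)
{
	pthread_mutex_lock(&lock);
	while (!could_end_wait(seq))	/* wait_event_interruptible() */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

Wakers then only need to signal the condition variable after updating the counters, much as the kernel side wakes delayed_root->wait.
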
@@ -1472,9 +1492,9 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1472 mutex_lock(&delayed_node->mutex); 1492 mutex_lock(&delayed_node->mutex);
1473 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); 1493 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
1474 if (unlikely(ret)) { 1494 if (unlikely(ret)) {
1475 printk(KERN_ERR "err add delayed dir index item(name: %.*s) " 1495 btrfs_err(root->fs_info, "err add delayed dir index item(name: %.*s) "
1476 "into the insertion tree of the delayed node" 1496 "into the insertion tree of the delayed node"
1477 "(root id: %llu, inode id: %llu, errno: %d)\n", 1497 "(root id: %llu, inode id: %llu, errno: %d)",
1478 name_len, name, delayed_node->root->objectid, 1498 name_len, name, delayed_node->root->objectid,
1479 delayed_node->inode_id, ret); 1499 delayed_node->inode_id, ret);
1480 BUG(); 1500 BUG();
@@ -1544,9 +1564,9 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
1544 mutex_lock(&node->mutex); 1564 mutex_lock(&node->mutex);
1545 ret = __btrfs_add_delayed_deletion_item(node, item); 1565 ret = __btrfs_add_delayed_deletion_item(node, item);
1546 if (unlikely(ret)) { 1566 if (unlikely(ret)) {
1547 printk(KERN_ERR "err add delayed dir index item(index: %llu) " 1567 btrfs_err(root->fs_info, "err add delayed dir index item(index: %llu) "
1548 "into the deletion tree of the delayed node" 1568 "into the deletion tree of the delayed node"
1549 "(root id: %llu, inode id: %llu, errno: %d)\n", 1569 "(root id: %llu, inode id: %llu, errno: %d)",
1550 index, node->root->objectid, node->inode_id, 1570 index, node->root->objectid, node->inode_id,
1551 ret); 1571 ret);
1552 BUG(); 1572 BUG();
@@ -1759,7 +1779,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1759 return -ENOENT; 1779 return -ENOENT;
1760 1780
1761 mutex_lock(&delayed_node->mutex); 1781 mutex_lock(&delayed_node->mutex);
1762 if (!delayed_node->inode_dirty) { 1782 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1763 mutex_unlock(&delayed_node->mutex); 1783 mutex_unlock(&delayed_node->mutex);
1764 btrfs_release_delayed_node(delayed_node); 1784 btrfs_release_delayed_node(delayed_node);
1765 return -ENOENT; 1785 return -ENOENT;
@@ -1810,7 +1830,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1810 return PTR_ERR(delayed_node); 1830 return PTR_ERR(delayed_node);
1811 1831
1812 mutex_lock(&delayed_node->mutex); 1832 mutex_lock(&delayed_node->mutex);
1813 if (delayed_node->inode_dirty) { 1833 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1814 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1834 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1815 goto release_node; 1835 goto release_node;
1816 } 1836 }
@@ -1821,7 +1841,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1821 goto release_node; 1841 goto release_node;
1822 1842
1823 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1843 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1824 delayed_node->inode_dirty = 1; 1844 set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
1825 delayed_node->count++; 1845 delayed_node->count++;
1826 atomic_inc(&root->fs_info->delayed_root->items); 1846 atomic_inc(&root->fs_info->delayed_root->items);
1827release_node: 1847release_node:
@@ -1830,6 +1850,41 @@ release_node:
1830 return ret; 1850 return ret;
1831} 1851}
1832 1852
1853int btrfs_delayed_delete_inode_ref(struct inode *inode)
1854{
1855 struct btrfs_delayed_node *delayed_node;
1856
1857 delayed_node = btrfs_get_or_create_delayed_node(inode);
1858 if (IS_ERR(delayed_node))
1859 return PTR_ERR(delayed_node);
1860
 1861	/*
 1862	 * We don't reserve space for inode ref deletion because:
 1863	 * - We ONLY do async inode ref deletion for an inode that has only
 1864	 *   one link (i_nlink == 1), which means there is only one inode ref.
 1865	 *   And in most cases, the inode ref and the inode item are in the
 1866	 *   same leaf, so we deal with them at the same time.
 1867	 *   Since we are sure we will reserve space for the inode item,
 1868	 *   it is unnecessary to reserve space for inode ref deletion.
 1869	 * - If the inode ref and the inode item are not in the same leaf,
 1870	 *   we still needn't worry about an enospc problem, because we reserve
 1871	 *   much more space for the inode update than it needs.
 1872	 * - At worst, we can steal some space from the global reservation.
 1873	 *   That is very rare.
 1874	 */
1875 mutex_lock(&delayed_node->mutex);
1876 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
1877 goto release_node;
1878
1879 set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
1880 delayed_node->count++;
1881 atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items);
1882release_node:
1883 mutex_unlock(&delayed_node->mutex);
1884 btrfs_release_delayed_node(delayed_node);
1885 return 0;
1886}
1887
1833static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) 1888static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
1834{ 1889{
1835 struct btrfs_root *root = delayed_node->root; 1890 struct btrfs_root *root = delayed_node->root;
@@ -1852,7 +1907,10 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
1852 btrfs_release_delayed_item(prev_item); 1907 btrfs_release_delayed_item(prev_item);
1853 } 1908 }
1854 1909
1855 if (delayed_node->inode_dirty) { 1910 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
1911 btrfs_release_delayed_iref(delayed_node);
1912
1913 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
1856 btrfs_delayed_inode_release_metadata(root, delayed_node); 1914 btrfs_delayed_inode_release_metadata(root, delayed_node);
1857 btrfs_release_delayed_inode(delayed_node); 1915 btrfs_release_delayed_inode(delayed_node);
1858 } 1916 }
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index a4b38f934d14..f70119f25421 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -48,6 +48,10 @@ struct btrfs_delayed_root {
48 wait_queue_head_t wait; 48 wait_queue_head_t wait;
49}; 49};
50 50
51#define BTRFS_DELAYED_NODE_IN_LIST 0
52#define BTRFS_DELAYED_NODE_INODE_DIRTY 1
53#define BTRFS_DELAYED_NODE_DEL_IREF 2
54
51struct btrfs_delayed_node { 55struct btrfs_delayed_node {
52 u64 inode_id; 56 u64 inode_id;
53 u64 bytes_reserved; 57 u64 bytes_reserved;
@@ -65,8 +69,7 @@ struct btrfs_delayed_node {
65 struct btrfs_inode_item inode_item; 69 struct btrfs_inode_item inode_item;
66 atomic_t refs; 70 atomic_t refs;
67 u64 index_cnt; 71 u64 index_cnt;
68 bool in_list; 72 unsigned long flags;
69 bool inode_dirty;
70 int count; 73 int count;
71}; 74};
72 75
@@ -125,6 +128,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode);
125int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, 128int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
126 struct btrfs_root *root, struct inode *inode); 129 struct btrfs_root *root, struct inode *inode);
127int btrfs_fill_inode(struct inode *inode, u32 *rdev); 130int btrfs_fill_inode(struct inode *inode, u32 *rdev);
131int btrfs_delayed_delete_inode_ref(struct inode *inode);
128 132
129/* Used for drop dead root */ 133/* Used for drop dead root */
130void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); 134void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e4d467be2dd4..f3bff89eecf0 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -161,35 +161,61 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
161 return NULL; 161 return NULL;
162} 162}
163 163
164/* insert a new ref to head ref rbtree */
165static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
166 struct rb_node *node)
167{
168 struct rb_node **p = &root->rb_node;
169 struct rb_node *parent_node = NULL;
170 struct btrfs_delayed_ref_head *entry;
171 struct btrfs_delayed_ref_head *ins;
172 u64 bytenr;
173
174 ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
175 bytenr = ins->node.bytenr;
176 while (*p) {
177 parent_node = *p;
178 entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
179 href_node);
180
181 if (bytenr < entry->node.bytenr)
182 p = &(*p)->rb_left;
183 else if (bytenr > entry->node.bytenr)
184 p = &(*p)->rb_right;
185 else
186 return entry;
187 }
188
189 rb_link_node(node, parent_node, p);
190 rb_insert_color(node, root);
191 return NULL;
192}
193
164/* 194/*
 165 * find a head entry based on bytenr. This returns the delayed ref 195 * find a head entry based on bytenr. This returns the delayed ref
 166 * head if it was able to find one, or NULL if nothing was in that spot. 196 * head if it was able to find one, or NULL if nothing was in that spot.
167 * If return_bigger is given, the next bigger entry is returned if no exact 197 * If return_bigger is given, the next bigger entry is returned if no exact
168 * match is found. 198 * match is found.
169 */ 199 */
170static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, 200static struct btrfs_delayed_ref_head *
171 u64 bytenr, 201find_ref_head(struct rb_root *root, u64 bytenr,
172 struct btrfs_delayed_ref_node **last, 202 struct btrfs_delayed_ref_head **last, int return_bigger)
173 int return_bigger)
174{ 203{
175 struct rb_node *n; 204 struct rb_node *n;
176 struct btrfs_delayed_ref_node *entry; 205 struct btrfs_delayed_ref_head *entry;
177 int cmp = 0; 206 int cmp = 0;
178 207
179again: 208again:
180 n = root->rb_node; 209 n = root->rb_node;
181 entry = NULL; 210 entry = NULL;
182 while (n) { 211 while (n) {
183 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 212 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
184 WARN_ON(!entry->in_tree);
185 if (last) 213 if (last)
186 *last = entry; 214 *last = entry;
187 215
188 if (bytenr < entry->bytenr) 216 if (bytenr < entry->node.bytenr)
189 cmp = -1; 217 cmp = -1;
190 else if (bytenr > entry->bytenr) 218 else if (bytenr > entry->node.bytenr)
191 cmp = 1;
192 else if (!btrfs_delayed_ref_is_head(entry))
193 cmp = 1; 219 cmp = 1;
194 else 220 else
195 cmp = 0; 221 cmp = 0;
@@ -203,12 +229,12 @@ again:
203 } 229 }
204 if (entry && return_bigger) { 230 if (entry && return_bigger) {
205 if (cmp > 0) { 231 if (cmp > 0) {
206 n = rb_next(&entry->rb_node); 232 n = rb_next(&entry->href_node);
207 if (!n) 233 if (!n)
208 n = rb_first(root); 234 n = rb_first(root);
209 entry = rb_entry(n, struct btrfs_delayed_ref_node, 235 entry = rb_entry(n, struct btrfs_delayed_ref_head,
210 rb_node); 236 href_node);
211 bytenr = entry->bytenr; 237 bytenr = entry->node.bytenr;
212 return_bigger = 0; 238 return_bigger = 0;
213 goto again; 239 goto again;
214 } 240 }
@@ -243,33 +269,38 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
243 269
244static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, 270static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
245 struct btrfs_delayed_ref_root *delayed_refs, 271 struct btrfs_delayed_ref_root *delayed_refs,
272 struct btrfs_delayed_ref_head *head,
246 struct btrfs_delayed_ref_node *ref) 273 struct btrfs_delayed_ref_node *ref)
247{ 274{
248 rb_erase(&ref->rb_node, &delayed_refs->root); 275 if (btrfs_delayed_ref_is_head(ref)) {
276 head = btrfs_delayed_node_to_head(ref);
277 rb_erase(&head->href_node, &delayed_refs->href_root);
278 } else {
279 assert_spin_locked(&head->lock);
280 rb_erase(&ref->rb_node, &head->ref_root);
281 }
249 ref->in_tree = 0; 282 ref->in_tree = 0;
250 btrfs_put_delayed_ref(ref); 283 btrfs_put_delayed_ref(ref);
251 delayed_refs->num_entries--; 284 atomic_dec(&delayed_refs->num_entries);
252 if (trans->delayed_ref_updates) 285 if (trans->delayed_ref_updates)
253 trans->delayed_ref_updates--; 286 trans->delayed_ref_updates--;
254} 287}
255 288
256static int merge_ref(struct btrfs_trans_handle *trans, 289static int merge_ref(struct btrfs_trans_handle *trans,
257 struct btrfs_delayed_ref_root *delayed_refs, 290 struct btrfs_delayed_ref_root *delayed_refs,
291 struct btrfs_delayed_ref_head *head,
258 struct btrfs_delayed_ref_node *ref, u64 seq) 292 struct btrfs_delayed_ref_node *ref, u64 seq)
259{ 293{
260 struct rb_node *node; 294 struct rb_node *node;
261 int merged = 0;
262 int mod = 0; 295 int mod = 0;
263 int done = 0; 296 int done = 0;
264 297
265 node = rb_prev(&ref->rb_node); 298 node = rb_next(&ref->rb_node);
266 while (node) { 299 while (!done && node) {
267 struct btrfs_delayed_ref_node *next; 300 struct btrfs_delayed_ref_node *next;
268 301
269 next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 302 next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
270 node = rb_prev(node); 303 node = rb_next(node);
271 if (next->bytenr != ref->bytenr)
272 break;
273 if (seq && next->seq >= seq) 304 if (seq && next->seq >= seq)
274 break; 305 break;
275 if (comp_entry(ref, next, 0)) 306 if (comp_entry(ref, next, 0))
@@ -289,12 +320,11 @@ static int merge_ref(struct btrfs_trans_handle *trans,
289 mod = -next->ref_mod; 320 mod = -next->ref_mod;
290 } 321 }
291 322
292 merged++; 323 drop_delayed_ref(trans, delayed_refs, head, next);
293 drop_delayed_ref(trans, delayed_refs, next);
294 ref->ref_mod += mod; 324 ref->ref_mod += mod;
295 if (ref->ref_mod == 0) { 325 if (ref->ref_mod == 0) {
296 drop_delayed_ref(trans, delayed_refs, ref); 326 drop_delayed_ref(trans, delayed_refs, head, ref);
297 break; 327 done = 1;
298 } else { 328 } else {
299 /* 329 /*
300 * You can't have multiples of the same ref on a tree 330 * You can't have multiples of the same ref on a tree
@@ -303,13 +333,8 @@ static int merge_ref(struct btrfs_trans_handle *trans,
303 WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || 333 WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
304 ref->type == BTRFS_SHARED_BLOCK_REF_KEY); 334 ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
305 } 335 }
306
307 if (done)
308 break;
309 node = rb_prev(&ref->rb_node);
310 } 336 }
311 337 return done;
312 return merged;
313} 338}
314 339
315void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, 340void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
@@ -320,6 +345,14 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
320 struct rb_node *node; 345 struct rb_node *node;
321 u64 seq = 0; 346 u64 seq = 0;
322 347
348 assert_spin_locked(&head->lock);
 349	/*
 350	 * We don't have too many refs to merge in the case of delayed
 351	 * data refs.
 352	 */
353 if (head->is_data)
354 return;
355
323 spin_lock(&fs_info->tree_mod_seq_lock); 356 spin_lock(&fs_info->tree_mod_seq_lock);
324 if (!list_empty(&fs_info->tree_mod_seq_list)) { 357 if (!list_empty(&fs_info->tree_mod_seq_list)) {
325 struct seq_list *elem; 358 struct seq_list *elem;
@@ -330,22 +363,19 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
330 } 363 }
331 spin_unlock(&fs_info->tree_mod_seq_lock); 364 spin_unlock(&fs_info->tree_mod_seq_lock);
332 365
333 node = rb_prev(&head->node.rb_node); 366 node = rb_first(&head->ref_root);
334 while (node) { 367 while (node) {
335 struct btrfs_delayed_ref_node *ref; 368 struct btrfs_delayed_ref_node *ref;
336 369
337 ref = rb_entry(node, struct btrfs_delayed_ref_node, 370 ref = rb_entry(node, struct btrfs_delayed_ref_node,
338 rb_node); 371 rb_node);
339 if (ref->bytenr != head->node.bytenr)
340 break;
341
342 /* We can't merge refs that are outside of our seq count */ 372 /* We can't merge refs that are outside of our seq count */
343 if (seq && ref->seq >= seq) 373 if (seq && ref->seq >= seq)
344 break; 374 break;
345 if (merge_ref(trans, delayed_refs, ref, seq)) 375 if (merge_ref(trans, delayed_refs, head, ref, seq))
346 node = rb_prev(&head->node.rb_node); 376 node = rb_first(&head->ref_root);
347 else 377 else
348 node = rb_prev(node); 378 node = rb_next(&ref->rb_node);
349 } 379 }
350} 380}
351 381
@@ -373,71 +403,52 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
373 return ret; 403 return ret;
374} 404}
375 405
376int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 406struct btrfs_delayed_ref_head *
377 struct list_head *cluster, u64 start) 407btrfs_select_ref_head(struct btrfs_trans_handle *trans)
378{ 408{
379 int count = 0;
380 struct btrfs_delayed_ref_root *delayed_refs; 409 struct btrfs_delayed_ref_root *delayed_refs;
381 struct rb_node *node;
382 struct btrfs_delayed_ref_node *ref;
383 struct btrfs_delayed_ref_head *head; 410 struct btrfs_delayed_ref_head *head;
411 u64 start;
412 bool loop = false;
384 413
385 delayed_refs = &trans->transaction->delayed_refs; 414 delayed_refs = &trans->transaction->delayed_refs;
386 if (start == 0) { 415
387 node = rb_first(&delayed_refs->root);
388 } else {
389 ref = NULL;
390 find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
391 if (ref) {
392 node = &ref->rb_node;
393 } else
394 node = rb_first(&delayed_refs->root);
395 }
396again: 416again:
397 while (node && count < 32) { 417 start = delayed_refs->run_delayed_start;
398 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 418 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
399 if (btrfs_delayed_ref_is_head(ref)) { 419 if (!head && !loop) {
400 head = btrfs_delayed_node_to_head(ref); 420 delayed_refs->run_delayed_start = 0;
401 if (list_empty(&head->cluster)) {
402 list_add_tail(&head->cluster, cluster);
403 delayed_refs->run_delayed_start =
404 head->node.bytenr;
405 count++;
406
407 WARN_ON(delayed_refs->num_heads_ready == 0);
408 delayed_refs->num_heads_ready--;
409 } else if (count) {
410 /* the goal of the clustering is to find extents
411 * that are likely to end up in the same extent
412 * leaf on disk. So, we don't want them spread
413 * all over the tree. Stop now if we've hit
414 * a head that was already in use
415 */
416 break;
417 }
418 }
419 node = rb_next(node);
420 }
421 if (count) {
422 return 0;
423 } else if (start) {
424 /*
425 * we've gone to the end of the rbtree without finding any
426 * clusters. start from the beginning and try again
427 */
428 start = 0; 421 start = 0;
429 node = rb_first(&delayed_refs->root); 422 loop = true;
430 goto again; 423 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
424 if (!head)
425 return NULL;
426 } else if (!head && loop) {
427 return NULL;
431 } 428 }
432 return 1;
433}
434 429
435void btrfs_release_ref_cluster(struct list_head *cluster) 430 while (head->processing) {
436{ 431 struct rb_node *node;
437 struct list_head *pos, *q; 432
433 node = rb_next(&head->href_node);
434 if (!node) {
435 if (loop)
436 return NULL;
437 delayed_refs->run_delayed_start = 0;
438 start = 0;
439 loop = true;
440 goto again;
441 }
442 head = rb_entry(node, struct btrfs_delayed_ref_head,
443 href_node);
444 }
438 445
439 list_for_each_safe(pos, q, cluster) 446 head->processing = 1;
440 list_del_init(pos); 447 WARN_ON(delayed_refs->num_heads_ready == 0);
448 delayed_refs->num_heads_ready--;
449 delayed_refs->run_delayed_start = head->node.bytenr +
450 head->node.num_bytes;
451 return head;
441} 452}
442 453
443/* 454/*
@@ -451,6 +462,7 @@ void btrfs_release_ref_cluster(struct list_head *cluster)
451static noinline void 462static noinline void
452update_existing_ref(struct btrfs_trans_handle *trans, 463update_existing_ref(struct btrfs_trans_handle *trans,
453 struct btrfs_delayed_ref_root *delayed_refs, 464 struct btrfs_delayed_ref_root *delayed_refs,
465 struct btrfs_delayed_ref_head *head,
454 struct btrfs_delayed_ref_node *existing, 466 struct btrfs_delayed_ref_node *existing,
455 struct btrfs_delayed_ref_node *update) 467 struct btrfs_delayed_ref_node *update)
456{ 468{
@@ -463,7 +475,7 @@ update_existing_ref(struct btrfs_trans_handle *trans,
463 */ 475 */
464 existing->ref_mod--; 476 existing->ref_mod--;
465 if (existing->ref_mod == 0) 477 if (existing->ref_mod == 0)
466 drop_delayed_ref(trans, delayed_refs, existing); 478 drop_delayed_ref(trans, delayed_refs, head, existing);
467 else 479 else
468 WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || 480 WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
469 existing->type == BTRFS_SHARED_BLOCK_REF_KEY); 481 existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
@@ -533,9 +545,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
533 } 545 }
534 } 546 }
535 /* 547 /*
536 * update the reference mod on the head to reflect this new operation 548 * update the reference mod on the head to reflect this new operation,
 549	 * we only need the lock in this case because we could be processing it
 550	 * currently; for refs we just added we know we're OK.
537 */ 551 */
552 spin_lock(&existing_ref->lock);
538 existing->ref_mod += update->ref_mod; 553 existing->ref_mod += update->ref_mod;
554 spin_unlock(&existing_ref->lock);
539} 555}
540 556
541/* 557/*
@@ -543,13 +559,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
543 * this does all the dirty work in terms of maintaining the correct 559 * this does all the dirty work in terms of maintaining the correct
544 * overall modification count. 560 * overall modification count.
545 */ 561 */
546static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, 562static noinline struct btrfs_delayed_ref_head *
547 struct btrfs_trans_handle *trans, 563add_delayed_ref_head(struct btrfs_fs_info *fs_info,
548 struct btrfs_delayed_ref_node *ref, 564 struct btrfs_trans_handle *trans,
549 u64 bytenr, u64 num_bytes, 565 struct btrfs_delayed_ref_node *ref, u64 bytenr,
550 int action, int is_data) 566 u64 num_bytes, int action, int is_data)
551{ 567{
552 struct btrfs_delayed_ref_node *existing; 568 struct btrfs_delayed_ref_head *existing;
553 struct btrfs_delayed_ref_head *head_ref = NULL; 569 struct btrfs_delayed_ref_head *head_ref = NULL;
554 struct btrfs_delayed_ref_root *delayed_refs; 570 struct btrfs_delayed_ref_root *delayed_refs;
555 int count_mod = 1; 571 int count_mod = 1;
@@ -596,38 +612,43 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
596 head_ref = btrfs_delayed_node_to_head(ref); 612 head_ref = btrfs_delayed_node_to_head(ref);
597 head_ref->must_insert_reserved = must_insert_reserved; 613 head_ref->must_insert_reserved = must_insert_reserved;
598 head_ref->is_data = is_data; 614 head_ref->is_data = is_data;
615 head_ref->ref_root = RB_ROOT;
616 head_ref->processing = 0;
599 617
600 INIT_LIST_HEAD(&head_ref->cluster); 618 spin_lock_init(&head_ref->lock);
601 mutex_init(&head_ref->mutex); 619 mutex_init(&head_ref->mutex);
602 620
603 trace_add_delayed_ref_head(ref, head_ref, action); 621 trace_add_delayed_ref_head(ref, head_ref, action);
604 622
605 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 623 existing = htree_insert(&delayed_refs->href_root,
606 624 &head_ref->href_node);
607 if (existing) { 625 if (existing) {
608 update_existing_head_ref(existing, ref); 626 update_existing_head_ref(&existing->node, ref);
609 /* 627 /*
610 * we've updated the existing ref, free the newly 628 * we've updated the existing ref, free the newly
611 * allocated ref 629 * allocated ref
612 */ 630 */
613 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); 631 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
632 head_ref = existing;
614 } else { 633 } else {
615 delayed_refs->num_heads++; 634 delayed_refs->num_heads++;
616 delayed_refs->num_heads_ready++; 635 delayed_refs->num_heads_ready++;
617 delayed_refs->num_entries++; 636 atomic_inc(&delayed_refs->num_entries);
618 trans->delayed_ref_updates++; 637 trans->delayed_ref_updates++;
619 } 638 }
639 return head_ref;
620} 640}
621 641
622/* 642/*
623 * helper to insert a delayed tree ref into the rbtree. 643 * helper to insert a delayed tree ref into the rbtree.
624 */ 644 */
625static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, 645static noinline void
626 struct btrfs_trans_handle *trans, 646add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
627 struct btrfs_delayed_ref_node *ref, 647 struct btrfs_trans_handle *trans,
628 u64 bytenr, u64 num_bytes, u64 parent, 648 struct btrfs_delayed_ref_head *head_ref,
629 u64 ref_root, int level, int action, 649 struct btrfs_delayed_ref_node *ref, u64 bytenr,
630 int for_cow) 650 u64 num_bytes, u64 parent, u64 ref_root, int level,
651 int action, int for_cow)
631{ 652{
632 struct btrfs_delayed_ref_node *existing; 653 struct btrfs_delayed_ref_node *existing;
633 struct btrfs_delayed_tree_ref *full_ref; 654 struct btrfs_delayed_tree_ref *full_ref;
@@ -663,30 +684,33 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
663 684
664 trace_add_delayed_tree_ref(ref, full_ref, action); 685 trace_add_delayed_tree_ref(ref, full_ref, action);
665 686
666 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 687 spin_lock(&head_ref->lock);
667 688 existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
668 if (existing) { 689 if (existing) {
669 update_existing_ref(trans, delayed_refs, existing, ref); 690 update_existing_ref(trans, delayed_refs, head_ref, existing,
691 ref);
670 /* 692 /*
671 * we've updated the existing ref, free the newly 693 * we've updated the existing ref, free the newly
672 * allocated ref 694 * allocated ref
673 */ 695 */
674 kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); 696 kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
675 } else { 697 } else {
676 delayed_refs->num_entries++; 698 atomic_inc(&delayed_refs->num_entries);
677 trans->delayed_ref_updates++; 699 trans->delayed_ref_updates++;
678 } 700 }
701 spin_unlock(&head_ref->lock);
679} 702}
680 703
681/* 704/*
682 * helper to insert a delayed data ref into the rbtree. 705 * helper to insert a delayed data ref into the rbtree.
683 */ 706 */
684static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, 707static noinline void
685 struct btrfs_trans_handle *trans, 708add_delayed_data_ref(struct btrfs_fs_info *fs_info,
686 struct btrfs_delayed_ref_node *ref, 709 struct btrfs_trans_handle *trans,
687 u64 bytenr, u64 num_bytes, u64 parent, 710 struct btrfs_delayed_ref_head *head_ref,
688 u64 ref_root, u64 owner, u64 offset, 711 struct btrfs_delayed_ref_node *ref, u64 bytenr,
689 int action, int for_cow) 712 u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
713 u64 offset, int action, int for_cow)
690{ 714{
691 struct btrfs_delayed_ref_node *existing; 715 struct btrfs_delayed_ref_node *existing;
692 struct btrfs_delayed_data_ref *full_ref; 716 struct btrfs_delayed_data_ref *full_ref;
@@ -724,19 +748,21 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
724 748
725 trace_add_delayed_data_ref(ref, full_ref, action); 749 trace_add_delayed_data_ref(ref, full_ref, action);
726 750
727 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 751 spin_lock(&head_ref->lock);
728 752 existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
729 if (existing) { 753 if (existing) {
730 update_existing_ref(trans, delayed_refs, existing, ref); 754 update_existing_ref(trans, delayed_refs, head_ref, existing,
755 ref);
731 /* 756 /*
732 * we've updated the existing ref, free the newly 757 * we've updated the existing ref, free the newly
733 * allocated ref 758 * allocated ref
734 */ 759 */
735 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); 760 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
736 } else { 761 } else {
737 delayed_refs->num_entries++; 762 atomic_inc(&delayed_refs->num_entries);
738 trans->delayed_ref_updates++; 763 trans->delayed_ref_updates++;
739 } 764 }
765 spin_unlock(&head_ref->lock);
740} 766}
741 767
742/* 768/*
@@ -775,10 +801,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
775 * insert both the head node and the new ref without dropping 801 * insert both the head node and the new ref without dropping
776 * the spin lock 802 * the spin lock
777 */ 803 */
778 add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, 804 head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
779 num_bytes, action, 0); 805 bytenr, num_bytes, action, 0);
780 806
781 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, 807 add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
782 num_bytes, parent, ref_root, level, action, 808 num_bytes, parent, ref_root, level, action,
783 for_cow); 809 for_cow);
784 spin_unlock(&delayed_refs->lock); 810 spin_unlock(&delayed_refs->lock);
@@ -823,10 +849,10 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
823 * insert both the head node and the new ref without dropping 849 * insert both the head node and the new ref without dropping
824 * the spin lock 850 * the spin lock
825 */ 851 */
826 add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, 852 head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
827 num_bytes, action, 1); 853 bytenr, num_bytes, action, 1);
828 854
829 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, 855 add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
830 num_bytes, parent, ref_root, owner, offset, 856 num_bytes, parent, ref_root, owner, offset,
831 action, for_cow); 857 action, for_cow);
832 spin_unlock(&delayed_refs->lock); 858 spin_unlock(&delayed_refs->lock);
@@ -869,14 +895,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
869struct btrfs_delayed_ref_head * 895struct btrfs_delayed_ref_head *
870btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) 896btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
871{ 897{
872 struct btrfs_delayed_ref_node *ref;
873 struct btrfs_delayed_ref_root *delayed_refs; 898 struct btrfs_delayed_ref_root *delayed_refs;
874 899
875 delayed_refs = &trans->transaction->delayed_refs; 900 delayed_refs = &trans->transaction->delayed_refs;
876 ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0); 901 return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0);
877 if (ref)
878 return btrfs_delayed_node_to_head(ref);
879 return NULL;
880} 902}
881 903
882void btrfs_delayed_ref_exit(void) 904void btrfs_delayed_ref_exit(void)
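The delayed-ref rework above splits the single flat rbtree into two levels: href_root holds one head per bytenr, and each head carries its own ref_root of individual refs, protected by the new per-head spinlock. A stripped-down sketch of the head-insert half, with demo_* names standing in for the btrfs types:

	#include <linux/types.h>
	#include <linux/rbtree.h>

	struct demo_head {
		u64 bytenr;
		struct rb_node href_node;
		struct rb_root ref_root;	/* per-head tree of refs */
	};

	/* Insert a head keyed by bytenr; on collision, return the existing
	 * head so the caller can merge into it, as htree_insert() does. */
	static struct demo_head *demo_insert_head(struct rb_root *root,
						  struct demo_head *ins)
	{
		struct rb_node **p = &root->rb_node;
		struct rb_node *parent = NULL;
		struct demo_head *entry;

		while (*p) {
			parent = *p;
			entry = rb_entry(parent, struct demo_head, href_node);
			if (ins->bytenr < entry->bytenr)
				p = &(*p)->rb_left;
			else if (ins->bytenr > entry->bytenr)
				p = &(*p)->rb_right;
			else
				return entry;
		}
		rb_link_node(&ins->href_node, parent, p);
		rb_insert_color(&ins->href_node, root);
		return NULL;
	}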
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 70b962cc177d..4ba9b93022ff 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -81,7 +81,10 @@ struct btrfs_delayed_ref_head {
81 */ 81 */
82 struct mutex mutex; 82 struct mutex mutex;
83 83
84 struct list_head cluster; 84 spinlock_t lock;
85 struct rb_root ref_root;
86
87 struct rb_node href_node;
85 88
86 struct btrfs_delayed_extent_op *extent_op; 89 struct btrfs_delayed_extent_op *extent_op;
87 /* 90 /*
@@ -98,6 +101,7 @@ struct btrfs_delayed_ref_head {
98 */ 101 */
99 unsigned int must_insert_reserved:1; 102 unsigned int must_insert_reserved:1;
100 unsigned int is_data:1; 103 unsigned int is_data:1;
104 unsigned int processing:1;
101}; 105};
102 106
103struct btrfs_delayed_tree_ref { 107struct btrfs_delayed_tree_ref {
@@ -116,7 +120,8 @@ struct btrfs_delayed_data_ref {
116}; 120};
117 121
118struct btrfs_delayed_ref_root { 122struct btrfs_delayed_ref_root {
119 struct rb_root root; 123 /* head ref rbtree */
124 struct rb_root href_root;
120 125
121 /* this spin lock protects the rbtree and the entries inside */ 126 /* this spin lock protects the rbtree and the entries inside */
122 spinlock_t lock; 127 spinlock_t lock;
@@ -124,7 +129,7 @@ struct btrfs_delayed_ref_root {
124 /* how many delayed ref updates we've queued, used by the 129 /* how many delayed ref updates we've queued, used by the
125 * throttling code 130 * throttling code
126 */ 131 */
127 unsigned long num_entries; 132 atomic_t num_entries;
128 133
129 /* total number of head nodes in tree */ 134 /* total number of head nodes in tree */
130 unsigned long num_heads; 135 unsigned long num_heads;
@@ -133,15 +138,6 @@ struct btrfs_delayed_ref_root {
133 unsigned long num_heads_ready; 138 unsigned long num_heads_ready;
134 139
135 /* 140 /*
136 * bumped when someone is making progress on the delayed
137 * refs, so that other procs know they are just adding to
 138	 * contention instead of helping
139 */
140 atomic_t procs_running_refs;
141 atomic_t ref_seq;
142 wait_queue_head_t wait;
143
144 /*
145 * set when the tree is flushing before a transaction commit, 141 * set when the tree is flushing before a transaction commit,
146 * used by the throttling code to decide if new updates need 142 * used by the throttling code to decide if new updates need
147 * to be run right away 143 * to be run right away
@@ -226,9 +222,9 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
226 mutex_unlock(&head->mutex); 222 mutex_unlock(&head->mutex);
227} 223}
228 224
229int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 225
230 struct list_head *cluster, u64 search_start); 226struct btrfs_delayed_ref_head *
231void btrfs_release_ref_cluster(struct list_head *cluster); 227btrfs_select_ref_head(struct btrfs_trans_handle *trans);
232 228
233int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, 229int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
234 struct btrfs_delayed_ref_root *delayed_refs, 230 struct btrfs_delayed_ref_root *delayed_refs,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2cfc3dfff64f..564c92638b20 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -102,7 +102,8 @@ no_valid_dev_replace_entry_found:
102 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); 102 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
103 103
104 if (item_size != sizeof(struct btrfs_dev_replace_item)) { 104 if (item_size != sizeof(struct btrfs_dev_replace_item)) {
105 pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n"); 105 btrfs_warn(fs_info,
106 "dev_replace entry found has unexpected size, ignore entry");
106 goto no_valid_dev_replace_entry_found; 107 goto no_valid_dev_replace_entry_found;
107 } 108 }
108 109
@@ -145,13 +146,19 @@ no_valid_dev_replace_entry_found:
145 if (!dev_replace->srcdev && 146 if (!dev_replace->srcdev &&
146 !btrfs_test_opt(dev_root, DEGRADED)) { 147 !btrfs_test_opt(dev_root, DEGRADED)) {
147 ret = -EIO; 148 ret = -EIO;
148 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", 149 btrfs_warn(fs_info,
149 src_devid); 150 "cannot mount because device replace operation is ongoing and");
151 btrfs_warn(fs_info,
152 "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
153 src_devid);
150 } 154 }
151 if (!dev_replace->tgtdev && 155 if (!dev_replace->tgtdev &&
152 !btrfs_test_opt(dev_root, DEGRADED)) { 156 !btrfs_test_opt(dev_root, DEGRADED)) {
153 ret = -EIO; 157 ret = -EIO;
154 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", 158 btrfs_warn(fs_info,
159 "cannot mount because device replace operation is ongoing and");
160 btrfs_warn(fs_info,
161 "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
155 BTRFS_DEV_REPLACE_DEVID); 162 BTRFS_DEV_REPLACE_DEVID);
156 } 163 }
157 if (dev_replace->tgtdev) { 164 if (dev_replace->tgtdev) {
@@ -210,7 +217,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
210 } 217 }
211 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 218 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
212 if (ret < 0) { 219 if (ret < 0) {
213 pr_warn("btrfs: error %d while searching for dev_replace item!\n", 220 btrfs_warn(fs_info, "error %d while searching for dev_replace item!",
214 ret); 221 ret);
215 goto out; 222 goto out;
216 } 223 }
@@ -230,7 +237,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
230 */ 237 */
231 ret = btrfs_del_item(trans, dev_root, path); 238 ret = btrfs_del_item(trans, dev_root, path);
232 if (ret != 0) { 239 if (ret != 0) {
233 pr_warn("btrfs: delete too small dev_replace item failed %d!\n", 240 btrfs_warn(fs_info, "delete too small dev_replace item failed %d!",
234 ret); 241 ret);
235 goto out; 242 goto out;
236 } 243 }
@@ -243,7 +250,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
243 ret = btrfs_insert_empty_item(trans, dev_root, path, 250 ret = btrfs_insert_empty_item(trans, dev_root, path,
244 &key, sizeof(*ptr)); 251 &key, sizeof(*ptr));
245 if (ret < 0) { 252 if (ret < 0) {
246 pr_warn("btrfs: insert dev_replace item failed %d!\n", 253 btrfs_warn(fs_info, "insert dev_replace item failed %d!",
247 ret); 254 ret);
248 goto out; 255 goto out;
249 } 256 }
@@ -305,7 +312,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
305 struct btrfs_device *src_device = NULL; 312 struct btrfs_device *src_device = NULL;
306 313
307 if (btrfs_fs_incompat(fs_info, RAID56)) { 314 if (btrfs_fs_incompat(fs_info, RAID56)) {
308 pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n"); 315 btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
309 return -EINVAL; 316 return -EINVAL;
310 } 317 }
311 318
@@ -325,7 +332,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
325 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, 332 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
326 &tgt_device); 333 &tgt_device);
327 if (ret) { 334 if (ret) {
328 pr_err("btrfs: target device %s is invalid!\n", 335 btrfs_err(fs_info, "target device %s is invalid!",
329 args->start.tgtdev_name); 336 args->start.tgtdev_name);
330 mutex_unlock(&fs_info->volume_mutex); 337 mutex_unlock(&fs_info->volume_mutex);
331 return -EINVAL; 338 return -EINVAL;
@@ -341,7 +348,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
341 } 348 }
342 349
343 if (tgt_device->total_bytes < src_device->total_bytes) { 350 if (tgt_device->total_bytes < src_device->total_bytes) {
344 pr_err("btrfs: target device is smaller than source device!\n"); 351 btrfs_err(fs_info, "target device is smaller than source device!");
345 ret = -EINVAL; 352 ret = -EINVAL;
346 goto leave_no_lock; 353 goto leave_no_lock;
347 } 354 }
@@ -366,7 +373,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
366 dev_replace->tgtdev = tgt_device; 373 dev_replace->tgtdev = tgt_device;
367 374
368 printk_in_rcu(KERN_INFO 375 printk_in_rcu(KERN_INFO
369 "btrfs: dev_replace from %s (devid %llu) to %s started\n", 376 "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
370 src_device->missing ? "<missing disk>" : 377 src_device->missing ? "<missing disk>" :
371 rcu_str_deref(src_device->name), 378 rcu_str_deref(src_device->name),
372 src_device->devid, 379 src_device->devid,
@@ -489,7 +496,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
489 496
490 if (scrub_ret) { 497 if (scrub_ret) {
491 printk_in_rcu(KERN_ERR 498 printk_in_rcu(KERN_ERR
492 "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", 499 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
493 src_device->missing ? "<missing disk>" : 500 src_device->missing ? "<missing disk>" :
494 rcu_str_deref(src_device->name), 501 rcu_str_deref(src_device->name),
495 src_device->devid, 502 src_device->devid,
@@ -504,7 +511,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
504 } 511 }
505 512
506 printk_in_rcu(KERN_INFO 513 printk_in_rcu(KERN_INFO
507 "btrfs: dev_replace from %s (devid %llu) to %s) finished\n", 514 "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n",
508 src_device->missing ? "<missing disk>" : 515 src_device->missing ? "<missing disk>" :
509 rcu_str_deref(src_device->name), 516 rcu_str_deref(src_device->name),
510 src_device->devid, 517 src_device->devid,
@@ -699,7 +706,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
699 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 706 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
700 dev_replace->time_stopped = get_seconds(); 707 dev_replace->time_stopped = get_seconds();
701 dev_replace->item_needs_writeback = 1; 708 dev_replace->item_needs_writeback = 1;
702 pr_info("btrfs: suspending dev_replace for unmount\n"); 709 btrfs_info(fs_info, "suspending dev_replace for unmount");
703 break; 710 break;
704 } 711 }
705 712
@@ -728,8 +735,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
728 break; 735 break;
729 } 736 }
730 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { 737 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
731 pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n" 738 btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
732 "btrfs: you may cancel the operation after 'mount -o degraded'\n"); 739 btrfs_info(fs_info,
740 "you may cancel the operation after 'mount -o degraded'");
733 btrfs_dev_replace_unlock(dev_replace); 741 btrfs_dev_replace_unlock(dev_replace);
734 return 0; 742 return 0;
735 } 743 }
@@ -755,14 +763,14 @@ static int btrfs_dev_replace_kthread(void *data)
755 kfree(status_args); 763 kfree(status_args);
756 do_div(progress, 10); 764 do_div(progress, 10);
757 printk_in_rcu(KERN_INFO 765 printk_in_rcu(KERN_INFO
758 "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", 766 "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
759 dev_replace->srcdev->missing ? "<missing disk>" : 767 dev_replace->srcdev->missing ? "<missing disk>" :
760 rcu_str_deref(dev_replace->srcdev->name), 768 rcu_str_deref(dev_replace->srcdev->name),
761 dev_replace->srcdev->devid, 769 dev_replace->srcdev->devid,
762 dev_replace->tgtdev ? 770 dev_replace->tgtdev ?
763 rcu_str_deref(dev_replace->tgtdev->name) : 771 rcu_str_deref(dev_replace->tgtdev->name) :
764 "<missing target disk>", 772 "<missing target disk>",
765 (unsigned int)progress); 773 (unsigned int)progress);
766 } 774 }
767 btrfs_dev_replace_continue_on_mount(fs_info); 775 btrfs_dev_replace_continue_on_mount(fs_info);
768 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 776 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
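The dev-replace hunks convert bare pr_warn()/pr_info() calls to fs_info-aware helpers so every message identifies the filesystem it came from. A sketch of the idea under an assumed name; the real btrfs_warn() and friends are declared in ctree.h and route through a shared btrfs_printk() helper:

	#define demo_warn(fs_info, fmt, args...)			\
		printk(KERN_WARNING "BTRFS (device %s): " fmt "\n",	\
		       (fs_info)->sb->s_id, ##args)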
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c031ea3fd70f..a0691df5dcea 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -261,7 +261,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
261 * see if there is room in the item to insert this 261 * see if there is room in the item to insert this
262 * name 262 * name
263 */ 263 */
264 data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item); 264 data_size = sizeof(*di) + name_len;
265 leaf = path->nodes[0]; 265 leaf = path->nodes[0];
266 slot = path->slots[0]; 266 slot = path->slots[0];
267 if (data_size + btrfs_item_size_nr(leaf, slot) + 267 if (data_size + btrfs_item_size_nr(leaf, slot) +
@@ -459,7 +459,7 @@ int verify_dir_item(struct btrfs_root *root,
459 u8 type = btrfs_dir_type(leaf, dir_item); 459 u8 type = btrfs_dir_type(leaf, dir_item);
460 460
461 if (type >= BTRFS_FT_MAX) { 461 if (type >= BTRFS_FT_MAX) {
462 printk(KERN_CRIT "btrfs: invalid dir item type: %d\n", 462 btrfs_crit(root->fs_info, "invalid dir item type: %d",
463 (int)type); 463 (int)type);
464 return 1; 464 return 1;
465 } 465 }
@@ -468,7 +468,7 @@ int verify_dir_item(struct btrfs_root *root,
468 namelen = XATTR_NAME_MAX; 468 namelen = XATTR_NAME_MAX;
469 469
470 if (btrfs_dir_name_len(leaf, dir_item) > namelen) { 470 if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
471 printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n", 471 btrfs_crit(root->fs_info, "invalid dir item name len: %u",
472 (unsigned)btrfs_dir_data_len(leaf, dir_item)); 472 (unsigned)btrfs_dir_data_len(leaf, dir_item));
473 return 1; 473 return 1;
474 } 474 }
@@ -476,7 +476,7 @@ int verify_dir_item(struct btrfs_root *root,
476 /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ 476 /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
477 if ((btrfs_dir_data_len(leaf, dir_item) + 477 if ((btrfs_dir_data_len(leaf, dir_item) +
478 btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) { 478 btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) {
479 printk(KERN_CRIT "btrfs: invalid dir item name + data len: %u + %u\n", 479 btrfs_crit(root->fs_info, "invalid dir item name + data len: %u + %u",
480 (unsigned)btrfs_dir_name_len(leaf, dir_item), 480 (unsigned)btrfs_dir_name_len(leaf, dir_item),
481 (unsigned)btrfs_dir_data_len(leaf, dir_item)); 481 (unsigned)btrfs_dir_data_len(leaf, dir_item));
482 return 1; 482 return 1;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8072cfa8a3b1..81ea55314b1f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,7 +26,6 @@
26#include <linux/workqueue.h> 26#include <linux/workqueue.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h>
30#include <linux/slab.h> 29#include <linux/slab.h>
31#include <linux/migrate.h> 30#include <linux/migrate.h>
32#include <linux/ratelimit.h> 31#include <linux/ratelimit.h>
@@ -35,6 +34,7 @@
35#include <asm/unaligned.h> 34#include <asm/unaligned.h>
36#include "ctree.h" 35#include "ctree.h"
37#include "disk-io.h" 36#include "disk-io.h"
37#include "hash.h"
38#include "transaction.h" 38#include "transaction.h"
39#include "btrfs_inode.h" 39#include "btrfs_inode.h"
40#include "volumes.h" 40#include "volumes.h"
@@ -48,6 +48,7 @@
48#include "rcu-string.h" 48#include "rcu-string.h"
49#include "dev-replace.h" 49#include "dev-replace.h"
50#include "raid56.h" 50#include "raid56.h"
51#include "sysfs.h"
51 52
52#ifdef CONFIG_X86 53#ifdef CONFIG_X86
53#include <asm/cpufeature.h> 54#include <asm/cpufeature.h>
@@ -243,7 +244,7 @@ out:
243 244
244u32 btrfs_csum_data(char *data, u32 seed, size_t len) 245u32 btrfs_csum_data(char *data, u32 seed, size_t len)
245{ 246{
246 return crc32c(seed, data, len); 247 return btrfs_crc32c(seed, data, len);
247} 248}
248 249
249void btrfs_csum_final(u32 crc, char *result) 250void btrfs_csum_final(u32 crc, char *result)
@@ -299,11 +300,11 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
299 memcpy(&found, result, csum_size); 300 memcpy(&found, result, csum_size);
300 301
301 read_extent_buffer(buf, &val, 0, csum_size); 302 read_extent_buffer(buf, &val, 0, csum_size);
302 printk_ratelimited(KERN_INFO "btrfs: %s checksum verify " 303 printk_ratelimited(KERN_INFO
303 "failed on %llu wanted %X found %X " 304 "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
304 "level %d\n", 305 "level %d\n",
305 root->fs_info->sb->s_id, buf->start, 306 root->fs_info->sb->s_id, buf->start,
306 val, found, btrfs_header_level(buf)); 307 val, found, btrfs_header_level(buf));
307 if (result != (char *)&inline_result) 308 if (result != (char *)&inline_result)
308 kfree(result); 309 kfree(result);
309 return 1; 310 return 1;
@@ -382,13 +383,14 @@ static int btrfs_check_super_csum(char *raw_disk_sb)
382 ret = 1; 383 ret = 1;
383 384
384 if (ret && btrfs_super_generation(disk_sb) < 10) { 385 if (ret && btrfs_super_generation(disk_sb) < 10) {
385 printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n"); 386 printk(KERN_WARNING
387 "BTRFS: super block crcs don't match, older mkfs detected\n");
386 ret = 0; 388 ret = 0;
387 } 389 }
388 } 390 }
389 391
390 if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { 392 if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
391 printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n", 393 printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n",
392 csum_type); 394 csum_type);
393 ret = 1; 395 ret = 1;
394 } 396 }
@@ -464,13 +466,10 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
464 466
465static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) 467static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
466{ 468{
467 struct extent_io_tree *tree;
468 u64 start = page_offset(page); 469 u64 start = page_offset(page);
469 u64 found_start; 470 u64 found_start;
470 struct extent_buffer *eb; 471 struct extent_buffer *eb;
471 472
472 tree = &BTRFS_I(page->mapping->host)->io_tree;
473
474 eb = (struct extent_buffer *)page->private; 473 eb = (struct extent_buffer *)page->private;
475 if (page != eb->pages[0]) 474 if (page != eb->pages[0])
476 return 0; 475 return 0;
@@ -500,8 +499,8 @@ static int check_tree_block_fsid(struct btrfs_root *root,
500} 499}
501 500
502#define CORRUPT(reason, eb, root, slot) \ 501#define CORRUPT(reason, eb, root, slot) \
503 printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ 502 btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \
504 "root=%llu, slot=%d\n", reason, \ 503 "root=%llu, slot=%d", reason, \
505 btrfs_header_bytenr(eb), root->objectid, slot) 504 btrfs_header_bytenr(eb), root->objectid, slot)
506 505
507static noinline int check_leaf(struct btrfs_root *root, 506static noinline int check_leaf(struct btrfs_root *root,
@@ -569,7 +568,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
569 u64 phy_offset, struct page *page, 568 u64 phy_offset, struct page *page,
570 u64 start, u64 end, int mirror) 569 u64 start, u64 end, int mirror)
571{ 570{
572 struct extent_io_tree *tree;
573 u64 found_start; 571 u64 found_start;
574 int found_level; 572 int found_level;
575 struct extent_buffer *eb; 573 struct extent_buffer *eb;
@@ -580,7 +578,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
580 if (!page->private) 578 if (!page->private)
581 goto out; 579 goto out;
582 580
583 tree = &BTRFS_I(page->mapping->host)->io_tree;
584 eb = (struct extent_buffer *)page->private; 581 eb = (struct extent_buffer *)page->private;
585 582
586 /* the pending IO might have been the only thing that kept this buffer 583 /* the pending IO might have been the only thing that kept this buffer
@@ -600,21 +597,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
600 597
601 found_start = btrfs_header_bytenr(eb); 598 found_start = btrfs_header_bytenr(eb);
602 if (found_start != eb->start) { 599 if (found_start != eb->start) {
603 printk_ratelimited(KERN_INFO "btrfs bad tree block start " 600 printk_ratelimited(KERN_INFO "BTRFS: bad tree block start "
604 "%llu %llu\n", 601 "%llu %llu\n",
605 found_start, eb->start); 602 found_start, eb->start);
606 ret = -EIO; 603 ret = -EIO;
607 goto err; 604 goto err;
608 } 605 }
609 if (check_tree_block_fsid(root, eb)) { 606 if (check_tree_block_fsid(root, eb)) {
610 printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", 607 printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n",
611 eb->start); 608 eb->start);
612 ret = -EIO; 609 ret = -EIO;
613 goto err; 610 goto err;
614 } 611 }
615 found_level = btrfs_header_level(eb); 612 found_level = btrfs_header_level(eb);
616 if (found_level >= BTRFS_MAX_LEVEL) { 613 if (found_level >= BTRFS_MAX_LEVEL) {
617 btrfs_info(root->fs_info, "bad tree block level %d\n", 614 btrfs_info(root->fs_info, "bad tree block level %d",
618 (int)btrfs_header_level(eb)); 615 (int)btrfs_header_level(eb));
619 ret = -EIO; 616 ret = -EIO;
620 goto err; 617 goto err;
@@ -842,20 +839,17 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
842 839
843static int btree_csum_one_bio(struct bio *bio) 840static int btree_csum_one_bio(struct bio *bio)
844{ 841{
845 struct bio_vec *bvec = bio->bi_io_vec; 842 struct bio_vec *bvec;
846 int bio_index = 0;
847 struct btrfs_root *root; 843 struct btrfs_root *root;
848 int ret = 0; 844 int i, ret = 0;
849 845
850 WARN_ON(bio->bi_vcnt <= 0); 846 bio_for_each_segment_all(bvec, bio, i) {
851 while (bio_index < bio->bi_vcnt) {
852 root = BTRFS_I(bvec->bv_page->mapping->host)->root; 847 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
853 ret = csum_dirty_buffer(root, bvec->bv_page); 848 ret = csum_dirty_buffer(root, bvec->bv_page);
854 if (ret) 849 if (ret)
855 break; 850 break;
856 bio_index++;
857 bvec++;
858 } 851 }
852
859 return ret; 853 return ret;
860} 854}
861 855
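The hunk above replaces a hand-rolled walk over bi_io_vec/bi_vcnt with the bio_for_each_segment_all() iterator, which hides the bvec pointer arithmetic. A minimal sketch of the adopted idiom, with demo_for_each_page() as a hypothetical caller:

	#include <linux/bio.h>

	/* Apply fn to each page in the bio, stopping on the first error,
	 * mirroring the structure of btree_csum_one_bio() above. */
	static int demo_for_each_page(struct bio *bio,
				      int (*fn)(struct page *page))
	{
		struct bio_vec *bvec;
		int i, ret = 0;

		bio_for_each_segment_all(bvec, bio, i) {
			ret = fn(bvec->bv_page);
			if (ret)
				break;
		}
		return ret;
	}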
@@ -967,11 +961,9 @@ static int btree_migratepage(struct address_space *mapping,
967static int btree_writepages(struct address_space *mapping, 961static int btree_writepages(struct address_space *mapping,
968 struct writeback_control *wbc) 962 struct writeback_control *wbc)
969{ 963{
970 struct extent_io_tree *tree;
971 struct btrfs_fs_info *fs_info; 964 struct btrfs_fs_info *fs_info;
972 int ret; 965 int ret;
973 966
974 tree = &BTRFS_I(mapping->host)->io_tree;
975 if (wbc->sync_mode == WB_SYNC_NONE) { 967 if (wbc->sync_mode == WB_SYNC_NONE) {
976 968
977 if (wbc->for_kupdate) 969 if (wbc->for_kupdate)
@@ -1010,8 +1002,9 @@ static void btree_invalidatepage(struct page *page, unsigned int offset,
1010 extent_invalidatepage(tree, page, offset); 1002 extent_invalidatepage(tree, page, offset);
1011 btree_releasepage(page, GFP_NOFS); 1003 btree_releasepage(page, GFP_NOFS);
1012 if (PagePrivate(page)) { 1004 if (PagePrivate(page)) {
1013 printk(KERN_WARNING "btrfs warning page private not zero " 1005 btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
1014 "on page %llu\n", (unsigned long long)page_offset(page)); 1006 "page private not zero on page %llu",
1007 (unsigned long long)page_offset(page));
1015 ClearPagePrivate(page); 1008 ClearPagePrivate(page);
1016 set_page_private(page, 0); 1009 set_page_private(page, 0);
1017 page_cache_release(page); 1010 page_cache_release(page);
@@ -1095,21 +1088,13 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1095struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1088struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1096 u64 bytenr, u32 blocksize) 1089 u64 bytenr, u32 blocksize)
1097{ 1090{
1098 struct inode *btree_inode = root->fs_info->btree_inode; 1091 return find_extent_buffer(root->fs_info, bytenr);
1099 struct extent_buffer *eb;
1100 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, bytenr);
1101 return eb;
1102} 1092}
1103 1093
1104struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 1094struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1105 u64 bytenr, u32 blocksize) 1095 u64 bytenr, u32 blocksize)
1106{ 1096{
1107 struct inode *btree_inode = root->fs_info->btree_inode; 1097 return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
1108 struct extent_buffer *eb;
1109
1110 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
1111 bytenr, blocksize);
1112 return eb;
1113} 1098}
1114 1099
1115 1100
@@ -1273,7 +1258,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1273 struct btrfs_root *root; 1258 struct btrfs_root *root;
1274 struct btrfs_key key; 1259 struct btrfs_key key;
1275 int ret = 0; 1260 int ret = 0;
1276 u64 bytenr;
1277 uuid_le uuid; 1261 uuid_le uuid;
1278 1262
1279 root = btrfs_alloc_root(fs_info); 1263 root = btrfs_alloc_root(fs_info);
@@ -1295,7 +1279,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
1295 goto fail; 1279 goto fail;
1296 } 1280 }
1297 1281
1298 bytenr = leaf->start;
1299 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); 1282 memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
1300 btrfs_set_header_bytenr(leaf, leaf->start); 1283 btrfs_set_header_bytenr(leaf, leaf->start);
1301 btrfs_set_header_generation(leaf, trans->transid); 1284 btrfs_set_header_generation(leaf, trans->transid);
@@ -1616,7 +1599,8 @@ again:
1616 if (ret) 1599 if (ret)
1617 goto fail; 1600 goto fail;
1618 1601
1619 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); 1602 ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID,
1603 location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL);
1620 if (ret < 0) 1604 if (ret < 0)
1621 goto fail; 1605 goto fail;
1622 if (ret == 0) 1606 if (ret == 0)
@@ -1684,18 +1668,16 @@ static void end_workqueue_fn(struct btrfs_work *work)
1684{ 1668{
1685 struct bio *bio; 1669 struct bio *bio;
1686 struct end_io_wq *end_io_wq; 1670 struct end_io_wq *end_io_wq;
1687 struct btrfs_fs_info *fs_info;
1688 int error; 1671 int error;
1689 1672
1690 end_io_wq = container_of(work, struct end_io_wq, work); 1673 end_io_wq = container_of(work, struct end_io_wq, work);
1691 bio = end_io_wq->bio; 1674 bio = end_io_wq->bio;
1692 fs_info = end_io_wq->info;
1693 1675
1694 error = end_io_wq->error; 1676 error = end_io_wq->error;
1695 bio->bi_private = end_io_wq->private; 1677 bio->bi_private = end_io_wq->private;
1696 bio->bi_end_io = end_io_wq->end_io; 1678 bio->bi_end_io = end_io_wq->end_io;
1697 kfree(end_io_wq); 1679 kfree(end_io_wq);
1698 bio_endio(bio, error); 1680 bio_endio_nodec(bio, error);
1699} 1681}
1700 1682
1701static int cleaner_kthread(void *arg) 1683static int cleaner_kthread(void *arg)
@@ -2080,6 +2062,12 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
2080 for (i = 0; i < ret; i++) 2062 for (i = 0; i < ret; i++)
2081 btrfs_drop_and_free_fs_root(fs_info, gang[i]); 2063 btrfs_drop_and_free_fs_root(fs_info, gang[i]);
2082 } 2064 }
2065
2066 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
2067 btrfs_free_log_root_tree(NULL, fs_info);
2068 btrfs_destroy_pinned_extent(fs_info->tree_root,
2069 fs_info->pinned_extents);
2070 }
2083} 2071}
2084 2072
2085int open_ctree(struct super_block *sb, 2073int open_ctree(struct super_block *sb,
@@ -2154,6 +2142,7 @@ int open_ctree(struct super_block *sb,
2154 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2142 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
2155 2143
2156 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 2144 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
2145 INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
2157 INIT_LIST_HEAD(&fs_info->trans_list); 2146 INIT_LIST_HEAD(&fs_info->trans_list);
2158 INIT_LIST_HEAD(&fs_info->dead_roots); 2147 INIT_LIST_HEAD(&fs_info->dead_roots);
2159 INIT_LIST_HEAD(&fs_info->delayed_iputs); 2148 INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2167,6 +2156,7 @@ int open_ctree(struct super_block *sb,
2167 spin_lock_init(&fs_info->free_chunk_lock); 2156 spin_lock_init(&fs_info->free_chunk_lock);
2168 spin_lock_init(&fs_info->tree_mod_seq_lock); 2157 spin_lock_init(&fs_info->tree_mod_seq_lock);
2169 spin_lock_init(&fs_info->super_lock); 2158 spin_lock_init(&fs_info->super_lock);
2159 spin_lock_init(&fs_info->buffer_lock);
2170 rwlock_init(&fs_info->tree_mod_log_lock); 2160 rwlock_init(&fs_info->tree_mod_log_lock);
2171 mutex_init(&fs_info->reloc_mutex); 2161 mutex_init(&fs_info->reloc_mutex);
2172 seqlock_init(&fs_info->profiles_lock); 2162 seqlock_init(&fs_info->profiles_lock);
@@ -2198,7 +2188,7 @@ int open_ctree(struct super_block *sb,
2198 fs_info->free_chunk_space = 0; 2188 fs_info->free_chunk_space = 0;
2199 fs_info->tree_mod_log = RB_ROOT; 2189 fs_info->tree_mod_log = RB_ROOT;
2200 fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; 2190 fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2201 2191 fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
2202 /* readahead state */ 2192 /* readahead state */
2203 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); 2193 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
2204 spin_lock_init(&fs_info->reada_lock); 2194 spin_lock_init(&fs_info->reada_lock);
@@ -2337,7 +2327,7 @@ int open_ctree(struct super_block *sb,
 	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
 	 */
 	if (btrfs_check_super_csum(bh->b_data)) {
-		printk(KERN_ERR "btrfs: superblock checksum mismatch\n");
+		printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
 		err = -EINVAL;
 		goto fail_alloc;
 	}
@@ -2356,7 +2346,7 @@ int open_ctree(struct super_block *sb,
 
 	ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
 	if (ret) {
-		printk(KERN_ERR "btrfs: superblock contains fatal errors\n");
+		printk(KERN_ERR "BTRFS: superblock contains fatal errors\n");
 		err = -EINVAL;
 		goto fail_alloc;
 	}
@@ -2421,7 +2411,7 @@ int open_ctree(struct super_block *sb,
 		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
 
 	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
-		printk(KERN_ERR "btrfs: has skinny extents\n");
+		printk(KERN_ERR "BTRFS: has skinny extents\n");
 
 	/*
 	 * flag our filesystem as having big metadata blocks if
@@ -2429,7 +2419,7 @@ int open_ctree(struct super_block *sb,
 	 */
 	if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
 		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
-			printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
+			printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
 		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
 	}
 
@@ -2446,7 +2436,7 @@ int open_ctree(struct super_block *sb,
 	 */
 	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
 	    (sectorsize != leafsize)) {
-		printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes "
+		printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
 				"are not allowed for mixed block groups on %s\n",
 				sb->s_id);
 		goto fail_alloc;
@@ -2583,12 +2573,12 @@ int open_ctree(struct super_block *sb,
 	sb->s_blocksize_bits = blksize_bits(sectorsize);
 
 	if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
-		printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
+		printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id);
 		goto fail_sb_buffer;
 	}
 
 	if (sectorsize != PAGE_SIZE) {
-		printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) "
+		printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) "
 		       "found on %s\n", (unsigned long)sectorsize, sb->s_id);
 		goto fail_sb_buffer;
 	}
@@ -2597,7 +2587,7 @@ int open_ctree(struct super_block *sb,
 	ret = btrfs_read_sys_array(tree_root);
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
-		printk(KERN_WARNING "btrfs: failed to read the system "
+		printk(KERN_WARNING "BTRFS: failed to read the system "
 		       "array on %s\n", sb->s_id);
 		goto fail_sb_buffer;
 	}
@@ -2614,7 +2604,7 @@ int open_ctree(struct super_block *sb,
 					   blocksize, generation);
 	if (!chunk_root->node ||
 	    !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
-		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+		printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
 		       sb->s_id);
 		goto fail_tree_roots;
 	}
@@ -2626,7 +2616,7 @@ int open_ctree(struct super_block *sb,
 
 	ret = btrfs_read_chunk_tree(chunk_root);
 	if (ret) {
-		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
+		printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n",
 		       sb->s_id);
 		goto fail_tree_roots;
 	}
@@ -2638,7 +2628,7 @@ int open_ctree(struct super_block *sb,
 	btrfs_close_extra_devices(fs_info, fs_devices, 0);
 
 	if (!fs_devices->latest_bdev) {
-		printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
+		printk(KERN_CRIT "BTRFS: failed to read devices on %s\n",
 		       sb->s_id);
 		goto fail_tree_roots;
 	}
@@ -2653,7 +2643,7 @@ retry_root_backup:
 					  blocksize, generation);
 	if (!tree_root->node ||
 	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
-		printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+		printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
 		       sb->s_id);
 
 		goto recovery_tree_root;
@@ -2724,50 +2714,56 @@ retry_root_backup:
 
 	ret = btrfs_recover_balance(fs_info);
 	if (ret) {
-		printk(KERN_WARNING "btrfs: failed to recover balance\n");
+		printk(KERN_WARNING "BTRFS: failed to recover balance\n");
 		goto fail_block_groups;
 	}
 
 	ret = btrfs_init_dev_stats(fs_info);
 	if (ret) {
-		printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
+		printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n",
 		       ret);
 		goto fail_block_groups;
 	}
 
 	ret = btrfs_init_dev_replace(fs_info);
 	if (ret) {
-		pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+		pr_err("BTRFS: failed to init dev_replace: %d\n", ret);
 		goto fail_block_groups;
 	}
 
 	btrfs_close_extra_devices(fs_info, fs_devices, 1);
 
-	ret = btrfs_init_space_info(fs_info);
+	ret = btrfs_sysfs_add_one(fs_info);
 	if (ret) {
-		printk(KERN_ERR "Failed to initial space info: %d\n", ret);
+		pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
 		goto fail_block_groups;
 	}
 
+	ret = btrfs_init_space_info(fs_info);
+	if (ret) {
+		printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret);
+		goto fail_sysfs;
+	}
+
 	ret = btrfs_read_block_groups(extent_root);
 	if (ret) {
-		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
-		goto fail_block_groups;
+		printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
+		goto fail_sysfs;
 	}
 	fs_info->num_tolerated_disk_barrier_failures =
 		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
 	if (fs_info->fs_devices->missing_devices >
 	     fs_info->num_tolerated_disk_barrier_failures &&
 	    !(sb->s_flags & MS_RDONLY)) {
-		printk(KERN_WARNING
-		       "Btrfs: too many missing devices, writeable mount is not allowed\n");
-		goto fail_block_groups;
+		printk(KERN_WARNING "BTRFS: "
+			"too many missing devices, writeable mount is not allowed\n");
+		goto fail_sysfs;
 	}
 
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (IS_ERR(fs_info->cleaner_kthread))
-		goto fail_block_groups;
+		goto fail_sysfs;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
@@ -2778,11 +2774,15 @@ retry_root_backup:
 	if (!btrfs_test_opt(tree_root, SSD) &&
 	    !btrfs_test_opt(tree_root, NOSSD) &&
 	    !fs_info->fs_devices->rotating) {
-		printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
+		printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD "
 		       "mode\n");
 		btrfs_set_opt(fs_info->mount_opt, SSD);
 	}
 
+	/* Set the real inode map cache flag */
+	if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE))
+		btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE);
+
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
 		ret = btrfsic_mount(tree_root, fs_devices,
@@ -2791,7 +2791,7 @@ retry_root_backup:
 				    1 : 0,
 				    fs_info->check_integrity_print_mask);
 		if (ret)
-			printk(KERN_WARNING "btrfs: failed to initialize"
+			printk(KERN_WARNING "BTRFS: failed to initialize"
 			       " integrity check module %s\n", sb->s_id);
 	}
 #endif
@@ -2804,7 +2804,7 @@ retry_root_backup:
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
 		if (fs_devices->rw_devices == 0) {
-			printk(KERN_WARNING "Btrfs log replay required "
+			printk(KERN_WARNING "BTRFS: log replay required "
 			       "on RO media\n");
 			err = -EIO;
 			goto fail_qgroup;
@@ -2827,7 +2827,7 @@ retry_root_backup:
 						      generation + 1);
 		if (!log_tree_root->node ||
 		    !extent_buffer_uptodate(log_tree_root->node)) {
-			printk(KERN_ERR "btrfs: failed to read log tree\n");
+			printk(KERN_ERR "BTRFS: failed to read log tree\n");
 			free_extent_buffer(log_tree_root->node);
 			kfree(log_tree_root);
 			goto fail_trans_kthread;
@@ -2861,7 +2861,7 @@ retry_root_backup:
 		ret = btrfs_recover_relocation(tree_root);
 		if (ret < 0) {
 			printk(KERN_WARNING
-			       "btrfs: failed to recover relocation\n");
+			       "BTRFS: failed to recover relocation\n");
 			err = -EINVAL;
 			goto fail_qgroup;
 		}
@@ -2891,14 +2891,14 @@ retry_root_backup:
 
 	ret = btrfs_resume_balance_async(fs_info);
 	if (ret) {
-		printk(KERN_WARNING "btrfs: failed to resume balance\n");
+		printk(KERN_WARNING "BTRFS: failed to resume balance\n");
 		close_ctree(tree_root);
 		return ret;
 	}
 
 	ret = btrfs_resume_dev_replace_async(fs_info);
 	if (ret) {
-		pr_warn("btrfs: failed to resume dev_replace\n");
+		pr_warn("BTRFS: failed to resume dev_replace\n");
 		close_ctree(tree_root);
 		return ret;
 	}
@@ -2906,20 +2906,20 @@ retry_root_backup:
 	btrfs_qgroup_rescan_resume(fs_info);
 
 	if (create_uuid_tree) {
-		pr_info("btrfs: creating UUID tree\n");
+		pr_info("BTRFS: creating UUID tree\n");
 		ret = btrfs_create_uuid_tree(fs_info);
 		if (ret) {
-			pr_warn("btrfs: failed to create the UUID tree %d\n",
+			pr_warn("BTRFS: failed to create the UUID tree %d\n",
 				ret);
 			close_ctree(tree_root);
 			return ret;
 		}
 	} else if (check_uuid_tree ||
 		   btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) {
-		pr_info("btrfs: checking UUID tree\n");
+		pr_info("BTRFS: checking UUID tree\n");
 		ret = btrfs_check_uuid_tree(fs_info);
 		if (ret) {
-			pr_warn("btrfs: failed to check the UUID tree %d\n",
+			pr_warn("BTRFS: failed to check the UUID tree %d\n",
 				ret);
 			close_ctree(tree_root);
 			return ret;
@@ -2945,6 +2945,9 @@ fail_cleaner:
 	 */
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 
+fail_sysfs:
+	btrfs_sysfs_remove_one(fs_info);
+
 fail_block_groups:
 	btrfs_put_block_group_cache(fs_info);
 	btrfs_free_block_groups(fs_info);
@@ -3000,7 +3003,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	struct btrfs_device *device = (struct btrfs_device *)
 		bh->b_private;
 
-	printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to "
+	printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
 				  "I/O error on %s\n",
 				  rcu_str_deref(device->name));
 	/* note, we dont' set_buffer_write_io_error because we have
@@ -3119,7 +3122,7 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh = __getblk(device->bdev, bytenr / 4096,
 				      BTRFS_SUPER_INFO_SIZE);
 			if (!bh) {
-				printk(KERN_ERR "btrfs: couldn't get super "
+				printk(KERN_ERR "BTRFS: couldn't get super "
 				       "buffer head for bytenr %Lu\n", bytenr);
 				errors++;
 				continue;
@@ -3140,7 +3143,10 @@ static int write_dev_supers(struct btrfs_device *device,
 		 * we fua the first super. The others we allow
 		 * to go down lazy.
 		 */
-		ret = btrfsic_submit_bh(WRITE_FUA, bh);
+		if (i == 0)
+			ret = btrfsic_submit_bh(WRITE_FUA, bh);
+		else
+			ret = btrfsic_submit_bh(WRITE_SYNC, bh);
 		if (ret)
 			errors++;
 	}
@@ -3186,7 +3192,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 	wait_for_completion(&device->flush_wait);
 
 	if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
-		printk_in_rcu("btrfs: disabling barriers on dev %s\n",
+		printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
 			      rcu_str_deref(device->name));
 		device->nobarriers = 1;
 	} else if (!bio_flagged(bio, BIO_UPTODATE)) {
@@ -3407,7 +3413,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 			total_errors++;
 	}
 	if (total_errors > max_errors) {
-		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+		btrfs_err(root->fs_info, "%d errors while writing supers",
 		       total_errors);
 		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
@@ -3455,10 +3461,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
 	if (btrfs_root_refs(&root->root_item) == 0)
 		synchronize_srcu(&fs_info->subvol_srcu);
 
-	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		btrfs_free_log(NULL, root);
-		btrfs_free_log_root_tree(NULL, fs_info);
-	}
 
 	__btrfs_remove_free_space_cache(root->free_ino_pinned);
 	__btrfs_remove_free_space_cache(root->free_ino_ctl);
@@ -3563,14 +3567,12 @@ int close_ctree(struct btrfs_root *root)
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
 		if (ret)
-			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+			btrfs_err(root->fs_info, "commit super ret %d", ret);
 	}
 
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		btrfs_error_commit_super(root);
 
-	btrfs_put_block_group_cache(fs_info);
-
 	kthread_stop(fs_info->transaction_kthread);
 	kthread_stop(fs_info->cleaner_kthread);
 
@@ -3580,12 +3582,16 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_free_qgroup_config(root->fs_info);
 
 	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
-		printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
+		btrfs_info(root->fs_info, "at unmount delalloc count %lld",
 		       percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
+	btrfs_sysfs_remove_one(fs_info);
+
 	del_fs_roots(fs_info);
 
+	btrfs_put_block_group_cache(fs_info);
+
 	btrfs_free_block_groups(fs_info);
 
 	btrfs_stop_all_workers(fs_info);
@@ -3803,55 +3809,54 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 	delayed_refs = &trans->delayed_refs;
 
 	spin_lock(&delayed_refs->lock);
-	if (delayed_refs->num_entries == 0) {
+	if (atomic_read(&delayed_refs->num_entries) == 0) {
 		spin_unlock(&delayed_refs->lock);
-		printk(KERN_INFO "delayed_refs has NO entry\n");
+		btrfs_info(root->fs_info, "delayed_refs has NO entry");
 		return ret;
 	}
 
-	while ((node = rb_first(&delayed_refs->root)) != NULL) {
-		struct btrfs_delayed_ref_head *head = NULL;
+	while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+		struct btrfs_delayed_ref_head *head;
 		bool pin_bytes = false;
 
-		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-		atomic_set(&ref->refs, 1);
-		if (btrfs_delayed_ref_is_head(ref)) {
-
-			head = btrfs_delayed_node_to_head(ref);
-			if (!mutex_trylock(&head->mutex)) {
-				atomic_inc(&ref->refs);
-				spin_unlock(&delayed_refs->lock);
-
-				/* Need to wait for the delayed ref to run */
-				mutex_lock(&head->mutex);
-				mutex_unlock(&head->mutex);
-				btrfs_put_delayed_ref(ref);
-
-				spin_lock(&delayed_refs->lock);
-				continue;
-			}
+		head = rb_entry(node, struct btrfs_delayed_ref_head,
+				href_node);
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&head->node.refs);
+			spin_unlock(&delayed_refs->lock);
 
-			if (head->must_insert_reserved)
-				pin_bytes = true;
-			btrfs_free_delayed_extent_op(head->extent_op);
-			delayed_refs->num_heads--;
-			if (list_empty(&head->cluster))
-				delayed_refs->num_heads_ready--;
-			list_del_init(&head->cluster);
-		}
-
-		ref->in_tree = 0;
-		rb_erase(&ref->rb_node, &delayed_refs->root);
-		delayed_refs->num_entries--;
-		spin_unlock(&delayed_refs->lock);
-		if (head) {
-			if (pin_bytes)
-				btrfs_pin_extent(root, ref->bytenr,
-						 ref->num_bytes, 1);
+			mutex_lock(&head->mutex);
 			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(&head->node);
+			spin_lock(&delayed_refs->lock);
+			continue;
+		}
+		spin_lock(&head->lock);
+		while ((node = rb_first(&head->ref_root)) != NULL) {
+			ref = rb_entry(node, struct btrfs_delayed_ref_node,
+				       rb_node);
+			ref->in_tree = 0;
+			rb_erase(&ref->rb_node, &head->ref_root);
+			atomic_dec(&delayed_refs->num_entries);
+			btrfs_put_delayed_ref(ref);
 		}
-		btrfs_put_delayed_ref(ref);
+		if (head->must_insert_reserved)
+			pin_bytes = true;
+		btrfs_free_delayed_extent_op(head->extent_op);
+		delayed_refs->num_heads--;
+		if (head->processing == 0)
+			delayed_refs->num_heads_ready--;
+		atomic_dec(&delayed_refs->num_entries);
+		head->node.in_tree = 0;
+		rb_erase(&head->href_node, &delayed_refs->href_root);
+		spin_unlock(&head->lock);
+		spin_unlock(&delayed_refs->lock);
+		mutex_unlock(&head->mutex);
 
+		if (pin_bytes)
+			btrfs_pin_extent(root, head->node.bytenr,
+					 head->node.num_bytes, 1);
+		btrfs_put_delayed_ref(&head->node);
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}
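
Aside on the hunk above: it reflects the merge's new two-level delayed-ref layout, in which ref heads live in their own rbtree (delayed_refs->href_root, keyed by bytenr) and each head keeps its pending refs in a private rbtree (head->ref_root) guarded by head->lock, so teardown drains one head at a time instead of scanning one flat tree. A minimal user-space model of that shape, with linked lists standing in for the kernel's rb_root/rb_node and no locking; struct ref, struct ref_head and destroy_all() are illustrative names, not kernel identifiers:

	#include <stdio.h>

	struct ref {				/* models btrfs_delayed_ref_node */
		unsigned long long seq;
		struct ref *next;		/* stand-in for rb_node in head->ref_root */
	};

	struct ref_head {			/* models btrfs_delayed_ref_head */
		unsigned long long bytenr;
		struct ref *refs;		/* per-head tree: head->ref_root */
		struct ref_head *next;		/* stand-in for href_node in href_root */
	};

	/* Two-level teardown in the shape of btrfs_destroy_delayed_refs():
	 * drain each head's private ref list, then drop the head itself. */
	static void destroy_all(struct ref_head *heads)
	{
		for (struct ref_head *h = heads; h; h = h->next) {
			for (struct ref *r = h->refs; r; r = r->next)
				printf("  drop ref seq=%llu on bytenr %llu\n",
				       r->seq, h->bytenr);
			printf("drop head bytenr=%llu\n", h->bytenr);
		}
	}

	int main(void)
	{
		struct ref r2 = { 2, NULL }, r1 = { 1, &r2 };
		struct ref_head h2 = { 8192, NULL, NULL };
		struct ref_head h1 = { 4096, &r1, &h2 };

		destroy_all(&h1);
		return 0;
	}
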
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9c01509dd8ab..32312e09f0f5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,6 +35,7 @@
35#include "locking.h" 35#include "locking.h"
36#include "free-space-cache.h" 36#include "free-space-cache.h"
37#include "math.h" 37#include "math.h"
38#include "sysfs.h"
38 39
39#undef SCRAMBLE_DELAYED_REFS 40#undef SCRAMBLE_DELAYED_REFS
40 41
@@ -441,7 +442,8 @@ next:
 		if (ret)
 			break;
 
-		if (need_resched()) {
+		if (need_resched() ||
+		    rwsem_is_contended(&fs_info->extent_commit_sem)) {
 			caching_ctl->progress = last;
 			btrfs_release_path(path);
 			up_read(&fs_info->extent_commit_sem);
@@ -855,12 +857,14 @@ again:
 			btrfs_put_delayed_ref(&head->node);
 			goto search_again;
 		}
+		spin_lock(&head->lock);
 		if (head->extent_op && head->extent_op->update_flags)
 			extent_flags |= head->extent_op->flags_to_set;
 		else
 			BUG_ON(num_refs == 0);
 
 		num_refs += head->node.ref_mod;
+		spin_unlock(&head->lock);
 		mutex_unlock(&head->mutex);
 	}
 	spin_unlock(&delayed_refs->lock);
@@ -1070,11 +1074,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 	__le64 lenum;
 
 	lenum = cpu_to_le64(root_objectid);
-	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
+	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(owner);
-	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(offset);
-	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 
 	return ((u64)high_crc << 31) ^ (u64)low_crc;
 }
@@ -2285,64 +2289,62 @@ static noinline struct btrfs_delayed_ref_node *
 select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
 	struct rb_node *node;
-	struct btrfs_delayed_ref_node *ref;
-	int action = BTRFS_ADD_DELAYED_REF;
-again:
+	struct btrfs_delayed_ref_node *ref, *last = NULL;;
+
 	/*
 	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
 	 * this prevents ref count from going down to zero when
 	 * there still are pending delayed ref.
 	 */
-	node = rb_prev(&head->node.rb_node);
-	while (1) {
-		if (!node)
-			break;
+	node = rb_first(&head->ref_root);
+	while (node) {
 		ref = rb_entry(node, struct btrfs_delayed_ref_node,
 				rb_node);
-		if (ref->bytenr != head->node.bytenr)
-			break;
-		if (ref->action == action)
+		if (ref->action == BTRFS_ADD_DELAYED_REF)
 			return ref;
-		node = rb_prev(node);
-	}
-	if (action == BTRFS_ADD_DELAYED_REF) {
-		action = BTRFS_DROP_DELAYED_REF;
-		goto again;
+		else if (last == NULL)
+			last = ref;
+		node = rb_next(node);
 	}
-	return NULL;
+	return last;
 }
 
 /*
  * Returns 0 on success or if called with an already aborted transaction.
  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
  */
-static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root,
-				       struct list_head *cluster)
+				       unsigned long nr)
 {
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *locked_ref = NULL;
 	struct btrfs_delayed_extent_op *extent_op;
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	ktime_t start = ktime_get();
 	int ret;
-	int count = 0;
+	unsigned long count = 0;
+	unsigned long actual_count = 0;
 	int must_insert_reserved = 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
 	while (1) {
 		if (!locked_ref) {
-			/* pick a new head ref from the cluster list */
-			if (list_empty(cluster))
+			if (count >= nr)
 				break;
 
-			locked_ref = list_entry(cluster->next,
-				     struct btrfs_delayed_ref_head, cluster);
+			spin_lock(&delayed_refs->lock);
+			locked_ref = btrfs_select_ref_head(trans);
+			if (!locked_ref) {
+				spin_unlock(&delayed_refs->lock);
+				break;
+			}
 
 			/* grab the lock that says we are going to process
 			 * all the refs for this head */
 			ret = btrfs_delayed_ref_lock(trans, locked_ref);
-
+			spin_unlock(&delayed_refs->lock);
 			/*
 			 * we may have dropped the spin lock to get the head
 			 * mutex lock, and that might have given someone else
@@ -2363,6 +2365,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		 * finish. If we merged anything we need to re-loop so we can
 		 * get a good ref.
 		 */
+		spin_lock(&locked_ref->lock);
 		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
 					 locked_ref);
 
@@ -2374,17 +2377,15 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 
 		if (ref && ref->seq &&
 		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
-			/*
-			 * there are still refs with lower seq numbers in the
-			 * process of being added. Don't run this ref yet.
-			 */
-			list_del_init(&locked_ref->cluster);
+			spin_unlock(&locked_ref->lock);
 			btrfs_delayed_ref_unlock(locked_ref);
-			locked_ref = NULL;
+			spin_lock(&delayed_refs->lock);
+			locked_ref->processing = 0;
 			delayed_refs->num_heads_ready++;
 			spin_unlock(&delayed_refs->lock);
+			locked_ref = NULL;
 			cond_resched();
-			spin_lock(&delayed_refs->lock);
+			count++;
 			continue;
 		}
 
@@ -2399,6 +2400,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		locked_ref->extent_op = NULL;
 
 		if (!ref) {
+
+
 			/* All delayed refs have been processed, Go ahead
 			 * and send the head node to run_one_delayed_ref,
 			 * so that any accounting fixes can happen
@@ -2411,8 +2414,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 			}
 
 			if (extent_op) {
-				spin_unlock(&delayed_refs->lock);
-
+				spin_unlock(&locked_ref->lock);
 				ret = run_delayed_extent_op(trans, root,
 							    ref, extent_op);
 				btrfs_free_delayed_extent_op(extent_op);
@@ -2426,19 +2428,39 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 					 */
 					if (must_insert_reserved)
 						locked_ref->must_insert_reserved = 1;
+					locked_ref->processing = 0;
 					btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
-					spin_lock(&delayed_refs->lock);
 					btrfs_delayed_ref_unlock(locked_ref);
 					return ret;
 				}
+				continue;
+			}
 
-				goto next;
+			/*
+			 * Need to drop our head ref lock and re-aqcuire the
+			 * delayed ref lock and then re-check to make sure
+			 * nobody got added.
+			 */
+			spin_unlock(&locked_ref->lock);
+			spin_lock(&delayed_refs->lock);
+			spin_lock(&locked_ref->lock);
+			if (rb_first(&locked_ref->ref_root)) {
+				spin_unlock(&locked_ref->lock);
+				spin_unlock(&delayed_refs->lock);
+				continue;
 			}
+			ref->in_tree = 0;
+			delayed_refs->num_heads--;
+			rb_erase(&locked_ref->href_node,
+				 &delayed_refs->href_root);
+			spin_unlock(&delayed_refs->lock);
+		} else {
+			actual_count++;
+			ref->in_tree = 0;
+			rb_erase(&ref->rb_node, &locked_ref->ref_root);
 		}
+		atomic_dec(&delayed_refs->num_entries);
 
-		ref->in_tree = 0;
-		rb_erase(&ref->rb_node, &delayed_refs->root);
-		delayed_refs->num_entries--;
 		if (!btrfs_delayed_ref_is_head(ref)) {
 			/*
 			 * when we play the delayed ref, also correct the
@@ -2455,20 +2477,18 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 			default:
 				WARN_ON(1);
 			}
-		} else {
-			list_del_init(&locked_ref->cluster);
 		}
-		spin_unlock(&delayed_refs->lock);
+		spin_unlock(&locked_ref->lock);
 
 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
 					  must_insert_reserved);
 
 		btrfs_free_delayed_extent_op(extent_op);
 		if (ret) {
+			locked_ref->processing = 0;
 			btrfs_delayed_ref_unlock(locked_ref);
 			btrfs_put_delayed_ref(ref);
 			btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
-			spin_lock(&delayed_refs->lock);
 			return ret;
 		}
 
@@ -2484,11 +2504,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		}
 		btrfs_put_delayed_ref(ref);
 		count++;
-next:
 		cond_resched();
+	}
+
+	/*
+	 * We don't want to include ref heads since we can have empty ref heads
+	 * and those will drastically skew our runtime down since we just do
+	 * accounting, no actual extent tree updates.
+	 */
+	if (actual_count > 0) {
+		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
+		u64 avg;
+
+		/*
+		 * We weigh the current average higher than our current runtime
+		 * to avoid large swings in the average.
+		 */
 		spin_lock(&delayed_refs->lock);
+		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
+		avg = div64_u64(avg, 4);
+		fs_info->avg_delayed_ref_runtime = avg;
+		spin_unlock(&delayed_refs->lock);
 	}
-	return count;
+	return 0;
 }
 
 #ifdef SCRAMBLE_DELAYED_REFS
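
Aside on the hunk above: the actual_count bookkeeping feeds a 3:1 weighted moving average, where the stored avg_delayed_ref_runtime counts three times and the just-measured batch runtime once, so avg' = (3 * avg + runtime) / 4 and a single slow batch only moves the estimate a quarter of the way. A stand-alone sketch of the same update rule (user-space C; update_avg() is an illustrative name, not kernel code):

	#include <stdio.h>

	/* Same weighting as the hunk above: new_avg = (3 * avg + sample) / 4 */
	static unsigned long long update_avg(unsigned long long avg,
					     unsigned long long sample)
	{
		return (avg * 3 + sample) / 4;
	}

	int main(void)
	{
		/* start from the open_ctree() default of NSEC_PER_SEC / 64 */
		unsigned long long avg = 1000000000ULL / 64;
		int i;

		/* a run of slow 100ms batches pulls the average up gradually */
		for (i = 0; i < 4; i++) {
			avg = update_avg(avg, 100000000ULL);
			printf("avg after sample %d: %llu ns\n", i + 1, avg);
		}
		return 0;
	}
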
@@ -2570,16 +2608,6 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
-		      int count)
-{
-	int val = atomic_read(&delayed_refs->ref_seq);
-
-	if (val < seq || val >= seq + count)
-		return 1;
-	return 0;
-}
-
 static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 {
 	u64 num_bytes;
@@ -2596,7 +2624,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
 	return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
 }
 
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root)
 {
 	struct btrfs_block_rsv *global_rsv;
@@ -2625,6 +2653,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 num_entries =
+		atomic_read(&trans->transaction->delayed_refs.num_entries);
+	u64 avg_runtime;
+
+	smp_mb();
+	avg_runtime = fs_info->avg_delayed_ref_runtime;
+	if (num_entries * avg_runtime >= NSEC_PER_SEC)
+		return 1;
+
+	return btrfs_check_space_for_delayed_refs(trans, root);
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far. count can be
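
Worked numbers for the new throttle test above: open_ctree() seeds avg_delayed_ref_runtime with NSEC_PER_SEC / 64, roughly 15.6 ms, so num_entries * avg_runtime reaches the one-second budget at about 64 queued entries. If measured batches later average 1 ms, the same inequality tolerates about 1000 entries before returning 1; the permitted backlog is simply however many refs are estimated to take one second to run, with btrfs_check_space_for_delayed_refs() as the fallback check.
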
@@ -2640,13 +2684,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 {
 	struct rb_node *node;
 	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_delayed_ref_node *ref;
-	struct list_head cluster;
+	struct btrfs_delayed_ref_head *head;
 	int ret;
-	u64 delayed_start;
 	int run_all = count == (unsigned long)-1;
 	int run_most = 0;
-	int loops;
 
 	/* We'll clean this up in btrfs_cleanup_transaction */
 	if (trans->aborted)
@@ -2658,130 +2699,40 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	INIT_LIST_HEAD(&cluster);
 	if (count == 0) {
-		count = delayed_refs->num_entries * 2;
+		count = atomic_read(&delayed_refs->num_entries) * 2;
 		run_most = 1;
 	}
 
-	if (!run_all && !run_most) {
-		int old;
-		int seq = atomic_read(&delayed_refs->ref_seq);
-
-progress:
-		old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
-		if (old) {
-			DEFINE_WAIT(__wait);
-			if (delayed_refs->flushing ||
-			    !btrfs_should_throttle_delayed_refs(trans, root))
-				return 0;
-
-			prepare_to_wait(&delayed_refs->wait, &__wait,
-					TASK_UNINTERRUPTIBLE);
-
-			old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
-			if (old) {
-				schedule();
-				finish_wait(&delayed_refs->wait, &__wait);
-
-				if (!refs_newer(delayed_refs, seq, 256))
-					goto progress;
-				else
-					return 0;
-			} else {
-				finish_wait(&delayed_refs->wait, &__wait);
-				goto again;
-			}
-		}
-
-	} else {
-		atomic_inc(&delayed_refs->procs_running_refs);
-	}
-
 again:
-	loops = 0;
-	spin_lock(&delayed_refs->lock);
-
 #ifdef SCRAMBLE_DELAYED_REFS
 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
-
-	while (1) {
-		if (!(run_all || run_most) &&
-		    !btrfs_should_throttle_delayed_refs(trans, root))
-			break;
-
-		/*
-		 * go find something we can process in the rbtree. We start at
-		 * the beginning of the tree, and then build a cluster
-		 * of refs to process starting at the first one we are able to
-		 * lock
-		 */
-		delayed_start = delayed_refs->run_delayed_start;
-		ret = btrfs_find_ref_cluster(trans, &cluster,
-					     delayed_refs->run_delayed_start);
-		if (ret)
-			break;
-
-		ret = run_clustered_refs(trans, root, &cluster);
-		if (ret < 0) {
-			btrfs_release_ref_cluster(&cluster);
-			spin_unlock(&delayed_refs->lock);
-			btrfs_abort_transaction(trans, root, ret);
-			atomic_dec(&delayed_refs->procs_running_refs);
-			wake_up(&delayed_refs->wait);
-			return ret;
-		}
-
-		atomic_add(ret, &delayed_refs->ref_seq);
-
-		count -= min_t(unsigned long, ret, count);
-
-		if (count == 0)
-			break;
-
-		if (delayed_start >= delayed_refs->run_delayed_start) {
-			if (loops == 0) {
-				/*
-				 * btrfs_find_ref_cluster looped. let's do one
-				 * more cycle. if we don't run any delayed ref
-				 * during that cycle (because we can't because
-				 * all of them are blocked), bail out.
-				 */
-				loops = 1;
-			} else {
-				/*
-				 * no runnable refs left, stop trying
-				 */
-				BUG_ON(run_all);
-				break;
-			}
-		}
-		if (ret) {
-			/* refs were run, let's reset staleness detection */
-			loops = 0;
-		}
+	ret = __btrfs_run_delayed_refs(trans, root, count);
+	if (ret < 0) {
+		btrfs_abort_transaction(trans, root, ret);
+		return ret;
 	}
 
 	if (run_all) {
-		if (!list_empty(&trans->new_bgs)) {
-			spin_unlock(&delayed_refs->lock);
+		if (!list_empty(&trans->new_bgs))
 			btrfs_create_pending_block_groups(trans, root);
-			spin_lock(&delayed_refs->lock);
-		}
 
-		node = rb_first(&delayed_refs->root);
-		if (!node)
+		spin_lock(&delayed_refs->lock);
+		node = rb_first(&delayed_refs->href_root);
+		if (!node) {
+			spin_unlock(&delayed_refs->lock);
 			goto out;
+		}
 		count = (unsigned long)-1;
 
 		while (node) {
-			ref = rb_entry(node, struct btrfs_delayed_ref_node,
-				       rb_node);
-			if (btrfs_delayed_ref_is_head(ref)) {
-				struct btrfs_delayed_ref_head *head;
+			head = rb_entry(node, struct btrfs_delayed_ref_head,
+					href_node);
+			if (btrfs_delayed_ref_is_head(&head->node)) {
+				struct btrfs_delayed_ref_node *ref;
 
-				head = btrfs_delayed_node_to_head(ref);
+				ref = &head->node;
 				atomic_inc(&ref->refs);
 
 				spin_unlock(&delayed_refs->lock);
@@ -2795,20 +2746,16 @@ again:
 				btrfs_put_delayed_ref(ref);
 				cond_resched();
 				goto again;
+			} else {
+				WARN_ON(1);
 			}
 			node = rb_next(node);
 		}
 		spin_unlock(&delayed_refs->lock);
-		schedule_timeout(1);
+		cond_resched();
 		goto again;
 	}
 out:
-	atomic_dec(&delayed_refs->procs_running_refs);
-	smp_mb();
-	if (waitqueue_active(&delayed_refs->wait))
-		wake_up(&delayed_refs->wait);
-
-	spin_unlock(&delayed_refs->lock);
 	assert_qgroups_uptodate(trans);
 	return 0;
 }
@@ -2850,12 +2797,13 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
 	struct rb_node *node;
 	int ret = 0;
 
-	ret = -ENOENT;
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 	head = btrfs_find_delayed_ref_head(trans, bytenr);
-	if (!head)
-		goto out;
+	if (!head) {
+		spin_unlock(&delayed_refs->lock);
+		return 0;
+	}
 
 	if (!mutex_trylock(&head->mutex)) {
 		atomic_inc(&head->node.refs);
@@ -2872,40 +2820,35 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
 		btrfs_put_delayed_ref(&head->node);
 		return -EAGAIN;
 	}
+	spin_unlock(&delayed_refs->lock);
 
-	node = rb_prev(&head->node.rb_node);
-	if (!node)
-		goto out_unlock;
-
-	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-
-	if (ref->bytenr != bytenr)
-		goto out_unlock;
-
-	ret = 1;
-	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
-		goto out_unlock;
+	spin_lock(&head->lock);
+	node = rb_first(&head->ref_root);
+	while (node) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_next(node);
 
-	data_ref = btrfs_delayed_node_to_data_ref(ref);
+		/* If it's a shared ref we know a cross reference exists */
+		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
+			ret = 1;
+			break;
+		}
 
-	node = rb_prev(node);
-	if (node) {
-		int seq = ref->seq;
+		data_ref = btrfs_delayed_node_to_data_ref(ref);
 
-		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-		if (ref->bytenr == bytenr && ref->seq == seq)
-			goto out_unlock;
+		/*
+		 * If our ref doesn't match the one we're currently looking at
+		 * then we have a cross reference.
+		 */
+		if (data_ref->root != root->root_key.objectid ||
+		    data_ref->objectid != objectid ||
+		    data_ref->offset != offset) {
+			ret = 1;
+			break;
+		}
 	}
-
-	if (data_ref->root != root->root_key.objectid ||
-	    data_ref->objectid != objectid || data_ref->offset != offset)
-		goto out_unlock;
-
-	ret = 0;
-out_unlock:
+	spin_unlock(&head->lock);
 	mutex_unlock(&head->mutex);
-out:
-	spin_unlock(&delayed_refs->lock);
 	return ret;
 }
 
@@ -3402,6 +3345,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
 	return readonly;
 }
 
+static const char *alloc_name(u64 flags)
+{
+	switch (flags) {
+	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
+		return "mixed";
+	case BTRFS_BLOCK_GROUP_METADATA:
+		return "metadata";
+	case BTRFS_BLOCK_GROUP_DATA:
+		return "data";
+	case BTRFS_BLOCK_GROUP_SYSTEM:
+		return "system";
+	default:
+		WARN_ON(1);
+		return "invalid-combination";
+	};
+}
+
 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 			     u64 total_bytes, u64 bytes_used,
 			     struct btrfs_space_info **space_info)
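
Aside: alloc_name() above supplies the directory name for the per-space_info kobject that a later hunk registers via kobject_init_and_add(). Assuming the sysfs layout this merge introduces in fs/btrfs/sysfs.c (an "allocation" parent kobject per filesystem), the space infos then show up as /sys/fs/btrfs/<fsid>/allocation/data, metadata, system, or mixed; verify against sysfs.c if the exact paths matter.
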
@@ -3439,8 +3399,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		return ret;
 	}
 
-	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
 		INIT_LIST_HEAD(&found->block_groups[i]);
+		kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype);
+	}
 	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
 	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
@@ -3457,11 +3419,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->chunk_alloc = 0;
 	found->flush = 0;
 	init_waitqueue_head(&found->wait);
+
+	ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
+				    info->space_info_kobj, "%s",
+				    alloc_name(found->flags));
+	if (ret) {
+		kfree(found);
+		return ret;
+	}
+
 	*space_info = found;
 	list_add_rcu(&found->list, &info->space_info);
 	if (flags & BTRFS_BLOCK_GROUP_DATA)
 		info->data_sinfo = found;
-	return 0;
+
+	return ret;
 }
 
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
@@ -4637,7 +4609,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
 			     u64 num_bytes)
 {
 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
-	if (global_rsv->full || global_rsv == block_rsv ||
+	if (global_rsv == block_rsv ||
 	    block_rsv->space_info != global_rsv->space_info)
 		global_rsv = NULL;
 	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
@@ -5916,24 +5888,16 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_delayed_ref_head *head;
 	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_delayed_ref_node *ref;
-	struct rb_node *node;
 	int ret = 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 	head = btrfs_find_delayed_ref_head(trans, bytenr);
 	if (!head)
-		goto out;
+		goto out_delayed_unlock;
 
-	node = rb_prev(&head->node.rb_node);
-	if (!node)
-		goto out;
-
-	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-
-	/* there are still entries for this ref, we can't drop it */
-	if (ref->bytenr == bytenr)
+	spin_lock(&head->lock);
+	if (rb_first(&head->ref_root))
 		goto out;
 
 	if (head->extent_op) {
@@ -5955,19 +5919,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	 * ahead and process it.
 	 */
 	head->node.in_tree = 0;
-	rb_erase(&head->node.rb_node, &delayed_refs->root);
+	rb_erase(&head->href_node, &delayed_refs->href_root);
 
-	delayed_refs->num_entries--;
+	atomic_dec(&delayed_refs->num_entries);
 
 	/*
 	 * we don't take a ref on the node because we're removing it from the
 	 * tree, so we just steal the ref the tree was holding.
 	 */
 	delayed_refs->num_heads--;
-	if (list_empty(&head->cluster))
+	if (head->processing == 0)
 		delayed_refs->num_heads_ready--;
-
-	list_del_init(&head->cluster);
+	head->processing = 0;
+	spin_unlock(&head->lock);
 	spin_unlock(&delayed_refs->lock);
 
 	BUG_ON(head->extent_op);
@@ -5978,6 +5942,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	btrfs_put_delayed_ref(&head->node);
 	return ret;
 out:
+	spin_unlock(&head->lock);
+
+out_delayed_unlock:
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
@@ -6145,11 +6112,29 @@ int __get_raid_index(u64 flags)
 	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
 }
 
-static int get_block_group_index(struct btrfs_block_group_cache *cache)
+int get_block_group_index(struct btrfs_block_group_cache *cache)
 {
 	return __get_raid_index(cache->flags);
 }
 
+static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
+	[BTRFS_RAID_RAID10]	= "raid10",
+	[BTRFS_RAID_RAID1]	= "raid1",
+	[BTRFS_RAID_DUP]	= "dup",
+	[BTRFS_RAID_RAID0]	= "raid0",
+	[BTRFS_RAID_SINGLE]	= "single",
+	[BTRFS_RAID_RAID5]	= "raid5",
+	[BTRFS_RAID_RAID6]	= "raid6",
+};
+
+static const char *get_raid_name(enum btrfs_raid_types type)
+{
+	if (type >= BTRFS_NR_RAID_TYPES)
+		return NULL;
+
+	return btrfs_raid_type_names[type];
+}
+
 enum btrfs_loop_type {
 	LOOP_CACHING_NOWAIT = 0,
 	LOOP_CACHING_WAIT = 1,
@@ -6177,7 +6162,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
 	struct btrfs_free_cluster *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group = NULL;
-	struct btrfs_block_group_cache *used_block_group;
 	u64 search_start = 0;
 	u64 max_extent_size = 0;
 	int empty_cluster = 2 * 1024 * 1024;
@@ -6186,7 +6170,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	int index = __get_raid_index(flags);
 	int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
-	bool found_uncached_bg = false;
 	bool failed_cluster_refill = false;
 	bool failed_alloc = false;
 	bool use_cluster = true;
@@ -6239,7 +6222,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	if (search_start == hint_byte) {
 		block_group = btrfs_lookup_block_group(root->fs_info,
 						       search_start);
-		used_block_group = block_group;
 		/*
 		 * we don't want to use the block group if it doesn't match our
 		 * allocation bits, or if its not cached.
@@ -6276,7 +6258,6 @@ search:
 		u64 offset;
 		int cached;
 
-		used_block_group = block_group;
 		btrfs_get_block_group(block_group);
 		search_start = block_group->key.objectid;
 
@@ -6304,7 +6285,6 @@ search:
 have_block_group:
 		cached = block_group_cache_done(block_group);
 		if (unlikely(!cached)) {
-			found_uncached_bg = true;
 			ret = cache_block_group(block_group, 0);
 			BUG_ON(ret < 0);
 			ret = 0;
@@ -6320,6 +6300,7 @@ have_block_group:
 		 * lets look there
 		 */
 		if (last_ptr) {
+			struct btrfs_block_group_cache *used_block_group;
 			unsigned long aligned_cluster;
 			/*
 			 * the refill lock keeps out other
@@ -6330,10 +6311,8 @@ have_block_group:
6330 if (used_block_group != block_group && 6311 if (used_block_group != block_group &&
6331 (!used_block_group || 6312 (!used_block_group ||
6332 used_block_group->ro || 6313 used_block_group->ro ||
6333 !block_group_bits(used_block_group, flags))) { 6314 !block_group_bits(used_block_group, flags)))
6334 used_block_group = block_group;
6335 goto refill_cluster; 6315 goto refill_cluster;
6336 }
6337 6316
6338 if (used_block_group != block_group) 6317 if (used_block_group != block_group)
6339 btrfs_get_block_group(used_block_group); 6318 btrfs_get_block_group(used_block_group);
@@ -6347,17 +6326,19 @@ have_block_group:
6347 /* we have a block, we're done */ 6326 /* we have a block, we're done */
6348 spin_unlock(&last_ptr->refill_lock); 6327 spin_unlock(&last_ptr->refill_lock);
6349 trace_btrfs_reserve_extent_cluster(root, 6328 trace_btrfs_reserve_extent_cluster(root,
6350 block_group, search_start, num_bytes); 6329 used_block_group,
6330 search_start, num_bytes);
6331 if (used_block_group != block_group) {
6332 btrfs_put_block_group(block_group);
6333 block_group = used_block_group;
6334 }
6351 goto checks; 6335 goto checks;
6352 } 6336 }
6353 6337
6354 WARN_ON(last_ptr->block_group != used_block_group); 6338 WARN_ON(last_ptr->block_group != used_block_group);
6355 if (used_block_group != block_group) { 6339 if (used_block_group != block_group)
6356 btrfs_put_block_group(used_block_group); 6340 btrfs_put_block_group(used_block_group);
6357 used_block_group = block_group;
6358 }
6359refill_cluster: 6341refill_cluster:
6360 BUG_ON(used_block_group != block_group);
6361 /* If we are on LOOP_NO_EMPTY_SIZE, we can't 6342 /* If we are on LOOP_NO_EMPTY_SIZE, we can't
6362 * set up a new clusters, so lets just skip it 6343 * set up a new clusters, so lets just skip it
6363 * and let the allocator find whatever block 6344 * and let the allocator find whatever block
@@ -6476,25 +6457,25 @@ unclustered_alloc:
6476 goto loop; 6457 goto loop;
6477 } 6458 }
6478checks: 6459checks:
6479 search_start = stripe_align(root, used_block_group, 6460 search_start = stripe_align(root, block_group,
6480 offset, num_bytes); 6461 offset, num_bytes);
6481 6462
6482 /* move on to the next group */ 6463 /* move on to the next group */
6483 if (search_start + num_bytes > 6464 if (search_start + num_bytes >
6484 used_block_group->key.objectid + used_block_group->key.offset) { 6465 block_group->key.objectid + block_group->key.offset) {
6485 btrfs_add_free_space(used_block_group, offset, num_bytes); 6466 btrfs_add_free_space(block_group, offset, num_bytes);
6486 goto loop; 6467 goto loop;
6487 } 6468 }
6488 6469
6489 if (offset < search_start) 6470 if (offset < search_start)
6490 btrfs_add_free_space(used_block_group, offset, 6471 btrfs_add_free_space(block_group, offset,
6491 search_start - offset); 6472 search_start - offset);
6492 BUG_ON(offset > search_start); 6473 BUG_ON(offset > search_start);
6493 6474
6494 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, 6475 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
6495 alloc_type); 6476 alloc_type);
6496 if (ret == -EAGAIN) { 6477 if (ret == -EAGAIN) {
6497 btrfs_add_free_space(used_block_group, offset, num_bytes); 6478 btrfs_add_free_space(block_group, offset, num_bytes);
6498 goto loop; 6479 goto loop;
6499 } 6480 }
6500 6481
@@ -6504,16 +6485,12 @@ checks:
6504 6485
6505 trace_btrfs_reserve_extent(orig_root, block_group, 6486 trace_btrfs_reserve_extent(orig_root, block_group,
6506 search_start, num_bytes); 6487 search_start, num_bytes);
6507 if (used_block_group != block_group)
6508 btrfs_put_block_group(used_block_group);
6509 btrfs_put_block_group(block_group); 6488 btrfs_put_block_group(block_group);
6510 break; 6489 break;
6511loop: 6490loop:
6512 failed_cluster_refill = false; 6491 failed_cluster_refill = false;
6513 failed_alloc = false; 6492 failed_alloc = false;
6514 BUG_ON(index != get_block_group_index(block_group)); 6493 BUG_ON(index != get_block_group_index(block_group));
6515 if (used_block_group != block_group)
6516 btrfs_put_block_group(used_block_group);
6517 btrfs_put_block_group(block_group); 6494 btrfs_put_block_group(block_group);
6518 } 6495 }
6519 up_read(&space_info->groups_sem); 6496 up_read(&space_info->groups_sem);
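[editor's note] The find_free_extent hunks above narrow used_block_group from function scope into the cluster branch and make a cluster hit hand its reference over to block_group immediately, so the tail paths (checks:/loop:) only ever manage a single reference. A hedged sketch of that reference-handoff idiom, using a toy refcount rather than btrfs's types:

#include <stdio.h>

/* Toy refcounted object standing in for btrfs_block_group_cache. */
struct group { int refs; const char *name; };

static void put_group(struct group *g) { g->refs--; }

/*
 * Cluster hit: adopt `used` as the single tracked reference.  Dropping the
 * old ref and reassigning the pointer here means every later exit path
 * releases exactly one object, instead of remembering whether two are held.
 */
static struct group *adopt(struct group *block_group, struct group *used)
{
	if (used != block_group) {
		put_group(block_group);   /* release the ref held on the old group */
		block_group = used;       /* `used` already carries its own ref */
	}
	return block_group;
}

int main(void)
{
	struct group a = { 1, "a" }, b = { 1, "b" };
	struct group *bg = adopt(&a, &b);

	printf("%s refs: a=%d b=%d\n", bg->name, a.refs, b.refs); /* b refs: a=0 b=1 */
	put_group(bg);
	return 0;
}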
@@ -6584,12 +6561,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
6584 int index = 0; 6561 int index = 0;
6585 6562
6586 spin_lock(&info->lock); 6563 spin_lock(&info->lock);
6587 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", 6564 printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
6588 info->flags, 6565 info->flags,
6589 info->total_bytes - info->bytes_used - info->bytes_pinned - 6566 info->total_bytes - info->bytes_used - info->bytes_pinned -
6590 info->bytes_reserved - info->bytes_readonly, 6567 info->bytes_reserved - info->bytes_readonly,
6591 (info->full) ? "" : "not "); 6568 (info->full) ? "" : "not ");
6592 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " 6569 printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
6593 "reserved=%llu, may_use=%llu, readonly=%llu\n", 6570 "reserved=%llu, may_use=%llu, readonly=%llu\n",
6594 info->total_bytes, info->bytes_used, info->bytes_pinned, 6571 info->total_bytes, info->bytes_used, info->bytes_pinned,
6595 info->bytes_reserved, info->bytes_may_use, 6572 info->bytes_reserved, info->bytes_may_use,
@@ -6603,7 +6580,9 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
6603again: 6580again:
6604 list_for_each_entry(cache, &info->block_groups[index], list) { 6581 list_for_each_entry(cache, &info->block_groups[index], list) {
6605 spin_lock(&cache->lock); 6582 spin_lock(&cache->lock);
6606 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", 6583 printk(KERN_INFO "BTRFS: "
6584 "block group %llu has %llu bytes, "
6585 "%llu used %llu pinned %llu reserved %s\n",
6607 cache->key.objectid, cache->key.offset, 6586 cache->key.objectid, cache->key.offset,
6608 btrfs_block_group_used(&cache->item), cache->pinned, 6587 btrfs_block_group_used(&cache->item), cache->pinned,
6609 cache->reserved, cache->ro ? "[readonly]" : ""); 6588 cache->reserved, cache->ro ? "[readonly]" : "");
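[editor's note] The message hunks here and below hard-code a "BTRFS: " prefix into each printk; in the kernel the same consistency is often achieved with a per-file pr_fmt define, though this patch opts for explicit literals. A minimal userspace sketch of the single-prefix-macro alternative (the macro names are mine, not from the patch):

#include <stdio.h>

/* One place to change the subsystem prefix, instead of editing every call. */
#define LOG_PREFIX "BTRFS: "
#define log_info(fmt, ...) fprintf(stderr, LOG_PREFIX fmt, ##__VA_ARGS__)

int main(void)
{
	unsigned long long flags = 4, free_bytes = 1048576;

	log_info("space_info %llu has %llu free, is %sfull\n",
		 flags, free_bytes, "not ");
	return 0;
}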
@@ -6966,7 +6945,7 @@ again:
6966 /*DEFAULT_RATELIMIT_BURST*/ 1); 6945 /*DEFAULT_RATELIMIT_BURST*/ 1);
6967 if (__ratelimit(&_rs)) 6946 if (__ratelimit(&_rs))
6968 WARN(1, KERN_DEBUG 6947 WARN(1, KERN_DEBUG
6969 "btrfs: block rsv returned %d\n", ret); 6948 "BTRFS: block rsv returned %d\n", ret);
6970 } 6949 }
6971try_reserve: 6950try_reserve:
6972 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6951 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
@@ -7714,7 +7693,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7714 7693
7715 btrfs_end_transaction_throttle(trans, tree_root); 7694 btrfs_end_transaction_throttle(trans, tree_root);
7716 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 7695 if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
7717 pr_debug("btrfs: drop snapshot early exit\n"); 7696 pr_debug("BTRFS: drop snapshot early exit\n");
7718 err = -EAGAIN; 7697 err = -EAGAIN;
7719 goto out_free; 7698 goto out_free;
7720 } 7699 }
@@ -7779,7 +7758,7 @@ out:
7779 */ 7758 */
7780 if (!for_reloc && root_dropped == false) 7759 if (!for_reloc && root_dropped == false)
7781 btrfs_add_dead_root(root); 7760 btrfs_add_dead_root(root);
7782 if (err) 7761 if (err && err != -EAGAIN)
7783 btrfs_std_error(root->fs_info, err); 7762 btrfs_std_error(root->fs_info, err);
7784 return err; 7763 return err;
7785} 7764}
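[editor's note] The one-line change above stops btrfs_drop_snapshot from routing its own intentional early exit (-EAGAIN, returned a few hunks earlier when the cleaner should sleep) into btrfs_std_error. Filtering an expected errno before escalating is a small but load-bearing pattern; a sketch under those assumptions:

#include <errno.h>
#include <stdio.h>

static void fatal_error(int err) { fprintf(stderr, "fatal: %d\n", err); }

/* Returns -EAGAIN to mean "retry later", which callers expect and handle. */
static int drop_work(int should_yield)
{
	if (should_yield)
		return -EAGAIN;
	return 0;
}

int main(void)
{
	int err = drop_work(1);

	/* Escalate real failures only; -EAGAIN is part of the protocol. */
	if (err && err != -EAGAIN)
		fatal_error(err);
	printf("err=%d (not fatal)\n", err);
	return 0;
}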
@@ -8333,6 +8312,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8333 release_global_block_rsv(info); 8312 release_global_block_rsv(info);
8334 8313
8335 while (!list_empty(&info->space_info)) { 8314 while (!list_empty(&info->space_info)) {
8315 int i;
8316
8336 space_info = list_entry(info->space_info.next, 8317 space_info = list_entry(info->space_info.next,
8337 struct btrfs_space_info, 8318 struct btrfs_space_info,
8338 list); 8319 list);
@@ -8343,9 +8324,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8343 dump_space_info(space_info, 0, 0); 8324 dump_space_info(space_info, 0, 0);
8344 } 8325 }
8345 } 8326 }
8346 percpu_counter_destroy(&space_info->total_bytes_pinned);
8347 list_del(&space_info->list); 8327 list_del(&space_info->list);
8348 kfree(space_info); 8328 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
8329 struct kobject *kobj;
8330 kobj = &space_info->block_group_kobjs[i];
8331 if (kobj->parent) {
8332 kobject_del(kobj);
8333 kobject_put(kobj);
8334 }
8335 }
8336 kobject_del(&space_info->kobj);
8337 kobject_put(&space_info->kobj);
8349 } 8338 }
8350 return 0; 8339 return 0;
8351} 8340}
@@ -8356,10 +8345,57 @@ static void __link_block_group(struct btrfs_space_info *space_info,
8356 int index = get_block_group_index(cache); 8345 int index = get_block_group_index(cache);
8357 8346
8358 down_write(&space_info->groups_sem); 8347 down_write(&space_info->groups_sem);
8348 if (list_empty(&space_info->block_groups[index])) {
8349 struct kobject *kobj = &space_info->block_group_kobjs[index];
8350 int ret;
8351
8352 kobject_get(&space_info->kobj); /* put in release */
8353 ret = kobject_add(kobj, &space_info->kobj, "%s",
8354 get_raid_name(index));
8355 if (ret) {
8356 pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
8357 kobject_put(&space_info->kobj);
8358 }
8359 }
8359 list_add_tail(&cache->list, &space_info->block_groups[index]); 8360 list_add_tail(&cache->list, &space_info->block_groups[index]);
8360 up_write(&space_info->groups_sem); 8361 up_write(&space_info->groups_sem);
8361} 8362}
8362 8363
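[editor's note] __link_block_group now publishes a per-RAID-type kobject the first time a list becomes non-empty, and the teardown hunks (btrfs_free_block_groups, btrfs_remove_block_group) pair that with kobject_del followed by kobject_put; the extra kobject_get on the space_info parent is balanced when the child is released. A compilable userspace sketch of that get/put discipline with toy types, not the real kobject API:

#include <stdio.h>

/* Toy object mimicking the kobject get/put contract. */
struct kobj { int refs; char name[16]; struct kobj *parent; };

static void kobj_get(struct kobj *k) { k->refs++; }

static void kobj_put(struct kobj *k)
{
	if (--k->refs == 0) {
		/* Release drops the ref pinned on the parent at add time. */
		if (k->parent)
			kobj_put(k->parent);
		printf("freeing %s\n", k->name);
	}
}

static void kobj_add(struct kobj *child, struct kobj *parent, const char *name)
{
	kobj_get(parent);            /* parent must outlive the child */
	child->parent = parent;
	child->refs = 1;
	snprintf(child->name, sizeof(child->name), "%s", name);
}

int main(void)
{
	struct kobj space = { .refs = 1 }, raid1;

	snprintf(space.name, sizeof(space.name), "space_info");
	kobj_add(&raid1, &space, "raid1");

	/* Drop the last ref; in the kernel this follows kobject_del(). */
	kobj_put(&raid1);            /* frees raid1, drops the ref on space_info */
	kobj_put(&space);            /* frees space_info */
	return 0;
}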
8364static struct btrfs_block_group_cache *
8365btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
8366{
8367 struct btrfs_block_group_cache *cache;
8368
8369 cache = kzalloc(sizeof(*cache), GFP_NOFS);
8370 if (!cache)
8371 return NULL;
8372
8373 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8374 GFP_NOFS);
8375 if (!cache->free_space_ctl) {
8376 kfree(cache);
8377 return NULL;
8378 }
8379
8380 cache->key.objectid = start;
8381 cache->key.offset = size;
8382 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8383
8384 cache->sectorsize = root->sectorsize;
8385 cache->fs_info = root->fs_info;
8386 cache->full_stripe_len = btrfs_full_stripe_len(root,
8387 &root->fs_info->mapping_tree,
8388 start);
8389 atomic_set(&cache->count, 1);
8390 spin_lock_init(&cache->lock);
8391 INIT_LIST_HEAD(&cache->list);
8392 INIT_LIST_HEAD(&cache->cluster_list);
8393 INIT_LIST_HEAD(&cache->new_bg_list);
8394 btrfs_init_free_space_ctl(cache);
8395
8396 return cache;
8397}
8398
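[editor's note] The new constructor above collects the two-stage allocation (the cache, then its free_space_ctl) and the field initialization that btrfs_read_block_groups and btrfs_make_block_group previously duplicated; on partial failure it unwinds what it already allocated and returns NULL. A sketch of the shape, with invented stand-in types:

#include <stdlib.h>

/* Stand-ins for the two btrfs allocations; fields are illustrative. */
struct free_space_ctl { unsigned unit; };
struct block_group { unsigned long long start, size; struct free_space_ctl *ctl; };

static struct block_group *create_block_group(unsigned long long start,
					      unsigned long long size)
{
	struct block_group *bg = calloc(1, sizeof(*bg));

	if (!bg)
		return NULL;

	bg->ctl = calloc(1, sizeof(*bg->ctl));
	if (!bg->ctl) {
		free(bg);            /* unwind the first allocation */
		return NULL;
	}

	bg->start = start;
	bg->size = size;
	return bg;
}

int main(void)
{
	struct block_group *bg = create_block_group(0, 1 << 20);

	if (!bg)
		return 1;            /* a single NULL check at each call site */
	free(bg->ctl);
	free(bg);
	return 0;
}

As the later hunks show, the callers' error paths then shrink from freeing two objects by hand to a single btrfs_put_block_group(cache).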
8363int btrfs_read_block_groups(struct btrfs_root *root) 8399int btrfs_read_block_groups(struct btrfs_root *root)
8364{ 8400{
8365 struct btrfs_path *path; 8401 struct btrfs_path *path;
@@ -8395,26 +8431,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8395 break; 8431 break;
8396 if (ret != 0) 8432 if (ret != 0)
8397 goto error; 8433 goto error;
8434
8398 leaf = path->nodes[0]; 8435 leaf = path->nodes[0];
8399 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 8436 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8400 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8437
8438 cache = btrfs_create_block_group_cache(root, found_key.objectid,
8439 found_key.offset);
8401 if (!cache) { 8440 if (!cache) {
8402 ret = -ENOMEM; 8441 ret = -ENOMEM;
8403 goto error; 8442 goto error;
8404 } 8443 }
8405 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8406 GFP_NOFS);
8407 if (!cache->free_space_ctl) {
8408 kfree(cache);
8409 ret = -ENOMEM;
8410 goto error;
8411 }
8412
8413 atomic_set(&cache->count, 1);
8414 spin_lock_init(&cache->lock);
8415 cache->fs_info = info;
8416 INIT_LIST_HEAD(&cache->list);
8417 INIT_LIST_HEAD(&cache->cluster_list);
8418 8444
8419 if (need_clear) { 8445 if (need_clear) {
8420 /* 8446 /*
@@ -8435,16 +8461,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8435 read_extent_buffer(leaf, &cache->item, 8461 read_extent_buffer(leaf, &cache->item,
8436 btrfs_item_ptr_offset(leaf, path->slots[0]), 8462 btrfs_item_ptr_offset(leaf, path->slots[0]),
8437 sizeof(cache->item)); 8463 sizeof(cache->item));
8438 memcpy(&cache->key, &found_key, sizeof(found_key)); 8464 cache->flags = btrfs_block_group_flags(&cache->item);
8439 8465
8440 key.objectid = found_key.objectid + found_key.offset; 8466 key.objectid = found_key.objectid + found_key.offset;
8441 btrfs_release_path(path); 8467 btrfs_release_path(path);
8442 cache->flags = btrfs_block_group_flags(&cache->item);
8443 cache->sectorsize = root->sectorsize;
8444 cache->full_stripe_len = btrfs_full_stripe_len(root,
8445 &root->fs_info->mapping_tree,
8446 found_key.objectid);
8447 btrfs_init_free_space_ctl(cache);
8448 8468
8449 /* 8469 /*
8450 * We need to exclude the super stripes now so that the space 8470 * We need to exclude the super stripes now so that the space
@@ -8458,8 +8478,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8458 * case. 8478 * case.
8459 */ 8479 */
8460 free_excluded_extents(root, cache); 8480 free_excluded_extents(root, cache);
8461 kfree(cache->free_space_ctl); 8481 btrfs_put_block_group(cache);
8462 kfree(cache);
8463 goto error; 8482 goto error;
8464 } 8483 }
8465 8484
@@ -8590,38 +8609,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8590 8609
8591 root->fs_info->last_trans_log_full_commit = trans->transid; 8610 root->fs_info->last_trans_log_full_commit = trans->transid;
8592 8611
8593 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8612 cache = btrfs_create_block_group_cache(root, chunk_offset, size);
8594 if (!cache) 8613 if (!cache)
8595 return -ENOMEM; 8614 return -ENOMEM;
8596 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8597 GFP_NOFS);
8598 if (!cache->free_space_ctl) {
8599 kfree(cache);
8600 return -ENOMEM;
8601 }
8602
8603 cache->key.objectid = chunk_offset;
8604 cache->key.offset = size;
8605 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8606 cache->sectorsize = root->sectorsize;
8607 cache->fs_info = root->fs_info;
8608 cache->full_stripe_len = btrfs_full_stripe_len(root,
8609 &root->fs_info->mapping_tree,
8610 chunk_offset);
8611
8612 atomic_set(&cache->count, 1);
8613 spin_lock_init(&cache->lock);
8614 INIT_LIST_HEAD(&cache->list);
8615 INIT_LIST_HEAD(&cache->cluster_list);
8616 INIT_LIST_HEAD(&cache->new_bg_list);
8617
8618 btrfs_init_free_space_ctl(cache);
8619 8615
8620 btrfs_set_block_group_used(&cache->item, bytes_used); 8616 btrfs_set_block_group_used(&cache->item, bytes_used);
8621 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 8617 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8622 cache->flags = type;
8623 btrfs_set_block_group_flags(&cache->item, type); 8618 btrfs_set_block_group_flags(&cache->item, type);
8624 8619
8620 cache->flags = type;
8625 cache->last_byte_to_unpin = (u64)-1; 8621 cache->last_byte_to_unpin = (u64)-1;
8626 cache->cached = BTRFS_CACHE_FINISHED; 8622 cache->cached = BTRFS_CACHE_FINISHED;
8627 ret = exclude_super_stripes(root, cache); 8623 ret = exclude_super_stripes(root, cache);
@@ -8631,8 +8627,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8631 * case. 8627 * case.
8632 */ 8628 */
8633 free_excluded_extents(root, cache); 8629 free_excluded_extents(root, cache);
8634 kfree(cache->free_space_ctl); 8630 btrfs_put_block_group(cache);
8635 kfree(cache);
8636 return ret; 8631 return ret;
8637 } 8632 }
8638 8633
@@ -8796,8 +8791,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8796 * are still on the list after taking the semaphore 8791 * are still on the list after taking the semaphore
8797 */ 8792 */
8798 list_del_init(&block_group->list); 8793 list_del_init(&block_group->list);
8799 if (list_empty(&block_group->space_info->block_groups[index])) 8794 if (list_empty(&block_group->space_info->block_groups[index])) {
8795 kobject_del(&block_group->space_info->block_group_kobjs[index]);
8796 kobject_put(&block_group->space_info->block_group_kobjs[index]);
8800 clear_avail_alloc_bits(root->fs_info, block_group->flags); 8797 clear_avail_alloc_bits(root->fs_info, block_group->flags);
8798 }
8801 up_write(&block_group->space_info->groups_sem); 8799 up_write(&block_group->space_info->groups_sem);
8802 8800
8803 if (block_group->cached == BTRFS_CACHE_STARTED) 8801 if (block_group->cached == BTRFS_CACHE_STARTED)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ff43802a7c88..85bbd01f1271 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -59,7 +59,7 @@ void btrfs_leak_debug_check(void)
59 59
60 while (!list_empty(&states)) { 60 while (!list_empty(&states)) {
61 state = list_entry(states.next, struct extent_state, leak_list); 61 state = list_entry(states.next, struct extent_state, leak_list);
62 printk(KERN_ERR "btrfs state leak: start %llu end %llu " 62 printk(KERN_ERR "BTRFS: state leak: start %llu end %llu "
63 "state %lu in tree %p refs %d\n", 63 "state %lu in tree %p refs %d\n",
64 state->start, state->end, state->state, state->tree, 64 state->start, state->end, state->state, state->tree,
65 atomic_read(&state->refs)); 65 atomic_read(&state->refs));
@@ -69,7 +69,7 @@ void btrfs_leak_debug_check(void)
69 69
70 while (!list_empty(&buffers)) { 70 while (!list_empty(&buffers)) {
71 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 71 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
72 printk(KERN_ERR "btrfs buffer leak start %llu len %lu " 72 printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
73 "refs %d\n", 73 "refs %d\n",
74 eb->start, eb->len, atomic_read(&eb->refs)); 74 eb->start, eb->len, atomic_read(&eb->refs));
75 list_del(&eb->leak_list); 75 list_del(&eb->leak_list);
@@ -77,16 +77,22 @@ void btrfs_leak_debug_check(void)
77 } 77 }
78} 78}
79 79
80#define btrfs_debug_check_extent_io_range(inode, start, end) \ 80#define btrfs_debug_check_extent_io_range(tree, start, end) \
81 __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end)) 81 __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
82static inline void __btrfs_debug_check_extent_io_range(const char *caller, 82static inline void __btrfs_debug_check_extent_io_range(const char *caller,
83 struct inode *inode, u64 start, u64 end) 83 struct extent_io_tree *tree, u64 start, u64 end)
84{ 84{
85 u64 isize = i_size_read(inode); 85 struct inode *inode;
86 u64 isize;
87
88 if (!tree->mapping)
89 return;
86 90
91 inode = tree->mapping->host;
92 isize = i_size_read(inode);
87 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { 93 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
88 printk_ratelimited(KERN_DEBUG 94 printk_ratelimited(KERN_DEBUG
89 "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n", 95 "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
90 caller, btrfs_ino(inode), isize, start, end); 96 caller, btrfs_ino(inode), isize, start, end);
91 } 97 }
92} 98}
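[editor's note] The debug helper now takes the extent_io_tree itself and bails out when tree->mapping is NULL, since after the companion changes some trees no longer have a backing address_space from which to derive an inode. Guarding the dereference chain at the top is the whole fix; a trivial sketch (4096 stands in for PAGE_SIZE):

#include <stdio.h>

struct inode { unsigned long long isize; };
struct mapping { struct inode *host; };
struct io_tree { struct mapping *mapping; };

static void check_range(const char *caller, struct io_tree *tree,
			unsigned long long start, unsigned long long end)
{
	/* Trees without a backing mapping have no inode to report against. */
	if (!tree->mapping)
		return;

	if (end >= 4096 && (end % 2) == 0 && end != tree->mapping->host->isize - 1)
		fprintf(stderr, "%s: odd range [%llu,%llu]\n", caller, start, end);
}

int main(void)
{
	struct io_tree bare = { .mapping = NULL };

	check_range(__func__, &bare, 0, 8192);   /* safe: returns early */
	return 0;
}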
@@ -124,6 +130,8 @@ static noinline void flush_write_bio(void *data);
124static inline struct btrfs_fs_info * 130static inline struct btrfs_fs_info *
125tree_fs_info(struct extent_io_tree *tree) 131tree_fs_info(struct extent_io_tree *tree)
126{ 132{
133 if (!tree->mapping)
134 return NULL;
127 return btrfs_sb(tree->mapping->host->i_sb); 135 return btrfs_sb(tree->mapping->host->i_sb);
128} 136}
129 137
@@ -186,11 +194,9 @@ void extent_io_tree_init(struct extent_io_tree *tree,
186 struct address_space *mapping) 194 struct address_space *mapping)
187{ 195{
188 tree->state = RB_ROOT; 196 tree->state = RB_ROOT;
189 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
190 tree->ops = NULL; 197 tree->ops = NULL;
191 tree->dirty_bytes = 0; 198 tree->dirty_bytes = 0;
192 spin_lock_init(&tree->lock); 199 spin_lock_init(&tree->lock);
193 spin_lock_init(&tree->buffer_lock);
194 tree->mapping = mapping; 200 tree->mapping = mapping;
195} 201}
196 202
@@ -224,12 +230,20 @@ void free_extent_state(struct extent_state *state)
224} 230}
225 231
226static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 232static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
227 struct rb_node *node) 233 struct rb_node *node,
234 struct rb_node ***p_in,
235 struct rb_node **parent_in)
228{ 236{
229 struct rb_node **p = &root->rb_node; 237 struct rb_node **p = &root->rb_node;
230 struct rb_node *parent = NULL; 238 struct rb_node *parent = NULL;
231 struct tree_entry *entry; 239 struct tree_entry *entry;
232 240
241 if (p_in && parent_in) {
242 p = *p_in;
243 parent = *parent_in;
244 goto do_insert;
245 }
246
233 while (*p) { 247 while (*p) {
234 parent = *p; 248 parent = *p;
235 entry = rb_entry(parent, struct tree_entry, rb_node); 249 entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -242,35 +256,43 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
242 return parent; 256 return parent;
243 } 257 }
244 258
259do_insert:
245 rb_link_node(node, parent, p); 260 rb_link_node(node, parent, p);
246 rb_insert_color(node, root); 261 rb_insert_color(node, root);
247 return NULL; 262 return NULL;
248} 263}
249 264
250static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 265static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
251 struct rb_node **prev_ret, 266 struct rb_node **prev_ret,
252 struct rb_node **next_ret) 267 struct rb_node **next_ret,
268 struct rb_node ***p_ret,
269 struct rb_node **parent_ret)
253{ 270{
254 struct rb_root *root = &tree->state; 271 struct rb_root *root = &tree->state;
255 struct rb_node *n = root->rb_node; 272 struct rb_node **n = &root->rb_node;
256 struct rb_node *prev = NULL; 273 struct rb_node *prev = NULL;
257 struct rb_node *orig_prev = NULL; 274 struct rb_node *orig_prev = NULL;
258 struct tree_entry *entry; 275 struct tree_entry *entry;
259 struct tree_entry *prev_entry = NULL; 276 struct tree_entry *prev_entry = NULL;
260 277
261 while (n) { 278 while (*n) {
262 entry = rb_entry(n, struct tree_entry, rb_node); 279 prev = *n;
263 prev = n; 280 entry = rb_entry(prev, struct tree_entry, rb_node);
264 prev_entry = entry; 281 prev_entry = entry;
265 282
266 if (offset < entry->start) 283 if (offset < entry->start)
267 n = n->rb_left; 284 n = &(*n)->rb_left;
268 else if (offset > entry->end) 285 else if (offset > entry->end)
269 n = n->rb_right; 286 n = &(*n)->rb_right;
270 else 287 else
271 return n; 288 return *n;
272 } 289 }
273 290
291 if (p_ret)
292 *p_ret = n;
293 if (parent_ret)
294 *parent_ret = prev;
295
274 if (prev_ret) { 296 if (prev_ret) {
275 orig_prev = prev; 297 orig_prev = prev;
276 while (prev && offset > prev_entry->end) { 298 while (prev && offset > prev_entry->end) {
@@ -292,18 +314,27 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
292 return NULL; 314 return NULL;
293} 315}
294 316
295static inline struct rb_node *tree_search(struct extent_io_tree *tree, 317static inline struct rb_node *
296 u64 offset) 318tree_search_for_insert(struct extent_io_tree *tree,
319 u64 offset,
320 struct rb_node ***p_ret,
321 struct rb_node **parent_ret)
297{ 322{
298 struct rb_node *prev = NULL; 323 struct rb_node *prev = NULL;
299 struct rb_node *ret; 324 struct rb_node *ret;
300 325
301 ret = __etree_search(tree, offset, &prev, NULL); 326 ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
302 if (!ret) 327 if (!ret)
303 return prev; 328 return prev;
304 return ret; 329 return ret;
305} 330}
306 331
332static inline struct rb_node *tree_search(struct extent_io_tree *tree,
333 u64 offset)
334{
335 return tree_search_for_insert(tree, offset, NULL, NULL);
336}
337
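[editor's note] The reworked search walks the tree through a struct rb_node ** cursor so that, on a miss, it can report via p_ret/parent_ret exactly where the new key would link; insert_state then reuses that slot instead of re-walking, and the two call sites that previously lost the freshly inserted state now also cache_state() it. A self-contained sketch of the search-once/insert-with-hint idiom over a plain binary search tree (not the kernel rbtree API):

#include <stdio.h>
#include <stdlib.h>

struct node { long key; struct node *left, *right; };

/*
 * Search that, on a miss, reports the exact link pointer (&parent->left or
 * &parent->right) where the new node belongs, so insert needn't re-walk.
 */
static struct node *search_for_insert(struct node **root, long key,
				      struct node ***link_ret)
{
	struct node **n = root;

	while (*n) {
		if (key < (*n)->key)
			n = &(*n)->left;
		else if (key > (*n)->key)
			n = &(*n)->right;
		else
			return *n;       /* hit */
	}
	*link_ret = n;                   /* miss: remembered insertion slot */
	return NULL;
}

static struct node *insert_at(struct node **link, long key)
{
	struct node *nn = calloc(1, sizeof(*nn));

	nn->key = key;
	*link = nn;                      /* O(1): reuse the cached slot */
	return nn;
}

int main(void)
{
	struct node *root = NULL, **link;

	if (!search_for_insert(&root, 42, &link))
		insert_at(link, 42);     /* one traversal total */
	if (search_for_insert(&root, 42, &link))
		printf("found 42\n");
	return 0;
}

A balanced rbtree additionally needs the parent node to recolor after linking, which is why the patch threads parent_ret alongside the link pointer.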
307static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 338static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
308 struct extent_state *other) 339 struct extent_state *other)
309{ 340{
@@ -385,23 +416,25 @@ static void set_state_bits(struct extent_io_tree *tree,
385 */ 416 */
386static int insert_state(struct extent_io_tree *tree, 417static int insert_state(struct extent_io_tree *tree,
387 struct extent_state *state, u64 start, u64 end, 418 struct extent_state *state, u64 start, u64 end,
419 struct rb_node ***p,
420 struct rb_node **parent,
388 unsigned long *bits) 421 unsigned long *bits)
389{ 422{
390 struct rb_node *node; 423 struct rb_node *node;
391 424
392 if (end < start) 425 if (end < start)
393 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", 426 WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
394 end, start); 427 end, start);
395 state->start = start; 428 state->start = start;
396 state->end = end; 429 state->end = end;
397 430
398 set_state_bits(tree, state, bits); 431 set_state_bits(tree, state, bits);
399 432
400 node = tree_insert(&tree->state, end, &state->rb_node); 433 node = tree_insert(&tree->state, end, &state->rb_node, p, parent);
401 if (node) { 434 if (node) {
402 struct extent_state *found; 435 struct extent_state *found;
403 found = rb_entry(node, struct extent_state, rb_node); 436 found = rb_entry(node, struct extent_state, rb_node);
404 printk(KERN_ERR "btrfs found node %llu %llu on insert of " 437 printk(KERN_ERR "BTRFS: found node %llu %llu on insert of "
405 "%llu %llu\n", 438 "%llu %llu\n",
406 found->start, found->end, start, end); 439 found->start, found->end, start, end);
407 return -EEXIST; 440 return -EEXIST;
@@ -444,7 +477,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
444 prealloc->state = orig->state; 477 prealloc->state = orig->state;
445 orig->start = split; 478 orig->start = split;
446 479
447 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 480 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node,
481 NULL, NULL);
448 if (node) { 482 if (node) {
449 free_extent_state(prealloc); 483 free_extent_state(prealloc);
450 return -EEXIST; 484 return -EEXIST;
@@ -542,7 +576,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
542 int err; 576 int err;
543 int clear = 0; 577 int clear = 0;
544 578
545 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); 579 btrfs_debug_check_extent_io_range(tree, start, end);
546 580
547 if (bits & EXTENT_DELALLOC) 581 if (bits & EXTENT_DELALLOC)
548 bits |= EXTENT_NORESERVE; 582 bits |= EXTENT_NORESERVE;
@@ -702,7 +736,7 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
702 struct extent_state *state; 736 struct extent_state *state;
703 struct rb_node *node; 737 struct rb_node *node;
704 738
705 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); 739 btrfs_debug_check_extent_io_range(tree, start, end);
706 740
707 spin_lock(&tree->lock); 741 spin_lock(&tree->lock);
708again: 742again:
@@ -783,11 +817,13 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
783 struct extent_state *state; 817 struct extent_state *state;
784 struct extent_state *prealloc = NULL; 818 struct extent_state *prealloc = NULL;
785 struct rb_node *node; 819 struct rb_node *node;
820 struct rb_node **p;
821 struct rb_node *parent;
786 int err = 0; 822 int err = 0;
787 u64 last_start; 823 u64 last_start;
788 u64 last_end; 824 u64 last_end;
789 825
790 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); 826 btrfs_debug_check_extent_io_range(tree, start, end);
791 827
792 bits |= EXTENT_FIRST_DELALLOC; 828 bits |= EXTENT_FIRST_DELALLOC;
793again: 829again:
@@ -809,14 +845,16 @@ again:
809 * this search will find all the extents that end after 845 * this search will find all the extents that end after
810 * our range starts. 846 * our range starts.
811 */ 847 */
812 node = tree_search(tree, start); 848 node = tree_search_for_insert(tree, start, &p, &parent);
813 if (!node) { 849 if (!node) {
814 prealloc = alloc_extent_state_atomic(prealloc); 850 prealloc = alloc_extent_state_atomic(prealloc);
815 BUG_ON(!prealloc); 851 BUG_ON(!prealloc);
816 err = insert_state(tree, prealloc, start, end, &bits); 852 err = insert_state(tree, prealloc, start, end,
853 &p, &parent, &bits);
817 if (err) 854 if (err)
818 extent_io_tree_panic(tree, err); 855 extent_io_tree_panic(tree, err);
819 856
857 cache_state(prealloc, cached_state);
820 prealloc = NULL; 858 prealloc = NULL;
821 goto out; 859 goto out;
822 } 860 }
@@ -919,7 +957,7 @@ hit_next:
919 * the later extent. 957 * the later extent.
920 */ 958 */
921 err = insert_state(tree, prealloc, start, this_end, 959 err = insert_state(tree, prealloc, start, this_end,
922 &bits); 960 NULL, NULL, &bits);
923 if (err) 961 if (err)
924 extent_io_tree_panic(tree, err); 962 extent_io_tree_panic(tree, err);
925 963
@@ -1005,11 +1043,13 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1005 struct extent_state *state; 1043 struct extent_state *state;
1006 struct extent_state *prealloc = NULL; 1044 struct extent_state *prealloc = NULL;
1007 struct rb_node *node; 1045 struct rb_node *node;
1046 struct rb_node **p;
1047 struct rb_node *parent;
1008 int err = 0; 1048 int err = 0;
1009 u64 last_start; 1049 u64 last_start;
1010 u64 last_end; 1050 u64 last_end;
1011 1051
1012 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); 1052 btrfs_debug_check_extent_io_range(tree, start, end);
1013 1053
1014again: 1054again:
1015 if (!prealloc && (mask & __GFP_WAIT)) { 1055 if (!prealloc && (mask & __GFP_WAIT)) {
@@ -1032,17 +1072,19 @@ again:
1032 * this search will find all the extents that end after 1072 * this search will find all the extents that end after
1033 * our range starts. 1073 * our range starts.
1034 */ 1074 */
1035 node = tree_search(tree, start); 1075 node = tree_search_for_insert(tree, start, &p, &parent);
1036 if (!node) { 1076 if (!node) {
1037 prealloc = alloc_extent_state_atomic(prealloc); 1077 prealloc = alloc_extent_state_atomic(prealloc);
1038 if (!prealloc) { 1078 if (!prealloc) {
1039 err = -ENOMEM; 1079 err = -ENOMEM;
1040 goto out; 1080 goto out;
1041 } 1081 }
1042 err = insert_state(tree, prealloc, start, end, &bits); 1082 err = insert_state(tree, prealloc, start, end,
1043 prealloc = NULL; 1083 &p, &parent, &bits);
1044 if (err) 1084 if (err)
1045 extent_io_tree_panic(tree, err); 1085 extent_io_tree_panic(tree, err);
1086 cache_state(prealloc, cached_state);
1087 prealloc = NULL;
1046 goto out; 1088 goto out;
1047 } 1089 }
1048 state = rb_entry(node, struct extent_state, rb_node); 1090 state = rb_entry(node, struct extent_state, rb_node);
@@ -1135,7 +1177,7 @@ hit_next:
1135 * the later extent. 1177 * the later extent.
1136 */ 1178 */
1137 err = insert_state(tree, prealloc, start, this_end, 1179 err = insert_state(tree, prealloc, start, this_end,
1138 &bits); 1180 NULL, NULL, &bits);
1139 if (err) 1181 if (err)
1140 extent_io_tree_panic(tree, err); 1182 extent_io_tree_panic(tree, err);
1141 cache_state(prealloc, cached_state); 1183 cache_state(prealloc, cached_state);
@@ -1984,7 +2026,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1984 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2026 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1985 if (!bio) 2027 if (!bio)
1986 return -EIO; 2028 return -EIO;
1987 bio->bi_size = 0; 2029 bio->bi_iter.bi_size = 0;
1988 map_length = length; 2030 map_length = length;
1989 2031
1990 ret = btrfs_map_block(fs_info, WRITE, logical, 2032 ret = btrfs_map_block(fs_info, WRITE, logical,
@@ -1995,7 +2037,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1995 } 2037 }
1996 BUG_ON(mirror_num != bbio->mirror_num); 2038 BUG_ON(mirror_num != bbio->mirror_num);
1997 sector = bbio->stripes[mirror_num-1].physical >> 9; 2039 sector = bbio->stripes[mirror_num-1].physical >> 9;
1998 bio->bi_sector = sector; 2040 bio->bi_iter.bi_sector = sector;
1999 dev = bbio->stripes[mirror_num-1].dev; 2041 dev = bbio->stripes[mirror_num-1].dev;
2000 kfree(bbio); 2042 kfree(bbio);
2001 if (!dev || !dev->bdev || !dev->writeable) { 2043 if (!dev || !dev->bdev || !dev->writeable) {
@@ -2012,9 +2054,10 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
2012 return -EIO; 2054 return -EIO;
2013 } 2055 }
2014 2056
2015 printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " 2057 printk_ratelimited_in_rcu(KERN_INFO
2016 "(dev %s sector %llu)\n", page->mapping->host->i_ino, 2058 "BTRFS: read error corrected: ino %lu off %llu "
2017 start, rcu_str_deref(dev->name), sector); 2059 "(dev %s sector %llu)\n", page->mapping->host->i_ino,
2060 start, rcu_str_deref(dev->name), sector);
2018 2061
2019 bio_put(bio); 2062 bio_put(bio);
2020 return 0; 2063 return 0;
@@ -2156,7 +2199,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2156 return -EIO; 2199 return -EIO;
2157 } 2200 }
2158 2201
2159 if (em->start > start || em->start + em->len < start) { 2202 if (em->start > start || em->start + em->len <= start) {
2160 free_extent_map(em); 2203 free_extent_map(em);
2161 em = NULL; 2204 em = NULL;
2162 } 2205 }
@@ -2268,9 +2311,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2268 return -EIO; 2311 return -EIO;
2269 } 2312 }
2270 bio->bi_end_io = failed_bio->bi_end_io; 2313 bio->bi_end_io = failed_bio->bi_end_io;
2271 bio->bi_sector = failrec->logical >> 9; 2314 bio->bi_iter.bi_sector = failrec->logical >> 9;
2272 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 2315 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2273 bio->bi_size = 0; 2316 bio->bi_iter.bi_size = 0;
2274 2317
2275 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2318 btrfs_failed_bio = btrfs_io_bio(failed_bio);
2276 if (btrfs_failed_bio->csum) { 2319 if (btrfs_failed_bio->csum) {
@@ -2332,37 +2375,39 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2332 */ 2375 */
2333static void end_bio_extent_writepage(struct bio *bio, int err) 2376static void end_bio_extent_writepage(struct bio *bio, int err)
2334{ 2377{
2335 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2378 struct bio_vec *bvec;
2336 struct extent_io_tree *tree;
2337 u64 start; 2379 u64 start;
2338 u64 end; 2380 u64 end;
2381 int i;
2339 2382
2340 do { 2383 bio_for_each_segment_all(bvec, bio, i) {
2341 struct page *page = bvec->bv_page; 2384 struct page *page = bvec->bv_page;
2342 tree = &BTRFS_I(page->mapping->host)->io_tree;
2343 2385
2344 /* We always issue full-page reads, but if some block 2386 /* We always issue full-page reads, but if some block
2345 * in a page fails to read, blk_update_request() will 2387 * in a page fails to read, blk_update_request() will
2346 * advance bv_offset and adjust bv_len to compensate. 2388 * advance bv_offset and adjust bv_len to compensate.
2347 * Print a warning for nonzero offsets, and an error 2389 * Print a warning for nonzero offsets, and an error
2348 * if they don't add up to a full page. */ 2390 * if they don't add up to a full page. */
2349 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) 2391 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
2350 printk("%s page write in btrfs with offset %u and length %u\n", 2392 if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
2351 bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE 2393 btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
2352 ? KERN_ERR "partial" : KERN_INFO "incomplete", 2394 "partial page write in btrfs with offset %u and length %u",
2353 bvec->bv_offset, bvec->bv_len); 2395 bvec->bv_offset, bvec->bv_len);
2396 else
2397 btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
2398 "incomplete page write in btrfs with offset %u and "
2399 "length %u",
2400 bvec->bv_offset, bvec->bv_len);
2401 }
2354 2402
2355 start = page_offset(page); 2403 start = page_offset(page);
2356 end = start + bvec->bv_offset + bvec->bv_len - 1; 2404 end = start + bvec->bv_offset + bvec->bv_len - 1;
2357 2405
2358 if (--bvec >= bio->bi_io_vec)
2359 prefetchw(&bvec->bv_page->flags);
2360
2361 if (end_extent_writepage(page, err, start, end)) 2406 if (end_extent_writepage(page, err, start, end))
2362 continue; 2407 continue;
2363 2408
2364 end_page_writeback(page); 2409 end_page_writeback(page);
2365 } while (bvec >= bio->bi_io_vec); 2410 }
2366 2411
2367 bio_put(bio); 2412 bio_put(bio);
2368} 2413}
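[editor's note] Both endio handlers drop the hand-rolled backwards walk over bi_io_vec (with its prefetchw and the bvec >= bio->bi_io_vec bound) in favor of bio_for_each_segment_all(), and the sector/size fields move under bio->bi_iter, in line with the block layer's immutable-biovec rework in this release. A userspace sketch of trading manual cursor arithmetic for a for-each macro; the types here are mine:

#include <stdio.h>

struct segment { unsigned offset, len; };
struct io { struct segment segs[4]; int cnt; };

/* The macro hides the cursor bookkeeping the old open-coded loop carried. */
#define io_for_each_segment(seg, io, i) \
	for ((i) = 0; (i) < (io)->cnt && ((seg) = &(io)->segs[i], 1); (i)++)

int main(void)
{
	struct io io = { .segs = { {0, 4096}, {0, 2048} }, .cnt = 2 };
	struct segment *seg;
	int i;

	io_for_each_segment(seg, &io, i)
		printf("seg %d: offset=%u len=%u\n", i, seg->offset, seg->len);
	return 0;
}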
@@ -2392,9 +2437,8 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
2392 */ 2437 */
2393static void end_bio_extent_readpage(struct bio *bio, int err) 2438static void end_bio_extent_readpage(struct bio *bio, int err)
2394{ 2439{
2440 struct bio_vec *bvec;
2395 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 2441 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2396 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2397 struct bio_vec *bvec = bio->bi_io_vec;
2398 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2442 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2399 struct extent_io_tree *tree; 2443 struct extent_io_tree *tree;
2400 u64 offset = 0; 2444 u64 offset = 0;
@@ -2405,16 +2449,17 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2405 u64 extent_len = 0; 2449 u64 extent_len = 0;
2406 int mirror; 2450 int mirror;
2407 int ret; 2451 int ret;
2452 int i;
2408 2453
2409 if (err) 2454 if (err)
2410 uptodate = 0; 2455 uptodate = 0;
2411 2456
2412 do { 2457 bio_for_each_segment_all(bvec, bio, i) {
2413 struct page *page = bvec->bv_page; 2458 struct page *page = bvec->bv_page;
2414 struct inode *inode = page->mapping->host; 2459 struct inode *inode = page->mapping->host;
2415 2460
2416 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2461 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2417 "mirror=%lu\n", (u64)bio->bi_sector, err, 2462 "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err,
2418 io_bio->mirror_num); 2463 io_bio->mirror_num);
2419 tree = &BTRFS_I(inode)->io_tree; 2464 tree = &BTRFS_I(inode)->io_tree;
2420 2465
@@ -2423,19 +2468,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2423 * advance bv_offset and adjust bv_len to compensate. 2468 * advance bv_offset and adjust bv_len to compensate.
2424 * Print a warning for nonzero offsets, and an error 2469 * Print a warning for nonzero offsets, and an error
2425 * if they don't add up to a full page. */ 2470 * if they don't add up to a full page. */
2426 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) 2471 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
2427 printk("%s page read in btrfs with offset %u and length %u\n", 2472 if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
2428 bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE 2473 btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
2429 ? KERN_ERR "partial" : KERN_INFO "incomplete", 2474 "partial page read in btrfs with offset %u and length %u",
2430 bvec->bv_offset, bvec->bv_len); 2475 bvec->bv_offset, bvec->bv_len);
2476 else
2477 btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
2478 "incomplete page read in btrfs with offset %u and "
2479 "length %u",
2480 bvec->bv_offset, bvec->bv_len);
2481 }
2431 2482
2432 start = page_offset(page); 2483 start = page_offset(page);
2433 end = start + bvec->bv_offset + bvec->bv_len - 1; 2484 end = start + bvec->bv_offset + bvec->bv_len - 1;
2434 len = bvec->bv_len; 2485 len = bvec->bv_len;
2435 2486
2436 if (++bvec <= bvec_end)
2437 prefetchw(&bvec->bv_page->flags);
2438
2439 mirror = io_bio->mirror_num; 2487 mirror = io_bio->mirror_num;
2440 if (likely(uptodate && tree->ops && 2488 if (likely(uptodate && tree->ops &&
2441 tree->ops->readpage_end_io_hook)) { 2489 tree->ops->readpage_end_io_hook)) {
@@ -2516,7 +2564,7 @@ readpage_ok:
2516 extent_start = start; 2564 extent_start = start;
2517 extent_len = end + 1 - start; 2565 extent_len = end + 1 - start;
2518 } 2566 }
2519 } while (bvec <= bvec_end); 2567 }
2520 2568
2521 if (extent_len) 2569 if (extent_len)
2522 endio_readpage_release_extent(tree, extent_start, extent_len, 2570 endio_readpage_release_extent(tree, extent_start, extent_len,
@@ -2547,9 +2595,8 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2547 } 2595 }
2548 2596
2549 if (bio) { 2597 if (bio) {
2550 bio->bi_size = 0;
2551 bio->bi_bdev = bdev; 2598 bio->bi_bdev = bdev;
2552 bio->bi_sector = first_sector; 2599 bio->bi_iter.bi_sector = first_sector;
2553 btrfs_bio = btrfs_io_bio(bio); 2600 btrfs_bio = btrfs_io_bio(bio);
2554 btrfs_bio->csum = NULL; 2601 btrfs_bio->csum = NULL;
2555 btrfs_bio->csum_allocated = NULL; 2602 btrfs_bio->csum_allocated = NULL;
@@ -2643,7 +2690,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
2643 if (bio_ret && *bio_ret) { 2690 if (bio_ret && *bio_ret) {
2644 bio = *bio_ret; 2691 bio = *bio_ret;
2645 if (old_compressed) 2692 if (old_compressed)
2646 contig = bio->bi_sector == sector; 2693 contig = bio->bi_iter.bi_sector == sector;
2647 else 2694 else
2648 contig = bio_end_sector(bio) == sector; 2695 contig = bio_end_sector(bio) == sector;
2649 2696
@@ -3287,8 +3334,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3287 3334
3288 set_range_writeback(tree, cur, cur + iosize - 1); 3335 set_range_writeback(tree, cur, cur + iosize - 1);
3289 if (!PageWriteback(page)) { 3336 if (!PageWriteback(page)) {
3290 printk(KERN_ERR "btrfs warning page %lu not " 3337 btrfs_err(BTRFS_I(inode)->root->fs_info,
3291 "writeback, cur %llu end %llu\n", 3338 "page %lu not writeback, cur %llu end %llu",
3292 page->index, cur, end); 3339 page->index, cur, end);
3293 } 3340 }
3294 3341
@@ -3410,20 +3457,18 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
3410 3457
3411static void end_bio_extent_buffer_writepage(struct bio *bio, int err) 3458static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3412{ 3459{
3413 int uptodate = err == 0; 3460 struct bio_vec *bvec;
3414 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
3415 struct extent_buffer *eb; 3461 struct extent_buffer *eb;
3416 int done; 3462 int i, done;
3417 3463
3418 do { 3464 bio_for_each_segment_all(bvec, bio, i) {
3419 struct page *page = bvec->bv_page; 3465 struct page *page = bvec->bv_page;
3420 3466
3421 bvec--;
3422 eb = (struct extent_buffer *)page->private; 3467 eb = (struct extent_buffer *)page->private;
3423 BUG_ON(!eb); 3468 BUG_ON(!eb);
3424 done = atomic_dec_and_test(&eb->io_pages); 3469 done = atomic_dec_and_test(&eb->io_pages);
3425 3470
3426 if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 3471 if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
3427 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3472 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3428 ClearPageUptodate(page); 3473 ClearPageUptodate(page);
3429 SetPageError(page); 3474 SetPageError(page);
@@ -3435,10 +3480,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3435 continue; 3480 continue;
3436 3481
3437 end_extent_buffer_writeback(eb); 3482 end_extent_buffer_writeback(eb);
3438 } while (bvec >= bio->bi_io_vec); 3483 }
3439 3484
3440 bio_put(bio); 3485 bio_put(bio);
3441
3442} 3486}
3443 3487
3444static int write_one_eb(struct extent_buffer *eb, 3488static int write_one_eb(struct extent_buffer *eb,
@@ -3447,6 +3491,7 @@ static int write_one_eb(struct extent_buffer *eb,
3447 struct extent_page_data *epd) 3491 struct extent_page_data *epd)
3448{ 3492{
3449 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3493 struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3494 struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
3450 u64 offset = eb->start; 3495 u64 offset = eb->start;
3451 unsigned long i, num_pages; 3496 unsigned long i, num_pages;
3452 unsigned long bio_flags = 0; 3497 unsigned long bio_flags = 0;
@@ -3464,7 +3509,7 @@ static int write_one_eb(struct extent_buffer *eb,
3464 3509
3465 clear_page_dirty_for_io(p); 3510 clear_page_dirty_for_io(p);
3466 set_page_writeback(p); 3511 set_page_writeback(p);
3467 ret = submit_extent_page(rw, eb->tree, p, offset >> 9, 3512 ret = submit_extent_page(rw, tree, p, offset >> 9,
3468 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3513 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3469 -1, end_bio_extent_buffer_writepage, 3514 -1, end_bio_extent_buffer_writepage,
3470 0, epd->bio_flags, bio_flags); 3515 0, epd->bio_flags, bio_flags);
@@ -4082,12 +4127,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4082 struct extent_map *em = NULL; 4127 struct extent_map *em = NULL;
4083 struct extent_state *cached_state = NULL; 4128 struct extent_state *cached_state = NULL;
4084 struct btrfs_path *path; 4129 struct btrfs_path *path;
4085 struct btrfs_file_extent_item *item;
4086 int end = 0; 4130 int end = 0;
4087 u64 em_start = 0; 4131 u64 em_start = 0;
4088 u64 em_len = 0; 4132 u64 em_len = 0;
4089 u64 em_end = 0; 4133 u64 em_end = 0;
4090 unsigned long emflags;
4091 4134
4092 if (len == 0) 4135 if (len == 0)
4093 return -EINVAL; 4136 return -EINVAL;
@@ -4112,8 +4155,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4112 } 4155 }
4113 WARN_ON(!ret); 4156 WARN_ON(!ret);
4114 path->slots[0]--; 4157 path->slots[0]--;
4115 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4116 struct btrfs_file_extent_item);
4117 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 4158 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
4118 found_type = btrfs_key_type(&found_key); 4159 found_type = btrfs_key_type(&found_key);
4119 4160
@@ -4181,7 +4222,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4181 offset_in_extent = em_start - em->start; 4222 offset_in_extent = em_start - em->start;
4182 em_end = extent_map_end(em); 4223 em_end = extent_map_end(em);
4183 em_len = em_end - em_start; 4224 em_len = em_end - em_start;
4184 emflags = em->flags;
4185 disko = 0; 4225 disko = 0;
4186 flags = 0; 4226 flags = 0;
4187 4227
@@ -4333,10 +4373,9 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4333 __free_extent_buffer(eb); 4373 __free_extent_buffer(eb);
4334} 4374}
4335 4375
4336static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 4376static struct extent_buffer *
4337 u64 start, 4377__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
4338 unsigned long len, 4378 unsigned long len, gfp_t mask)
4339 gfp_t mask)
4340{ 4379{
4341 struct extent_buffer *eb = NULL; 4380 struct extent_buffer *eb = NULL;
4342 4381
@@ -4345,7 +4384,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
4345 return NULL; 4384 return NULL;
4346 eb->start = start; 4385 eb->start = start;
4347 eb->len = len; 4386 eb->len = len;
4348 eb->tree = tree; 4387 eb->fs_info = fs_info;
4349 eb->bflags = 0; 4388 eb->bflags = 0;
4350 rwlock_init(&eb->lock); 4389 rwlock_init(&eb->lock);
4351 atomic_set(&eb->write_locks, 0); 4390 atomic_set(&eb->write_locks, 0);
@@ -4477,13 +4516,14 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
4477 } 4516 }
4478} 4517}
4479 4518
4480struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 4519struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4481 u64 start) 4520 u64 start)
4482{ 4521{
4483 struct extent_buffer *eb; 4522 struct extent_buffer *eb;
4484 4523
4485 rcu_read_lock(); 4524 rcu_read_lock();
4486 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 4525 eb = radix_tree_lookup(&fs_info->buffer_radix,
4526 start >> PAGE_CACHE_SHIFT);
4487 if (eb && atomic_inc_not_zero(&eb->refs)) { 4527 if (eb && atomic_inc_not_zero(&eb->refs)) {
4488 rcu_read_unlock(); 4528 rcu_read_unlock();
4489 mark_extent_buffer_accessed(eb); 4529 mark_extent_buffer_accessed(eb);
@@ -4494,7 +4534,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
4494 return NULL; 4534 return NULL;
4495} 4535}
4496 4536
4497struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 4537struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4498 u64 start, unsigned long len) 4538 u64 start, unsigned long len)
4499{ 4539{
4500 unsigned long num_pages = num_extent_pages(start, len); 4540 unsigned long num_pages = num_extent_pages(start, len);
@@ -4503,16 +4543,15 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4503 struct extent_buffer *eb; 4543 struct extent_buffer *eb;
4504 struct extent_buffer *exists = NULL; 4544 struct extent_buffer *exists = NULL;
4505 struct page *p; 4545 struct page *p;
4506 struct address_space *mapping = tree->mapping; 4546 struct address_space *mapping = fs_info->btree_inode->i_mapping;
4507 int uptodate = 1; 4547 int uptodate = 1;
4508 int ret; 4548 int ret;
4509 4549
4510 4550 eb = find_extent_buffer(fs_info, start);
4511 eb = find_extent_buffer(tree, start);
4512 if (eb) 4551 if (eb)
4513 return eb; 4552 return eb;
4514 4553
4515 eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); 4554 eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS);
4516 if (!eb) 4555 if (!eb)
4517 return NULL; 4556 return NULL;
4518 4557
@@ -4567,12 +4606,13 @@ again:
4567 if (ret) 4606 if (ret)
4568 goto free_eb; 4607 goto free_eb;
4569 4608
4570 spin_lock(&tree->buffer_lock); 4609 spin_lock(&fs_info->buffer_lock);
4571 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); 4610 ret = radix_tree_insert(&fs_info->buffer_radix,
4572 spin_unlock(&tree->buffer_lock); 4611 start >> PAGE_CACHE_SHIFT, eb);
4612 spin_unlock(&fs_info->buffer_lock);
4573 radix_tree_preload_end(); 4613 radix_tree_preload_end();
4574 if (ret == -EEXIST) { 4614 if (ret == -EEXIST) {
4575 exists = find_extent_buffer(tree, start); 4615 exists = find_extent_buffer(fs_info, start);
4576 if (exists) 4616 if (exists)
4577 goto free_eb; 4617 goto free_eb;
4578 else 4618 else
@@ -4580,6 +4620,7 @@ again:
4580 } 4620 }
4581 /* add one reference for the tree */ 4621 /* add one reference for the tree */
4582 check_buffer_tree_ref(eb); 4622 check_buffer_tree_ref(eb);
4623 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
4583 4624
4584 /* 4625 /*
4585 * there is a race where release page may have 4626 * there is a race where release page may have
@@ -4623,17 +4664,17 @@ static int release_extent_buffer(struct extent_buffer *eb)
4623{ 4664{
4624 WARN_ON(atomic_read(&eb->refs) == 0); 4665 WARN_ON(atomic_read(&eb->refs) == 0);
4625 if (atomic_dec_and_test(&eb->refs)) { 4666 if (atomic_dec_and_test(&eb->refs)) {
4626 if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { 4667 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
4627 spin_unlock(&eb->refs_lock); 4668 struct btrfs_fs_info *fs_info = eb->fs_info;
4628 } else {
4629 struct extent_io_tree *tree = eb->tree;
4630 4669
4631 spin_unlock(&eb->refs_lock); 4670 spin_unlock(&eb->refs_lock);
4632 4671
4633 spin_lock(&tree->buffer_lock); 4672 spin_lock(&fs_info->buffer_lock);
4634 radix_tree_delete(&tree->buffer, 4673 radix_tree_delete(&fs_info->buffer_radix,
4635 eb->start >> PAGE_CACHE_SHIFT); 4674 eb->start >> PAGE_CACHE_SHIFT);
4636 spin_unlock(&tree->buffer_lock); 4675 spin_unlock(&fs_info->buffer_lock);
4676 } else {
4677 spin_unlock(&eb->refs_lock);
4637 } 4678 }
4638 4679
4639 /* Should be safe to release our pages at this point */ 4680 /* Should be safe to release our pages at this point */
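[editor's note] The extent-buffer cache moves from a per-extent_io_tree radix tree to a single fs_info->buffer_radix, lookups take fs_info, and the new EXTENT_BUFFER_IN_TREE bit (replacing the inverted DUMMY test) records whether release must also delete the radix entry. The core pattern is a refcounted registry where lookup only succeeds by winning an atomic_inc_not_zero on the refcount. A hedged sketch with a tiny fixed-size table standing in for the radix tree, and plain integers standing in for the atomics:

#include <stdio.h>
#include <stdlib.h>

/* Toy registry entry; the real code uses a radix tree with RCU lookups. */
struct buffer { unsigned long start; int refs; int in_tree; };

#define SLOTS 8
static struct buffer *table[SLOTS];

static struct buffer *lookup(unsigned long start)
{
	struct buffer *b = table[start % SLOTS];

	/* Only hand out a reference if the buffer is not already dying. */
	if (b && b->start == start && b->refs > 0) {
		b->refs++;
		return b;
	}
	return NULL;
}

static void release(struct buffer *b)
{
	if (--b->refs == 0) {
		if (b->in_tree)              /* mirrors EXTENT_BUFFER_IN_TREE */
			table[b->start % SLOTS] = NULL;
		free(b);
	}
}

int main(void)
{
	struct buffer *b = calloc(1, sizeof(*b));

	b->start = 3; b->refs = 1; b->in_tree = 1;
	table[3 % SLOTS] = b;

	struct buffer *hit = lookup(3);
	printf("hit refs=%d\n", hit->refs);      /* 2 */
	release(hit);
	release(b);                              /* last ref: unlink + free */
	return 0;
}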
@@ -5112,12 +5153,12 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5112 unsigned long src_i; 5153 unsigned long src_i;
5113 5154
5114 if (src_offset + len > dst->len) { 5155 if (src_offset + len > dst->len) {
5115 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 5156 printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
5116 "len %lu dst len %lu\n", src_offset, len, dst->len); 5157 "len %lu dst len %lu\n", src_offset, len, dst->len);
5117 BUG_ON(1); 5158 BUG_ON(1);
5118 } 5159 }
5119 if (dst_offset + len > dst->len) { 5160 if (dst_offset + len > dst->len) {
5120 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 5161 printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
5121 "len %lu dst len %lu\n", dst_offset, len, dst->len); 5162 "len %lu dst len %lu\n", dst_offset, len, dst->len);
5122 BUG_ON(1); 5163 BUG_ON(1);
5123 } 5164 }
@@ -5159,12 +5200,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5159 unsigned long src_i; 5200 unsigned long src_i;
5160 5201
5161 if (src_offset + len > dst->len) { 5202 if (src_offset + len > dst->len) {
5162 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 5203 printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
5163 "len %lu len %lu\n", src_offset, len, dst->len); 5204 "len %lu len %lu\n", src_offset, len, dst->len);
5164 BUG_ON(1); 5205 BUG_ON(1);
5165 } 5206 }
5166 if (dst_offset + len > dst->len) { 5207 if (dst_offset + len > dst->len) {
5167 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 5208 printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
5168 "len %lu len %lu\n", dst_offset, len, dst->len); 5209 "len %lu len %lu\n", dst_offset, len, dst->len);
5169 BUG_ON(1); 5210 BUG_ON(1);
5170 } 5211 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 19620c58f096..58b27e5ab521 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -43,6 +43,7 @@
43#define EXTENT_BUFFER_WRITEBACK 7 43#define EXTENT_BUFFER_WRITEBACK 7
44#define EXTENT_BUFFER_IOERR 8 44#define EXTENT_BUFFER_IOERR 8
45#define EXTENT_BUFFER_DUMMY 9 45#define EXTENT_BUFFER_DUMMY 9
46#define EXTENT_BUFFER_IN_TREE 10
46 47
47/* these are flags for extent_clear_unlock_delalloc */ 48/* these are flags for extent_clear_unlock_delalloc */
48#define PAGE_UNLOCK (1 << 0) 49#define PAGE_UNLOCK (1 << 0)
@@ -94,12 +95,10 @@ struct extent_io_ops {
94 95
95struct extent_io_tree { 96struct extent_io_tree {
96 struct rb_root state; 97 struct rb_root state;
97 struct radix_tree_root buffer;
98 struct address_space *mapping; 98 struct address_space *mapping;
99 u64 dirty_bytes; 99 u64 dirty_bytes;
100 int track_uptodate; 100 int track_uptodate;
101 spinlock_t lock; 101 spinlock_t lock;
102 spinlock_t buffer_lock;
103 struct extent_io_ops *ops; 102 struct extent_io_ops *ops;
104}; 103};
105 104
@@ -130,7 +129,7 @@ struct extent_buffer {
130 unsigned long map_start; 129 unsigned long map_start;
131 unsigned long map_len; 130 unsigned long map_len;
132 unsigned long bflags; 131 unsigned long bflags;
133 struct extent_io_tree *tree; 132 struct btrfs_fs_info *fs_info;
134 spinlock_t refs_lock; 133 spinlock_t refs_lock;
135 atomic_t refs; 134 atomic_t refs;
136 atomic_t io_pages; 135 atomic_t io_pages;
@@ -266,11 +265,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
266int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); 265int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
267void set_page_extent_mapped(struct page *page); 266void set_page_extent_mapped(struct page *page);
268 267
269struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 268struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
270 u64 start, unsigned long len); 269 u64 start, unsigned long len);
271struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); 270struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
272struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); 271struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
273struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 272struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
274 u64 start); 273 u64 start);
275void free_extent_buffer(struct extent_buffer *eb); 274void free_extent_buffer(struct extent_buffer *eb);
276void free_extent_buffer_stale(struct extent_buffer *eb); 275void free_extent_buffer_stale(struct extent_buffer *eb);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a4a7a1a8da95..996ad56b57db 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -79,12 +79,21 @@ void free_extent_map(struct extent_map *em)
 	}
 }
 
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
-				   struct rb_node *node)
+/* simple helper to do math around the end of an extent, handling wrap */
+static u64 range_end(u64 start, u64 len)
+{
+	if (start + len < start)
+		return (u64)-1;
+	return start + len;
+}
+
+static int tree_insert(struct rb_root *root, struct extent_map *em)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
-	struct extent_map *entry;
+	struct extent_map *entry = NULL;
+	struct rb_node *orig_parent = NULL;
+	u64 end = range_end(em->start, em->len);
 
 	while (*p) {
 		parent = *p;
@@ -92,19 +101,37 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 
 		WARN_ON(!entry->in_tree);
 
-		if (offset < entry->start)
+		if (em->start < entry->start)
 			p = &(*p)->rb_left;
-		else if (offset >= extent_map_end(entry))
+		else if (em->start >= extent_map_end(entry))
 			p = &(*p)->rb_right;
 		else
-			return parent;
+			return -EEXIST;
 	}
 
-	entry = rb_entry(node, struct extent_map, rb_node);
-	entry->in_tree = 1;
-	rb_link_node(node, parent, p);
-	rb_insert_color(node, root);
-	return NULL;
+	orig_parent = parent;
+	while (parent && em->start >= extent_map_end(entry)) {
+		parent = rb_next(parent);
+		entry = rb_entry(parent, struct extent_map, rb_node);
+	}
+	if (parent)
+		if (end > entry->start && em->start < extent_map_end(entry))
+			return -EEXIST;
+
+	parent = orig_parent;
+	entry = rb_entry(parent, struct extent_map, rb_node);
+	while (parent && em->start < entry->start) {
+		parent = rb_prev(parent);
+		entry = rb_entry(parent, struct extent_map, rb_node);
+	}
+	if (parent)
+		if (end > entry->start && em->start < extent_map_end(entry))
+			return -EEXIST;
+
+	em->in_tree = 1;
+	rb_link_node(&em->rb_node, orig_parent, p);
+	rb_insert_color(&em->rb_node, root);
+	return 0;
 }
 
 /*
@@ -228,7 +255,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		merge = rb_entry(rb, struct extent_map, rb_node);
 		if (rb && mergable_maps(em, merge)) {
 			em->len += merge->len;
-			em->block_len += merge->len;
+			em->block_len += merge->block_len;
 			rb_erase(&merge->rb_node, &tree->map);
 			merge->in_tree = 0;
 			em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
@@ -310,20 +337,11 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em, int modified)
 {
 	int ret = 0;
-	struct rb_node *rb;
-	struct extent_map *exist;
 
-	exist = lookup_extent_mapping(tree, em->start, em->len);
-	if (exist) {
-		free_extent_map(exist);
-		ret = -EEXIST;
-		goto out;
-	}
-	rb = tree_insert(&tree->map, em->start, &em->rb_node);
-	if (rb) {
-		ret = -EEXIST;
+	ret = tree_insert(&tree->map, em);
+	if (ret)
 		goto out;
-	}
+
 	atomic_inc(&em->refs);
 
 	em->mod_start = em->start;
@@ -337,14 +355,6 @@ out:
 	return ret;
 }
 
-/* simple helper to do math around the end of an extent, handling wrap */
-static u64 range_end(u64 start, u64 len)
-{
-	if (start + len < start)
-		return (u64)-1;
-	return start + len;
-}
-
 static struct extent_map *
 __lookup_extent_mapping(struct extent_map_tree *tree,
			u64 start, u64 len, int strict)
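Editor's note: tree_insert now performs the overlap check itself and reports -EEXIST, which lets add_extent_mapping drop its separate lookup_extent_mapping pass. The heart of it is the wrap-safe end computation plus a symmetric overlap test against both rb-tree neighbors; a standalone illustration of that predicate (plain C, not kernel code):

	#include <stdint.h>

	/* clamp start + len to the maximum on wrap, as range_end() above does */
	static uint64_t range_end(uint64_t start, uint64_t len)
	{
		if (start + len < start)
			return UINT64_MAX;
		return start + len;
	}

	/* two half-open ranges overlap iff each starts before the other ends */
	static int ranges_overlap(uint64_t s1, uint64_t l1, uint64_t s2, uint64_t l2)
	{
		return range_end(s1, l1) > s2 && s1 < range_end(s2, l2);
	}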
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6f3848860283..127555b29f58 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -182,7 +182,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 	if (!path)
 		return -ENOMEM;
 
-	nblocks = bio->bi_size >> inode->i_sb->s_blocksize_bits;
+	nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
 	if (!dst) {
 		if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
 			btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size,
@@ -201,7 +201,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 		csum = (u8 *)dst;
 	}
 
-	if (bio->bi_size > PAGE_CACHE_SIZE * 8)
+	if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
 		path->reada = 2;
 
 	WARN_ON(bio->bi_vcnt <= 0);
@@ -217,7 +217,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 		path->skip_locking = 1;
 	}
 
-	disk_bytenr = (u64)bio->bi_sector << 9;
+	disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
 	if (dio)
 		offset = logical_offset;
 	while (bio_index < bio->bi_vcnt) {
@@ -246,8 +246,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
						 offset + bvec->bv_len - 1,
						 EXTENT_NODATASUM, GFP_NOFS);
 			} else {
-				printk(KERN_INFO "btrfs no csum found "
-				       "for inode %llu start %llu\n",
+				btrfs_info(BTRFS_I(inode)->root->fs_info,
+					   "no csum found for inode %llu start %llu",
				       btrfs_ino(inode), offset);
 			}
 			item = NULL;
@@ -302,7 +302,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
			      struct btrfs_dio_private *dip, struct bio *bio,
			      u64 offset)
 {
-	int len = (bio->bi_sector << 9) - dip->disk_bytenr;
+	int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr;
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	int ret;
 
@@ -447,11 +447,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 	u64 offset;
 
 	WARN_ON(bio->bi_vcnt <= 0);
-	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
+	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size),
+		       GFP_NOFS);
 	if (!sums)
 		return -ENOMEM;
 
-	sums->len = bio->bi_size;
+	sums->len = bio->bi_iter.bi_size;
 	INIT_LIST_HEAD(&sums->list);
 
 	if (contig)
@@ -461,7 +462,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 
 	ordered = btrfs_lookup_ordered_extent(inode, offset);
 	BUG_ON(!ordered); /* Logic error */
-	sums->bytenr = (u64)bio->bi_sector << 9;
+	sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
 	index = 0;
 
 	while (bio_index < bio->bi_vcnt) {
@@ -476,7 +477,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 			btrfs_add_ordered_sum(inode, ordered, sums);
 			btrfs_put_ordered_extent(ordered);
 
-			bytes_left = bio->bi_size - total_bytes;
+			bytes_left = bio->bi_iter.bi_size - total_bytes;
 
 			sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
				       GFP_NOFS);
@@ -484,7 +485,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 			sums->len = bytes_left;
 			ordered = btrfs_lookup_ordered_extent(inode, offset);
 			BUG_ON(!ordered); /* Logic error */
-			sums->bytenr = ((u64)bio->bi_sector << 9) +
+			sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) +
				       total_bytes;
 			index = 0;
 		}
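Editor's note: every hunk in this file is the same mechanical conversion; with the immutable-biovec work merged for v3.14, a bio's position and remaining size live in the embedded iterator (bio->bi_iter) rather than directly in struct bio. Two hypothetical helpers make the before/after shape explicit (these are illustrative, not part of the patch):

	static inline u64 bio_start_bytes(const struct bio *bio)
	{
		return (u64)bio->bi_iter.bi_sector << 9;	/* was bio->bi_sector */
	}

	static inline u32 bio_byte_count(const struct bio *bio)
	{
		return bio->bi_iter.bi_size;			/* was bio->bi_size */
	}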
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 82d0342763c5..0165b8672f09 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -692,7 +692,10 @@ next:
 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root, struct inode *inode,
			 struct btrfs_path *path, u64 start, u64 end,
-			 u64 *drop_end, int drop_cache)
+			 u64 *drop_end, int drop_cache,
+			 int replace_extent,
+			 u32 extent_item_size,
+			 int *key_inserted)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
@@ -712,6 +715,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int modify_tree = -1;
 	int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
 	int found = 0;
+	int leafs_visited = 0;
 
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -733,6 +737,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			path->slots[0]--;
 		}
 		ret = 0;
+		leafs_visited++;
 next_slot:
 		leaf = path->nodes[0];
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -744,6 +749,7 @@ next_slot:
				ret = 0;
				break;
 			}
+			leafs_visited++;
 			leaf = path->nodes[0];
 			recow = 1;
 		}
@@ -766,7 +772,8 @@ next_slot:
				btrfs_file_extent_num_bytes(leaf, fi);
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			extent_end = key.offset +
-				btrfs_file_extent_inline_len(leaf, fi);
+				btrfs_file_extent_inline_len(leaf,
						     path->slots[0], fi);
 		} else {
 			WARN_ON(1);
 			extent_end = search_start;
@@ -927,14 +934,44 @@ next_slot:
 	}
 
 	if (!ret && del_nr > 0) {
+		/*
+		 * Set path->slots[0] to first slot, so that after the delete
+		 * if items are move off from our leaf to its immediate left or
+		 * right neighbor leafs, we end up with a correct and adjusted
+		 * path->slots[0] for our insertion.
+		 */
+		path->slots[0] = del_slot;
 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 		if (ret)
			btrfs_abort_transaction(trans, root, ret);
+
+		leaf = path->nodes[0];
+		/*
+		 * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that
+		 * is, its contents got pushed to its neighbors), in which case
+		 * it means path->locks[0] == 0
+		 */
+		if (!ret && replace_extent && leafs_visited == 1 &&
+		    path->locks[0] &&
+		    btrfs_leaf_free_space(root, leaf) >=
+		    sizeof(struct btrfs_item) + extent_item_size) {
+
+			key.objectid = ino;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = start;
+			setup_items_for_insert(root, path, &key,
+					       &extent_item_size,
+					       extent_item_size,
+					       sizeof(struct btrfs_item) +
+					       extent_item_size, 1);
+			*key_inserted = 1;
+		}
 	}
 
+	if (!replace_extent || !(*key_inserted))
+		btrfs_release_path(path);
 	if (drop_end)
		*drop_end = found ? min(end, extent_end) : end;
-	btrfs_release_path(path);
 	return ret;
 }
 
@@ -949,7 +986,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
-				   drop_cache);
+				   drop_cache, 0, 0, NULL);
 	btrfs_free_path(path);
 	return ret;
 }
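Editor's note: the three new parameters let one tree search both drop a range and insert the replacement file extent item when the surviving leaf has enough free space; plain callers pass (0, 0, NULL) as in the wrapper above. A hedged sketch of a replacing caller, following the pattern the inode.c hunks further below adopt:

	int extent_inserted = 0;

	ret = __btrfs_drop_extents(trans, root, inode, path, start, end,
				   NULL /* drop_end */, 1 /* drop_cache */,
				   1 /* replace_extent */,
				   sizeof(struct btrfs_file_extent_item),
				   &extent_inserted);
	if (ret)
		goto out;
	if (!extent_inserted) {
		/* more than one leaf was visited or the leaf was full:
		 * fall back to a normal btrfs_insert_empty_item() */
	}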
@@ -1235,29 +1272,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos,
 }
 
 /*
- * this gets pages into the page cache and locks them down, it also properly
- * waits for data=ordered extents to finish before allowing the pages to be
- * modified.
+ * this just gets pages into the page cache and locks them down.
  */
-static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
-				  struct page **pages, size_t num_pages,
-				  loff_t pos, unsigned long first_index,
-				  size_t write_bytes, bool force_uptodate)
+static noinline int prepare_pages(struct inode *inode, struct page **pages,
+				  size_t num_pages, loff_t pos,
+				  size_t write_bytes, bool force_uptodate)
 {
-	struct extent_state *cached_state = NULL;
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
-	struct inode *inode = file_inode(file);
 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 	int err = 0;
-	int faili = 0;
-	u64 start_pos;
-	u64 last_pos;
-
-	start_pos = pos & ~((u64)root->sectorsize - 1);
-	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
+	int faili;
 
-again:
 	for (i = 0; i < num_pages; i++) {
 		pages[i] = find_or_create_page(inode->i_mapping, index + i,
					       mask | __GFP_WRITE);
@@ -1280,57 +1306,85 @@ again:
 		}
 		wait_on_page_writeback(pages[i]);
 	}
-	faili = num_pages - 1;
-	err = 0;
+
+	return 0;
+fail:
+	while (faili >= 0) {
+		unlock_page(pages[faili]);
+		page_cache_release(pages[faili]);
+		faili--;
+	}
+	return err;
+
+}
+
+/*
+ * This function locks the extent and properly waits for data=ordered extents
+ * to finish before allowing the pages to be modified if need.
+ *
+ * The return value:
+ * 1 - the extent is locked
+ * 0 - the extent is not locked, and everything is OK
+ * -EAGAIN - need re-prepare the pages
+ * the other < 0 number - Something wrong happens
+ */
+static noinline int
+lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
+				size_t num_pages, loff_t pos,
+				u64 *lockstart, u64 *lockend,
+				struct extent_state **cached_state)
+{
+	u64 start_pos;
+	u64 last_pos;
+	int i;
+	int ret = 0;
+
+	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
+	last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
+
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
 		lock_extent_bits(&BTRFS_I(inode)->io_tree,
-				 start_pos, last_pos - 1, 0, &cached_state);
-		ordered = btrfs_lookup_first_ordered_extent(inode,
-							    last_pos - 1);
+				 start_pos, last_pos, 0, cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos);
 		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
-		    ordered->file_offset < last_pos) {
+		    ordered->file_offset <= last_pos) {
 			btrfs_put_ordered_extent(ordered);
 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-					     start_pos, last_pos - 1,
-					     &cached_state, GFP_NOFS);
+					     start_pos, last_pos,
+					     cached_state, GFP_NOFS);
 			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
 			}
-			err = btrfs_wait_ordered_range(inode, start_pos,
-						       last_pos - start_pos);
-			if (err)
-				goto fail;
-			goto again;
+			ret = btrfs_wait_ordered_range(inode, start_pos,
						       last_pos - start_pos + 1);
+			if (ret)
+				return ret;
+			else
+				return -EAGAIN;
 		}
 		if (ordered)
			btrfs_put_ordered_extent(ordered);
 
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
-				 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+				 last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
-				 0, 0, &cached_state, GFP_NOFS);
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-				     start_pos, last_pos - 1, &cached_state,
-				     GFP_NOFS);
+				 0, 0, cached_state, GFP_NOFS);
+		*lockstart = start_pos;
+		*lockend = last_pos;
+		ret = 1;
 	}
+
 	for (i = 0; i < num_pages; i++) {
 		if (clear_page_dirty_for_io(pages[i]))
			account_page_redirty(pages[i]);
 		set_page_extent_mapped(pages[i]);
 		WARN_ON(!PageLocked(pages[i]));
 	}
-	return 0;
-fail:
-	while (faili >= 0) {
-		unlock_page(pages[faili]);
-		page_cache_release(pages[faili]);
-		faili--;
-	}
-	return err;
 
+	return ret;
 }
 
 static noinline int check_can_nocow(struct inode *inode, loff_t pos,
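Editor's note: because the ordered-extent wait now lives in its own function, it can no longer "goto again" back into page preparation; it returns -EAGAIN and the buffered-write loop redoes prepare_pages, as the hunks below show. The caller-side skeleton, reduced to its shape:

	again:
		ret = prepare_pages(inode, pages, num_pages, pos, write_bytes,
				    force_page_uptodate);
		if (ret)
			break;
		ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages, pos,
						      &lockstart, &lockend,
						      &cached_state);
		if (ret == -EAGAIN)
			goto again;	/* pages were unlocked and released */
		else if (ret < 0)
			break;
		need_unlock = (ret > 0);	/* 1: extent range is now locked */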
@@ -1381,13 +1435,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page **pages = NULL;
+	struct extent_state *cached_state = NULL;
 	u64 release_bytes = 0;
+	u64 lockstart;
+	u64 lockend;
 	unsigned long first_index;
 	size_t num_written = 0;
 	int nrptrs;
 	int ret = 0;
 	bool only_release_metadata = false;
 	bool force_page_uptodate = false;
+	bool need_unlock;
 
 	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
@@ -1456,18 +1514,31 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		}
 
 		release_bytes = reserve_bytes;
-
+		need_unlock = false;
+again:
 		/*
 		 * This is going to setup the pages array with the number of
 		 * pages we want, so we don't really need to worry about the
 		 * contents of pages from loop to loop
 		 */
-		ret = prepare_pages(root, file, pages, num_pages,
-				    pos, first_index, write_bytes,
+		ret = prepare_pages(inode, pages, num_pages,
+				    pos, write_bytes,
				    force_page_uptodate);
 		if (ret)
			break;
 
+		ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
+						      pos, &lockstart, &lockend,
+						      &cached_state);
+		if (ret < 0) {
+			if (ret == -EAGAIN)
+				goto again;
+			break;
+		} else if (ret > 0) {
+			need_unlock = true;
+			ret = 0;
+		}
+
 		copied = btrfs_copy_from_user(pos, num_pages,
					      write_bytes, pages, i);
 
@@ -1512,19 +1583,21 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		}
 
 		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
-		if (copied > 0) {
+
+		if (copied > 0)
 			ret = btrfs_dirty_pages(root, inode, pages,
						dirty_pages, pos, copied,
						NULL);
-			if (ret) {
-				btrfs_drop_pages(pages, num_pages);
-				break;
-			}
+		if (need_unlock)
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     lockstart, lockend, &cached_state,
+					     GFP_NOFS);
+		if (ret) {
+			btrfs_drop_pages(pages, num_pages);
+			break;
 		}
 
 		release_bytes = 0;
-		btrfs_drop_pages(pages, num_pages);
-
 		if (only_release_metadata && copied > 0) {
 			u64 lockstart = round_down(pos, root->sectorsize);
 			u64 lockend = lockstart +
@@ -1536,6 +1609,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 			only_release_metadata = false;
 		}
 
+		btrfs_drop_pages(pages, num_pages);
+
 		cond_resched();
 
 		balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1857,12 +1932,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	if (file->private_data)
 		btrfs_ioctl_trans_end(file);
 
+	/*
+	 * We use start here because we will need to wait on the IO to complete
+	 * in btrfs_sync_log, which could require joining a transaction (for
+	 * example checking cross references in the nocow path). If we use join
+	 * here we could get into a situation where we're waiting on IO to
+	 * happen that is blocked on a transaction trying to commit. With start
+	 * we inc the extwriter counter, so we wait for all extwriters to exit
+	 * before we start blocking join'ers. This comment is to keep somebody
+	 * from thinking they are super smart and changing this to
+	 * btrfs_join_transaction *cough*Josef*cough*.
+	 */
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		mutex_unlock(&inode->i_mutex);
 		goto out;
 	}
+	trans->sync = true;
 
 	ret = btrfs_log_dentry_safe(trans, root, dentry);
 	if (ret < 0) {
@@ -1963,11 +2050,13 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
 	struct btrfs_key key;
 	int ret;
 
+	if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+		goto out;
+
 	key.objectid = btrfs_ino(inode);
 	key.type = BTRFS_EXTENT_DATA_KEY;
 	key.offset = offset;
 
-
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret < 0)
 		return ret;
@@ -2064,8 +2153,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	u64 drop_end;
 	int ret = 0;
 	int err = 0;
+	int rsv_count;
 	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
			  ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 
 	ret = btrfs_wait_ordered_range(inode, offset, len);
 	if (ret)
@@ -2125,7 +2216,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		 * we need to try again.
 		 */
 		if ((!ordered ||
-		    (ordered->file_offset + ordered->len < lockstart ||
+		    (ordered->file_offset + ordered->len <= lockstart ||
		     ordered->file_offset > lockend)) &&
		     !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
				     lockend, EXTENT_UPTODATE, 0,
@@ -2163,9 +2254,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	/*
 	 * 1 - update the inode
 	 * 1 - removing the extents in the range
-	 * 1 - adding the hole extent
+	 * 1 - adding the hole extent if no_holes isn't set
 	 */
-	trans = btrfs_start_transaction(root, 3);
+	rsv_count = no_holes ? 2 : 3;
+	trans = btrfs_start_transaction(root, rsv_count);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		goto out_free;
@@ -2179,7 +2271,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	while (cur_offset < lockend) {
 		ret = __btrfs_drop_extents(trans, root, inode, path,
					   cur_offset, lockend + 1,
-					   &drop_end, 1);
+					   &drop_end, 1, 0, 0, NULL);
 		if (ret != -ENOSPC)
			break;
 
@@ -2202,7 +2294,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		btrfs_end_transaction(trans, root);
 		btrfs_btree_balance_dirty(root);
 
-		trans = btrfs_start_transaction(root, 3);
+		trans = btrfs_start_transaction(root, rsv_count);
 		if (IS_ERR(trans)) {
 			ret = PTR_ERR(trans);
 			trans = NULL;
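Editor's note: two independent fixes ride along in the hole-punch hunks. The `<` to `<=` change closes an off-by-one: an ordered extent that ends exactly at lockstart (file_offset + len == lockstart) does not overlap the locked range, so it must not force another wait-and-retry pass. The overlap predicate being negated there, written out:

	/* ordered [file_offset, file_offset + len) vs locked [lockstart, lockend] */
	static bool ordered_overlaps(u64 file_offset, u64 len,
				     u64 lockstart, u64 lockend)
	{
		return file_offset + len > lockstart && file_offset <= lockend;
	}

The second fix is the rsv_count change: on a NO_HOLES filesystem no explicit hole extent is inserted, so only two transaction units (extent removal plus inode update) need reserving instead of three.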
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 057be95b1e1e..73f3de7a083c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -347,8 +347,8 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
 			btrfs_readpage(NULL, page);
 			lock_page(page);
 			if (!PageUptodate(page)) {
-				printk(KERN_ERR "btrfs: error reading free "
-				       "space cache\n");
+				btrfs_err(BTRFS_I(inode)->root->fs_info,
+					  "error reading free space cache");
				io_ctl_drop_pages(io_ctl);
				return -EIO;
 			}
@@ -405,7 +405,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
 
 	gen = io_ctl->cur;
 	if (le64_to_cpu(*gen) != generation) {
-		printk_ratelimited(KERN_ERR "btrfs: space cache generation "
+		printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
				   "(%Lu) does not match inode (%Lu)\n", *gen,
				   generation);
 		io_ctl_unmap_page(io_ctl);
@@ -463,7 +463,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
			      PAGE_CACHE_SIZE - offset);
 	btrfs_csum_final(crc, (char *)&crc);
 	if (val != crc) {
-		printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
+		printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
				   "space cache\n");
 		io_ctl_unmap_page(io_ctl);
 		return -EIO;
@@ -1902,7 +1902,7 @@ out:
 	spin_unlock(&ctl->tree_lock);
 
 	if (ret) {
-		printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
+		printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret);
 		ASSERT(ret != -EEXIST);
 	}
 
@@ -2011,14 +2011,15 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (info->bytes >= bytes && !block_group->ro)
			count++;
-		printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
-		       info->offset, info->bytes,
+		btrfs_crit(block_group->fs_info,
+			   "entry offset %llu, bytes %llu, bitmap %s",
+			   info->offset, info->bytes,
		       (info->bitmap) ? "yes" : "no");
 	}
-	printk(KERN_INFO "block group has cluster?: %s\n",
+	btrfs_info(block_group->fs_info, "block group has cluster?: %s",
	       list_empty(&block_group->cluster_list) ? "no" : "yes");
-	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
-	       "\n", count);
+	btrfs_info(block_group->fs_info,
+		   "%d blocks of free space at or bigger than bytes is", count);
 }
 
 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
@@ -2421,7 +2422,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 	struct btrfs_free_space *entry = NULL;
 	struct btrfs_free_space *last;
 	struct rb_node *node;
-	u64 window_start;
 	u64 window_free;
 	u64 max_extent;
 	u64 total_size = 0;
@@ -2443,7 +2443,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
 	}
 
-	window_start = entry->offset;
 	window_free = entry->bytes;
 	max_extent = entry->bytes;
 	first = entry;
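Editor's note: the logging hunks here and in file-item.c follow one convention: messages that can name a filesystem move to the fs_info-aware helpers (btrfs_err, btrfs_info, btrfs_crit), which prefix the device identity and drop the trailing newline, while the remaining bare printk sites switch to the capitalized "BTRFS:" tag. The before/after shape, with a made-up message:

	/* old: impossible to tell which of several mounted btrfs volumes spoke */
	printk(KERN_ERR "btrfs: free space cache is corrupt\n");

	/* new: tagged with this filesystem's identity, no trailing \n */
	btrfs_err(fs_info, "free space cache is corrupt");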
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
new file mode 100644
index 000000000000..85889aa82c62
--- /dev/null
+++ b/fs/btrfs/hash.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include "hash.h"
+
+static struct crypto_shash *tfm;
+
+int __init btrfs_hash_init(void)
+{
+	tfm = crypto_alloc_shash("crc32c", 0, 0);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	return 0;
+}
+
+void btrfs_hash_exit(void)
+{
+	crypto_free_shash(tfm);
+}
+
+u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
+{
+	struct {
+		struct shash_desc shash;
+		char ctx[crypto_shash_descsize(tfm)];
+	} desc;
+	int err;
+
+	desc.shash.tfm = tfm;
+	desc.shash.flags = 0;
+	*(u32 *)desc.ctx = crc;
+
+	err = crypto_shash_update(&desc.shash, address, length);
+	BUG_ON(err);
+
+	return *(u32 *)desc.ctx;
+}
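Editor's note: routing crc32c through the crypto API lets the kernel substitute an accelerated backend (for example an SSE4.2-based crc32c driver) without btrfs knowing. The tradeoff is explicit init/exit; a sketch of how the module presumably wires it up and how call sites look afterwards (the init function name is illustrative):

	static int __init btrfs_module_init(void)
	{
		int err = btrfs_hash_init();	/* must precede any hashing */
		if (err)
			return err;
		/* ... remaining module initialization ... */
		return 0;
	}

	/* a call site: identical to the old crc32c() usage */
	u32 crc = btrfs_crc32c((u32)~1, name, name_len);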
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index 1d982812ab67..118a2316e5d3 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -19,10 +19,15 @@
 #ifndef __HASH__
 #define __HASH__
 
-#include <linux/crc32c.h>
+int __init btrfs_hash_init(void);
+
+void btrfs_hash_exit(void);
+
+u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length);
+
 static inline u64 btrfs_name_hash(const char *name, int len)
 {
-	return crc32c((u32)~1, name, len);
+	return btrfs_crc32c((u32)~1, name, len);
 }
 
 /*
@@ -31,7 +36,7 @@ static inline u64 btrfs_name_hash(const char *name, int len)
 static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
				    int len)
 {
-	return (u64) crc32c(parent_objectid, name, len);
+	return (u64) btrfs_crc32c(parent_objectid, name, len);
 }
 
 #endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index ec82fae07097..2be38df703c9 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -91,32 +91,6 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
 	return 0;
 }
 
-static struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
-		       struct btrfs_path *path,
-		       const char *name, int name_len,
-		       u64 inode_objectid, u64 ref_objectid, int ins_len,
-		       int cow)
-{
-	int ret;
-	struct btrfs_key key;
-	struct btrfs_inode_ref *ref;
-
-	key.objectid = inode_objectid;
-	key.type = BTRFS_INODE_REF_KEY;
-	key.offset = ref_objectid;
-
-	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-	if (ret < 0)
-		return ERR_PTR(ret);
-	if (ret > 0)
-		return NULL;
-	if (!find_name_in_backref(path, name, name_len, &ref))
-		return NULL;
-	return ref;
-}
-
 /* Returns NULL if no extref found */
 struct btrfs_inode_extref *
 btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
@@ -144,45 +118,6 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
 	return extref;
 }
 
-int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct btrfs_path *path,
-			      const char *name, int name_len,
-			      u64 inode_objectid, u64 ref_objectid, int mod,
-			      u64 *ret_index)
-{
-	struct btrfs_inode_ref *ref;
-	struct btrfs_inode_extref *extref;
-	int ins_len = mod < 0 ? -1 : 0;
-	int cow = mod != 0;
-
-	ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
-				     inode_objectid, ref_objectid, ins_len,
-				     cow);
-	if (IS_ERR(ref))
-		return PTR_ERR(ref);
-
-	if (ref != NULL) {
-		*ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
-		return 0;
-	}
-
-	btrfs_release_path(path);
-
-	extref = btrfs_lookup_inode_extref(trans, root, path, name,
-					   name_len, inode_objectid,
-					   ref_objectid, ins_len, cow);
-	if (IS_ERR(extref))
-		return PTR_ERR(extref);
-
-	if (extref) {
-		*ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
-		return 0;
-	}
-
-	return -ENOENT;
-}
-
 static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f1a77449d032..d3d44486290b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -58,9 +58,10 @@
 #include "inode-map.h"
 #include "backref.h"
 #include "hash.h"
+#include "props.h"
 
 struct btrfs_iget_args {
-	u64 ino;
+	struct btrfs_key *location;
 	struct btrfs_root *root;
 };
 
@@ -125,13 +126,12 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
  * no overlapping inline items exist in the btree
  */
 static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
 {
-	struct btrfs_key key;
-	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct page *page = NULL;
 	char *kaddr;
@@ -140,29 +140,29 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 	int err = 0;
 	int ret;
 	size_t cur_size = size;
-	size_t datasize;
 	unsigned long offset;
 
 	if (compressed_size && compressed_pages)
 		cur_size = compressed_size;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	path->leave_spinning = 1;
-
-	key.objectid = btrfs_ino(inode);
-	key.offset = start;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-	datasize = btrfs_file_extent_calc_inline_size(cur_size);
-
-	inode_add_bytes(inode, size);
-	ret = btrfs_insert_empty_item(trans, root, path, &key,
-				      datasize);
-	if (ret) {
-		err = ret;
-		goto fail;
+	inode_add_bytes(inode, size);
+
+	if (!extent_inserted) {
+		struct btrfs_key key;
+		size_t datasize;
+
+		key.objectid = btrfs_ino(inode);
+		key.offset = start;
+		btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+
+		datasize = btrfs_file_extent_calc_inline_size(cur_size);
+		path->leave_spinning = 1;
+		ret = btrfs_insert_empty_item(trans, root, path, &key,
+					      datasize);
+		if (ret) {
+			err = ret;
+			goto fail;
+		}
 	}
 	leaf = path->nodes[0];
 	ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -203,7 +203,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 		page_cache_release(page);
 	}
 	btrfs_mark_buffer_dirty(leaf);
-	btrfs_free_path(path);
+	btrfs_release_path(path);
 
 	/*
 	 * we're an inline extent, so nobody can
@@ -219,7 +219,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 
 	return ret;
 fail:
-	btrfs_free_path(path);
 	return err;
 }
 
@@ -242,6 +241,9 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 	u64 aligned_end = ALIGN(end, root->sectorsize);
 	u64 data_len = inline_len;
 	int ret;
+	struct btrfs_path *path;
+	int extent_inserted = 0;
+	u32 extent_item_size;
 
 	if (compressed_size)
 		data_len = compressed_size;
@@ -256,12 +258,27 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 		return 1;
 	}
 
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
 	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans))
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
 		return PTR_ERR(trans);
+	}
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
-	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
+	if (compressed_size && compressed_pages)
+		extent_item_size = btrfs_file_extent_calc_inline_size(
+		   compressed_size);
+	else
+		extent_item_size = btrfs_file_extent_calc_inline_size(
+		    inline_len);
+
+	ret = __btrfs_drop_extents(trans, root, inode, path,
+				   start, aligned_end, NULL,
+				   1, 1, extent_item_size, &extent_inserted);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out;
@@ -269,7 +286,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 
 	if (isize > actual_end)
 		inline_len = min_t(u64, isize, actual_end);
-	ret = insert_inline_extent(trans, root, inode, start,
+	ret = insert_inline_extent(trans, path, extent_inserted,
+				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
 	if (ret && ret != -ENOSPC) {
@@ -284,6 +302,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 	btrfs_delalloc_release_metadata(inode, end + 1 - start);
 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 out:
+	btrfs_free_path(path);
 	btrfs_end_transaction(trans, root);
 	return ret;
 }
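Editor's note: the inline-extent path is the first user of the replace_extent mode added to __btrfs_drop_extents in file.c: the replacement item's size is computed up front, the drop reuses the caller's path, and insert_inline_extent only performs its own insertion when extent_inserted is still 0. Condensed to its flow (a sketch, with data_len standing in for whichever length applies):

	extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
	ret = __btrfs_drop_extents(trans, root, inode, path, start, aligned_end,
				   NULL, 1, 1, extent_item_size,
				   &extent_inserted);
	if (!ret)
		ret = insert_inline_extent(trans, path, extent_inserted,
					   root, inode, start, inline_len,
					   compressed_size, compress_type,
					   compressed_pages);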
@@ -1262,7 +1281,8 @@ next_slot:
			nocow = 1;
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			extent_end = found_key.offset +
-				btrfs_file_extent_inline_len(leaf, fi);
+				btrfs_file_extent_inline_len(leaf,
						     path->slots[0], fi);
 			extent_end = ALIGN(extent_end, root->sectorsize);
 		} else {
 			BUG_ON(1);
@@ -1577,7 +1597,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
			 unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	u64 logical = (u64)bio->bi_sector << 9;
+	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
 	int ret;
@@ -1585,7 +1605,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
 	if (bio_flags & EXTENT_BIO_COMPRESSED)
 		return 0;
 
-	length = bio->bi_size;
+	length = bio->bi_iter.bi_size;
 	map_length = length;
 	ret = btrfs_map_block(root->fs_info, rw, logical,
			      &map_length, NULL, 0);
@@ -1841,14 +1861,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_key ins;
+	int extent_inserted = 0;
 	int ret;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	path->leave_spinning = 1;
-
 	/*
 	 * we may be replacing one extent in the tree with another.
 	 * The new extent is pinned in the extent map, and we don't want
@@ -1858,17 +1877,23 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	 * the caller is expected to unpin it and allow it to be merged
 	 * with the others.
 	 */
-	ret = btrfs_drop_extents(trans, root, inode, file_pos,
-				 file_pos + num_bytes, 0);
+	ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
+				   file_pos + num_bytes, NULL, 0,
+				   1, sizeof(*fi), &extent_inserted);
 	if (ret)
		goto out;
 
-	ins.objectid = btrfs_ino(inode);
-	ins.offset = file_pos;
-	ins.type = BTRFS_EXTENT_DATA_KEY;
-	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
-	if (ret)
-		goto out;
+	if (!extent_inserted) {
+		ins.objectid = btrfs_ino(inode);
+		ins.offset = file_pos;
+		ins.type = BTRFS_EXTENT_DATA_KEY;
+
+		path->leave_spinning = 1;
+		ret = btrfs_insert_empty_item(trans, root, path, &ins,
+					      sizeof(*fi));
+		if (ret)
+			goto out;
+	}
 	leaf = path->nodes[0];
 	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
@@ -2290,7 +2315,7 @@ again:
 		u64 extent_len;
 		struct btrfs_key found_key;
 
-		ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 		if (ret < 0)
			goto out_free_path;
 
@@ -2543,12 +2568,6 @@ out_kfree:
 	return NULL;
 }
 
-/*
- * helper function for btrfs_finish_ordered_io, this
- * just reads in some of the csum leaves to prime them into ram
- * before we start the transaction.  It limits the amount of btree
- * reads required while inside the transaction.
- */
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -2610,7 +2629,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
			 EXTENT_DEFRAG, 1, cached_state);
 	if (ret) {
 		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
-		if (last_snapshot >= BTRFS_I(inode)->generation)
+		if (0 && last_snapshot >= BTRFS_I(inode)->generation)
 			/* the inode is shared */
			new = record_old_file_extents(inode, ordered_extent);
 
@@ -3248,7 +3267,8 @@ out:
  * slot is the slot the inode is in, objectid is the objectid of the inode
  */
 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
-					  int slot, u64 objectid)
+					  int slot, u64 objectid,
+					  int *first_xattr_slot)
 {
 	u32 nritems = btrfs_header_nritems(leaf);
 	struct btrfs_key found_key;
@@ -3264,6 +3284,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 	}
 
 	slot++;
+	*first_xattr_slot = -1;
 	while (slot < nritems) {
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
@@ -3273,6 +3294,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 
 		/* we found an xattr, assume we've got an acl */
 		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+			if (*first_xattr_slot == -1)
+				*first_xattr_slot = slot;
 			if (found_key.offset == xattr_access ||
			    found_key.offset == xattr_default)
				return 1;
@@ -3301,6 +3324,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 	 * something larger than an xattr.  We have to assume the inode
 	 * has acls
 	 */
+	if (*first_xattr_slot == -1)
+		*first_xattr_slot = slot;
 	return 1;
 }
 
@@ -3315,10 +3340,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_timespec *tspec;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
+	unsigned long ptr;
 	int maybe_acls;
 	u32 rdev;
 	int ret;
 	bool filled = false;
+	int first_xattr_slot;
 
 	ret = btrfs_fill_inode(inode, &rdev);
 	if (!ret)
@@ -3328,7 +3355,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	if (!path)
 		goto make_bad;
 
-	path->leave_spinning = 1;
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -3338,7 +3364,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	leaf = path->nodes[0];
 
 	if (filled)
-		goto cache_acl;
+		goto cache_index;
 
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
@@ -3381,18 +3407,51 @@ static void btrfs_read_locked_inode(struct inode *inode)
 
 	BTRFS_I(inode)->index_cnt = (u64)-1;
 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+
+cache_index:
+	path->slots[0]++;
+	if (inode->i_nlink != 1 ||
+	    path->slots[0] >= btrfs_header_nritems(leaf))
+		goto cache_acl;
+
+	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
+	if (location.objectid != btrfs_ino(inode))
+		goto cache_acl;
+
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	if (location.type == BTRFS_INODE_REF_KEY) {
+		struct btrfs_inode_ref *ref;
+
+		ref = (struct btrfs_inode_ref *)ptr;
+		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
+		struct btrfs_inode_extref *extref;
+
+		extref = (struct btrfs_inode_extref *)ptr;
+		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
+								     extref);
+	}
 cache_acl:
 	/*
 	 * try to precache a NULL acl entry for files that don't have
 	 * any xattrs or acls
 	 */
 	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
-					   btrfs_ino(inode));
+					   btrfs_ino(inode), &first_xattr_slot);
+	if (first_xattr_slot != -1) {
+		path->slots[0] = first_xattr_slot;
+		ret = btrfs_load_inode_props(inode, path);
+		if (ret)
+			btrfs_err(root->fs_info,
+				  "error loading props for ino %llu (root %llu): %d\n",
+				  btrfs_ino(inode),
+				  root->root_key.objectid, ret);
+	}
+	btrfs_free_path(path);
+
 	if (!maybe_acls)
 		cache_no_acl(inode);
 
-	btrfs_free_path(path);
-
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
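Editor's note: the cache_index block works because of btree key ordering; for an inode with a single link, the item right after the inode item in the leaf is its INODE_REF (or EXTREF), which carries the directory index, and any xattrs follow after that. Sketched as the leaf layout this code walks:

	/*
	 * items for a one-link file, in key order inside the leaf:
	 *   (ino, BTRFS_INODE_ITEM_KEY,  0)          <- slot found by lookup
	 *   (ino, BTRFS_INODE_REF_KEY,   parent_ino) <- next slot: dir_index
	 *   (ino, BTRFS_XATTR_ITEM_KEY,  name_hash)  <- first_xattr_slot, if any
	 */

__btrfs_unlink_inode (next hunks) consumes the cached dir_index to skip the backref search entirely.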
@@ -3496,7 +3555,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
 		goto failed;
 	}
 
-	btrfs_unlock_up_safe(path, 1);
 	leaf = path->nodes[0];
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
@@ -3593,6 +3651,24 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		goto err;
 	btrfs_release_path(path);
 
+	/*
+	 * If we don't have dir index, we have to get it by looking up
+	 * the inode ref, since we get the inode ref, remove it directly,
+	 * it is unnecessary to do delayed deletion.
+	 *
+	 * But if we have dir index, needn't search inode ref to get it.
+	 * Since the inode ref is close to the inode item, it is better
+	 * that we delay to delete it, and just do this deletion when
+	 * we update the inode item.
+	 */
+	if (BTRFS_I(inode)->dir_index) {
+		ret = btrfs_delayed_delete_inode_ref(inode);
+		if (!ret) {
+			index = BTRFS_I(inode)->dir_index;
+			goto skip_backref;
+		}
+	}
+
 	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
				  dir_ino, &index);
 	if (ret) {
@@ -3602,7 +3678,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		btrfs_abort_transaction(trans, root, ret);
 		goto err;
 	}
-
+skip_backref:
 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
@@ -3948,7 +4024,7 @@ search_again:
				btrfs_file_extent_num_bytes(leaf, fi);
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			item_end += btrfs_file_extent_inline_len(leaf,
-								 fi);
+							 path->slots[0], fi);
 		}
 		item_end--;
 	}
@@ -4018,6 +4094,12 @@ search_again:
				inode_sub_bytes(inode, item_end + 1 -
						new_size);
 			}
+
+			/*
+			 * update the ram bytes to properly reflect
+			 * the new size of our item
+			 */
+			btrfs_set_file_extent_ram_bytes(leaf, fi, size);
 			size =
			    btrfs_file_extent_calc_inline_size(size);
 			btrfs_truncate_item(root, path, size, 1);
@@ -4203,6 +4285,49 @@ out:
 	return ret;
 }
 
+static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+			     u64 offset, u64 len)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	/*
+	 * Still need to make sure the inode looks like it's been updated so
+	 * that any holes get logged if we fsync.
+	 */
+	if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
+		BTRFS_I(inode)->last_trans = root->fs_info->generation;
+		BTRFS_I(inode)->last_sub_trans = root->log_transid;
+		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+		return 0;
+	}
+
+	/*
+	 * 1 - for the one we're dropping
+	 * 1 - for the one we're adding
+	 * 1 - for updating the inode.
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
+	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+				       0, 0, len, 0, len, 0, 0, 0);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	else
+		btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
 /*
  * This function puts in dummy file extents for the area we're creating a hole
  * for. So if we are truncating this file to a larger size we need to insert
@@ -4211,7 +4336,6 @@ out:
4211 */ 4336 */
4212int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 4337int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4213{ 4338{
4214 struct btrfs_trans_handle *trans;
4215 struct btrfs_root *root = BTRFS_I(inode)->root; 4339 struct btrfs_root *root = BTRFS_I(inode)->root;
4216 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4340 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4217 struct extent_map *em = NULL; 4341 struct extent_map *em = NULL;
@@ -4266,31 +4390,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4266 struct extent_map *hole_em; 4390 struct extent_map *hole_em;
4267 hole_size = last_byte - cur_offset; 4391 hole_size = last_byte - cur_offset;
4268 4392
4269 trans = btrfs_start_transaction(root, 3); 4393 err = maybe_insert_hole(root, inode, cur_offset,
4270 if (IS_ERR(trans)) { 4394 hole_size);
4271 err = PTR_ERR(trans); 4395 if (err)
4272 break;
4273 }
4274
4275 err = btrfs_drop_extents(trans, root, inode,
4276 cur_offset,
4277 cur_offset + hole_size, 1);
4278 if (err) {
4279 btrfs_abort_transaction(trans, root, err);
4280 btrfs_end_transaction(trans, root);
4281 break;
4282 }
4283
4284 err = btrfs_insert_file_extent(trans, root,
4285 btrfs_ino(inode), cur_offset, 0,
4286 0, hole_size, 0, hole_size,
4287 0, 0, 0);
4288 if (err) {
4289 btrfs_abort_transaction(trans, root, err);
4290 btrfs_end_transaction(trans, root);
4291 break; 4396 break;
4292 }
4293
4294 btrfs_drop_extent_cache(inode, cur_offset, 4397 btrfs_drop_extent_cache(inode, cur_offset,
4295 cur_offset + hole_size - 1, 0); 4398 cur_offset + hole_size - 1, 0);
4296 hole_em = alloc_extent_map(); 4399 hole_em = alloc_extent_map();
@@ -4309,7 +4412,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4309 hole_em->ram_bytes = hole_size; 4412 hole_em->ram_bytes = hole_size;
4310 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 4413 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4311 hole_em->compress_type = BTRFS_COMPRESS_NONE; 4414 hole_em->compress_type = BTRFS_COMPRESS_NONE;
4312 hole_em->generation = trans->transid; 4415 hole_em->generation = root->fs_info->generation;
4313 4416
4314 while (1) { 4417 while (1) {
4315 write_lock(&em_tree->lock); 4418 write_lock(&em_tree->lock);
@@ -4322,17 +4425,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4322 hole_size - 1, 0); 4425 hole_size - 1, 0);
4323 } 4426 }
4324 free_extent_map(hole_em); 4427 free_extent_map(hole_em);
4325next:
4326 btrfs_update_inode(trans, root, inode);
4327 btrfs_end_transaction(trans, root);
4328 } 4428 }
4429next:
4329 free_extent_map(em); 4430 free_extent_map(em);
4330 em = NULL; 4431 em = NULL;
4331 cur_offset = last_byte; 4432 cur_offset = last_byte;
4332 if (cur_offset >= block_end) 4433 if (cur_offset >= block_end)
4333 break; 4434 break;
4334 } 4435 }
4335
4336 free_extent_map(em); 4436 free_extent_map(em);
4337 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 4437 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
4338 GFP_NOFS); 4438 GFP_NOFS);
@@ -4354,8 +4454,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4354 * these flags set. For all other operations the VFS set these flags 4454 * these flags set. For all other operations the VFS set these flags
4355 * explicitly if it wants a timestamp update. 4455 * explicitly if it wants a timestamp update.
4356 */ 4456 */
4357 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) 4457 if (newsize != oldsize) {
4358 inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); 4458 inode_inc_iversion(inode);
4459 if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
4460 inode->i_ctime = inode->i_mtime =
4461 current_fs_time(inode->i_sb);
4462 }
4359 4463
4360 if (newsize > oldsize) { 4464 if (newsize > oldsize) {
4361 truncate_pagecache(inode, newsize); 4465 truncate_pagecache(inode, newsize);
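
The rule this hunk encodes is worth stating on its own: every size change bumps the inode version, while the timestamps are only touched when the caller did not already request an explicit time update. A small self-contained model of that rule, with toy types standing in for the VFS structures:

	#include <stdio.h>
	#include <time.h>

	#define ATTR_CTIME 0x1
	#define ATTR_MTIME 0x2

	struct toy_inode { unsigned long i_version; time_t ctime, mtime; };

	static void toy_setsize(struct toy_inode *inode, long oldsize,
				long newsize, unsigned int mask)
	{
		if (newsize != oldsize) {
			inode->i_version++;	/* always bump on a size change */
			if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
				inode->ctime = inode->mtime = time(NULL);
		}
	}

	int main(void)
	{
		struct toy_inode ino = { 0 };

		toy_setsize(&ino, 100, 200, 0);
		printf("version %lu, times set implicitly\n", ino.i_version);
		toy_setsize(&ino, 200, 300, ATTR_CTIME);
		printf("version %lu, times left to the caller\n", ino.i_version);
		return 0;
	}
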
@@ -4464,12 +4568,70 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
4464 err = btrfs_dirty_inode(inode); 4568 err = btrfs_dirty_inode(inode);
4465 4569
4466 if (!err && attr->ia_valid & ATTR_MODE) 4570 if (!err && attr->ia_valid & ATTR_MODE)
4467 err = btrfs_acl_chmod(inode); 4571 err = posix_acl_chmod(inode, inode->i_mode);
4468 } 4572 }
4469 4573
4470 return err; 4574 return err;
4471} 4575}
4472 4576
4577/*
4578 * While truncating the inode pages during eviction, we get the VFS calling
4579 * btrfs_invalidatepage() against each page of the inode. This is slow because
4580 * the calls to btrfs_invalidatepage() result in a huge amount of calls to
4581 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
4582 * extent_state structures over and over, wasting lots of time.
4583 *
4584 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
4585 * those expensive operations on a per page basis and do only the ordered io
4586 * finishing, while we release here the extent_map and extent_state structures,
4587 * without the excessive merging and splitting.
4588 */
4589static void evict_inode_truncate_pages(struct inode *inode)
4590{
4591 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4592 struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
4593 struct rb_node *node;
4594
4595 ASSERT(inode->i_state & I_FREEING);
4596 truncate_inode_pages(&inode->i_data, 0);
4597
4598 write_lock(&map_tree->lock);
4599 while (!RB_EMPTY_ROOT(&map_tree->map)) {
4600 struct extent_map *em;
4601
4602 node = rb_first(&map_tree->map);
4603 em = rb_entry(node, struct extent_map, rb_node);
4604 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
4605 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
4606 remove_extent_mapping(map_tree, em);
4607 free_extent_map(em);
4608 }
4609 write_unlock(&map_tree->lock);
4610
4611 spin_lock(&io_tree->lock);
4612 while (!RB_EMPTY_ROOT(&io_tree->state)) {
4613 struct extent_state *state;
4614 struct extent_state *cached_state = NULL;
4615
4616 node = rb_first(&io_tree->state);
4617 state = rb_entry(node, struct extent_state, rb_node);
4618 atomic_inc(&state->refs);
4619 spin_unlock(&io_tree->lock);
4620
4621 lock_extent_bits(io_tree, state->start, state->end,
4622 0, &cached_state);
4623 clear_extent_bit(io_tree, state->start, state->end,
4624 EXTENT_LOCKED | EXTENT_DIRTY |
4625 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
4626 EXTENT_DEFRAG, 1, 1,
4627 &cached_state, GFP_NOFS);
4628 free_extent_state(state);
4629
4630 spin_lock(&io_tree->lock);
4631 }
4632 spin_unlock(&io_tree->lock);
4633}
4634
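
Both loops in evict_inode_truncate_pages() follow the same drain pattern: grab the lock, pick the first tree node, and do the expensive teardown with the lock dropped (the io_tree loop pins the state with a reference before unlocking). The sketch below is a simplified userspace analogue of that shape, using a list and a pthread mutex instead of an rbtree and a spinlock, and detaching the entry up front rather than pinning it:

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct state { struct state *next; int id; };

	static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct state *tree;

	static void drain(void)
	{
		pthread_mutex_lock(&tree_lock);
		while (tree) {
			struct state *s = tree;

			tree = s->next;			/* detach under the lock */
			pthread_mutex_unlock(&tree_lock);
			printf("clearing state %d\n", s->id);	/* expensive work */
			free(s);
			pthread_mutex_lock(&tree_lock);
		}
		pthread_mutex_unlock(&tree_lock);
	}

	int main(void)
	{
		for (int i = 0; i < 3; i++) {
			struct state *s = malloc(sizeof(*s));

			if (!s)
				return 1;
			s->id = i;
			s->next = tree;
			tree = s;
		}
		drain();
		return 0;
	}
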
4473void btrfs_evict_inode(struct inode *inode) 4635void btrfs_evict_inode(struct inode *inode)
4474{ 4636{
4475 struct btrfs_trans_handle *trans; 4637 struct btrfs_trans_handle *trans;
@@ -4480,7 +4642,8 @@ void btrfs_evict_inode(struct inode *inode)
4480 4642
4481 trace_btrfs_inode_evict(inode); 4643 trace_btrfs_inode_evict(inode);
4482 4644
4483 truncate_inode_pages(&inode->i_data, 0); 4645 evict_inode_truncate_pages(inode);
4646
4484 if (inode->i_nlink && 4647 if (inode->i_nlink &&
4485 ((btrfs_root_refs(&root->root_item) != 0 && 4648 ((btrfs_root_refs(&root->root_item) != 0 &&
4486 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || 4649 root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
@@ -4655,9 +4818,9 @@ static int fixup_tree_root_location(struct btrfs_root *root,
4655 } 4818 }
4656 4819
4657 err = -ENOENT; 4820 err = -ENOENT;
4658 ret = btrfs_find_root_ref(root->fs_info->tree_root, path, 4821 ret = btrfs_find_item(root->fs_info->tree_root, path,
4659 BTRFS_I(dir)->root->root_key.objectid, 4822 BTRFS_I(dir)->root->root_key.objectid,
4660 location->objectid); 4823 location->objectid, BTRFS_ROOT_REF_KEY, NULL);
4661 if (ret) { 4824 if (ret) {
4662 if (ret < 0) 4825 if (ret < 0)
4663 err = ret; 4826 err = ret;
@@ -4818,7 +4981,9 @@ again:
4818static int btrfs_init_locked_inode(struct inode *inode, void *p) 4981static int btrfs_init_locked_inode(struct inode *inode, void *p)
4819{ 4982{
4820 struct btrfs_iget_args *args = p; 4983 struct btrfs_iget_args *args = p;
4821 inode->i_ino = args->ino; 4984 inode->i_ino = args->location->objectid;
4985 memcpy(&BTRFS_I(inode)->location, args->location,
4986 sizeof(*args->location));
4822 BTRFS_I(inode)->root = args->root; 4987 BTRFS_I(inode)->root = args->root;
4823 return 0; 4988 return 0;
4824} 4989}
@@ -4826,19 +4991,19 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
4826static int btrfs_find_actor(struct inode *inode, void *opaque) 4991static int btrfs_find_actor(struct inode *inode, void *opaque)
4827{ 4992{
4828 struct btrfs_iget_args *args = opaque; 4993 struct btrfs_iget_args *args = opaque;
4829 return args->ino == btrfs_ino(inode) && 4994 return args->location->objectid == BTRFS_I(inode)->location.objectid &&
4830 args->root == BTRFS_I(inode)->root; 4995 args->root == BTRFS_I(inode)->root;
4831} 4996}
4832 4997
4833static struct inode *btrfs_iget_locked(struct super_block *s, 4998static struct inode *btrfs_iget_locked(struct super_block *s,
4834 u64 objectid, 4999 struct btrfs_key *location,
4835 struct btrfs_root *root) 5000 struct btrfs_root *root)
4836{ 5001{
4837 struct inode *inode; 5002 struct inode *inode;
4838 struct btrfs_iget_args args; 5003 struct btrfs_iget_args args;
4839 unsigned long hashval = btrfs_inode_hash(objectid, root); 5004 unsigned long hashval = btrfs_inode_hash(location->objectid, root);
4840 5005
4841 args.ino = objectid; 5006 args.location = location;
4842 args.root = root; 5007 args.root = root;
4843 5008
4844 inode = iget5_locked(s, hashval, btrfs_find_actor, 5009 inode = iget5_locked(s, hashval, btrfs_find_actor,
@@ -4855,13 +5020,11 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
4855{ 5020{
4856 struct inode *inode; 5021 struct inode *inode;
4857 5022
4858 inode = btrfs_iget_locked(s, location->objectid, root); 5023 inode = btrfs_iget_locked(s, location, root);
4859 if (!inode) 5024 if (!inode)
4860 return ERR_PTR(-ENOMEM); 5025 return ERR_PTR(-ENOMEM);
4861 5026
4862 if (inode->i_state & I_NEW) { 5027 if (inode->i_state & I_NEW) {
4863 BTRFS_I(inode)->root = root;
4864 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
4865 btrfs_read_locked_inode(inode); 5028 btrfs_read_locked_inode(inode);
4866 if (!is_bad_inode(inode)) { 5029 if (!is_bad_inode(inode)) {
4867 inode_tree_add(inode); 5030 inode_tree_add(inode);
@@ -4917,7 +5080,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4917 return ERR_PTR(ret); 5080 return ERR_PTR(ret);
4918 5081
4919 if (location.objectid == 0) 5082 if (location.objectid == 0)
4920 return NULL; 5083 return ERR_PTR(-ENOENT);
4921 5084
4922 if (location.type == BTRFS_INODE_ITEM_KEY) { 5085 if (location.type == BTRFS_INODE_ITEM_KEY) {
4923 inode = btrfs_iget(dir->i_sb, &location, root, NULL); 5086 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
@@ -4981,10 +5144,17 @@ static void btrfs_dentry_release(struct dentry *dentry)
4981static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 5144static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4982 unsigned int flags) 5145 unsigned int flags)
4983{ 5146{
4984 struct dentry *ret; 5147 struct inode *inode;
4985 5148
4986 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 5149 inode = btrfs_lookup_dentry(dir, dentry);
4987 return ret; 5150 if (IS_ERR(inode)) {
5151 if (PTR_ERR(inode) == -ENOENT)
5152 inode = NULL;
5153 else
5154 return ERR_CAST(inode);
5155 }
5156
5157 return d_materialise_unique(dentry, inode);
4988} 5158}
4989 5159
4990unsigned char btrfs_filetype_table[] = { 5160unsigned char btrfs_filetype_table[] = {
@@ -5354,7 +5524,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5354 u32 sizes[2]; 5524 u32 sizes[2];
5355 unsigned long ptr; 5525 unsigned long ptr;
5356 int ret; 5526 int ret;
5357 int owner;
5358 5527
5359 path = btrfs_alloc_path(); 5528 path = btrfs_alloc_path();
5360 if (!path) 5529 if (!path)
@@ -5388,6 +5557,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5388 * number 5557 * number
5389 */ 5558 */
5390 BTRFS_I(inode)->index_cnt = 2; 5559 BTRFS_I(inode)->index_cnt = 2;
5560 BTRFS_I(inode)->dir_index = *index;
5391 BTRFS_I(inode)->root = root; 5561 BTRFS_I(inode)->root = root;
5392 BTRFS_I(inode)->generation = trans->transid; 5562 BTRFS_I(inode)->generation = trans->transid;
5393 inode->i_generation = BTRFS_I(inode)->generation; 5563 inode->i_generation = BTRFS_I(inode)->generation;
@@ -5400,11 +5570,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5400 */ 5570 */
5401 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 5571 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
5402 5572
5403 if (S_ISDIR(mode))
5404 owner = 0;
5405 else
5406 owner = 1;
5407
5408 key[0].objectid = objectid; 5573 key[0].objectid = objectid;
5409 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 5574 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
5410 key[0].offset = 0; 5575 key[0].offset = 0;
@@ -5469,6 +5634,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5469 5634
5470 btrfs_update_root_times(trans, root); 5635 btrfs_update_root_times(trans, root);
5471 5636
5637 ret = btrfs_inode_inherit_props(trans, inode, dir);
5638 if (ret)
5639 btrfs_err(root->fs_info,
5640 "error inheriting props for ino %llu (root %llu): %d",
5641 btrfs_ino(inode), root->root_key.objectid, ret);
5642
5472 return inode; 5643 return inode;
5473fail: 5644fail:
5474 if (dir) 5645 if (dir)
@@ -5737,6 +5908,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5737 goto fail; 5908 goto fail;
5738 } 5909 }
5739 5910
5911 /* The inode now has several dir indexes, so clear the cached one. */
5912 BTRFS_I(inode)->dir_index = 0ULL;
5740 inc_nlink(inode); 5913 inc_nlink(inode);
5741 inode_inc_iversion(inode); 5914 inode_inc_iversion(inode);
5742 inode->i_ctime = CURRENT_TIME; 5915 inode->i_ctime = CURRENT_TIME;
@@ -6000,7 +6173,7 @@ again:
6000 btrfs_file_extent_num_bytes(leaf, item); 6173 btrfs_file_extent_num_bytes(leaf, item);
6001 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 6174 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6002 size_t size; 6175 size_t size;
6003 size = btrfs_file_extent_inline_len(leaf, item); 6176 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6004 extent_end = ALIGN(extent_start + size, root->sectorsize); 6177 extent_end = ALIGN(extent_start + size, root->sectorsize);
6005 } 6178 }
6006next: 6179next:
@@ -6069,7 +6242,7 @@ next:
6069 goto out; 6242 goto out;
6070 } 6243 }
6071 6244
6072 size = btrfs_file_extent_inline_len(leaf, item); 6245 size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6073 extent_offset = page_offset(page) + pg_offset - extent_start; 6246 extent_offset = page_offset(page) + pg_offset - extent_start;
6074 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, 6247 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
6075 size - extent_offset); 6248 size - extent_offset);
@@ -6386,6 +6559,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6386 int slot; 6559 int slot;
6387 int found_type; 6560 int found_type;
6388 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); 6561 bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
6562
6389 path = btrfs_alloc_path(); 6563 path = btrfs_alloc_path();
6390 if (!path) 6564 if (!path)
6391 return -ENOMEM; 6565 return -ENOMEM;
@@ -6429,6 +6603,10 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6429 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) 6603 if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6430 goto out; 6604 goto out;
6431 6605
6606 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6607 if (extent_end <= offset)
6608 goto out;
6609
6432 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); 6610 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6433 if (disk_bytenr == 0) 6611 if (disk_bytenr == 0)
6434 goto out; 6612 goto out;
@@ -6446,8 +6624,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6446 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); 6624 *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6447 } 6625 }
6448 6626
6449 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6450
6451 if (btrfs_extent_readonly(root, disk_bytenr)) 6627 if (btrfs_extent_readonly(root, disk_bytenr))
6452 goto out; 6628 goto out;
6453 btrfs_release_path(path); 6629 btrfs_release_path(path);
@@ -6779,17 +6955,16 @@ unlock_err:
6779static void btrfs_endio_direct_read(struct bio *bio, int err) 6955static void btrfs_endio_direct_read(struct bio *bio, int err)
6780{ 6956{
6781 struct btrfs_dio_private *dip = bio->bi_private; 6957 struct btrfs_dio_private *dip = bio->bi_private;
6782 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 6958 struct bio_vec *bvec;
6783 struct bio_vec *bvec = bio->bi_io_vec;
6784 struct inode *inode = dip->inode; 6959 struct inode *inode = dip->inode;
6785 struct btrfs_root *root = BTRFS_I(inode)->root; 6960 struct btrfs_root *root = BTRFS_I(inode)->root;
6786 struct bio *dio_bio; 6961 struct bio *dio_bio;
6787 u32 *csums = (u32 *)dip->csum; 6962 u32 *csums = (u32 *)dip->csum;
6788 int index = 0;
6789 u64 start; 6963 u64 start;
6964 int i;
6790 6965
6791 start = dip->logical_offset; 6966 start = dip->logical_offset;
6792 do { 6967 bio_for_each_segment_all(bvec, bio, i) {
6793 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 6968 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
6794 struct page *page = bvec->bv_page; 6969 struct page *page = bvec->bv_page;
6795 char *kaddr; 6970 char *kaddr;
@@ -6805,18 +6980,16 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
6805 local_irq_restore(flags); 6980 local_irq_restore(flags);
6806 6981
6807 flush_dcache_page(bvec->bv_page); 6982 flush_dcache_page(bvec->bv_page);
6808 if (csum != csums[index]) { 6983 if (csum != csums[i]) {
6809 btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", 6984 btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
6810 btrfs_ino(inode), start, csum, 6985 btrfs_ino(inode), start, csum,
6811 csums[index]); 6986 csums[i]);
6812 err = -EIO; 6987 err = -EIO;
6813 } 6988 }
6814 } 6989 }
6815 6990
6816 start += bvec->bv_len; 6991 start += bvec->bv_len;
6817 bvec++; 6992 }
6818 index++;
6819 } while (bvec <= bvec_end);
6820 6993
6821 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, 6994 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
6822 dip->logical_offset + dip->bytes - 1); 6995 dip->logical_offset + dip->bytes - 1);
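
The conversion above replaces manual bvec pointer and index bookkeeping with bio_for_each_segment_all(), which walks every segment and hands back the loop index that the csum array is indexed by. The sketch below imitates that shape on a plain array; the macro here is a hypothetical stand-in, not the kernel's definition:

	#include <stdio.h>

	struct vec { int len; };

	/* hypothetical for-each, modeled on bio_for_each_segment_all():
	 * it yields the element and the running index in one construct */
	#define for_each_vec(v, arr, n, i) \
		for ((i) = 0; (i) < (n) && ((v) = &(arr)[(i)]); (i)++)

	int main(void)
	{
		struct vec segs[3] = { {512}, {1024}, {512} };
		struct vec *v;
		long start = 0;
		int i;

		for_each_vec(v, segs, 3, i) {
			printf("segment %d at offset %ld, len %d\n",
			       i, start, v->len);
			start += v->len;	/* same running offset as `start` above */
		}
		return 0;
	}
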
@@ -6894,10 +7067,11 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
6894 struct btrfs_dio_private *dip = bio->bi_private; 7067 struct btrfs_dio_private *dip = bio->bi_private;
6895 7068
6896 if (err) { 7069 if (err) {
6897 printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " 7070 btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
6898 "sector %#Lx len %u err no %d\n", 7071 "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
6899 btrfs_ino(dip->inode), bio->bi_rw, 7072 btrfs_ino(dip->inode), bio->bi_rw,
6900 (unsigned long long)bio->bi_sector, bio->bi_size, err); 7073 (unsigned long long)bio->bi_iter.bi_sector,
7074 bio->bi_iter.bi_size, err);
6901 dip->errors = 1; 7075 dip->errors = 1;
6902 7076
6903 /* 7077 /*
@@ -6988,7 +7162,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6988 struct bio *bio; 7162 struct bio *bio;
6989 struct bio *orig_bio = dip->orig_bio; 7163 struct bio *orig_bio = dip->orig_bio;
6990 struct bio_vec *bvec = orig_bio->bi_io_vec; 7164 struct bio_vec *bvec = orig_bio->bi_io_vec;
6991 u64 start_sector = orig_bio->bi_sector; 7165 u64 start_sector = orig_bio->bi_iter.bi_sector;
6992 u64 file_offset = dip->logical_offset; 7166 u64 file_offset = dip->logical_offset;
6993 u64 submit_len = 0; 7167 u64 submit_len = 0;
6994 u64 map_length; 7168 u64 map_length;
@@ -6996,7 +7170,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6996 int ret = 0; 7170 int ret = 0;
6997 int async_submit = 0; 7171 int async_submit = 0;
6998 7172
6999 map_length = orig_bio->bi_size; 7173 map_length = orig_bio->bi_iter.bi_size;
7000 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, 7174 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
7001 &map_length, NULL, 0); 7175 &map_length, NULL, 0);
7002 if (ret) { 7176 if (ret) {
@@ -7004,7 +7178,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7004 return -EIO; 7178 return -EIO;
7005 } 7179 }
7006 7180
7007 if (map_length >= orig_bio->bi_size) { 7181 if (map_length >= orig_bio->bi_iter.bi_size) {
7008 bio = orig_bio; 7182 bio = orig_bio;
7009 goto submit; 7183 goto submit;
7010 } 7184 }
@@ -7056,7 +7230,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7056 bio->bi_private = dip; 7230 bio->bi_private = dip;
7057 bio->bi_end_io = btrfs_end_dio_bio; 7231 bio->bi_end_io = btrfs_end_dio_bio;
7058 7232
7059 map_length = orig_bio->bi_size; 7233 map_length = orig_bio->bi_iter.bi_size;
7060 ret = btrfs_map_block(root->fs_info, rw, 7234 ret = btrfs_map_block(root->fs_info, rw,
7061 start_sector << 9, 7235 start_sector << 9,
7062 &map_length, NULL, 0); 7236 &map_length, NULL, 0);
@@ -7114,7 +7288,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7114 7288
7115 if (!skip_sum && !write) { 7289 if (!skip_sum && !write) {
7116 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 7290 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
7117 sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits; 7291 sum_len = dio_bio->bi_iter.bi_size >>
7292 inode->i_sb->s_blocksize_bits;
7118 sum_len *= csum_size; 7293 sum_len *= csum_size;
7119 } else { 7294 } else {
7120 sum_len = 0; 7295 sum_len = 0;
@@ -7129,8 +7304,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7129 dip->private = dio_bio->bi_private; 7304 dip->private = dio_bio->bi_private;
7130 dip->inode = inode; 7305 dip->inode = inode;
7131 dip->logical_offset = file_offset; 7306 dip->logical_offset = file_offset;
7132 dip->bytes = dio_bio->bi_size; 7307 dip->bytes = dio_bio->bi_iter.bi_size;
7133 dip->disk_bytenr = (u64)dio_bio->bi_sector << 9; 7308 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
7134 io_bio->bi_private = dip; 7309 io_bio->bi_private = dip;
7135 dip->errors = 0; 7310 dip->errors = 0;
7136 dip->orig_bio = io_bio; 7311 dip->orig_bio = io_bio;
@@ -7367,6 +7542,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7367 struct extent_state *cached_state = NULL; 7542 struct extent_state *cached_state = NULL;
7368 u64 page_start = page_offset(page); 7543 u64 page_start = page_offset(page);
7369 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 7544 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
7545 int inode_evicting = inode->i_state & I_FREEING;
7370 7546
7371 /* 7547 /*
7372 * we have the page locked, so new writeback can't start, 7548 * we have the page locked, so new writeback can't start,
@@ -7382,17 +7558,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7382 btrfs_releasepage(page, GFP_NOFS); 7558 btrfs_releasepage(page, GFP_NOFS);
7383 return; 7559 return;
7384 } 7560 }
7385 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 7561
7386 ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); 7562 if (!inode_evicting)
7563 lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
7564 ordered = btrfs_lookup_ordered_extent(inode, page_start);
7387 if (ordered) { 7565 if (ordered) {
7388 /* 7566 /*
7389 * IO on this page will never be started, so we need 7567 * IO on this page will never be started, so we need
7390 * to account for any ordered extents now 7568 * to account for any ordered extents now
7391 */ 7569 */
7392 clear_extent_bit(tree, page_start, page_end, 7570 if (!inode_evicting)
7393 EXTENT_DIRTY | EXTENT_DELALLOC | 7571 clear_extent_bit(tree, page_start, page_end,
7394 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 7572 EXTENT_DIRTY | EXTENT_DELALLOC |
7395 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); 7573 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7574 EXTENT_DEFRAG, 1, 0, &cached_state,
7575 GFP_NOFS);
7396 /* 7576 /*
7397 * whoever cleared the private bit is responsible 7577 * whoever cleared the private bit is responsible
7398 * for the finish_ordered_io 7578 * for the finish_ordered_io
@@ -7416,14 +7596,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7416 btrfs_finish_ordered_io(ordered); 7596 btrfs_finish_ordered_io(ordered);
7417 } 7597 }
7418 btrfs_put_ordered_extent(ordered); 7598 btrfs_put_ordered_extent(ordered);
7419 cached_state = NULL; 7599 if (!inode_evicting) {
7420 lock_extent_bits(tree, page_start, page_end, 0, &cached_state); 7600 cached_state = NULL;
7601 lock_extent_bits(tree, page_start, page_end, 0,
7602 &cached_state);
7603 }
7604 }
7605
7606 if (!inode_evicting) {
7607 clear_extent_bit(tree, page_start, page_end,
7608 EXTENT_LOCKED | EXTENT_DIRTY |
7609 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
7610 EXTENT_DEFRAG, 1, 1,
7611 &cached_state, GFP_NOFS);
7612
7613 __btrfs_releasepage(page, GFP_NOFS);
7421 } 7614 }
7422 clear_extent_bit(tree, page_start, page_end,
7423 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
7424 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
7425 &cached_state, GFP_NOFS);
7426 __btrfs_releasepage(page, GFP_NOFS);
7427 7615
7428 ClearPageChecked(page); 7616 ClearPageChecked(page);
7429 if (PagePrivate(page)) { 7617 if (PagePrivate(page)) {
@@ -7733,7 +7921,9 @@ out:
7733 * create a new subvolume directory/inode (helper for the ioctl). 7921 * create a new subvolume directory/inode (helper for the ioctl).
7734 */ 7922 */
7735int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 7923int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
7736 struct btrfs_root *new_root, u64 new_dirid) 7924 struct btrfs_root *new_root,
7925 struct btrfs_root *parent_root,
7926 u64 new_dirid)
7737{ 7927{
7738 struct inode *inode; 7928 struct inode *inode;
7739 int err; 7929 int err;
@@ -7751,6 +7941,12 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
7751 set_nlink(inode, 1); 7941 set_nlink(inode, 1);
7752 btrfs_i_size_write(inode, 0); 7942 btrfs_i_size_write(inode, 0);
7753 7943
7944 err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
7945 if (err)
7946 btrfs_err(new_root->fs_info,
7947 "error inheriting subvolume %llu properties: %d\n",
7948 new_root->root_key.objectid, err);
7949
7754 err = btrfs_update_inode(trans, new_root, inode); 7950 err = btrfs_update_inode(trans, new_root, inode);
7755 7951
7756 iput(inode); 7952 iput(inode);
@@ -7776,6 +7972,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
7776 ei->flags = 0; 7972 ei->flags = 0;
7777 ei->csum_bytes = 0; 7973 ei->csum_bytes = 0;
7778 ei->index_cnt = (u64)-1; 7974 ei->index_cnt = (u64)-1;
7975 ei->dir_index = 0;
7779 ei->last_unlink_trans = 0; 7976 ei->last_unlink_trans = 0;
7780 ei->last_log_commit = 0; 7977 ei->last_log_commit = 0;
7781 7978
@@ -8063,6 +8260,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8063 if (ret) 8260 if (ret)
8064 goto out_fail; 8261 goto out_fail;
8065 8262
8263 BTRFS_I(old_inode)->dir_index = 0ULL;
8066 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { 8264 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8067 /* force full log commit if subvolume involved. */ 8265 /* force full log commit if subvolume involved. */
8068 root->fs_info->last_trans_log_full_commit = trans->transid; 8266 root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -8151,6 +8349,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8151 goto out_fail; 8349 goto out_fail;
8152 } 8350 }
8153 8351
8352 if (old_inode->i_nlink == 1)
8353 BTRFS_I(old_inode)->dir_index = index;
8354
8154 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { 8355 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
8155 struct dentry *parent = new_dentry->d_parent; 8356 struct dentry *parent = new_dentry->d_parent;
8156 btrfs_log_new_name(trans, old_inode, old_dir, parent); 8357 btrfs_log_new_name(trans, old_inode, old_dir, parent);
@@ -8286,7 +8487,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8286{ 8487{
8287 int ret; 8488 int ret;
8288 8489
8289 if (root->fs_info->sb->s_flags & MS_RDONLY) 8490 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
8290 return -EROFS; 8491 return -EROFS;
8291 8492
8292 ret = __start_delalloc_inodes(root, delay_iput); 8493 ret = __start_delalloc_inodes(root, delay_iput);
@@ -8312,7 +8513,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8312 struct list_head splice; 8513 struct list_head splice;
8313 int ret; 8514 int ret;
8314 8515
8315 if (fs_info->sb->s_flags & MS_RDONLY) 8516 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
8316 return -EROFS; 8517 return -EROFS;
8317 8518
8318 INIT_LIST_HEAD(&splice); 8519 INIT_LIST_HEAD(&splice);
@@ -8649,12 +8850,14 @@ static const struct inode_operations btrfs_dir_inode_operations = {
8649 .removexattr = btrfs_removexattr, 8850 .removexattr = btrfs_removexattr,
8650 .permission = btrfs_permission, 8851 .permission = btrfs_permission,
8651 .get_acl = btrfs_get_acl, 8852 .get_acl = btrfs_get_acl,
8853 .set_acl = btrfs_set_acl,
8652 .update_time = btrfs_update_time, 8854 .update_time = btrfs_update_time,
8653}; 8855};
8654static const struct inode_operations btrfs_dir_ro_inode_operations = { 8856static const struct inode_operations btrfs_dir_ro_inode_operations = {
8655 .lookup = btrfs_lookup, 8857 .lookup = btrfs_lookup,
8656 .permission = btrfs_permission, 8858 .permission = btrfs_permission,
8657 .get_acl = btrfs_get_acl, 8859 .get_acl = btrfs_get_acl,
8860 .set_acl = btrfs_set_acl,
8658 .update_time = btrfs_update_time, 8861 .update_time = btrfs_update_time,
8659}; 8862};
8660 8863
@@ -8724,6 +8927,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
8724 .permission = btrfs_permission, 8927 .permission = btrfs_permission,
8725 .fiemap = btrfs_fiemap, 8928 .fiemap = btrfs_fiemap,
8726 .get_acl = btrfs_get_acl, 8929 .get_acl = btrfs_get_acl,
8930 .set_acl = btrfs_set_acl,
8727 .update_time = btrfs_update_time, 8931 .update_time = btrfs_update_time,
8728}; 8932};
8729static const struct inode_operations btrfs_special_inode_operations = { 8933static const struct inode_operations btrfs_special_inode_operations = {
@@ -8735,6 +8939,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
8735 .listxattr = btrfs_listxattr, 8939 .listxattr = btrfs_listxattr,
8736 .removexattr = btrfs_removexattr, 8940 .removexattr = btrfs_removexattr,
8737 .get_acl = btrfs_get_acl, 8941 .get_acl = btrfs_get_acl,
8942 .set_acl = btrfs_set_acl,
8738 .update_time = btrfs_update_time, 8943 .update_time = btrfs_update_time,
8739}; 8944};
8740static const struct inode_operations btrfs_symlink_inode_operations = { 8945static const struct inode_operations btrfs_symlink_inode_operations = {
@@ -8748,7 +8953,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
8748 .getxattr = btrfs_getxattr, 8953 .getxattr = btrfs_getxattr,
8749 .listxattr = btrfs_listxattr, 8954 .listxattr = btrfs_listxattr,
8750 .removexattr = btrfs_removexattr, 8955 .removexattr = btrfs_removexattr,
8751 .get_acl = btrfs_get_acl,
8752 .update_time = btrfs_update_time, 8956 .update_time = btrfs_update_time,
8753}; 8957};
8754 8958
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 21da5762b0b1..a6d8efa46bfe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -56,6 +56,8 @@
56#include "rcu-string.h" 56#include "rcu-string.h"
57#include "send.h" 57#include "send.h"
58#include "dev-replace.h" 58#include "dev-replace.h"
59#include "props.h"
60#include "sysfs.h"
59 61
60static int btrfs_clone(struct inode *src, struct inode *inode, 62static int btrfs_clone(struct inode *src, struct inode *inode,
61 u64 off, u64 olen, u64 olen_aligned, u64 destoff); 63 u64 off, u64 olen, u64 olen_aligned, u64 destoff);
@@ -190,6 +192,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
190 unsigned int i_oldflags; 192 unsigned int i_oldflags;
191 umode_t mode; 193 umode_t mode;
192 194
195 if (!inode_owner_or_capable(inode))
196 return -EPERM;
197
193 if (btrfs_root_readonly(root)) 198 if (btrfs_root_readonly(root))
194 return -EROFS; 199 return -EROFS;
195 200
@@ -200,9 +205,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
200 if (ret) 205 if (ret)
201 return ret; 206 return ret;
202 207
203 if (!inode_owner_or_capable(inode))
204 return -EACCES;
205
206 ret = mnt_want_write_file(file); 208 ret = mnt_want_write_file(file);
207 if (ret) 209 if (ret)
208 return ret; 210 return ret;
@@ -280,9 +282,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
280 if (flags & FS_NOCOMP_FL) { 282 if (flags & FS_NOCOMP_FL) {
281 ip->flags &= ~BTRFS_INODE_COMPRESS; 283 ip->flags &= ~BTRFS_INODE_COMPRESS;
282 ip->flags |= BTRFS_INODE_NOCOMPRESS; 284 ip->flags |= BTRFS_INODE_NOCOMPRESS;
285
286 ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
287 if (ret && ret != -ENODATA)
288 goto out_drop;
283 } else if (flags & FS_COMPR_FL) { 289 } else if (flags & FS_COMPR_FL) {
290 const char *comp;
291
284 ip->flags |= BTRFS_INODE_COMPRESS; 292 ip->flags |= BTRFS_INODE_COMPRESS;
285 ip->flags &= ~BTRFS_INODE_NOCOMPRESS; 293 ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
294
295 if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
296 comp = "lzo";
297 else
298 comp = "zlib";
299 ret = btrfs_set_prop(inode, "btrfs.compression",
300 comp, strlen(comp), 0);
301 if (ret)
302 goto out_drop;
303
286 } else { 304 } else {
287 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 305 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
288 } 306 }
@@ -392,6 +410,7 @@ static noinline int create_subvol(struct inode *dir,
392 struct btrfs_root *new_root; 410 struct btrfs_root *new_root;
393 struct btrfs_block_rsv block_rsv; 411 struct btrfs_block_rsv block_rsv;
394 struct timespec cur_time = CURRENT_TIME; 412 struct timespec cur_time = CURRENT_TIME;
413 struct inode *inode;
395 int ret; 414 int ret;
396 int err; 415 int err;
397 u64 objectid; 416 u64 objectid;
@@ -417,7 +436,9 @@ static noinline int create_subvol(struct inode *dir,
417 trans = btrfs_start_transaction(root, 0); 436 trans = btrfs_start_transaction(root, 0);
418 if (IS_ERR(trans)) { 437 if (IS_ERR(trans)) {
419 ret = PTR_ERR(trans); 438 ret = PTR_ERR(trans);
420 goto out; 439 btrfs_subvolume_release_metadata(root, &block_rsv,
440 qgroup_reserved);
441 return ret;
421 } 442 }
422 trans->block_rsv = &block_rsv; 443 trans->block_rsv = &block_rsv;
423 trans->bytes_reserved = block_rsv.size; 444 trans->bytes_reserved = block_rsv.size;
@@ -500,7 +521,7 @@ static noinline int create_subvol(struct inode *dir,
500 521
501 btrfs_record_root_in_trans(trans, new_root); 522 btrfs_record_root_in_trans(trans, new_root);
502 523
503 ret = btrfs_create_subvol_root(trans, new_root, new_dirid); 524 ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
504 if (ret) { 525 if (ret) {
505 /* We potentially lose an unused inode item here */ 526 /* We potentially lose an unused inode item here */
506 btrfs_abort_transaction(trans, root, ret); 527 btrfs_abort_transaction(trans, root, ret);
@@ -542,6 +563,8 @@ static noinline int create_subvol(struct inode *dir,
542fail: 563fail:
543 trans->block_rsv = NULL; 564 trans->block_rsv = NULL;
544 trans->bytes_reserved = 0; 565 trans->bytes_reserved = 0;
566 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
567
545 if (async_transid) { 568 if (async_transid) {
546 *async_transid = trans->transid; 569 *async_transid = trans->transid;
547 err = btrfs_commit_transaction_async(trans, root, 1); 570 err = btrfs_commit_transaction_async(trans, root, 1);
@@ -553,10 +576,12 @@ fail:
553 if (err && !ret) 576 if (err && !ret)
554 ret = err; 577 ret = err;
555 578
556 if (!ret) 579 if (!ret) {
557 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 580 inode = btrfs_lookup_dentry(dir, dentry);
558out: 581 if (IS_ERR(inode))
559 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); 582 return PTR_ERR(inode);
583 d_instantiate(dentry, inode);
584 }
560 return ret; 585 return ret;
561} 586}
562 587
@@ -642,7 +667,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
642 ret = PTR_ERR(inode); 667 ret = PTR_ERR(inode);
643 goto fail; 668 goto fail;
644 } 669 }
645 BUG_ON(!inode); 670
646 d_instantiate(dentry, inode); 671 d_instantiate(dentry, inode);
647 ret = 0; 672 ret = 0;
648fail: 673fail:
@@ -1011,7 +1036,7 @@ out:
1011static int cluster_pages_for_defrag(struct inode *inode, 1036static int cluster_pages_for_defrag(struct inode *inode,
1012 struct page **pages, 1037 struct page **pages,
1013 unsigned long start_index, 1038 unsigned long start_index,
1014 int num_pages) 1039 unsigned long num_pages)
1015{ 1040{
1016 unsigned long file_end; 1041 unsigned long file_end;
1017 u64 isize = i_size_read(inode); 1042 u64 isize = i_size_read(inode);
@@ -1169,8 +1194,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1169 int defrag_count = 0; 1194 int defrag_count = 0;
1170 int compress_type = BTRFS_COMPRESS_ZLIB; 1195 int compress_type = BTRFS_COMPRESS_ZLIB;
1171 int extent_thresh = range->extent_thresh; 1196 int extent_thresh = range->extent_thresh;
1172 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 1197 unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
1173 int cluster = max_cluster; 1198 unsigned long cluster = max_cluster;
1174 u64 new_align = ~((u64)128 * 1024 - 1); 1199 u64 new_align = ~((u64)128 * 1024 - 1);
1175 struct page **pages = NULL; 1200 struct page **pages = NULL;
1176 1201
@@ -1254,7 +1279,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1254 break; 1279 break;
1255 1280
1256 if (btrfs_defrag_cancelled(root->fs_info)) { 1281 if (btrfs_defrag_cancelled(root->fs_info)) {
1257 printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); 1282 printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
1258 ret = -EAGAIN; 1283 ret = -EAGAIN;
1259 break; 1284 break;
1260 } 1285 }
@@ -1416,20 +1441,20 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1416 ret = -EINVAL; 1441 ret = -EINVAL;
1417 goto out_free; 1442 goto out_free;
1418 } 1443 }
1419 printk(KERN_INFO "btrfs: resizing devid %llu\n", devid); 1444 btrfs_info(root->fs_info, "resizing devid %llu", devid);
1420 } 1445 }
1421 1446
1422 device = btrfs_find_device(root->fs_info, devid, NULL, NULL); 1447 device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
1423 if (!device) { 1448 if (!device) {
1424 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1449 btrfs_info(root->fs_info, "resizer unable to find device %llu",
1425 devid); 1450 devid);
1426 ret = -ENODEV; 1451 ret = -ENODEV;
1427 goto out_free; 1452 goto out_free;
1428 } 1453 }
1429 1454
1430 if (!device->writeable) { 1455 if (!device->writeable) {
1431 printk(KERN_INFO "btrfs: resizer unable to apply on " 1456 btrfs_info(root->fs_info,
1432 "readonly device %llu\n", 1457 "resizer unable to apply on readonly device %llu",
1433 devid); 1458 devid);
1434 ret = -EPERM; 1459 ret = -EPERM;
1435 goto out_free; 1460 goto out_free;
@@ -1466,6 +1491,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1466 } 1491 }
1467 new_size = old_size - new_size; 1492 new_size = old_size - new_size;
1468 } else if (mod > 0) { 1493 } else if (mod > 0) {
1494 if (new_size > ULLONG_MAX - old_size) {
1495 ret = -EINVAL;
1496 goto out_free;
1497 }
1469 new_size = old_size + new_size; 1498 new_size = old_size + new_size;
1470 } 1499 }
1471 1500
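
The new check guards the grow case against u64 wrap-around: old_size + new_size could overflow and sail past the later validation with a tiny bogus size. The standard test, demonstrated standalone (the helper name is invented for the example):

	#include <limits.h>
	#include <stdio.h>

	/* hypothetical helper mirroring the guard: reject an addition
	 * that would wrap instead of letting it silently truncate */
	static int add_u64_checked(unsigned long long a, unsigned long long b,
				   unsigned long long *sum)
	{
		if (b > ULLONG_MAX - a)
			return -1;	/* mirrors the -EINVAL path above */
		*sum = a + b;
		return 0;
	}

	int main(void)
	{
		unsigned long long size;

		if (add_u64_checked(ULLONG_MAX - 5, 10, &size))
			printf("rejected: grow amount would overflow u64\n");
		if (!add_u64_checked(100, 10, &size))
			printf("ok: new size %llu\n", size);
		return 0;
	}
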
@@ -1481,7 +1510,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1481 do_div(new_size, root->sectorsize); 1510 do_div(new_size, root->sectorsize);
1482 new_size *= root->sectorsize; 1511 new_size *= root->sectorsize;
1483 1512
1484 printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n", 1513 printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
1485 rcu_str_deref(device->name), new_size); 1514 rcu_str_deref(device->name), new_size);
1486 1515
1487 if (new_size > old_size) { 1516 if (new_size > old_size) {
@@ -1542,9 +1571,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1542 1571
1543 src_inode = file_inode(src.file); 1572 src_inode = file_inode(src.file);
1544 if (src_inode->i_sb != file_inode(file)->i_sb) { 1573 if (src_inode->i_sb != file_inode(file)->i_sb) {
1545 printk(KERN_INFO "btrfs: Snapshot src from " 1574 btrfs_info(BTRFS_I(src_inode)->root->fs_info,
1546 "another FS\n"); 1575 "Snapshot src from another FS");
1547 ret = -EINVAL; 1576 ret = -EINVAL;
1577 } else if (!inode_owner_or_capable(src_inode)) {
1578 /*
1579 * Subvolume creation is not restricted, but snapshots
1580 * are limited to own subvolumes only
1581 */
1582 ret = -EPERM;
1548 } else { 1583 } else {
1549 ret = btrfs_mksubvol(&file->f_path, name, namelen, 1584 ret = btrfs_mksubvol(&file->f_path, name, namelen,
1550 BTRFS_I(src_inode)->root, 1585 BTRFS_I(src_inode)->root,
@@ -1662,6 +1697,9 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1662 u64 flags; 1697 u64 flags;
1663 int ret = 0; 1698 int ret = 0;
1664 1699
1700 if (!inode_owner_or_capable(inode))
1701 return -EPERM;
1702
1665 ret = mnt_want_write_file(file); 1703 ret = mnt_want_write_file(file);
1666 if (ret) 1704 if (ret)
1667 goto out; 1705 goto out;
@@ -1686,11 +1724,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1686 goto out_drop_write; 1724 goto out_drop_write;
1687 } 1725 }
1688 1726
1689 if (!inode_owner_or_capable(inode)) {
1690 ret = -EACCES;
1691 goto out_drop_write;
1692 }
1693
1694 down_write(&root->fs_info->subvol_sem); 1727 down_write(&root->fs_info->subvol_sem);
1695 1728
1696 /* nothing to do */ 1729 /* nothing to do */
@@ -1698,12 +1731,28 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1698 goto out_drop_sem; 1731 goto out_drop_sem;
1699 1732
1700 root_flags = btrfs_root_flags(&root->root_item); 1733 root_flags = btrfs_root_flags(&root->root_item);
1701 if (flags & BTRFS_SUBVOL_RDONLY) 1734 if (flags & BTRFS_SUBVOL_RDONLY) {
1702 btrfs_set_root_flags(&root->root_item, 1735 btrfs_set_root_flags(&root->root_item,
1703 root_flags | BTRFS_ROOT_SUBVOL_RDONLY); 1736 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1704 else 1737 } else {
1705 btrfs_set_root_flags(&root->root_item, 1738 /*
1739 * Block RO -> RW transition if this subvolume is involved in
1740 * send
1741 */
1742 spin_lock(&root->root_item_lock);
1743 if (root->send_in_progress == 0) {
1744 btrfs_set_root_flags(&root->root_item,
1706 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); 1745 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1746 spin_unlock(&root->root_item_lock);
1747 } else {
1748 spin_unlock(&root->root_item_lock);
1749 btrfs_warn(root->fs_info,
1750 "Attempt to set subvolume %llu read-write during send",
1751 root->root_key.objectid);
1752 ret = -EPERM;
1753 goto out_drop_sem;
1754 }
1755 }
1707 1756
1708 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 1);
1709 if (IS_ERR(trans)) { 1758 if (IS_ERR(trans)) {
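
The RO -> RW gate only works because the send_in_progress check and the flag update happen under the same root_item_lock that the send code holds while bumping its counter; checking outside the lock would race with a sender starting up. A compact model of that check-then-commit pattern (toy globals, a pthread mutex in place of the spinlock):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t root_item_lock = PTHREAD_MUTEX_INITIALIZER;
	static int send_in_progress;
	static int rdonly = 1;

	static int set_rw(void)
	{
		int ret = 0;

		pthread_mutex_lock(&root_item_lock);
		if (send_in_progress == 0)
			rdonly = 0;	/* no sender can begin while we hold the lock */
		else
			ret = -1;	/* mirrors the -EPERM path above */
		pthread_mutex_unlock(&root_item_lock);
		return ret;
	}

	int main(void)
	{
		int ret;

		send_in_progress = 1;
		printf("set_rw during send: %d\n", set_rw());
		send_in_progress = 0;
		ret = set_rw();
		printf("set_rw when idle: %d (rdonly=%d)\n", ret, rdonly);
		return 0;
	}
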
@@ -1910,7 +1959,7 @@ static noinline int search_ioctl(struct inode *inode,
1910 key.offset = (u64)-1; 1959 key.offset = (u64)-1;
1911 root = btrfs_read_fs_root_no_name(info, &key); 1960 root = btrfs_read_fs_root_no_name(info, &key);
1912 if (IS_ERR(root)) { 1961 if (IS_ERR(root)) {
1913 printk(KERN_ERR "could not find root %llu\n", 1962 printk(KERN_ERR "BTRFS: could not find root %llu\n",
1914 sk->tree_id); 1963 sk->tree_id);
1915 btrfs_free_path(path); 1964 btrfs_free_path(path);
1916 return -ENOENT; 1965 return -ENOENT;
@@ -2000,7 +2049,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
2000 key.offset = (u64)-1; 2049 key.offset = (u64)-1;
2001 root = btrfs_read_fs_root_no_name(info, &key); 2050 root = btrfs_read_fs_root_no_name(info, &key);
2002 if (IS_ERR(root)) { 2051 if (IS_ERR(root)) {
2003 printk(KERN_ERR "could not find root %llu\n", tree_id); 2052 printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
2004 ret = -ENOENT; 2053 ret = -ENOENT;
2005 goto out; 2054 goto out;
2006 } 2055 }
@@ -2686,14 +2735,11 @@ out_unlock:
2686#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) 2735#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
2687 2736
2688static long btrfs_ioctl_file_extent_same(struct file *file, 2737static long btrfs_ioctl_file_extent_same(struct file *file,
2689 void __user *argp) 2738 struct btrfs_ioctl_same_args __user *argp)
2690{ 2739{
2691 struct btrfs_ioctl_same_args tmp;
2692 struct btrfs_ioctl_same_args *same; 2740 struct btrfs_ioctl_same_args *same;
2693 struct btrfs_ioctl_same_extent_info *info; 2741 struct btrfs_ioctl_same_extent_info *info;
2694 struct inode *src = file->f_dentry->d_inode; 2742 struct inode *src = file_inode(file);
2695 struct file *dst_file = NULL;
2696 struct inode *dst;
2697 u64 off; 2743 u64 off;
2698 u64 len; 2744 u64 len;
2699 int i; 2745 int i;
@@ -2701,6 +2747,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
2701 unsigned long size; 2747 unsigned long size;
2702 u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; 2748 u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
2703 bool is_admin = capable(CAP_SYS_ADMIN); 2749 bool is_admin = capable(CAP_SYS_ADMIN);
2750 u16 count;
2704 2751
2705 if (!(file->f_mode & FMODE_READ)) 2752 if (!(file->f_mode & FMODE_READ))
2706 return -EINVAL; 2753 return -EINVAL;
@@ -2709,17 +2756,14 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
2709 if (ret) 2756 if (ret)
2710 return ret; 2757 return ret;
2711 2758
2712 if (copy_from_user(&tmp, 2759 if (get_user(count, &argp->dest_count)) {
2713 (struct btrfs_ioctl_same_args __user *)argp,
2714 sizeof(tmp))) {
2715 ret = -EFAULT; 2760 ret = -EFAULT;
2716 goto out; 2761 goto out;
2717 } 2762 }
2718 2763
2719 size = sizeof(tmp) + 2764 size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
2720 tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info);
2721 2765
2722 same = memdup_user((struct btrfs_ioctl_same_args __user *)argp, size); 2766 same = memdup_user(argp, size);
2723 2767
2724 if (IS_ERR(same)) { 2768 if (IS_ERR(same)) {
2725 ret = PTR_ERR(same); 2769 ret = PTR_ERR(same);
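
The rewritten size computation first reads only dest_count with get_user(), then sizes the whole variable-length argument with offsetof(..., info[count]) instead of the hand-rolled sizeof-plus-multiply. A standalone illustration with stand-in struct layouts (the field set is trimmed, and the variable index inside offsetof() relies on the common GCC/Clang behavior the kernel also assumes):

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	/* stand-in layouts; the real structs live in the btrfs ioctl ABI */
	struct same_extent_info {
		int64_t fd;
		uint64_t logical_offset;
		int32_t status;
	};
	struct same_args {
		uint64_t logical_offset;
		uint64_t length;
		uint16_t dest_count;
		struct same_extent_info info[];	/* flexible array member */
	};

	int main(void)
	{
		uint16_t count = 3;
		/* header plus `count` trailing elements, in one expression */
		size_t size = offsetof(struct same_args, info[count]);

		printf("copy %zu bytes for %u destinations\n",
		       size, (unsigned)count);
		return 0;
	}
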
@@ -2756,52 +2800,35 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
2756 goto out; 2800 goto out;
2757 2801
2758 /* pre-format output fields to sane values */ 2802 /* pre-format output fields to sane values */
2759 for (i = 0; i < same->dest_count; i++) { 2803 for (i = 0; i < count; i++) {
2760 same->info[i].bytes_deduped = 0ULL; 2804 same->info[i].bytes_deduped = 0ULL;
2761 same->info[i].status = 0; 2805 same->info[i].status = 0;
2762 } 2806 }
2763 2807
2764 ret = 0; 2808 for (i = 0, info = same->info; i < count; i++, info++) {
2765 for (i = 0; i < same->dest_count; i++) { 2809 struct inode *dst;
2766 info = &same->info[i]; 2810 struct fd dst_file = fdget(info->fd);
2767 2811 if (!dst_file.file) {
2768 dst_file = fget(info->fd);
2769 if (!dst_file) {
2770 info->status = -EBADF; 2812 info->status = -EBADF;
2771 goto next; 2813 continue;
2772 } 2814 }
2815 dst = file_inode(dst_file.file);
2773 2816
2774 if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { 2817 if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
2775 info->status = -EINVAL; 2818 info->status = -EINVAL;
2776 goto next; 2819 } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
2777 } 2820 info->status = -EXDEV;
2778 2821 } else if (S_ISDIR(dst->i_mode)) {
2779 info->status = -EXDEV;
2780 if (file->f_path.mnt != dst_file->f_path.mnt)
2781 goto next;
2782
2783 dst = dst_file->f_dentry->d_inode;
2784 if (src->i_sb != dst->i_sb)
2785 goto next;
2786
2787 if (S_ISDIR(dst->i_mode)) {
2788 info->status = -EISDIR; 2822 info->status = -EISDIR;
2789 goto next; 2823 } else if (!S_ISREG(dst->i_mode)) {
2790 }
2791
2792 if (!S_ISREG(dst->i_mode)) {
2793 info->status = -EACCES; 2824 info->status = -EACCES;
2794 goto next; 2825 } else {
2826 info->status = btrfs_extent_same(src, off, len, dst,
2827 info->logical_offset);
2828 if (info->status == 0)
2829 info->bytes_deduped += len;
2795 } 2830 }
2796 2831 fdput(dst_file);
2797 info->status = btrfs_extent_same(src, off, len, dst,
2798 info->logical_offset);
2799 if (info->status == 0)
2800 info->bytes_deduped += len;
2801
2802next:
2803 if (dst_file)
2804 fput(dst_file);
2805 } 2832 }
2806 2833
2807 ret = copy_to_user(argp, same, size); 2834 ret = copy_to_user(argp, same, size);
@@ -2860,12 +2887,14 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
2860 * note the key will change type as we walk through the 2887 * note the key will change type as we walk through the
2861 * tree. 2888 * tree.
2862 */ 2889 */
2890 path->leave_spinning = 1;
2863 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 2891 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
2864 0, 0); 2892 0, 0);
2865 if (ret < 0) 2893 if (ret < 0)
2866 goto out; 2894 goto out;
2867 2895
2868 nritems = btrfs_header_nritems(path->nodes[0]); 2896 nritems = btrfs_header_nritems(path->nodes[0]);
2897process_slot:
2869 if (path->slots[0] >= nritems) { 2898 if (path->slots[0] >= nritems) {
2870 ret = btrfs_next_leaf(BTRFS_I(src)->root, path); 2899 ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
2871 if (ret < 0) 2900 if (ret < 0)
@@ -2892,11 +2921,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
2892 u8 comp; 2921 u8 comp;
2893 u64 endoff; 2922 u64 endoff;
2894 2923
2895 size = btrfs_item_size_nr(leaf, slot);
2896 read_extent_buffer(leaf, buf,
2897 btrfs_item_ptr_offset(leaf, slot),
2898 size);
2899
2900 extent = btrfs_item_ptr(leaf, slot, 2924 extent = btrfs_item_ptr(leaf, slot,
2901 struct btrfs_file_extent_item); 2925 struct btrfs_file_extent_item);
2902 comp = btrfs_file_extent_compression(leaf, extent); 2926 comp = btrfs_file_extent_compression(leaf, extent);
@@ -2915,11 +2939,20 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
2915 datal = btrfs_file_extent_ram_bytes(leaf, 2939 datal = btrfs_file_extent_ram_bytes(leaf,
2916 extent); 2940 extent);
2917 } 2941 }
2918 btrfs_release_path(path);
2919 2942
2920 if (key.offset + datal <= off || 2943 if (key.offset + datal <= off ||
2921 key.offset >= off + len - 1) 2944 key.offset >= off + len - 1) {
2922 goto next; 2945 path->slots[0]++;
2946 goto process_slot;
2947 }
2948
2949 size = btrfs_item_size_nr(leaf, slot);
2950 read_extent_buffer(leaf, buf,
2951 btrfs_item_ptr_offset(leaf, slot),
2952 size);
2953
2954 btrfs_release_path(path);
2955 path->leave_spinning = 0;
2923 2956
2924 memcpy(&new_key, &key, sizeof(new_key)); 2957 memcpy(&new_key, &key, sizeof(new_key));
2925 new_key.objectid = btrfs_ino(inode); 2958 new_key.objectid = btrfs_ino(inode);
@@ -3090,7 +3123,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
3090 } 3123 }
3091 ret = btrfs_end_transaction(trans, root); 3124 ret = btrfs_end_transaction(trans, root);
3092 } 3125 }
3093next:
3094 btrfs_release_path(path); 3126 btrfs_release_path(path);
3095 key.offset++; 3127 key.offset++;
3096 } 3128 }
@@ -3218,9 +3250,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3218 3250
3219 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); 3251 unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
3220out_unlock: 3252out_unlock:
3221 mutex_unlock(&src->i_mutex); 3253 if (!same_inode) {
3222 if (!same_inode) 3254 if (inode < src) {
3223 mutex_unlock(&inode->i_mutex); 3255 mutex_unlock(&src->i_mutex);
3256 mutex_unlock(&inode->i_mutex);
3257 } else {
3258 mutex_unlock(&inode->i_mutex);
3259 mutex_unlock(&src->i_mutex);
3260 }
3261 } else {
3262 mutex_unlock(&src->i_mutex);
3263 }
3224out_fput: 3264out_fput:
3225 fdput(src_file); 3265 fdput(src_file);
3226out_drop_write: 3266out_drop_write:
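
The unlock rework mirrors a lock-ordering rule: when two distinct inodes are involved, their mutexes are taken and released in a fixed order derived from the pointer values, so two clones racing over the same pair cannot deadlock. A minimal sketch of the convention (ordering unrelated pointers with < is the usual kernel idiom, though technically unspecified in portable C):

	#include <pthread.h>
	#include <stdio.h>

	static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		if (a > b) { pthread_mutex_t *t = a; a = b; b = t; }
		pthread_mutex_lock(a);
		if (a != b)
			pthread_mutex_lock(b);
	}

	static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		if (a > b) { pthread_mutex_t *t = a; a = b; b = t; }
		if (a != b)
			pthread_mutex_unlock(b);	/* reverse order */
		pthread_mutex_unlock(a);
	}

	int main(void)
	{
		pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
		pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

		lock_pair(&m2, &m1);	/* argument order no longer matters */
		unlock_pair(&m2, &m1);
		printf("locked and unlocked without deadlock risk\n");
		return 0;
	}
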
@@ -3343,8 +3383,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
3343 if (IS_ERR_OR_NULL(di)) { 3383 if (IS_ERR_OR_NULL(di)) {
3344 btrfs_free_path(path); 3384 btrfs_free_path(path);
3345 btrfs_end_transaction(trans, root); 3385 btrfs_end_transaction(trans, root);
3346 printk(KERN_ERR "Umm, you don't have the default dir item, " 3386 btrfs_err(new_root->fs_info, "Umm, you don't have the default dir"
3347 "this isn't going to work\n"); 3387 "item, this isn't going to work");
3348 ret = -ENOENT; 3388 ret = -ENOENT;
3349 goto out; 3389 goto out;
3350 } 3390 }
@@ -4325,6 +4365,9 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4325 int ret = 0; 4365 int ret = 0;
4326 int received_uuid_changed; 4366 int received_uuid_changed;
4327 4367
4368 if (!inode_owner_or_capable(inode))
4369 return -EPERM;
4370
4328 ret = mnt_want_write_file(file); 4371 ret = mnt_want_write_file(file);
4329 if (ret < 0) 4372 if (ret < 0)
4330 return ret; 4373 return ret;
@@ -4341,11 +4384,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4341 goto out; 4384 goto out;
4342 } 4385 }
4343 4386
4344 if (!inode_owner_or_capable(inode)) {
4345 ret = -EACCES;
4346 goto out;
4347 }
4348
4349 sa = memdup_user(arg, sizeof(*sa)); 4387 sa = memdup_user(arg, sizeof(*sa));
4350 if (IS_ERR(sa)) { 4388 if (IS_ERR(sa)) {
4351 ret = PTR_ERR(sa); 4389 ret = PTR_ERR(sa);
@@ -4431,8 +4469,8 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
4431 len = strnlen(label, BTRFS_LABEL_SIZE); 4469 len = strnlen(label, BTRFS_LABEL_SIZE);
4432 4470
4433 if (len == BTRFS_LABEL_SIZE) { 4471 if (len == BTRFS_LABEL_SIZE) {
4434 pr_warn("btrfs: label is too long, return the first %zu bytes\n", 4472 btrfs_warn(root->fs_info,
4435 --len); 4473 "label is too long, return the first %zu bytes", --len);
4436 } 4474 }
4437 4475
4438 ret = copy_to_user(arg, label, len); 4476 ret = copy_to_user(arg, label, len);
@@ -4455,7 +4493,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
4455 return -EFAULT; 4493 return -EFAULT;
4456 4494
4457 if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { 4495 if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
4458 pr_err("btrfs: unable to set label with more than %d bytes\n", 4496 btrfs_err(root->fs_info, "unable to set label with more than %d bytes",
4459 BTRFS_LABEL_SIZE - 1); 4497 BTRFS_LABEL_SIZE - 1);
4460 return -EINVAL; 4498 return -EINVAL;
4461 } 4499 }
@@ -4473,13 +4511,173 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
4473 spin_lock(&root->fs_info->super_lock); 4511 spin_lock(&root->fs_info->super_lock);
4474 strcpy(super_block->label, label); 4512 strcpy(super_block->label, label);
4475 spin_unlock(&root->fs_info->super_lock); 4513 spin_unlock(&root->fs_info->super_lock);
4476 ret = btrfs_end_transaction(trans, root); 4514 ret = btrfs_commit_transaction(trans, root);
4477 4515
4478out_unlock: 4516out_unlock:
4479 mnt_drop_write_file(file); 4517 mnt_drop_write_file(file);
4480 return ret; 4518 return ret;
4481} 4519}
4482 4520
4521#define INIT_FEATURE_FLAGS(suffix) \
4522 { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \
4523 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
4524 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
4525
4526static int btrfs_ioctl_get_supported_features(struct file *file,
4527 void __user *arg)
4528{
4529 static struct btrfs_ioctl_feature_flags features[3] = {
4530 INIT_FEATURE_FLAGS(SUPP),
4531 INIT_FEATURE_FLAGS(SAFE_SET),
4532 INIT_FEATURE_FLAGS(SAFE_CLEAR)
4533 };
4534
4535 if (copy_to_user(arg, &features, sizeof(features)))
4536 return -EFAULT;
4537
4538 return 0;
4539}
4540
4541static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
4542{
4543 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
4544 struct btrfs_super_block *super_block = root->fs_info->super_copy;
4545 struct btrfs_ioctl_feature_flags features;
4546
4547 features.compat_flags = btrfs_super_compat_flags(super_block);
4548 features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
4549 features.incompat_flags = btrfs_super_incompat_flags(super_block);
4550
4551 if (copy_to_user(arg, &features, sizeof(features)))
4552 return -EFAULT;
4553
4554 return 0;
4555}
4556
4557static int check_feature_bits(struct btrfs_root *root,
4558 enum btrfs_feature_set set,
4559 u64 change_mask, u64 flags, u64 supported_flags,
4560 u64 safe_set, u64 safe_clear)
4561{
4562 const char *type = btrfs_feature_set_names[set];
4563 char *names;
4564 u64 disallowed, unsupported;
4565 u64 set_mask = flags & change_mask;
4566 u64 clear_mask = ~flags & change_mask;
4567
4568 unsupported = set_mask & ~supported_flags;
4569 if (unsupported) {
4570 names = btrfs_printable_features(set, unsupported);
4571 if (names) {
4572 btrfs_warn(root->fs_info,
4573 "this kernel does not support the %s feature bit%s",
4574 names, strchr(names, ',') ? "s" : "");
4575 kfree(names);
4576 } else
4577 btrfs_warn(root->fs_info,
4578 "this kernel does not support %s bits 0x%llx",
4579 type, unsupported);
4580 return -EOPNOTSUPP;
4581 }
4582
4583 disallowed = set_mask & ~safe_set;
4584 if (disallowed) {
4585 names = btrfs_printable_features(set, disallowed);
4586 if (names) {
4587 btrfs_warn(root->fs_info,
4588 "can't set the %s feature bit%s while mounted",
4589 names, strchr(names, ',') ? "s" : "");
4590 kfree(names);
4591 } else
4592 btrfs_warn(root->fs_info,
4593 "can't set %s bits 0x%llx while mounted",
4594 type, disallowed);
4595 return -EPERM;
4596 }
4597
4598 disallowed = clear_mask & ~safe_clear;
4599 if (disallowed) {
4600 names = btrfs_printable_features(set, disallowed);
4601 if (names) {
4602 btrfs_warn(root->fs_info,
4603 "can't clear the %s feature bit%s while mounted",
4604 names, strchr(names, ',') ? "s" : "");
4605 kfree(names);
4606 } else
4607 btrfs_warn(root->fs_info,
4608 "can't clear %s bits 0x%llx while mounted",
4609 type, disallowed);
4610 return -EPERM;
4611 }
4612
4613 return 0;
4614}
4615
4616#define check_feature(root, change_mask, flags, mask_base) \
4617check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \
4618 BTRFS_FEATURE_ ## mask_base ## _SUPP, \
4619 BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \
4620 BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
4621
4622static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
4623{
4624 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
4625 struct btrfs_super_block *super_block = root->fs_info->super_copy;
4626 struct btrfs_ioctl_feature_flags flags[2];
4627 struct btrfs_trans_handle *trans;
4628 u64 newflags;
4629 int ret;
4630
4631 if (!capable(CAP_SYS_ADMIN))
4632 return -EPERM;
4633
4634 if (copy_from_user(flags, arg, sizeof(flags)))
4635 return -EFAULT;
4636
4637 /* Nothing to do */
4638 if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
4639 !flags[0].incompat_flags)
4640 return 0;
4641
4642 ret = check_feature(root, flags[0].compat_flags,
4643 flags[1].compat_flags, COMPAT);
4644 if (ret)
4645 return ret;
4646
4647 ret = check_feature(root, flags[0].compat_ro_flags,
4648 flags[1].compat_ro_flags, COMPAT_RO);
4649 if (ret)
4650 return ret;
4651
4652 ret = check_feature(root, flags[0].incompat_flags,
4653 flags[1].incompat_flags, INCOMPAT);
4654 if (ret)
4655 return ret;
4656
4657 trans = btrfs_start_transaction(root, 0);
4658 if (IS_ERR(trans))
4659 return PTR_ERR(trans);
4660
4661 spin_lock(&root->fs_info->super_lock);
4662 newflags = btrfs_super_compat_flags(super_block);
4663 newflags |= flags[0].compat_flags & flags[1].compat_flags;
4664 newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
4665 btrfs_set_super_compat_flags(super_block, newflags);
4666
4667 newflags = btrfs_super_compat_ro_flags(super_block);
4668 newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
4669 newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
4670 btrfs_set_super_compat_ro_flags(super_block, newflags);
4671
4672 newflags = btrfs_super_incompat_flags(super_block);
4673 newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
4674 newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
4675 btrfs_set_super_incompat_flags(super_block, newflags);
4676 spin_unlock(&root->fs_info->super_lock);
4677
4678 return btrfs_commit_transaction(trans, root);
4679}
4680
4483long btrfs_ioctl(struct file *file, unsigned int 4681long btrfs_ioctl(struct file *file, unsigned int
4484 cmd, unsigned long arg) 4682 cmd, unsigned long arg)
4485{ 4683{
@@ -4598,6 +4796,12 @@ long btrfs_ioctl(struct file *file, unsigned int
4598 return btrfs_ioctl_set_fslabel(file, argp); 4796 return btrfs_ioctl_set_fslabel(file, argp);
4599 case BTRFS_IOC_FILE_EXTENT_SAME: 4797 case BTRFS_IOC_FILE_EXTENT_SAME:
4600 return btrfs_ioctl_file_extent_same(file, argp); 4798 return btrfs_ioctl_file_extent_same(file, argp);
4799 case BTRFS_IOC_GET_SUPPORTED_FEATURES:
4800 return btrfs_ioctl_get_supported_features(file, argp);
4801 case BTRFS_IOC_GET_FEATURES:
4802 return btrfs_ioctl_get_features(file, argp);
4803 case BTRFS_IOC_SET_FEATURES:
4804 return btrfs_ioctl_set_features(file, argp);
4601 } 4805 }
4602 4806
4603 return -ENOTTY; 4807 return -ENOTTY;
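In btrfs_ioctl_set_features() above, flags[0] acts as the mask of bits to change and flags[1] as their requested values; under super_lock each superblock field is rewritten so that masked bits take the new value and all other bits are preserved. The update rule, pulled out as a sketch (helper name invented):

	#include <linux/types.h>

	/* Bits set in 'mask' are forced to the corresponding bit of
	 * 'vals'; everything else keeps its current state. */
	static inline u64 apply_feature_mask(u64 cur, u64 mask, u64 vals)
	{
		cur |= mask & vals;		/* requested set bits */
		cur &= ~(mask & ~vals);		/* requested clear bits */
		return cur;
	}

This is equivalent to (cur & ~mask) | (mask & vals).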
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index b6a6f07c5ce2..b47f669aca75 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -141,7 +141,7 @@ static int lzo_compress_pages(struct list_head *ws,
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, 141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem); 142 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) { 143 if (ret != LZO_E_OK) {
144 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", 144 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
145 ret); 145 ret);
146 ret = -1; 146 ret = -1;
147 goto out; 147 goto out;
@@ -357,7 +357,7 @@ cont:
357 if (need_unmap) 357 if (need_unmap)
358 kunmap(pages_in[page_in_index - 1]); 358 kunmap(pages_in[page_in_index - 1]);
359 if (ret != LZO_E_OK) { 359 if (ret != LZO_E_OK) {
360 printk(KERN_WARNING "btrfs decompress failed\n"); 360 printk(KERN_WARNING "BTRFS: decompress failed\n");
361 ret = -1; 361 ret = -1;
362 break; 362 break;
363 } 363 }
@@ -401,7 +401,7 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
401 out_len = PAGE_CACHE_SIZE; 401 out_len = PAGE_CACHE_SIZE;
402 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); 402 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
403 if (ret != LZO_E_OK) { 403 if (ret != LZO_E_OK) {
404 printk(KERN_WARNING "btrfs decompress failed!\n"); 404 printk(KERN_WARNING "BTRFS: decompress failed!\n");
405 ret = -1; 405 ret = -1;
406 goto out; 406 goto out;
407 } 407 }
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 69582d5b69d1..b16450b840e7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -336,13 +336,14 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
336 entry->len); 336 entry->len);
337 *file_offset = dec_end; 337 *file_offset = dec_end;
338 if (dec_start > dec_end) { 338 if (dec_start > dec_end) {
339 printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", 339 btrfs_crit(BTRFS_I(inode)->root->fs_info,
340 dec_start, dec_end); 340 "bad ordering dec_start %llu end %llu", dec_start, dec_end);
341 } 341 }
342 to_dec = dec_end - dec_start; 342 to_dec = dec_end - dec_start;
343 if (to_dec > entry->bytes_left) { 343 if (to_dec > entry->bytes_left) {
344 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", 344 btrfs_crit(BTRFS_I(inode)->root->fs_info,
345 entry->bytes_left, to_dec); 345 "bad ordered accounting left %llu size %llu",
346 entry->bytes_left, to_dec);
346 } 347 }
347 entry->bytes_left -= to_dec; 348 entry->bytes_left -= to_dec;
348 if (!uptodate) 349 if (!uptodate)
@@ -401,7 +402,8 @@ have_entry:
401 } 402 }
402 403
403 if (io_size > entry->bytes_left) { 404 if (io_size > entry->bytes_left) {
404 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", 405 btrfs_crit(BTRFS_I(inode)->root->fs_info,
406 "bad ordered accounting left %llu size %llu",
405 entry->bytes_left, io_size); 407 entry->bytes_left, io_size);
406 } 408 }
407 entry->bytes_left -= io_size; 409 entry->bytes_left -= io_size;
@@ -520,7 +522,8 @@ void btrfs_remove_ordered_extent(struct inode *inode,
520 spin_lock_irq(&tree->lock); 522 spin_lock_irq(&tree->lock);
521 node = &entry->rb_node; 523 node = &entry->rb_node;
522 rb_erase(node, &tree->tree); 524 rb_erase(node, &tree->tree);
523 tree->last = NULL; 525 if (tree->last == node)
526 tree->last = NULL;
524 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 527 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
525 spin_unlock_irq(&tree->lock); 528 spin_unlock_irq(&tree->lock);
526 529
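The ordered-extent hunk above replaces an unconditional cache wipe: tree->last is a one-entry lookup cache, and it is now cleared only when the node being erased is the cached one, so removing unrelated entries no longer throws a warm cache away. Sketched generically (type and helper names invented):

	#include <linux/rbtree.h>

	struct cached_tree {
		struct rb_root root;
		struct rb_node *last;	/* most recently returned node */
	};

	static void cached_tree_erase(struct cached_tree *t, struct rb_node *n)
	{
		rb_erase(n, &t->root);
		if (t->last == n)	/* invalidate only if it points here */
			t->last = NULL;
	}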
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 24cad1695af7..65793edb38ca 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -69,23 +69,3 @@ out:
69 btrfs_free_path(path); 69 btrfs_free_path(path);
70 return ret; 70 return ret;
71} 71}
72
73int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
74{
75 struct btrfs_path *path;
76 struct btrfs_key key;
77 int ret;
78
79 key.objectid = BTRFS_ORPHAN_OBJECTID;
80 key.type = BTRFS_ORPHAN_ITEM_KEY;
81 key.offset = offset;
82
83 path = btrfs_alloc_path();
84 if (!path)
85 return -ENOMEM;
86
87 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
88
89 btrfs_free_path(path);
90 return ret;
91}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 417053b17181..6efd70d3b64f 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -154,7 +154,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
154 u32 item_size) 154 u32 item_size)
155{ 155{
156 if (!IS_ALIGNED(item_size, sizeof(u64))) { 156 if (!IS_ALIGNED(item_size, sizeof(u64))) {
157 pr_warn("btrfs: uuid item with illegal size %lu!\n", 157 pr_warn("BTRFS: uuid item with illegal size %lu!\n",
158 (unsigned long)item_size); 158 (unsigned long)item_size);
159 return; 159 return;
160 } 160 }
@@ -249,7 +249,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
249 BTRFS_FILE_EXTENT_INLINE) { 249 BTRFS_FILE_EXTENT_INLINE) {
250 printk(KERN_INFO "\t\tinline extent data " 250 printk(KERN_INFO "\t\tinline extent data "
251 "size %u\n", 251 "size %u\n",
252 btrfs_file_extent_inline_len(l, fi)); 252 btrfs_file_extent_inline_len(l, i, fi));
253 break; 253 break;
254 } 254 }
255 printk(KERN_INFO "\t\textent data disk bytenr %llu " 255 printk(KERN_INFO "\t\textent data disk bytenr %llu "
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
new file mode 100644
index 000000000000..129b1dd28527
--- /dev/null
+++ b/fs/btrfs/props.c
@@ -0,0 +1,427 @@
1/*
2 * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/hashtable.h>
20#include "props.h"
21#include "btrfs_inode.h"
22#include "hash.h"
23#include "transaction.h"
24#include "xattr.h"
25
26#define BTRFS_PROP_HANDLERS_HT_BITS 8
27static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
28
29struct prop_handler {
30 struct hlist_node node;
31 const char *xattr_name;
32 int (*validate)(const char *value, size_t len);
33 int (*apply)(struct inode *inode, const char *value, size_t len);
34 const char *(*extract)(struct inode *inode);
35 int inheritable;
36};
37
38static int prop_compression_validate(const char *value, size_t len);
39static int prop_compression_apply(struct inode *inode,
40 const char *value,
41 size_t len);
42static const char *prop_compression_extract(struct inode *inode);
43
44static struct prop_handler prop_handlers[] = {
45 {
46 .xattr_name = XATTR_BTRFS_PREFIX "compression",
47 .validate = prop_compression_validate,
48 .apply = prop_compression_apply,
49 .extract = prop_compression_extract,
50 .inheritable = 1
51 },
52 {
53 .xattr_name = NULL
54 }
55};
56
57void __init btrfs_props_init(void)
58{
59 struct prop_handler *p;
60
61 hash_init(prop_handlers_ht);
62
63 for (p = &prop_handlers[0]; p->xattr_name; p++) {
64 u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
65
66 hash_add(prop_handlers_ht, &p->node, h);
67 }
68}
69
70static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash)
71{
72 struct hlist_head *h;
73
74 h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)];
75 if (hlist_empty(h))
76 return NULL;
77
78 return h;
79}
80
81static const struct prop_handler *
82find_prop_handler(const char *name,
83 const struct hlist_head *handlers)
84{
85 struct prop_handler *h;
86
87 if (!handlers) {
88 u64 hash = btrfs_name_hash(name, strlen(name));
89
90 handlers = find_prop_handlers_by_hash(hash);
91 if (!handlers)
92 return NULL;
93 }
94
95 hlist_for_each_entry(h, handlers, node)
96 if (!strcmp(h->xattr_name, name))
97 return h;
98
99 return NULL;
100}
101
102static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
103 struct inode *inode,
104 const char *name,
105 const char *value,
106 size_t value_len,
107 int flags)
108{
109 const struct prop_handler *handler;
110 int ret;
111
112 if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN)
113 return -EINVAL;
114
115 handler = find_prop_handler(name, NULL);
116 if (!handler)
117 return -EINVAL;
118
119 if (value_len == 0) {
120 ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
121 NULL, 0, flags);
122 if (ret)
123 return ret;
124
125 ret = handler->apply(inode, NULL, 0);
126 ASSERT(ret == 0);
127
128 return ret;
129 }
130
131 ret = handler->validate(value, value_len);
132 if (ret)
133 return ret;
134 ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
135 value, value_len, flags);
136 if (ret)
137 return ret;
138 ret = handler->apply(inode, value, value_len);
139 if (ret) {
140 __btrfs_setxattr(trans, inode, handler->xattr_name,
141 NULL, 0, flags);
142 return ret;
143 }
144
145 set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
146
147 return 0;
148}
149
150int btrfs_set_prop(struct inode *inode,
151 const char *name,
152 const char *value,
153 size_t value_len,
154 int flags)
155{
156 return __btrfs_set_prop(NULL, inode, name, value, value_len, flags);
157}
158
159static int iterate_object_props(struct btrfs_root *root,
160 struct btrfs_path *path,
161 u64 objectid,
162 void (*iterator)(void *,
163 const struct prop_handler *,
164 const char *,
165 size_t),
166 void *ctx)
167{
168 int ret;
169 char *name_buf = NULL;
170 char *value_buf = NULL;
171 int name_buf_len = 0;
172 int value_buf_len = 0;
173
174 while (1) {
175 struct btrfs_key key;
176 struct btrfs_dir_item *di;
177 struct extent_buffer *leaf;
178 u32 total_len, cur, this_len;
179 int slot;
180 const struct hlist_head *handlers;
181
182 slot = path->slots[0];
183 leaf = path->nodes[0];
184
185 if (slot >= btrfs_header_nritems(leaf)) {
186 ret = btrfs_next_leaf(root, path);
187 if (ret < 0)
188 goto out;
189 else if (ret > 0)
190 break;
191 continue;
192 }
193
194 btrfs_item_key_to_cpu(leaf, &key, slot);
195 if (key.objectid != objectid)
196 break;
197 if (key.type != BTRFS_XATTR_ITEM_KEY)
198 break;
199
200 handlers = find_prop_handlers_by_hash(key.offset);
201 if (!handlers)
202 goto next_slot;
203
204 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
205 cur = 0;
206 total_len = btrfs_item_size_nr(leaf, slot);
207
208 while (cur < total_len) {
209 u32 name_len = btrfs_dir_name_len(leaf, di);
210 u32 data_len = btrfs_dir_data_len(leaf, di);
211 unsigned long name_ptr, data_ptr;
212 const struct prop_handler *handler;
213
214 this_len = sizeof(*di) + name_len + data_len;
215 name_ptr = (unsigned long)(di + 1);
216 data_ptr = name_ptr + name_len;
217
218 if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
219 memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
220 name_ptr,
221 XATTR_BTRFS_PREFIX_LEN))
222 goto next_dir_item;
223
224 if (name_len >= name_buf_len) {
225 kfree(name_buf);
226 name_buf_len = name_len + 1;
227 name_buf = kmalloc(name_buf_len, GFP_NOFS);
228 if (!name_buf) {
229 ret = -ENOMEM;
230 goto out;
231 }
232 }
233 read_extent_buffer(leaf, name_buf, name_ptr, name_len);
234 name_buf[name_len] = '\0';
235
236 handler = find_prop_handler(name_buf, handlers);
237 if (!handler)
238 goto next_dir_item;
239
240 if (data_len > value_buf_len) {
241 kfree(value_buf);
242 value_buf_len = data_len;
243 value_buf = kmalloc(data_len, GFP_NOFS);
244 if (!value_buf) {
245 ret = -ENOMEM;
246 goto out;
247 }
248 }
249 read_extent_buffer(leaf, value_buf, data_ptr, data_len);
250
251 iterator(ctx, handler, value_buf, data_len);
252next_dir_item:
253 cur += this_len;
254 di = (struct btrfs_dir_item *)((char *) di + this_len);
255 }
256
257next_slot:
258 path->slots[0]++;
259 }
260
261 ret = 0;
262out:
263 btrfs_release_path(path);
264 kfree(name_buf);
265 kfree(value_buf);
266
267 return ret;
268}
269
270static void inode_prop_iterator(void *ctx,
271 const struct prop_handler *handler,
272 const char *value,
273 size_t len)
274{
275 struct inode *inode = ctx;
276 struct btrfs_root *root = BTRFS_I(inode)->root;
277 int ret;
278
279 ret = handler->apply(inode, value, len);
280 if (unlikely(ret))
281 btrfs_warn(root->fs_info,
282 "error applying prop %s to ino %llu (root %llu): %d",
283 handler->xattr_name, btrfs_ino(inode),
284 root->root_key.objectid, ret);
285 else
286 set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
287}
288
289int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
290{
291 struct btrfs_root *root = BTRFS_I(inode)->root;
292 u64 ino = btrfs_ino(inode);
293 int ret;
294
295 ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode);
296
297 return ret;
298}
299
300static int inherit_props(struct btrfs_trans_handle *trans,
301 struct inode *inode,
302 struct inode *parent)
303{
304 const struct prop_handler *h;
305 struct btrfs_root *root = BTRFS_I(inode)->root;
306 int ret;
307
308 if (!test_bit(BTRFS_INODE_HAS_PROPS,
309 &BTRFS_I(parent)->runtime_flags))
310 return 0;
311
312 for (h = &prop_handlers[0]; h->xattr_name; h++) {
313 const char *value;
314 u64 num_bytes;
315
316 if (!h->inheritable)
317 continue;
318
319 value = h->extract(parent);
320 if (!value)
321 continue;
322
323 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
324 ret = btrfs_block_rsv_add(root, trans->block_rsv,
325 num_bytes, BTRFS_RESERVE_NO_FLUSH);
326 if (ret)
327 goto out;
328 ret = __btrfs_set_prop(trans, inode, h->xattr_name,
329 value, strlen(value), 0);
330 btrfs_block_rsv_release(root, trans->block_rsv, num_bytes);
331 if (ret)
332 goto out;
333 }
334 ret = 0;
335out:
336 return ret;
337}
338
339int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
340 struct inode *inode,
341 struct inode *dir)
342{
343 if (!dir)
344 return 0;
345
346 return inherit_props(trans, inode, dir);
347}
348
349int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
350 struct btrfs_root *root,
351 struct btrfs_root *parent_root)
352{
353 struct btrfs_key key;
354 struct inode *parent_inode, *child_inode;
355 int ret;
356
357 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
358 key.type = BTRFS_INODE_ITEM_KEY;
359 key.offset = 0;
360
361 parent_inode = btrfs_iget(parent_root->fs_info->sb, &key,
362 parent_root, NULL);
363 if (IS_ERR(parent_inode))
364 return PTR_ERR(parent_inode);
365
366 child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
367 if (IS_ERR(child_inode)) {
368 iput(parent_inode);
369 return PTR_ERR(child_inode);
370 }
371
372 ret = inherit_props(trans, child_inode, parent_inode);
373 iput(child_inode);
374 iput(parent_inode);
375
376 return ret;
377}
378
379static int prop_compression_validate(const char *value, size_t len)
380{
381 if (!strncmp("lzo", value, len))
382 return 0;
383 else if (!strncmp("zlib", value, len))
384 return 0;
385
386 return -EINVAL;
387}
388
389static int prop_compression_apply(struct inode *inode,
390 const char *value,
391 size_t len)
392{
393 int type;
394
395 if (len == 0) {
396 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
397 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
398 BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
399
400 return 0;
401 }
402
403 if (!strncmp("lzo", value, len))
404 type = BTRFS_COMPRESS_LZO;
405 else if (!strncmp("zlib", value, len))
406 type = BTRFS_COMPRESS_ZLIB;
407 else
408 return -EINVAL;
409
410 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
411 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
412 BTRFS_I(inode)->force_compress = type;
413
414 return 0;
415}
416
417static const char *prop_compression_extract(struct inode *inode)
418{
419 switch (BTRFS_I(inode)->force_compress) {
420 case BTRFS_COMPRESS_ZLIB:
421 return "zlib";
422 case BTRFS_COMPRESS_LZO:
423 return "lzo";
424 }
425
426 return NULL;
427}
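props.c keys its handler table by btrfs_name_hash() of the xattr name, which is the same hash btrfs stores in the offset of on-disk XATTR_ITEM keys; that is what lets iterate_object_props() pick candidate handlers from a key before reading any names out of the leaf. A minimal sketch of the linux/hashtable.h pattern in use, with the structures simplified:

	#include <linux/hashtable.h>
	#include <linux/string.h>

	static DEFINE_HASHTABLE(handlers, 8);	/* 2^8 = 256 buckets */

	struct handler {
		struct hlist_node node;
		const char *name;
	};

	/* probe one bucket; collisions are resolved by comparing names */
	static struct handler *handler_lookup(const char *name, u64 hash)
	{
		struct handler *h;

		hash_for_each_possible(handlers, h, node, hash)
			if (!strcmp(h->name, name))
				return h;
		return NULL;
	}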
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
new file mode 100644
index 000000000000..100f18829d50
--- /dev/null
+++ b/fs/btrfs/props.h
@@ -0,0 +1,42 @@
1/*
2 * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_PROPS_H
20#define __BTRFS_PROPS_H
21
22#include "ctree.h"
23
24void __init btrfs_props_init(void);
25
26int btrfs_set_prop(struct inode *inode,
27 const char *name,
28 const char *value,
29 size_t value_len,
30 int flags);
31
32int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
33
34int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
35 struct inode *inode,
36 struct inode *dir);
37
38int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
39 struct btrfs_root *root,
40 struct btrfs_root *parent_root);
41
42#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4e6ef490619e..472302a2d745 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -301,16 +301,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
301 301
302 if (btrfs_qgroup_status_version(l, ptr) != 302 if (btrfs_qgroup_status_version(l, ptr) !=
303 BTRFS_QGROUP_STATUS_VERSION) { 303 BTRFS_QGROUP_STATUS_VERSION) {
304 printk(KERN_ERR 304 btrfs_err(fs_info,
305 "btrfs: old qgroup version, quota disabled\n"); 305 "old qgroup version, quota disabled");
306 goto out; 306 goto out;
307 } 307 }
308 if (btrfs_qgroup_status_generation(l, ptr) != 308 if (btrfs_qgroup_status_generation(l, ptr) !=
309 fs_info->generation) { 309 fs_info->generation) {
310 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 310 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
311 printk(KERN_ERR 311 btrfs_err(fs_info,
312 "btrfs: qgroup generation mismatch, " 312 "qgroup generation mismatch, "
313 "marked as inconsistent\n"); 313 "marked as inconsistent");
314 } 314 }
315 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, 315 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
316 ptr); 316 ptr);
@@ -325,7 +325,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
325 qgroup = find_qgroup_rb(fs_info, found_key.offset); 325 qgroup = find_qgroup_rb(fs_info, found_key.offset);
326 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || 326 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
327 (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { 327 (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
328			printk(KERN_ERR "btrfs: inconsistent qgroup config\n");	328			btrfs_err(fs_info, "inconsistent qgroup config");
329 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 329 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
330 } 330 }
331 if (!qgroup) { 331 if (!qgroup) {
@@ -396,8 +396,8 @@ next1:
396 ret = add_relation_rb(fs_info, found_key.objectid, 396 ret = add_relation_rb(fs_info, found_key.objectid,
397 found_key.offset); 397 found_key.offset);
398 if (ret == -ENOENT) { 398 if (ret == -ENOENT) {
399 printk(KERN_WARNING 399 btrfs_warn(fs_info,
400 "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", 400 "orphan qgroup relation 0x%llx->0x%llx",
401 found_key.objectid, found_key.offset); 401 found_key.objectid, found_key.offset);
402 ret = 0; /* ignore the error */ 402 ret = 0; /* ignore the error */
403 } 403 }
@@ -644,8 +644,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
644 644
645 l = path->nodes[0]; 645 l = path->nodes[0];
646 slot = path->slots[0]; 646 slot = path->slots[0];
647 qgroup_limit = btrfs_item_ptr(l, path->slots[0], 647 qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
648 struct btrfs_qgroup_limit_item);
649 btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); 648 btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
650 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); 649 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
651 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); 650 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
@@ -687,8 +686,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
687 686
688 l = path->nodes[0]; 687 l = path->nodes[0];
689 slot = path->slots[0]; 688 slot = path->slots[0];
690 qgroup_info = btrfs_item_ptr(l, path->slots[0], 689 qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
691 struct btrfs_qgroup_info_item);
692 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); 690 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
693 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); 691 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
694 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); 692 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
@@ -1161,7 +1159,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
1161 limit->rsv_excl); 1159 limit->rsv_excl);
1162 if (ret) { 1160 if (ret) {
1163 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1161 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1164 printk(KERN_INFO "unable to update quota limit for %llu\n", 1162 btrfs_info(fs_info, "unable to update quota limit for %llu",
1165 qgroupid); 1163 qgroupid);
1166 } 1164 }
1167 1165
@@ -1349,7 +1347,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1349 struct btrfs_delayed_ref_node *node, 1347 struct btrfs_delayed_ref_node *node,
1350 struct btrfs_delayed_extent_op *extent_op) 1348 struct btrfs_delayed_extent_op *extent_op)
1351{ 1349{
1352 struct btrfs_key ins;
1353 struct btrfs_root *quota_root; 1350 struct btrfs_root *quota_root;
1354 u64 ref_root; 1351 u64 ref_root;
1355 struct btrfs_qgroup *qgroup; 1352 struct btrfs_qgroup *qgroup;
@@ -1363,10 +1360,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
1363 1360
1364 BUG_ON(!fs_info->quota_root); 1361 BUG_ON(!fs_info->quota_root);
1365 1362
1366 ins.objectid = node->bytenr;
1367 ins.offset = node->num_bytes;
1368 ins.type = BTRFS_EXTENT_ITEM_KEY;
1369
1370 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 1363 if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1371 node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 1364 node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
1372 struct btrfs_delayed_tree_ref *ref; 1365 struct btrfs_delayed_tree_ref *ref;
@@ -1840,7 +1833,9 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
1840{ 1833{
1841 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) 1834 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
1842 return; 1835 return;
1843 pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n", 1836 btrfs_err(trans->root->fs_info,
1837 "qgroups not uptodate in trans handle %p: list is%s empty, "
1838 "seq is %#x.%x",
1844 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", 1839 trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
1845 (u32)(trans->delayed_ref_elem.seq >> 32), 1840 (u32)(trans->delayed_ref_elem.seq >> 32),
1846 (u32)trans->delayed_ref_elem.seq); 1841 (u32)trans->delayed_ref_elem.seq);
@@ -1902,9 +1897,17 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1902 mutex_unlock(&fs_info->qgroup_rescan_lock); 1897 mutex_unlock(&fs_info->qgroup_rescan_lock);
1903 1898
1904 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { 1899 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
1900 u64 num_bytes;
1901
1905 btrfs_item_key_to_cpu(scratch_leaf, &found, slot); 1902 btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
1906 if (found.type != BTRFS_EXTENT_ITEM_KEY) 1903 if (found.type != BTRFS_EXTENT_ITEM_KEY &&
1904 found.type != BTRFS_METADATA_ITEM_KEY)
1907 continue; 1905 continue;
1906 if (found.type == BTRFS_METADATA_ITEM_KEY)
1907 num_bytes = fs_info->extent_root->leafsize;
1908 else
1909 num_bytes = found.offset;
1910
1908 ret = btrfs_find_all_roots(trans, fs_info, found.objectid, 1911 ret = btrfs_find_all_roots(trans, fs_info, found.objectid,
1909 tree_mod_seq_elem.seq, &roots); 1912 tree_mod_seq_elem.seq, &roots);
1910 if (ret < 0) 1913 if (ret < 0)
@@ -1949,12 +1952,12 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1949 struct btrfs_qgroup_list *glist; 1952 struct btrfs_qgroup_list *glist;
1950 1953
1951 qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; 1954 qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux;
1952 qg->rfer += found.offset; 1955 qg->rfer += num_bytes;
1953 qg->rfer_cmpr += found.offset; 1956 qg->rfer_cmpr += num_bytes;
1954 WARN_ON(qg->tag >= seq); 1957 WARN_ON(qg->tag >= seq);
1955 if (qg->refcnt - seq == roots->nnodes) { 1958 if (qg->refcnt - seq == roots->nnodes) {
1956 qg->excl += found.offset; 1959 qg->excl += num_bytes;
1957 qg->excl_cmpr += found.offset; 1960 qg->excl_cmpr += num_bytes;
1958 } 1961 }
1959 qgroup_dirty(fs_info, qg); 1962 qgroup_dirty(fs_info, qg);
1960 1963
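The qgroup rescan fix above distinguishes the two extent item flavours: for BTRFS_EXTENT_ITEM_KEY the key offset is the extent size in bytes, while for skinny BTRFS_METADATA_ITEM_KEY items the offset encodes the tree level, so the accounted byte count has to come from the block size instead. The same decision as a standalone helper (name invented, matching the 3.14-era fields):

	static u64 accounted_bytes(struct btrfs_fs_info *fs_info,
				   struct btrfs_key *key)
	{
		/* skinny metadata: offset is a level, not a size */
		if (key->type == BTRFS_METADATA_ITEM_KEY)
			return fs_info->extent_root->leafsize;
		return key->offset;	/* extent item: size in bytes */
	}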
@@ -2037,10 +2040,10 @@ out:
2037 mutex_unlock(&fs_info->qgroup_rescan_lock); 2040 mutex_unlock(&fs_info->qgroup_rescan_lock);
2038 2041
2039 if (err >= 0) { 2042 if (err >= 0) {
2040 pr_info("btrfs: qgroup scan completed%s\n", 2043 btrfs_info(fs_info, "qgroup scan completed%s",
2041 err == 2 ? " (inconsistency flag cleared)" : ""); 2044 err == 2 ? " (inconsistency flag cleared)" : "");
2042 } else { 2045 } else {
2043 pr_err("btrfs: qgroup scan failed with %d\n", err); 2046 btrfs_err(fs_info, "qgroup scan failed with %d", err);
2044 } 2047 }
2045 2048
2046 complete_all(&fs_info->qgroup_rescan_completion); 2049 complete_all(&fs_info->qgroup_rescan_completion);
@@ -2096,7 +2099,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2096 2099
2097 if (ret) { 2100 if (ret) {
2098err: 2101err:
2099 pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret); 2102 btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret);
2100 return ret; 2103 return ret;
2101 } 2104 }
2102 2105
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 24ac21840a9a..9af0b25d991a 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1032,8 +1032,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1032 1032
1033 /* see if we can add this page onto our existing bio */ 1033 /* see if we can add this page onto our existing bio */
1034 if (last) { 1034 if (last) {
1035 last_end = (u64)last->bi_sector << 9; 1035 last_end = (u64)last->bi_iter.bi_sector << 9;
1036 last_end += last->bi_size; 1036 last_end += last->bi_iter.bi_size;
1037 1037
1038 /* 1038 /*
1039 * we can't merge these if they are from different 1039 * we can't merge these if they are from different
@@ -1053,9 +1053,9 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1053 if (!bio) 1053 if (!bio)
1054 return -ENOMEM; 1054 return -ENOMEM;
1055 1055
1056 bio->bi_size = 0; 1056 bio->bi_iter.bi_size = 0;
1057 bio->bi_bdev = stripe->dev->bdev; 1057 bio->bi_bdev = stripe->dev->bdev;
1058 bio->bi_sector = disk_start >> 9; 1058 bio->bi_iter.bi_sector = disk_start >> 9;
1059 set_bit(BIO_UPTODATE, &bio->bi_flags); 1059 set_bit(BIO_UPTODATE, &bio->bi_flags);
1060 1060
1061 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 1061 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
@@ -1111,7 +1111,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1111 1111
1112 spin_lock_irq(&rbio->bio_list_lock); 1112 spin_lock_irq(&rbio->bio_list_lock);
1113 bio_list_for_each(bio, &rbio->bio_list) { 1113 bio_list_for_each(bio, &rbio->bio_list) {
1114 start = (u64)bio->bi_sector << 9; 1114 start = (u64)bio->bi_iter.bi_sector << 9;
1115 stripe_offset = start - rbio->raid_map[0]; 1115 stripe_offset = start - rbio->raid_map[0];
1116 page_index = stripe_offset >> PAGE_CACHE_SHIFT; 1116 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1117 1117
@@ -1272,7 +1272,7 @@ cleanup:
1272static int find_bio_stripe(struct btrfs_raid_bio *rbio, 1272static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1273 struct bio *bio) 1273 struct bio *bio)
1274{ 1274{
1275 u64 physical = bio->bi_sector; 1275 u64 physical = bio->bi_iter.bi_sector;
1276 u64 stripe_start; 1276 u64 stripe_start;
1277 int i; 1277 int i;
1278 struct btrfs_bio_stripe *stripe; 1278 struct btrfs_bio_stripe *stripe;
@@ -1298,7 +1298,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1298static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 1298static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1299 struct bio *bio) 1299 struct bio *bio)
1300{ 1300{
1301 u64 logical = bio->bi_sector; 1301 u64 logical = bio->bi_iter.bi_sector;
1302 u64 stripe_start; 1302 u64 stripe_start;
1303 int i; 1303 int i;
1304 1304
@@ -1602,8 +1602,8 @@ static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1602 plug_list); 1602 plug_list);
1603 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 1603 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1604 plug_list); 1604 plug_list);
1605 u64 a_sector = ra->bio_list.head->bi_sector; 1605 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1606 u64 b_sector = rb->bio_list.head->bi_sector; 1606 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1607 1607
1608 if (a_sector < b_sector) 1608 if (a_sector < b_sector)
1609 return -1; 1609 return -1;
@@ -1691,7 +1691,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1691 if (IS_ERR(rbio)) 1691 if (IS_ERR(rbio))
1692 return PTR_ERR(rbio); 1692 return PTR_ERR(rbio);
1693 bio_list_add(&rbio->bio_list, bio); 1693 bio_list_add(&rbio->bio_list, bio);
1694 rbio->bio_list_bytes = bio->bi_size; 1694 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1695 1695
1696 /* 1696 /*
1697 * don't plug on full rbios, just get them out the door 1697 * don't plug on full rbios, just get them out the door
@@ -2044,7 +2044,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2044 2044
2045 rbio->read_rebuild = 1; 2045 rbio->read_rebuild = 1;
2046 bio_list_add(&rbio->bio_list, bio); 2046 bio_list_add(&rbio->bio_list, bio);
2047 rbio->bio_list_bytes = bio->bi_size; 2047 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2048 2048
2049 rbio->faila = find_logical_bio_stripe(rbio, bio); 2049 rbio->faila = find_logical_bio_stripe(rbio, bio);
2050 if (rbio->faila == -1) { 2050 if (rbio->faila == -1) {
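The raid56 hunks above belong to the tree-wide immutable-biovec conversion in 3.14: a bio's current sector and remaining byte count moved from bio->bi_sector and bio->bi_size into the bio->bi_iter iterator, so code that only reads a bio's position now goes through bi_iter. A small sketch of the new access pattern (helper name invented):

	#include <linux/bio.h>

	/* byte offset just past the end of the bio; bi_sector counts
	 * 512-byte units, bi_size is in bytes */
	static u64 bio_end_byte(struct bio *bio)
	{
		return ((u64)bio->bi_iter.bi_sector << 9) +
			bio->bi_iter.bi_size;
	}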
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 1031b69252c5..31c797c48c3e 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -189,8 +189,8 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
189 */ 189 */
190#ifdef DEBUG 190#ifdef DEBUG
191 if (rec->generation != generation) { 191 if (rec->generation != generation) {
192 printk(KERN_DEBUG "generation mismatch for " 192 btrfs_debug(root->fs_info,
193 "(%llu,%d,%llu) %llu != %llu\n", 193 "generation mismatch for (%llu,%d,%llu) %llu != %llu",
194 key.objectid, key.type, key.offset, 194 key.objectid, key.type, key.offset,
195 rec->generation, generation); 195 rec->generation, generation);
196 } 196 }
@@ -365,8 +365,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
365 goto error; 365 goto error;
366 366
367 if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { 367 if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
368 printk(KERN_ERR "btrfs readahead: more than %d copies not " 368 btrfs_err(root->fs_info,
369 "supported", BTRFS_MAX_MIRRORS); 369 "readahead: more than %d copies not supported",
370 BTRFS_MAX_MIRRORS);
370 goto error; 371 goto error;
371 } 372 }
372 373
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 429c73c374b8..07b3b36f40ee 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -94,6 +94,7 @@ struct backref_edge {
94 94
95#define LOWER 0 95#define LOWER 0
96#define UPPER 1 96#define UPPER 1
97#define RELOCATION_RESERVED_NODES 256
97 98
98struct backref_cache { 99struct backref_cache {
99 /* red black tree of all backref nodes in the cache */ 100 /* red black tree of all backref nodes in the cache */
@@ -176,6 +177,8 @@ struct reloc_control {
176 u64 merging_rsv_size; 177 u64 merging_rsv_size;
177 /* size of relocated tree nodes */ 178 /* size of relocated tree nodes */
178 u64 nodes_relocated; 179 u64 nodes_relocated;
180 /* reserved size for block group relocation*/
181 u64 reserved_bytes;
179 182
180 u64 search_start; 183 u64 search_start;
181 u64 extents_found; 184 u64 extents_found;
@@ -184,7 +187,6 @@ struct reloc_control {
184 unsigned int create_reloc_tree:1; 187 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1; 188 unsigned int merge_reloc_tree:1;
186 unsigned int found_file_extent:1; 189 unsigned int found_file_extent:1;
187 unsigned int commit_transaction:1;
188}; 190};
189 191
190/* stages of data relocation */ 192/* stages of data relocation */
@@ -2309,9 +2311,6 @@ void free_reloc_roots(struct list_head *list)
2309 reloc_root = list_entry(list->next, struct btrfs_root, 2311 reloc_root = list_entry(list->next, struct btrfs_root,
2310 root_list); 2312 root_list);
2311 __del_reloc_root(reloc_root); 2313 __del_reloc_root(reloc_root);
2312 free_extent_buffer(reloc_root->node);
2313 free_extent_buffer(reloc_root->commit_root);
2314 kfree(reloc_root);
2315 } 2314 }
2316} 2315}
2317 2316
@@ -2353,10 +2352,9 @@ again:
2353 2352
2354 ret = merge_reloc_root(rc, root); 2353 ret = merge_reloc_root(rc, root);
2355 if (ret) { 2354 if (ret) {
2356 __del_reloc_root(reloc_root); 2355 if (list_empty(&reloc_root->root_list))
2357 free_extent_buffer(reloc_root->node); 2356 list_add_tail(&reloc_root->root_list,
2358 free_extent_buffer(reloc_root->commit_root); 2357 &reloc_roots);
2359 kfree(reloc_root);
2360 goto out; 2358 goto out;
2361 } 2359 }
2362 } else { 2360 } else {
@@ -2452,7 +2450,7 @@ static noinline_for_stack
2452struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2450struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2453 struct reloc_control *rc, 2451 struct reloc_control *rc,
2454 struct backref_node *node, 2452 struct backref_node *node,
2455 struct backref_edge *edges[], int *nr) 2453 struct backref_edge *edges[])
2456{ 2454{
2457 struct backref_node *next; 2455 struct backref_node *next;
2458 struct btrfs_root *root; 2456 struct btrfs_root *root;
@@ -2494,7 +2492,6 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2494 if (!root) 2492 if (!root)
2495 return NULL; 2493 return NULL;
2496 2494
2497 *nr = index;
2498 next = node; 2495 next = node;
2499 /* setup backref node path for btrfs_reloc_cow_block */ 2496 /* setup backref node path for btrfs_reloc_cow_block */
2500 while (1) { 2497 while (1) {
@@ -2590,28 +2587,36 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2590 struct btrfs_root *root = rc->extent_root; 2587 struct btrfs_root *root = rc->extent_root;
2591 u64 num_bytes; 2588 u64 num_bytes;
2592 int ret; 2589 int ret;
2590 u64 tmp;
2593 2591
2594 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2592 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2595 2593
2596 trans->block_rsv = rc->block_rsv; 2594 trans->block_rsv = rc->block_rsv;
2597 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes, 2595 rc->reserved_bytes += num_bytes;
2598 BTRFS_RESERVE_FLUSH_ALL); 2596 ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2597 BTRFS_RESERVE_FLUSH_ALL);
2599 if (ret) { 2598 if (ret) {
2600 if (ret == -EAGAIN) 2599 if (ret == -EAGAIN) {
2601 rc->commit_transaction = 1; 2600 tmp = rc->extent_root->nodesize *
2601 RELOCATION_RESERVED_NODES;
2602 while (tmp <= rc->reserved_bytes)
2603 tmp <<= 1;
2604 /*
2605 * only one thread can access block_rsv at this point,
 2606			 * so we don't need to hold a lock to protect block_rsv.
 2607			 * we expand the reservation size here to allow enough
 2608			 * space for relocation, and we will return earlier in
 2609			 * the enospc case.
2610 */
2611 rc->block_rsv->size = tmp + rc->extent_root->nodesize *
2612 RELOCATION_RESERVED_NODES;
2613 }
2602 return ret; 2614 return ret;
2603 } 2615 }
2604 2616
2605 return 0; 2617 return 0;
2606} 2618}
2607 2619
2608static void release_metadata_space(struct reloc_control *rc,
2609 struct backref_node *node)
2610{
2611 u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2612 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
2613}
2614
2615/* 2620/*
2616 * relocate a block tree, and then update pointers in upper level 2621 * relocate a block tree, and then update pointers in upper level
2617 * blocks that reference the block to point to the new location. 2622 * blocks that reference the block to point to the new location.
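Rather than committing the transaction on -EAGAIN, the reworked reserve_metadata_space() grows the relocation reserve geometrically: starting from a batch of RELOCATION_RESERVED_NODES nodes, the target doubles until it exceeds what the current pass has already consumed, plus one extra batch of headroom. That sizing rule as a standalone sketch (helper name invented):

	static u64 grow_reloc_rsv_size(u64 nodesize, u64 reserved_bytes)
	{
		u64 tmp = nodesize * RELOCATION_RESERVED_NODES;

		/* double until the target covers what was consumed */
		while (tmp <= reserved_bytes)
			tmp <<= 1;
		return tmp + nodesize * RELOCATION_RESERVED_NODES;
	}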
@@ -2633,7 +2638,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2633 u32 blocksize; 2638 u32 blocksize;
2634 u64 bytenr; 2639 u64 bytenr;
2635 u64 generation; 2640 u64 generation;
2636 int nr;
2637 int slot; 2641 int slot;
2638 int ret; 2642 int ret;
2639 int err = 0; 2643 int err = 0;
@@ -2646,7 +2650,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2646 cond_resched(); 2650 cond_resched();
2647 2651
2648 upper = edge->node[UPPER]; 2652 upper = edge->node[UPPER];
2649 root = select_reloc_root(trans, rc, upper, edges, &nr); 2653 root = select_reloc_root(trans, rc, upper, edges);
2650 BUG_ON(!root); 2654 BUG_ON(!root);
2651 2655
2652 if (upper->eb && !upper->locked) { 2656 if (upper->eb && !upper->locked) {
@@ -2898,7 +2902,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2898 struct btrfs_path *path) 2902 struct btrfs_path *path)
2899{ 2903{
2900 struct btrfs_root *root; 2904 struct btrfs_root *root;
2901 int release = 0;
2902 int ret = 0; 2905 int ret = 0;
2903 2906
2904 if (!node) 2907 if (!node)
@@ -2915,7 +2918,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2915 ret = reserve_metadata_space(trans, rc, node); 2918 ret = reserve_metadata_space(trans, rc, node);
2916 if (ret) 2919 if (ret)
2917 goto out; 2920 goto out;
2918 release = 1;
2919 } 2921 }
2920 2922
2921 if (root) { 2923 if (root) {
@@ -2940,11 +2942,8 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2940 ret = do_relocation(trans, rc, node, key, path, 1); 2942 ret = do_relocation(trans, rc, node, key, path, 1);
2941 } 2943 }
2942out: 2944out:
2943 if (ret || node->level == 0 || node->cowonly) { 2945 if (ret || node->level == 0 || node->cowonly)
2944 if (release)
2945 release_metadata_space(rc, node);
2946 remove_backref_node(&rc->backref_cache, node); 2946 remove_backref_node(&rc->backref_cache, node);
2947 }
2948 return ret; 2947 return ret;
2949} 2948}
2950 2949
@@ -3867,29 +3866,20 @@ static noinline_for_stack
3867int prepare_to_relocate(struct reloc_control *rc) 3866int prepare_to_relocate(struct reloc_control *rc)
3868{ 3867{
3869 struct btrfs_trans_handle *trans; 3868 struct btrfs_trans_handle *trans;
3870 int ret;
3871 3869
3872 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, 3870 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3873 BTRFS_BLOCK_RSV_TEMP); 3871 BTRFS_BLOCK_RSV_TEMP);
3874 if (!rc->block_rsv) 3872 if (!rc->block_rsv)
3875 return -ENOMEM; 3873 return -ENOMEM;
3876 3874
3877 /*
3878 * reserve some space for creating reloc trees.
3879 * btrfs_init_reloc_root will use them when there
3880 * is no reservation in transaction handle.
3881 */
3882 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3883 rc->extent_root->nodesize * 256,
3884 BTRFS_RESERVE_FLUSH_ALL);
3885 if (ret)
3886 return ret;
3887
3888 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3875 memset(&rc->cluster, 0, sizeof(rc->cluster));
3889 rc->search_start = rc->block_group->key.objectid; 3876 rc->search_start = rc->block_group->key.objectid;
3890 rc->extents_found = 0; 3877 rc->extents_found = 0;
3891 rc->nodes_relocated = 0; 3878 rc->nodes_relocated = 0;
3892 rc->merging_rsv_size = 0; 3879 rc->merging_rsv_size = 0;
3880 rc->reserved_bytes = 0;
3881 rc->block_rsv->size = rc->extent_root->nodesize *
3882 RELOCATION_RESERVED_NODES;
3893 3883
3894 rc->create_reloc_tree = 1; 3884 rc->create_reloc_tree = 1;
3895 set_reloc_control(rc); 3885 set_reloc_control(rc);
@@ -3933,6 +3923,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3933 } 3923 }
3934 3924
3935 while (1) { 3925 while (1) {
3926 rc->reserved_bytes = 0;
3927 ret = btrfs_block_rsv_refill(rc->extent_root,
3928 rc->block_rsv, rc->block_rsv->size,
3929 BTRFS_RESERVE_FLUSH_ALL);
3930 if (ret) {
3931 err = ret;
3932 break;
3933 }
3936 progress++; 3934 progress++;
3937 trans = btrfs_start_transaction(rc->extent_root, 0); 3935 trans = btrfs_start_transaction(rc->extent_root, 0);
3938 if (IS_ERR(trans)) { 3936 if (IS_ERR(trans)) {
@@ -4011,6 +4009,12 @@ restart:
4011 if (!RB_EMPTY_ROOT(&blocks)) { 4009 if (!RB_EMPTY_ROOT(&blocks)) {
4012 ret = relocate_tree_blocks(trans, rc, &blocks); 4010 ret = relocate_tree_blocks(trans, rc, &blocks);
4013 if (ret < 0) { 4011 if (ret < 0) {
4012 /*
 4013			 * if we fail to relocate tree blocks, force an update of
 4014			 * the backref cache when committing the transaction.
4015 */
4016 rc->backref_cache.last_trans = trans->transid - 1;
4017
4014 if (ret != -EAGAIN) { 4018 if (ret != -EAGAIN) {
4015 err = ret; 4019 err = ret;
4016 break; 4020 break;
@@ -4020,14 +4024,8 @@ restart:
4020 } 4024 }
4021 } 4025 }
4022 4026
4023 if (rc->commit_transaction) { 4027 btrfs_end_transaction_throttle(trans, rc->extent_root);
4024 rc->commit_transaction = 0; 4028 btrfs_btree_balance_dirty(rc->extent_root);
4025 ret = btrfs_commit_transaction(trans, rc->extent_root);
4026 BUG_ON(ret);
4027 } else {
4028 btrfs_end_transaction_throttle(trans, rc->extent_root);
4029 btrfs_btree_balance_dirty(rc->extent_root);
4030 }
4031 trans = NULL; 4029 trans = NULL;
4032 4030
4033 if (rc->stage == MOVE_DATA_EXTENTS && 4031 if (rc->stage == MOVE_DATA_EXTENTS &&
@@ -4247,7 +4245,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4247 goto out; 4245 goto out;
4248 } 4246 }
4249 4247
4250 printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", 4248 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4251 rc->block_group->key.objectid, rc->block_group->flags); 4249 rc->block_group->key.objectid, rc->block_group->flags);
4252 4250
4253 ret = btrfs_start_delalloc_roots(fs_info, 0); 4251 ret = btrfs_start_delalloc_roots(fs_info, 0);
@@ -4269,7 +4267,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4269 if (rc->extents_found == 0) 4267 if (rc->extents_found == 0)
4270 break; 4268 break;
4271 4269
4272 printk(KERN_INFO "btrfs: found %llu extents\n", 4270 btrfs_info(extent_root->fs_info, "found %llu extents",
4273 rc->extents_found); 4271 rc->extents_found);
4274 4272
4275 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { 4273 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
@@ -4285,11 +4283,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4285 } 4283 }
4286 } 4284 }
4287 4285
4288 filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
4289 rc->block_group->key.objectid,
4290 rc->block_group->key.objectid +
4291 rc->block_group->key.offset - 1);
4292
4293 WARN_ON(rc->block_group->pinned > 0); 4286 WARN_ON(rc->block_group->pinned > 0);
4294 WARN_ON(rc->block_group->reserved > 0); 4287 WARN_ON(rc->block_group->reserved > 0);
4295 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 4288 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ec71ea44d2b4..1389b69059de 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -44,7 +44,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
44 if (!need_reset && btrfs_root_generation(item) 44 if (!need_reset && btrfs_root_generation(item)
45 != btrfs_root_generation_v2(item)) { 45 != btrfs_root_generation_v2(item)) {
46 if (btrfs_root_generation_v2(item) != 0) { 46 if (btrfs_root_generation_v2(item) != 0) {
47 printk(KERN_WARNING "btrfs: mismatching " 47 printk(KERN_WARNING "BTRFS: mismatching "
48 "generation and generation_v2 " 48 "generation and generation_v2 "
49 "found in root item. This root " 49 "found in root item. This root "
50 "was probably mounted with an " 50 "was probably mounted with an "
@@ -154,7 +154,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
154 154
155 if (ret != 0) { 155 if (ret != 0) {
156 btrfs_print_leaf(root, path->nodes[0]); 156 btrfs_print_leaf(root, path->nodes[0]);
157 printk(KERN_CRIT "unable to update root key %llu %u %llu\n", 157 btrfs_crit(root->fs_info, "unable to update root key %llu %u %llu",
158 key->objectid, key->type, key->offset); 158 key->objectid, key->type, key->offset);
159 BUG_ON(1); 159 BUG_ON(1);
160 } 160 }
@@ -400,21 +400,6 @@ out:
400 return err; 400 return err;
401} 401}
402 402
403int btrfs_find_root_ref(struct btrfs_root *tree_root,
404 struct btrfs_path *path,
405 u64 root_id, u64 ref_id)
406{
407 struct btrfs_key key;
408 int ret;
409
410 key.objectid = root_id;
411 key.type = BTRFS_ROOT_REF_KEY;
412 key.offset = ref_id;
413
414 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
415 return ret;
416}
417
418/* 403/*
419 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY 404 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
420 * or BTRFS_ROOT_BACKREF_KEY. 405 * or BTRFS_ROOT_BACKREF_KEY.
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 1fd3f33c330a..efba5d1282ee 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -256,6 +256,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
256static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 256static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
257 int mirror_num, u64 physical_for_dev_replace); 257 int mirror_num, u64 physical_for_dev_replace);
258static void copy_nocow_pages_worker(struct btrfs_work *work); 258static void copy_nocow_pages_worker(struct btrfs_work *work);
259static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
260static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
259 261
260 262
261static void scrub_pending_bio_inc(struct scrub_ctx *sctx) 263static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -269,6 +271,29 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
269 wake_up(&sctx->list_wait); 271 wake_up(&sctx->list_wait);
270} 272}
271 273
274static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
275{
276 while (atomic_read(&fs_info->scrub_pause_req)) {
277 mutex_unlock(&fs_info->scrub_lock);
278 wait_event(fs_info->scrub_pause_wait,
279 atomic_read(&fs_info->scrub_pause_req) == 0);
280 mutex_lock(&fs_info->scrub_lock);
281 }
282}
283
284static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
285{
286 atomic_inc(&fs_info->scrubs_paused);
287 wake_up(&fs_info->scrub_pause_wait);
288
289 mutex_lock(&fs_info->scrub_lock);
290 __scrub_blocked_if_needed(fs_info);
291 atomic_dec(&fs_info->scrubs_paused);
292 mutex_unlock(&fs_info->scrub_lock);
293
294 wake_up(&fs_info->scrub_pause_wait);
295}
296
272/* 297/*
273 * used for workers that require transaction commits (i.e., for the 298 * used for workers that require transaction commits (i.e., for the
274 * NOCOW case) 299 * NOCOW case)
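scrub_blocked_if_needed() above packages the scrub pause handshake: a worker counts itself into scrubs_paused, waits for scrub_pause_req to drop to zero while temporarily giving up scrub_lock, then counts itself back in. The core wait idiom — drop the lock across the sleep so the pausing side can make progress, retake it before rechecking — sketched generically with invented names:

	#include <linux/atomic.h>
	#include <linux/mutex.h>
	#include <linux/wait.h>

	static void wait_while_paused(struct mutex *lock, atomic_t *pause_req,
				      wait_queue_head_t *wq)
	{
		while (atomic_read(pause_req)) {
			mutex_unlock(lock);
			wait_event(*wq, atomic_read(pause_req) == 0);
			mutex_lock(lock);
		}
	}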
@@ -480,7 +505,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
480 * hold all of the paths here 505 * hold all of the paths here
481 */ 506 */
482 for (i = 0; i < ipath->fspath->elem_cnt; ++i) 507 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
483 printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " 508 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
484 "%s, sector %llu, root %llu, inode %llu, offset %llu, " 509 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
485 "length %llu, links %u (path: %s)\n", swarn->errstr, 510 "length %llu, links %u (path: %s)\n", swarn->errstr,
486 swarn->logical, rcu_str_deref(swarn->dev->name), 511 swarn->logical, rcu_str_deref(swarn->dev->name),
@@ -492,7 +517,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
492 return 0; 517 return 0;
493 518
494err: 519err:
495 printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " 520 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
496 "%s, sector %llu, root %llu, inode %llu, offset %llu: path " 521 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
497 "resolving failed with ret=%d\n", swarn->errstr, 522 "resolving failed with ret=%d\n", swarn->errstr,
498 swarn->logical, rcu_str_deref(swarn->dev->name), 523 swarn->logical, rcu_str_deref(swarn->dev->name),
@@ -555,7 +580,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
555 ret = tree_backref_for_extent(&ptr, eb, ei, item_size, 580 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
556 &ref_root, &ref_level); 581 &ref_root, &ref_level);
557 printk_in_rcu(KERN_WARNING 582 printk_in_rcu(KERN_WARNING
558 "btrfs: %s at logical %llu on dev %s, " 583 "BTRFS: %s at logical %llu on dev %s, "
559 "sector %llu: metadata %s (level %d) in tree " 584 "sector %llu: metadata %s (level %d) in tree "
560 "%llu\n", errstr, swarn.logical, 585 "%llu\n", errstr, swarn.logical,
561 rcu_str_deref(dev->name), 586 rcu_str_deref(dev->name),
@@ -704,13 +729,11 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
704 struct scrub_fixup_nodatasum *fixup; 729 struct scrub_fixup_nodatasum *fixup;
705 struct scrub_ctx *sctx; 730 struct scrub_ctx *sctx;
706 struct btrfs_trans_handle *trans = NULL; 731 struct btrfs_trans_handle *trans = NULL;
707 struct btrfs_fs_info *fs_info;
708 struct btrfs_path *path; 732 struct btrfs_path *path;
709 int uncorrectable = 0; 733 int uncorrectable = 0;
710 734
711 fixup = container_of(work, struct scrub_fixup_nodatasum, work); 735 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
712 sctx = fixup->sctx; 736 sctx = fixup->sctx;
713 fs_info = fixup->root->fs_info;
714 737
715 path = btrfs_alloc_path(); 738 path = btrfs_alloc_path();
716 if (!path) { 739 if (!path) {
@@ -759,8 +782,8 @@ out:
759 btrfs_dev_replace_stats_inc( 782 btrfs_dev_replace_stats_inc(
760 &sctx->dev_root->fs_info->dev_replace. 783 &sctx->dev_root->fs_info->dev_replace.
761 num_uncorrectable_read_errors); 784 num_uncorrectable_read_errors);
762 printk_ratelimited_in_rcu(KERN_ERR
763 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
785 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
786 "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
764 fixup->logical, rcu_str_deref(fixup->dev->name)); 787 fixup->logical, rcu_str_deref(fixup->dev->name));
765 } 788 }
766 789
@@ -1161,7 +1184,7 @@ corrected_error:
1161 sctx->stat.corrected_errors++; 1184 sctx->stat.corrected_errors++;
1162 spin_unlock(&sctx->stat_lock); 1185 spin_unlock(&sctx->stat_lock);
1163 printk_ratelimited_in_rcu(KERN_ERR 1186 printk_ratelimited_in_rcu(KERN_ERR
1164 "btrfs: fixed up error at logical %llu on dev %s\n", 1187 "BTRFS: fixed up error at logical %llu on dev %s\n",
1165 logical, rcu_str_deref(dev->name)); 1188 logical, rcu_str_deref(dev->name));
1166 } 1189 }
1167 } else { 1190 } else {
@@ -1170,7 +1193,7 @@ did_not_correct_error:
1170 sctx->stat.uncorrectable_errors++; 1193 sctx->stat.uncorrectable_errors++;
1171 spin_unlock(&sctx->stat_lock); 1194 spin_unlock(&sctx->stat_lock);
1172 printk_ratelimited_in_rcu(KERN_ERR 1195 printk_ratelimited_in_rcu(KERN_ERR
1173 "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", 1196 "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
1174 logical, rcu_str_deref(dev->name)); 1197 logical, rcu_str_deref(dev->name));
1175 } 1198 }
1176 1199
@@ -1308,7 +1331,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1308 continue; 1331 continue;
1309 } 1332 }
1310 bio->bi_bdev = page->dev->bdev; 1333 bio->bi_bdev = page->dev->bdev;
1311 bio->bi_sector = page->physical >> 9;
1334 bio->bi_iter.bi_sector = page->physical >> 9;
1312 1335
1313 bio_add_page(bio, page->page, PAGE_SIZE, 0); 1336 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1314 if (btrfsic_submit_bio_wait(READ, bio)) 1337 if (btrfsic_submit_bio_wait(READ, bio))
@@ -1418,8 +1441,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1418 int ret; 1441 int ret;
1419 1442
1420 if (!page_bad->dev->bdev) { 1443 if (!page_bad->dev->bdev) {
1421 printk_ratelimited(KERN_WARNING
1422 "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1444 printk_ratelimited(KERN_WARNING "BTRFS: "
1445 "scrub_repair_page_from_good_copy(bdev == NULL) "
1446 "is unexpected!\n");
1423 return -EIO; 1447 return -EIO;
1424 } 1448 }
1425 1449
@@ -1427,7 +1451,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1427 if (!bio) 1451 if (!bio)
1428 return -EIO; 1452 return -EIO;
1429 bio->bi_bdev = page_bad->dev->bdev; 1453 bio->bi_bdev = page_bad->dev->bdev;
1430 bio->bi_sector = page_bad->physical >> 9;
1454 bio->bi_iter.bi_sector = page_bad->physical >> 9;
1431 1455
1432 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); 1456 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1433 if (PAGE_SIZE != ret) { 1457 if (PAGE_SIZE != ret) {
@@ -1520,7 +1544,7 @@ again:
1520 bio->bi_private = sbio; 1544 bio->bi_private = sbio;
1521 bio->bi_end_io = scrub_wr_bio_end_io; 1545 bio->bi_end_io = scrub_wr_bio_end_io;
1522 bio->bi_bdev = sbio->dev->bdev; 1546 bio->bi_bdev = sbio->dev->bdev;
1523 bio->bi_sector = sbio->physical >> 9;
1547 bio->bi_iter.bi_sector = sbio->physical >> 9;
1524 sbio->err = 0; 1548 sbio->err = 0;
1525 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1549 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1526 spage->physical_for_dev_replace || 1550 spage->physical_for_dev_replace ||
@@ -1877,7 +1901,7 @@ static void scrub_submit(struct scrub_ctx *sctx)
1877 * This case is handled correctly (but _very_ slowly). 1901 * This case is handled correctly (but _very_ slowly).
1878 */ 1902 */
1879 printk_ratelimited(KERN_WARNING 1903 printk_ratelimited(KERN_WARNING
1880 "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n"); 1904 "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
1881 bio_endio(sbio->bio, -EIO); 1905 bio_endio(sbio->bio, -EIO);
1882 } else { 1906 } else {
1883 btrfsic_submit_bio(READ, sbio->bio); 1907 btrfsic_submit_bio(READ, sbio->bio);
@@ -1926,7 +1950,7 @@ again:
1926 bio->bi_private = sbio; 1950 bio->bi_private = sbio;
1927 bio->bi_end_io = scrub_bio_end_io; 1951 bio->bi_end_io = scrub_bio_end_io;
1928 bio->bi_bdev = sbio->dev->bdev; 1952 bio->bi_bdev = sbio->dev->bdev;
1929 bio->bi_sector = sbio->physical >> 9;
1953 bio->bi_iter.bi_sector = sbio->physical >> 9;
1930 sbio->err = 0; 1954 sbio->err = 0;
1931 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1955 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1932 spage->physical || 1956 spage->physical ||
@@ -2286,8 +2310,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2286 2310
2287 wait_event(sctx->list_wait, 2311 wait_event(sctx->list_wait,
2288 atomic_read(&sctx->bios_in_flight) == 0); 2312 atomic_read(&sctx->bios_in_flight) == 0);
2289 atomic_inc(&fs_info->scrubs_paused);
2290 wake_up(&fs_info->scrub_pause_wait);
2313 scrub_blocked_if_needed(fs_info);
2291 2314
2292 /* FIXME it might be better to start readahead at commit root */ 2315 /* FIXME it might be better to start readahead at commit root */
2293 key_start.objectid = logical; 2316 key_start.objectid = logical;
@@ -2311,16 +2334,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2311 if (!IS_ERR(reada2)) 2334 if (!IS_ERR(reada2))
2312 btrfs_reada_wait(reada2); 2335 btrfs_reada_wait(reada2);
2313 2336
2314 mutex_lock(&fs_info->scrub_lock);
2315 while (atomic_read(&fs_info->scrub_pause_req)) {
2316 mutex_unlock(&fs_info->scrub_lock);
2317 wait_event(fs_info->scrub_pause_wait,
2318 atomic_read(&fs_info->scrub_pause_req) == 0);
2319 mutex_lock(&fs_info->scrub_lock);
2320 }
2321 atomic_dec(&fs_info->scrubs_paused);
2322 mutex_unlock(&fs_info->scrub_lock);
2323 wake_up(&fs_info->scrub_pause_wait);
2324 2337
2325 /* 2338 /*
2326 * collect all data csums for the stripe to avoid seeking during 2339 * collect all data csums for the stripe to avoid seeking during
@@ -2357,22 +2370,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2357 wait_event(sctx->list_wait, 2370 wait_event(sctx->list_wait,
2358 atomic_read(&sctx->bios_in_flight) == 0); 2371 atomic_read(&sctx->bios_in_flight) == 0);
2359 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2372 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2360 atomic_inc(&fs_info->scrubs_paused);
2361 wake_up(&fs_info->scrub_pause_wait);
2362 mutex_lock(&fs_info->scrub_lock);
2363 while (atomic_read(&fs_info->scrub_pause_req)) {
2364 mutex_unlock(&fs_info->scrub_lock);
2365 wait_event(fs_info->scrub_pause_wait,
2366 atomic_read(&fs_info->scrub_pause_req) == 0);
2367 mutex_lock(&fs_info->scrub_lock);
2368 }
2369 atomic_dec(&fs_info->scrubs_paused);
2370 mutex_unlock(&fs_info->scrub_lock);
2371 wake_up(&fs_info->scrub_pause_wait);
2373 scrub_blocked_if_needed(fs_info);
2372 } 2374 }
2373 2375
2376 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2377 key.type = BTRFS_METADATA_ITEM_KEY;
2378 else
2379 key.type = BTRFS_EXTENT_ITEM_KEY;
2374 key.objectid = logical; 2380 key.objectid = logical;
2375 key.type = BTRFS_EXTENT_ITEM_KEY;
2376 key.offset = (u64)-1; 2381 key.offset = (u64)-1;
2377 2382
2378 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2383 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -2380,8 +2385,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2380 goto out; 2385 goto out;
2381 2386
2382 if (ret > 0) { 2387 if (ret > 0) {
2383 ret = btrfs_previous_item(root, path, 0,
2384 BTRFS_EXTENT_ITEM_KEY);
2388 ret = btrfs_previous_extent_item(root, path, 0);
2385 if (ret < 0) 2389 if (ret < 0)
2386 goto out; 2390 goto out;
2387 if (ret > 0) { 2391 if (ret > 0) {
@@ -2439,9 +2443,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2439 2443
2440 if (key.objectid < logical && 2444 if (key.objectid < logical &&
2441 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { 2445 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2442 printk(KERN_ERR
2443 "btrfs scrub: tree block %llu spanning "
2444 "stripes, ignored. logical=%llu\n",
2446 btrfs_err(fs_info,
2447 "scrub: tree block %llu spanning "
2448 "stripes, ignored. logical=%llu",
2445 key.objectid, logical); 2449 key.objectid, logical);
2446 goto next; 2450 goto next;
2447 } 2451 }
@@ -2683,21 +2687,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2683 wait_event(sctx->list_wait, 2687 wait_event(sctx->list_wait,
2684 atomic_read(&sctx->bios_in_flight) == 0); 2688 atomic_read(&sctx->bios_in_flight) == 0);
2685 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2689 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2686 atomic_inc(&fs_info->scrubs_paused);
2687 wake_up(&fs_info->scrub_pause_wait);
2688 wait_event(sctx->list_wait, 2690 wait_event(sctx->list_wait,
2689 atomic_read(&sctx->workers_pending) == 0); 2691 atomic_read(&sctx->workers_pending) == 0);
2690
2692 scrub_blocked_if_needed(fs_info);
2691 mutex_lock(&fs_info->scrub_lock);
2692 while (atomic_read(&fs_info->scrub_pause_req)) {
2693 mutex_unlock(&fs_info->scrub_lock);
2694 wait_event(fs_info->scrub_pause_wait,
2695 atomic_read(&fs_info->scrub_pause_req) == 0);
2696 mutex_lock(&fs_info->scrub_lock);
2697 }
2698 atomic_dec(&fs_info->scrubs_paused);
2699 mutex_unlock(&fs_info->scrub_lock);
2700 wake_up(&fs_info->scrub_pause_wait);
2701 2693
2702 btrfs_put_block_group(cache); 2694 btrfs_put_block_group(cache);
2703 if (ret) 2695 if (ret)
@@ -2823,8 +2815,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2823 * check some assumptions 2815 * check some assumptions
2824 */ 2816 */
2825 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { 2817 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2826 printk(KERN_ERR
2827 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2818 btrfs_err(fs_info,
2819 "scrub: size assumption nodesize == leafsize (%d == %d) fails",
2828 fs_info->chunk_root->nodesize, 2820 fs_info->chunk_root->nodesize,
2829 fs_info->chunk_root->leafsize); 2821 fs_info->chunk_root->leafsize);
2830 return -EINVAL; 2822 return -EINVAL;
@@ -2836,16 +2828,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2836 * the way scrub is implemented. Do not handle this 2828 * the way scrub is implemented. Do not handle this
2837 * situation at all because it won't ever happen. 2829 * situation at all because it won't ever happen.
2838 */ 2830 */
2839 printk(KERN_ERR
2840 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2831 btrfs_err(fs_info,
2832 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
2841 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); 2833 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2842 return -EINVAL; 2834 return -EINVAL;
2843 } 2835 }
2844 2836
2845 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { 2837 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2846 /* not supported for data w/o checksums */ 2838 /* not supported for data w/o checksums */
2847 printk(KERN_ERR
2848 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n",
2839 btrfs_err(fs_info,
2840 "scrub: size assumption sectorsize != PAGE_SIZE "
2841 "(%d != %lu) fails",
2849 fs_info->chunk_root->sectorsize, PAGE_SIZE); 2842 fs_info->chunk_root->sectorsize, PAGE_SIZE);
2850 return -EINVAL; 2843 return -EINVAL;
2851 } 2844 }
@@ -2858,7 +2851,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2858 * would exhaust the array bounds of pagev member in 2851 * would exhaust the array bounds of pagev member in
2859 * struct scrub_block 2852 * struct scrub_block
2860 */ 2853 */
2861 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n", 2854 btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
2855 "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
2862 fs_info->chunk_root->nodesize, 2856 fs_info->chunk_root->nodesize,
2863 SCRUB_MAX_PAGES_PER_BLOCK, 2857 SCRUB_MAX_PAGES_PER_BLOCK,
2864 fs_info->chunk_root->sectorsize, 2858 fs_info->chunk_root->sectorsize,
@@ -2908,7 +2902,13 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2908 } 2902 }
2909 sctx->readonly = readonly; 2903 sctx->readonly = readonly;
2910 dev->scrub_device = sctx; 2904 dev->scrub_device = sctx;
2905 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2911 2906
2907 /*
2908 * By checking @scrub_pause_req here, we avoid a race
2909 * between transaction commit and scrubbing.
2910 */
2911 __scrub_blocked_if_needed(fs_info);
2912 atomic_inc(&fs_info->scrubs_running); 2912 atomic_inc(&fs_info->scrubs_running);
2913 mutex_unlock(&fs_info->scrub_lock); 2913 mutex_unlock(&fs_info->scrub_lock);
2914 2914
@@ -2917,9 +2917,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2917 * by holding device list mutex, we can 2917 * by holding device list mutex, we can
2918 * kick off writing super in log tree sync. 2918 * kick off writing super in log tree sync.
2919 */ 2919 */
2920 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2920 ret = scrub_supers(sctx, dev); 2921 ret = scrub_supers(sctx, dev);
2922 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2921 } 2923 }
2922 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2923 2924
2924 if (!ret) 2925 if (!ret)
2925 ret = scrub_enumerate_chunks(sctx, dev, start, end, 2926 ret = scrub_enumerate_chunks(sctx, dev, start, end,
@@ -3167,7 +3168,8 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
3167 ret = iterate_inodes_from_logical(logical, fs_info, path, 3168 ret = iterate_inodes_from_logical(logical, fs_info, path,
3168 record_inode_for_nocow, nocow_ctx); 3169 record_inode_for_nocow, nocow_ctx);
3169 if (ret != 0 && ret != -ENOENT) { 3170 if (ret != 0 && ret != -ENOENT) {
3170 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n",
3171 btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
3172 "phys %llu, len %llu, mir %u, ret %d",
3171 logical, physical_for_dev_replace, len, mirror_num, 3173 logical, physical_for_dev_replace, len, mirror_num,
3172 ret); 3174 ret);
3173 not_written = 1; 3175 not_written = 1;
@@ -3289,7 +3291,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
3289again: 3291again:
3290 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 3292 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3291 if (!page) { 3293 if (!page) {
3292 pr_err("find_or_create_page() failed\n");
3294 btrfs_err(fs_info, "find_or_create_page() failed");
3293 ret = -ENOMEM; 3295 ret = -ENOMEM;
3294 goto out; 3296 goto out;
3295 } 3297 }
@@ -3361,7 +3363,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
3361 return -EIO; 3363 return -EIO;
3362 if (!dev->bdev) { 3364 if (!dev->bdev) {
3363 printk_ratelimited(KERN_WARNING 3365 printk_ratelimited(KERN_WARNING
3364 "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); 3366 "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3365 return -EIO; 3367 return -EIO;
3366 } 3368 }
3367 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 3369 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
@@ -3371,8 +3373,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
3371 spin_unlock(&sctx->stat_lock); 3373 spin_unlock(&sctx->stat_lock);
3372 return -ENOMEM; 3374 return -ENOMEM;
3373 } 3375 }
3374 bio->bi_size = 0;
3375 bio->bi_sector = physical_for_dev_replace >> 9;
3376 bio->bi_iter.bi_size = 0;
3377 bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
3376 bio->bi_bdev = dev->bdev; 3378 bio->bi_bdev = dev->bdev;
3377 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 3379 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3378 if (ret != PAGE_CACHE_SIZE) { 3380 if (ret != PAGE_CACHE_SIZE) {
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 945d1db98f26..9dde9717c1b9 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -24,12 +24,12 @@
24#include <linux/xattr.h> 24#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h> 25#include <linux/posix_acl_xattr.h>
26#include <linux/radix-tree.h> 26#include <linux/radix-tree.h>
27#include <linux/crc32c.h>
28#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
29#include <linux/string.h> 28#include <linux/string.h>
30 29
31#include "send.h" 30#include "send.h"
32#include "backref.h" 31#include "backref.h"
32#include "hash.h"
33#include "locking.h" 33#include "locking.h"
34#include "disk-io.h" 34#include "disk-io.h"
35#include "btrfs_inode.h" 35#include "btrfs_inode.h"
@@ -88,8 +88,6 @@ struct send_ctx {
88 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; 88 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
89 u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ 89 u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
90 90
91 struct vfsmount *mnt;
92
93 struct btrfs_root *send_root; 91 struct btrfs_root *send_root;
94 struct btrfs_root *parent_root; 92 struct btrfs_root *parent_root;
95 struct clone_root *clone_roots; 93 struct clone_root *clone_roots;
@@ -111,6 +109,7 @@ struct send_ctx {
111 int cur_inode_deleted; 109 int cur_inode_deleted;
112 u64 cur_inode_size; 110 u64 cur_inode_size;
113 u64 cur_inode_mode; 111 u64 cur_inode_mode;
112 u64 cur_inode_last_extent;
114 113
115 u64 send_progress; 114 u64 send_progress;
116 115
@@ -122,6 +121,74 @@ struct send_ctx {
122 int name_cache_size; 121 int name_cache_size;
123 122
124 char *read_buf; 123 char *read_buf;
124
125 /*
126 * We process inodes in increasing order, so if before an
127 * incremental send we reverse the parent/child relationship of
128 * directories such that a directory with a lower inode number was
129 * the parent of a directory with a higher inode number, and the one
130 * becoming the new parent got renamed too, we can't rename/move the
131 * directory with lower inode number when we finish processing it - we
132 * must process the directory with higher inode number first, then
133 * rename/move it and then rename/move the directory with lower inode
134 * number. Example follows.
135 *
136 * Tree state when the first send was performed:
137 *
138 * .
139 * |-- a (ino 257)
140 * |-- b (ino 258)
141 * |
142 * |
143 * |-- c (ino 259)
144 * | |-- d (ino 260)
145 * |
146 * |-- c2 (ino 261)
147 *
148 * Tree state when the second (incremental) send is performed:
149 *
150 * .
151 * |-- a (ino 257)
152 * |-- b (ino 258)
153 * |-- c2 (ino 261)
154 * |-- d2 (ino 260)
155 * |-- cc (ino 259)
156 *
157 * The sequence of steps that lead to the second state was:
158 *
159 * mv /a/b/c/d /a/b/c2/d2
160 * mv /a/b/c /a/b/c2/d2/cc
161 *
162 * "c" has lower inode number, but we can't move it (2nd mv operation)
163 * before we move "d", which has higher inode number.
164 *
165 * So we just memorize which move/rename operations must be performed
166 * later when their respective parent is processed and moved/renamed.
167 */
168
169 /* Indexed by parent directory inode number. */
170 struct rb_root pending_dir_moves;
171
172 /*
173 * Reverse index, indexed by the inode number of a directory that
174 * is waiting for the move/rename of its immediate parent before its
175 * own move/rename can be performed.
176 */
177 struct rb_root waiting_dir_moves;
178};
179
180struct pending_dir_move {
181 struct rb_node node;
182 struct list_head list;
183 u64 parent_ino;
184 u64 ino;
185 u64 gen;
186 struct list_head update_refs;
187};
188
189struct waiting_dir_move {
190 struct rb_node node;
191 u64 ino;
125}; 192};
126 193
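The comment above motivates the two rb-trees: when an incremental send meets a directory whose new parent has a higher inode number and was itself renamed, the rename is parked and replayed once that parent finishes. A toy model of the deferral using the inode numbers from the example (a plain array instead of rb-trees; every name here is hypothetical, not the kernel's):

#include <stdio.h>

struct pending_move {
	unsigned long long parent_ino; /* rename parked until this dir is done */
	unsigned long long ino;        /* directory whose rename is deferred */
	const char *from, *to;
	int done;
};

/* Stand-in for sctx->pending_dir_moves; the kernel keys an rb-tree by
 * parent_ino. Here, ino 259 ("c") must wait for ino 260 ("d"). */
static struct pending_move pending[] = {
	{ 260, 259, "/a/b/c", "/a/b/c2/d2/cc", 0 },
};

static void finish_inode(unsigned long long ino)
{
	unsigned int i;

	printf("finished inode %llu\n", ino);
	/* Like apply_children_dir_moves(): replay renames parked on 'ino'. */
	for (i = 0; i < sizeof(pending) / sizeof(pending[0]); i++) {
		if (!pending[i].done && pending[i].parent_ino == ino) {
			printf("  mv %s %s (deferred rename of inode %llu)\n",
			       pending[i].from, pending[i].to, pending[i].ino);
			pending[i].done = 1;
		}
	}
}

int main(void)
{
	/* Inodes are visited in increasing order, as in the send code. When
	 * 259 is visited, its own rename is parked (recorded above); once
	 * 260 is finished, the parked rename can finally be emitted. */
	finish_inode(259);
	finish_inode(260);
	return 0;
}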
127struct name_cache_entry { 194struct name_cache_entry {
@@ -145,6 +212,15 @@ struct name_cache_entry {
145 char name[]; 212 char name[];
146}; 213};
147 214
215static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
216
217static int need_send_hole(struct send_ctx *sctx)
218{
219 return (sctx->parent_root && !sctx->cur_inode_new &&
220 !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
221 S_ISREG(sctx->cur_inode_mode));
222}
223
148static void fs_path_reset(struct fs_path *p) 224static void fs_path_reset(struct fs_path *p)
149{ 225{
150 if (p->reversed) { 226 if (p->reversed) {
@@ -336,16 +412,6 @@ out:
336 return ret; 412 return ret;
337} 413}
338 414
339#if 0
340static void fs_path_remove(struct fs_path *p)
341{
342 BUG_ON(p->reversed);
343 while (p->start != p->end && *p->end != '/')
344 p->end--;
345 *p->end = 0;
346}
347#endif
348
349static int fs_path_copy(struct fs_path *p, struct fs_path *from) 415static int fs_path_copy(struct fs_path *p, struct fs_path *from)
350{ 416{
351 int ret; 417 int ret;
@@ -436,30 +502,15 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
436 return 0; 502 return 0;
437} 503}
438 504
439#if 0
440static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value)
441{
442 return tlv_put(sctx, attr, &value, sizeof(value));
443}
444
445static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value)
446{
447 __le16 tmp = cpu_to_le16(value);
448 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
449}
450
451static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value)
452{
453 __le32 tmp = cpu_to_le32(value);
454 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
455}
456#endif
457
458static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value)
459{
460 __le64 tmp = cpu_to_le64(value);
461 return tlv_put(sctx, attr, &tmp, sizeof(tmp));
462}
505#define TLV_PUT_DEFINE_INT(bits) \
506 static int tlv_put_u##bits(struct send_ctx *sctx, \
507 u##bits attr, u##bits value) \
508 { \
509 __le##bits __tmp = cpu_to_le##bits(value); \
510 return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
511 }
512
513TLV_PUT_DEFINE_INT(64)
463 514
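TLV_PUT_DEFINE_INT() collapses the four hand-written (and mostly #if 0'd) tlv_put_u8/u16/u32/u64 helpers into one token-pasting template, so TLV_PUT_DEFINE_INT(64) expands to a tlv_put_u64() equivalent to the old one. A stand-alone sketch of the technique, with tlv_put() stubbed out and kernel types replaced by stdint equivalents (cpu_to_le##bits() is deliberately dropped, so byte order is a simplifying assumption):

#include <stdint.h>
#include <stdio.h>

/* Stub for the real tlv_put(); just reports what would be serialized. */
static int tlv_put(int attr, const void *data, int len)
{
	(void)data;
	printf("attr %d: %d byte(s)\n", attr, len);
	return 0;
}

/* Same shape as TLV_PUT_DEFINE_INT() in send.c, minus the endian swap. */
#define TLV_PUT_DEFINE_INT(bits)					\
	static int tlv_put_u##bits(int attr, uint##bits##_t value)	\
	{								\
		uint##bits##_t __tmp = value;				\
		return tlv_put(attr, &__tmp, sizeof(__tmp));		\
	}

TLV_PUT_DEFINE_INT(16)
TLV_PUT_DEFINE_INT(64)

int main(void)
{
	tlv_put_u16(1, 0x1234);            /* prints: attr 1: 2 byte(s) */
	tlv_put_u64(2, 0x123456789abcULL); /* prints: attr 2: 8 byte(s) */
	return 0;
}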
464static int tlv_put_string(struct send_ctx *sctx, u16 attr, 515static int tlv_put_string(struct send_ctx *sctx, u16 attr,
465 const char *str, int len) 516 const char *str, int len)
@@ -475,17 +526,6 @@ static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
475 return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE); 526 return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
476} 527}
477 528
478#if 0
479static int tlv_put_timespec(struct send_ctx *sctx, u16 attr,
480 struct timespec *ts)
481{
482 struct btrfs_timespec bts;
483 bts.sec = cpu_to_le64(ts->tv_sec);
484 bts.nsec = cpu_to_le32(ts->tv_nsec);
485 return tlv_put(sctx, attr, &bts, sizeof(bts));
486}
487#endif
488
489static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, 529static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
490 struct extent_buffer *eb, 530 struct extent_buffer *eb,
491 struct btrfs_timespec *ts) 531 struct btrfs_timespec *ts)
@@ -533,12 +573,6 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
533 if (ret < 0) \ 573 if (ret < 0) \
534 goto tlv_put_failure; \ 574 goto tlv_put_failure; \
535 } while (0) 575 } while (0)
536#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \
537 do { \
538 ret = tlv_put_timespec(sctx, attrtype, ts); \
539 if (ret < 0) \
540 goto tlv_put_failure; \
541 } while (0)
542#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \ 576#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
543 do { \ 577 do { \
544 ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \ 578 ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
@@ -586,7 +620,7 @@ static int send_cmd(struct send_ctx *sctx)
586 hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); 620 hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
587 hdr->crc = 0; 621 hdr->crc = 0;
588 622
589 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
623 crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
590 hdr->crc = cpu_to_le32(crc); 624 hdr->crc = cpu_to_le32(crc);
591 625
592 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, 626 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -1270,7 +1304,7 @@ static int find_extent_clone(struct send_ctx *sctx,
1270 if (!backref_ctx->found_itself) { 1304 if (!backref_ctx->found_itself) {
1271 /* found a bug in backref code? */ 1305 /* found a bug in backref code? */
1272 ret = -EIO; 1306 ret = -EIO;
1273 printk(KERN_ERR "btrfs: ERROR did not find backref in "
1307 btrfs_err(sctx->send_root->fs_info, "did not find backref in "
1274 "send_root. inode=%llu, offset=%llu, " 1308 "send_root. inode=%llu, offset=%llu, "
1275 "disk_byte=%llu found extent=%llu\n", 1309 "disk_byte=%llu found extent=%llu\n",
1276 ino, data_offset, disk_byte, found_key.objectid); 1310 ino, data_offset, disk_byte, found_key.objectid);
@@ -1298,6 +1332,16 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1298 } 1332 }
1299 1333
1300 if (cur_clone_root) { 1334 if (cur_clone_root) {
1335 if (compressed != BTRFS_COMPRESS_NONE) {
1336 /*
1337 * Offsets given by iterate_extent_inodes() are relative
1338 * to the start of the extent; we need to add the logical
1339 * offset from the file extent item.
1340 * (See why at backref.c:check_extent_in_eb())
1341 */
1342 cur_clone_root->offset += btrfs_file_extent_offset(eb,
1343 fi);
1344 }
1301 *found = cur_clone_root; 1345 *found = cur_clone_root;
1302 ret = 0; 1346 ret = 0;
1303 } else { 1347 } else {
@@ -1343,7 +1387,7 @@ static int read_symlink(struct btrfs_root *root,
1343 BUG_ON(compression); 1387 BUG_ON(compression);
1344 1388
1345 off = btrfs_file_extent_inline_start(ei); 1389 off = btrfs_file_extent_inline_start(ei);
1346 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
1390 len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei);
1347 1391
1348 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len); 1392 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
1349 1393
@@ -1372,7 +1416,7 @@ static int gen_unique_name(struct send_ctx *sctx,
1372 return -ENOMEM; 1416 return -ENOMEM;
1373 1417
1374 while (1) { 1418 while (1) {
1375 len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu",
1419 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
1376 ino, gen, idx); 1420 ino, gen, idx);
1377 if (len >= sizeof(tmp)) { 1421 if (len >= sizeof(tmp)) {
1378 /* should really not happen */ 1422 /* should really not happen */
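The snprintf() change in gen_unique_name() works because C99 snprintf() counts the terminating NUL inside the size argument and returns the length it would have produced without truncation; passing sizeof(tmp) - 1 wasted a byte and left the len >= sizeof(tmp) check off by one. A small demonstration of that contract (buffer shrunk to force truncation):

#include <stdio.h>

int main(void)
{
	char tmp[8]; /* deliberately tiny; the kernel buffer is larger */
	int len;

	/* snprintf() writes at most sizeof(tmp) - 1 characters plus a NUL
	 * and returns the untruncated length, so len >= sizeof(tmp) is an
	 * exact truncation test - the pattern gen_unique_name() relies on. */
	len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", 257ULL, 1ULL, 0ULL);
	if (len >= (int)sizeof(tmp))
		printf("truncated: needed %d chars, buffer holds %d\n",
		       len, (int)sizeof(tmp) - 1);
	else
		printf("fits: \"%s\" (%d chars)\n", tmp, len);
	return 0;
}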
@@ -1933,6 +1977,7 @@ static void name_cache_free(struct send_ctx *sctx)
1933 */ 1977 */
1934static int __get_cur_name_and_parent(struct send_ctx *sctx, 1978static int __get_cur_name_and_parent(struct send_ctx *sctx,
1935 u64 ino, u64 gen, 1979 u64 ino, u64 gen,
1980 int skip_name_cache,
1936 u64 *parent_ino, 1981 u64 *parent_ino,
1937 u64 *parent_gen, 1982 u64 *parent_gen,
1938 struct fs_path *dest) 1983 struct fs_path *dest)
@@ -1942,6 +1987,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1942 struct btrfs_path *path = NULL; 1987 struct btrfs_path *path = NULL;
1943 struct name_cache_entry *nce = NULL; 1988 struct name_cache_entry *nce = NULL;
1944 1989
1990 if (skip_name_cache)
1991 goto get_ref;
1945 /* 1992 /*
1946 * First check if we already did a call to this function with the same 1993 * First check if we already did a call to this function with the same
1947 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes 1994 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
@@ -1986,11 +2033,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1986 goto out_cache; 2033 goto out_cache;
1987 } 2034 }
1988 2035
2036get_ref:
1989 /* 2037 /*
1990 * Depending on whether the inode was already processed or not, use 2038 * Depending on whether the inode was already processed or not, use
1991 * send_root or parent_root for ref lookup. 2039 * send_root or parent_root for ref lookup.
1992 */ 2040 */
1993 if (ino < sctx->send_progress)
2041 if (ino < sctx->send_progress && !skip_name_cache)
1994 ret = get_first_ref(sctx->send_root, ino, 2042 ret = get_first_ref(sctx->send_root, ino,
1995 parent_ino, parent_gen, dest); 2043 parent_ino, parent_gen, dest);
1996 else 2044 else
@@ -2014,6 +2062,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
2014 goto out; 2062 goto out;
2015 ret = 1; 2063 ret = 1;
2016 } 2064 }
2065 if (skip_name_cache)
2066 goto out;
2017 2067
2018out_cache: 2068out_cache:
2019 /* 2069 /*
@@ -2081,6 +2131,9 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2081 u64 parent_inode = 0; 2131 u64 parent_inode = 0;
2082 u64 parent_gen = 0; 2132 u64 parent_gen = 0;
2083 int stop = 0; 2133 int stop = 0;
2134 u64 start_ino = ino;
2135 u64 start_gen = gen;
2136 int skip_name_cache = 0;
2084 2137
2085 name = fs_path_alloc(); 2138 name = fs_path_alloc();
2086 if (!name) { 2139 if (!name) {
@@ -2088,19 +2141,32 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2088 goto out; 2141 goto out;
2089 } 2142 }
2090 2143
2144 if (is_waiting_for_move(sctx, ino))
2145 skip_name_cache = 1;
2146
2147again:
2091 dest->reversed = 1; 2148 dest->reversed = 1;
2092 fs_path_reset(dest); 2149 fs_path_reset(dest);
2093 2150
2094 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2151 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2095 fs_path_reset(name); 2152 fs_path_reset(name);
2096 2153
2097 ret = __get_cur_name_and_parent(sctx, ino, gen,
2154 ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache,
2098 &parent_inode, &parent_gen, name); 2155 &parent_inode, &parent_gen, name);
2099 if (ret < 0) 2156 if (ret < 0)
2100 goto out; 2157 goto out;
2101 if (ret) 2158 if (ret)
2102 stop = 1; 2159 stop = 1;
2103 2160
2161 if (!skip_name_cache &&
2162 is_waiting_for_move(sctx, parent_inode)) {
2163 ino = start_ino;
2164 gen = start_gen;
2165 stop = 0;
2166 skip_name_cache = 1;
2167 goto again;
2168 }
2169
2104 ret = fs_path_add_path(dest, name); 2170 ret = fs_path_add_path(dest, name);
2105 if (ret < 0) 2171 if (ret < 0)
2106 goto out; 2172 goto out;
@@ -2131,7 +2197,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
2131 char *name = NULL; 2197 char *name = NULL;
2132 int namelen; 2198 int namelen;
2133 2199
2134 path = alloc_path_for_send();
2200 path = btrfs_alloc_path();
2135 if (!path) 2201 if (!path)
2136 return -ENOMEM; 2202 return -ENOMEM;
2137 2203
@@ -2180,12 +2246,12 @@ static int send_subvol_begin(struct send_ctx *sctx)
2180 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, 2246 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2181 sctx->send_root->root_item.uuid); 2247 sctx->send_root->root_item.uuid);
2182 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, 2248 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2183 sctx->send_root->root_item.ctransid);
2249 le64_to_cpu(sctx->send_root->root_item.ctransid));
2184 if (parent_root) { 2250 if (parent_root) {
2185 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 2251 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2186 sctx->parent_root->root_item.uuid); 2252 sctx->parent_root->root_item.uuid);
2187 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 2253 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
2188 sctx->parent_root->root_item.ctransid);
2254 le64_to_cpu(sctx->parent_root->root_item.ctransid));
2189 } 2255 }
2190 2256
2191 ret = send_cmd(sctx); 2257 ret = send_cmd(sctx);
@@ -2672,10 +2738,347 @@ out:
2672 return ret; 2738 return ret;
2673} 2739}
2674 2740
2741static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
2742{
2743 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
2744 struct waiting_dir_move *entry;
2745
2746 while (n) {
2747 entry = rb_entry(n, struct waiting_dir_move, node);
2748 if (ino < entry->ino)
2749 n = n->rb_left;
2750 else if (ino > entry->ino)
2751 n = n->rb_right;
2752 else
2753 return 1;
2754 }
2755 return 0;
2756}
2757
2758static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2759{
2760 struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
2761 struct rb_node *parent = NULL;
2762 struct waiting_dir_move *entry, *dm;
2763
2764 dm = kmalloc(sizeof(*dm), GFP_NOFS);
2765 if (!dm)
2766 return -ENOMEM;
2767 dm->ino = ino;
2768
2769 while (*p) {
2770 parent = *p;
2771 entry = rb_entry(parent, struct waiting_dir_move, node);
2772 if (ino < entry->ino) {
2773 p = &(*p)->rb_left;
2774 } else if (ino > entry->ino) {
2775 p = &(*p)->rb_right;
2776 } else {
2777 kfree(dm);
2778 return -EEXIST;
2779 }
2780 }
2781
2782 rb_link_node(&dm->node, parent, p);
2783 rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
2784 return 0;
2785}
2786
2787static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2788{
2789 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
2790 struct waiting_dir_move *entry;
2791
2792 while (n) {
2793 entry = rb_entry(n, struct waiting_dir_move, node);
2794 if (ino < entry->ino) {
2795 n = n->rb_left;
2796 } else if (ino > entry->ino) {
2797 n = n->rb_right;
2798 } else {
2799 rb_erase(&entry->node, &sctx->waiting_dir_moves);
2800 kfree(entry);
2801 return 0;
2802 }
2803 }
2804 return -ENOENT;
2805}
2806
2807static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
2808{
2809 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
2810 struct rb_node *parent = NULL;
2811 struct pending_dir_move *entry, *pm;
2812 struct recorded_ref *cur;
2813 int exists = 0;
2814 int ret;
2815
2816 pm = kmalloc(sizeof(*pm), GFP_NOFS);
2817 if (!pm)
2818 return -ENOMEM;
2819 pm->parent_ino = parent_ino;
2820 pm->ino = sctx->cur_ino;
2821 pm->gen = sctx->cur_inode_gen;
2822 INIT_LIST_HEAD(&pm->list);
2823 INIT_LIST_HEAD(&pm->update_refs);
2824 RB_CLEAR_NODE(&pm->node);
2825
2826 while (*p) {
2827 parent = *p;
2828 entry = rb_entry(parent, struct pending_dir_move, node);
2829 if (parent_ino < entry->parent_ino) {
2830 p = &(*p)->rb_left;
2831 } else if (parent_ino > entry->parent_ino) {
2832 p = &(*p)->rb_right;
2833 } else {
2834 exists = 1;
2835 break;
2836 }
2837 }
2838
2839 list_for_each_entry(cur, &sctx->deleted_refs, list) {
2840 ret = dup_ref(cur, &pm->update_refs);
2841 if (ret < 0)
2842 goto out;
2843 }
2844 list_for_each_entry(cur, &sctx->new_refs, list) {
2845 ret = dup_ref(cur, &pm->update_refs);
2846 if (ret < 0)
2847 goto out;
2848 }
2849
2850 ret = add_waiting_dir_move(sctx, pm->ino);
2851 if (ret)
2852 goto out;
2853
2854 if (exists) {
2855 list_add_tail(&pm->list, &entry->list);
2856 } else {
2857 rb_link_node(&pm->node, parent, p);
2858 rb_insert_color(&pm->node, &sctx->pending_dir_moves);
2859 }
2860 ret = 0;
2861out:
2862 if (ret) {
2863 __free_recorded_refs(&pm->update_refs);
2864 kfree(pm);
2865 }
2866 return ret;
2867}
2868
2869static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
2870 u64 parent_ino)
2871{
2872 struct rb_node *n = sctx->pending_dir_moves.rb_node;
2873 struct pending_dir_move *entry;
2874
2875 while (n) {
2876 entry = rb_entry(n, struct pending_dir_move, node);
2877 if (parent_ino < entry->parent_ino)
2878 n = n->rb_left;
2879 else if (parent_ino > entry->parent_ino)
2880 n = n->rb_right;
2881 else
2882 return entry;
2883 }
2884 return NULL;
2885}
2886
2887static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2888{
2889 struct fs_path *from_path = NULL;
2890 struct fs_path *to_path = NULL;
2891 u64 orig_progress = sctx->send_progress;
2892 struct recorded_ref *cur;
2893 int ret;
2894
2895 from_path = fs_path_alloc();
2896 if (!from_path)
2897 return -ENOMEM;
2898
2899 sctx->send_progress = pm->ino;
2900 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
2901 if (ret < 0)
2902 goto out;
2903
2904 to_path = fs_path_alloc();
2905 if (!to_path) {
2906 ret = -ENOMEM;
2907 goto out;
2908 }
2909
2910 sctx->send_progress = sctx->cur_ino + 1;
2911 ret = del_waiting_dir_move(sctx, pm->ino);
2912 ASSERT(ret == 0);
2913
2914 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
2915 if (ret < 0)
2916 goto out;
2917
2918 ret = send_rename(sctx, from_path, to_path);
2919 if (ret < 0)
2920 goto out;
2921
2922 ret = send_utimes(sctx, pm->ino, pm->gen);
2923 if (ret < 0)
2924 goto out;
2925
2926 /*
2927 * After rename/move, need to update the utimes of both new parent(s)
2928 * and old parent(s).
2929 */
2930 list_for_each_entry(cur, &pm->update_refs, list) {
2931 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
2932 if (ret < 0)
2933 goto out;
2934 }
2935
2936out:
2937 fs_path_free(from_path);
2938 fs_path_free(to_path);
2939 sctx->send_progress = orig_progress;
2940
2941 return ret;
2942}
2943
2944static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
2945{
2946 if (!list_empty(&m->list))
2947 list_del(&m->list);
2948 if (!RB_EMPTY_NODE(&m->node))
2949 rb_erase(&m->node, &sctx->pending_dir_moves);
2950 __free_recorded_refs(&m->update_refs);
2951 kfree(m);
2952}
2953
2954static void tail_append_pending_moves(struct pending_dir_move *moves,
2955 struct list_head *stack)
2956{
2957 if (list_empty(&moves->list)) {
2958 list_add_tail(&moves->list, stack);
2959 } else {
2960 LIST_HEAD(list);
2961 list_splice_init(&moves->list, &list);
2962 list_add_tail(&moves->list, stack);
2963 list_splice_tail(&list, stack);
2964 }
2965}
2966
2967static int apply_children_dir_moves(struct send_ctx *sctx)
2968{
2969 struct pending_dir_move *pm;
2970 struct list_head stack;
2971 u64 parent_ino = sctx->cur_ino;
2972 int ret = 0;
2973
2974 pm = get_pending_dir_moves(sctx, parent_ino);
2975 if (!pm)
2976 return 0;
2977
2978 INIT_LIST_HEAD(&stack);
2979 tail_append_pending_moves(pm, &stack);
2980
2981 while (!list_empty(&stack)) {
2982 pm = list_first_entry(&stack, struct pending_dir_move, list);
2983 parent_ino = pm->ino;
2984 ret = apply_dir_move(sctx, pm);
2985 free_pending_move(sctx, pm);
2986 if (ret)
2987 goto out;
2988 pm = get_pending_dir_moves(sctx, parent_ino);
2989 if (pm)
2990 tail_append_pending_moves(pm, &stack);
2991 }
2992 return 0;
2993
2994out:
2995 while (!list_empty(&stack)) {
2996 pm = list_first_entry(&stack, struct pending_dir_move, list);
2997 free_pending_move(sctx, pm);
2998 }
2999 return ret;
3000}
3001
3002static int wait_for_parent_move(struct send_ctx *sctx,
3003 struct recorded_ref *parent_ref)
3004{
3005 int ret;
3006 u64 ino = parent_ref->dir;
3007 u64 parent_ino_before, parent_ino_after;
3008 u64 new_gen, old_gen;
3009 struct fs_path *path_before = NULL;
3010 struct fs_path *path_after = NULL;
3011 int len1, len2;
3012
3013 if (parent_ref->dir <= sctx->cur_ino)
3014 return 0;
3015
3016 if (is_waiting_for_move(sctx, ino))
3017 return 1;
3018
3019 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
3020 NULL, NULL, NULL, NULL);
3021 if (ret == -ENOENT)
3022 return 0;
3023 else if (ret < 0)
3024 return ret;
3025
3026 ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen,
3027 NULL, NULL, NULL, NULL);
3028 if (ret < 0)
3029 return ret;
3030
3031 if (new_gen != old_gen)
3032 return 0;
3033
3034 path_before = fs_path_alloc();
3035 if (!path_before)
3036 return -ENOMEM;
3037
3038 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3039 NULL, path_before);
3040 if (ret == -ENOENT) {
3041 ret = 0;
3042 goto out;
3043 } else if (ret < 0) {
3044 goto out;
3045 }
3046
3047 path_after = fs_path_alloc();
3048 if (!path_after) {
3049 ret = -ENOMEM;
3050 goto out;
3051 }
3052
3053 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3054 NULL, path_after);
3055 if (ret == -ENOENT) {
3056 ret = 0;
3057 goto out;
3058 } else if (ret < 0) {
3059 goto out;
3060 }
3061
3062 len1 = fs_path_len(path_before);
3063 len2 = fs_path_len(path_after);
3064 if ((parent_ino_before != parent_ino_after) && (len1 != len2 ||
3065 memcmp(path_before->start, path_after->start, len1))) {
3066 ret = 1;
3067 goto out;
3068 }
3069 ret = 0;
3070
3071out:
3072 fs_path_free(path_before);
3073 fs_path_free(path_after);
3074
3075 return ret;
3076}
3077
2675/* 3078/*
2676 * This does all the move/link/unlink/rmdir magic. 3079 * This does all the move/link/unlink/rmdir magic.
2677 */ 3080 */
2678static int process_recorded_refs(struct send_ctx *sctx)
3081static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
2679{ 3082{
2680 int ret = 0; 3083 int ret = 0;
2681 struct recorded_ref *cur; 3084 struct recorded_ref *cur;
@@ -2824,11 +3227,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
2824 * dirs, we always have one new and one deleted 3227 * dirs, we always have one new and one deleted
2825 * ref. The deleted ref is ignored later. 3228 * ref. The deleted ref is ignored later.
2826 */ 3229 */
2827 ret = send_rename(sctx, valid_path,
2828 cur->full_path);
2829 if (ret < 0)
2830 goto out;
2831 ret = fs_path_copy(valid_path, cur->full_path);
3230 if (wait_for_parent_move(sctx, cur)) {
3231 ret = add_pending_dir_move(sctx,
3232 cur->dir);
3233 *pending_move = 1;
3234 } else {
3235 ret = send_rename(sctx, valid_path,
3236 cur->full_path);
3237 if (!ret)
3238 ret = fs_path_copy(valid_path,
3239 cur->full_path);
3240 }
2832 if (ret < 0) 3241 if (ret < 0)
2833 goto out; 3242 goto out;
2834 } else { 3243 } else {
@@ -3197,6 +3606,7 @@ static int process_all_refs(struct send_ctx *sctx,
3197 struct extent_buffer *eb; 3606 struct extent_buffer *eb;
3198 int slot; 3607 int slot;
3199 iterate_inode_ref_t cb; 3608 iterate_inode_ref_t cb;
3609 int pending_move = 0;
3200 3610
3201 path = alloc_path_for_send(); 3611 path = alloc_path_for_send();
3202 if (!path) 3612 if (!path)
@@ -3240,7 +3650,9 @@ static int process_all_refs(struct send_ctx *sctx,
3240 } 3650 }
3241 btrfs_release_path(path); 3651 btrfs_release_path(path);
3242 3652
3243 ret = process_recorded_refs(sctx);
3653 ret = process_recorded_refs(sctx, &pending_move);
3654 /* Only applicable to an incremental send. */
3655 ASSERT(pending_move == 0);
3244 3656
3245out: 3657out:
3246 btrfs_free_path(path); 3658 btrfs_free_path(path);
@@ -3706,7 +4118,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
3706 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, 4118 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
3707 clone_root->root->root_item.uuid); 4119 clone_root->root->root_item.uuid);
3708 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, 4120 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
3709 clone_root->root->root_item.ctransid);
4121 le64_to_cpu(clone_root->root->root_item.ctransid));
3710 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); 4122 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
3711 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET, 4123 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
3712 clone_root->offset); 4124 clone_root->offset);
@@ -3752,6 +4164,39 @@ out:
3752 return ret; 4164 return ret;
3753} 4165}
3754 4166
4167static int send_hole(struct send_ctx *sctx, u64 end)
4168{
4169 struct fs_path *p = NULL;
4170 u64 offset = sctx->cur_inode_last_extent;
4171 u64 len;
4172 int ret = 0;
4173
4174 p = fs_path_alloc();
4175 if (!p)
4176 return -ENOMEM;
4177 memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
4178 while (offset < end) {
4179 len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
4180
4181 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
4182 if (ret < 0)
4183 break;
4184 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4185 if (ret < 0)
4186 break;
4187 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
4188 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
4189 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
4190 ret = send_cmd(sctx);
4191 if (ret < 0)
4192 break;
4193 offset += len;
4194 }
4195tlv_put_failure:
4196 fs_path_free(p);
4197 return ret;
4198}
4199
3755static int send_write_or_clone(struct send_ctx *sctx, 4200static int send_write_or_clone(struct send_ctx *sctx,
3756 struct btrfs_path *path, 4201 struct btrfs_path *path,
3757 struct btrfs_key *key, 4202 struct btrfs_key *key,
@@ -3764,12 +4209,14 @@ static int send_write_or_clone(struct send_ctx *sctx,
3764 u64 len; 4209 u64 len;
3765 u32 l; 4210 u32 l;
3766 u8 type; 4211 u8 type;
4212 u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
3767 4213
3768 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 4214 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3769 struct btrfs_file_extent_item); 4215 struct btrfs_file_extent_item);
3770 type = btrfs_file_extent_type(path->nodes[0], ei); 4216 type = btrfs_file_extent_type(path->nodes[0], ei);
3771 if (type == BTRFS_FILE_EXTENT_INLINE) { 4217 if (type == BTRFS_FILE_EXTENT_INLINE) {
3772 len = btrfs_file_extent_inline_len(path->nodes[0], ei);
4218 len = btrfs_file_extent_inline_len(path->nodes[0],
4219 path->slots[0], ei);
3773 /* 4220 /*
3774 * it is possible the inline item won't cover the whole page, 4221 * it is possible the inline item won't cover the whole page,
3775 * but there may be items after this page. Make 4222 * but there may be items after this page. Make
@@ -3787,7 +4234,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
3787 goto out; 4234 goto out;
3788 } 4235 }
3789 4236
3790 if (clone_root) {
4237 if (clone_root && IS_ALIGNED(offset + len, bs)) {
3791 ret = send_clone(sctx, offset, len, clone_root); 4238 ret = send_clone(sctx, offset, len, clone_root);
3792 } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { 4239 } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
3793 ret = send_update_extent(sctx, offset, len); 4240 ret = send_update_extent(sctx, offset, len);
@@ -3979,6 +4426,101 @@ out:
3979 return ret; 4426 return ret;
3980} 4427}
3981 4428
4429static int get_last_extent(struct send_ctx *sctx, u64 offset)
4430{
4431 struct btrfs_path *path;
4432 struct btrfs_root *root = sctx->send_root;
4433 struct btrfs_file_extent_item *fi;
4434 struct btrfs_key key;
4435 u64 extent_end;
4436 u8 type;
4437 int ret;
4438
4439 path = alloc_path_for_send();
4440 if (!path)
4441 return -ENOMEM;
4442
4443 sctx->cur_inode_last_extent = 0;
4444
4445 key.objectid = sctx->cur_ino;
4446 key.type = BTRFS_EXTENT_DATA_KEY;
4447 key.offset = offset;
4448 ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
4449 if (ret < 0)
4450 goto out;
4451 ret = 0;
4452 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4453 if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
4454 goto out;
4455
4456 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
4457 struct btrfs_file_extent_item);
4458 type = btrfs_file_extent_type(path->nodes[0], fi);
4459 if (type == BTRFS_FILE_EXTENT_INLINE) {
4460 u64 size = btrfs_file_extent_inline_len(path->nodes[0],
4461 path->slots[0], fi);
4462 extent_end = ALIGN(key.offset + size,
4463 sctx->send_root->sectorsize);
4464 } else {
4465 extent_end = key.offset +
4466 btrfs_file_extent_num_bytes(path->nodes[0], fi);
4467 }
4468 sctx->cur_inode_last_extent = extent_end;
4469out:
4470 btrfs_free_path(path);
4471 return ret;
4472}
4473
4474static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
4475 struct btrfs_key *key)
4476{
4477 struct btrfs_file_extent_item *fi;
4478 u64 extent_end;
4479 u8 type;
4480 int ret = 0;
4481
4482 if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
4483 return 0;
4484
4485 if (sctx->cur_inode_last_extent == (u64)-1) {
4486 ret = get_last_extent(sctx, key->offset - 1);
4487 if (ret)
4488 return ret;
4489 }
4490
4491 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
4492 struct btrfs_file_extent_item);
4493 type = btrfs_file_extent_type(path->nodes[0], fi);
4494 if (type == BTRFS_FILE_EXTENT_INLINE) {
4495 u64 size = btrfs_file_extent_inline_len(path->nodes[0],
4496 path->slots[0], fi);
4497 extent_end = ALIGN(key->offset + size,
4498 sctx->send_root->sectorsize);
4499 } else {
4500 extent_end = key->offset +
4501 btrfs_file_extent_num_bytes(path->nodes[0], fi);
4502 }
4503
4504 if (path->slots[0] == 0 &&
4505 sctx->cur_inode_last_extent < key->offset) {
4506 /*
4507 * We might have skipped entire leaves that contained only
4508 * file extent items for our current inode. These leaves have
4509 * a generation number smaller (older) than the one in the
4510 * current leaf and the leaf our last extent came from, and
4511 * are located between these two leaves.
4512 */
4513 ret = get_last_extent(sctx, key->offset - 1);
4514 if (ret)
4515 return ret;
4516 }
4517
4518 if (sctx->cur_inode_last_extent < key->offset)
4519 ret = send_hole(sctx, key->offset);
4520 sctx->cur_inode_last_extent = extent_end;
4521 return ret;
4522}
4523
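maybe_send_hole() turns the implicit gap between the end of the last seen extent (cur_inode_last_extent) and the offset of the current file extent item into an explicit hole. Reduced to its arithmetic, the detection looks like this (an illustrative sketch with hard-coded extents):

#include <stdio.h>

int main(void)
{
	/* (offset, length) pairs of file extent items, in key order. */
	unsigned long long extents[][2] = {
		{ 0, 4096 }, { 16384, 4096 }, { 20480, 8192 },
	};
	unsigned long long last_extent = 0; /* plays cur_inode_last_extent */
	unsigned int i;

	for (i = 0; i < sizeof(extents) / sizeof(extents[0]); i++) {
		unsigned long long offset = extents[i][0];
		unsigned long long extent_end = extents[i][0] + extents[i][1];

		if (last_extent < offset) /* the test in maybe_send_hole() */
			printf("hole [%llu, %llu)\n", last_extent, offset);
		last_extent = extent_end;
	}
	return 0; /* prints: hole [4096, 16384) */
}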
3982static int process_extent(struct send_ctx *sctx, 4524static int process_extent(struct send_ctx *sctx,
3983 struct btrfs_path *path, 4525 struct btrfs_path *path,
3984 struct btrfs_key *key) 4526 struct btrfs_key *key)
@@ -3995,7 +4537,7 @@ static int process_extent(struct send_ctx *sctx,
3995 goto out; 4537 goto out;
3996 if (ret) { 4538 if (ret) {
3997 ret = 0; 4539 ret = 0;
3998 goto out;
4540 goto out_hole;
3999 } 4541 }
4000 } else { 4542 } else {
4001 struct btrfs_file_extent_item *ei; 4543 struct btrfs_file_extent_item *ei;
@@ -4031,7 +4573,10 @@ static int process_extent(struct send_ctx *sctx,
4031 goto out; 4573 goto out;
4032 4574
4033 ret = send_write_or_clone(sctx, path, key, found_clone); 4575 ret = send_write_or_clone(sctx, path, key, found_clone);
4034
4576 if (ret)
4577 goto out;
4578out_hole:
4579 ret = maybe_send_hole(sctx, path, key);
4035out: 4580out:
4036 return ret; 4581 return ret;
4037} 4582}
@@ -4054,17 +4599,25 @@ static int process_all_extents(struct send_ctx *sctx)
4054 key.objectid = sctx->cmp_key->objectid; 4599 key.objectid = sctx->cmp_key->objectid;
4055 key.type = BTRFS_EXTENT_DATA_KEY; 4600 key.type = BTRFS_EXTENT_DATA_KEY;
4056 key.offset = 0; 4601 key.offset = 0;
4057 while (1) {
4058 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
4059 if (ret < 0)
4060 goto out;
4061 if (ret) {
4062 ret = 0;
4063 goto out;
4064 }
4602 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4603 if (ret < 0)
4604 goto out;
4065 4605
4606 while (1) {
4066 eb = path->nodes[0]; 4607 eb = path->nodes[0];
4067 slot = path->slots[0]; 4608 slot = path->slots[0];
4609
4610 if (slot >= btrfs_header_nritems(eb)) {
4611 ret = btrfs_next_leaf(root, path);
4612 if (ret < 0) {
4613 goto out;
4614 } else if (ret > 0) {
4615 ret = 0;
4616 break;
4617 }
4618 continue;
4619 }
4620
4068 btrfs_item_key_to_cpu(eb, &found_key, slot); 4621 btrfs_item_key_to_cpu(eb, &found_key, slot);
4069 4622
4070 if (found_key.objectid != key.objectid || 4623 if (found_key.objectid != key.objectid ||
@@ -4077,8 +4630,7 @@ static int process_all_extents(struct send_ctx *sctx)
4077 if (ret < 0) 4630 if (ret < 0)
4078 goto out; 4631 goto out;
4079 4632
4080 btrfs_release_path(path);
4081 key.offset = found_key.offset + 1;
4633 path->slots[0]++;
4082 } 4634 }
4083 4635
4084out: 4636out:
@@ -4086,7 +4638,9 @@ out:
4086 return ret; 4638 return ret;
4087} 4639}
4088 4640
4089static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
4641static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
4642 int *pending_move,
4643 int *refs_processed)
4090{ 4644{
4091 int ret = 0; 4645 int ret = 0;
4092 4646
@@ -4098,17 +4652,11 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
4098 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) 4652 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
4099 goto out; 4653 goto out;
4100 4654
4101 ret = process_recorded_refs(sctx);
4655 ret = process_recorded_refs(sctx, pending_move);
4102 if (ret < 0) 4656 if (ret < 0)
4103 goto out; 4657 goto out;
4104 4658
4105 /*
4106 * We have processed the refs and thus need to advance send_progress.
4107 * Now, calls to get_cur_xxx will take the updated refs of the current
4108 * inode into account.
4109 */
4110 sctx->send_progress = sctx->cur_ino + 1;
4111
4659 *refs_processed = 1;
4112out: 4660out:
4113 return ret; 4661 return ret;
4114} 4662}
@@ -4124,11 +4672,29 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4124 u64 right_gid; 4672 u64 right_gid;
4125 int need_chmod = 0; 4673 int need_chmod = 0;
4126 int need_chown = 0; 4674 int need_chown = 0;
4675 int pending_move = 0;
4676 int refs_processed = 0;
4127 4677
4128 ret = process_recorded_refs_if_needed(sctx, at_end);
4678 ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
4679 &refs_processed);
4129 if (ret < 0) 4680 if (ret < 0)
4130 goto out; 4681 goto out;
4131 4682
4683 /*
4684 * We have processed the refs and thus need to advance send_progress.
4685 * Now, calls to get_cur_xxx will take the updated refs of the current
4686 * inode into account.
4687 *
4688 * On the other hand, if our current inode is a directory and couldn't
4689 * be moved/renamed because its parent was renamed/moved too and it has
4690 * a higher inode number, we can only move/rename our current inode
4691 * after we moved/renamed its parent. Therefore in this case operate on
4692 * the old path (pre move/rename) of our current inode, and the
4693 * move/rename will be performed later.
4694 */
4695 if (refs_processed && !pending_move)
4696 sctx->send_progress = sctx->cur_ino + 1;
4697
4132 if (sctx->cur_ino == 0 || sctx->cur_inode_deleted) 4698 if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
4133 goto out; 4699 goto out;
4134 if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino) 4700 if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
@@ -4157,6 +4723,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4157 } 4723 }
4158 4724
4159 if (S_ISREG(sctx->cur_inode_mode)) { 4725 if (S_ISREG(sctx->cur_inode_mode)) {
4726 if (need_send_hole(sctx)) {
4727 if (sctx->cur_inode_last_extent == (u64)-1) {
4728 ret = get_last_extent(sctx, (u64)-1);
4729 if (ret)
4730 goto out;
4731 }
4732 if (sctx->cur_inode_last_extent <
4733 sctx->cur_inode_size) {
4734 ret = send_hole(sctx, sctx->cur_inode_size);
4735 if (ret)
4736 goto out;
4737 }
4738 }
4160 ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen, 4739 ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4161 sctx->cur_inode_size); 4740 sctx->cur_inode_size);
4162 if (ret < 0) 4741 if (ret < 0)
@@ -4177,9 +4756,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4177 } 4756 }
4178 4757
4179 /* 4758 /*
4180 * Need to send that every time, no matter if it actually changed
4181 * between the two trees as we have done changes to the inode before.
4759 * If other directory inodes depended on our current directory
4760 * inode's move/rename, now do their move/rename operations.
4761 */
4762 if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
4763 ret = apply_children_dir_moves(sctx);
4764 if (ret)
4765 goto out;
4766 }
4767
4768 /*
4769 * Need to send that every time, no matter if it actually
4770 * changed between the two trees as we have done changes to
4771 * the inode before.
4182 */ 4772 */
4773 sctx->send_progress = sctx->cur_ino + 1;
4183 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); 4774 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
4184 if (ret < 0) 4775 if (ret < 0)
4185 goto out; 4776 goto out;
@@ -4200,6 +4791,7 @@ static int changed_inode(struct send_ctx *sctx,
4200 4791
4201 sctx->cur_ino = key->objectid; 4792 sctx->cur_ino = key->objectid;
4202 sctx->cur_inode_new_gen = 0; 4793 sctx->cur_inode_new_gen = 0;
4794 sctx->cur_inode_last_extent = (u64)-1;
4203 4795
4204 /* 4796 /*
4205 * Set send_progress to current inode. This will tell all get_cur_xxx 4797 * Set send_progress to current inode. This will tell all get_cur_xxx
@@ -4480,14 +5072,18 @@ static int changed_cb(struct btrfs_root *left_root,
4480 struct send_ctx *sctx = ctx; 5072 struct send_ctx *sctx = ctx;
4481 5073
4482 if (result == BTRFS_COMPARE_TREE_SAME) { 5074 if (result == BTRFS_COMPARE_TREE_SAME) {
4483 if (key->type != BTRFS_INODE_REF_KEY &&
4484 key->type != BTRFS_INODE_EXTREF_KEY)
4485 return 0;
4486 ret = compare_refs(sctx, left_path, key);
4487 if (!ret)
4488 return 0;
4489 if (ret < 0)
4490 return ret;
5075 if (key->type == BTRFS_INODE_REF_KEY ||
5076 key->type == BTRFS_INODE_EXTREF_KEY) {
5077 ret = compare_refs(sctx, left_path, key);
5078 if (!ret)
5079 return 0;
5080 if (ret < 0)
5081 return ret;
5082 } else if (key->type == BTRFS_EXTENT_DATA_KEY) {
5083 return maybe_send_hole(sctx, left_path, key);
5084 } else {
5085 return 0;
5086 }
4491 result = BTRFS_COMPARE_TREE_CHANGED; 5087 result = BTRFS_COMPARE_TREE_CHANGED;
4492 ret = 0; 5088 ret = 0;
4493 } 5089 }
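The rewritten branch above replaces the old early-return with a three-way dispatch for items the tree comparison reports as identical: refs may still differ in detail and get compared in depth, file extent items still have to feed the hole tracker, and everything else is skipped. The shape of that dispatch, as an illustrative standalone function:

enum same_item_kind { ITEM_INODE_REF, ITEM_INODE_EXTREF,
                      ITEM_EXTENT_DATA, ITEM_OTHER };
enum same_item_action { ACT_SKIP, ACT_COMPARE_REFS, ACT_TRACK_HOLES };

/* Dispatch for "same" items, mirroring the hunk above. */
static enum same_item_action classify_same_item(enum same_item_kind kind)
{
        switch (kind) {
        case ITEM_INODE_REF:
        case ITEM_INODE_EXTREF:
                return ACT_COMPARE_REFS;
        case ITEM_EXTENT_DATA:
                return ACT_TRACK_HOLES;
        default:
                return ACT_SKIP;
        }
}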
@@ -4522,7 +5118,6 @@ out:
4522static int full_send_tree(struct send_ctx *sctx) 5118static int full_send_tree(struct send_ctx *sctx)
4523{ 5119{
4524 int ret; 5120 int ret;
4525 struct btrfs_trans_handle *trans = NULL;
4526 struct btrfs_root *send_root = sctx->send_root; 5121 struct btrfs_root *send_root = sctx->send_root;
4527 struct btrfs_key key; 5122 struct btrfs_key key;
4528 struct btrfs_key found_key; 5123 struct btrfs_key found_key;
@@ -4544,19 +5139,6 @@ static int full_send_tree(struct send_ctx *sctx)
4544 key.type = BTRFS_INODE_ITEM_KEY; 5139 key.type = BTRFS_INODE_ITEM_KEY;
4545 key.offset = 0; 5140 key.offset = 0;
4546 5141
4547join_trans:
4548 /*
4549 * We need to make sure the transaction does not get committed
4550 * while we do anything on commit roots. Join a transaction to prevent
4551 * this.
4552 */
4553 trans = btrfs_join_transaction(send_root);
4554 if (IS_ERR(trans)) {
4555 ret = PTR_ERR(trans);
4556 trans = NULL;
4557 goto out;
4558 }
4559
4560 /* 5142 /*
4561 * Make sure the tree has not changed after re-joining. We detect this 5143 * Make sure the tree has not changed after re-joining. We detect this
4562 * by comparing start_ctransid and ctransid. They should always match. 5144 * by comparing start_ctransid and ctransid. They should always match.
@@ -4566,7 +5148,7 @@ join_trans:
4566 spin_unlock(&send_root->root_item_lock); 5148 spin_unlock(&send_root->root_item_lock);
4567 5149
4568 if (ctransid != start_ctransid) { 5150 if (ctransid != start_ctransid) {
4569 WARN(1, KERN_WARNING "btrfs: the root that you're trying to " 5151 WARN(1, KERN_WARNING "BTRFS: the root that you're trying to "
4570 "send was modified in between. This is " 5152 "send was modified in between. This is "
4571 "probably a bug.\n"); 5153 "probably a bug.\n");
4572 ret = -EIO; 5154 ret = -EIO;
@@ -4580,19 +5162,6 @@ join_trans:
4580 goto out_finish; 5162 goto out_finish;
4581 5163
4582 while (1) { 5164 while (1) {
4583 /*
4584 * When someone want to commit while we iterate, end the
4585 * joined transaction and rejoin.
4586 */
4587 if (btrfs_should_end_transaction(trans, send_root)) {
4588 ret = btrfs_end_transaction(trans, send_root);
4589 trans = NULL;
4590 if (ret < 0)
4591 goto out;
4592 btrfs_release_path(path);
4593 goto join_trans;
4594 }
4595
4596 eb = path->nodes[0]; 5165 eb = path->nodes[0];
4597 slot = path->slots[0]; 5166 slot = path->slots[0];
4598 btrfs_item_key_to_cpu(eb, &found_key, slot); 5167 btrfs_item_key_to_cpu(eb, &found_key, slot);
@@ -4620,12 +5189,6 @@ out_finish:
4620 5189
4621out: 5190out:
4622 btrfs_free_path(path); 5191 btrfs_free_path(path);
4623 if (trans) {
4624 if (!ret)
4625 ret = btrfs_end_transaction(trans, send_root);
4626 else
4627 btrfs_end_transaction(trans, send_root);
4628 }
4629 return ret; 5192 return ret;
4630} 5193}
4631 5194
@@ -4662,6 +5225,21 @@ out:
4662 return ret; 5225 return ret;
4663} 5226}
4664 5227
5228static void btrfs_root_dec_send_in_progress(struct btrfs_root *root)
5229{
5230 spin_lock(&root->root_item_lock);
5231 root->send_in_progress--;
5232 /*
5233 * Not much left to do, we don't know why it's unbalanced and
5234 * can't blindly reset it to 0.
5235 */
5236 if (root->send_in_progress < 0)
5237 btrfs_err(root->fs_info,
5238 "send_in_progres unbalanced %d root %llu\n",
5239 root->send_in_progress, root->root_key.objectid);
5240 spin_unlock(&root->root_item_lock);
5241}
5242
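The helper above is the release half of the send_in_progress accounting that replaces the old long-running transaction join: the counter is bumped under root_item_lock before any send work starts and dropped through this one function on every exit path. A sketch of the paired pattern on a hypothetical structure:

#include <linux/bug.h>
#include <linux/spinlock.h>

struct example_root {
        spinlock_t lock;        /* set up with spin_lock_init() */
        int send_in_progress;
};

/* Pin: taken before send starts, so a concurrent attempt to flip the
 * subvolume read-write can be refused while the counter is nonzero. */
static void example_pin_for_send(struct example_root *r)
{
        spin_lock(&r->lock);
        r->send_in_progress++;
        spin_unlock(&r->lock);
}

/* Unpin: every exit path must pass through exactly one decrement; a
 * negative value indicates an unbalanced caller. */
static void example_unpin_for_send(struct example_root *r)
{
        spin_lock(&r->lock);
        WARN_ON(--r->send_in_progress < 0);
        spin_unlock(&r->lock);
}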
4665long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) 5243long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4666{ 5244{
4667 int ret = 0; 5245 int ret = 0;
@@ -4673,6 +5251,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4673 struct send_ctx *sctx = NULL; 5251 struct send_ctx *sctx = NULL;
4674 u32 i; 5252 u32 i;
4675 u64 *clone_sources_tmp = NULL; 5253 u64 *clone_sources_tmp = NULL;
5254 int clone_sources_to_rollback = 0;
5255 int sort_clone_roots = 0;
5256 int index;
4676 5257
4677 if (!capable(CAP_SYS_ADMIN)) 5258 if (!capable(CAP_SYS_ADMIN))
4678 return -EPERM; 5259 return -EPERM;
@@ -4681,38 +5262,26 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4681 fs_info = send_root->fs_info; 5262 fs_info = send_root->fs_info;
4682 5263
4683 /* 5264 /*
 5265 * The subvolume must remain read-only during send; protect against
5266 * making it RW.
5267 */
5268 spin_lock(&send_root->root_item_lock);
5269 send_root->send_in_progress++;
5270 spin_unlock(&send_root->root_item_lock);
5271
5272 /*
4684 * This is done when we lookup the root, it should already be complete 5273 * This is done when we lookup the root, it should already be complete
4685 * by the time we get here. 5274 * by the time we get here.
4686 */ 5275 */
4687 WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); 5276 WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
4688 5277
4689 /* 5278 /*
4690 * If we just created this root we need to make sure that the orphan 5279 * Userspace tools do the checks and warn the user if it's
4691 * cleanup has been done and committed since we search the commit root, 5280 * not RO.
4692 * so check its commit root transid with our otransid and if they match
4693 * commit the transaction to make sure everything is updated.
4694 */ 5281 */
4695 down_read(&send_root->fs_info->extent_commit_sem); 5282 if (!btrfs_root_readonly(send_root)) {
4696 if (btrfs_header_generation(send_root->commit_root) == 5283 ret = -EPERM;
4697 btrfs_root_otransid(&send_root->root_item)) { 5284 goto out;
4698 struct btrfs_trans_handle *trans;
4699
4700 up_read(&send_root->fs_info->extent_commit_sem);
4701
4702 trans = btrfs_attach_transaction_barrier(send_root);
4703 if (IS_ERR(trans)) {
4704 if (PTR_ERR(trans) != -ENOENT) {
4705 ret = PTR_ERR(trans);
4706 goto out;
4707 }
 4708 /* ENOENT means there's no transaction */
4709 } else {
4710 ret = btrfs_commit_transaction(trans, send_root);
4711 if (ret)
4712 goto out;
4713 }
4714 } else {
4715 up_read(&send_root->fs_info->extent_commit_sem);
4716 } 5285 }
4717 5286
4718 arg = memdup_user(arg_, sizeof(*arg)); 5287 arg = memdup_user(arg_, sizeof(*arg));
@@ -4753,8 +5322,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4753 goto out; 5322 goto out;
4754 } 5323 }
4755 5324
4756 sctx->mnt = mnt_file->f_path.mnt;
4757
4758 sctx->send_root = send_root; 5325 sctx->send_root = send_root;
4759 sctx->clone_roots_cnt = arg->clone_sources_count; 5326 sctx->clone_roots_cnt = arg->clone_sources_count;
4760 5327
@@ -4771,6 +5338,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4771 goto out; 5338 goto out;
4772 } 5339 }
4773 5340
5341 sctx->pending_dir_moves = RB_ROOT;
5342 sctx->waiting_dir_moves = RB_ROOT;
5343
4774 sctx->clone_roots = vzalloc(sizeof(struct clone_root) * 5344 sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
4775 (arg->clone_sources_count + 1)); 5345 (arg->clone_sources_count + 1));
4776 if (!sctx->clone_roots) { 5346 if (!sctx->clone_roots) {
@@ -4798,11 +5368,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4798 key.objectid = clone_sources_tmp[i]; 5368 key.objectid = clone_sources_tmp[i];
4799 key.type = BTRFS_ROOT_ITEM_KEY; 5369 key.type = BTRFS_ROOT_ITEM_KEY;
4800 key.offset = (u64)-1; 5370 key.offset = (u64)-1;
5371
5372 index = srcu_read_lock(&fs_info->subvol_srcu);
5373
4801 clone_root = btrfs_read_fs_root_no_name(fs_info, &key); 5374 clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
4802 if (IS_ERR(clone_root)) { 5375 if (IS_ERR(clone_root)) {
5376 srcu_read_unlock(&fs_info->subvol_srcu, index);
4803 ret = PTR_ERR(clone_root); 5377 ret = PTR_ERR(clone_root);
4804 goto out; 5378 goto out;
4805 } 5379 }
5380 clone_sources_to_rollback = i + 1;
5381 spin_lock(&clone_root->root_item_lock);
5382 clone_root->send_in_progress++;
5383 if (!btrfs_root_readonly(clone_root)) {
5384 spin_unlock(&clone_root->root_item_lock);
5385 srcu_read_unlock(&fs_info->subvol_srcu, index);
5386 ret = -EPERM;
5387 goto out;
5388 }
5389 spin_unlock(&clone_root->root_item_lock);
5390 srcu_read_unlock(&fs_info->subvol_srcu, index);
5391
4806 sctx->clone_roots[i].root = clone_root; 5392 sctx->clone_roots[i].root = clone_root;
4807 } 5393 }
4808 vfree(clone_sources_tmp); 5394 vfree(clone_sources_tmp);
@@ -4813,11 +5399,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4813 key.objectid = arg->parent_root; 5399 key.objectid = arg->parent_root;
4814 key.type = BTRFS_ROOT_ITEM_KEY; 5400 key.type = BTRFS_ROOT_ITEM_KEY;
4815 key.offset = (u64)-1; 5401 key.offset = (u64)-1;
5402
5403 index = srcu_read_lock(&fs_info->subvol_srcu);
5404
4816 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key); 5405 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
4817 if (IS_ERR(sctx->parent_root)) { 5406 if (IS_ERR(sctx->parent_root)) {
5407 srcu_read_unlock(&fs_info->subvol_srcu, index);
4818 ret = PTR_ERR(sctx->parent_root); 5408 ret = PTR_ERR(sctx->parent_root);
4819 goto out; 5409 goto out;
4820 } 5410 }
5411
5412 spin_lock(&sctx->parent_root->root_item_lock);
5413 sctx->parent_root->send_in_progress++;
5414 if (!btrfs_root_readonly(sctx->parent_root)) {
5415 spin_unlock(&sctx->parent_root->root_item_lock);
5416 srcu_read_unlock(&fs_info->subvol_srcu, index);
5417 ret = -EPERM;
5418 goto out;
5419 }
5420 spin_unlock(&sctx->parent_root->root_item_lock);
5421
5422 srcu_read_unlock(&fs_info->subvol_srcu, index);
4821 } 5423 }
4822 5424
4823 /* 5425 /*
@@ -4831,6 +5433,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4831 sort(sctx->clone_roots, sctx->clone_roots_cnt, 5433 sort(sctx->clone_roots, sctx->clone_roots_cnt,
4832 sizeof(*sctx->clone_roots), __clone_root_cmp_sort, 5434 sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
4833 NULL); 5435 NULL);
5436 sort_clone_roots = 1;
4834 5437
4835 ret = send_subvol(sctx); 5438 ret = send_subvol(sctx);
4836 if (ret < 0) 5439 if (ret < 0)
@@ -4846,6 +5449,48 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4846 } 5449 }
4847 5450
4848out: 5451out:
5452 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
5453 while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
5454 struct rb_node *n;
5455 struct pending_dir_move *pm;
5456
5457 n = rb_first(&sctx->pending_dir_moves);
5458 pm = rb_entry(n, struct pending_dir_move, node);
5459 while (!list_empty(&pm->list)) {
5460 struct pending_dir_move *pm2;
5461
5462 pm2 = list_first_entry(&pm->list,
5463 struct pending_dir_move, list);
5464 free_pending_move(sctx, pm2);
5465 }
5466 free_pending_move(sctx, pm);
5467 }
5468
5469 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
5470 while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
5471 struct rb_node *n;
5472 struct waiting_dir_move *dm;
5473
5474 n = rb_first(&sctx->waiting_dir_moves);
5475 dm = rb_entry(n, struct waiting_dir_move, node);
5476 rb_erase(&dm->node, &sctx->waiting_dir_moves);
5477 kfree(dm);
5478 }
5479
5480 if (sort_clone_roots) {
5481 for (i = 0; i < sctx->clone_roots_cnt; i++)
5482 btrfs_root_dec_send_in_progress(
5483 sctx->clone_roots[i].root);
5484 } else {
5485 for (i = 0; sctx && i < clone_sources_to_rollback; i++)
5486 btrfs_root_dec_send_in_progress(
5487 sctx->clone_roots[i].root);
5488
5489 btrfs_root_dec_send_in_progress(send_root);
5490 }
5491 if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
5492 btrfs_root_dec_send_in_progress(sctx->parent_root);
5493
4849 kfree(arg); 5494 kfree(arg);
4850 vfree(clone_sources_tmp); 5495 vfree(clone_sources_tmp);
4851 5496
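The error path above drains two rbtrees of queued directory moves before releasing the roots. The drain idiom itself recurs: always detach the leftmost node and re-read rb_first() after each erase, so the traversal stays valid while the tree shrinks. A self-contained sketch using the kernel rbtree API:

#include <linux/rbtree.h>
#include <linux/slab.h>

struct queued_item {
        struct rb_node node;
};

/* Drain a tree by repeatedly removing its leftmost node; O(n log n)
 * and safe against the modifications rb_erase() makes. */
static void drain_rbtree(struct rb_root *root)
{
        struct rb_node *n;

        while ((n = rb_first(root))) {
                struct queued_item *item =
                        rb_entry(n, struct queued_item, node);

                rb_erase(n, root);
                kfree(item);
        }
}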
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d71a11d13dfa..d04db817be5c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -48,6 +48,8 @@
48#include "transaction.h" 48#include "transaction.h"
49#include "btrfs_inode.h" 49#include "btrfs_inode.h"
50#include "print-tree.h" 50#include "print-tree.h"
51#include "hash.h"
52#include "props.h"
51#include "xattr.h" 53#include "xattr.h"
52#include "volumes.h" 54#include "volumes.h"
53#include "export.h" 55#include "export.h"
@@ -152,11 +154,12 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
152 vaf.fmt = fmt; 154 vaf.fmt = fmt;
153 vaf.va = &args; 155 vaf.va = &args;
154 156
155 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n", 157 printk(KERN_CRIT
158 "BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
156 sb->s_id, function, line, errno, errstr, &vaf); 159 sb->s_id, function, line, errno, errstr, &vaf);
157 va_end(args); 160 va_end(args);
158 } else { 161 } else {
159 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n", 162 printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
160 sb->s_id, function, line, errno, errstr); 163 sb->s_id, function, line, errno, errstr);
161 } 164 }
162 165
@@ -250,7 +253,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
250 */ 253 */
251 if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, 254 if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
252 &root->fs_info->fs_state)) { 255 &root->fs_info->fs_state)) {
253 WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n", 256 WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
254 errno); 257 errno);
255 } 258 }
256 trans->aborted = errno; 259 trans->aborted = errno;
@@ -294,8 +297,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
294 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", 297 panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
295 s_id, function, line, &vaf, errno, errstr); 298 s_id, function, line, &vaf, errno, errstr);
296 299
297 printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", 300 btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
298 s_id, function, line, &vaf, errno, errstr); 301 function, line, &vaf, errno, errstr);
299 va_end(args); 302 va_end(args);
300 /* Caller calls BUG() */ 303 /* Caller calls BUG() */
301} 304}
@@ -322,7 +325,9 @@ enum {
322 Opt_no_space_cache, Opt_recovery, Opt_skip_balance, 325 Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
323 Opt_check_integrity, Opt_check_integrity_including_extent_data, 326 Opt_check_integrity, Opt_check_integrity_including_extent_data,
324 Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, 327 Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
325 Opt_commit_interval, 328 Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
329 Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
330 Opt_datasum, Opt_treelog, Opt_noinode_cache,
326 Opt_err, 331 Opt_err,
327}; 332};
328 333
@@ -332,8 +337,11 @@ static match_table_t tokens = {
332 {Opt_subvolid, "subvolid=%s"}, 337 {Opt_subvolid, "subvolid=%s"},
333 {Opt_device, "device=%s"}, 338 {Opt_device, "device=%s"},
334 {Opt_nodatasum, "nodatasum"}, 339 {Opt_nodatasum, "nodatasum"},
340 {Opt_datasum, "datasum"},
335 {Opt_nodatacow, "nodatacow"}, 341 {Opt_nodatacow, "nodatacow"},
342 {Opt_datacow, "datacow"},
336 {Opt_nobarrier, "nobarrier"}, 343 {Opt_nobarrier, "nobarrier"},
344 {Opt_barrier, "barrier"},
337 {Opt_max_inline, "max_inline=%s"}, 345 {Opt_max_inline, "max_inline=%s"},
338 {Opt_alloc_start, "alloc_start=%s"}, 346 {Opt_alloc_start, "alloc_start=%s"},
339 {Opt_thread_pool, "thread_pool=%d"}, 347 {Opt_thread_pool, "thread_pool=%d"},
@@ -344,18 +352,25 @@ static match_table_t tokens = {
344 {Opt_ssd, "ssd"}, 352 {Opt_ssd, "ssd"},
345 {Opt_ssd_spread, "ssd_spread"}, 353 {Opt_ssd_spread, "ssd_spread"},
346 {Opt_nossd, "nossd"}, 354 {Opt_nossd, "nossd"},
355 {Opt_acl, "acl"},
347 {Opt_noacl, "noacl"}, 356 {Opt_noacl, "noacl"},
348 {Opt_notreelog, "notreelog"}, 357 {Opt_notreelog, "notreelog"},
358 {Opt_treelog, "treelog"},
349 {Opt_flushoncommit, "flushoncommit"}, 359 {Opt_flushoncommit, "flushoncommit"},
360 {Opt_noflushoncommit, "noflushoncommit"},
350 {Opt_ratio, "metadata_ratio=%d"}, 361 {Opt_ratio, "metadata_ratio=%d"},
351 {Opt_discard, "discard"}, 362 {Opt_discard, "discard"},
363 {Opt_nodiscard, "nodiscard"},
352 {Opt_space_cache, "space_cache"}, 364 {Opt_space_cache, "space_cache"},
353 {Opt_clear_cache, "clear_cache"}, 365 {Opt_clear_cache, "clear_cache"},
354 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, 366 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
355 {Opt_enospc_debug, "enospc_debug"}, 367 {Opt_enospc_debug, "enospc_debug"},
368 {Opt_noenospc_debug, "noenospc_debug"},
356 {Opt_subvolrootid, "subvolrootid=%d"}, 369 {Opt_subvolrootid, "subvolrootid=%d"},
357 {Opt_defrag, "autodefrag"}, 370 {Opt_defrag, "autodefrag"},
371 {Opt_nodefrag, "noautodefrag"},
358 {Opt_inode_cache, "inode_cache"}, 372 {Opt_inode_cache, "inode_cache"},
373 {Opt_noinode_cache, "noinode_cache"},
359 {Opt_no_space_cache, "nospace_cache"}, 374 {Opt_no_space_cache, "nospace_cache"},
360 {Opt_recovery, "recovery"}, 375 {Opt_recovery, "recovery"},
361 {Opt_skip_balance, "skip_balance"}, 376 {Opt_skip_balance, "skip_balance"},
@@ -368,6 +383,20 @@ static match_table_t tokens = {
368 {Opt_err, NULL}, 383 {Opt_err, NULL},
369}; 384};
370 385
386#define btrfs_set_and_info(root, opt, fmt, args...) \
387{ \
388 if (!btrfs_test_opt(root, opt)) \
389 btrfs_info(root->fs_info, fmt, ##args); \
390 btrfs_set_opt(root->fs_info->mount_opt, opt); \
391}
392
393#define btrfs_clear_and_info(root, opt, fmt, args...) \
394{ \
395 if (btrfs_test_opt(root, opt)) \
396 btrfs_info(root->fs_info, fmt, ##args); \
397 btrfs_clear_opt(root->fs_info->mount_opt, opt); \
398}
399
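One caveat with the two helpers above: they expand to a bare brace block, which misparses when used as the sole statement of an un-braced if/else. Wrapping the body in do { } while (0) is the usual macro hygiene fix; a sketch of that variant (same body, only the wrapper changes):

#define btrfs_set_and_info(root, opt, fmt, args...)             \
do {                                                            \
        if (!btrfs_test_opt(root, opt))                         \
                btrfs_info(root->fs_info, fmt, ##args);         \
        btrfs_set_opt(root->fs_info->mount_opt, opt);           \
} while (0)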
371/* 400/*
372 * Regular mount options parser. Everything that is needed only when 401 * Regular mount options parser. Everything that is needed only when
373 * reading in a new superblock is parsed here. 402 * reading in a new superblock is parsed here.
@@ -383,6 +412,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
383 int ret = 0; 412 int ret = 0;
384 char *compress_type; 413 char *compress_type;
385 bool compress_force = false; 414 bool compress_force = false;
415 bool compress = false;
386 416
387 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 417 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
388 if (cache_gen) 418 if (cache_gen)
@@ -409,7 +439,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
409 token = match_token(p, tokens, args); 439 token = match_token(p, tokens, args);
410 switch (token) { 440 switch (token) {
411 case Opt_degraded: 441 case Opt_degraded:
412 printk(KERN_INFO "btrfs: allowing degraded mounts\n"); 442 btrfs_info(root->fs_info, "allowing degraded mounts");
413 btrfs_set_opt(info->mount_opt, DEGRADED); 443 btrfs_set_opt(info->mount_opt, DEGRADED);
414 break; 444 break;
415 case Opt_subvol: 445 case Opt_subvol:
@@ -422,27 +452,45 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
422 */ 452 */
423 break; 453 break;
424 case Opt_nodatasum: 454 case Opt_nodatasum:
425 printk(KERN_INFO "btrfs: setting nodatasum\n"); 455 btrfs_set_and_info(root, NODATASUM,
426 btrfs_set_opt(info->mount_opt, NODATASUM); 456 "setting nodatasum");
457 break;
458 case Opt_datasum:
459 if (btrfs_test_opt(root, NODATASUM)) {
460 if (btrfs_test_opt(root, NODATACOW))
461 btrfs_info(root->fs_info, "setting datasum, datacow enabled");
462 else
463 btrfs_info(root->fs_info, "setting datasum");
464 }
465 btrfs_clear_opt(info->mount_opt, NODATACOW);
466 btrfs_clear_opt(info->mount_opt, NODATASUM);
427 break; 467 break;
428 case Opt_nodatacow: 468 case Opt_nodatacow:
429 if (!btrfs_test_opt(root, COMPRESS) || 469 if (!btrfs_test_opt(root, NODATACOW)) {
430 !btrfs_test_opt(root, FORCE_COMPRESS)) { 470 if (!btrfs_test_opt(root, COMPRESS) ||
431 printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n"); 471 !btrfs_test_opt(root, FORCE_COMPRESS)) {
432 } else { 472 btrfs_info(root->fs_info,
433 printk(KERN_INFO "btrfs: setting nodatacow\n"); 473 "setting nodatacow, compression disabled");
474 } else {
475 btrfs_info(root->fs_info, "setting nodatacow");
476 }
434 } 477 }
435 btrfs_clear_opt(info->mount_opt, COMPRESS); 478 btrfs_clear_opt(info->mount_opt, COMPRESS);
436 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); 479 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
437 btrfs_set_opt(info->mount_opt, NODATACOW); 480 btrfs_set_opt(info->mount_opt, NODATACOW);
438 btrfs_set_opt(info->mount_opt, NODATASUM); 481 btrfs_set_opt(info->mount_opt, NODATASUM);
439 break; 482 break;
483 case Opt_datacow:
484 btrfs_clear_and_info(root, NODATACOW,
485 "setting datacow");
486 break;
440 case Opt_compress_force: 487 case Opt_compress_force:
441 case Opt_compress_force_type: 488 case Opt_compress_force_type:
442 compress_force = true; 489 compress_force = true;
443 /* Fallthrough */ 490 /* Fallthrough */
444 case Opt_compress: 491 case Opt_compress:
445 case Opt_compress_type: 492 case Opt_compress_type:
493 compress = true;
446 if (token == Opt_compress || 494 if (token == Opt_compress ||
447 token == Opt_compress_force || 495 token == Opt_compress_force ||
448 strcmp(args[0].from, "zlib") == 0) { 496 strcmp(args[0].from, "zlib") == 0) {
@@ -469,34 +517,36 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
469 } 517 }
470 518
471 if (compress_force) { 519 if (compress_force) {
472 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 520 btrfs_set_and_info(root, FORCE_COMPRESS,
473 pr_info("btrfs: force %s compression\n", 521 "force %s compression",
474 compress_type); 522 compress_type);
475 } else if (btrfs_test_opt(root, COMPRESS)) { 523 } else if (compress) {
476 pr_info("btrfs: use %s compression\n", 524 if (!btrfs_test_opt(root, COMPRESS))
477 compress_type); 525 btrfs_info(root->fs_info,
526 "btrfs: use %s compression\n",
527 compress_type);
478 } 528 }
479 break; 529 break;
480 case Opt_ssd: 530 case Opt_ssd:
481 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 531 btrfs_set_and_info(root, SSD,
482 btrfs_set_opt(info->mount_opt, SSD); 532 "use ssd allocation scheme");
483 break; 533 break;
484 case Opt_ssd_spread: 534 case Opt_ssd_spread:
485 printk(KERN_INFO "btrfs: use spread ssd " 535 btrfs_set_and_info(root, SSD_SPREAD,
486 "allocation scheme\n"); 536 "use spread ssd allocation scheme");
487 btrfs_set_opt(info->mount_opt, SSD);
488 btrfs_set_opt(info->mount_opt, SSD_SPREAD);
489 break; 537 break;
490 case Opt_nossd: 538 case Opt_nossd:
491 printk(KERN_INFO "btrfs: not using ssd allocation " 539 btrfs_clear_and_info(root, NOSSD,
492 "scheme\n"); 540 "not using ssd allocation scheme");
493 btrfs_set_opt(info->mount_opt, NOSSD);
494 btrfs_clear_opt(info->mount_opt, SSD); 541 btrfs_clear_opt(info->mount_opt, SSD);
495 btrfs_clear_opt(info->mount_opt, SSD_SPREAD); 542 break;
543 case Opt_barrier:
544 btrfs_clear_and_info(root, NOBARRIER,
545 "turning on barriers");
496 break; 546 break;
497 case Opt_nobarrier: 547 case Opt_nobarrier:
498 printk(KERN_INFO "btrfs: turning off barriers\n"); 548 btrfs_set_and_info(root, NOBARRIER,
499 btrfs_set_opt(info->mount_opt, NOBARRIER); 549 "turning off barriers");
500 break; 550 break;
501 case Opt_thread_pool: 551 case Opt_thread_pool:
502 ret = match_int(&args[0], &intarg); 552 ret = match_int(&args[0], &intarg);
@@ -516,11 +566,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
516 kfree(num); 566 kfree(num);
517 567
518 if (info->max_inline) { 568 if (info->max_inline) {
519 info->max_inline = max_t(u64, 569 info->max_inline = min_t(u64,
520 info->max_inline, 570 info->max_inline,
521 root->sectorsize); 571 root->sectorsize);
522 } 572 }
523 printk(KERN_INFO "btrfs: max_inline at %llu\n", 573 btrfs_info(root->fs_info, "max_inline at %llu",
524 info->max_inline); 574 info->max_inline);
525 } else { 575 } else {
526 ret = -ENOMEM; 576 ret = -ENOMEM;
@@ -534,24 +584,34 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
534 info->alloc_start = memparse(num, NULL); 584 info->alloc_start = memparse(num, NULL);
535 mutex_unlock(&info->chunk_mutex); 585 mutex_unlock(&info->chunk_mutex);
536 kfree(num); 586 kfree(num);
537 printk(KERN_INFO 587 btrfs_info(root->fs_info, "allocations start at %llu",
538 "btrfs: allocations start at %llu\n",
539 info->alloc_start); 588 info->alloc_start);
540 } else { 589 } else {
541 ret = -ENOMEM; 590 ret = -ENOMEM;
542 goto out; 591 goto out;
543 } 592 }
544 break; 593 break;
594 case Opt_acl:
595 root->fs_info->sb->s_flags |= MS_POSIXACL;
596 break;
545 case Opt_noacl: 597 case Opt_noacl:
546 root->fs_info->sb->s_flags &= ~MS_POSIXACL; 598 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
547 break; 599 break;
548 case Opt_notreelog: 600 case Opt_notreelog:
549 printk(KERN_INFO "btrfs: disabling tree log\n"); 601 btrfs_set_and_info(root, NOTREELOG,
550 btrfs_set_opt(info->mount_opt, NOTREELOG); 602 "disabling tree log");
603 break;
604 case Opt_treelog:
605 btrfs_clear_and_info(root, NOTREELOG,
606 "enabling tree log");
551 break; 607 break;
552 case Opt_flushoncommit: 608 case Opt_flushoncommit:
553 printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); 609 btrfs_set_and_info(root, FLUSHONCOMMIT,
554 btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); 610 "turning on flush-on-commit");
611 break;
612 case Opt_noflushoncommit:
613 btrfs_clear_and_info(root, FLUSHONCOMMIT,
614 "turning off flush-on-commit");
555 break; 615 break;
556 case Opt_ratio: 616 case Opt_ratio:
557 ret = match_int(&args[0], &intarg); 617 ret = match_int(&args[0], &intarg);
@@ -559,7 +619,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
559 goto out; 619 goto out;
560 } else if (intarg >= 0) { 620 } else if (intarg >= 0) {
561 info->metadata_ratio = intarg; 621 info->metadata_ratio = intarg;
562 printk(KERN_INFO "btrfs: metadata ratio %d\n", 622 btrfs_info(root->fs_info, "metadata ratio %d",
563 info->metadata_ratio); 623 info->metadata_ratio);
564 } else { 624 } else {
565 ret = -EINVAL; 625 ret = -EINVAL;
@@ -567,25 +627,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
567 } 627 }
568 break; 628 break;
569 case Opt_discard: 629 case Opt_discard:
570 btrfs_set_opt(info->mount_opt, DISCARD); 630 btrfs_set_and_info(root, DISCARD,
631 "turning on discard");
632 break;
633 case Opt_nodiscard:
634 btrfs_clear_and_info(root, DISCARD,
635 "turning off discard");
571 break; 636 break;
572 case Opt_space_cache: 637 case Opt_space_cache:
573 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 638 btrfs_set_and_info(root, SPACE_CACHE,
639 "enabling disk space caching");
574 break; 640 break;
575 case Opt_rescan_uuid_tree: 641 case Opt_rescan_uuid_tree:
576 btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); 642 btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
577 break; 643 break;
578 case Opt_no_space_cache: 644 case Opt_no_space_cache:
579 printk(KERN_INFO "btrfs: disabling disk space caching\n"); 645 btrfs_clear_and_info(root, SPACE_CACHE,
580 btrfs_clear_opt(info->mount_opt, SPACE_CACHE); 646 "disabling disk space caching");
581 break; 647 break;
582 case Opt_inode_cache: 648 case Opt_inode_cache:
583 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 649 btrfs_set_and_info(root, CHANGE_INODE_CACHE,
584 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 650 "enabling inode map caching");
651 break;
652 case Opt_noinode_cache:
653 btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
654 "disabling inode map caching");
585 break; 655 break;
586 case Opt_clear_cache: 656 case Opt_clear_cache:
587 printk(KERN_INFO "btrfs: force clearing of disk cache\n"); 657 btrfs_set_and_info(root, CLEAR_CACHE,
588 btrfs_set_opt(info->mount_opt, CLEAR_CACHE); 658 "force clearing of disk cache");
589 break; 659 break;
590 case Opt_user_subvol_rm_allowed: 660 case Opt_user_subvol_rm_allowed:
591 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); 661 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
@@ -593,12 +663,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
593 case Opt_enospc_debug: 663 case Opt_enospc_debug:
594 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); 664 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
595 break; 665 break;
666 case Opt_noenospc_debug:
667 btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
668 break;
596 case Opt_defrag: 669 case Opt_defrag:
597 printk(KERN_INFO "btrfs: enabling auto defrag\n"); 670 btrfs_set_and_info(root, AUTO_DEFRAG,
598 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 671 "enabling auto defrag");
672 break;
673 case Opt_nodefrag:
674 btrfs_clear_and_info(root, AUTO_DEFRAG,
675 "disabling auto defrag");
599 break; 676 break;
600 case Opt_recovery: 677 case Opt_recovery:
601 printk(KERN_INFO "btrfs: enabling auto recovery\n"); 678 btrfs_info(root->fs_info, "enabling auto recovery");
602 btrfs_set_opt(info->mount_opt, RECOVERY); 679 btrfs_set_opt(info->mount_opt, RECOVERY);
603 break; 680 break;
604 case Opt_skip_balance: 681 case Opt_skip_balance:
@@ -606,14 +683,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
606 break; 683 break;
607#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 684#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
608 case Opt_check_integrity_including_extent_data: 685 case Opt_check_integrity_including_extent_data:
609 printk(KERN_INFO "btrfs: enabling check integrity" 686 btrfs_info(root->fs_info,
610 " including extent data\n"); 687 "enabling check integrity including extent data");
611 btrfs_set_opt(info->mount_opt, 688 btrfs_set_opt(info->mount_opt,
612 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); 689 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
613 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); 690 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
614 break; 691 break;
615 case Opt_check_integrity: 692 case Opt_check_integrity:
616 printk(KERN_INFO "btrfs: enabling check integrity\n"); 693 btrfs_info(root->fs_info, "enabling check integrity");
617 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); 694 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
618 break; 695 break;
619 case Opt_check_integrity_print_mask: 696 case Opt_check_integrity_print_mask:
@@ -622,8 +699,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
622 goto out; 699 goto out;
623 } else if (intarg >= 0) { 700 } else if (intarg >= 0) {
624 info->check_integrity_print_mask = intarg; 701 info->check_integrity_print_mask = intarg;
625 printk(KERN_INFO "btrfs:" 702 btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x",
626 " check_integrity_print_mask 0x%x\n",
627 info->check_integrity_print_mask); 703 info->check_integrity_print_mask);
628 } else { 704 } else {
629 ret = -EINVAL; 705 ret = -EINVAL;
@@ -634,8 +710,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
634 case Opt_check_integrity_including_extent_data: 710 case Opt_check_integrity_including_extent_data:
635 case Opt_check_integrity: 711 case Opt_check_integrity:
636 case Opt_check_integrity_print_mask: 712 case Opt_check_integrity_print_mask:
637 printk(KERN_ERR "btrfs: support for check_integrity*" 713 btrfs_err(root->fs_info,
638 " not compiled in!\n"); 714 "support for check_integrity* not compiled in!");
639 ret = -EINVAL; 715 ret = -EINVAL;
640 goto out; 716 goto out;
641#endif 717#endif
@@ -655,28 +731,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
655 intarg = 0; 731 intarg = 0;
656 ret = match_int(&args[0], &intarg); 732 ret = match_int(&args[0], &intarg);
657 if (ret < 0) { 733 if (ret < 0) {
658 printk(KERN_ERR 734 btrfs_err(root->fs_info, "invalid commit interval");
659 "btrfs: invalid commit interval\n");
660 ret = -EINVAL; 735 ret = -EINVAL;
661 goto out; 736 goto out;
662 } 737 }
663 if (intarg > 0) { 738 if (intarg > 0) {
664 if (intarg > 300) { 739 if (intarg > 300) {
665 printk(KERN_WARNING 740 btrfs_warn(root->fs_info, "excessive commit interval %d",
666 "btrfs: excessive commit interval %d\n",
667 intarg); 741 intarg);
668 } 742 }
669 info->commit_interval = intarg; 743 info->commit_interval = intarg;
670 } else { 744 } else {
671 printk(KERN_INFO 745 btrfs_info(root->fs_info, "using default commit interval %ds",
672 "btrfs: using default commit interval %ds\n",
673 BTRFS_DEFAULT_COMMIT_INTERVAL); 746 BTRFS_DEFAULT_COMMIT_INTERVAL);
674 info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; 747 info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
675 } 748 }
676 break; 749 break;
677 case Opt_err: 750 case Opt_err:
678 printk(KERN_INFO "btrfs: unrecognized mount option " 751 btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
679 "'%s'\n", p);
680 ret = -EINVAL; 752 ret = -EINVAL;
681 goto out; 753 goto out;
682 default: 754 default:
@@ -685,7 +757,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
685 } 757 }
686out: 758out:
687 if (!ret && btrfs_test_opt(root, SPACE_CACHE)) 759 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
688 printk(KERN_INFO "btrfs: disk space caching is enabled\n"); 760 btrfs_info(root->fs_info, "disk space caching is enabled");
689 kfree(orig); 761 kfree(orig);
690 return ret; 762 return ret;
691} 763}
@@ -748,7 +820,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
748 break; 820 break;
749 case Opt_subvolrootid: 821 case Opt_subvolrootid:
750 printk(KERN_WARNING 822 printk(KERN_WARNING
751 "btrfs: 'subvolrootid' mount option is deprecated and has no effect\n"); 823 "BTRFS: 'subvolrootid' mount option is deprecated and has "
824 "no effect\n");
752 break; 825 break;
753 case Opt_device: 826 case Opt_device:
754 device_name = match_strdup(&args[0]); 827 device_name = match_strdup(&args[0]);
@@ -782,6 +855,7 @@ static struct dentry *get_default_root(struct super_block *sb,
782 struct btrfs_path *path; 855 struct btrfs_path *path;
783 struct btrfs_key location; 856 struct btrfs_key location;
784 struct inode *inode; 857 struct inode *inode;
858 struct dentry *dentry;
785 u64 dir_id; 859 u64 dir_id;
786 int new = 0; 860 int new = 0;
787 861
@@ -852,7 +926,13 @@ setup_root:
852 return dget(sb->s_root); 926 return dget(sb->s_root);
853 } 927 }
854 928
855 return d_obtain_alias(inode); 929 dentry = d_obtain_alias(inode);
930 if (!IS_ERR(dentry)) {
931 spin_lock(&dentry->d_lock);
932 dentry->d_flags &= ~DCACHE_DISCONNECTED;
933 spin_unlock(&dentry->d_lock);
934 }
935 return dentry;
856} 936}
857 937
858static int btrfs_fill_super(struct super_block *sb, 938static int btrfs_fill_super(struct super_block *sb,
@@ -877,7 +957,7 @@ static int btrfs_fill_super(struct super_block *sb,
877 sb->s_flags |= MS_I_VERSION; 957 sb->s_flags |= MS_I_VERSION;
878 err = open_ctree(sb, fs_devices, (char *)data); 958 err = open_ctree(sb, fs_devices, (char *)data);
879 if (err) { 959 if (err) {
880 printk("btrfs: open_ctree failed\n"); 960 printk(KERN_ERR "BTRFS: open_ctree failed\n");
881 return err; 961 return err;
882 } 962 }
883 963
@@ -1115,7 +1195,7 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
1115 dput(root); 1195 dput(root);
1116 root = ERR_PTR(-EINVAL); 1196 root = ERR_PTR(-EINVAL);
1117 deactivate_locked_super(s); 1197 deactivate_locked_super(s);
1118 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", 1198 printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
1119 subvol_name); 1199 subvol_name);
1120 } 1200 }
1121 1201
@@ -1240,7 +1320,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1240 1320
1241 fs_info->thread_pool_size = new_pool_size; 1321 fs_info->thread_pool_size = new_pool_size;
1242 1322
1243 printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n", 1323 btrfs_info(fs_info, "resize thread pool %d -> %d",
1244 old_pool_size, new_pool_size); 1324 old_pool_size, new_pool_size);
1245 1325
1246 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); 1326 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
@@ -1346,7 +1426,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1346 } else { 1426 } else {
1347 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { 1427 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
1348 btrfs_err(fs_info, 1428 btrfs_err(fs_info,
1349 "Remounting read-write after error is not allowed\n"); 1429 "Remounting read-write after error is not allowed");
1350 ret = -EINVAL; 1430 ret = -EINVAL;
1351 goto restore; 1431 goto restore;
1352 } 1432 }
@@ -1358,8 +1438,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1358 if (fs_info->fs_devices->missing_devices > 1438 if (fs_info->fs_devices->missing_devices >
1359 fs_info->num_tolerated_disk_barrier_failures && 1439 fs_info->num_tolerated_disk_barrier_failures &&
1360 !(*flags & MS_RDONLY)) { 1440 !(*flags & MS_RDONLY)) {
1361 printk(KERN_WARNING 1441 btrfs_warn(fs_info,
1362 "Btrfs: too many missing devices, writeable remount is not allowed\n"); 1442 "too many missing devices, writeable remount is not allowed");
1363 ret = -EACCES; 1443 ret = -EACCES;
1364 goto restore; 1444 goto restore;
1365 } 1445 }
@@ -1384,16 +1464,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1384 1464
1385 ret = btrfs_resume_dev_replace_async(fs_info); 1465 ret = btrfs_resume_dev_replace_async(fs_info);
1386 if (ret) { 1466 if (ret) {
1387 pr_warn("btrfs: failed to resume dev_replace\n"); 1467 btrfs_warn(fs_info, "failed to resume dev_replace");
1388 goto restore; 1468 goto restore;
1389 } 1469 }
1390 1470
1391 if (!fs_info->uuid_root) { 1471 if (!fs_info->uuid_root) {
1392 pr_info("btrfs: creating UUID tree\n"); 1472 btrfs_info(fs_info, "creating UUID tree");
1393 ret = btrfs_create_uuid_tree(fs_info); 1473 ret = btrfs_create_uuid_tree(fs_info);
1394 if (ret) { 1474 if (ret) {
1395 pr_warn("btrfs: failed to create the uuid tree" 1475 btrfs_warn(fs_info, "failed to create the UUID tree %d", ret);
1396 "%d\n", ret);
1397 goto restore; 1476 goto restore;
1398 } 1477 }
1399 } 1478 }
@@ -1773,7 +1852,7 @@ static int btrfs_interface_init(void)
1773static void btrfs_interface_exit(void) 1852static void btrfs_interface_exit(void)
1774{ 1853{
1775 if (misc_deregister(&btrfs_misc) < 0) 1854 if (misc_deregister(&btrfs_misc) < 0)
1776 printk(KERN_INFO "btrfs: misc_deregister failed for control device\n"); 1855 printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n");
1777} 1856}
1778 1857
1779static void btrfs_print_info(void) 1858static void btrfs_print_info(void)
@@ -1818,10 +1897,16 @@ static int __init init_btrfs_fs(void)
1818{ 1897{
1819 int err; 1898 int err;
1820 1899
1821 err = btrfs_init_sysfs(); 1900 err = btrfs_hash_init();
1822 if (err) 1901 if (err)
1823 return err; 1902 return err;
1824 1903
1904 btrfs_props_init();
1905
1906 err = btrfs_init_sysfs();
1907 if (err)
1908 goto free_hash;
1909
1825 btrfs_init_compress(); 1910 btrfs_init_compress();
1826 1911
1827 err = btrfs_init_cachep(); 1912 err = btrfs_init_cachep();
@@ -1895,6 +1980,8 @@ free_cachep:
1895free_compress: 1980free_compress:
1896 btrfs_exit_compress(); 1981 btrfs_exit_compress();
1897 btrfs_exit_sysfs(); 1982 btrfs_exit_sysfs();
1983free_hash:
1984 btrfs_hash_exit();
1898 return err; 1985 return err;
1899} 1986}
1900 1987
@@ -1913,9 +2000,10 @@ static void __exit exit_btrfs_fs(void)
1913 btrfs_exit_sysfs(); 2000 btrfs_exit_sysfs();
1914 btrfs_cleanup_fs_uuids(); 2001 btrfs_cleanup_fs_uuids();
1915 btrfs_exit_compress(); 2002 btrfs_exit_compress();
2003 btrfs_hash_exit();
1916} 2004}
1917 2005
1918module_init(init_btrfs_fs) 2006late_initcall(init_btrfs_fs);
1919module_exit(exit_btrfs_fs) 2007module_exit(exit_btrfs_fs)
1920 2008
1921MODULE_LICENSE("GPL"); 2009MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 5b326cd60a4a..865f4cf9a769 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -22,24 +22,647 @@
22#include <linux/completion.h> 22#include <linux/completion.h>
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/kobject.h> 24#include <linux/kobject.h>
25#include <linux/bug.h>
26#include <linux/genhd.h>
25 27
26#include "ctree.h" 28#include "ctree.h"
27#include "disk-io.h" 29#include "disk-io.h"
28#include "transaction.h" 30#include "transaction.h"
31#include "sysfs.h"
32#include "volumes.h"
33
34static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
35
36static u64 get_features(struct btrfs_fs_info *fs_info,
37 enum btrfs_feature_set set)
38{
39 struct btrfs_super_block *disk_super = fs_info->super_copy;
40 if (set == FEAT_COMPAT)
41 return btrfs_super_compat_flags(disk_super);
42 else if (set == FEAT_COMPAT_RO)
43 return btrfs_super_compat_ro_flags(disk_super);
44 else
45 return btrfs_super_incompat_flags(disk_super);
46}
47
48static void set_features(struct btrfs_fs_info *fs_info,
49 enum btrfs_feature_set set, u64 features)
50{
51 struct btrfs_super_block *disk_super = fs_info->super_copy;
52 if (set == FEAT_COMPAT)
53 btrfs_set_super_compat_flags(disk_super, features);
54 else if (set == FEAT_COMPAT_RO)
55 btrfs_set_super_compat_ro_flags(disk_super, features);
56 else
57 btrfs_set_super_incompat_flags(disk_super, features);
58}
59
60static int can_modify_feature(struct btrfs_feature_attr *fa)
61{
62 int val = 0;
63 u64 set, clear;
64 switch (fa->feature_set) {
65 case FEAT_COMPAT:
66 set = BTRFS_FEATURE_COMPAT_SAFE_SET;
67 clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
68 break;
69 case FEAT_COMPAT_RO:
70 set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
71 clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
72 break;
73 case FEAT_INCOMPAT:
74 set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
75 clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
76 break;
77 default:
78 printk(KERN_WARNING "btrfs: sysfs: unknown feature set %d\n",
79 fa->feature_set);
80 return 0;
81 }
82
83 if (set & fa->feature_bit)
84 val |= 1;
85 if (clear & fa->feature_bit)
86 val |= 2;
87
88 return val;
89}
90
91static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
92 struct kobj_attribute *a, char *buf)
93{
94 int val = 0;
95 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
96 struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
97 if (fs_info) {
98 u64 features = get_features(fs_info, fa->feature_set);
99 if (features & fa->feature_bit)
100 val = 1;
101 } else
102 val = can_modify_feature(fa);
103
104 return snprintf(buf, PAGE_SIZE, "%d\n", val);
105}
106
107static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
108 struct kobj_attribute *a,
109 const char *buf, size_t count)
110{
111 struct btrfs_fs_info *fs_info;
112 struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
113 struct btrfs_trans_handle *trans;
114 u64 features, set, clear;
115 unsigned long val;
116 int ret;
117
118 fs_info = to_fs_info(kobj);
119 if (!fs_info)
120 return -EPERM;
121
122 ret = kstrtoul(skip_spaces(buf), 0, &val);
123 if (ret)
124 return ret;
125
126 if (fa->feature_set == FEAT_COMPAT) {
127 set = BTRFS_FEATURE_COMPAT_SAFE_SET;
128 clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
129 } else if (fa->feature_set == FEAT_COMPAT_RO) {
130 set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
131 clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
132 } else {
133 set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
134 clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
135 }
136
137 features = get_features(fs_info, fa->feature_set);
138
139 /* Nothing to do */
140 if ((val && (features & fa->feature_bit)) ||
141 (!val && !(features & fa->feature_bit)))
142 return count;
143
144 if ((val && !(set & fa->feature_bit)) ||
145 (!val && !(clear & fa->feature_bit))) {
146 btrfs_info(fs_info,
147 "%sabling feature %s on mounted fs is not supported.",
148 val ? "En" : "Dis", fa->kobj_attr.attr.name);
149 return -EPERM;
150 }
151
152 btrfs_info(fs_info, "%s %s feature flag",
153 val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
154
155 trans = btrfs_start_transaction(fs_info->fs_root, 0);
156 if (IS_ERR(trans))
157 return PTR_ERR(trans);
158
159 spin_lock(&fs_info->super_lock);
160 features = get_features(fs_info, fa->feature_set);
161 if (val)
162 features |= fa->feature_bit;
163 else
164 features &= ~fa->feature_bit;
165 set_features(fs_info, fa->feature_set, features);
166 spin_unlock(&fs_info->super_lock);
167
168 ret = btrfs_commit_transaction(trans, fs_info->fs_root);
169 if (ret)
170 return ret;
171
172 return count;
173}
174
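With the store hook above in place, a feature bit can be flipped on a mounted filesystem by writing "0" or "1" to its file under /sys/fs/btrfs/<fsid>/features/, subject to the SAFE_SET/SAFE_CLEAR masks. A userspace sketch of such a write (the fsid and the feature name are placeholders):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* placeholder path; substitute the filesystem's UUID */
        const char *path =
                "/sys/fs/btrfs/<fsid>/features/extended_iref";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "1", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}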
175static umode_t btrfs_feature_visible(struct kobject *kobj,
176 struct attribute *attr, int unused)
177{
178 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
179 umode_t mode = attr->mode;
180
181 if (fs_info) {
182 struct btrfs_feature_attr *fa;
183 u64 features;
184
185 fa = attr_to_btrfs_feature_attr(attr);
186 features = get_features(fs_info, fa->feature_set);
187
188 if (can_modify_feature(fa))
189 mode |= S_IWUSR;
190 else if (!(features & fa->feature_bit))
191 mode = 0;
192 }
193
194 return mode;
195}
196
197BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF);
198BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
199BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
200BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
201BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA);
202BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
203BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
204BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
205BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
206
207static struct attribute *btrfs_supported_feature_attrs[] = {
208 BTRFS_FEAT_ATTR_PTR(mixed_backref),
209 BTRFS_FEAT_ATTR_PTR(default_subvol),
210 BTRFS_FEAT_ATTR_PTR(mixed_groups),
211 BTRFS_FEAT_ATTR_PTR(compress_lzo),
212 BTRFS_FEAT_ATTR_PTR(big_metadata),
213 BTRFS_FEAT_ATTR_PTR(extended_iref),
214 BTRFS_FEAT_ATTR_PTR(raid56),
215 BTRFS_FEAT_ATTR_PTR(skinny_metadata),
216 BTRFS_FEAT_ATTR_PTR(no_holes),
217 NULL
218};
219
220static const struct attribute_group btrfs_feature_attr_group = {
221 .name = "features",
222 .is_visible = btrfs_feature_visible,
223 .attrs = btrfs_supported_feature_attrs,
224};
225
226static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
227{
228 u64 val;
229 if (lock)
230 spin_lock(lock);
231 val = *value_ptr;
232 if (lock)
233 spin_unlock(lock);
234 return snprintf(buf, PAGE_SIZE, "%llu\n", val);
235}
236
237static ssize_t global_rsv_size_show(struct kobject *kobj,
238 struct kobj_attribute *ka, char *buf)
239{
240 struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
241 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
242 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
243}
244BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show);
245
246static ssize_t global_rsv_reserved_show(struct kobject *kobj,
247 struct kobj_attribute *a, char *buf)
248{
249 struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
250 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
251 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
252}
253BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show);
254
255#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
256
257static ssize_t raid_bytes_show(struct kobject *kobj,
258 struct kobj_attribute *attr, char *buf);
259BTRFS_RAID_ATTR(total_bytes, raid_bytes_show);
260BTRFS_RAID_ATTR(used_bytes, raid_bytes_show);
261
262static ssize_t raid_bytes_show(struct kobject *kobj,
263 struct kobj_attribute *attr, char *buf)
264
265{
266 struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
267 struct btrfs_block_group_cache *block_group;
268 int index = kobj - sinfo->block_group_kobjs;
269 u64 val = 0;
270
271 down_read(&sinfo->groups_sem);
272 list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
273 if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes))
274 val += block_group->key.offset;
275 else
276 val += btrfs_block_group_used(&block_group->item);
277 }
278 up_read(&sinfo->groups_sem);
279 return snprintf(buf, PAGE_SIZE, "%llu\n", val);
280}
281
282static struct attribute *raid_attributes[] = {
283 BTRFS_RAID_ATTR_PTR(total_bytes),
284 BTRFS_RAID_ATTR_PTR(used_bytes),
285 NULL
286};
287
288static void release_raid_kobj(struct kobject *kobj)
289{
290 kobject_put(kobj->parent);
291}
292
293struct kobj_type btrfs_raid_ktype = {
294 .sysfs_ops = &kobj_sysfs_ops,
295 .release = release_raid_kobj,
296 .default_attrs = raid_attributes,
297};
298
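raid_bytes_show() above figures out which RAID profile it was called for with nothing but pointer arithmetic: the per-profile kobjects are an array embedded in the space_info, so subtracting the array base from the passed kobject yields the profile index. A standalone illustration with stand-in types:

#include <stdio.h>

struct fake_kobject { int unused; };

struct fake_space_info {
        struct fake_kobject block_group_kobjs[4];
};

/* The difference between an element pointer and the array base is the
 * element index, by C pointer arithmetic. */
static int raid_index(struct fake_space_info *sinfo,
                      struct fake_kobject *kobj)
{
        return (int)(kobj - sinfo->block_group_kobjs);
}

int main(void)
{
        struct fake_space_info s;

        printf("%d\n", raid_index(&s, &s.block_group_kobjs[2])); /* 2 */
        return 0;
}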
299#define SPACE_INFO_ATTR(field) \
300static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
301 struct kobj_attribute *a, \
302 char *buf) \
303{ \
304 struct btrfs_space_info *sinfo = to_space_info(kobj); \
305 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
306} \
307BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field)
308
309static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
310 struct kobj_attribute *a,
311 char *buf)
312{
313 struct btrfs_space_info *sinfo = to_space_info(kobj);
314 s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
315 return snprintf(buf, PAGE_SIZE, "%lld\n", val);
316}
317
318SPACE_INFO_ATTR(flags);
319SPACE_INFO_ATTR(total_bytes);
320SPACE_INFO_ATTR(bytes_used);
321SPACE_INFO_ATTR(bytes_pinned);
322SPACE_INFO_ATTR(bytes_reserved);
323SPACE_INFO_ATTR(bytes_may_use);
324SPACE_INFO_ATTR(disk_used);
325SPACE_INFO_ATTR(disk_total);
326BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned);
327
328static struct attribute *space_info_attrs[] = {
329 BTRFS_ATTR_PTR(flags),
330 BTRFS_ATTR_PTR(total_bytes),
331 BTRFS_ATTR_PTR(bytes_used),
332 BTRFS_ATTR_PTR(bytes_pinned),
333 BTRFS_ATTR_PTR(bytes_reserved),
334 BTRFS_ATTR_PTR(bytes_may_use),
335 BTRFS_ATTR_PTR(disk_used),
336 BTRFS_ATTR_PTR(disk_total),
337 BTRFS_ATTR_PTR(total_bytes_pinned),
338 NULL,
339};
340
341static void space_info_release(struct kobject *kobj)
342{
343 struct btrfs_space_info *sinfo = to_space_info(kobj);
344 percpu_counter_destroy(&sinfo->total_bytes_pinned);
345 kfree(sinfo);
346}
347
348struct kobj_type space_info_ktype = {
349 .sysfs_ops = &kobj_sysfs_ops,
350 .release = space_info_release,
351 .default_attrs = space_info_attrs,
352};
353
354static const struct attribute *allocation_attrs[] = {
355 BTRFS_ATTR_PTR(global_rsv_reserved),
356 BTRFS_ATTR_PTR(global_rsv_size),
357 NULL,
358};
359
360static ssize_t btrfs_label_show(struct kobject *kobj,
361 struct kobj_attribute *a, char *buf)
362{
363 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
364 return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label);
365}
366
367static ssize_t btrfs_label_store(struct kobject *kobj,
368 struct kobj_attribute *a,
369 const char *buf, size_t len)
370{
371 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
372 struct btrfs_trans_handle *trans;
373 struct btrfs_root *root = fs_info->fs_root;
374 int ret;
375
376 if (len >= BTRFS_LABEL_SIZE) {
377 pr_err("BTRFS: unable to set label with more than %d bytes\n",
378 BTRFS_LABEL_SIZE - 1);
379 return -EINVAL;
380 }
381
382 trans = btrfs_start_transaction(root, 0);
383 if (IS_ERR(trans))
384 return PTR_ERR(trans);
385
386 spin_lock(&root->fs_info->super_lock);
387 strcpy(fs_info->super_copy->label, buf);
388 spin_unlock(&root->fs_info->super_lock);
389 ret = btrfs_commit_transaction(trans, root);
390
391 if (!ret)
392 return len;
393
394 return ret;
395}
396BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store);
397
398static struct attribute *btrfs_attrs[] = {
399 BTRFS_ATTR_PTR(label),
400 NULL,
401};
402
403static void btrfs_release_super_kobj(struct kobject *kobj)
404{
405 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
406 complete(&fs_info->kobj_unregister);
407}
408
409static struct kobj_type btrfs_ktype = {
410 .sysfs_ops = &kobj_sysfs_ops,
411 .release = btrfs_release_super_kobj,
412 .default_attrs = btrfs_attrs,
413};
414
415static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
416{
417 if (kobj->ktype != &btrfs_ktype)
418 return NULL;
419 return container_of(kobj, struct btrfs_fs_info, super_kobj);
420}
421
422#define NUM_FEATURE_BITS 64
423static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
424static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];
425
426static u64 supported_feature_masks[3] = {
427 [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
428 [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
429 [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP,
430};
431
432static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
433{
434 int set;
435
436 for (set = 0; set < FEAT_MAX; set++) {
437 int i;
438 struct attribute *attrs[2];
439 struct attribute_group agroup = {
440 .name = "features",
441 .attrs = attrs,
442 };
443 u64 features = get_features(fs_info, set);
444 features &= ~supported_feature_masks[set];
445
446 if (!features)
447 continue;
448
449 attrs[1] = NULL;
450 for (i = 0; i < NUM_FEATURE_BITS; i++) {
451 struct btrfs_feature_attr *fa;
452
453 if (!(features & (1ULL << i)))
454 continue;
455
456 fa = &btrfs_feature_attrs[set][i];
457 attrs[0] = &fa->kobj_attr.attr;
458 if (add) {
459 int ret;
460 ret = sysfs_merge_group(&fs_info->super_kobj,
461 &agroup);
462 if (ret)
463 return ret;
464 } else
465 sysfs_unmerge_group(&fs_info->super_kobj,
466 &agroup);
467 }
468
469 }
470 return 0;
471}
472
473static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
474{
475 kobject_del(&fs_info->super_kobj);
476 kobject_put(&fs_info->super_kobj);
477 wait_for_completion(&fs_info->kobj_unregister);
478}
479
480void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
481{
482 if (fs_info->space_info_kobj) {
483 sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
484 kobject_del(fs_info->space_info_kobj);
485 kobject_put(fs_info->space_info_kobj);
486 }
487 kobject_del(fs_info->device_dir_kobj);
488 kobject_put(fs_info->device_dir_kobj);
489 addrm_unknown_feature_attrs(fs_info, false);
490 sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
491 __btrfs_sysfs_remove_one(fs_info);
492}
493
494const char * const btrfs_feature_set_names[3] = {
495 [FEAT_COMPAT] = "compat",
496 [FEAT_COMPAT_RO] = "compat_ro",
497 [FEAT_INCOMPAT] = "incompat",
498};
499
500char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)
501{
502 size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */
503 int len = 0;
504 int i;
505 char *str;
506
507 str = kmalloc(bufsize, GFP_KERNEL);
508 if (!str)
509 return str;
510
511 for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
512 const char *name;
513
514 if (!(flags & (1ULL << i)))
515 continue;
516
517 name = btrfs_feature_attrs[set][i].kobj_attr.attr.name;
518 len += snprintf(str + len, bufsize - len, "%s%s",
519 len ? "," : "", name);
520 }
521
522 return str;
523}
524
525static void init_feature_attrs(void)
526{
527 struct btrfs_feature_attr *fa;
528 int set, i;
529
530 BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) !=
531 ARRAY_SIZE(btrfs_feature_attrs));
532 BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) !=
533 ARRAY_SIZE(btrfs_feature_attrs[0]));
534
535 memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs));
536 memset(btrfs_unknown_feature_names, 0,
537 sizeof(btrfs_unknown_feature_names));
538
539 for (i = 0; btrfs_supported_feature_attrs[i]; i++) {
540 struct btrfs_feature_attr *sfa;
541 struct attribute *a = btrfs_supported_feature_attrs[i];
542 int bit;
543 sfa = attr_to_btrfs_feature_attr(a);
544 bit = ilog2(sfa->feature_bit);
545 fa = &btrfs_feature_attrs[sfa->feature_set][bit];
546
547 fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name;
548 }
549
550 for (set = 0; set < FEAT_MAX; set++) {
551 for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
552 char *name = btrfs_unknown_feature_names[set][i];
553 fa = &btrfs_feature_attrs[set][i];
554
555 if (fa->kobj_attr.attr.name)
556 continue;
557
558 snprintf(name, 13, "%s:%u",
559 btrfs_feature_set_names[set], i);
560
561 fa->kobj_attr.attr.name = name;
562 fa->kobj_attr.attr.mode = S_IRUGO;
563 fa->feature_set = set;
564 fa->feature_bit = 1ULL << i;
565 }
566 }
567}
568
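The fallback loop above names every unrecognized bit "<set>:<bit>", e.g. "incompat:63". The 13-byte entries in btrfs_unknown_feature_names cover the worst case: "compat_ro" (9 characters) plus ':', two digits, and the terminating NUL. A quick compile-and-run check of that bound:

#include <stdio.h>

int main(void)
{
        char name[13];
        int n = snprintf(name, sizeof(name), "%s:%u", "compat_ro", 63u);

        /* 9 + 1 + 2 = 12 characters, fitting exactly with the NUL */
        printf("%s (%d chars)\n", name, n); /* compat_ro:63 (12 chars) */
        return 0;
}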
569static int add_device_membership(struct btrfs_fs_info *fs_info)
570{
571 int error = 0;
572 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
573 struct btrfs_device *dev;
574
575 fs_info->device_dir_kobj = kobject_create_and_add("devices",
576 &fs_info->super_kobj);
577 if (!fs_info->device_dir_kobj)
578 return -ENOMEM;
579
580 list_for_each_entry(dev, &fs_devices->devices, dev_list) {
581 struct hd_struct *disk;
582 struct kobject *disk_kobj;
583
584 if (!dev->bdev)
585 continue;
586
587 disk = dev->bdev->bd_part;
588 disk_kobj = &part_to_dev(disk)->kobj;
589
590 error = sysfs_create_link(fs_info->device_dir_kobj,
591 disk_kobj, disk_kobj->name);
592 if (error)
593 break;
594 }
595
596 return error;
597}
29 598
30/* /sys/fs/btrfs/ entry */ 599/* /sys/fs/btrfs/ entry */
31static struct kset *btrfs_kset; 600static struct kset *btrfs_kset;
32 601
602int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
603{
604 int error;
605
606 init_completion(&fs_info->kobj_unregister);
607 fs_info->super_kobj.kset = btrfs_kset;
608 error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
609 "%pU", fs_info->fsid);
610 if (error)
611 return error;
612
613 error = sysfs_create_group(&fs_info->super_kobj,
614 &btrfs_feature_attr_group);
615 if (error) {
616 __btrfs_sysfs_remove_one(fs_info);
617 return error;
618 }
619
620 error = addrm_unknown_feature_attrs(fs_info, true);
621 if (error)
622 goto failure;
623
624 error = add_device_membership(fs_info);
625 if (error)
626 goto failure;
627
628 fs_info->space_info_kobj = kobject_create_and_add("allocation",
629 &fs_info->super_kobj);
630 if (!fs_info->space_info_kobj) {
631 error = -ENOMEM;
632 goto failure;
633 }
634
635 error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
636 if (error)
637 goto failure;
638
639 return 0;
640failure:
641 btrfs_sysfs_remove_one(fs_info);
642 return error;
643}
644
33int btrfs_init_sysfs(void) 645int btrfs_init_sysfs(void)
34{ 646{
647 int ret;
35 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); 648 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
36 if (!btrfs_kset) 649 if (!btrfs_kset)
37 return -ENOMEM; 650 return -ENOMEM;
651
652 init_feature_attrs();
653
654 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
655 if (ret) {
656 kset_unregister(btrfs_kset);
657 return ret;
658 }
659
38 return 0; 660 return 0;
39} 661}
40 662
41void btrfs_exit_sysfs(void) 663void btrfs_exit_sysfs(void)
42{ 664{
665 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
43 kset_unregister(btrfs_kset); 666 kset_unregister(btrfs_kset);
44} 667}
45 668
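
btrfs_init_sysfs()/btrfs_exit_sysfs() follow the common kset-plus-group pattern: create the kset, add the attribute group, and unwind in reverse order on failure. A self-contained sketch of the same pattern (all names illustrative):

	#include <linux/fs.h>
	#include <linux/kobject.h>
	#include <linux/sysfs.h>

	static struct kset *example_kset;
	static struct attribute *example_attrs[] = { NULL };
	static struct attribute_group example_group = { .attrs = example_attrs };

	static int example_init_sysfs(void)
	{
		int ret;

		example_kset = kset_create_and_add("example", NULL, fs_kobj);
		if (!example_kset)
			return -ENOMEM;

		ret = sysfs_create_group(&example_kset->kobj, &example_group);
		if (ret)
			kset_unregister(example_kset);	/* unwind on failure */
		return ret;
	}

	static void example_exit_sysfs(void)
	{
		sysfs_remove_group(&example_kset->kobj, &example_group);
		kset_unregister(example_kset);
	}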
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
new file mode 100644
index 000000000000..f3cea3710d44
--- /dev/null
+++ b/fs/btrfs/sysfs.h
@@ -0,0 +1,64 @@
1#ifndef _BTRFS_SYSFS_H_
2#define _BTRFS_SYSFS_H_
3
4enum btrfs_feature_set {
5 FEAT_COMPAT,
6 FEAT_COMPAT_RO,
7 FEAT_INCOMPAT,
8 FEAT_MAX
9};
10
11#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
12{ \
13 .attr = { .name = __stringify(_name), .mode = _mode }, \
14 .show = _show, \
15 .store = _store, \
16}
17
18#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \
19static struct kobj_attribute btrfs_attr_##_name = \
20 __INIT_KOBJ_ATTR(_name, _mode, _show, _store)
21#define BTRFS_ATTR(_name, _mode, _show) \
22 BTRFS_ATTR_RW(_name, _mode, _show, NULL)
23#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
24
25#define BTRFS_RAID_ATTR(_name, _show) \
26static struct kobj_attribute btrfs_raid_attr_##_name = \
27 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
28#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
29
30
31struct btrfs_feature_attr {
32 struct kobj_attribute kobj_attr;
33 enum btrfs_feature_set feature_set;
34 u64 feature_bit;
35};
36
37#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \
38static struct btrfs_feature_attr btrfs_attr_##_name = { \
39 .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \
40 btrfs_feature_attr_show, \
41 btrfs_feature_attr_store), \
42 .feature_set = _feature_set, \
43 .feature_bit = _prefix ##_## _feature_bit, \
44}
45#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr)
46
47#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
48 BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
49#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
50	BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
51#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
52 BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
53
54/* convert from attribute */
55#define to_btrfs_feature_attr(a) \
56 container_of(a, struct btrfs_feature_attr, kobj_attr)
57#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr)
58#define attr_to_btrfs_feature_attr(a) \
59 to_btrfs_feature_attr(attr_to_btrfs_attr(a))
60char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
61extern const char * const btrfs_feature_set_names[3];
62extern struct kobj_type space_info_ktype;
63extern struct kobj_type btrfs_raid_ktype;
64#endif /* _BTRFS_SYSFS_H_ */
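
Declaring and exposing a feature attribute with these macros might look like the sketch below; MY_FEATURE is a hypothetical feature bit, the real users live in sysfs.c:

	/* defines btrfs_attr_my_feature whose .feature_bit expands to
	 * BTRFS_FEATURE_INCOMPAT_MY_FEATURE (hypothetical) */
	BTRFS_FEAT_ATTR_INCOMPAT(my_feature, MY_FEATURE);

	static struct attribute *example_feature_attrs[] = {
		BTRFS_FEAT_ATTR_PTR(my_feature),
		NULL,
	};

	static const struct attribute_group example_feature_group = {
		.attrs = example_feature_attrs,
	};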
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index b353bc806ca0..312560a9123d 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -21,7 +21,7 @@
21 21
22#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 22#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
23 23
24#define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__) 24#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
25 25
26int btrfs_test_free_space_cache(void); 26int btrfs_test_free_space_cache(void);
27int btrfs_test_extent_buffer_operations(void); 27int btrfs_test_extent_buffer_operations(void);
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 6fc82010dc15..c8d9ddf84c69 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -101,7 +101,7 @@ static int test_extents(struct btrfs_block_group_cache *cache)
101 101
102 ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); 102 ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
103 if (ret) { 103 if (ret) {
104 test_msg("Error removing middle peice %d\n", ret); 104 test_msg("Error removing middle piece %d\n", ret);
105 return ret; 105 return ret;
106 } 106 }
107 107
@@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
266 } 266 }
267 267
268 if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { 268 if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
269 test_msg("Left over peices after removing overlapping\n"); 269 test_msg("Left over pieces after removing overlapping\n");
270 return -1; 270 return -1;
271 } 271 }
272 272
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c6a872a8a468..34cd83184c4a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -62,7 +62,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
62 WARN_ON(atomic_read(&transaction->use_count) == 0); 62 WARN_ON(atomic_read(&transaction->use_count) == 0);
63 if (atomic_dec_and_test(&transaction->use_count)) { 63 if (atomic_dec_and_test(&transaction->use_count)) {
64 BUG_ON(!list_empty(&transaction->list)); 64 BUG_ON(!list_empty(&transaction->list));
65 WARN_ON(transaction->delayed_refs.root.rb_node); 65 WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
66 while (!list_empty(&transaction->pending_chunks)) { 66 while (!list_empty(&transaction->pending_chunks)) {
67 struct extent_map *em; 67 struct extent_map *em;
68 68
@@ -183,8 +183,8 @@ loop:
183 atomic_set(&cur_trans->use_count, 2); 183 atomic_set(&cur_trans->use_count, 2);
184 cur_trans->start_time = get_seconds(); 184 cur_trans->start_time = get_seconds();
185 185
186 cur_trans->delayed_refs.root = RB_ROOT; 186 cur_trans->delayed_refs.href_root = RB_ROOT;
187 cur_trans->delayed_refs.num_entries = 0; 187 atomic_set(&cur_trans->delayed_refs.num_entries, 0);
188 cur_trans->delayed_refs.num_heads_ready = 0; 188 cur_trans->delayed_refs.num_heads_ready = 0;
189 cur_trans->delayed_refs.num_heads = 0; 189 cur_trans->delayed_refs.num_heads = 0;
190 cur_trans->delayed_refs.flushing = 0; 190 cur_trans->delayed_refs.flushing = 0;
@@ -196,17 +196,14 @@ loop:
196 */ 196 */
197 smp_mb(); 197 smp_mb();
198 if (!list_empty(&fs_info->tree_mod_seq_list)) 198 if (!list_empty(&fs_info->tree_mod_seq_list))
199 WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when " 199 WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when "
200 "creating a fresh transaction\n"); 200 "creating a fresh transaction\n");
201 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) 201 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
202 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " 202 WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when "
203 "creating a fresh transaction\n"); 203 "creating a fresh transaction\n");
204 atomic64_set(&fs_info->tree_mod_seq, 0); 204 atomic64_set(&fs_info->tree_mod_seq, 0);
205 205
206 spin_lock_init(&cur_trans->delayed_refs.lock); 206 spin_lock_init(&cur_trans->delayed_refs.lock);
207 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
208 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
209 init_waitqueue_head(&cur_trans->delayed_refs.wait);
210 207
211 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 208 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
212 INIT_LIST_HEAD(&cur_trans->ordered_operations); 209 INIT_LIST_HEAD(&cur_trans->ordered_operations);
@@ -472,6 +469,7 @@ again:
472 h->type = type; 469 h->type = type;
473 h->allocating_chunk = false; 470 h->allocating_chunk = false;
474 h->reloc_reserved = false; 471 h->reloc_reserved = false;
472 h->sync = false;
475 INIT_LIST_HEAD(&h->qgroup_ref_list); 473 INIT_LIST_HEAD(&h->qgroup_ref_list);
476 INIT_LIST_HEAD(&h->new_bgs); 474 INIT_LIST_HEAD(&h->new_bgs);
477 475
@@ -647,7 +645,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
647 struct btrfs_root *root) 645 struct btrfs_root *root)
648{ 646{
649 if (root->fs_info->global_block_rsv.space_info->full && 647 if (root->fs_info->global_block_rsv.space_info->full &&
650 btrfs_should_throttle_delayed_refs(trans, root)) 648 btrfs_check_space_for_delayed_refs(trans, root))
651 return 1; 649 return 1;
652 650
653 return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); 651 return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
@@ -711,8 +709,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
711 btrfs_create_pending_block_groups(trans, root); 709 btrfs_create_pending_block_groups(trans, root);
712 710
713 trans->delayed_ref_updates = 0; 711 trans->delayed_ref_updates = 0;
714 if (btrfs_should_throttle_delayed_refs(trans, root)) { 712 if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) {
715 cur = max_t(unsigned long, cur, 1); 713 cur = max_t(unsigned long, cur, 32);
716 trans->delayed_ref_updates = 0; 714 trans->delayed_ref_updates = 0;
717 btrfs_run_delayed_refs(trans, root, cur); 715 btrfs_run_delayed_refs(trans, root, cur);
718 } 716 }
@@ -788,12 +786,6 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
788 return __btrfs_end_transaction(trans, root, 1); 786 return __btrfs_end_transaction(trans, root, 1);
789} 787}
790 788
791int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
792 struct btrfs_root *root)
793{
794 return __btrfs_end_transaction(trans, root, 1);
795}
796
797/* 789/*
798 * when btree blocks are allocated, they have some corresponding bits set for 790 * when btree blocks are allocated, they have some corresponding bits set for
799 * them in one of two extent_io trees. This is used to make sure all of 791 * them in one of two extent_io trees. This is used to make sure all of
@@ -1105,7 +1097,7 @@ int btrfs_defrag_root(struct btrfs_root *root)
1105 break; 1097 break;
1106 1098
1107 if (btrfs_defrag_cancelled(root->fs_info)) { 1099 if (btrfs_defrag_cancelled(root->fs_info)) {
1108 printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); 1100 pr_debug("BTRFS: defrag_root cancelled\n");
1109 ret = -EAGAIN; 1101 ret = -EAGAIN;
1110 break; 1102 break;
1111 } 1103 }
@@ -1746,6 +1738,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1746 goto cleanup_transaction; 1738 goto cleanup_transaction;
1747 1739
1748 btrfs_wait_delalloc_flush(root->fs_info); 1740 btrfs_wait_delalloc_flush(root->fs_info);
1741
1742 btrfs_scrub_pause(root);
1749 /* 1743 /*
1750 * Ok now we need to make sure to block out any other joins while we 1744 * Ok now we need to make sure to block out any other joins while we
1751 * commit the transaction. We could have started a join before setting 1745 * commit the transaction. We could have started a join before setting
@@ -1810,7 +1804,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1810 1804
1811 WARN_ON(cur_trans != trans->transaction); 1805 WARN_ON(cur_trans != trans->transaction);
1812 1806
1813 btrfs_scrub_pause(root);
1814 /* btrfs_commit_tree_roots is responsible for getting the 1807 /* btrfs_commit_tree_roots is responsible for getting the
1815 * various roots consistent with each other. Every pointer 1808 * various roots consistent with each other. Every pointer
1816 * in the tree of tree roots has to point to the most up to date 1809 * in the tree of tree roots has to point to the most up to date
@@ -1833,6 +1826,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1833 goto cleanup_transaction; 1826 goto cleanup_transaction;
1834 } 1827 }
1835 1828
1829 /*
1830 * Since the transaction is done, we should set the inode map cache flag
	 1831 	 * before any other coming transaction.
1832 */
1833 if (btrfs_test_opt(root, CHANGE_INODE_CACHE))
1834 btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
1835 else
1836 btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
1837
1836 /* commit_fs_roots gets rid of all the tree log roots, it is now 1838 /* commit_fs_roots gets rid of all the tree log roots, it is now
1837 * safe to free the root of tree log roots 1839 * safe to free the root of tree log roots
1838 */ 1840 */
@@ -1975,10 +1977,23 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
1975 } 1977 }
1976 root = list_first_entry(&fs_info->dead_roots, 1978 root = list_first_entry(&fs_info->dead_roots,
1977 struct btrfs_root, root_list); 1979 struct btrfs_root, root_list);
1980 /*
	 1981 	 * Make sure the root is not involved in a send;
	 1982 	 * if the first root is busy, we return
	 1983 	 * directly rather than continuing.
1984 */
1985 spin_lock(&root->root_item_lock);
1986 if (root->send_in_progress) {
1987 spin_unlock(&fs_info->trans_lock);
1988 spin_unlock(&root->root_item_lock);
1989 return 0;
1990 }
1991 spin_unlock(&root->root_item_lock);
1992
1978 list_del_init(&root->root_list); 1993 list_del_init(&root->root_list);
1979 spin_unlock(&fs_info->trans_lock); 1994 spin_unlock(&fs_info->trans_lock);
1980 1995
1981 pr_debug("btrfs: cleaner removing %llu\n", root->objectid); 1996 pr_debug("BTRFS: cleaner removing %llu\n", root->objectid);
1982 1997
1983 btrfs_kill_all_delayed_nodes(root); 1998 btrfs_kill_all_delayed_nodes(root);
1984 1999
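
The new check samples root->send_in_progress under root_item_lock before the cleaner takes a root off dead_roots, so a snapshot still referenced by send is left alone. Condensed, the pattern is (a sketch of the hunk above, with trans_lock already held by the caller's loop):

	spin_lock(&root->root_item_lock);
	if (root->send_in_progress) {
		spin_unlock(&fs_info->trans_lock);
		spin_unlock(&root->root_item_lock);
		return 0;	/* retried on the next cleaner pass */
	}
	spin_unlock(&root->root_item_lock);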
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7657d115067d..6ac037e9f9f0 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -93,6 +93,7 @@ struct btrfs_trans_handle {
93 short adding_csums; 93 short adding_csums;
94 bool allocating_chunk; 94 bool allocating_chunk;
95 bool reloc_reserved; 95 bool reloc_reserved;
96 bool sync;
96 unsigned int type; 97 unsigned int type;
97 /* 98 /*
98 * this root is only needed to validate that the root passed to 99 * this root is only needed to validate that the root passed to
@@ -154,8 +155,6 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
154 int wait_for_unblock); 155 int wait_for_unblock);
155int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 156int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
156 struct btrfs_root *root); 157 struct btrfs_root *root);
157int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
158 struct btrfs_root *root);
159int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, 158int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
160 struct btrfs_root *root); 159 struct btrfs_root *root);
161void btrfs_throttle(struct btrfs_root *root); 160void btrfs_throttle(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9f7fc51ca334..39d83da03e03 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -570,7 +570,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
570 if (btrfs_file_extent_disk_bytenr(eb, item) == 0) 570 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
571 nbytes = 0; 571 nbytes = 0;
572 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 572 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
573 size = btrfs_file_extent_inline_len(eb, item); 573 size = btrfs_file_extent_inline_len(eb, slot, item);
574 nbytes = btrfs_file_extent_ram_bytes(eb, item); 574 nbytes = btrfs_file_extent_ram_bytes(eb, item);
575 extent_end = ALIGN(start + size, root->sectorsize); 575 extent_end = ALIGN(start + size, root->sectorsize);
576 } else { 576 } else {
@@ -1238,7 +1238,8 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,
1238 struct btrfs_root *root, u64 offset) 1238 struct btrfs_root *root, u64 offset)
1239{ 1239{
1240 int ret; 1240 int ret;
1241 ret = btrfs_find_orphan_item(root, offset); 1241 ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
1242 offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
1242 if (ret > 0) 1243 if (ret > 0)
1243 ret = btrfs_insert_orphan_item(trans, root, offset); 1244 ret = btrfs_insert_orphan_item(trans, root, offset);
1244 return ret; 1245 return ret;
@@ -3194,7 +3195,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
3194static noinline int copy_items(struct btrfs_trans_handle *trans, 3195static noinline int copy_items(struct btrfs_trans_handle *trans,
3195 struct inode *inode, 3196 struct inode *inode,
3196 struct btrfs_path *dst_path, 3197 struct btrfs_path *dst_path,
3197 struct extent_buffer *src, 3198 struct btrfs_path *src_path, u64 *last_extent,
3198 int start_slot, int nr, int inode_only) 3199 int start_slot, int nr, int inode_only)
3199{ 3200{
3200 unsigned long src_offset; 3201 unsigned long src_offset;
@@ -3202,6 +3203,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3202 struct btrfs_root *log = BTRFS_I(inode)->root->log_root; 3203 struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
3203 struct btrfs_file_extent_item *extent; 3204 struct btrfs_file_extent_item *extent;
3204 struct btrfs_inode_item *inode_item; 3205 struct btrfs_inode_item *inode_item;
3206 struct extent_buffer *src = src_path->nodes[0];
3207 struct btrfs_key first_key, last_key, key;
3205 int ret; 3208 int ret;
3206 struct btrfs_key *ins_keys; 3209 struct btrfs_key *ins_keys;
3207 u32 *ins_sizes; 3210 u32 *ins_sizes;
@@ -3209,6 +3212,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3209 int i; 3212 int i;
3210 struct list_head ordered_sums; 3213 struct list_head ordered_sums;
3211 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3214 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3215 bool has_extents = false;
3216 bool need_find_last_extent = (*last_extent == 0);
3217 bool done = false;
3212 3218
3213 INIT_LIST_HEAD(&ordered_sums); 3219 INIT_LIST_HEAD(&ordered_sums);
3214 3220
@@ -3217,6 +3223,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3217 if (!ins_data) 3223 if (!ins_data)
3218 return -ENOMEM; 3224 return -ENOMEM;
3219 3225
3226 first_key.objectid = (u64)-1;
3227
3220 ins_sizes = (u32 *)ins_data; 3228 ins_sizes = (u32 *)ins_data;
3221 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 3229 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3222 3230
@@ -3237,6 +3245,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3237 3245
3238 src_offset = btrfs_item_ptr_offset(src, start_slot + i); 3246 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3239 3247
	 3248 		if (i == nr - 1)
3249 last_key = ins_keys[i];
3250
3240 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { 3251 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3241 inode_item = btrfs_item_ptr(dst_path->nodes[0], 3252 inode_item = btrfs_item_ptr(dst_path->nodes[0],
3242 dst_path->slots[0], 3253 dst_path->slots[0],
@@ -3248,6 +3259,21 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3248 src_offset, ins_sizes[i]); 3259 src_offset, ins_sizes[i]);
3249 } 3260 }
3250 3261
3262 /*
3263 * We set need_find_last_extent here in case we know we were
3264 * processing other items and then walk into the first extent in
3265 * the inode. If we don't hit an extent then nothing changes,
3266 * we'll do the last search the next time around.
3267 */
3268 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3269 has_extents = true;
3270 if (need_find_last_extent &&
3271 first_key.objectid == (u64)-1)
3272 first_key = ins_keys[i];
3273 } else {
3274 need_find_last_extent = false;
3275 }
3276
3251 /* take a reference on file data extents so that truncates 3277 /* take a reference on file data extents so that truncates
3252 * or deletes of this inode don't have to relog the inode 3278 * or deletes of this inode don't have to relog the inode
3253 * again 3279 * again
@@ -3312,6 +3338,128 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3312 list_del(&sums->list); 3338 list_del(&sums->list);
3313 kfree(sums); 3339 kfree(sums);
3314 } 3340 }
3341
3342 if (!has_extents)
3343 return ret;
3344
3345 /*
3346 * Because we use btrfs_search_forward we could skip leaves that were
3347 * not modified and then assume *last_extent is valid when it really
3348 * isn't. So back up to the previous leaf and read the end of the last
3349 * extent before we go and fill in holes.
3350 */
3351 if (need_find_last_extent) {
3352 u64 len;
3353
3354 ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path);
3355 if (ret < 0)
3356 return ret;
3357 if (ret)
3358 goto fill_holes;
3359 if (src_path->slots[0])
3360 src_path->slots[0]--;
3361 src = src_path->nodes[0];
3362 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
3363 if (key.objectid != btrfs_ino(inode) ||
3364 key.type != BTRFS_EXTENT_DATA_KEY)
3365 goto fill_holes;
3366 extent = btrfs_item_ptr(src, src_path->slots[0],
3367 struct btrfs_file_extent_item);
3368 if (btrfs_file_extent_type(src, extent) ==
3369 BTRFS_FILE_EXTENT_INLINE) {
3370 len = btrfs_file_extent_inline_len(src,
3371 src_path->slots[0],
3372 extent);
3373 *last_extent = ALIGN(key.offset + len,
3374 log->sectorsize);
3375 } else {
3376 len = btrfs_file_extent_num_bytes(src, extent);
3377 *last_extent = key.offset + len;
3378 }
3379 }
3380fill_holes:
3381 /* So we did prev_leaf, now we need to move to the next leaf, but a few
3382 * things could have happened
3383 *
3384 * 1) A merge could have happened, so we could currently be on a leaf
3385 * that holds what we were copying in the first place.
3386 * 2) A split could have happened, and now not all of the items we want
3387 * are on the same leaf.
3388 *
3389 * So we need to adjust how we search for holes, we need to drop the
3390 * path and re-search for the first extent key we found, and then walk
3391 * forward until we hit the last one we copied.
3392 */
3393 if (need_find_last_extent) {
3394 /* btrfs_prev_leaf could return 1 without releasing the path */
3395 btrfs_release_path(src_path);
3396 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key,
3397 src_path, 0, 0);
3398 if (ret < 0)
3399 return ret;
3400 ASSERT(ret == 0);
3401 src = src_path->nodes[0];
3402 i = src_path->slots[0];
3403 } else {
3404 i = start_slot;
3405 }
3406
3407 /*
	 3408 	 * Now walk the copied range and fill in any holes, making sure
	 3409 	 * holes are punched for those areas in case they previously
	 3410 	 * held extents.
3411 */
3412 while (!done) {
3413 u64 offset, len;
3414 u64 extent_end;
3415
3416 if (i >= btrfs_header_nritems(src_path->nodes[0])) {
3417 ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path);
3418 if (ret < 0)
3419 return ret;
3420 ASSERT(ret == 0);
3421 src = src_path->nodes[0];
3422 i = 0;
3423 }
3424
3425 btrfs_item_key_to_cpu(src, &key, i);
3426 if (!btrfs_comp_cpu_keys(&key, &last_key))
3427 done = true;
3428 if (key.objectid != btrfs_ino(inode) ||
3429 key.type != BTRFS_EXTENT_DATA_KEY) {
3430 i++;
3431 continue;
3432 }
3433 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
3434 if (btrfs_file_extent_type(src, extent) ==
3435 BTRFS_FILE_EXTENT_INLINE) {
3436 len = btrfs_file_extent_inline_len(src, i, extent);
3437 extent_end = ALIGN(key.offset + len, log->sectorsize);
3438 } else {
3439 len = btrfs_file_extent_num_bytes(src, extent);
3440 extent_end = key.offset + len;
3441 }
3442 i++;
3443
3444 if (*last_extent == key.offset) {
3445 *last_extent = extent_end;
3446 continue;
3447 }
3448 offset = *last_extent;
3449 len = key.offset - *last_extent;
3450 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
3451 offset, 0, 0, len, 0, len, 0,
3452 0, 0);
3453 if (ret)
3454 break;
3455 *last_extent = offset + len;
3456 }
3457 /*
3458 * Need to let the callers know we dropped the path so they should
3459 * re-search.
3460 */
3461 if (!ret && need_find_last_extent)
3462 ret = 1;
3315 return ret; 3463 return ret;
3316} 3464}
3317 3465
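
The gap-filling walk at the end of copy_items() reduces to: track the end of the last extent seen and punch a hole for every uncovered range before it. A standalone sketch of that logic (struct ext and emit_hole() are illustrative stand-ins, not kernel API; in the kernel the hole is inserted via btrfs_insert_file_extent() with a zero disk_bytenr):

	struct ext { unsigned long long offset, len; };	/* sorted by offset */

	static void emit_hole(unsigned long long off, unsigned long long len)
	{
		/* kernel: btrfs_insert_file_extent(..., off, 0, 0, len, ...) */
	}

	static unsigned long long fill_holes(const struct ext *e, int n,
					     unsigned long long last_extent)
	{
		int i;

		for (i = 0; i < n; i++) {
			if (e[i].offset > last_extent)
				emit_hole(last_extent,
					  e[i].offset - last_extent);
			if (e[i].offset + e[i].len > last_extent)
				last_extent = e[i].offset + e[i].len;
		}
		return last_extent;	/* becomes *last_extent for the caller */
	}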
@@ -3349,21 +3497,27 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3349 int ret; 3497 int ret;
3350 int index = log->log_transid % 2; 3498 int index = log->log_transid % 2;
3351 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3499 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3352 3500 int extent_inserted = 0;
3353 ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3354 em->start + em->len, NULL, 0);
3355 if (ret)
3356 return ret;
3357 3501
3358 INIT_LIST_HEAD(&ordered_sums); 3502 INIT_LIST_HEAD(&ordered_sums);
3359 btrfs_init_map_token(&token); 3503 btrfs_init_map_token(&token);
3360 key.objectid = btrfs_ino(inode);
3361 key.type = BTRFS_EXTENT_DATA_KEY;
3362 key.offset = em->start;
3363 3504
3364 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); 3505 ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3506 em->start + em->len, NULL, 0, 1,
3507 sizeof(*fi), &extent_inserted);
3365 if (ret) 3508 if (ret)
3366 return ret; 3509 return ret;
3510
3511 if (!extent_inserted) {
3512 key.objectid = btrfs_ino(inode);
3513 key.type = BTRFS_EXTENT_DATA_KEY;
3514 key.offset = em->start;
3515
3516 ret = btrfs_insert_empty_item(trans, log, path, &key,
3517 sizeof(*fi));
3518 if (ret)
3519 return ret;
3520 }
3367 leaf = path->nodes[0]; 3521 leaf = path->nodes[0];
3368 fi = btrfs_item_ptr(leaf, path->slots[0], 3522 fi = btrfs_item_ptr(leaf, path->slots[0],
3369 struct btrfs_file_extent_item); 3523 struct btrfs_file_extent_item);
@@ -3485,7 +3639,11 @@ again:
3485 * start over after this. 3639 * start over after this.
3486 */ 3640 */
3487 3641
3488 wait_event(ordered->wait, ordered->csum_bytes_left == 0); 3642 if (ordered->csum_bytes_left) {
3643 btrfs_start_ordered_extent(inode, ordered, 0);
3644 wait_event(ordered->wait,
3645 ordered->csum_bytes_left == 0);
3646 }
3489 3647
3490 list_for_each_entry(sum, &ordered->list, list) { 3648 list_for_each_entry(sum, &ordered->list, list) {
3491 ret = btrfs_csum_file_blocks(trans, log, sum); 3649 ret = btrfs_csum_file_blocks(trans, log, sum);
@@ -3630,6 +3788,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3630 struct btrfs_key max_key; 3788 struct btrfs_key max_key;
3631 struct btrfs_root *log = root->log_root; 3789 struct btrfs_root *log = root->log_root;
3632 struct extent_buffer *src = NULL; 3790 struct extent_buffer *src = NULL;
3791 u64 last_extent = 0;
3633 int err = 0; 3792 int err = 0;
3634 int ret; 3793 int ret;
3635 int nritems; 3794 int nritems;
@@ -3745,11 +3904,15 @@ again:
3745 goto next_slot; 3904 goto next_slot;
3746 } 3905 }
3747 3906
3748 ret = copy_items(trans, inode, dst_path, src, ins_start_slot, 3907 ret = copy_items(trans, inode, dst_path, path, &last_extent,
3749 ins_nr, inode_only); 3908 ins_start_slot, ins_nr, inode_only);
3750 if (ret) { 3909 if (ret < 0) {
3751 err = ret; 3910 err = ret;
3752 goto out_unlock; 3911 goto out_unlock;
 3912 		} else if (ret) {
3913 ins_nr = 0;
3914 btrfs_release_path(path);
3915 continue;
3753 } 3916 }
3754 ins_nr = 1; 3917 ins_nr = 1;
3755 ins_start_slot = path->slots[0]; 3918 ins_start_slot = path->slots[0];
@@ -3763,13 +3926,14 @@ next_slot:
3763 goto again; 3926 goto again;
3764 } 3927 }
3765 if (ins_nr) { 3928 if (ins_nr) {
3766 ret = copy_items(trans, inode, dst_path, src, 3929 ret = copy_items(trans, inode, dst_path, path,
3767 ins_start_slot, 3930 &last_extent, ins_start_slot,
3768 ins_nr, inode_only); 3931 ins_nr, inode_only);
3769 if (ret) { 3932 if (ret < 0) {
3770 err = ret; 3933 err = ret;
3771 goto out_unlock; 3934 goto out_unlock;
3772 } 3935 }
3936 ret = 0;
3773 ins_nr = 0; 3937 ins_nr = 0;
3774 } 3938 }
3775 btrfs_release_path(path); 3939 btrfs_release_path(path);
@@ -3784,12 +3948,13 @@ next_slot:
3784 } 3948 }
3785 } 3949 }
3786 if (ins_nr) { 3950 if (ins_nr) {
3787 ret = copy_items(trans, inode, dst_path, src, ins_start_slot, 3951 ret = copy_items(trans, inode, dst_path, path, &last_extent,
3788 ins_nr, inode_only); 3952 ins_start_slot, ins_nr, inode_only);
3789 if (ret) { 3953 if (ret < 0) {
3790 err = ret; 3954 err = ret;
3791 goto out_unlock; 3955 goto out_unlock;
3792 } 3956 }
3957 ret = 0;
3793 ins_nr = 0; 3958 ins_nr = 0;
3794 } 3959 }
3795 3960
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index b0a523b2c60e..840a38b2778a 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -5,8 +5,8 @@
5 */ 5 */
6 6
7#include <linux/slab.h> 7#include <linux/slab.h>
8#include <linux/export.h>
9#include "ulist.h" 8#include "ulist.h"
9#include "ctree.h"
10 10
11/* 11/*
12 * ulist is a generic data structure to hold a collection of unique u64 12 * ulist is a generic data structure to hold a collection of unique u64
@@ -14,10 +14,6 @@
14 * enumerating it. 14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key. 15 * It is possible to store an auxiliary value along with the key.
16 * 16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 *
21 * A sample usage for ulists is the enumeration of directed graphs without 17 * A sample usage for ulists is the enumeration of directed graphs without
22 * visiting a node twice. The pseudo-code could look like this: 18 * visiting a node twice. The pseudo-code could look like this:
23 * 19 *
@@ -50,12 +46,10 @@
50 */ 46 */
51void ulist_init(struct ulist *ulist) 47void ulist_init(struct ulist *ulist)
52{ 48{
53 ulist->nnodes = 0; 49 INIT_LIST_HEAD(&ulist->nodes);
54 ulist->nodes = ulist->int_nodes;
55 ulist->nodes_alloced = ULIST_SIZE;
56 ulist->root = RB_ROOT; 50 ulist->root = RB_ROOT;
51 ulist->nnodes = 0;
57} 52}
58EXPORT_SYMBOL(ulist_init);
59 53
60/** 54/**
61 * ulist_fini - free up additionally allocated memory for the ulist 55 * ulist_fini - free up additionally allocated memory for the ulist
@@ -64,18 +58,17 @@ EXPORT_SYMBOL(ulist_init);
64 * This is useful in cases where the base 'struct ulist' has been statically 58 * This is useful in cases where the base 'struct ulist' has been statically
65 * allocated. 59 * allocated.
66 */ 60 */
67void ulist_fini(struct ulist *ulist) 61static void ulist_fini(struct ulist *ulist)
68{ 62{
69 /* 63 struct ulist_node *node;
70 * The first ULIST_SIZE elements are stored inline in struct ulist. 64 struct ulist_node *next;
71 * Only if more elements are alocated they need to be freed. 65
72 */ 66 list_for_each_entry_safe(node, next, &ulist->nodes, list) {
73 if (ulist->nodes_alloced > ULIST_SIZE) 67 kfree(node);
74 kfree(ulist->nodes); 68 }
75 ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */
76 ulist->root = RB_ROOT; 69 ulist->root = RB_ROOT;
70 INIT_LIST_HEAD(&ulist->nodes);
77} 71}
78EXPORT_SYMBOL(ulist_fini);
79 72
80/** 73/**
81 * ulist_reinit - prepare a ulist for reuse 74 * ulist_reinit - prepare a ulist for reuse
@@ -89,7 +82,6 @@ void ulist_reinit(struct ulist *ulist)
89 ulist_fini(ulist); 82 ulist_fini(ulist);
90 ulist_init(ulist); 83 ulist_init(ulist);
91} 84}
92EXPORT_SYMBOL(ulist_reinit);
93 85
94/** 86/**
95 * ulist_alloc - dynamically allocate a ulist 87 * ulist_alloc - dynamically allocate a ulist
@@ -108,7 +100,6 @@ struct ulist *ulist_alloc(gfp_t gfp_mask)
108 100
109 return ulist; 101 return ulist;
110} 102}
111EXPORT_SYMBOL(ulist_alloc);
112 103
113/** 104/**
114 * ulist_free - free dynamically allocated ulist 105 * ulist_free - free dynamically allocated ulist
@@ -123,7 +114,6 @@ void ulist_free(struct ulist *ulist)
123 ulist_fini(ulist); 114 ulist_fini(ulist);
124 kfree(ulist); 115 kfree(ulist);
125} 116}
126EXPORT_SYMBOL(ulist_free);
127 117
128static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) 118static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
129{ 119{
@@ -192,63 +182,32 @@ int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
192int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, 182int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
193 u64 *old_aux, gfp_t gfp_mask) 183 u64 *old_aux, gfp_t gfp_mask)
194{ 184{
195 int ret = 0; 185 int ret;
196 struct ulist_node *node = NULL; 186 struct ulist_node *node;
187
197 node = ulist_rbtree_search(ulist, val); 188 node = ulist_rbtree_search(ulist, val);
198 if (node) { 189 if (node) {
199 if (old_aux) 190 if (old_aux)
200 *old_aux = node->aux; 191 *old_aux = node->aux;
201 return 0; 192 return 0;
202 } 193 }
194 node = kmalloc(sizeof(*node), gfp_mask);
195 if (!node)
196 return -ENOMEM;
203 197
204 if (ulist->nnodes >= ulist->nodes_alloced) { 198 node->val = val;
205 u64 new_alloced = ulist->nodes_alloced + 128; 199 node->aux = aux;
206 struct ulist_node *new_nodes; 200#ifdef CONFIG_BTRFS_DEBUG
207 void *old = NULL; 201 node->seqnum = ulist->nnodes;
208 int i; 202#endif
209
210 for (i = 0; i < ulist->nnodes; i++)
211 rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
212
213 /*
214 * if nodes_alloced == ULIST_SIZE no memory has been allocated
215 * yet, so pass NULL to krealloc
216 */
217 if (ulist->nodes_alloced > ULIST_SIZE)
218 old = ulist->nodes;
219 203
220 new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced, 204 ret = ulist_rbtree_insert(ulist, node);
221 gfp_mask); 205 ASSERT(!ret);
222 if (!new_nodes) 206 list_add_tail(&node->list, &ulist->nodes);
223 return -ENOMEM; 207 ulist->nnodes++;
224
225 if (!old)
226 memcpy(new_nodes, ulist->int_nodes,
227 sizeof(ulist->int_nodes));
228
229 ulist->nodes = new_nodes;
230 ulist->nodes_alloced = new_alloced;
231
232 /*
233 * krealloc actually uses memcpy, which does not copy rb_node
234 * pointers, so we have to do it ourselves. Otherwise we may
235 * be bitten by crashes.
236 */
237 for (i = 0; i < ulist->nnodes; i++) {
238 ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
239 if (ret < 0)
240 return ret;
241 }
242 }
243 ulist->nodes[ulist->nnodes].val = val;
244 ulist->nodes[ulist->nnodes].aux = aux;
245 ret = ulist_rbtree_insert(ulist, &ulist->nodes[ulist->nnodes]);
246 BUG_ON(ret);
247 ++ulist->nnodes;
248 208
249 return 1; 209 return 1;
250} 210}
251EXPORT_SYMBOL(ulist_add);
252 211
253/** 212/**
254 * ulist_next - iterate ulist 213 * ulist_next - iterate ulist
@@ -268,11 +227,25 @@ EXPORT_SYMBOL(ulist_add);
268 */ 227 */
269struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) 228struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
270{ 229{
271 if (ulist->nnodes == 0) 230 struct ulist_node *node;
231
232 if (list_empty(&ulist->nodes))
272 return NULL; 233 return NULL;
273 if (uiter->i < 0 || uiter->i >= ulist->nnodes) 234 if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes)
274 return NULL; 235 return NULL;
275 236 if (uiter->cur_list) {
276 return &ulist->nodes[uiter->i++]; 237 uiter->cur_list = uiter->cur_list->next;
238 } else {
239 uiter->cur_list = ulist->nodes.next;
240#ifdef CONFIG_BTRFS_DEBUG
241 uiter->i = 0;
242#endif
243 }
244 node = list_entry(uiter->cur_list, struct ulist_node, list);
245#ifdef CONFIG_BTRFS_DEBUG
246 ASSERT(node->seqnum == uiter->i);
247 ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes);
248 uiter->i++;
249#endif
250 return node;
277} 251}
278EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index fb36731074b5..7f78cbf5cf41 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -17,18 +17,12 @@
17 * enumerating it. 17 * enumerating it.
18 * It is possible to store an auxiliary value along with the key. 18 * It is possible to store an auxiliary value along with the key.
19 * 19 *
20 * The implementation is preliminary and can probably be sped up
21 * significantly. A first step would be to store the values in an rbtree
22 * as soon as ULIST_SIZE is exceeded.
23 */ 20 */
24
25/*
26 * number of elements statically allocated inside struct ulist
27 */
28#define ULIST_SIZE 16
29
30struct ulist_iterator { 21struct ulist_iterator {
22#ifdef CONFIG_BTRFS_DEBUG
31 int i; 23 int i;
24#endif
25 struct list_head *cur_list; /* hint to start search */
32}; 26};
33 27
34/* 28/*
@@ -37,6 +31,12 @@ struct ulist_iterator {
37struct ulist_node { 31struct ulist_node {
38 u64 val; /* value to store */ 32 u64 val; /* value to store */
39 u64 aux; /* auxiliary value saved along with the val */ 33 u64 aux; /* auxiliary value saved along with the val */
34
35#ifdef CONFIG_BTRFS_DEBUG
36 int seqnum; /* sequence number this node is added */
37#endif
38
39 struct list_head list; /* used to link node */
40 struct rb_node rb_node; /* used to speed up search */ 40 struct rb_node rb_node; /* used to speed up search */
41}; 41};
42 42
@@ -46,28 +46,11 @@ struct ulist {
46 */ 46 */
47 unsigned long nnodes; 47 unsigned long nnodes;
48 48
49 /* 49 struct list_head nodes;
50 * number of nodes we already have room for
51 */
52 unsigned long nodes_alloced;
53
54 /*
55 * pointer to the array storing the elements. The first ULIST_SIZE
56 * elements are stored inline. In this case the it points to int_nodes.
57 * After exceeding ULIST_SIZE, dynamic memory is allocated.
58 */
59 struct ulist_node *nodes;
60
61 struct rb_root root; 50 struct rb_root root;
62
63 /*
64 * inline storage space for the first ULIST_SIZE entries
65 */
66 struct ulist_node int_nodes[ULIST_SIZE];
67}; 51};
68 52
69void ulist_init(struct ulist *ulist); 53void ulist_init(struct ulist *ulist);
70void ulist_fini(struct ulist *ulist);
71void ulist_reinit(struct ulist *ulist); 54void ulist_reinit(struct ulist *ulist);
72struct ulist *ulist_alloc(gfp_t gfp_mask); 55struct ulist *ulist_alloc(gfp_t gfp_mask);
73void ulist_free(struct ulist *ulist); 56void ulist_free(struct ulist *ulist);
@@ -77,6 +60,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
77struct ulist_node *ulist_next(struct ulist *ulist, 60struct ulist_node *ulist_next(struct ulist *ulist,
78 struct ulist_iterator *uiter); 61 struct ulist_iterator *uiter);
79 62
80#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0) 63#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL)
81 64
82#endif 65#endif
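
Typical use of the reworked ulist: ulist_add() rejects duplicates via the rbtree, and iteration walks the list in insertion order. A sketch (GFP flag and values illustrative):

	struct ulist *ul = ulist_alloc(GFP_NOFS);
	struct ulist_iterator uiter;
	struct ulist_node *node;

	if (!ul)
		return -ENOMEM;
	ulist_add(ul, 42, 0, GFP_NOFS);
	ulist_add(ul, 42, 0, GFP_NOFS);	/* duplicate: still one node */

	ULIST_ITER_INIT(&uiter);
	while ((node = ulist_next(ul, &uiter)))
		;	/* use node->val and node->aux here */
	ulist_free(ul);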
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index fbda90004fe9..f6a4c03ee7d8 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -69,7 +69,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
69 ret = -ENOENT; 69 ret = -ENOENT;
70 70
71 if (!IS_ALIGNED(item_size, sizeof(u64))) { 71 if (!IS_ALIGNED(item_size, sizeof(u64))) {
72 pr_warn("btrfs: uuid item with illegal size %lu!\n", 72 btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!",
73 (unsigned long)item_size); 73 (unsigned long)item_size);
74 goto out; 74 goto out;
75 } 75 }
@@ -137,7 +137,8 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
137 offset = btrfs_item_ptr_offset(eb, slot); 137 offset = btrfs_item_ptr_offset(eb, slot);
138 offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); 138 offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
139 } else if (ret < 0) { 139 } else if (ret < 0) {
140 pr_warn("btrfs: insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!\n", 140 btrfs_warn(uuid_root->fs_info, "insert uuid item failed %d "
141 "(0x%016llx, 0x%016llx) type %u!",
141 ret, (unsigned long long)key.objectid, 142 ret, (unsigned long long)key.objectid,
142 (unsigned long long)key.offset, type); 143 (unsigned long long)key.offset, type);
143 goto out; 144 goto out;
@@ -183,7 +184,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
183 184
184 ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); 185 ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1);
185 if (ret < 0) { 186 if (ret < 0) {
186 pr_warn("btrfs: error %d while searching for uuid item!\n", 187 btrfs_warn(uuid_root->fs_info, "error %d while searching for uuid item!",
187 ret); 188 ret);
188 goto out; 189 goto out;
189 } 190 }
@@ -197,7 +198,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
197 offset = btrfs_item_ptr_offset(eb, slot); 198 offset = btrfs_item_ptr_offset(eb, slot);
198 item_size = btrfs_item_size_nr(eb, slot); 199 item_size = btrfs_item_size_nr(eb, slot);
199 if (!IS_ALIGNED(item_size, sizeof(u64))) { 200 if (!IS_ALIGNED(item_size, sizeof(u64))) {
200 pr_warn("btrfs: uuid item with illegal size %lu!\n", 201 btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!",
201 (unsigned long)item_size); 202 (unsigned long)item_size);
202 ret = -ENOENT; 203 ret = -ENOENT;
203 goto out; 204 goto out;
@@ -299,7 +300,7 @@ again_search_slot:
299 offset = btrfs_item_ptr_offset(leaf, slot); 300 offset = btrfs_item_ptr_offset(leaf, slot);
300 item_size = btrfs_item_size_nr(leaf, slot); 301 item_size = btrfs_item_size_nr(leaf, slot);
301 if (!IS_ALIGNED(item_size, sizeof(u64))) { 302 if (!IS_ALIGNED(item_size, sizeof(u64))) {
302 pr_warn("btrfs: uuid item with illegal size %lu!\n", 303 btrfs_warn(fs_info, "uuid item with illegal size %lu!",
303 (unsigned long)item_size); 304 (unsigned long)item_size);
304 goto skip; 305 goto skip;
305 } 306 }
@@ -349,6 +350,6 @@ skip:
349out: 350out:
350 btrfs_free_path(path); 351 btrfs_free_path(path);
351 if (ret) 352 if (ret)
352 pr_warn("btrfs: btrfs_uuid_tree_iterate failed %d\n", ret); 353 btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret);
353 return 0; 354 return 0;
354} 355}
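
The conversions above replace bare printk()/pr_*() calls with the fs_info-aware helpers, which prefix every message with "BTRFS" and the filesystem identity. The ctree.h helpers roughly follow this shape (a sketch, not the verbatim definition):

	#define example_btrfs_warn(fs_info, fmt, args...) \
		btrfs_printk(fs_info, KERN_WARNING fmt, ##args)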
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 92303f42baaa..bab0b84d8f80 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,7 +125,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev,
125 125
126 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action); 126 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
127 if (ret) 127 if (ret)
128 pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n", 128 pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
129 action, 129 action,
130 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj), 130 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
131 &disk_to_dev(bdev->bd_disk)->kobj); 131 &disk_to_dev(bdev->bd_disk)->kobj);
@@ -200,7 +200,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
200 200
201 if (IS_ERR(*bdev)) { 201 if (IS_ERR(*bdev)) {
202 ret = PTR_ERR(*bdev); 202 ret = PTR_ERR(*bdev);
203 printk(KERN_INFO "btrfs: open %s failed\n", device_path); 203 printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
204 goto error; 204 goto error;
205 } 205 }
206 206
@@ -912,9 +912,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
912 if (disk_super->label[0]) { 912 if (disk_super->label[0]) {
913 if (disk_super->label[BTRFS_LABEL_SIZE - 1]) 913 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
914 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; 914 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
915 printk(KERN_INFO "btrfs: device label %s ", disk_super->label); 915 printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
916 } else { 916 } else {
917 printk(KERN_INFO "btrfs: device fsid %pU ", disk_super->fsid); 917 printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
918 } 918 }
919 919
920 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); 920 printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
@@ -1813,7 +1813,7 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1813 } 1813 }
1814 1814
1815 if (!*device) { 1815 if (!*device) {
1816 pr_err("btrfs: no missing device found\n"); 1816 btrfs_err(root->fs_info, "no missing device found");
1817 return -ENOENT; 1817 return -ENOENT;
1818 } 1818 }
1819 1819
@@ -3052,7 +3052,7 @@ loop:
3052error: 3052error:
3053 btrfs_free_path(path); 3053 btrfs_free_path(path);
3054 if (enospc_errors) { 3054 if (enospc_errors) {
3055 printk(KERN_INFO "btrfs: %d enospc errors during balance\n", 3055 btrfs_info(fs_info, "%d enospc errors during balance",
3056 enospc_errors); 3056 enospc_errors);
3057 if (!ret) 3057 if (!ret)
3058 ret = -ENOSPC; 3058 ret = -ENOSPC;
@@ -3138,8 +3138,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3138 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 3138 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3139 !(bctl->flags & BTRFS_BALANCE_METADATA) || 3139 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3140 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 3140 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
3141 printk(KERN_ERR "btrfs: with mixed groups data and " 3141 btrfs_err(fs_info, "with mixed groups data and "
3142 "metadata balance options must be the same\n"); 3142 "metadata balance options must be the same");
3143 ret = -EINVAL; 3143 ret = -EINVAL;
3144 goto out; 3144 goto out;
3145 } 3145 }
@@ -3165,8 +3165,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3165 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3165 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3166 (!alloc_profile_is_valid(bctl->data.target, 1) || 3166 (!alloc_profile_is_valid(bctl->data.target, 1) ||
3167 (bctl->data.target & ~allowed))) { 3167 (bctl->data.target & ~allowed))) {
3168 printk(KERN_ERR "btrfs: unable to start balance with target " 3168 btrfs_err(fs_info, "unable to start balance with target "
3169 "data profile %llu\n", 3169 "data profile %llu",
3170 bctl->data.target); 3170 bctl->data.target);
3171 ret = -EINVAL; 3171 ret = -EINVAL;
3172 goto out; 3172 goto out;
@@ -3174,8 +3174,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3174 if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3174 if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3175 (!alloc_profile_is_valid(bctl->meta.target, 1) || 3175 (!alloc_profile_is_valid(bctl->meta.target, 1) ||
3176 (bctl->meta.target & ~allowed))) { 3176 (bctl->meta.target & ~allowed))) {
3177 printk(KERN_ERR "btrfs: unable to start balance with target " 3177 btrfs_err(fs_info,
3178 "metadata profile %llu\n", 3178 "unable to start balance with target metadata profile %llu",
3179 bctl->meta.target); 3179 bctl->meta.target);
3180 ret = -EINVAL; 3180 ret = -EINVAL;
3181 goto out; 3181 goto out;
@@ -3183,8 +3183,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3183 if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3183 if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3184 (!alloc_profile_is_valid(bctl->sys.target, 1) || 3184 (!alloc_profile_is_valid(bctl->sys.target, 1) ||
3185 (bctl->sys.target & ~allowed))) { 3185 (bctl->sys.target & ~allowed))) {
3186 printk(KERN_ERR "btrfs: unable to start balance with target " 3186 btrfs_err(fs_info,
3187 "system profile %llu\n", 3187 "unable to start balance with target system profile %llu",
3188 bctl->sys.target); 3188 bctl->sys.target);
3189 ret = -EINVAL; 3189 ret = -EINVAL;
3190 goto out; 3190 goto out;
@@ -3193,7 +3193,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3193 /* allow dup'ed data chunks only in mixed mode */ 3193 /* allow dup'ed data chunks only in mixed mode */
3194 if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3194 if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3195 (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { 3195 (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
3196 printk(KERN_ERR "btrfs: dup for data is not allowed\n"); 3196 btrfs_err(fs_info, "dup for data is not allowed");
3197 ret = -EINVAL; 3197 ret = -EINVAL;
3198 goto out; 3198 goto out;
3199 } 3199 }
@@ -3213,11 +3213,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3213 (fs_info->avail_metadata_alloc_bits & allowed) && 3213 (fs_info->avail_metadata_alloc_bits & allowed) &&
3214 !(bctl->meta.target & allowed))) { 3214 !(bctl->meta.target & allowed))) {
3215 if (bctl->flags & BTRFS_BALANCE_FORCE) { 3215 if (bctl->flags & BTRFS_BALANCE_FORCE) {
3216 printk(KERN_INFO "btrfs: force reducing metadata " 3216 btrfs_info(fs_info, "force reducing metadata integrity");
3217 "integrity\n");
3218 } else { 3217 } else {
3219 printk(KERN_ERR "btrfs: balance will reduce metadata " 3218 btrfs_err(fs_info, "balance will reduce metadata "
3220 "integrity, use force if you want this\n"); 3219 "integrity, use force if you want this");
3221 ret = -EINVAL; 3220 ret = -EINVAL;
3222 goto out; 3221 goto out;
3223 } 3222 }
@@ -3303,7 +3302,7 @@ static int balance_kthread(void *data)
3303 mutex_lock(&fs_info->balance_mutex); 3302 mutex_lock(&fs_info->balance_mutex);
3304 3303
3305 if (fs_info->balance_ctl) { 3304 if (fs_info->balance_ctl) {
3306 printk(KERN_INFO "btrfs: continuing balance\n"); 3305 btrfs_info(fs_info, "continuing balance");
3307 ret = btrfs_balance(fs_info->balance_ctl, NULL); 3306 ret = btrfs_balance(fs_info->balance_ctl, NULL);
3308 } 3307 }
3309 3308
@@ -3325,7 +3324,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3325 spin_unlock(&fs_info->balance_lock); 3324 spin_unlock(&fs_info->balance_lock);
3326 3325
3327 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { 3326 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
3328 printk(KERN_INFO "btrfs: force skipping balance\n"); 3327 btrfs_info(fs_info, "force skipping balance");
3329 return 0; 3328 return 0;
3330 } 3329 }
3331 3330
@@ -3543,7 +3542,7 @@ update_tree:
3543 BTRFS_UUID_KEY_SUBVOL, 3542 BTRFS_UUID_KEY_SUBVOL,
3544 key.objectid); 3543 key.objectid);
3545 if (ret < 0) { 3544 if (ret < 0) {
3546 pr_warn("btrfs: uuid_tree_add failed %d\n", 3545 btrfs_warn(fs_info, "uuid_tree_add failed %d",
3547 ret); 3546 ret);
3548 break; 3547 break;
3549 } 3548 }
@@ -3555,7 +3554,7 @@ update_tree:
3555 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 3554 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
3556 key.objectid); 3555 key.objectid);
3557 if (ret < 0) { 3556 if (ret < 0) {
3558 pr_warn("btrfs: uuid_tree_add failed %d\n", 3557 btrfs_warn(fs_info, "uuid_tree_add failed %d",
3559 ret); 3558 ret);
3560 break; 3559 break;
3561 } 3560 }
@@ -3590,7 +3589,7 @@ out:
3590 if (trans && !IS_ERR(trans)) 3589 if (trans && !IS_ERR(trans))
3591 btrfs_end_transaction(trans, fs_info->uuid_root); 3590 btrfs_end_transaction(trans, fs_info->uuid_root);
3592 if (ret) 3591 if (ret)
3593 pr_warn("btrfs: btrfs_uuid_scan_kthread failed %d\n", ret); 3592 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
3594 else 3593 else
3595 fs_info->update_uuid_tree_gen = 1; 3594 fs_info->update_uuid_tree_gen = 1;
3596 up(&fs_info->uuid_tree_rescan_sem); 3595 up(&fs_info->uuid_tree_rescan_sem);
@@ -3654,7 +3653,7 @@ static int btrfs_uuid_rescan_kthread(void *data)
3654 */ 3653 */
3655 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); 3654 ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
3656 if (ret < 0) { 3655 if (ret < 0) {
3657 pr_warn("btrfs: iterating uuid_tree failed %d\n", ret); 3656 btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
3658 up(&fs_info->uuid_tree_rescan_sem); 3657 up(&fs_info->uuid_tree_rescan_sem);
3659 return ret; 3658 return ret;
3660 } 3659 }
@@ -3695,7 +3694,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
3695 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 3694 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
3696 if (IS_ERR(task)) { 3695 if (IS_ERR(task)) {
3697 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3696 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
3698 pr_warn("btrfs: failed to start uuid_scan task\n"); 3697 btrfs_warn(fs_info, "failed to start uuid_scan task");
3699 up(&fs_info->uuid_tree_rescan_sem); 3698 up(&fs_info->uuid_tree_rescan_sem);
3700 return PTR_ERR(task); 3699 return PTR_ERR(task);
3701 } 3700 }
@@ -3711,7 +3710,7 @@ int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
3711 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 3710 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
3712 if (IS_ERR(task)) { 3711 if (IS_ERR(task)) {
3713 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3712 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
3714 pr_warn("btrfs: failed to start uuid_rescan task\n"); 3713 btrfs_warn(fs_info, "failed to start uuid_rescan task");
3715 up(&fs_info->uuid_tree_rescan_sem); 3714 up(&fs_info->uuid_tree_rescan_sem);
3716 return PTR_ERR(task); 3715 return PTR_ERR(task);
3717 } 3716 }
@@ -4033,7 +4032,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4033 max_stripe_size = 32 * 1024 * 1024; 4032 max_stripe_size = 32 * 1024 * 1024;
4034 max_chunk_size = 2 * max_stripe_size; 4033 max_chunk_size = 2 * max_stripe_size;
4035 } else { 4034 } else {
4036 	printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 4035 		btrfs_err(info, "invalid chunk type 0x%llx requested",
4037 type); 4036 type);
4038 BUG_ON(1); 4037 BUG_ON(1);
4039 } 4038 }
@@ -4065,7 +4064,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4065 4064
4066 if (!device->writeable) { 4065 if (!device->writeable) {
4067 WARN(1, KERN_ERR 4066 WARN(1, KERN_ERR
4068 "btrfs: read-only device in alloc_list\n"); 4067 "BTRFS: read-only device in alloc_list\n");
4069 continue; 4068 continue;
4070 } 4069 }
4071 4070
@@ -5193,13 +5192,13 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5193 read_unlock(&em_tree->lock); 5192 read_unlock(&em_tree->lock);
5194 5193
5195 if (!em) { 5194 if (!em) {
5196 printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", 5195 printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n",
5197 chunk_start); 5196 chunk_start);
5198 return -EIO; 5197 return -EIO;
5199 } 5198 }
5200 5199
5201 if (em->start != chunk_start) { 5200 if (em->start != chunk_start) {
5202 printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", 5201 printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n",
5203 em->start, chunk_start); 5202 em->start, chunk_start);
5204 free_extent_map(em); 5203 free_extent_map(em);
5205 return -EIO; 5204 return -EIO;
@@ -5298,6 +5297,13 @@ static void btrfs_end_bio(struct bio *bio, int err)
5298 bio_put(bio); 5297 bio_put(bio);
5299 bio = bbio->orig_bio; 5298 bio = bbio->orig_bio;
5300 } 5299 }
5300
5301 /*
5302 * We have original bio now. So increment bi_remaining to
5303 * account for it in endio
5304 */
5305 atomic_inc(&bio->bi_remaining);
5306
5301 bio->bi_private = bbio->private; 5307 bio->bi_private = bbio->private;
5302 bio->bi_end_io = bbio->end_io; 5308 bio->bi_end_io = bbio->end_io;
5303 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5309 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
@@ -5411,7 +5417,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
5411 if (!q->merge_bvec_fn) 5417 if (!q->merge_bvec_fn)
5412 return 1; 5418 return 1;
5413 5419
5414 bvm.bi_size = bio->bi_size - prev->bv_len; 5420 bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
5415 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) 5421 if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
5416 return 0; 5422 return 0;
5417 return 1; 5423 return 1;
@@ -5426,7 +5432,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5426 bio->bi_private = bbio; 5432 bio->bi_private = bbio;
5427 btrfs_io_bio(bio)->stripe_index = dev_nr; 5433 btrfs_io_bio(bio)->stripe_index = dev_nr;
5428 bio->bi_end_io = btrfs_end_bio; 5434 bio->bi_end_io = btrfs_end_bio;
5429 bio->bi_sector = physical >> 9; 5435 bio->bi_iter.bi_sector = physical >> 9;
5430#ifdef DEBUG 5436#ifdef DEBUG
5431 { 5437 {
5432 struct rcu_string *name; 5438 struct rcu_string *name;
@@ -5464,7 +5470,7 @@ again:
5464 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) { 5470 while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
5465 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len, 5471 if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5466 bvec->bv_offset) < bvec->bv_len) { 5472 bvec->bv_offset) < bvec->bv_len) {
5467 u64 len = bio->bi_size; 5473 u64 len = bio->bi_iter.bi_size;
5468 5474
5469 atomic_inc(&bbio->stripes_pending); 5475 atomic_inc(&bbio->stripes_pending);
5470 submit_stripe_bio(root, bbio, bio, physical, dev_nr, 5476 submit_stripe_bio(root, bbio, bio, physical, dev_nr,
@@ -5486,7 +5492,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
5486 bio->bi_private = bbio->private; 5492 bio->bi_private = bbio->private;
5487 bio->bi_end_io = bbio->end_io; 5493 bio->bi_end_io = bbio->end_io;
5488 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; 5494 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5489 bio->bi_sector = logical >> 9; 5495 bio->bi_iter.bi_sector = logical >> 9;
5490 kfree(bbio); 5496 kfree(bbio);
5491 bio_endio(bio, -EIO); 5497 bio_endio(bio, -EIO);
5492 } 5498 }
@@ -5497,7 +5503,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5497{ 5503{
5498 struct btrfs_device *dev; 5504 struct btrfs_device *dev;
5499 struct bio *first_bio = bio; 5505 struct bio *first_bio = bio;
5500 u64 logical = (u64)bio->bi_sector << 9; 5506 u64 logical = (u64)bio->bi_iter.bi_sector << 9;
5501 u64 length = 0; 5507 u64 length = 0;
5502 u64 map_length; 5508 u64 map_length;
5503 u64 *raid_map = NULL; 5509 u64 *raid_map = NULL;
@@ -5506,7 +5512,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5506 int total_devs = 1; 5512 int total_devs = 1;
5507 struct btrfs_bio *bbio = NULL; 5513 struct btrfs_bio *bbio = NULL;
5508 5514
5509 length = bio->bi_size; 5515 length = bio->bi_iter.bi_size;
5510 map_length = length; 5516 map_length = length;
5511 5517
5512 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5518 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
@@ -6123,7 +6129,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6123 BUG_ON(!path); 6129 BUG_ON(!path);
6124 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 6130 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
6125 if (ret < 0) { 6131 if (ret < 0) {
6126 printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", 6132 printk_in_rcu(KERN_WARNING "BTRFS: "
6133 "error %d while searching for dev_stats item for device %s!\n",
6127 ret, rcu_str_deref(device->name)); 6134 ret, rcu_str_deref(device->name));
6128 goto out; 6135 goto out;
6129 } 6136 }
@@ -6133,7 +6140,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6133 /* need to delete old one and insert a new one */ 6140 /* need to delete old one and insert a new one */
6134 ret = btrfs_del_item(trans, dev_root, path); 6141 ret = btrfs_del_item(trans, dev_root, path);
6135 if (ret != 0) { 6142 if (ret != 0) {
6136 printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", 6143 printk_in_rcu(KERN_WARNING "BTRFS: "
6144 "delete too small dev_stats item for device %s failed %d!\n",
6137 rcu_str_deref(device->name), ret); 6145 rcu_str_deref(device->name), ret);
6138 goto out; 6146 goto out;
6139 } 6147 }
@@ -6146,7 +6154,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6146 ret = btrfs_insert_empty_item(trans, dev_root, path, 6154 ret = btrfs_insert_empty_item(trans, dev_root, path,
6147 &key, sizeof(*ptr)); 6155 &key, sizeof(*ptr));
6148 if (ret < 0) { 6156 if (ret < 0) {
6149 printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", 6157 printk_in_rcu(KERN_WARNING "BTRFS: "
6158 "insert dev_stats item for device %s failed %d!\n",
6150 rcu_str_deref(device->name), ret); 6159 rcu_str_deref(device->name), ret);
6151 goto out; 6160 goto out;
6152 } 6161 }
@@ -6199,16 +6208,14 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
6199{ 6208{
6200 if (!dev->dev_stats_valid) 6209 if (!dev->dev_stats_valid)
6201 return; 6210 return;
6202 printk_ratelimited_in_rcu(KERN_ERR 6211 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
6203 "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 6212 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
6204 rcu_str_deref(dev->name), 6213 rcu_str_deref(dev->name),
6205 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 6214 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6206 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 6215 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
6207 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 6216 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
6208 btrfs_dev_stat_read(dev, 6217 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
6209 BTRFS_DEV_STAT_CORRUPTION_ERRS), 6218 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
6210 btrfs_dev_stat_read(dev,
6211 BTRFS_DEV_STAT_GENERATION_ERRS));
6212} 6219}
6213 6220
6214static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 6221static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
@@ -6221,7 +6228,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
6221 if (i == BTRFS_DEV_STAT_VALUES_MAX) 6228 if (i == BTRFS_DEV_STAT_VALUES_MAX)
6222 return; /* all values == 0, suppress message */ 6229 return; /* all values == 0, suppress message */
6223 6230
6224 printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 6231 printk_in_rcu(KERN_INFO "BTRFS: "
6232 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
6225 rcu_str_deref(dev->name), 6233 rcu_str_deref(dev->name),
6226 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 6234 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6227 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 6235 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6242,12 +6250,10 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
6242 mutex_unlock(&fs_devices->device_list_mutex); 6250 mutex_unlock(&fs_devices->device_list_mutex);
6243 6251
6244 if (!dev) { 6252 if (!dev) {
6245 printk(KERN_WARNING 6253 btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
6246 "btrfs: get dev_stats failed, device not found\n");
6247 return -ENODEV; 6254 return -ENODEV;
6248 } else if (!dev->dev_stats_valid) { 6255 } else if (!dev->dev_stats_valid) {
6249 printk(KERN_WARNING 6256 btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
6250 "btrfs: get dev_stats failed, not yet valid\n");
6251 return -ENODEV; 6257 return -ENODEV;
6252 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 6258 } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
6253 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 6259 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
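
For reference, the dev_stats messages reworked above report five per-device error counters in one line. A compact userspace model of that reporting, with invented values and DEV_STAT_* names mirroring the kernel's BTRFS_DEV_STAT_* indices:

#include <stdio.h>

enum {
        DEV_STAT_WRITE_ERRS,
        DEV_STAT_READ_ERRS,
        DEV_STAT_FLUSH_ERRS,
        DEV_STAT_CORRUPTION_ERRS,
        DEV_STAT_GENERATION_ERRS,
        DEV_STAT_VALUES_MAX
};

struct device {
        const char *name;
        unsigned int stats[DEV_STAT_VALUES_MAX];
};

static void dev_stat_print(const struct device *dev)
{
        printf("BTRFS: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
               dev->name,
               dev->stats[DEV_STAT_WRITE_ERRS],
               dev->stats[DEV_STAT_READ_ERRS],
               dev->stats[DEV_STAT_FLUSH_ERRS],
               dev->stats[DEV_STAT_CORRUPTION_ERRS],
               dev->stats[DEV_STAT_GENERATION_ERRS]);
}

int main(void)
{
        struct device dev = { .name = "/dev/sdb", .stats = { 1, 0, 0, 2, 0 } };

        dev_stat_print(&dev);
        return 0;
}
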
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 05740b9789e4..ad8328d797ea 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -22,11 +22,13 @@
22#include <linux/rwsem.h> 22#include <linux/rwsem.h>
23#include <linux/xattr.h> 23#include <linux/xattr.h>
24#include <linux/security.h> 24#include <linux/security.h>
25#include <linux/posix_acl_xattr.h>
25#include "ctree.h" 26#include "ctree.h"
26#include "btrfs_inode.h" 27#include "btrfs_inode.h"
27#include "transaction.h" 28#include "transaction.h"
28#include "xattr.h" 29#include "xattr.h"
29#include "disk-io.h" 30#include "disk-io.h"
31#include "props.h"
30 32
31 33
32ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 34ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
@@ -313,8 +315,8 @@ err:
313 */ 315 */
314const struct xattr_handler *btrfs_xattr_handlers[] = { 316const struct xattr_handler *btrfs_xattr_handlers[] = {
315#ifdef CONFIG_BTRFS_FS_POSIX_ACL 317#ifdef CONFIG_BTRFS_FS_POSIX_ACL
316 &btrfs_xattr_acl_access_handler, 318 &posix_acl_access_xattr_handler,
317 &btrfs_xattr_acl_default_handler, 319 &posix_acl_default_xattr_handler,
318#endif 320#endif
319 NULL, 321 NULL,
320}; 322};
@@ -331,7 +333,8 @@ static bool btrfs_is_valid_xattr(const char *name)
331 XATTR_SECURITY_PREFIX_LEN) || 333 XATTR_SECURITY_PREFIX_LEN) ||
332 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || 334 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
333 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 335 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
334 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 336 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) ||
337 !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN);
335} 338}
336 339
337ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, 340ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
@@ -373,6 +376,10 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
373 if (!btrfs_is_valid_xattr(name)) 376 if (!btrfs_is_valid_xattr(name))
374 return -EOPNOTSUPP; 377 return -EOPNOTSUPP;
375 378
379 if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
380 return btrfs_set_prop(dentry->d_inode, name,
381 value, size, flags);
382
376 if (size == 0) 383 if (size == 0)
377 value = ""; /* empty EA, do not remove */ 384 value = ""; /* empty EA, do not remove */
378 385
@@ -402,6 +409,10 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
402 if (!btrfs_is_valid_xattr(name)) 409 if (!btrfs_is_valid_xattr(name))
403 return -EOPNOTSUPP; 410 return -EOPNOTSUPP;
404 411
412 if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
413 return btrfs_set_prop(dentry->d_inode, name,
414 NULL, 0, XATTR_REPLACE);
415
405 return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, 416 return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
406 XATTR_REPLACE); 417 XATTR_REPLACE);
407} 418}
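
The xattr hunks wire the new "btrfs." name space into set and remove: names carrying XATTR_BTRFS_PREFIX are diverted to btrfs_set_prop() instead of being stored as ordinary extended attributes, and removal is expressed as a set with a NULL value. A small sketch of that dispatch with stub handlers (the real property layer lives in the new props.c):

#include <stdio.h>
#include <string.h>

#define XATTR_BTRFS_PREFIX     "btrfs."
#define XATTR_BTRFS_PREFIX_LEN (sizeof(XATTR_BTRFS_PREFIX) - 1)

static int set_prop(const char *name, const char *value)
{
        printf("property: %s = %s\n", name, value ? value : "(removed)");
        return 0;
}

static int set_plain_xattr(const char *name, const char *value)
{
        printf("xattr: %s = %s\n", name, value ? value : "(removed)");
        return 0;
}

static int setxattr_dispatch(const char *name, const char *value)
{
        if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
                return set_prop(name, value);   /* "btrfs." names */
        return set_plain_xattr(name, value);    /* everything else */
}

int main(void)
{
        setxattr_dispatch("btrfs.compression", "lzo"); /* prop layer */
        setxattr_dispatch("user.note", "hello");       /* normal xattr */
        setxattr_dispatch("btrfs.compression", NULL);  /* remove = set NULL */
        return 0;
}
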
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index b3cc8039134b..5049608d1388 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,8 +21,6 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern const struct xattr_handler *btrfs_xattr_handlers[]; 24extern const struct xattr_handler *btrfs_xattr_handlers[];
27 25
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 26extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 9acb846c3e7f..8e57191950cb 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws,
97 *total_in = 0; 97 *total_in = 0;
98 98
99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
100 printk(KERN_WARNING "btrfs: deflateInit failed\n"); 100 printk(KERN_WARNING "BTRFS: deflateInit failed\n");
101 ret = -1; 101 ret = -1;
102 goto out; 102 goto out;
103 } 103 }
@@ -125,7 +125,7 @@ static int zlib_compress_pages(struct list_head *ws,
125 while (workspace->def_strm.total_in < len) { 125 while (workspace->def_strm.total_in < len) {
126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
127 if (ret != Z_OK) { 127 if (ret != Z_OK) {
128 printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n", 128 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
129 ret); 129 ret);
130 zlib_deflateEnd(&workspace->def_strm); 130 zlib_deflateEnd(&workspace->def_strm);
131 ret = -1; 131 ret = -1;
@@ -252,7 +252,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
252 } 252 }
253 253
254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
255 printk(KERN_WARNING "btrfs: inflateInit failed\n"); 255 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
256 return -1; 256 return -1;
257 } 257 }
258 while (workspace->inf_strm.total_in < srclen) { 258 while (workspace->inf_strm.total_in < srclen) {
@@ -336,7 +336,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
336 } 336 }
337 337
338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
339 printk(KERN_WARNING "btrfs: inflateInit failed\n"); 339 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
340 return -1; 340 return -1;
341 } 341 }
342 342
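
The zlib.c changes only retag the message prefixes, but for completeness, here is the equivalent error path written against ordinary userspace zlib (build with -lz). The kernel uses its own zlib_deflateInit wrapper with a preallocated workspace, which this sketch does not model:

#include <stdio.h>
#include <zlib.h>

int main(void)
{
        z_stream strm = { 0 };  /* zalloc/zfree left NULL: zlib defaults */

        if (deflateInit(&strm, 3) != Z_OK) {
                fprintf(stderr, "BTRFS: deflateInit failed\n");
                return 1;
        }
        printf("deflate stream ready at level 3\n");
        deflateEnd(&strm);
        return 0;
}
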
diff --git a/fs/buffer.c b/fs/buffer.c
index 6024877335ca..27265a8b43c1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -654,14 +654,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
654static void __set_page_dirty(struct page *page, 654static void __set_page_dirty(struct page *page,
655 struct address_space *mapping, int warn) 655 struct address_space *mapping, int warn)
656{ 656{
657 spin_lock_irq(&mapping->tree_lock); 657 unsigned long flags;
658
659 spin_lock_irqsave(&mapping->tree_lock, flags);
658 if (page->mapping) { /* Race with truncate? */ 660 if (page->mapping) { /* Race with truncate? */
659 WARN_ON_ONCE(warn && !PageUptodate(page)); 661 WARN_ON_ONCE(warn && !PageUptodate(page));
660 account_page_dirtied(page, mapping); 662 account_page_dirtied(page, mapping);
661 radix_tree_tag_set(&mapping->page_tree, 663 radix_tree_tag_set(&mapping->page_tree,
662 page_index(page), PAGECACHE_TAG_DIRTY); 664 page_index(page), PAGECACHE_TAG_DIRTY);
663 } 665 }
664 spin_unlock_irq(&mapping->tree_lock); 666 spin_unlock_irqrestore(&mapping->tree_lock, flags);
665 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 667 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
666} 668}
667 669
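
The __set_page_dirty() fix swaps spin_lock_irq() for spin_lock_irqsave() because the plain _irq variant unconditionally re-enables interrupts on unlock, which corrupts state when the caller entered with them already disabled. A toy simulation of the difference (an int stands in for the IRQ state; this is not the kernel locking API):

#include <stdio.h>

static int irqs_enabled;

static void lock_irq(void)               { irqs_enabled = 0; }
static void unlock_irq(void)             { irqs_enabled = 1; } /* always! */
static void lock_irqsave(int *flags)     { *flags = irqs_enabled;
                                           irqs_enabled = 0; }
static void unlock_irqrestore(int flags) { irqs_enabled = flags; }

int main(void)
{
        int flags;

        irqs_enabled = 0;       /* pretend caller disabled interrupts */
        lock_irq();
        unlock_irq();
        printf("irq variant:     irqs_enabled=%d (wrongly re-enabled)\n",
               irqs_enabled);

        irqs_enabled = 0;       /* caller holds them disabled again */
        lock_irqsave(&flags);
        unlock_irqrestore(flags);
        printf("irqsave variant: irqs_enabled=%d (caller state preserved)\n",
               irqs_enabled);
        return 0;
}
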
@@ -1312,7 +1314,7 @@ static void bh_lru_install(struct buffer_head *bh)
1312 } 1314 }
1313 while (out < BH_LRU_SIZE) 1315 while (out < BH_LRU_SIZE)
1314 bhs[out++] = NULL; 1316 bhs[out++] = NULL;
1315 memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs)); 1317 memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1316 } 1318 }
1317 bh_lru_unlock(); 1319 bh_lru_unlock();
1318 1320
@@ -2982,11 +2984,11 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
2982 * let it through, and the IO layer will turn it into 2984 * let it through, and the IO layer will turn it into
2983 * an EIO. 2985 * an EIO.
2984 */ 2986 */
2985 if (unlikely(bio->bi_sector >= maxsector)) 2987 if (unlikely(bio->bi_iter.bi_sector >= maxsector))
2986 return; 2988 return;
2987 2989
2988 maxsector -= bio->bi_sector; 2990 maxsector -= bio->bi_iter.bi_sector;
2989 bytes = bio->bi_size; 2991 bytes = bio->bi_iter.bi_size;
2990 if (likely((bytes >> 9) <= maxsector)) 2992 if (likely((bytes >> 9) <= maxsector))
2991 return; 2993 return;
2992 2994
@@ -2994,7 +2996,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
2994 bytes = maxsector << 9; 2996 bytes = maxsector << 9;
2995 2997
2996 /* Truncate the bio.. */ 2998 /* Truncate the bio.. */
2997 bio->bi_size = bytes; 2999 bio->bi_iter.bi_size = bytes;
2998 bio->bi_io_vec[0].bv_len = bytes; 3000 bio->bi_io_vec[0].bv_len = bytes;
2999 3001
3000 /* ..and clear the end of the buffer for reads */ 3002 /* ..and clear the end of the buffer for reads */
@@ -3029,14 +3031,14 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
3029 */ 3031 */
3030 bio = bio_alloc(GFP_NOIO, 1); 3032 bio = bio_alloc(GFP_NOIO, 1);
3031 3033
3032 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 3034 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3033 bio->bi_bdev = bh->b_bdev; 3035 bio->bi_bdev = bh->b_bdev;
3034 bio->bi_io_vec[0].bv_page = bh->b_page; 3036 bio->bi_io_vec[0].bv_page = bh->b_page;
3035 bio->bi_io_vec[0].bv_len = bh->b_size; 3037 bio->bi_io_vec[0].bv_len = bh->b_size;
3036 bio->bi_io_vec[0].bv_offset = bh_offset(bh); 3038 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
3037 3039
3038 bio->bi_vcnt = 1; 3040 bio->bi_vcnt = 1;
3039 bio->bi_size = bh->b_size; 3041 bio->bi_iter.bi_size = bh->b_size;
3040 3042
3041 bio->bi_end_io = end_bio_bh_io_sync; 3043 bio->bi_end_io = end_bio_bh_io_sync;
3042 bio->bi_private = bh; 3044 bio->bi_private = bh;
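
Aside from the bi_iter renames, the _submit_bh() hunk just maps a buffer head onto a one-segment bio; the only arithmetic is scaling the block number into 512-byte sectors. A trivial check of that scaling with made-up values:

#include <stdio.h>

int main(void)
{
        unsigned long long b_blocknr = 1000; /* filesystem block number */
        unsigned int b_size = 4096;          /* block size in bytes */

        /* bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); */
        unsigned long long sector = b_blocknr * (b_size >> 9);

        printf("block %llu (%u bytes) starts at sector %llu\n",
               b_blocknr, b_size, sector);
        return 0;
}
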
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index ac9a2ef5bb9b..264e9bf83ff3 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -25,3 +25,16 @@ config CEPH_FSCACHE
25 caching support for Ceph clients using FS-Cache 25 caching support for Ceph clients using FS-Cache
26 26
27endif 27endif
28
29config CEPH_FS_POSIX_ACL
30 bool "Ceph POSIX Access Control Lists"
31 depends on CEPH_FS
32 select FS_POSIX_ACL
33 help
34 POSIX Access Control Lists (ACLs) support permissions for users and
35 groups beyond the owner/group/world scheme.
36
37 To learn more about Access Control Lists, visit the POSIX ACLs for
38 Linux website <http://acl.bestbits.at/>.
39
40 If you don't know what Access Control Lists are, say N
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 32e30106a2f0..85a4230b9bff 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -10,3 +10,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 debugfs.o 10 debugfs.o
11 11
12ceph-$(CONFIG_CEPH_FSCACHE) += cache.o 12ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
13ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
new file mode 100644
index 000000000000..21887d63dad5
--- /dev/null
+++ b/fs/ceph/acl.c
@@ -0,0 +1,200 @@
1/*
2 * linux/fs/ceph/acl.c
3 *
4 * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License v2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 021110-1307, USA.
19 */
20
21#include <linux/ceph/ceph_debug.h>
22#include <linux/fs.h>
23#include <linux/string.h>
24#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h>
26#include <linux/posix_acl.h>
27#include <linux/sched.h>
28#include <linux/slab.h>
29
30#include "super.h"
31
32static inline void ceph_set_cached_acl(struct inode *inode,
33 int type, struct posix_acl *acl)
34{
35 struct ceph_inode_info *ci = ceph_inode(inode);
36
37 spin_lock(&ci->i_ceph_lock);
38 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
39 set_cached_acl(inode, type, acl);
40 spin_unlock(&ci->i_ceph_lock);
41}
42
43static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
44 int type)
45{
46 struct ceph_inode_info *ci = ceph_inode(inode);
47 struct posix_acl *acl = ACL_NOT_CACHED;
48
49 spin_lock(&ci->i_ceph_lock);
50 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
51 acl = get_cached_acl(inode, type);
52 spin_unlock(&ci->i_ceph_lock);
53
54 return acl;
55}
56
57struct posix_acl *ceph_get_acl(struct inode *inode, int type)
58{
59 int size;
60 const char *name;
61 char *value = NULL;
62 struct posix_acl *acl;
63
64 switch (type) {
65 case ACL_TYPE_ACCESS:
66 name = POSIX_ACL_XATTR_ACCESS;
67 break;
68 case ACL_TYPE_DEFAULT:
69 name = POSIX_ACL_XATTR_DEFAULT;
70 break;
71 default:
72 BUG();
73 }
74
75 size = __ceph_getxattr(inode, name, "", 0);
76 if (size > 0) {
77 value = kzalloc(size, GFP_NOFS);
78 if (!value)
79 return ERR_PTR(-ENOMEM);
80 size = __ceph_getxattr(inode, name, value, size);
81 }
82
83 if (size > 0)
84 acl = posix_acl_from_xattr(&init_user_ns, value, size);
85 else if (size == -ERANGE || size == -ENODATA || size == 0)
86 acl = NULL;
87 else
88 acl = ERR_PTR(-EIO);
89
90 kfree(value);
91
92 if (!IS_ERR(acl))
93 ceph_set_cached_acl(inode, type, acl);
94
95 return acl;
96}
97
98int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
99{
100 int ret = 0, size = 0;
101 const char *name = NULL;
102 char *value = NULL;
103 struct iattr newattrs;
104 umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
105 struct dentry *dentry;
106
107 if (acl) {
108 ret = posix_acl_valid(acl);
109 if (ret < 0)
110 goto out;
111 }
112
113 switch (type) {
114 case ACL_TYPE_ACCESS:
115 name = POSIX_ACL_XATTR_ACCESS;
116 if (acl) {
117 ret = posix_acl_equiv_mode(acl, &new_mode);
118 if (ret < 0)
119 goto out;
120 if (ret == 0)
121 acl = NULL;
122 }
123 break;
124 case ACL_TYPE_DEFAULT:
125 if (!S_ISDIR(inode->i_mode)) {
126 ret = acl ? -EINVAL : 0;
127 goto out;
128 }
129 name = POSIX_ACL_XATTR_DEFAULT;
130 break;
131 default:
132 ret = -EINVAL;
133 goto out;
134 }
135
136 if (acl) {
137 size = posix_acl_xattr_size(acl->a_count);
138 value = kmalloc(size, GFP_NOFS);
139 if (!value) {
140 ret = -ENOMEM;
141 goto out;
142 }
143
144 ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
145 if (ret < 0)
146 goto out_free;
147 }
148
149 dentry = d_find_alias(inode);
150 if (new_mode != old_mode) {
151 newattrs.ia_mode = new_mode;
152 newattrs.ia_valid = ATTR_MODE;
153 ret = ceph_setattr(dentry, &newattrs);
154 if (ret)
155 goto out_dput;
156 }
157
158 ret = __ceph_setxattr(dentry, name, value, size, 0);
159 if (ret) {
160 if (new_mode != old_mode) {
161 newattrs.ia_mode = old_mode;
162 newattrs.ia_valid = ATTR_MODE;
163 ceph_setattr(dentry, &newattrs);
164 }
165 goto out_dput;
166 }
167
168 ceph_set_cached_acl(inode, type, acl);
169
170out_dput:
171 dput(dentry);
172out_free:
173 kfree(value);
174out:
175 return ret;
176}
177
178int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir)
179{
180 struct posix_acl *default_acl, *acl;
181 int error;
182
183 error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
184 if (error)
185 return error;
186
187 if (!default_acl && !acl)
188 cache_no_acl(inode);
189
190 if (default_acl) {
191 error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
192 posix_acl_release(default_acl);
193 }
194 if (acl) {
195 if (!error)
196 error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS);
197 posix_acl_release(acl);
198 }
199 return error;
200}
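
ceph_get_acl() above uses the classic two-call xattr protocol: probe with a zero-sized buffer to learn the value length, allocate, then read for real. The same pattern from userspace against getxattr(2) (Linux-specific; the path and xattr name here are arbitrary, and the probe will simply fail unless such an ACL is actually set):

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(void)
{
        const char *path = "/tmp";
        const char *name = "system.posix_acl_access";
        ssize_t size = getxattr(path, name, NULL, 0); /* size probe */
        char *value;

        if (size <= 0) {
                perror("getxattr");
                return 1;
        }
        value = malloc(size);
        if (!value)
                return 1;
        size = getxattr(path, name, value, size);     /* actual read */
        printf("read %zd bytes of ACL xattr\n", size);
        free(value);
        return 0;
}
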
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index ec3ba43b9faa..b53278c9fd97 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -209,6 +209,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
209 err = 0; 209 err = 0;
210 if (err < 0) { 210 if (err < 0) {
211 SetPageError(page); 211 SetPageError(page);
212 ceph_fscache_readpage_cancel(inode, page);
212 goto out; 213 goto out;
213 } else { 214 } else {
214 if (err < PAGE_CACHE_SIZE) { 215 if (err < PAGE_CACHE_SIZE) {
@@ -256,6 +257,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
256 for (i = 0; i < num_pages; i++) { 257 for (i = 0; i < num_pages; i++) {
257 struct page *page = osd_data->pages[i]; 258 struct page *page = osd_data->pages[i];
258 259
260 if (rc < 0)
261 goto unlock;
259 if (bytes < (int)PAGE_CACHE_SIZE) { 262 if (bytes < (int)PAGE_CACHE_SIZE) {
260 /* zero (remainder of) page */ 263 /* zero (remainder of) page */
261 int s = bytes < 0 ? 0 : bytes; 264 int s = bytes < 0 ? 0 : bytes;
@@ -266,6 +269,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
266 flush_dcache_page(page); 269 flush_dcache_page(page);
267 SetPageUptodate(page); 270 SetPageUptodate(page);
268 ceph_readpage_to_fscache(inode, page); 271 ceph_readpage_to_fscache(inode, page);
272unlock:
269 unlock_page(page); 273 unlock_page(page);
270 page_cache_release(page); 274 page_cache_release(page);
271 bytes -= PAGE_CACHE_SIZE; 275 bytes -= PAGE_CACHE_SIZE;
@@ -1207,6 +1211,41 @@ const struct address_space_operations ceph_aops = {
1207/* 1211/*
1208 * vm ops 1212 * vm ops
1209 */ 1213 */
1214static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1215{
1216 struct inode *inode = file_inode(vma->vm_file);
1217 struct ceph_inode_info *ci = ceph_inode(inode);
1218 struct ceph_file_info *fi = vma->vm_file->private_data;
1219 loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
1220 int want, got, ret;
1221
1222 dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
1223 inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE);
1224 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1225 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
1226 else
1227 want = CEPH_CAP_FILE_CACHE;
1228 while (1) {
1229 got = 0;
1230 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
1231 if (ret == 0)
1232 break;
1233 if (ret != -ERESTARTSYS) {
1234 WARN_ON(1);
1235 return VM_FAULT_SIGBUS;
1236 }
1237 }
1238 dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
1239 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
1240
1241 ret = filemap_fault(vma, vmf);
1242
1243 dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
1244 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
1245 ceph_put_cap_refs(ci, got);
1246
1247 return ret;
1248}
1210 1249
1211/* 1250/*
1212 * Reuse write_begin here for simplicity. 1251 * Reuse write_begin here for simplicity.
@@ -1214,23 +1253,41 @@ const struct address_space_operations ceph_aops = {
1214static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1253static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1215{ 1254{
1216 struct inode *inode = file_inode(vma->vm_file); 1255 struct inode *inode = file_inode(vma->vm_file);
1217 struct page *page = vmf->page; 1256 struct ceph_inode_info *ci = ceph_inode(inode);
1257 struct ceph_file_info *fi = vma->vm_file->private_data;
1218 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1258 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1259 struct page *page = vmf->page;
1219 loff_t off = page_offset(page); 1260 loff_t off = page_offset(page);
1220 loff_t size, len; 1261 loff_t size = i_size_read(inode);
1221 int ret; 1262 size_t len;
1222 1263 int want, got, ret;
1223 /* Update time before taking page lock */
1224 file_update_time(vma->vm_file);
1225 1264
1226 size = i_size_read(inode);
1227 if (off + PAGE_CACHE_SIZE <= size) 1265 if (off + PAGE_CACHE_SIZE <= size)
1228 len = PAGE_CACHE_SIZE; 1266 len = PAGE_CACHE_SIZE;
1229 else 1267 else
1230 len = size & ~PAGE_CACHE_MASK; 1268 len = size & ~PAGE_CACHE_MASK;
1231 1269
1232 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, 1270 dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
1233 off, len, page, page->index); 1271 inode, ceph_vinop(inode), off, len, size);
1272 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1273 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
1274 else
1275 want = CEPH_CAP_FILE_BUFFER;
1276 while (1) {
1277 got = 0;
1278 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len);
1279 if (ret == 0)
1280 break;
1281 if (ret != -ERESTARTSYS) {
1282 WARN_ON(1);
1283 return VM_FAULT_SIGBUS;
1284 }
1285 }
1286 dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
1287 inode, off, len, ceph_cap_string(got));
1288
1289 /* Update time before taking page lock */
1290 file_update_time(vma->vm_file);
1234 1291
1235 lock_page(page); 1292 lock_page(page);
1236 1293
@@ -1252,14 +1309,26 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1252 ret = VM_FAULT_SIGBUS; 1309 ret = VM_FAULT_SIGBUS;
1253 } 1310 }
1254out: 1311out:
1255 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); 1312 if (ret != VM_FAULT_LOCKED) {
1256 if (ret != VM_FAULT_LOCKED)
1257 unlock_page(page); 1313 unlock_page(page);
1314 } else {
1315 int dirty;
1316 spin_lock(&ci->i_ceph_lock);
1317 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
1318 spin_unlock(&ci->i_ceph_lock);
1319 if (dirty)
1320 __mark_inode_dirty(inode, dirty);
1321 }
1322
1323 dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
1324 inode, off, len, ceph_cap_string(got), ret);
1325 ceph_put_cap_refs(ci, got);
1326
1258 return ret; 1327 return ret;
1259} 1328}
1260 1329
1261static struct vm_operations_struct ceph_vmops = { 1330static struct vm_operations_struct ceph_vmops = {
1262 .fault = filemap_fault, 1331 .fault = ceph_filemap_fault,
1263 .page_mkwrite = ceph_page_mkwrite, 1332 .page_mkwrite = ceph_page_mkwrite,
1264 .remap_pages = generic_file_remap_pages, 1333 .remap_pages = generic_file_remap_pages,
1265}; 1334};
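
Both new ceph fault handlers follow one shape: acquire capability references from the MDS (retrying on -ERESTARTSYS, as the loops above show), run the generic page-cache handler, then drop the references so the MDS can revoke again. A skeletal, non-kernel sketch of that bracketing with stubbed functions:

#include <stdio.h>

/* Stubs: the real functions talk to the MDS and the page cache. */
static int ceph_get_caps(int want, int *got) { *got = want; return 0; }
static void ceph_put_cap_refs(int got)       { (void)got; }
static int filemap_fault_stub(void)          { return 0; /* VM_FAULT_* */ }

static int ceph_filemap_fault_shape(void)
{
        int want = 1 /* stands in for CEPH_CAP_FILE_CACHE */, got = 0, ret;

        ceph_get_caps(want, &got);  /* real code retries on -ERESTARTSYS */
        ret = filemap_fault_stub(); /* the generic fault path */
        ceph_put_cap_refs(got);     /* allow the MDS to revoke again */
        return ret;
}

int main(void)
{
        printf("fault returned %d\n", ceph_filemap_fault_shape());
        return 0;
}
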
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index ba949408a336..da95f61b7a09 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -67,6 +67,14 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
67 return fscache_maybe_release_page(ci->fscache, page, gfp); 67 return fscache_maybe_release_page(ci->fscache, page, gfp);
68} 68}
69 69
70static inline void ceph_fscache_readpage_cancel(struct inode *inode,
71 struct page *page)
72{
73 struct ceph_inode_info *ci = ceph_inode(inode);
74 if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
75 __fscache_uncache_page(ci->fscache, page);
76}
77
70static inline void ceph_fscache_readpages_cancel(struct inode *inode, 78static inline void ceph_fscache_readpages_cancel(struct inode *inode,
71 struct list_head *pages) 79 struct list_head *pages)
72{ 80{
@@ -145,6 +153,11 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
145 return 1; 153 return 1;
146} 154}
147 155
156static inline void ceph_fscache_readpage_cancel(struct inode *inode,
157 struct page *page)
158{
159}
160
148static inline void ceph_fscache_readpages_cancel(struct inode *inode, 161static inline void ceph_fscache_readpages_cancel(struct inode *inode,
149 struct list_head *pages) 162 struct list_head *pages)
150{ 163{
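
ceph_fscache_readpage_cancel() is added in the usual config-gated pair: a functional inline under CONFIG_CEPH_FSCACHE and an empty stub otherwise, so call sites stay free of #ifdefs. The same idiom in a self-contained file (toggle with -DWITH_CACHE):

#include <stdio.h>

#ifdef WITH_CACHE
static inline void readpage_cancel(const char *page)
{
        printf("uncached page %s\n", page);     /* real work */
}
#else
static inline void readpage_cancel(const char *page)
{
        (void)page;                             /* feature compiled out */
}
#endif

int main(void)
{
        readpage_cancel("page0");       /* call site identical either way */
        return 0;
}
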
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3c0a4bd74996..17543383545c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -555,21 +555,34 @@ retry:
555 cap->ci = ci; 555 cap->ci = ci;
556 __insert_cap_node(ci, cap); 556 __insert_cap_node(ci, cap);
557 557
558 /* clear out old exporting info? (i.e. on cap import) */
559 if (ci->i_cap_exporting_mds == mds) {
560 ci->i_cap_exporting_issued = 0;
561 ci->i_cap_exporting_mseq = 0;
562 ci->i_cap_exporting_mds = -1;
563 }
564
565 /* add to session cap list */ 558 /* add to session cap list */
566 cap->session = session; 559 cap->session = session;
567 spin_lock(&session->s_cap_lock); 560 spin_lock(&session->s_cap_lock);
568 list_add_tail(&cap->session_caps, &session->s_caps); 561 list_add_tail(&cap->session_caps, &session->s_caps);
569 session->s_nr_caps++; 562 session->s_nr_caps++;
570 spin_unlock(&session->s_cap_lock); 563 spin_unlock(&session->s_cap_lock);
571 } else if (new_cap) 564 } else {
572 ceph_put_cap(mdsc, new_cap); 565 if (new_cap)
566 ceph_put_cap(mdsc, new_cap);
567
568 /*
569 * auth mds of the inode changed. we received the cap export
570 * message, but still haven't received the cap import message.
571 * handle_cap_export() updated the new auth MDS' cap.
572 *
573 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
 574 * a message that was sent before the cap import message. So
575 * don't remove caps.
576 */
577 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
578 WARN_ON(cap != ci->i_auth_cap);
579 WARN_ON(cap->cap_id != cap_id);
580 seq = cap->seq;
581 mseq = cap->mseq;
582 issued |= cap->issued;
583 flags |= CEPH_CAP_FLAG_AUTH;
584 }
585 }
573 586
574 if (!ci->i_snap_realm) { 587 if (!ci->i_snap_realm) {
575 /* 588 /*
@@ -611,15 +624,9 @@ retry:
611 if (ci->i_auth_cap == NULL || 624 if (ci->i_auth_cap == NULL ||
612 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) 625 ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
613 ci->i_auth_cap = cap; 626 ci->i_auth_cap = cap;
614 } else if (ci->i_auth_cap == cap) { 627 ci->i_cap_exporting_issued = 0;
615 ci->i_auth_cap = NULL; 628 } else {
616 spin_lock(&mdsc->cap_dirty_lock); 629 WARN_ON(ci->i_auth_cap == cap);
617 if (!list_empty(&ci->i_dirty_item)) {
618 dout(" moving %p to cap_dirty_migrating\n", inode);
619 list_move(&ci->i_dirty_item,
620 &mdsc->cap_dirty_migrating);
621 }
622 spin_unlock(&mdsc->cap_dirty_lock);
623 } 630 }
624 631
625 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", 632 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
@@ -628,7 +635,7 @@ retry:
628 cap->cap_id = cap_id; 635 cap->cap_id = cap_id;
629 cap->issued = issued; 636 cap->issued = issued;
630 cap->implemented |= issued; 637 cap->implemented |= issued;
631 if (mseq > cap->mseq) 638 if (ceph_seq_cmp(mseq, cap->mseq) > 0)
632 cap->mds_wanted = wanted; 639 cap->mds_wanted = wanted;
633 else 640 else
634 cap->mds_wanted |= wanted; 641 cap->mds_wanted |= wanted;
@@ -816,7 +823,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
816 823
817 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 824 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
818 cap = rb_entry(p, struct ceph_cap, ci_node); 825 cap = rb_entry(p, struct ceph_cap, ci_node);
819 if (cap != ocap && __cap_is_valid(cap) && 826 if (cap != ocap &&
820 (cap->implemented & ~cap->issued & mask)) 827 (cap->implemented & ~cap->issued & mask))
821 return 1; 828 return 1;
822 } 829 }
@@ -888,7 +895,19 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
888 */ 895 */
889static int __ceph_is_any_caps(struct ceph_inode_info *ci) 896static int __ceph_is_any_caps(struct ceph_inode_info *ci)
890{ 897{
891 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; 898 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued;
899}
900
901int ceph_is_any_caps(struct inode *inode)
902{
903 struct ceph_inode_info *ci = ceph_inode(inode);
904 int ret;
905
906 spin_lock(&ci->i_ceph_lock);
907 ret = __ceph_is_any_caps(ci);
908 spin_unlock(&ci->i_ceph_lock);
909
910 return ret;
892} 911}
893 912
894/* 913/*
@@ -1383,13 +1402,10 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1383 ci->i_snap_realm->cached_context); 1402 ci->i_snap_realm->cached_context);
1384 dout(" inode %p now dirty snapc %p auth cap %p\n", 1403 dout(" inode %p now dirty snapc %p auth cap %p\n",
1385 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); 1404 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
1405 WARN_ON(!ci->i_auth_cap);
1386 BUG_ON(!list_empty(&ci->i_dirty_item)); 1406 BUG_ON(!list_empty(&ci->i_dirty_item));
1387 spin_lock(&mdsc->cap_dirty_lock); 1407 spin_lock(&mdsc->cap_dirty_lock);
1388 if (ci->i_auth_cap) 1408 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1389 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1390 else
1391 list_add(&ci->i_dirty_item,
1392 &mdsc->cap_dirty_migrating);
1393 spin_unlock(&mdsc->cap_dirty_lock); 1409 spin_unlock(&mdsc->cap_dirty_lock);
1394 if (ci->i_flushing_caps == 0) { 1410 if (ci->i_flushing_caps == 0) {
1395 ihold(inode); 1411 ihold(inode);
@@ -1735,13 +1751,12 @@ ack:
1735/* 1751/*
1736 * Try to flush dirty caps back to the auth mds. 1752 * Try to flush dirty caps back to the auth mds.
1737 */ 1753 */
1738static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1754static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
1739 unsigned *flush_tid)
1740{ 1755{
1741 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1756 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1742 struct ceph_inode_info *ci = ceph_inode(inode); 1757 struct ceph_inode_info *ci = ceph_inode(inode);
1743 int unlock_session = session ? 0 : 1;
1744 int flushing = 0; 1758 int flushing = 0;
1759 struct ceph_mds_session *session = NULL;
1745 1760
1746retry: 1761retry:
1747 spin_lock(&ci->i_ceph_lock); 1762 spin_lock(&ci->i_ceph_lock);
@@ -1755,13 +1770,14 @@ retry:
1755 int want = __ceph_caps_wanted(ci); 1770 int want = __ceph_caps_wanted(ci);
1756 int delayed; 1771 int delayed;
1757 1772
1758 if (!session) { 1773 if (!session || session != cap->session) {
1759 spin_unlock(&ci->i_ceph_lock); 1774 spin_unlock(&ci->i_ceph_lock);
1775 if (session)
1776 mutex_unlock(&session->s_mutex);
1760 session = cap->session; 1777 session = cap->session;
1761 mutex_lock(&session->s_mutex); 1778 mutex_lock(&session->s_mutex);
1762 goto retry; 1779 goto retry;
1763 } 1780 }
1764 BUG_ON(session != cap->session);
1765 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) 1781 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1766 goto out; 1782 goto out;
1767 1783
@@ -1780,7 +1796,7 @@ retry:
1780out: 1796out:
1781 spin_unlock(&ci->i_ceph_lock); 1797 spin_unlock(&ci->i_ceph_lock);
1782out_unlocked: 1798out_unlocked:
1783 if (session && unlock_session) 1799 if (session)
1784 mutex_unlock(&session->s_mutex); 1800 mutex_unlock(&session->s_mutex);
1785 return flushing; 1801 return flushing;
1786} 1802}
@@ -1865,7 +1881,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
1865 return ret; 1881 return ret;
1866 mutex_lock(&inode->i_mutex); 1882 mutex_lock(&inode->i_mutex);
1867 1883
1868 dirty = try_flush_caps(inode, NULL, &flush_tid); 1884 dirty = try_flush_caps(inode, &flush_tid);
1869 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); 1885 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1870 1886
1871 /* 1887 /*
@@ -1900,7 +1916,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1900 1916
1901 dout("write_inode %p wait=%d\n", inode, wait); 1917 dout("write_inode %p wait=%d\n", inode, wait);
1902 if (wait) { 1918 if (wait) {
1903 dirty = try_flush_caps(inode, NULL, &flush_tid); 1919 dirty = try_flush_caps(inode, &flush_tid);
1904 if (dirty) 1920 if (dirty)
1905 err = wait_event_interruptible(ci->i_cap_wq, 1921 err = wait_event_interruptible(ci->i_cap_wq,
1906 caps_are_flushed(inode, flush_tid)); 1922 caps_are_flushed(inode, flush_tid));
@@ -2350,11 +2366,11 @@ static void invalidate_aliases(struct inode *inode)
2350 d_prune_aliases(inode); 2366 d_prune_aliases(inode);
2351 /* 2367 /*
2352 * For non-directory inode, d_find_alias() only returns 2368 * For non-directory inode, d_find_alias() only returns
2353 * connected dentry. After calling d_invalidate(), the 2369 * hashed dentry. After calling d_invalidate(), the
2354 * dentry become disconnected. 2370 * dentry becomes unhashed.
2355 * 2371 *
2356 * For directory inode, d_find_alias() can return 2372 * For directory inode, d_find_alias() can return
2357 * disconnected dentry. But directory inode should have 2373 * unhashed dentry. But directory inode should have
2358 * one alias at most. 2374 * one alias at most.
2359 */ 2375 */
2360 while ((dn = d_find_alias(inode))) { 2376 while ((dn = d_find_alias(inode))) {
@@ -2408,6 +2424,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2408 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2424 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2409 inode->i_size); 2425 inode->i_size);
2410 2426
2427
2428 /*
2429 * auth mds of the inode changed. we received the cap export message,
2430 * but still haven't received the cap import message. handle_cap_export
2431 * updated the new auth MDS' cap.
2432 *
2433 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
2434 * that was sent before the cap import message. So don't remove caps.
2435 */
2436 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
2437 WARN_ON(cap != ci->i_auth_cap);
2438 WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
2439 seq = cap->seq;
2440 newcaps |= cap->issued;
2441 }
2442
2411 /* 2443 /*
2412 * If CACHE is being revoked, and we have no dirty buffers, 2444 * If CACHE is being revoked, and we have no dirty buffers,
2413 * try to invalidate (once). (If there are dirty buffers, we 2445 * try to invalidate (once). (If there are dirty buffers, we
@@ -2434,6 +2466,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2434 issued |= implemented | __ceph_caps_dirty(ci); 2466 issued |= implemented | __ceph_caps_dirty(ci);
2435 2467
2436 cap->cap_gen = session->s_cap_gen; 2468 cap->cap_gen = session->s_cap_gen;
2469 cap->seq = seq;
2437 2470
2438 __check_cap_issue(ci, cap, newcaps); 2471 __check_cap_issue(ci, cap, newcaps);
2439 2472
@@ -2464,6 +2497,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2464 ceph_buffer_put(ci->i_xattrs.blob); 2497 ceph_buffer_put(ci->i_xattrs.blob);
2465 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); 2498 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2466 ci->i_xattrs.version = version; 2499 ci->i_xattrs.version = version;
2500 ceph_forget_all_cached_acls(inode);
2467 } 2501 }
2468 } 2502 }
2469 2503
@@ -2483,6 +2517,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2483 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, 2517 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2484 &atime); 2518 &atime);
2485 2519
2520
2521 /* file layout may have changed */
2522 ci->i_layout = grant->layout;
2523
2486 /* max size increase? */ 2524 /* max size increase? */
2487 if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { 2525 if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
2488 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); 2526 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
@@ -2511,11 +2549,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2511 check_caps = 1; 2549 check_caps = 1;
2512 } 2550 }
2513 2551
2514 cap->seq = seq;
2515
2516 /* file layout may have changed */
2517 ci->i_layout = grant->layout;
2518
2519 /* revocation, grant, or no-op? */ 2552 /* revocation, grant, or no-op? */
2520 if (cap->issued & ~newcaps) { 2553 if (cap->issued & ~newcaps) {
2521 int revoking = cap->issued & ~newcaps; 2554 int revoking = cap->issued & ~newcaps;
@@ -2741,65 +2774,114 @@ static void handle_cap_trunc(struct inode *inode,
2741 * caller holds s_mutex 2774 * caller holds s_mutex
2742 */ 2775 */
2743static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, 2776static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2744 struct ceph_mds_session *session, 2777 struct ceph_mds_cap_peer *ph,
2745 int *open_target_sessions) 2778 struct ceph_mds_session *session)
2746{ 2779{
2747 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 2780 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
2781 struct ceph_mds_session *tsession = NULL;
2782 struct ceph_cap *cap, *tcap;
2748 struct ceph_inode_info *ci = ceph_inode(inode); 2783 struct ceph_inode_info *ci = ceph_inode(inode);
2749 int mds = session->s_mds; 2784 u64 t_cap_id;
2750 unsigned mseq = le32_to_cpu(ex->migrate_seq); 2785 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2751 struct ceph_cap *cap = NULL, *t; 2786 unsigned t_seq, t_mseq;
2752 struct rb_node *p; 2787 int target, issued;
2753 int remember = 1; 2788 int mds = session->s_mds;
2754 2789
2755 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", 2790 if (ph) {
2756 inode, ci, mds, mseq); 2791 t_cap_id = le64_to_cpu(ph->cap_id);
2792 t_seq = le32_to_cpu(ph->seq);
2793 t_mseq = le32_to_cpu(ph->mseq);
2794 target = le32_to_cpu(ph->mds);
2795 } else {
2796 t_cap_id = t_seq = t_mseq = 0;
2797 target = -1;
2798 }
2757 2799
2800 dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
2801 inode, ci, mds, mseq, target);
2802retry:
2758 spin_lock(&ci->i_ceph_lock); 2803 spin_lock(&ci->i_ceph_lock);
2804 cap = __get_cap_for_mds(ci, mds);
2805 if (!cap)
2806 goto out_unlock;
2759 2807
2760 /* make sure we haven't seen a higher mseq */ 2808 if (target < 0) {
2761 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 2809 __ceph_remove_cap(cap, false);
2762 t = rb_entry(p, struct ceph_cap, ci_node); 2810 goto out_unlock;
2763 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2764 dout(" higher mseq on cap from mds%d\n",
2765 t->session->s_mds);
2766 remember = 0;
2767 }
2768 if (t->session->s_mds == mds)
2769 cap = t;
2770 } 2811 }
2771 2812
2772 if (cap) { 2813 /*
2773 if (remember) { 2814 * now we know we haven't received the cap import message yet
2774 /* make note */ 2815 * because the exported cap still exist.
2775 ci->i_cap_exporting_mds = mds; 2816 */
2776 ci->i_cap_exporting_mseq = mseq;
2777 ci->i_cap_exporting_issued = cap->issued;
2778
2779 /*
2780 * make sure we have open sessions with all possible
2781 * export targets, so that we get the matching IMPORT
2782 */
2783 *open_target_sessions = 1;
2784 2817
2785 /* 2818 issued = cap->issued;
2786 * we can't flush dirty caps that we've seen the 2819 WARN_ON(issued != cap->implemented);
2787 * EXPORT but no IMPORT for 2820
2788 */ 2821 tcap = __get_cap_for_mds(ci, target);
2789 spin_lock(&mdsc->cap_dirty_lock); 2822 if (tcap) {
2790 if (!list_empty(&ci->i_dirty_item)) { 2823 /* already have caps from the target */
2791 dout(" moving %p to cap_dirty_migrating\n", 2824 if (tcap->cap_id != t_cap_id ||
2792 inode); 2825 ceph_seq_cmp(tcap->seq, t_seq) < 0) {
2793 list_move(&ci->i_dirty_item, 2826 dout(" updating import cap %p mds%d\n", tcap, target);
2794 &mdsc->cap_dirty_migrating); 2827 tcap->cap_id = t_cap_id;
2828 tcap->seq = t_seq - 1;
2829 tcap->issue_seq = t_seq - 1;
2830 tcap->mseq = t_mseq;
2831 tcap->issued |= issued;
2832 tcap->implemented |= issued;
2833 if (cap == ci->i_auth_cap)
2834 ci->i_auth_cap = tcap;
2835 if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
2836 spin_lock(&mdsc->cap_dirty_lock);
2837 list_move_tail(&ci->i_flushing_item,
2838 &tcap->session->s_cap_flushing);
2839 spin_unlock(&mdsc->cap_dirty_lock);
2795 } 2840 }
2796 spin_unlock(&mdsc->cap_dirty_lock);
2797 } 2841 }
2798 __ceph_remove_cap(cap, false); 2842 __ceph_remove_cap(cap, false);
2843 goto out_unlock;
2799 } 2844 }
2800 /* else, we already released it */
2801 2845
2846 if (tsession) {
2847 int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
2848 spin_unlock(&ci->i_ceph_lock);
2849 /* add placeholder for the export target */
2850 ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
2851 t_seq - 1, t_mseq, (u64)-1, flag, NULL);
2852 goto retry;
2853 }
2854
2855 spin_unlock(&ci->i_ceph_lock);
2856 mutex_unlock(&session->s_mutex);
2857
2858 /* open target session */
2859 tsession = ceph_mdsc_open_export_target_session(mdsc, target);
2860 if (!IS_ERR(tsession)) {
2861 if (mds > target) {
2862 mutex_lock(&session->s_mutex);
2863 mutex_lock_nested(&tsession->s_mutex,
2864 SINGLE_DEPTH_NESTING);
2865 } else {
2866 mutex_lock(&tsession->s_mutex);
2867 mutex_lock_nested(&session->s_mutex,
2868 SINGLE_DEPTH_NESTING);
2869 }
2870 ceph_add_cap_releases(mdsc, tsession);
2871 } else {
2872 WARN_ON(1);
2873 tsession = NULL;
2874 target = -1;
2875 }
2876 goto retry;
2877
2878out_unlock:
2802 spin_unlock(&ci->i_ceph_lock); 2879 spin_unlock(&ci->i_ceph_lock);
2880 mutex_unlock(&session->s_mutex);
2881 if (tsession) {
2882 mutex_unlock(&tsession->s_mutex);
2883 ceph_put_mds_session(tsession);
2884 }
2803} 2885}
2804 2886
2805/* 2887/*
@@ -2810,10 +2892,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2810 */ 2892 */
2811static void handle_cap_import(struct ceph_mds_client *mdsc, 2893static void handle_cap_import(struct ceph_mds_client *mdsc,
2812 struct inode *inode, struct ceph_mds_caps *im, 2894 struct inode *inode, struct ceph_mds_caps *im,
2895 struct ceph_mds_cap_peer *ph,
2813 struct ceph_mds_session *session, 2896 struct ceph_mds_session *session,
2814 void *snaptrace, int snaptrace_len) 2897 void *snaptrace, int snaptrace_len)
2815{ 2898{
2816 struct ceph_inode_info *ci = ceph_inode(inode); 2899 struct ceph_inode_info *ci = ceph_inode(inode);
2900 struct ceph_cap *cap;
2817 int mds = session->s_mds; 2901 int mds = session->s_mds;
2818 unsigned issued = le32_to_cpu(im->caps); 2902 unsigned issued = le32_to_cpu(im->caps);
2819 unsigned wanted = le32_to_cpu(im->wanted); 2903 unsigned wanted = le32_to_cpu(im->wanted);
@@ -2821,28 +2905,44 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2821 unsigned mseq = le32_to_cpu(im->migrate_seq); 2905 unsigned mseq = le32_to_cpu(im->migrate_seq);
2822 u64 realmino = le64_to_cpu(im->realm); 2906 u64 realmino = le64_to_cpu(im->realm);
2823 u64 cap_id = le64_to_cpu(im->cap_id); 2907 u64 cap_id = le64_to_cpu(im->cap_id);
2908 u64 p_cap_id;
2909 int peer;
2824 2910
2825 if (ci->i_cap_exporting_mds >= 0 && 2911 if (ph) {
2826 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { 2912 p_cap_id = le64_to_cpu(ph->cap_id);
2827 dout("handle_cap_import inode %p ci %p mds%d mseq %d" 2913 peer = le32_to_cpu(ph->mds);
2828 " - cleared exporting from mds%d\n", 2914 } else {
2829 inode, ci, mds, mseq, 2915 p_cap_id = 0;
2830 ci->i_cap_exporting_mds); 2916 peer = -1;
2831 ci->i_cap_exporting_issued = 0; 2917 }
2832 ci->i_cap_exporting_mseq = 0;
2833 ci->i_cap_exporting_mds = -1;
2834 2918
2835 spin_lock(&mdsc->cap_dirty_lock); 2919 dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
2836 if (!list_empty(&ci->i_dirty_item)) { 2920 inode, ci, mds, mseq, peer);
2837 dout(" moving %p back to cap_dirty\n", inode); 2921
2838 list_move(&ci->i_dirty_item, &mdsc->cap_dirty); 2922 spin_lock(&ci->i_ceph_lock);
2923 cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
2924 if (cap && cap->cap_id == p_cap_id) {
2925 dout(" remove export cap %p mds%d flags %d\n",
2926 cap, peer, ph->flags);
2927 if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
2928 (cap->seq != le32_to_cpu(ph->seq) ||
2929 cap->mseq != le32_to_cpu(ph->mseq))) {
2930 pr_err("handle_cap_import: mismatched seq/mseq: "
2931 "ino (%llx.%llx) mds%d seq %d mseq %d "
2932 "importer mds%d has peer seq %d mseq %d\n",
2933 ceph_vinop(inode), peer, cap->seq,
2934 cap->mseq, mds, le32_to_cpu(ph->seq),
2935 le32_to_cpu(ph->mseq));
2839 } 2936 }
2840 spin_unlock(&mdsc->cap_dirty_lock); 2937 ci->i_cap_exporting_issued = cap->issued;
2841 } else { 2938 __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
2842 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2843 inode, ci, mds, mseq);
2844 } 2939 }
2845 2940
2941 /* make sure we re-request max_size, if necessary */
2942 ci->i_wanted_max_size = 0;
2943 ci->i_requested_max_size = 0;
2944 spin_unlock(&ci->i_ceph_lock);
2945
2846 down_write(&mdsc->snap_rwsem); 2946 down_write(&mdsc->snap_rwsem);
2847 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, 2947 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2848 false); 2948 false);
@@ -2853,11 +2953,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2853 kick_flushing_inode_caps(mdsc, session, inode); 2953 kick_flushing_inode_caps(mdsc, session, inode);
2854 up_read(&mdsc->snap_rwsem); 2954 up_read(&mdsc->snap_rwsem);
2855 2955
2856 /* make sure we re-request max_size, if necessary */
2857 spin_lock(&ci->i_ceph_lock);
2858 ci->i_wanted_max_size = 0; /* reset */
2859 ci->i_requested_max_size = 0;
2860 spin_unlock(&ci->i_ceph_lock);
2861} 2956}
2862 2957
2863/* 2958/*
@@ -2875,6 +2970,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2875 struct ceph_inode_info *ci; 2970 struct ceph_inode_info *ci;
2876 struct ceph_cap *cap; 2971 struct ceph_cap *cap;
2877 struct ceph_mds_caps *h; 2972 struct ceph_mds_caps *h;
2973 struct ceph_mds_cap_peer *peer = NULL;
2878 int mds = session->s_mds; 2974 int mds = session->s_mds;
2879 int op; 2975 int op;
2880 u32 seq, mseq; 2976 u32 seq, mseq;
@@ -2885,12 +2981,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2885 void *snaptrace; 2981 void *snaptrace;
2886 size_t snaptrace_len; 2982 size_t snaptrace_len;
2887 void *flock; 2983 void *flock;
2984 void *end;
2888 u32 flock_len; 2985 u32 flock_len;
2889 int open_target_sessions = 0;
2890 2986
2891 dout("handle_caps from mds%d\n", mds); 2987 dout("handle_caps from mds%d\n", mds);
2892 2988
2893 /* decode */ 2989 /* decode */
2990 end = msg->front.iov_base + msg->front.iov_len;
2894 tid = le64_to_cpu(msg->hdr.tid); 2991 tid = le64_to_cpu(msg->hdr.tid);
2895 if (msg->front.iov_len < sizeof(*h)) 2992 if (msg->front.iov_len < sizeof(*h))
2896 goto bad; 2993 goto bad;
@@ -2908,17 +3005,28 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2908 snaptrace_len = le32_to_cpu(h->snap_trace_len); 3005 snaptrace_len = le32_to_cpu(h->snap_trace_len);
2909 3006
2910 if (le16_to_cpu(msg->hdr.version) >= 2) { 3007 if (le16_to_cpu(msg->hdr.version) >= 2) {
2911 void *p, *end; 3008 void *p = snaptrace + snaptrace_len;
2912
2913 p = snaptrace + snaptrace_len;
2914 end = msg->front.iov_base + msg->front.iov_len;
2915 ceph_decode_32_safe(&p, end, flock_len, bad); 3009 ceph_decode_32_safe(&p, end, flock_len, bad);
3010 if (p + flock_len > end)
3011 goto bad;
2916 flock = p; 3012 flock = p;
2917 } else { 3013 } else {
2918 flock = NULL; 3014 flock = NULL;
2919 flock_len = 0; 3015 flock_len = 0;
2920 } 3016 }
2921 3017
3018 if (le16_to_cpu(msg->hdr.version) >= 3) {
3019 if (op == CEPH_CAP_OP_IMPORT) {
3020 void *p = flock + flock_len;
3021 if (p + sizeof(*peer) > end)
3022 goto bad;
3023 peer = p;
3024 } else if (op == CEPH_CAP_OP_EXPORT) {
3025 /* recorded in unused fields */
3026 peer = (void *)&h->size;
3027 }
3028 }
3029
2922 mutex_lock(&session->s_mutex); 3030 mutex_lock(&session->s_mutex);
2923 session->s_seq++; 3031 session->s_seq++;
2924 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, 3032 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
@@ -2951,11 +3059,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2951 goto done; 3059 goto done;
2952 3060
2953 case CEPH_CAP_OP_EXPORT: 3061 case CEPH_CAP_OP_EXPORT:
2954 handle_cap_export(inode, h, session, &open_target_sessions); 3062 handle_cap_export(inode, h, peer, session);
2955 goto done; 3063 goto done_unlocked;
2956 3064
2957 case CEPH_CAP_OP_IMPORT: 3065 case CEPH_CAP_OP_IMPORT:
2958 handle_cap_import(mdsc, inode, h, session, 3066 handle_cap_import(mdsc, inode, h, peer, session,
2959 snaptrace, snaptrace_len); 3067 snaptrace, snaptrace_len);
2960 } 3068 }
2961 3069
@@ -3007,8 +3115,6 @@ done:
3007done_unlocked: 3115done_unlocked:
3008 if (inode) 3116 if (inode)
3009 iput(inode); 3117 iput(inode);
3010 if (open_target_sessions)
3011 ceph_mdsc_open_export_target_sessions(mdsc, session);
3012 return; 3118 return;
3013 3119
3014bad: 3120bad:
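
The caps.c message parsing now validates every wire-supplied length against the buffer end (the ceph_decode_32_safe call plus the new "p + flock_len > end" and peer-struct checks) before dereferencing anything. A freestanding version of that defensive decode pattern, assuming a little-endian host for the memcpy read:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Read a u32 only if it fits before 'end'. */
static int decode_u32(const uint8_t **p, const uint8_t *end, uint32_t *v)
{
        if ((size_t)(end - *p) < sizeof(*v))
                return -1;                      /* truncated message */
        memcpy(v, *p, sizeof(*v));
        *p += sizeof(*v);
        return 0;
}

int main(void)
{
        uint8_t msg[8] = { 4, 0, 0, 0, 'd', 'a', 't', 'a' };
        const uint8_t *p = msg, *end = msg + sizeof(msg);
        uint32_t len;

        /* length itself, then the payload it claims, must both fit */
        if (decode_u32(&p, end, &len) || len > (size_t)(end - p)) {
                fprintf(stderr, "bad message\n");
                return 1;
        }
        printf("payload of %u bytes: %.*s\n", len, (int)len, p);
        return 0;
}
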
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 2a0bcaeb189a..45eda6d7a40c 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -100,6 +100,14 @@ static unsigned fpos_off(loff_t p)
100 return p & 0xffffffff; 100 return p & 0xffffffff;
101} 101}
102 102
103static int fpos_cmp(loff_t l, loff_t r)
104{
105 int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
106 if (v)
107 return v;
108 return (int)(fpos_off(l) - fpos_off(r));
109}
110
103/* 111/*
104 * When possible, we try to satisfy a readdir by peeking at the 112 * When possible, we try to satisfy a readdir by peeking at the
105 * dcache. We make this work by carefully ordering dentries on 113 * dcache. We make this work by carefully ordering dentries on
@@ -156,7 +164,7 @@ more:
156 if (!d_unhashed(dentry) && dentry->d_inode && 164 if (!d_unhashed(dentry) && dentry->d_inode &&
157 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && 165 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
158 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && 166 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
159 ctx->pos <= di->offset) 167 fpos_cmp(ctx->pos, di->offset) <= 0)
160 break; 168 break;
161 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, 169 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
162 dentry->d_name.len, dentry->d_name.name, di->offset, 170 dentry->d_name.len, dentry->d_name.name, di->offset,
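
fpos_cmp() introduced above fixes the dcache-readdir ordering test: a readdir position packs a directory fragment in the high 32 bits and an offset in the low 32, and positions must be compared fragment-first rather than as raw 64-bit values. A simplified stand-alone version, where a plain integer compare stands in for ceph_frag_compare():

#include <stdint.h>
#include <stdio.h>

static uint32_t fpos_frag(int64_t p) { return (uint32_t)(p >> 32); }
static uint32_t fpos_off(int64_t p)  { return (uint32_t)(p & 0xffffffff); }

static int fpos_cmp(int64_t l, int64_t r)
{
        if (fpos_frag(l) != fpos_frag(r))       /* fragment first */
                return fpos_frag(l) < fpos_frag(r) ? -1 : 1;
        return (int)(fpos_off(l) - fpos_off(r)); /* then offset */
}

int main(void)
{
        int64_t a = ((int64_t)1 << 32) | 5;     /* frag 1, offset 5 */
        int64_t b = ((int64_t)2 << 32) | 0;     /* frag 2, offset 0 */

        printf("cmp(a, b) = %d\n", fpos_cmp(a, b));
        return 0;
}
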
@@ -693,7 +701,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
693 if (!err && !req->r_reply_info.head->is_dentry) 701 if (!err && !req->r_reply_info.head->is_dentry)
694 err = ceph_handle_notrace_create(dir, dentry); 702 err = ceph_handle_notrace_create(dir, dentry);
695 ceph_mdsc_put_request(req); 703 ceph_mdsc_put_request(req);
696 if (err) 704
705 if (!err)
706 ceph_init_acl(dentry, dentry->d_inode, dir);
707 else
697 d_drop(dentry); 708 d_drop(dentry);
698 return err; 709 return err;
699} 710}
@@ -731,7 +742,9 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
731 if (!err && !req->r_reply_info.head->is_dentry) 742 if (!err && !req->r_reply_info.head->is_dentry)
732 err = ceph_handle_notrace_create(dir, dentry); 743 err = ceph_handle_notrace_create(dir, dentry);
733 ceph_mdsc_put_request(req); 744 ceph_mdsc_put_request(req);
734 if (err) 745 if (!err)
746 ceph_init_acl(dentry, dentry->d_inode, dir);
747 else
735 d_drop(dentry); 748 d_drop(dentry);
736 return err; 749 return err;
737} 750}
@@ -772,7 +785,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
772 err = ceph_handle_notrace_create(dir, dentry); 785 err = ceph_handle_notrace_create(dir, dentry);
773 ceph_mdsc_put_request(req); 786 ceph_mdsc_put_request(req);
774out: 787out:
775 if (err < 0) 788 if (!err)
789 ceph_init_acl(dentry, dentry->d_inode, dir);
790 else
776 d_drop(dentry); 791 d_drop(dentry);
777 return err; 792 return err;
778} 793}
@@ -1037,14 +1052,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
1037 valid = 1; 1052 valid = 1;
1038 } else if (dentry_lease_is_valid(dentry) || 1053 } else if (dentry_lease_is_valid(dentry) ||
1039 dir_lease_is_valid(dir, dentry)) { 1054 dir_lease_is_valid(dir, dentry)) {
1040 valid = 1; 1055 if (dentry->d_inode)
1056 valid = ceph_is_any_caps(dentry->d_inode);
1057 else
1058 valid = 1;
1041 } 1059 }
1042 1060
1043 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); 1061 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
1044 if (valid) 1062 if (valid) {
1045 ceph_dentry_lru_touch(dentry); 1063 ceph_dentry_lru_touch(dentry);
1046 else 1064 } else {
1065 ceph_dir_clear_complete(dir);
1047 d_drop(dentry); 1066 d_drop(dentry);
1067 }
1048 iput(dir); 1068 iput(dir);
1049 return valid; 1069 return valid;
1050} 1070}
@@ -1293,6 +1313,8 @@ const struct inode_operations ceph_dir_iops = {
1293 .getxattr = ceph_getxattr, 1313 .getxattr = ceph_getxattr,
1294 .listxattr = ceph_listxattr, 1314 .listxattr = ceph_listxattr,
1295 .removexattr = ceph_removexattr, 1315 .removexattr = ceph_removexattr,
1316 .get_acl = ceph_get_acl,
1317 .set_acl = ceph_set_acl,
1296 .mknod = ceph_mknod, 1318 .mknod = ceph_mknod,
1297 .symlink = ceph_symlink, 1319 .symlink = ceph_symlink,
1298 .mkdir = ceph_mkdir, 1320 .mkdir = ceph_mkdir,
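
With .get_acl/.set_acl wired into the inode operations and MS_POSIXACL set on the superblock (see the super.c hunks further down), the VFS consults ceph_get_acl() during permission checks. A hedged sketch of its likely shape, assuming ACLs live in the system.posix_acl_* xattrs exposed through __ceph_getxattr() (declared later in this diff); the real code would also use the generic ACL cache:

    struct posix_acl *ceph_get_acl(struct inode *inode, int type)
    {
        const char *name = (type == ACL_TYPE_ACCESS) ?
                POSIX_ACL_XATTR_ACCESS : POSIX_ACL_XATTR_DEFAULT;
        struct posix_acl *acl;
        char *value = NULL;
        ssize_t size;

        size = __ceph_getxattr(inode, name, "", 0);   /* probe for length */
        if (size > 0) {
            value = kzalloc(size, GFP_NOFS);
            if (!value)
                return ERR_PTR(-ENOMEM);
            size = __ceph_getxattr(inode, name, value, size);
        }
        if (size > 0)
            acl = posix_acl_from_xattr(&init_user_ns, value, size);
        else if (size == -ENODATA || size == 0)
            acl = NULL;                               /* no ACL set */
        else
            acl = ERR_PTR(size);
        kfree(value);
        return acl;
    }
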
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3de89829e2a1..09c7afe32e49 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -286,6 +286,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
286 } else { 286 } else {
287 dout("atomic_open finish_open on dn %p\n", dn); 287 dout("atomic_open finish_open on dn %p\n", dn);
288 if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { 288 if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
289 ceph_init_acl(dentry, dentry->d_inode, dir);
289 *opened |= FILE_CREATED; 290 *opened |= FILE_CREATED;
290 } 291 }
291 err = finish_open(file, dentry, ceph_open, opened); 292 err = finish_open(file, dentry, ceph_open, opened);
@@ -408,51 +409,92 @@ more:
408 * 409 *
409 * If the read spans object boundary, just do multiple reads. 410 * If the read spans object boundary, just do multiple reads.
410 */ 411 */
411static ssize_t ceph_sync_read(struct file *file, char __user *data, 412static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
412 unsigned len, loff_t *poff, int *checkeof) 413 int *checkeof)
413{ 414{
415 struct file *file = iocb->ki_filp;
414 struct inode *inode = file_inode(file); 416 struct inode *inode = file_inode(file);
415 struct page **pages; 417 struct page **pages;
416 u64 off = *poff; 418 u64 off = iocb->ki_pos;
417 int num_pages, ret; 419 int num_pages, ret;
420 size_t len = i->count;
418 421
419 dout("sync_read on file %p %llu~%u %s\n", file, off, len, 422 dout("sync_read on file %p %llu~%u %s\n", file, off,
423 (unsigned)len,
420 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 424 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
421
422 if (file->f_flags & O_DIRECT) {
423 num_pages = calc_pages_for((unsigned long)data, len);
424 pages = ceph_get_direct_page_vector(data, num_pages, true);
425 } else {
426 num_pages = calc_pages_for(off, len);
427 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
428 }
429 if (IS_ERR(pages))
430 return PTR_ERR(pages);
431
432 /* 425 /*
433 * flush any page cache pages in this range. this 426 * flush any page cache pages in this range. this
434 * will make concurrent normal and sync io slow, 427 * will make concurrent normal and sync io slow,
435 * but it will at least behave sensibly when they are 428 * but it will at least behave sensibly when they are
436 * in sequence. 429 * in sequence.
437 */ 430 */
438 ret = filemap_write_and_wait(inode->i_mapping); 431 ret = filemap_write_and_wait_range(inode->i_mapping, off,
432 off + len);
439 if (ret < 0) 433 if (ret < 0)
440 goto done; 434 return ret;
441 435
442 ret = striped_read(inode, off, len, pages, num_pages, checkeof, 436 if (file->f_flags & O_DIRECT) {
443 file->f_flags & O_DIRECT, 437 while (iov_iter_count(i)) {
444 (unsigned long)data & ~PAGE_MASK); 438 void __user *data = i->iov[0].iov_base + i->iov_offset;
439 size_t len = i->iov[0].iov_len - i->iov_offset;
440
441 num_pages = calc_pages_for((unsigned long)data, len);
442 pages = ceph_get_direct_page_vector(data,
443 num_pages, true);
444 if (IS_ERR(pages))
445 return PTR_ERR(pages);
446
447 ret = striped_read(inode, off, len,
448 pages, num_pages, checkeof,
449 1, (unsigned long)data & ~PAGE_MASK);
450 ceph_put_page_vector(pages, num_pages, true);
451
452 if (ret <= 0)
453 break;
454 off += ret;
455 iov_iter_advance(i, ret);
456 if (ret < len)
457 break;
458 }
459 } else {
460 num_pages = calc_pages_for(off, len);
461 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
462 if (IS_ERR(pages))
463 return PTR_ERR(pages);
464 ret = striped_read(inode, off, len, pages,
465 num_pages, checkeof, 0, 0);
466 if (ret > 0) {
467 int l, k = 0;
468 size_t left = len = ret;
469
470 while (left) {
471 void __user *data = i->iov[0].iov_base
472 + i->iov_offset;
473 l = min(i->iov[0].iov_len - i->iov_offset,
474 left);
475
476 ret = ceph_copy_page_vector_to_user(&pages[k],
477 data, off,
478 l);
479 if (ret > 0) {
480 iov_iter_advance(i, ret);
481 left -= ret;
482 off += ret;
483 k = calc_pages_for(iocb->ki_pos,
484 len - left + 1) - 1;
485 BUG_ON(k >= num_pages && left);
486 } else
487 break;
488 }
489 }
490 ceph_release_page_vector(pages, num_pages);
491 }
445 492
446 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 493 if (off > iocb->ki_pos) {
447 ret = ceph_copy_page_vector_to_user(pages, data, off, ret); 494 ret = off - iocb->ki_pos;
448 if (ret >= 0) 495 iocb->ki_pos = off;
449 *poff = off + ret; 496 }
450 497
451done:
452 if (file->f_flags & O_DIRECT)
453 ceph_put_page_vector(pages, num_pages, true);
454 else
455 ceph_release_page_vector(pages, num_pages);
456 dout("sync_read result %d\n", ret); 498 dout("sync_read result %d\n", ret);
457 return ret; 499 return ret;
458} 500}
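
The rewritten ceph_sync_read() is the first of several conversions in this diff from a single (buf, len, *ppos) user pointer to the kiocb/iov_iter pair, so scatter-gather reads and segment-at-a-time O_DIRECT I/O fall out of one loop shape; the buffered branch additionally recomputes the source page index k from how many bytes have been copied so far. The 3.14-era iterator idiom pokes at the iov/iov_offset fields directly (later kernels hide them behind helpers); do_one_segment below is a hypothetical stand-in:

    struct iov_iter i;

    iov_iter_init(&i, iov, nr_segs, count, 0);  /* 5-arg form of this era */
    while (iov_iter_count(&i) > 0) {
        void __user *data = i.iov->iov_base + i.iov_offset;
        size_t seg = i.iov->iov_len - i.iov_offset;
        ssize_t done = do_one_segment(data, seg);  /* hypothetical helper */

        if (done <= 0)
            break;
        iov_iter_advance(&i, done);  /* may step into the next iovec */
        if (done < seg)
            break;                   /* short I/O: stop, as the read loop does */
    }
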
@@ -489,83 +531,79 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
489 } 531 }
490} 532}
491 533
534
492/* 535/*
493 * Synchronous write, straight from __user pointer or user pages (if 536 * Synchronous write, straight from __user pointer or user pages.
494 * O_DIRECT).
495 * 537 *
496 * If write spans object boundary, just do multiple writes. (For a 538 * If write spans object boundary, just do multiple writes. (For a
497 * correct atomic write, we should e.g. take write locks on all 539 * correct atomic write, we should e.g. take write locks on all
498 * objects, rollback on failure, etc.) 540 * objects, rollback on failure, etc.)
499 */ 541 */
500static ssize_t ceph_sync_write(struct file *file, const char __user *data, 542static ssize_t
501 size_t left, loff_t pos, loff_t *ppos) 543ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
544 unsigned long nr_segs, size_t count)
502{ 545{
546 struct file *file = iocb->ki_filp;
503 struct inode *inode = file_inode(file); 547 struct inode *inode = file_inode(file);
504 struct ceph_inode_info *ci = ceph_inode(inode); 548 struct ceph_inode_info *ci = ceph_inode(inode);
505 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 549 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
506 struct ceph_snap_context *snapc; 550 struct ceph_snap_context *snapc;
507 struct ceph_vino vino; 551 struct ceph_vino vino;
508 struct ceph_osd_request *req; 552 struct ceph_osd_request *req;
509 int num_ops = 1;
510 struct page **pages; 553 struct page **pages;
511 int num_pages; 554 int num_pages;
512 u64 len;
513 int written = 0; 555 int written = 0;
514 int flags; 556 int flags;
515 int check_caps = 0; 557 int check_caps = 0;
516 int page_align, io_align; 558 int page_align;
517 unsigned long buf_align;
518 int ret; 559 int ret;
519 struct timespec mtime = CURRENT_TIME; 560 struct timespec mtime = CURRENT_TIME;
520 bool own_pages = false; 561 loff_t pos = iocb->ki_pos;
562 struct iov_iter i;
521 563
522 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 564 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
523 return -EROFS; 565 return -EROFS;
524 566
525 dout("sync_write on file %p %lld~%u %s\n", file, pos, 567 dout("sync_direct_write on file %p %lld~%u\n", file, pos,
526 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 568 (unsigned)count);
527 569
528 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 570 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
529 if (ret < 0) 571 if (ret < 0)
530 return ret; 572 return ret;
531 573
532 ret = invalidate_inode_pages2_range(inode->i_mapping, 574 ret = invalidate_inode_pages2_range(inode->i_mapping,
533 pos >> PAGE_CACHE_SHIFT, 575 pos >> PAGE_CACHE_SHIFT,
534 (pos + left) >> PAGE_CACHE_SHIFT); 576 (pos + count) >> PAGE_CACHE_SHIFT);
535 if (ret < 0) 577 if (ret < 0)
536 dout("invalidate_inode_pages2_range returned %d\n", ret); 578 dout("invalidate_inode_pages2_range returned %d\n", ret);
537 579
538 flags = CEPH_OSD_FLAG_ORDERSNAP | 580 flags = CEPH_OSD_FLAG_ORDERSNAP |
539 CEPH_OSD_FLAG_ONDISK | 581 CEPH_OSD_FLAG_ONDISK |
540 CEPH_OSD_FLAG_WRITE; 582 CEPH_OSD_FLAG_WRITE;
541 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
542 flags |= CEPH_OSD_FLAG_ACK;
543 else
544 num_ops++; /* Also include a 'startsync' command. */
545 583
546 /* 584 iov_iter_init(&i, iov, nr_segs, count, 0);
547 * we may need to do multiple writes here if we span an object 585
548 * boundary. this isn't atomic, unfortunately. :( 586 while (iov_iter_count(&i) > 0) {
549 */ 587 void __user *data = i.iov->iov_base + i.iov_offset;
550more: 588 u64 len = i.iov->iov_len - i.iov_offset;
551 io_align = pos & ~PAGE_MASK; 589
552 buf_align = (unsigned long)data & ~PAGE_MASK; 590 page_align = (unsigned long)data & ~PAGE_MASK;
553 len = left; 591
554 592 snapc = ci->i_snap_realm->cached_context;
555 snapc = ci->i_snap_realm->cached_context; 593 vino = ceph_vino(inode);
556 vino = ceph_vino(inode); 594 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
557 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 595 vino, pos, &len,
 558	 vino, pos, &len, num_ops, 595 vino, pos, &len,

559 CEPH_OSD_OP_WRITE, flags, snapc, 597 CEPH_OSD_OP_WRITE, flags, snapc,
560 ci->i_truncate_seq, ci->i_truncate_size, 598 ci->i_truncate_seq,
561 false); 599 ci->i_truncate_size,
562 if (IS_ERR(req)) 600 false);
563 return PTR_ERR(req); 601 if (IS_ERR(req)) {
602 ret = PTR_ERR(req);
603 goto out;
604 }
564 605
565 /* write from beginning of first page, regardless of io alignment */ 606 num_pages = calc_pages_for(page_align, len);
566 page_align = file->f_flags & O_DIRECT ? buf_align : io_align;
567 num_pages = calc_pages_for(page_align, len);
568 if (file->f_flags & O_DIRECT) {
569 pages = ceph_get_direct_page_vector(data, num_pages, false); 607 pages = ceph_get_direct_page_vector(data, num_pages, false);
570 if (IS_ERR(pages)) { 608 if (IS_ERR(pages)) {
571 ret = PTR_ERR(pages); 609 ret = PTR_ERR(pages);
@@ -577,60 +615,175 @@ more:
577 * may block. 615 * may block.
578 */ 616 */
579 truncate_inode_pages_range(inode->i_mapping, pos, 617 truncate_inode_pages_range(inode->i_mapping, pos,
580 (pos+len) | (PAGE_CACHE_SIZE-1)); 618 (pos+len) | (PAGE_CACHE_SIZE-1));
581 } else { 619 osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
620 false, false);
621
622 /* BUG_ON(vino.snap != CEPH_NOSNAP); */
623 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
624
625 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
626 if (!ret)
627 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
628
629 ceph_put_page_vector(pages, num_pages, false);
630
631out:
632 ceph_osdc_put_request(req);
633 if (ret == 0) {
634 pos += len;
635 written += len;
636 iov_iter_advance(&i, (size_t)len);
637
638 if (pos > i_size_read(inode)) {
639 check_caps = ceph_inode_set_size(inode, pos);
640 if (check_caps)
641 ceph_check_caps(ceph_inode(inode),
642 CHECK_CAPS_AUTHONLY,
643 NULL);
644 }
645 } else
646 break;
647 }
648
649 if (ret != -EOLDSNAPC && written > 0) {
650 iocb->ki_pos = pos;
651 ret = written;
652 }
653 return ret;
654}
655
656
657/*
658 * Synchronous write, straight from __user pointer or user pages.
659 *
660 * If write spans object boundary, just do multiple writes. (For a
661 * correct atomic write, we should e.g. take write locks on all
662 * objects, rollback on failure, etc.)
663 */
664static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
665 unsigned long nr_segs, size_t count)
666{
667 struct file *file = iocb->ki_filp;
668 struct inode *inode = file_inode(file);
669 struct ceph_inode_info *ci = ceph_inode(inode);
670 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
671 struct ceph_snap_context *snapc;
672 struct ceph_vino vino;
673 struct ceph_osd_request *req;
674 struct page **pages;
675 u64 len;
676 int num_pages;
677 int written = 0;
678 int flags;
679 int check_caps = 0;
680 int ret;
681 struct timespec mtime = CURRENT_TIME;
682 loff_t pos = iocb->ki_pos;
683 struct iov_iter i;
684
685 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
686 return -EROFS;
687
688 dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count);
689
690 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
691 if (ret < 0)
692 return ret;
693
694 ret = invalidate_inode_pages2_range(inode->i_mapping,
695 pos >> PAGE_CACHE_SHIFT,
696 (pos + count) >> PAGE_CACHE_SHIFT);
697 if (ret < 0)
698 dout("invalidate_inode_pages2_range returned %d\n", ret);
699
700 flags = CEPH_OSD_FLAG_ORDERSNAP |
701 CEPH_OSD_FLAG_ONDISK |
702 CEPH_OSD_FLAG_WRITE |
703 CEPH_OSD_FLAG_ACK;
704
705 iov_iter_init(&i, iov, nr_segs, count, 0);
706
707 while ((len = iov_iter_count(&i)) > 0) {
708 size_t left;
709 int n;
710
711 snapc = ci->i_snap_realm->cached_context;
712 vino = ceph_vino(inode);
713 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
714 vino, pos, &len, 1,
715 CEPH_OSD_OP_WRITE, flags, snapc,
716 ci->i_truncate_seq,
717 ci->i_truncate_size,
718 false);
719 if (IS_ERR(req)) {
720 ret = PTR_ERR(req);
721 goto out;
722 }
723
724 /*
725 * write from beginning of first page,
726 * regardless of io alignment
727 */
728 num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
729
582 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 730 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
583 if (IS_ERR(pages)) { 731 if (IS_ERR(pages)) {
584 ret = PTR_ERR(pages); 732 ret = PTR_ERR(pages);
585 goto out; 733 goto out;
586 } 734 }
587 ret = ceph_copy_user_to_page_vector(pages, data, pos, len); 735
736 left = len;
737 for (n = 0; n < num_pages; n++) {
738 size_t plen = min_t(size_t, left, PAGE_SIZE);
739 ret = iov_iter_copy_from_user(pages[n], &i, 0, plen);
740 if (ret != plen) {
741 ret = -EFAULT;
742 break;
743 }
744 left -= ret;
745 iov_iter_advance(&i, ret);
746 }
747
588 if (ret < 0) { 748 if (ret < 0) {
589 ceph_release_page_vector(pages, num_pages); 749 ceph_release_page_vector(pages, num_pages);
590 goto out; 750 goto out;
591 } 751 }
592 752
593 if ((file->f_flags & O_SYNC) == 0) { 753 /* get a second commit callback */
594 /* get a second commit callback */ 754 req->r_unsafe_callback = ceph_sync_write_unsafe;
595 req->r_unsafe_callback = ceph_sync_write_unsafe; 755 req->r_inode = inode;
596 req->r_inode = inode;
597 own_pages = true;
598 }
599 }
600 osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
601 false, own_pages);
602 756
603 /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 757 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
604 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); 758 false, true);
605 759
606 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 760 /* BUG_ON(vino.snap != CEPH_NOSNAP); */
607 if (!ret) 761 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
608 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
609 762
610 if (file->f_flags & O_DIRECT) 763 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
611 ceph_put_page_vector(pages, num_pages, false); 764 if (!ret)
612 else if (file->f_flags & O_SYNC) 765 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
613 ceph_release_page_vector(pages, num_pages);
614 766
615out: 767out:
616 ceph_osdc_put_request(req); 768 ceph_osdc_put_request(req);
617 if (ret == 0) { 769 if (ret == 0) {
618 pos += len; 770 pos += len;
619 written += len; 771 written += len;
620 left -= len; 772
621 data += len; 773 if (pos > i_size_read(inode)) {
622 if (left) 774 check_caps = ceph_inode_set_size(inode, pos);
623 goto more; 775 if (check_caps)
776 ceph_check_caps(ceph_inode(inode),
777 CHECK_CAPS_AUTHONLY,
778 NULL);
779 }
780 } else
781 break;
782 }
624 783
784 if (ret != -EOLDSNAPC && written > 0) {
625 ret = written; 785 ret = written;
626 *ppos = pos; 786 iocb->ki_pos = pos;
627 if (pos > i_size_read(inode))
628 check_caps = ceph_inode_set_size(inode, pos);
629 if (check_caps)
630 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
631 NULL);
632 } else if (ret != -EOLDSNAPC && written > 0) {
633 ret = written;
634 } 787 }
635 return ret; 788 return ret;
636} 789}
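
After the split, this buffered-sync path always requests CEPH_OSD_FLAG_ACK and always installs ceph_sync_write_unsafe() as r_unsafe_callback (previously both were conditional on O_SYNC). The callback rides the OSD's two-phase reply; a comment-only model of the contract, inferred from the flag names rather than from code in this diff:

    /*
     * With CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK the OSD can reply twice:
     *
     *   1. "unsafe" ack - write applied in memory; r_unsafe_callback(req, true)
     *   2. commit ack   - write durable on disk;   r_unsafe_callback(req, false)
     *
     * The callback's job is bookkeeping across that window, e.g. holding an
     * inode reference so fsync/sync can wait for outstanding unsafe writes.
     */
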
@@ -647,55 +800,84 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
647{ 800{
648 struct file *filp = iocb->ki_filp; 801 struct file *filp = iocb->ki_filp;
649 struct ceph_file_info *fi = filp->private_data; 802 struct ceph_file_info *fi = filp->private_data;
650 loff_t *ppos = &iocb->ki_pos; 803 size_t len = iocb->ki_nbytes;
651 size_t len = iov->iov_len;
652 struct inode *inode = file_inode(filp); 804 struct inode *inode = file_inode(filp);
653 struct ceph_inode_info *ci = ceph_inode(inode); 805 struct ceph_inode_info *ci = ceph_inode(inode);
654 void __user *base = iov->iov_base;
655 ssize_t ret; 806 ssize_t ret;
656 int want, got = 0; 807 int want, got = 0;
657 int checkeof = 0, read = 0; 808 int checkeof = 0, read = 0;
658 809
659 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
660 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
661again: 810again:
811 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
812 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
813
662 if (fi->fmode & CEPH_FILE_MODE_LAZY) 814 if (fi->fmode & CEPH_FILE_MODE_LAZY)
663 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 815 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
664 else 816 else
665 want = CEPH_CAP_FILE_CACHE; 817 want = CEPH_CAP_FILE_CACHE;
666 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); 818 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
667 if (ret < 0) 819 if (ret < 0)
668 goto out; 820 return ret;
669 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
670 inode, ceph_vinop(inode), pos, (unsigned)len,
671 ceph_cap_string(got));
672 821
673 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || 822 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
674 (iocb->ki_filp->f_flags & O_DIRECT) || 823 (iocb->ki_filp->f_flags & O_DIRECT) ||
675 (fi->flags & CEPH_F_SYNC)) 824 (fi->flags & CEPH_F_SYNC)) {
825 struct iov_iter i;
826
827 dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
828 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
829 ceph_cap_string(got));
830
831 if (!read) {
832 ret = generic_segment_checks(iov, &nr_segs,
833 &len, VERIFY_WRITE);
834 if (ret)
835 goto out;
836 }
837
838 iov_iter_init(&i, iov, nr_segs, len, read);
839
676 /* hmm, this isn't really async... */ 840 /* hmm, this isn't really async... */
677 ret = ceph_sync_read(filp, base, len, ppos, &checkeof); 841 ret = ceph_sync_read(iocb, &i, &checkeof);
678 else 842 } else {
679 ret = generic_file_aio_read(iocb, iov, nr_segs, pos); 843 /*
844 * We can't modify the content of iov,
845 * so we only read from beginning.
846 */
847 if (read) {
848 iocb->ki_pos = pos;
849 len = iocb->ki_nbytes;
850 read = 0;
851 }
852 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
853 inode, ceph_vinop(inode), pos, (unsigned)len,
854 ceph_cap_string(got));
680 855
856 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
857 }
681out: 858out:
682 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 859 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
683 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 860 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
684 ceph_put_cap_refs(ci, got); 861 ceph_put_cap_refs(ci, got);
685 862
686 if (checkeof && ret >= 0) { 863 if (checkeof && ret >= 0) {
687 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 864 int statret = ceph_do_getattr(inode,
865 CEPH_STAT_CAP_SIZE);
688 866
689 /* hit EOF or hole? */ 867 /* hit EOF or hole? */
690 if (statret == 0 && *ppos < inode->i_size) { 868 if (statret == 0 && iocb->ki_pos < inode->i_size &&
691 dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); 869 ret < len) {
870 dout("sync_read hit hole, ppos %lld < size %lld"
871 ", reading more\n", iocb->ki_pos,
872 inode->i_size);
873
692 read += ret; 874 read += ret;
693 base += ret;
694 len -= ret; 875 len -= ret;
695 checkeof = 0; 876 checkeof = 0;
696 goto again; 877 goto again;
697 } 878 }
698 } 879 }
880
699 if (ret >= 0) 881 if (ret >= 0)
700 ret += read; 882 ret += read;
701 883
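
The again: loop at the top of ceph_aio_read() pairs with the checkeof block above: if a sync read comes up short but a getattr shows i_size beyond the current position (a hole, or a racing writer on another client), the bytes delivered so far accumulate in read and the read retries from the new position. The accounting, condensed into a comment:

    /*
     * Retry accounting, condensed:
     *
     *   read = 0;
     * again:
     *   ret = sync or page-cache read of up to len bytes;
     *   if (short read && getattr says i_size > iocb->ki_pos) {
     *       read += ret;  len -= ret;  goto again;
     *   }
     *   if (ret >= 0)
     *       ret += read;   /* report everything delivered so far */
     */
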
@@ -772,11 +954,13 @@ retry_snap:
772 inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); 954 inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
773 955
774 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || 956 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
775 (iocb->ki_filp->f_flags & O_DIRECT) || 957 (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
776 (fi->flags & CEPH_F_SYNC)) {
777 mutex_unlock(&inode->i_mutex); 958 mutex_unlock(&inode->i_mutex);
778 written = ceph_sync_write(file, iov->iov_base, count, 959 if (file->f_flags & O_DIRECT)
779 pos, &iocb->ki_pos); 960 written = ceph_sync_direct_write(iocb, iov,
961 nr_segs, count);
962 else
963 written = ceph_sync_write(iocb, iov, nr_segs, count);
780 if (written == -EOLDSNAPC) { 964 if (written == -EOLDSNAPC) {
781 dout("aio_write %p %llx.%llx %llu~%u" 965 dout("aio_write %p %llx.%llx %llu~%u"
782 "got EOLDSNAPC, retrying\n", 966 "got EOLDSNAPC, retrying\n",
@@ -1018,7 +1202,7 @@ static long ceph_fallocate(struct file *file, int mode,
1018 loff_t offset, loff_t length) 1202 loff_t offset, loff_t length)
1019{ 1203{
1020 struct ceph_file_info *fi = file->private_data; 1204 struct ceph_file_info *fi = file->private_data;
1021 struct inode *inode = file->f_dentry->d_inode; 1205 struct inode *inode = file_inode(file);
1022 struct ceph_inode_info *ci = ceph_inode(inode); 1206 struct ceph_inode_info *ci = ceph_inode(inode);
1023 struct ceph_osd_client *osdc = 1207 struct ceph_osd_client *osdc =
1024 &ceph_inode_to_client(inode)->client->osdc; 1208 &ceph_inode_to_client(inode)->client->osdc;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 278fd2891288..32d519d8a2e2 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -9,6 +9,7 @@
9#include <linux/namei.h> 9#include <linux/namei.h>
10#include <linux/writeback.h> 10#include <linux/writeback.h>
11#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
12#include <linux/posix_acl.h>
12 13
13#include "super.h" 14#include "super.h"
14#include "mds_client.h" 15#include "mds_client.h"
@@ -95,6 +96,8 @@ const struct inode_operations ceph_file_iops = {
95 .getxattr = ceph_getxattr, 96 .getxattr = ceph_getxattr,
96 .listxattr = ceph_listxattr, 97 .listxattr = ceph_listxattr,
97 .removexattr = ceph_removexattr, 98 .removexattr = ceph_removexattr,
99 .get_acl = ceph_get_acl,
100 .set_acl = ceph_set_acl,
98}; 101};
99 102
100 103
@@ -335,12 +338,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
335 ci->i_hold_caps_min = 0; 338 ci->i_hold_caps_min = 0;
336 ci->i_hold_caps_max = 0; 339 ci->i_hold_caps_max = 0;
337 INIT_LIST_HEAD(&ci->i_cap_delay_list); 340 INIT_LIST_HEAD(&ci->i_cap_delay_list);
338 ci->i_cap_exporting_mds = 0;
339 ci->i_cap_exporting_mseq = 0;
340 ci->i_cap_exporting_issued = 0;
341 INIT_LIST_HEAD(&ci->i_cap_snaps); 341 INIT_LIST_HEAD(&ci->i_cap_snaps);
342 ci->i_head_snapc = NULL; 342 ci->i_head_snapc = NULL;
343 ci->i_snap_caps = 0; 343 ci->i_snap_caps = 0;
344 ci->i_cap_exporting_issued = 0;
344 345
345 for (i = 0; i < CEPH_FILE_MODE_NUM; i++) 346 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
346 ci->i_nr_by_mode[i] = 0; 347 ci->i_nr_by_mode[i] = 0;
@@ -436,6 +437,16 @@ void ceph_destroy_inode(struct inode *inode)
436 call_rcu(&inode->i_rcu, ceph_i_callback); 437 call_rcu(&inode->i_rcu, ceph_i_callback);
437} 438}
438 439
440int ceph_drop_inode(struct inode *inode)
441{
442 /*
 443	 * Positive dentries and their corresponding inodes always arrive
 444	 * together in an MDS reply, so there is no need to keep an inode
 445	 * in the cache after dropping all its aliases.
446 */
447 return 1;
448}
449
439/* 450/*
440 * Helpers to fill in size, ctime, mtime, and atime. We have to be 451 * Helpers to fill in size, ctime, mtime, and atime. We have to be
441 * careful because either the client or MDS may have more up to date 452 * careful because either the client or MDS may have more up to date
@@ -670,6 +681,7 @@ static int fill_inode(struct inode *inode,
670 memcpy(ci->i_xattrs.blob->vec.iov_base, 681 memcpy(ci->i_xattrs.blob->vec.iov_base,
671 iinfo->xattr_data, iinfo->xattr_len); 682 iinfo->xattr_data, iinfo->xattr_len);
672 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 683 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
684 ceph_forget_all_cached_acls(inode);
673 xattr_blob = NULL; 685 xattr_blob = NULL;
674 } 686 }
675 687
@@ -1454,7 +1466,8 @@ static void ceph_invalidate_work(struct work_struct *work)
1454 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1466 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1455 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1467 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1456 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 1468 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1457 /* nevermind! */ 1469 if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
1470 check = 1;
1458 spin_unlock(&ci->i_ceph_lock); 1471 spin_unlock(&ci->i_ceph_lock);
1459 mutex_unlock(&ci->i_truncate_mutex); 1472 mutex_unlock(&ci->i_truncate_mutex);
1460 goto out; 1473 goto out;
@@ -1475,13 +1488,14 @@ static void ceph_invalidate_work(struct work_struct *work)
1475 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", 1488 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
1476 inode, orig_gen, ci->i_rdcache_gen, 1489 inode, orig_gen, ci->i_rdcache_gen,
1477 ci->i_rdcache_revoking); 1490 ci->i_rdcache_revoking);
1491 if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
1492 check = 1;
1478 } 1493 }
1479 spin_unlock(&ci->i_ceph_lock); 1494 spin_unlock(&ci->i_ceph_lock);
1480 mutex_unlock(&ci->i_truncate_mutex); 1495 mutex_unlock(&ci->i_truncate_mutex);
1481 1496out:
1482 if (check) 1497 if (check)
1483 ceph_check_caps(ci, 0, NULL); 1498 ceph_check_caps(ci, 0, NULL);
1484out:
1485 iput(inode); 1499 iput(inode);
1486} 1500}
1487 1501
@@ -1602,6 +1616,8 @@ static const struct inode_operations ceph_symlink_iops = {
1602 .getxattr = ceph_getxattr, 1616 .getxattr = ceph_getxattr,
1603 .listxattr = ceph_listxattr, 1617 .listxattr = ceph_listxattr,
1604 .removexattr = ceph_removexattr, 1618 .removexattr = ceph_removexattr,
1619 .get_acl = ceph_get_acl,
1620 .set_acl = ceph_set_acl,
1605}; 1621};
1606 1622
1607/* 1623/*
@@ -1675,6 +1691,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1675 dirtied |= CEPH_CAP_AUTH_EXCL; 1691 dirtied |= CEPH_CAP_AUTH_EXCL;
1676 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || 1692 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1677 attr->ia_mode != inode->i_mode) { 1693 attr->ia_mode != inode->i_mode) {
1694 inode->i_mode = attr->ia_mode;
1678 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); 1695 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1679 mask |= CEPH_SETATTR_MODE; 1696 mask |= CEPH_SETATTR_MODE;
1680 release |= CEPH_CAP_AUTH_SHARED; 1697 release |= CEPH_CAP_AUTH_SHARED;
@@ -1790,6 +1807,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1790 if (inode_dirty_flags) 1807 if (inode_dirty_flags)
1791 __mark_inode_dirty(inode, inode_dirty_flags); 1808 __mark_inode_dirty(inode, inode_dirty_flags);
1792 1809
1810 if (ia_valid & ATTR_MODE) {
1811 err = posix_acl_chmod(inode, attr->ia_mode);
1812 if (err)
1813 goto out_put;
1814 }
1815
1793 if (mask) { 1816 if (mask) {
1794 req->r_inode = inode; 1817 req->r_inode = inode;
1795 ihold(inode); 1818 ihold(inode);
@@ -1809,6 +1832,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1809 return err; 1832 return err;
1810out: 1833out:
1811 spin_unlock(&ci->i_ceph_lock); 1834 spin_unlock(&ci->i_ceph_lock);
1835out_put:
1812 ceph_mdsc_put_request(req); 1836 ceph_mdsc_put_request(req);
1813 return err; 1837 return err;
1814} 1838}
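
Two details in the ceph_setattr() hunks work together: inode->i_mode is now updated before the MDS request is built, and a new tail step invokes posix_acl_chmod() (the generic helper of this kernel cycle) when ATTR_MODE is set, so the cached access ACL is recomputed against the mode the inode already carries. As an ordering note:

    /*
     * Ordering inferred from the two setattr hunks above:
     *   1. inode->i_mode = attr->ia_mode          - mode updated eagerly
     *   2. posix_acl_chmod(inode, attr->ia_mode)  - rewrites the cached
     *      access ACL so its mask agrees with the new group bits
     *   3. the MDS request then propagates both to the server
     */
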
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 669622fd1ae3..dc66c9e023e4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -183,6 +183,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
183 struct ceph_inode_info *ci = ceph_inode(inode); 183 struct ceph_inode_info *ci = ceph_inode(inode);
184 struct ceph_osd_client *osdc = 184 struct ceph_osd_client *osdc =
185 &ceph_sb_to_client(inode->i_sb)->client->osdc; 185 &ceph_sb_to_client(inode->i_sb)->client->osdc;
186 struct ceph_object_locator oloc;
187 struct ceph_object_id oid;
186 u64 len = 1, olen; 188 u64 len = 1, olen;
187 u64 tmp; 189 u64 tmp;
188 struct ceph_pg pgid; 190 struct ceph_pg pgid;
@@ -211,8 +213,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
211 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", 213 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
212 ceph_ino(inode), dl.object_no); 214 ceph_ino(inode), dl.object_no);
213 215
214 r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, 216 oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
215 ceph_file_layout_pg_pool(ci->i_layout)); 217 ceph_oid_set_name(&oid, dl.object_name);
218
219 r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
216 if (r < 0) { 220 if (r < 0) {
217 up_read(&osdc->map_sem); 221 up_read(&osdc->map_sem);
218 return r; 222 return r;
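
The GET_DATALOC ioctl switches from ceph_calc_ceph_pg() to the newer object-locator API, which separates what the object is (ceph_object_id) from where it lives (ceph_object_locator, i.e. the pool). The three-step shape in isolation, using the names from this hunk:

    struct ceph_object_locator oloc;
    struct ceph_object_id oid;
    struct ceph_pg pgid;
    int r;

    oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);        /* where: pool   */
    ceph_oid_set_name(&oid, dl.object_name);                   /* what: object  */
    r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); /* -> placement group */
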
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d90861f45210..f4f050a69a48 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -63,7 +63,7 @@ static const struct ceph_connection_operations mds_con_ops;
63 */ 63 */
64static int parse_reply_info_in(void **p, void *end, 64static int parse_reply_info_in(void **p, void *end,
65 struct ceph_mds_reply_info_in *info, 65 struct ceph_mds_reply_info_in *info,
66 int features) 66 u64 features)
67{ 67{
68 int err = -EIO; 68 int err = -EIO;
69 69
@@ -98,7 +98,7 @@ bad:
98 */ 98 */
99static int parse_reply_info_trace(void **p, void *end, 99static int parse_reply_info_trace(void **p, void *end,
100 struct ceph_mds_reply_info_parsed *info, 100 struct ceph_mds_reply_info_parsed *info,
101 int features) 101 u64 features)
102{ 102{
103 int err; 103 int err;
104 104
@@ -145,7 +145,7 @@ out_bad:
145 */ 145 */
146static int parse_reply_info_dir(void **p, void *end, 146static int parse_reply_info_dir(void **p, void *end,
147 struct ceph_mds_reply_info_parsed *info, 147 struct ceph_mds_reply_info_parsed *info,
148 int features) 148 u64 features)
149{ 149{
150 u32 num, i = 0; 150 u32 num, i = 0;
151 int err; 151 int err;
@@ -217,7 +217,7 @@ out_bad:
217 */ 217 */
218static int parse_reply_info_filelock(void **p, void *end, 218static int parse_reply_info_filelock(void **p, void *end,
219 struct ceph_mds_reply_info_parsed *info, 219 struct ceph_mds_reply_info_parsed *info,
220 int features) 220 u64 features)
221{ 221{
222 if (*p + sizeof(*info->filelock_reply) > end) 222 if (*p + sizeof(*info->filelock_reply) > end)
223 goto bad; 223 goto bad;
@@ -238,7 +238,7 @@ bad:
238 */ 238 */
239static int parse_reply_info_create(void **p, void *end, 239static int parse_reply_info_create(void **p, void *end,
240 struct ceph_mds_reply_info_parsed *info, 240 struct ceph_mds_reply_info_parsed *info,
241 int features) 241 u64 features)
242{ 242{
243 if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { 243 if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
244 if (*p == end) { 244 if (*p == end) {
@@ -262,7 +262,7 @@ bad:
262 */ 262 */
263static int parse_reply_info_extra(void **p, void *end, 263static int parse_reply_info_extra(void **p, void *end,
264 struct ceph_mds_reply_info_parsed *info, 264 struct ceph_mds_reply_info_parsed *info,
265 int features) 265 u64 features)
266{ 266{
267 if (info->head->op == CEPH_MDS_OP_GETFILELOCK) 267 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
268 return parse_reply_info_filelock(p, end, info, features); 268 return parse_reply_info_filelock(p, end, info, features);
@@ -280,7 +280,7 @@ static int parse_reply_info_extra(void **p, void *end,
280 */ 280 */
281static int parse_reply_info(struct ceph_msg *msg, 281static int parse_reply_info(struct ceph_msg *msg,
282 struct ceph_mds_reply_info_parsed *info, 282 struct ceph_mds_reply_info_parsed *info,
283 int features) 283 u64 features)
284{ 284{
285 void *p, *end; 285 void *p, *end;
286 u32 len; 286 u32 len;
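
Every parse_reply_info*() helper widens its features argument from int to u64 (as does create_fs_client() in the super.c hunk below): feature flags are a bit mask that has outgrown 32 bits, and an int silently drops the high bits. A minimal demonstration with a hypothetical high feature bit:

    #include <stdint.h>
    #include <stdio.h>

    #define FEATURE_HIGH_BIT (1ULL << 40)   /* hypothetical feature flag */

    int main(void)
    {
        uint64_t features = FEATURE_HIGH_BIT;
        int truncated = (int)features;      /* high 32 bits are lost: 0 */

        printf("u64 test: %d\n", (features & FEATURE_HIGH_BIT) != 0);        /* 1 */
        printf("int test: %d\n", (truncated & (int)FEATURE_HIGH_BIT) != 0);  /* 0 */
        return 0;
    }
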
@@ -713,14 +713,15 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
713 struct dentry *dn = get_nonsnap_parent(parent); 713 struct dentry *dn = get_nonsnap_parent(parent);
714 inode = dn->d_inode; 714 inode = dn->d_inode;
715 dout("__choose_mds using nonsnap parent %p\n", inode); 715 dout("__choose_mds using nonsnap parent %p\n", inode);
716 } else if (req->r_dentry->d_inode) { 716 } else {
717 /* dentry target */ 717 /* dentry target */
718 inode = req->r_dentry->d_inode; 718 inode = req->r_dentry->d_inode;
719 } else { 719 if (!inode || mode == USE_AUTH_MDS) {
720 /* dir + name */ 720 /* dir + name */
721 inode = dir; 721 inode = dir;
722 hash = ceph_dentry_hash(dir, req->r_dentry); 722 hash = ceph_dentry_hash(dir, req->r_dentry);
723 is_hash = true; 723 is_hash = true;
724 }
724 } 725 }
725 } 726 }
726 727
@@ -846,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc,
846 * 847 *
847 * called under mdsc->mutex 848 * called under mdsc->mutex
848 */ 849 */
850static struct ceph_mds_session *
851__open_export_target_session(struct ceph_mds_client *mdsc, int target)
852{
853 struct ceph_mds_session *session;
854
855 session = __ceph_lookup_mds_session(mdsc, target);
856 if (!session) {
857 session = register_session(mdsc, target);
858 if (IS_ERR(session))
859 return session;
860 }
861 if (session->s_state == CEPH_MDS_SESSION_NEW ||
862 session->s_state == CEPH_MDS_SESSION_CLOSING)
863 __open_session(mdsc, session);
864
865 return session;
866}
867
868struct ceph_mds_session *
869ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
870{
871 struct ceph_mds_session *session;
872
873 dout("open_export_target_session to mds%d\n", target);
874
875 mutex_lock(&mdsc->mutex);
876 session = __open_export_target_session(mdsc, target);
877 mutex_unlock(&mdsc->mutex);
878
879 return session;
880}
881
849static void __open_export_target_sessions(struct ceph_mds_client *mdsc, 882static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
850 struct ceph_mds_session *session) 883 struct ceph_mds_session *session)
851{ 884{
852 struct ceph_mds_info *mi; 885 struct ceph_mds_info *mi;
853 struct ceph_mds_session *ts; 886 struct ceph_mds_session *ts;
854 int i, mds = session->s_mds; 887 int i, mds = session->s_mds;
855 int target;
856 888
857 if (mds >= mdsc->mdsmap->m_max_mds) 889 if (mds >= mdsc->mdsmap->m_max_mds)
858 return; 890 return;
891
859 mi = &mdsc->mdsmap->m_info[mds]; 892 mi = &mdsc->mdsmap->m_info[mds];
860 dout("open_export_target_sessions for mds%d (%d targets)\n", 893 dout("open_export_target_sessions for mds%d (%d targets)\n",
861 session->s_mds, mi->num_export_targets); 894 session->s_mds, mi->num_export_targets);
862 895
863 for (i = 0; i < mi->num_export_targets; i++) { 896 for (i = 0; i < mi->num_export_targets; i++) {
864 target = mi->export_targets[i]; 897 ts = __open_export_target_session(mdsc, mi->export_targets[i]);
865 ts = __ceph_lookup_mds_session(mdsc, target); 898 if (!IS_ERR(ts))
866 if (!ts) { 899 ceph_put_mds_session(ts);
867 ts = register_session(mdsc, target);
868 if (IS_ERR(ts))
869 return;
870 }
871 if (session->s_state == CEPH_MDS_SESSION_NEW ||
872 session->s_state == CEPH_MDS_SESSION_CLOSING)
873 __open_session(mdsc, session);
874 else
875 dout(" mds%d target mds%d %p is %s\n", session->s_mds,
876 i, ts, session_state_name(ts->s_state));
877 ceph_put_mds_session(ts);
878 } 900 }
879} 901}
880 902
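
__open_export_target_sessions() is refactored around the new __open_export_target_session() helper, which also gains an exported wrapper for the cap-migration code earlier in this diff. The refactor removes a latent bug visible in the left column:

    /*
     * Old loop: looked up ts, registered it if missing, then tested
     * session->s_state and called __open_session(mdsc, session) - i.e. it
     * opened the *source* session, not the target.  New loop: one helper
     * does lookup/register/open on the target and returns a reference:
     *
     *     ts = __open_export_target_session(mdsc, mi->export_targets[i]);
     *     if (!IS_ERR(ts))
     *             ceph_put_mds_session(ts);
     */
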
@@ -1136,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
1136 return 0; 1158 return 0;
1137} 1159}
1138 1160
1161static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
1162 struct ceph_mds_session *session, u64 seq)
1163{
1164 struct ceph_msg *msg;
1165
1166 dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
1167 session->s_mds, session_state_name(session->s_state), seq);
1168 msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
1169 if (!msg)
1170 return -ENOMEM;
1171 ceph_con_send(&session->s_con, msg);
1172 return 0;
1173}
1174
1175
1139/* 1176/*
1140 * Note new cap ttl, and any transition from stale -> not stale (fresh?). 1177 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
1141 * 1178 *
@@ -1214,7 +1251,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1214{ 1251{
1215 struct ceph_mds_session *session = arg; 1252 struct ceph_mds_session *session = arg;
1216 struct ceph_inode_info *ci = ceph_inode(inode); 1253 struct ceph_inode_info *ci = ceph_inode(inode);
1217 int used, oissued, mine; 1254 int used, wanted, oissued, mine;
1218 1255
1219 if (session->s_trim_caps <= 0) 1256 if (session->s_trim_caps <= 0)
1220 return -1; 1257 return -1;
@@ -1222,14 +1259,19 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1222 spin_lock(&ci->i_ceph_lock); 1259 spin_lock(&ci->i_ceph_lock);
1223 mine = cap->issued | cap->implemented; 1260 mine = cap->issued | cap->implemented;
1224 used = __ceph_caps_used(ci); 1261 used = __ceph_caps_used(ci);
1262 wanted = __ceph_caps_file_wanted(ci);
1225 oissued = __ceph_caps_issued_other(ci, cap); 1263 oissued = __ceph_caps_issued_other(ci, cap);
1226 1264
1227 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", 1265 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
1228 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), 1266 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
1229 ceph_cap_string(used)); 1267 ceph_cap_string(used), ceph_cap_string(wanted));
1230 if (ci->i_dirty_caps) 1268 if (cap == ci->i_auth_cap) {
1231 goto out; /* dirty caps */ 1269 if (ci->i_dirty_caps | ci->i_flushing_caps)
1232 if ((used & ~oissued) & mine) 1270 goto out;
1271 if ((used | wanted) & CEPH_CAP_ANY_WR)
1272 goto out;
1273 }
1274 if ((used | wanted) & ~oissued & mine)
1233 goto out; /* we need these caps */ 1275 goto out; /* we need these caps */
1234 1276
1235 session->s_trim_caps--; 1277 session->s_trim_caps--;
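
trim_caps_cb() previously refused to trim only when dirty caps existed or when this cap covered something in active use; it now also folds in the wanted bits and special-cases the auth cap, which must survive while dirty or flushing state exists or while write caps are used or wanted. The rule, condensed into one predicate (a model, not the kernel code; arguments mirror the locals above):

    static bool cap_is_trimmable(struct ceph_inode_info *ci,
                                 struct ceph_cap *cap,
                                 int used, int wanted, int oissued, int mine)
    {
        if (cap == ci->i_auth_cap) {
            if (ci->i_dirty_caps | ci->i_flushing_caps)
                return false;   /* auth cap tracks dirty/flushing state */
            if ((used | wanted) & CEPH_CAP_ANY_WR)
                return false;   /* writes must go through the auth cap */
        }
        /* trim only if nothing used or wanted is issued solely by us */
        return !((used | wanted) & ~oissued & mine);
    }
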
@@ -2156,26 +2198,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2156 */ 2198 */
2157 if (result == -ESTALE) { 2199 if (result == -ESTALE) {
2158 dout("got ESTALE on request %llu", req->r_tid); 2200 dout("got ESTALE on request %llu", req->r_tid);
2159 if (!req->r_inode) { 2201 if (req->r_direct_mode != USE_AUTH_MDS) {
2160 /* do nothing; not an authority problem */
2161 } else if (req->r_direct_mode != USE_AUTH_MDS) {
2162 dout("not using auth, setting for that now"); 2202 dout("not using auth, setting for that now");
2163 req->r_direct_mode = USE_AUTH_MDS; 2203 req->r_direct_mode = USE_AUTH_MDS;
2164 __do_request(mdsc, req); 2204 __do_request(mdsc, req);
2165 mutex_unlock(&mdsc->mutex); 2205 mutex_unlock(&mdsc->mutex);
2166 goto out; 2206 goto out;
2167 } else { 2207 } else {
2168 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 2208 int mds = __choose_mds(mdsc, req);
2169 struct ceph_cap *cap = NULL; 2209 if (mds >= 0 && mds != req->r_session->s_mds) {
2170 2210 dout("but auth changed, so resending");
2171 if (req->r_session)
2172 cap = ceph_get_cap_for_mds(ci,
2173 req->r_session->s_mds);
2174
2175 dout("already using auth");
2176 if ((!cap || cap != ci->i_auth_cap) ||
2177 (cap->mseq != req->r_sent_on_mseq)) {
2178 dout("but cap changed, so resending");
2179 __do_request(mdsc, req); 2211 __do_request(mdsc, req);
2180 mutex_unlock(&mdsc->mutex); 2212 mutex_unlock(&mdsc->mutex);
2181 goto out; 2213 goto out;
@@ -2400,6 +2432,10 @@ static void handle_session(struct ceph_mds_session *session,
2400 trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); 2432 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2401 break; 2433 break;
2402 2434
2435 case CEPH_SESSION_FLUSHMSG:
2436 send_flushmsg_ack(mdsc, session, seq);
2437 break;
2438
2403 default: 2439 default:
2404 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); 2440 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2405 WARN_ON(1); 2441 WARN_ON(1);
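
CEPH_SESSION_FLUSHMSG / FLUSHMSG_ACK (named in the strings.c hunk below) form a simple barrier handshake: the MDS asks the client to confirm it has processed all session messages up to a sequence number, and send_flushmsg_ack() echoes that seq back. As a message-flow comment:

    /*
     *   MDS    --- CEPH_SESSION_FLUSHMSG (seq = N) --->  client
     *   client --- CEPH_SESSION_FLUSHMSG_ACK (seq = N) -> MDS
     *
     * The ack is built with create_session_msg(), so it is an ordinary
     * session message carrying just the op and the echoed seq.
     */
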
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4c053d099ae4..68288917c737 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -383,6 +383,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
383extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, 383extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
384 struct ceph_msg *msg); 384 struct ceph_msg *msg);
385 385
386extern struct ceph_mds_session *
387ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
386extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, 388extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
387 struct ceph_mds_session *session); 389 struct ceph_mds_session *session);
388 390
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 89fa4a940a0f..4440f447fd3f 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -41,6 +41,8 @@ const char *ceph_session_op_name(int op)
41 case CEPH_SESSION_RENEWCAPS: return "renewcaps"; 41 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
42 case CEPH_SESSION_STALE: return "stale"; 42 case CEPH_SESSION_STALE: return "stale";
43 case CEPH_SESSION_RECALL_STATE: return "recall_state"; 43 case CEPH_SESSION_RECALL_STATE: return "recall_state";
44 case CEPH_SESSION_FLUSHMSG: return "flushmsg";
45 case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
44 } 46 }
45 return "???"; 47 return "???";
46} 48}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6a0951e43044..10a4ccbf38da 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -144,7 +144,11 @@ enum {
144 Opt_ino32, 144 Opt_ino32,
145 Opt_noino32, 145 Opt_noino32,
146 Opt_fscache, 146 Opt_fscache,
147 Opt_nofscache 147 Opt_nofscache,
148#ifdef CONFIG_CEPH_FS_POSIX_ACL
149 Opt_acl,
150#endif
151 Opt_noacl
148}; 152};
149 153
150static match_table_t fsopt_tokens = { 154static match_table_t fsopt_tokens = {
@@ -172,6 +176,10 @@ static match_table_t fsopt_tokens = {
172 {Opt_noino32, "noino32"}, 176 {Opt_noino32, "noino32"},
173 {Opt_fscache, "fsc"}, 177 {Opt_fscache, "fsc"},
174 {Opt_nofscache, "nofsc"}, 178 {Opt_nofscache, "nofsc"},
179#ifdef CONFIG_CEPH_FS_POSIX_ACL
180 {Opt_acl, "acl"},
181#endif
182 {Opt_noacl, "noacl"},
175 {-1, NULL} 183 {-1, NULL}
176}; 184};
177 185
@@ -271,6 +279,14 @@ static int parse_fsopt_token(char *c, void *private)
271 case Opt_nofscache: 279 case Opt_nofscache:
272 fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; 280 fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
273 break; 281 break;
282#ifdef CONFIG_CEPH_FS_POSIX_ACL
283 case Opt_acl:
284 fsopt->sb_flags |= MS_POSIXACL;
285 break;
286#endif
287 case Opt_noacl:
288 fsopt->sb_flags &= ~MS_POSIXACL;
289 break;
274 default: 290 default:
275 BUG_ON(token); 291 BUG_ON(token);
276 } 292 }
@@ -438,6 +454,13 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
438 else 454 else
439 seq_puts(m, ",nofsc"); 455 seq_puts(m, ",nofsc");
440 456
457#ifdef CONFIG_CEPH_FS_POSIX_ACL
458 if (fsopt->sb_flags & MS_POSIXACL)
459 seq_puts(m, ",acl");
460 else
461 seq_puts(m, ",noacl");
462#endif
463
441 if (fsopt->wsize) 464 if (fsopt->wsize)
442 seq_printf(m, ",wsize=%d", fsopt->wsize); 465 seq_printf(m, ",wsize=%d", fsopt->wsize);
443 if (fsopt->rsize != CEPH_RSIZE_DEFAULT) 466 if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
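
Taken with the two hunks below, this option plumbing gives CephFS the usual acl/noacl mount options: with CONFIG_CEPH_FS_POSIX_ACL set, ceph_mount() turns MS_POSIXACL on by default and "-o noacl" clears it (e.g. "mount -t ceph mon:6789:/ /mnt -o noacl", with a hypothetical monitor address). Note that Opt_noacl sits outside the #ifdef, so "noacl" parses even on kernels built without ACL support, while "acl" is compiled out of the token table entirely rather than being silently ignored.
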
@@ -490,10 +513,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
490 struct ceph_options *opt) 513 struct ceph_options *opt)
491{ 514{
492 struct ceph_fs_client *fsc; 515 struct ceph_fs_client *fsc;
493 const unsigned supported_features = 516 const u64 supported_features =
494 CEPH_FEATURE_FLOCK | 517 CEPH_FEATURE_FLOCK |
495 CEPH_FEATURE_DIRLAYOUTHASH; 518 CEPH_FEATURE_DIRLAYOUTHASH;
496 const unsigned required_features = 0; 519 const u64 required_features = 0;
497 int page_count; 520 int page_count;
498 size_t size; 521 size_t size;
499 int err = -ENOMEM; 522 int err = -ENOMEM;
@@ -686,6 +709,7 @@ static const struct super_operations ceph_super_ops = {
686 .alloc_inode = ceph_alloc_inode, 709 .alloc_inode = ceph_alloc_inode,
687 .destroy_inode = ceph_destroy_inode, 710 .destroy_inode = ceph_destroy_inode,
688 .write_inode = ceph_write_inode, 711 .write_inode = ceph_write_inode,
712 .drop_inode = ceph_drop_inode,
689 .sync_fs = ceph_sync_fs, 713 .sync_fs = ceph_sync_fs,
690 .put_super = ceph_put_super, 714 .put_super = ceph_put_super,
691 .show_options = ceph_show_options, 715 .show_options = ceph_show_options,
@@ -819,6 +843,7 @@ static int ceph_set_super(struct super_block *s, void *data)
819 s->s_flags = fsc->mount_options->sb_flags; 843 s->s_flags = fsc->mount_options->sb_flags;
820 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ 844 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
821 845
846 s->s_xattr = ceph_xattr_handlers;
822 s->s_fs_info = fsc; 847 s->s_fs_info = fsc;
823 fsc->sb = s; 848 fsc->sb = s;
824 849
@@ -906,6 +931,10 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
906 struct ceph_options *opt = NULL; 931 struct ceph_options *opt = NULL;
907 932
908 dout("ceph_mount\n"); 933 dout("ceph_mount\n");
934
935#ifdef CONFIG_CEPH_FS_POSIX_ACL
936 flags |= MS_POSIXACL;
937#endif
909 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); 938 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
910 if (err < 0) { 939 if (err < 0) {
911 res = ERR_PTR(err); 940 res = ERR_PTR(err);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ef4ac38bb614..d8801a95b685 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -13,6 +13,7 @@
13#include <linux/wait.h> 13#include <linux/wait.h>
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/posix_acl.h>
16 17
17#include <linux/ceph/libceph.h> 18#include <linux/ceph/libceph.h>
18 19
@@ -287,14 +288,12 @@ struct ceph_inode_info {
287 unsigned long i_hold_caps_min; /* jiffies */ 288 unsigned long i_hold_caps_min; /* jiffies */
288 unsigned long i_hold_caps_max; /* jiffies */ 289 unsigned long i_hold_caps_max; /* jiffies */
289 struct list_head i_cap_delay_list; /* for delayed cap release to mds */ 290 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
290 int i_cap_exporting_mds; /* to handle cap migration between */
291 unsigned i_cap_exporting_mseq; /* mds's. */
292 unsigned i_cap_exporting_issued;
293 struct ceph_cap_reservation i_cap_migration_resv; 291 struct ceph_cap_reservation i_cap_migration_resv;
294 struct list_head i_cap_snaps; /* snapped state pending flush to mds */ 292 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
295 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or 293 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
296 dirty|flushing caps */ 294 dirty|flushing caps */
297 unsigned i_snap_caps; /* cap bits for snapped files */ 295 unsigned i_snap_caps; /* cap bits for snapped files */
296 unsigned i_cap_exporting_issued;
298 297
299 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 298 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
300 299
@@ -335,7 +334,6 @@ struct ceph_inode_info {
335 u32 i_fscache_gen; /* sequence, for delayed fscache validate */ 334 u32 i_fscache_gen; /* sequence, for delayed fscache validate */
336 struct work_struct i_revalidate_work; 335 struct work_struct i_revalidate_work;
337#endif 336#endif
338
339 struct inode vfs_inode; /* at end */ 337 struct inode vfs_inode; /* at end */
340}; 338};
341 339
@@ -529,6 +527,8 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
529} 527}
530extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); 528extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
531 529
530extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
531 struct ceph_cap *ocap, int mask);
532extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); 532extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
533extern int __ceph_caps_used(struct ceph_inode_info *ci); 533extern int __ceph_caps_used(struct ceph_inode_info *ci);
534 534
@@ -691,6 +691,7 @@ extern const struct inode_operations ceph_file_iops;
691 691
692extern struct inode *ceph_alloc_inode(struct super_block *sb); 692extern struct inode *ceph_alloc_inode(struct super_block *sb);
693extern void ceph_destroy_inode(struct inode *inode); 693extern void ceph_destroy_inode(struct inode *inode);
694extern int ceph_drop_inode(struct inode *inode);
694 695
695extern struct inode *ceph_get_inode(struct super_block *sb, 696extern struct inode *ceph_get_inode(struct super_block *sb,
696 struct ceph_vino vino); 697 struct ceph_vino vino);
@@ -724,6 +725,9 @@ extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
724/* xattr.c */ 725/* xattr.c */
725extern int ceph_setxattr(struct dentry *, const char *, const void *, 726extern int ceph_setxattr(struct dentry *, const char *, const void *,
726 size_t, int); 727 size_t, int);
728int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int);
729ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
730int __ceph_removexattr(struct dentry *, const char *);
727extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); 731extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
728extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); 732extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
729extern int ceph_removexattr(struct dentry *, const char *); 733extern int ceph_removexattr(struct dentry *, const char *);
@@ -732,6 +736,42 @@ extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
732extern void __init ceph_xattr_init(void); 736extern void __init ceph_xattr_init(void);
733extern void ceph_xattr_exit(void); 737extern void ceph_xattr_exit(void);
734 738
739/* acl.c */
740extern const struct xattr_handler *ceph_xattr_handlers[];
741
742#ifdef CONFIG_CEPH_FS_POSIX_ACL
743
744struct posix_acl *ceph_get_acl(struct inode *, int);
745int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
746int ceph_init_acl(struct dentry *, struct inode *, struct inode *);
747
748static inline void ceph_forget_all_cached_acls(struct inode *inode)
749{
750 forget_all_cached_acls(inode);
751}
752
753#else
754
755#define ceph_get_acl NULL
756#define ceph_set_acl NULL
757
758static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode,
759 struct inode *dir)
760{
761 return 0;
762}
763
764static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
765{
766 return 0;
767}
768
769static inline void ceph_forget_all_cached_acls(struct inode *inode)
770{
771}
772
773#endif
774
735/* caps.c */ 775/* caps.c */
736extern const char *ceph_cap_string(int c); 776extern const char *ceph_cap_string(int c);
737extern void ceph_handle_caps(struct ceph_mds_session *session, 777extern void ceph_handle_caps(struct ceph_mds_session *session,
@@ -744,6 +784,7 @@ extern int ceph_add_cap(struct inode *inode,
744extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); 784extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
745extern void ceph_put_cap(struct ceph_mds_client *mdsc, 785extern void ceph_put_cap(struct ceph_mds_client *mdsc,
746 struct ceph_cap *cap); 786 struct ceph_cap *cap);
787extern int ceph_is_any_caps(struct inode *inode);
747 788
748extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, 789extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
749 u64 cap_id, u32 migrate_seq, u32 issue_seq); 790 u64 cap_id, u32 migrate_seq, u32 issue_seq);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index be661d8f532a..a55ec37378c6 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -6,16 +6,33 @@
6#include <linux/ceph/decode.h> 6#include <linux/ceph/decode.h>
7 7
8#include <linux/xattr.h> 8#include <linux/xattr.h>
9#include <linux/posix_acl_xattr.h>
9#include <linux/slab.h> 10#include <linux/slab.h>
10 11
11#define XATTR_CEPH_PREFIX "ceph." 12#define XATTR_CEPH_PREFIX "ceph."
12#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) 13#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
13 14
15static int __remove_xattr(struct ceph_inode_info *ci,
16 struct ceph_inode_xattr *xattr);
17
18/*
19 * List of handlers for synthetic system.* attributes. Other
20 * attributes are handled directly.
21 */
22const struct xattr_handler *ceph_xattr_handlers[] = {
23#ifdef CONFIG_CEPH_FS_POSIX_ACL
24 &posix_acl_access_xattr_handler,
25 &posix_acl_default_xattr_handler,
26#endif
27 NULL,
28};
29
14static bool ceph_is_valid_xattr(const char *name) 30static bool ceph_is_valid_xattr(const char *name)
15{ 31{
16 return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || 32 return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
17 !strncmp(name, XATTR_SECURITY_PREFIX, 33 !strncmp(name, XATTR_SECURITY_PREFIX,
18 XATTR_SECURITY_PREFIX_LEN) || 34 XATTR_SECURITY_PREFIX_LEN) ||
35 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
19 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 36 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
20 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 37 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
21} 38}
@@ -305,8 +322,7 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
305static int __set_xattr(struct ceph_inode_info *ci, 322static int __set_xattr(struct ceph_inode_info *ci,
306 const char *name, int name_len, 323 const char *name, int name_len,
307 const char *val, int val_len, 324 const char *val, int val_len,
308 int dirty, 325 int flags, int update_xattr,
309 int should_free_name, int should_free_val,
310 struct ceph_inode_xattr **newxattr) 326 struct ceph_inode_xattr **newxattr)
311{ 327{
312 struct rb_node **p; 328 struct rb_node **p;
@@ -335,12 +351,31 @@ static int __set_xattr(struct ceph_inode_info *ci,
335 xattr = NULL; 351 xattr = NULL;
336 } 352 }
337 353
354 if (update_xattr) {
355 int err = 0;
356 if (xattr && (flags & XATTR_CREATE))
357 err = -EEXIST;
358 else if (!xattr && (flags & XATTR_REPLACE))
359 err = -ENODATA;
360 if (err) {
361 kfree(name);
362 kfree(val);
363 return err;
364 }
365 if (update_xattr < 0) {
366 if (xattr)
367 __remove_xattr(ci, xattr);
368 kfree(name);
369 return 0;
370 }
371 }
372
338 if (!xattr) { 373 if (!xattr) {
339 new = 1; 374 new = 1;
340 xattr = *newxattr; 375 xattr = *newxattr;
341 xattr->name = name; 376 xattr->name = name;
342 xattr->name_len = name_len; 377 xattr->name_len = name_len;
343 xattr->should_free_name = should_free_name; 378 xattr->should_free_name = update_xattr;
344 379
345 ci->i_xattrs.count++; 380 ci->i_xattrs.count++;
346 dout("__set_xattr count=%d\n", ci->i_xattrs.count); 381 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
@@ -350,7 +385,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
 		if (xattr->should_free_val)
 			kfree((void *)xattr->val);
 
-		if (should_free_name) {
+		if (update_xattr) {
 			kfree((void *)name);
 			name = xattr->name;
 		}
@@ -365,8 +400,8 @@ static int __set_xattr(struct ceph_inode_info *ci,
 		xattr->val = "";
 
 	xattr->val_len = val_len;
-	xattr->dirty = dirty;
-	xattr->should_free_val = (val && should_free_val);
+	xattr->dirty = update_xattr;
+	xattr->should_free_val = (val && update_xattr);
 
 	if (new) {
 		rb_link_node(&xattr->node, parent, p);
@@ -428,7 +463,7 @@ static int __remove_xattr(struct ceph_inode_info *ci,
 			  struct ceph_inode_xattr *xattr)
 {
 	if (!xattr)
-		return -EOPNOTSUPP;
+		return -ENODATA;
 
 	rb_erase(&xattr->node, &ci->i_xattrs.index);
 
@@ -574,7 +609,7 @@ start:
 		p += len;
 
 		err = __set_xattr(ci, name, namelen, val, len,
-				  0, 0, 0, &xattrs[numattr]);
+				  0, 0, &xattrs[numattr]);
 
 		if (err < 0)
 			goto bad;
@@ -663,10 +698,9 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
 	}
 }
 
-ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
 		      size_t size)
 {
-	struct inode *inode = dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int err;
 	struct ceph_inode_xattr *xattr;
@@ -675,7 +709,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
 	if (!ceph_is_valid_xattr(name))
 		return -ENODATA;
 
-
 	/* let's see if a virtual xattr was requested */
 	vxattr = ceph_match_vxattr(inode, name);
 	if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
@@ -725,6 +758,15 @@ out:
 	return err;
 }
 
+ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+		      size_t size)
+{
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_getxattr(dentry, name, value, size);
+
+	return __ceph_getxattr(dentry->d_inode, name, value, size);
+}
+
 ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
 {
 	struct inode *inode = dentry->d_inode;
@@ -829,6 +871,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
 
 	dout("setxattr value=%.*s\n", (int)size, value);
 
+	if (!value)
+		flags |= CEPH_XATTR_REMOVE;
+
 	/* do request */
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
 				       USE_AUTH_MDS);
@@ -863,15 +908,15 @@ out:
 	return err;
 }
 
-int ceph_setxattr(struct dentry *dentry, const char *name,
+int __ceph_setxattr(struct dentry *dentry, const char *name,
 		  const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int issued;
 	int err;
-	int dirty;
+	int dirty = 0;
 	int name_len = strlen(name);
 	int val_len = size;
 	char *newname = NULL;
@@ -879,9 +924,6 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
 	struct ceph_inode_xattr *xattr = NULL;
 	int required_blob_size;
 
-	if (ceph_snap(inode) != CEPH_NOSNAP)
-		return -EROFS;
-
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;
 
@@ -935,12 +977,14 @@ retry:
 		goto retry;
 	}
 
-	err = __set_xattr(ci, newname, name_len, newval,
-			  val_len, 1, 1, 1, &xattr);
+	err = __set_xattr(ci, newname, name_len, newval, val_len,
+			  flags, value ? 1 : -1, &xattr);
 
-	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
-	ci->i_xattrs.dirty = true;
-	inode->i_ctime = CURRENT_TIME;
+	if (!err) {
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+		ci->i_xattrs.dirty = true;
+		inode->i_ctime = CURRENT_TIME;
+	}
 
 	spin_unlock(&ci->i_ceph_lock);
 	if (dirty)
@@ -958,6 +1002,18 @@ out:
 	return err;
 }
 
+int ceph_setxattr(struct dentry *dentry, const char *name,
+		  const void *value, size_t size, int flags)
+{
+	if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_setxattr(dentry, name, value, size, flags);
+
+	return __ceph_setxattr(dentry, name, value, size, flags);
+}
+
 static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
961static int ceph_send_removexattr(struct dentry *dentry, const char *name) 1017static int ceph_send_removexattr(struct dentry *dentry, const char *name)
962{ 1018{
963 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 1019 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
@@ -984,7 +1040,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 	return err;
 }
 
-int ceph_removexattr(struct dentry *dentry, const char *name)
+int __ceph_removexattr(struct dentry *dentry, const char *name)
 {
 	struct inode *inode = dentry->d_inode;
 	struct ceph_vxattr *vxattr;
@@ -994,9 +1050,6 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 	int required_blob_size;
 	int dirty;
 
-	if (ceph_snap(inode) != CEPH_NOSNAP)
-		return -EROFS;
-
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;
 
@@ -1053,3 +1106,13 @@ out:
 	return err;
 }
 
+int ceph_removexattr(struct dentry *dentry, const char *name)
+{
+	if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_removexattr(dentry, name);
+
+	return __ceph_removexattr(dentry, name);
+}
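Note: the ceph change above follows a common dispatch shape. Each entry point now peels off "system." names and hands them to the generic xattr layer (which walks ceph_xattr_handlers, so the POSIX ACL handlers service them), while every other prefix still takes the filesystem's own __ceph_* path. A minimal user-space sketch of that shape, with stub functions standing in for the kernel's generic_setxattr() and the native implementation:

	/* Illustrative sketch, not kernel code: "system." attribute names
	 * are routed to the generic handler table, everything else to the
	 * filesystem's native implementation. */
	#include <stdio.h>
	#include <string.h>

	#define XATTR_SYSTEM_PREFIX	"system."
	#define XATTR_SYSTEM_PREFIX_LEN	(sizeof(XATTR_SYSTEM_PREFIX) - 1)

	static int generic_setxattr_stub(const char *name)
	{
		printf("generic layer handles %s\n", name);
		return 0;
	}

	static int native_setxattr_stub(const char *name)
	{
		printf("native path handles %s\n", name);
		return 0;
	}

	static int setxattr_dispatch(const char *name)
	{
		if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
			return generic_setxattr_stub(name);
		return native_setxattr_stub(name);
	}

	int main(void)
	{
		setxattr_dispatch("system.posix_acl_access");	/* generic */
		setxattr_dispatch("user.comment");		/* native */
		return 0;
	}

Keeping the snapshot (-EROFS) check in the thin ceph_*xattr() wrappers means both the generic and the native paths see it exactly once.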
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 51f5e0ee7237..7ff866dbb89e 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -865,8 +865,8 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
 	return rc;
 }
 
-static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
-		__u16 fid, u32 *pacllen)
+struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
+		const struct cifs_fid *cifsfid, u32 *pacllen)
 {
 	struct cifs_ntsd *pntsd = NULL;
 	unsigned int xid;
@@ -877,7 +877,8 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
 		return ERR_CAST(tlink);
 
 	xid = get_xid();
-	rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
+	rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), cifsfid->netfid, &pntsd,
+				pacllen);
 	free_xid(xid);
 
 	cifs_put_tlink(tlink);
@@ -895,9 +896,10 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 	int oplock = 0;
 	unsigned int xid;
 	int rc, create_options = 0;
-	__u16 fid;
 	struct cifs_tcon *tcon;
 	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
 
 	if (IS_ERR(tlink))
 		return ERR_CAST(tlink);
@@ -908,12 +910,19 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 	if (backup_cred(cifs_sb))
 		create_options |= CREATE_OPEN_BACKUP_INTENT;
 
-	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL,
-			create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = READ_CONTROL;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
 	if (!rc) {
-		rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
-		CIFSSMBClose(xid, tcon, fid);
+		rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen);
+		CIFSSMBClose(xid, tcon, fid.netfid);
 	}
 
 	cifs_put_tlink(tlink);
@@ -938,7 +947,7 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
 	if (!open_file)
 		return get_cifs_acl_by_path(cifs_sb, path, pacllen);
 
-	pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->fid.netfid, pacllen);
+	pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen);
 	cifsFileInfo_put(open_file);
 	return pntsd;
 }
@@ -950,10 +959,11 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 	int oplock = 0;
 	unsigned int xid;
 	int rc, access_flags, create_options = 0;
-	__u16 fid;
 	struct cifs_tcon *tcon;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
 
 	if (IS_ERR(tlink))
 		return PTR_ERR(tlink);
@@ -969,18 +979,25 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 	else
 		access_flags = WRITE_DAC;
 
-	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags,
-			create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = access_flags;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
 	if (rc) {
 		cifs_dbg(VFS, "Unable to open file to set ACL\n");
 		goto out;
 	}
 
-	rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag);
+	rc = CIFSSMBSetCIFSACL(xid, tcon, fid.netfid, pnntsd, acllen, aclflag);
 	cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc);
 
-	CIFSSMBClose(xid, tcon, fid);
+	CIFSSMBClose(xid, tcon, fid.netfid);
 out:
 	free_xid(xid);
 	cifs_put_tlink(tlink);
@@ -990,19 +1007,31 @@ out:
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
 int
 cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
-		  struct inode *inode, const char *path, const __u16 *pfid)
+		  struct inode *inode, const char *path,
+		  const struct cifs_fid *pfid)
 {
 	struct cifs_ntsd *pntsd = NULL;
 	u32 acllen = 0;
 	int rc = 0;
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_tcon *tcon;
 
 	cifs_dbg(NOISY, "converting ACL to mode for %s\n", path);
 
-	if (pfid)
-		pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
-	else
-		pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
 
+	if (pfid && (tcon->ses->server->ops->get_acl_by_fid))
+		pntsd = tcon->ses->server->ops->get_acl_by_fid(cifs_sb, pfid,
+							       &acllen);
+	else if (tcon->ses->server->ops->get_acl)
+		pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
+							&acllen);
+	else {
+		cifs_put_tlink(tlink);
+		return -EOPNOTSUPP;
+	}
 	/* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
 	if (IS_ERR(pntsd)) {
 		rc = PTR_ERR(pntsd);
@@ -1014,6 +1043,8 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
 		cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc);
 	}
 
+	cifs_put_tlink(tlink);
+
 	return rc;
 }
 
@@ -1027,15 +1058,30 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
 	__u32 secdesclen = 0;
 	struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
 	struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_tcon *tcon;
+
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
 
 	cifs_dbg(NOISY, "set ACL from mode for %s\n", path);
 
 	/* Get the security descriptor */
-	pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
+
+	if (tcon->ses->server->ops->get_acl == NULL) {
+		cifs_put_tlink(tlink);
+		return -EOPNOTSUPP;
+	}
+
+	pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
+						&secdesclen);
 	if (IS_ERR(pntsd)) {
 		rc = PTR_ERR(pntsd);
 		cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
-		goto out;
+		cifs_put_tlink(tlink);
+		return rc;
 	}
 
 	/*
1041 /* 1087 /*
@@ -1048,6 +1094,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
 	pnntsd = kmalloc(secdesclen, GFP_KERNEL);
 	if (!pnntsd) {
 		kfree(pntsd);
+		cifs_put_tlink(tlink);
 		return -ENOMEM;
 	}
 
@@ -1056,14 +1103,18 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
 
 	cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc);
 
+	if (tcon->ses->server->ops->set_acl == NULL)
+		rc = -EOPNOTSUPP;
+
 	if (!rc) {
 		/* Set the security descriptor */
-		rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag);
+		rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode,
+						     path, aclflag);
 		cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc);
 	}
+	cifs_put_tlink(tlink);
 
 	kfree(pnntsd);
 	kfree(pntsd);
-out:
 	return rc;
 }
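Note: the recurring conversion in this file (and in dir.c and inode.c below, as flagged with "see the sketch here") is mechanical: the nine-argument CIFSSMBOpen() calls become a cifs_open_parms structure the caller fills in once and hands to CIFS_open(), with the opened handle coming back through oparms.fid instead of a bare __u16. A small sketch of the same refactor, with placeholder types standing in for the kernel's structs (the field names mirror the diff; the values are invented for illustration):

	/* Sketch only: simplified stand-ins for cifs_open_parms/CIFS_open. */
	#include <stdbool.h>
	#include <stdio.h>

	struct open_parms {
		int		desired_access;
		int		create_options;
		int		disposition;
		const char	*path;
		unsigned short	*fid;		/* handle returned on success */
		bool		reconnect;
	};

	static int sketch_open(struct open_parms *oparms)
	{
		printf("open %s, access 0x%x, disposition %d\n",
		       oparms->path, oparms->desired_access, oparms->disposition);
		*oparms->fid = 42;	/* pretend the server returned a fid */
		return 0;
	}

	int main(void)
	{
		unsigned short fid;
		struct open_parms oparms = {
			.desired_access	= 0x00020000,	/* READ_CONTROL-like bit */
			.create_options	= 0,
			.disposition	= 1,		/* FILE_OPEN-like value */
			.path		= "/some/file",
			.fid		= &fid,
			.reconnect	= false,
		};
		int rc = sketch_open(&oparms);

		printf("rc=%d fid=%u\n", rc, fid);
		return rc;
	}

The design win is visible at every converted call site: named fields replace a positional argument list, so adding a parameter (like reconnect) no longer touches every caller.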
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f918a998a087..c0f3718b77a8 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -323,7 +323,8 @@ struct smb_version_operations {
 	/* async read from the server */
 	int (*async_readv)(struct cifs_readdata *);
 	/* async write to the server */
-	int (*async_writev)(struct cifs_writedata *);
+	int (*async_writev)(struct cifs_writedata *,
+			    void (*release)(struct kref *));
 	/* sync read from the server */
 	int (*sync_read)(const unsigned int, struct cifsFileInfo *,
 			 struct cifs_io_parms *, unsigned int *, char **,
@@ -370,8 +371,12 @@ struct smb_version_operations {
 	void (*new_lease_key)(struct cifs_fid *);
 	int (*generate_signingkey)(struct cifs_ses *);
 	int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
-	int (*query_mf_symlink)(const unsigned char *, char *, unsigned int *,
-			struct cifs_sb_info *, unsigned int);
+	int (*query_mf_symlink)(unsigned int, struct cifs_tcon *,
+				struct cifs_sb_info *, const unsigned char *,
+				char *, unsigned int *);
+	int (*create_mf_symlink)(unsigned int, struct cifs_tcon *,
+				 struct cifs_sb_info *, const unsigned char *,
+				 char *, unsigned int *);
 	/* if we can do cache read operations */
 	bool (*is_read_op)(__u32);
 	/* set oplock level for the inode */
@@ -385,6 +390,18 @@ struct smb_version_operations {
 			struct cifsFileInfo *target_file, u64 src_off, u64 len,
 			u64 dest_off);
 	int (*validate_negotiate)(const unsigned int, struct cifs_tcon *);
+	ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *,
+			const unsigned char *, const unsigned char *, char *,
+			size_t, const struct nls_table *, int);
+	int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *,
+			const char *, const void *, const __u16,
+			const struct nls_table *, int);
+	struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *,
+			const char *, u32 *);
+	struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *,
+			const struct cifs_fid *, u32 *);
+	int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
+			int);
 };
 
 struct smb_version_values {
@@ -496,7 +513,7 @@ struct cifs_mnt_data {
 static inline unsigned int
 get_rfc1002_length(void *buf)
 {
-	return be32_to_cpu(*((__be32 *)buf));
+	return be32_to_cpu(*((__be32 *)buf)) & 0xffffff;
 }
 
 static inline void
502static inline void 519static inline void
@@ -1054,7 +1071,7 @@ struct cifs_writedata {
 	unsigned int			pagesz;
 	unsigned int			tailsz;
 	unsigned int			nr_pages;
-	struct page			*pages[1];
+	struct page			*pages[];
 };
 
 /*
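Note: two details in the cifsglob.h hunks are easy to miss. get_rfc1002_length() now masks the big-endian word with 0xffffff because the top byte of the 4-byte RFC 1002 session header carries the message type, not length; and cifs_writedata.pages becomes a C99 flexible array member, which pairs with the allocation-size change in cifssmb.c below. A standalone sketch of the masking, with plain byte handling standing in for be32_to_cpu():

	/* Sketch: only the low 24 bits of the session header word are length. */
	#include <stdint.h>
	#include <stdio.h>

	static unsigned int rfc1002_length(const unsigned char *buf)
	{
		uint32_t be = ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) |
			      ((uint32_t)buf[2] << 8)  |  (uint32_t)buf[3];
		return be & 0xffffff;	/* strip the type byte */
	}

	int main(void)
	{
		/* type byte 0x85 in the header, payload length 0x000102 */
		unsigned char hdr[4] = { 0x85, 0x00, 0x01, 0x02 };

		printf("len = 0x%x\n", rfc1002_length(hdr));	/* 0x102 */
		return 0;
	}

Without the mask, a nonzero type byte would be misread as an enormous length.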
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 2c29db6a247e..acc4ee8ed075 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -151,7 +151,7 @@ extern struct inode *cifs_iget(struct super_block *sb,
 
 extern int cifs_get_inode_info(struct inode **inode, const char *full_path,
 			       FILE_ALL_INFO *data, struct super_block *sb,
-			       int xid, const __u16 *fid);
+			       int xid, const struct cifs_fid *fid);
 extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, unsigned int xid);
@@ -162,11 +162,13 @@ extern int cifs_rename_pending_delete(const char *full_path,
 				      const unsigned int xid);
 extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
 			      struct cifs_fattr *fattr, struct inode *inode,
-			      const char *path, const __u16 *pfid);
+			      const char *path, const struct cifs_fid *pfid);
 extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
 					kuid_t, kgid_t);
 extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
 					const char *, u32 *);
+extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *,
+					const struct cifs_fid *, u32 *);
 extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
 				const char *, int);
 
@@ -362,11 +364,8 @@ extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
 			const struct nls_table *nls_codepage);
 extern int CIFSSMB_set_compression(const unsigned int xid,
 				   struct cifs_tcon *tcon, __u16 fid);
-extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon,
-		const char *fileName, const int disposition,
-		const int access_flags, const int omode,
-		__u16 *netfid, int *pOplock, FILE_ALL_INFO *,
-		const struct nls_table *nls_codepage, int remap);
+extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms,
+		     int *oplock, FILE_ALL_INFO *buf);
 extern int SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon,
 			const char *fileName, const int disposition,
 			const int access_flags, const int omode,
@@ -476,8 +475,8 @@ extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
 extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
 			const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
 extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
-extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
-extern int CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon,
+extern bool couldbe_mf_symlink(const struct cifs_fattr *fattr);
+extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
 			      struct cifs_sb_info *cifs_sb,
 			      struct cifs_fattr *fattr,
 			      const unsigned char *path);
@@ -491,12 +490,18 @@ void cifs_readdata_release(struct kref *refcount);
 int cifs_async_readv(struct cifs_readdata *rdata);
 int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid);
 
-int cifs_async_writev(struct cifs_writedata *wdata);
+int cifs_async_writev(struct cifs_writedata *wdata,
+		      void (*release)(struct kref *kref));
 void cifs_writev_complete(struct work_struct *work);
 struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
 						work_func_t complete);
 void cifs_writedata_release(struct kref *refcount);
-int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
-			unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
-			unsigned int xid);
+int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+			  struct cifs_sb_info *cifs_sb,
+			  const unsigned char *path, char *pbuf,
+			  unsigned int *pbytes_read);
+int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+			   struct cifs_sb_info *cifs_sb,
+			   const unsigned char *path, char *pbuf,
+			   unsigned int *pbytes_written);
 #endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d707edb6b852..f3264bd7a83d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1273,104 +1273,124 @@ OldOpenRetry:
 }
 
 int
-CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon,
-	    const char *fileName, const int openDisposition,
-	    const int access_flags, const int create_options, __u16 *netfid,
-	    int *pOplock, FILE_ALL_INFO *pfile_info,
-	    const struct nls_table *nls_codepage, int remap)
+CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock,
+	  FILE_ALL_INFO *buf)
 {
 	int rc = -EACCES;
-	OPEN_REQ *pSMB = NULL;
-	OPEN_RSP *pSMBr = NULL;
+	OPEN_REQ *req = NULL;
+	OPEN_RSP *rsp = NULL;
 	int bytes_returned;
 	int name_len;
 	__u16 count;
+	struct cifs_sb_info *cifs_sb = oparms->cifs_sb;
+	struct cifs_tcon *tcon = oparms->tcon;
+	int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+	const struct nls_table *nls = cifs_sb->local_nls;
+	int create_options = oparms->create_options;
+	int desired_access = oparms->desired_access;
+	int disposition = oparms->disposition;
+	const char *path = oparms->path;
 
 openRetry:
-	rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **) &pSMB,
-		      (void **) &pSMBr);
+	rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **)&req,
+		      (void **)&rsp);
 	if (rc)
 		return rc;
 
-	pSMB->AndXCommand = 0xFF;	/* none */
+	/* no commands go after this */
+	req->AndXCommand = 0xFF;
 
-	if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-		count = 1;	/* account for one byte pad to word boundary */
-		name_len =
-		    cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
-				       fileName, PATH_MAX, nls_codepage, remap);
-		name_len++;	/* trailing null */
+	if (req->hdr.Flags2 & SMBFLG2_UNICODE) {
+		/* account for one byte pad to word boundary */
+		count = 1;
+		name_len = cifsConvertToUTF16((__le16 *)(req->fileName + 1),
+					      path, PATH_MAX, nls, remap);
+		/* trailing null */
+		name_len++;
 		name_len *= 2;
-		pSMB->NameLength = cpu_to_le16(name_len);
-	} else {		/* BB improve check for buffer overruns BB */
-		count = 0;	/* no pad */
-		name_len = strnlen(fileName, PATH_MAX);
-		name_len++;	/* trailing null */
-		pSMB->NameLength = cpu_to_le16(name_len);
-		strncpy(pSMB->fileName, fileName, name_len);
+		req->NameLength = cpu_to_le16(name_len);
+	} else {
+		/* BB improve check for buffer overruns BB */
+		/* no pad */
+		count = 0;
+		name_len = strnlen(path, PATH_MAX);
+		/* trailing null */
+		name_len++;
+		req->NameLength = cpu_to_le16(name_len);
+		strncpy(req->fileName, path, name_len);
 	}
-	if (*pOplock & REQ_OPLOCK)
-		pSMB->OpenFlags = cpu_to_le32(REQ_OPLOCK);
-	else if (*pOplock & REQ_BATCHOPLOCK)
-		pSMB->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK);
-	pSMB->DesiredAccess = cpu_to_le32(access_flags);
-	pSMB->AllocationSize = 0;
-	/* set file as system file if special file such
-	   as fifo and server expecting SFU style and
-	   no Unix extensions */
+
+	if (*oplock & REQ_OPLOCK)
+		req->OpenFlags = cpu_to_le32(REQ_OPLOCK);
+	else if (*oplock & REQ_BATCHOPLOCK)
+		req->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK);
+
+	req->DesiredAccess = cpu_to_le32(desired_access);
+	req->AllocationSize = 0;
+
+	/*
+	 * Set file as system file if special file such as fifo and server
+	 * expecting SFU style and no Unix extensions.
+	 */
 	if (create_options & CREATE_OPTION_SPECIAL)
-		pSMB->FileAttributes = cpu_to_le32(ATTR_SYSTEM);
+		req->FileAttributes = cpu_to_le32(ATTR_SYSTEM);
 	else
-		pSMB->FileAttributes = cpu_to_le32(ATTR_NORMAL);
+		req->FileAttributes = cpu_to_le32(ATTR_NORMAL);
 
-	/* XP does not handle ATTR_POSIX_SEMANTICS */
-	/* but it helps speed up case sensitive checks for other
-	   servers such as Samba */
+	/*
+	 * XP does not handle ATTR_POSIX_SEMANTICS but it helps speed up case
+	 * sensitive checks for other servers such as Samba.
+	 */
 	if (tcon->ses->capabilities & CAP_UNIX)
-		pSMB->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS);
+		req->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS);
 
 	if (create_options & CREATE_OPTION_READONLY)
-		pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY);
+		req->FileAttributes |= cpu_to_le32(ATTR_READONLY);
+
+	req->ShareAccess = cpu_to_le32(FILE_SHARE_ALL);
+	req->CreateDisposition = cpu_to_le32(disposition);
+	req->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK);
 
-	pSMB->ShareAccess = cpu_to_le32(FILE_SHARE_ALL);
-	pSMB->CreateDisposition = cpu_to_le32(openDisposition);
-	pSMB->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK);
 	/* BB Expirement with various impersonation levels and verify */
-	pSMB->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION);
-	pSMB->SecurityFlags =
-	    SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY;
+	req->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION);
+	req->SecurityFlags = SECURITY_CONTEXT_TRACKING|SECURITY_EFFECTIVE_ONLY;
 
 	count += name_len;
-	inc_rfc1001_len(pSMB, count);
+	inc_rfc1001_len(req, count);
 
-	pSMB->ByteCount = cpu_to_le16(count);
-	/* long_op set to 1 to allow for oplock break timeouts */
-	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
-			 (struct smb_hdr *)pSMBr, &bytes_returned, 0);
+	req->ByteCount = cpu_to_le16(count);
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)req,
+			 (struct smb_hdr *)rsp, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
 	if (rc) {
 		cifs_dbg(FYI, "Error in Open = %d\n", rc);
-	} else {
-		*pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
-		*netfid = pSMBr->Fid;	/* cifs fid stays in le */
-		/* Let caller know file was created so we can set the mode. */
-		/* Do we care about the CreateAction in any other cases? */
-		if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction)
-			*pOplock |= CIFS_CREATE_ACTION;
-		if (pfile_info) {
-			memcpy((char *)pfile_info, (char *)&pSMBr->CreationTime,
-			       36 /* CreationTime to Attributes */);
-			/* the file_info buf is endian converted by caller */
-			pfile_info->AllocationSize = pSMBr->AllocationSize;
-			pfile_info->EndOfFile = pSMBr->EndOfFile;
-			pfile_info->NumberOfLinks = cpu_to_le32(1);
-			pfile_info->DeletePending = 0;
-		}
+		cifs_buf_release(req);
+		if (rc == -EAGAIN)
+			goto openRetry;
+		return rc;
 	}
 
-	cifs_buf_release(pSMB);
-	if (rc == -EAGAIN)
-		goto openRetry;
+	/* 1 byte no need to le_to_cpu */
+	*oplock = rsp->OplockLevel;
+	/* cifs fid stays in le */
+	oparms->fid->netfid = rsp->Fid;
+
+	/* Let caller know file was created so we can set the mode. */
+	/* Do we care about the CreateAction in any other cases? */
+	if (cpu_to_le32(FILE_CREATE) == rsp->CreateAction)
+		*oplock |= CIFS_CREATE_ACTION;
+
+	if (buf) {
+		/* copy from CreationTime to Attributes */
+		memcpy((char *)buf, (char *)&rsp->CreationTime, 36);
+		/* the file_info buf is endian converted by caller */
+		buf->AllocationSize = rsp->AllocationSize;
+		buf->EndOfFile = rsp->EndOfFile;
+		buf->NumberOfLinks = cpu_to_le32(1);
+		buf->DeletePending = 0;
+	}
+
+	cifs_buf_release(req);
 	return rc;
 }
 
@@ -1890,7 +1910,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
 
 	do {
 		server = tlink_tcon(wdata->cfile->tlink)->ses->server;
-		rc = server->ops->async_writev(wdata);
+		rc = server->ops->async_writev(wdata, cifs_writedata_release);
 	} while (rc == -EAGAIN);
 
 	for (i = 0; i < wdata->nr_pages; i++) {
@@ -1942,15 +1962,9 @@ cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
 {
 	struct cifs_writedata *wdata;
 
-	/* this would overflow */
-	if (nr_pages == 0) {
-		cifs_dbg(VFS, "%s: called with nr_pages == 0!\n", __func__);
-		return NULL;
-	}
-
 	/* writedata + number of page pointers */
 	wdata = kzalloc(sizeof(*wdata) +
-			sizeof(struct page *) * (nr_pages - 1), GFP_NOFS);
+			sizeof(struct page *) * nr_pages, GFP_NOFS);
 	if (wdata != NULL) {
 		kref_init(&wdata->refcount);
 		INIT_LIST_HEAD(&wdata->list);
@@ -2011,7 +2025,8 @@ cifs_writev_callback(struct mid_q_entry *mid)
 
 /* cifs_async_writev - send an async write, and set up mid to handle result */
 int
-cifs_async_writev(struct cifs_writedata *wdata)
+cifs_async_writev(struct cifs_writedata *wdata,
+		  void (*release)(struct kref *kref))
 {
 	int rc = -EACCES;
 	WRITE_REQ *smb = NULL;
@@ -2085,7 +2100,7 @@ cifs_async_writev(struct cifs_writedata *wdata)
 	if (rc == 0)
 		cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
 	else
-		kref_put(&wdata->refcount, cifs_writedata_release);
+		kref_put(&wdata->refcount, release);
 
 async_writev_out:
 	cifs_small_buf_release(smb);
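Note: the cifs_writedata_alloc() change above works together with the pages[] flexible array member from cifsglob.h. The allocation is now sizeof(*wdata) plus nr_pages pointers, so nr_pages == 0 no longer underflows the old (nr_pages - 1) arithmetic and the special-case guard could be dropped. A sketch of that sizing pattern, with plain calloc standing in for kzalloc:

	/* Sketch: flexible-array sizing, user-space stand-in for kzalloc. */
	#include <stdio.h>
	#include <stdlib.h>

	struct writedata {
		unsigned int	nr_pages;
		void		*pages[];	/* was pages[1] */
	};

	static struct writedata *writedata_alloc(unsigned int nr_pages)
	{
		struct writedata *wdata;

		wdata = calloc(1, sizeof(*wdata) + sizeof(void *) * nr_pages);
		if (wdata)
			wdata->nr_pages = nr_pages;
		return wdata;
	}

	int main(void)
	{
		struct writedata *w = writedata_alloc(0);	/* safe now */

		printf("allocation %s\n", w ? "ok" : "failed");
		free(w);
		return 0;
	}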
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index a514e0a65f69..3db0c5fd9a11 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -378,7 +378,7 @@ cifs_create_get_file_info:
 							 xid);
 	else {
 		rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,
-					 xid, &fid->netfid);
+					 xid, fid);
 		if (newinode) {
 			if (server->ops->set_lease_key)
 				server->ops->set_lease_key(newinode, fid);
@@ -565,12 +565,13 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 	int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
 	struct cifs_sb_info *cifs_sb;
 	struct tcon_link *tlink;
-	struct cifs_tcon *pTcon;
+	struct cifs_tcon *tcon;
 	struct cifs_io_parms io_parms;
 	char *full_path = NULL;
 	struct inode *newinode = NULL;
 	int oplock = 0;
-	u16 fileHandle;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
 	FILE_ALL_INFO *buf = NULL;
 	unsigned int bytes_written;
 	struct win_dev *pdev;
@@ -583,7 +584,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 	if (IS_ERR(tlink))
 		return PTR_ERR(tlink);
 
-	pTcon = tlink_tcon(tlink);
+	tcon = tlink_tcon(tlink);
 
 	xid = get_xid();
 
@@ -593,7 +594,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 		goto mknod_out;
 	}
 
-	if (pTcon->unix_ext) {
+	if (tcon->unix_ext) {
 		struct cifs_unix_set_info_args args = {
 			.mode	= mode & ~current_umask(),
 			.ctime	= NO_CHANGE_64,
@@ -608,7 +609,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 			args.uid = INVALID_UID; /* no change */
 			args.gid = INVALID_GID; /* no change */
 		}
-		rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
+		rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
 					    cifs_sb->local_nls,
 					    cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -640,42 +641,44 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
 	if (backup_cred(cifs_sb))
 		create_options |= CREATE_OPEN_BACKUP_INTENT;
 
-	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
-			 GENERIC_WRITE, create_options,
-			 &fileHandle, &oplock, buf, cifs_sb->local_nls,
-			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = GENERIC_WRITE;
+	oparms.create_options = create_options;
+	oparms.disposition = FILE_CREATE;
+	oparms.path = full_path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, buf);
 	if (rc)
 		goto mknod_out;
 
-	/* BB Do not bother to decode buf since no local inode yet to put
-	 * timestamps in, but we can reuse it safely */
+	/*
+	 * BB Do not bother to decode buf since no local inode yet to put
+	 * timestamps in, but we can reuse it safely.
+	 */
 
 	pdev = (struct win_dev *)buf;
-	io_parms.netfid = fileHandle;
+	io_parms.netfid = fid.netfid;
 	io_parms.pid = current->tgid;
-	io_parms.tcon = pTcon;
+	io_parms.tcon = tcon;
 	io_parms.offset = 0;
 	io_parms.length = sizeof(struct win_dev);
 	if (S_ISCHR(mode)) {
 		memcpy(pdev->type, "IntxCHR", 8);
-		pdev->major =
-		      cpu_to_le64(MAJOR(device_number));
-		pdev->minor =
-		      cpu_to_le64(MINOR(device_number));
-		rc = CIFSSMBWrite(xid, &io_parms,
-			&bytes_written, (char *)pdev,
-			NULL, 0);
+		pdev->major = cpu_to_le64(MAJOR(device_number));
+		pdev->minor = cpu_to_le64(MINOR(device_number));
+		rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev,
+				  NULL, 0);
 	} else if (S_ISBLK(mode)) {
 		memcpy(pdev->type, "IntxBLK", 8);
-		pdev->major =
-		      cpu_to_le64(MAJOR(device_number));
-		pdev->minor =
-		      cpu_to_le64(MINOR(device_number));
-		rc = CIFSSMBWrite(xid, &io_parms,
-			&bytes_written, (char *)pdev,
-			NULL, 0);
+		pdev->major = cpu_to_le64(MAJOR(device_number));
+		pdev->minor = cpu_to_le64(MINOR(device_number));
+		rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev,
+				  NULL, 0);
 	} /* else if (S_ISFIFO) */
-	CIFSSMBClose(xid, pTcon, fileHandle);
+	CIFSSMBClose(xid, tcon, fid.netfid);
 	d_drop(direntry);
 
 	/* FIXME: add code here to set EAs */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5a5a87240fe2..834fce759d80 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -244,7 +244,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
 							 xid);
 	else
 		rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
-					 xid, &fid->netfid);
+					 xid, fid);
 
 out:
 	kfree(buf);
@@ -678,7 +678,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 
 	/*
 	 * Can not refresh inode by passing in file_info buf to be returned by
-	 * CIFSSMBOpen and then calling get_inode_info with returned buf since
+	 * ops->open and then calling get_inode_info with returned buf since
 	 * file might have write behind data that needs to be flushed and server
 	 * version of file size can be stale. If we knew for sure that inode was
 	 * not dirty locally we could do this.
@@ -2043,7 +2043,8 @@ retry:
 			}
 			wdata->pid = wdata->cfile->pid;
 			server = tlink_tcon(wdata->cfile->tlink)->ses->server;
-			rc = server->ops->async_writev(wdata);
+			rc = server->ops->async_writev(wdata,
+						       cifs_writedata_release);
 		} while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
 
 		for (i = 0; i < nr_pages; ++i)
@@ -2331,9 +2332,20 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
 }
 
 static void
-cifs_uncached_writev_complete(struct work_struct *work)
+cifs_uncached_writedata_release(struct kref *refcount)
 {
 	int i;
+	struct cifs_writedata *wdata = container_of(refcount,
+					struct cifs_writedata, refcount);
+
+	for (i = 0; i < wdata->nr_pages; i++)
+		put_page(wdata->pages[i]);
+	cifs_writedata_release(refcount);
+}
+
+static void
+cifs_uncached_writev_complete(struct work_struct *work)
+{
 	struct cifs_writedata *wdata = container_of(work,
 					struct cifs_writedata, work);
 	struct inode *inode = wdata->cfile->dentry->d_inode;
@@ -2347,12 +2359,7 @@ cifs_uncached_writev_complete(struct work_struct *work)
 
 	complete(&wdata->done);
 
-	if (wdata->result != -EAGAIN) {
-		for (i = 0; i < wdata->nr_pages; i++)
-			put_page(wdata->pages[i]);
-	}
-
-	kref_put(&wdata->refcount, cifs_writedata_release);
+	kref_put(&wdata->refcount, cifs_uncached_writedata_release);
 }
 
 /* attempt to send write to server, retry on any -EAGAIN errors */
@@ -2370,7 +2377,8 @@ cifs_uncached_retry_writev(struct cifs_writedata *wdata)
 			if (rc != 0)
 				continue;
 		}
-		rc = server->ops->async_writev(wdata);
+		rc = server->ops->async_writev(wdata,
+					       cifs_uncached_writedata_release);
 	} while (rc == -EAGAIN);
 
 	return rc;
@@ -2381,7 +2389,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
 		 unsigned long nr_segs, loff_t *poffset)
 {
 	unsigned long nr_pages, i;
-	size_t copied, len, cur_len;
+	size_t bytes, copied, len, cur_len;
 	ssize_t total_written = 0;
 	loff_t offset;
 	struct iov_iter it;
@@ -2436,14 +2444,45 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
 
 		save_len = cur_len;
 		for (i = 0; i < nr_pages; i++) {
-			copied = min_t(const size_t, cur_len, PAGE_SIZE);
+			bytes = min_t(const size_t, cur_len, PAGE_SIZE);
 			copied = iov_iter_copy_from_user(wdata->pages[i], &it,
-							 0, copied);
+							 0, bytes);
 			cur_len -= copied;
 			iov_iter_advance(&it, copied);
+			/*
+			 * If we didn't copy as much as we expected, then that
+			 * may mean we trod into an unmapped area. Stop copying
+			 * at that point. On the next pass through the big
+			 * loop, we'll likely end up getting a zero-length
+			 * write and bailing out of it.
+			 */
+			if (copied < bytes)
+				break;
 		}
 		cur_len = save_len - cur_len;
 
+		/*
+		 * If we have no data to send, then that probably means that
+		 * the copy above failed altogether. That's most likely because
+		 * the address in the iovec was bogus. Set the rc to -EFAULT,
+		 * free anything we allocated and bail out.
+		 */
+		if (!cur_len) {
+			for (i = 0; i < nr_pages; i++)
+				put_page(wdata->pages[i]);
+			kfree(wdata);
+			rc = -EFAULT;
+			break;
+		}
+
+		/*
+		 * i + 1 now represents the number of pages we actually used in
+		 * the copy phase above. Bring nr_pages down to that, and free
+		 * any pages that we didn't use.
+		 */
+		for ( ; nr_pages > i + 1; nr_pages--)
+			put_page(wdata->pages[nr_pages - 1]);
+
 		wdata->sync_mode = WB_SYNC_ALL;
 		wdata->nr_pages = nr_pages;
 		wdata->offset = (__u64)offset;
@@ -2454,7 +2493,8 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
 		wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
 		rc = cifs_uncached_retry_writev(wdata);
 		if (rc) {
-			kref_put(&wdata->refcount, cifs_writedata_release);
+			kref_put(&wdata->refcount,
+				 cifs_uncached_writedata_release);
 			break;
 		}
 
@@ -2496,7 +2536,7 @@ restart_loop:
 			}
 		}
 		list_del_init(&wdata->list);
-		kref_put(&wdata->refcount, cifs_writedata_release);
+		kref_put(&wdata->refcount, cifs_uncached_writedata_release);
 	}
 
 	if (total_written > 0)
@@ -2539,31 +2579,19 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
 	struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
 	ssize_t rc = -EACCES;
+	loff_t lock_pos = pos;
 
-	BUG_ON(iocb->ki_pos != pos);
-
+	if (file->f_flags & O_APPEND)
+		lock_pos = i_size_read(inode);
 	/*
 	 * We need to hold the sem to be sure nobody modifies lock list
 	 * with a brlock that prevents writing.
 	 */
 	down_read(&cinode->lock_sem);
-	if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
+	if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs),
 				     server->vals->exclusive_lock_type, NULL,
-				     CIFS_WRITE_OP)) {
-		mutex_lock(&inode->i_mutex);
-		rc = __generic_file_aio_write(iocb, iov, nr_segs,
-					       &iocb->ki_pos);
-		mutex_unlock(&inode->i_mutex);
-	}
-
-	if (rc > 0) {
-		ssize_t err;
-
-		err = generic_write_sync(file, pos, rc);
-		if (err < 0 && rc > 0)
-			rc = err;
-	}
-
+				     CIFS_WRITE_OP))
+		rc = generic_file_aio_write(iocb, iov, nr_segs, pos);
 	up_read(&cinode->lock_sem);
 	return rc;
 }
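Note: the file.c changes above thread a release callback through async_writev() because uncached writes hold per-page references that cached writeback does not; the uncached release drops those references and then chains to the common one, so pages are put exactly once whether the write succeeds, fails, or is requeued. A toy sketch of choosing the destructor at put time, with a simplified counter standing in for kref_put():

	/* Sketch: per-caller release callback for a refcounted object. */
	#include <stdio.h>

	struct obj {
		int refs;
	};

	static void common_release(struct obj *o)
	{
		(void)o;
		printf("common release\n");
	}

	static void uncached_release(struct obj *o)
	{
		printf("drop per-page references first\n");
		common_release(o);	/* then chain to the shared teardown */
	}

	static void obj_put(struct obj *o, void (*release)(struct obj *))
	{
		if (--o->refs == 0)
			release(o);
	}

	int main(void)
	{
		struct obj a = { .refs = 1 };
		struct obj b = { .refs = 1 };

		obj_put(&a, common_release);	/* cached writeback path */
		obj_put(&b, uncached_release);	/* uncached write path */
		return 0;
	}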
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 49719b8228e5..aadc2b68678b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -383,10 +383,10 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	/* check for Minshall+French symlinks */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
-		int tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr,
+		int tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
 					       full_path);
 		if (tmprc)
-			cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
+			cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
 	}
 
 	if (*pinode == NULL) {
@@ -404,18 +404,20 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 }
 
 static int
-cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
+cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
 	      struct cifs_sb_info *cifs_sb, unsigned int xid)
 {
 	int rc;
 	int oplock = 0;
-	__u16 netfid;
 	struct tcon_link *tlink;
 	struct cifs_tcon *tcon;
+	struct cifs_fid fid;
+	struct cifs_open_parms oparms;
 	struct cifs_io_parms io_parms;
 	char buf[24];
 	unsigned int bytes_read;
 	char *pbuf;
+	int buf_type = CIFS_NO_BUFFER;
 
 	pbuf = buf;
 
@@ -436,62 +438,69 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 		return PTR_ERR(tlink);
 	tcon = tlink_tcon(tlink);
 
-	rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ,
-			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
-			 cifs_sb->local_nls,
-			 cifs_sb->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if (rc == 0) {
-		int buf_type = CIFS_NO_BUFFER;
-		/* Read header */
-		io_parms.netfid = netfid;
-		io_parms.pid = current->tgid;
-		io_parms.tcon = tcon;
-		io_parms.offset = 0;
-		io_parms.length = 24;
-		rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf,
-				 &buf_type);
-		if ((rc == 0) && (bytes_read >= 8)) {
-			if (memcmp("IntxBLK", pbuf, 8) == 0) {
-				cifs_dbg(FYI, "Block device\n");
-				fattr->cf_mode |= S_IFBLK;
-				fattr->cf_dtype = DT_BLK;
-				if (bytes_read == 24) {
-					/* we have enough to decode dev num */
-					__u64 mjr; /* major */
-					__u64 mnr; /* minor */
-					mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
-					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
-					fattr->cf_rdev = MKDEV(mjr, mnr);
-				}
-			} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
-				cifs_dbg(FYI, "Char device\n");
-				fattr->cf_mode |= S_IFCHR;
-				fattr->cf_dtype = DT_CHR;
-				if (bytes_read == 24) {
-					/* we have enough to decode dev num */
-					__u64 mjr; /* major */
-					__u64 mnr; /* minor */
-					mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
-					mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
-					fattr->cf_rdev = MKDEV(mjr, mnr);
-				}
-			} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
-				cifs_dbg(FYI, "Symlink\n");
-				fattr->cf_mode |= S_IFLNK;
-				fattr->cf_dtype = DT_LNK;
-			} else {
-				fattr->cf_mode |= S_IFREG; /* file? */
-				fattr->cf_dtype = DT_REG;
-				rc = -EOPNOTSUPP;
-			}
+	oparms.tcon = tcon;
+	oparms.cifs_sb = cifs_sb;
+	oparms.desired_access = GENERIC_READ;
+	oparms.create_options = CREATE_NOT_DIR;
+	oparms.disposition = FILE_OPEN;
+	oparms.path = path;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = CIFS_open(xid, &oparms, &oplock, NULL);
+	if (rc) {
+		cifs_put_tlink(tlink);
+		return rc;
+	}
+
+	/* Read header */
+	io_parms.netfid = fid.netfid;
+	io_parms.pid = current->tgid;
+	io_parms.tcon = tcon;
+	io_parms.offset = 0;
+	io_parms.length = 24;
+
+	rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
+	if ((rc == 0) && (bytes_read >= 8)) {
+		if (memcmp("IntxBLK", pbuf, 8) == 0) {
+			cifs_dbg(FYI, "Block device\n");
+			fattr->cf_mode |= S_IFBLK;
+			fattr->cf_dtype = DT_BLK;
+			if (bytes_read == 24) {
+				/* we have enough to decode dev num */
+				__u64 mjr; /* major */
+				__u64 mnr; /* minor */
+				mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
+				mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
+				fattr->cf_rdev = MKDEV(mjr, mnr);
+			}
+		} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
+			cifs_dbg(FYI, "Char device\n");
+			fattr->cf_mode |= S_IFCHR;
+			fattr->cf_dtype = DT_CHR;
+			if (bytes_read == 24) {
+				/* we have enough to decode dev num */
+				__u64 mjr; /* major */
+				__u64 mnr; /* minor */
+				mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
+				mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
+				fattr->cf_rdev = MKDEV(mjr, mnr);
+			}
+		} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
+			cifs_dbg(FYI, "Symlink\n");
+			fattr->cf_mode |= S_IFLNK;
+			fattr->cf_dtype = DT_LNK;
 		} else {
-			fattr->cf_mode |= S_IFREG; /* then it is a file */
+			fattr->cf_mode |= S_IFREG; /* file? */
 			fattr->cf_dtype = DT_REG;
-			rc = -EOPNOTSUPP; /* or some unknown SFU type */
+			rc = -EOPNOTSUPP;
 		}
-		CIFSSMBClose(xid, tcon, netfid);
+	} else {
+		fattr->cf_mode |= S_IFREG; /* then it is a file */
+		fattr->cf_dtype = DT_REG;
+		rc = -EOPNOTSUPP; /* or some unknown SFU type */
 	}
+
+	CIFSSMBClose(xid, tcon, fid.netfid);
 	cifs_put_tlink(tlink);
 	return rc;
 }
@@ -518,10 +527,15 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
518 return PTR_ERR(tlink); 527 return PTR_ERR(tlink);
519 tcon = tlink_tcon(tlink); 528 tcon = tlink_tcon(tlink);
520 529
521 rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS", 530 if (tcon->ses->server->ops->query_all_EAs == NULL) {
522 ea_value, 4 /* size of buf */, cifs_sb->local_nls, 531 cifs_put_tlink(tlink);
523 cifs_sb->mnt_cifs_flags & 532 return -EOPNOTSUPP;
524 CIFS_MOUNT_MAP_SPECIAL_CHR); 533 }
534
535 rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path,
536 "SETFILEBITS", ea_value, 4 /* size of buf */,
537 cifs_sb->local_nls,
538 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
525 cifs_put_tlink(tlink); 539 cifs_put_tlink(tlink);
526 if (rc < 0) 540 if (rc < 0)
527 return (int)rc; 541 return (int)rc;
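
The cifs_sfu_mode() hunk (like the xattr.c changes later in this series) stops calling CIFSSMBQAllEAs() directly and dispatches through the per-dialect ops table, failing with -EOPNOTSUPP when a dialect does not wire the op up. The shape of that guard, as a self-contained sketch:

#include <errno.h>
#include <stddef.h>

struct server_ops {
        int (*query_all_EAs)(const char *path, char *buf, size_t len);
};

static int query_eas(const struct server_ops *ops, const char *path,
                     char *buf, size_t len)
{
        if (!ops->query_all_EAs)
                return -EOPNOTSUPP;     /* dialect provides no EA call */
        return ops->query_all_EAs(path, buf, len);
}

Note that the kernel version must also drop the tlink reference it took earlier before bailing out, which is exactly what the added cifs_put_tlink() call does.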
@@ -663,7 +677,7 @@ cgfi_exit:
663int 677int
664cifs_get_inode_info(struct inode **inode, const char *full_path, 678cifs_get_inode_info(struct inode **inode, const char *full_path,
665 FILE_ALL_INFO *data, struct super_block *sb, int xid, 679 FILE_ALL_INFO *data, struct super_block *sb, int xid,
666 const __u16 *fid) 680 const struct cifs_fid *fid)
667{ 681{
668 bool validinum = false; 682 bool validinum = false;
669 __u16 srchflgs; 683 __u16 srchflgs;
@@ -800,10 +814,10 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
800 814
801 /* check for Minshall+French symlinks */ 815 /* check for Minshall+French symlinks */
802 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { 816 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
803 tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr, 817 tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
804 full_path); 818 full_path);
805 if (tmprc) 819 if (tmprc)
806 cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); 820 cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
807 } 821 }
808 822
809 if (!*inode) { 823 if (!*inode) {
@@ -1032,7 +1046,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1032{ 1046{
1033 int oplock = 0; 1047 int oplock = 0;
1034 int rc; 1048 int rc;
1035 __u16 netfid; 1049 struct cifs_fid fid;
1050 struct cifs_open_parms oparms;
1036 struct inode *inode = dentry->d_inode; 1051 struct inode *inode = dentry->d_inode;
1037 struct cifsInodeInfo *cifsInode = CIFS_I(inode); 1052 struct cifsInodeInfo *cifsInode = CIFS_I(inode);
1038 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 1053 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -1055,10 +1070,16 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1055 goto out; 1070 goto out;
1056 } 1071 }
1057 1072
1058 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, 1073 oparms.tcon = tcon;
1059 DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, 1074 oparms.cifs_sb = cifs_sb;
1060 &netfid, &oplock, NULL, cifs_sb->local_nls, 1075 oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES;
1061 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1076 oparms.create_options = CREATE_NOT_DIR;
1077 oparms.disposition = FILE_OPEN;
1078 oparms.path = full_path;
1079 oparms.fid = &fid;
1080 oparms.reconnect = false;
1081
1082 rc = CIFS_open(xid, &oparms, &oplock, NULL);
1062 if (rc != 0) 1083 if (rc != 0)
1063 goto out; 1084 goto out;
1064 1085
@@ -1079,7 +1100,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1079 goto out_close; 1100 goto out_close;
1080 } 1101 }
1081 info_buf->Attributes = cpu_to_le32(dosattr); 1102 info_buf->Attributes = cpu_to_le32(dosattr);
1082 rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, 1103 rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid,
1083 current->tgid); 1104 current->tgid);
1084 /* although we would like to mark the file hidden 1105 /* although we would like to mark the file hidden
1085 if that fails we will still try to rename it */ 1106 if that fails we will still try to rename it */
@@ -1090,7 +1111,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1090 } 1111 }
1091 1112
1092 /* rename the file */ 1113 /* rename the file */
1093 rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls, 1114 rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL,
1115 cifs_sb->local_nls,
1094 cifs_sb->mnt_cifs_flags & 1116 cifs_sb->mnt_cifs_flags &
1095 CIFS_MOUNT_MAP_SPECIAL_CHR); 1117 CIFS_MOUNT_MAP_SPECIAL_CHR);
1096 if (rc != 0) { 1118 if (rc != 0) {
@@ -1100,7 +1122,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1100 1122
1101 /* try to set DELETE_ON_CLOSE */ 1123 /* try to set DELETE_ON_CLOSE */
1102 if (!cifsInode->delete_pending) { 1124 if (!cifsInode->delete_pending) {
1103 rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, 1125 rc = CIFSSMBSetFileDisposition(xid, tcon, true, fid.netfid,
1104 current->tgid); 1126 current->tgid);
1105 /* 1127 /*
1106 * some samba versions return -ENOENT when we try to set the 1128 * some samba versions return -ENOENT when we try to set the
@@ -1120,7 +1142,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1120 } 1142 }
1121 1143
1122out_close: 1144out_close:
1123 CIFSSMBClose(xid, tcon, netfid); 1145 CIFSSMBClose(xid, tcon, fid.netfid);
1124out: 1146out:
1125 kfree(info_buf); 1147 kfree(info_buf);
1126 cifs_put_tlink(tlink); 1148 cifs_put_tlink(tlink);
@@ -1132,13 +1154,13 @@ out:
1132 * them anyway. 1154 * them anyway.
1133 */ 1155 */
1134undo_rename: 1156undo_rename:
1135 CIFSSMBRenameOpenFile(xid, tcon, netfid, dentry->d_name.name, 1157 CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name,
1136 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 1158 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
1137 CIFS_MOUNT_MAP_SPECIAL_CHR); 1159 CIFS_MOUNT_MAP_SPECIAL_CHR);
1138undo_setattr: 1160undo_setattr:
1139 if (dosattr != origattr) { 1161 if (dosattr != origattr) {
1140 info_buf->Attributes = cpu_to_le32(origattr); 1162 info_buf->Attributes = cpu_to_le32(origattr);
1141 if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, 1163 if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid,
1142 current->tgid)) 1164 current->tgid))
1143 cifsInode->cifsAttrs = origattr; 1165 cifsInode->cifsAttrs = origattr;
1144 } 1166 }
@@ -1549,7 +1571,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
1549 struct tcon_link *tlink; 1571 struct tcon_link *tlink;
1550 struct cifs_tcon *tcon; 1572 struct cifs_tcon *tcon;
1551 struct TCP_Server_Info *server; 1573 struct TCP_Server_Info *server;
1552 __u16 srcfid; 1574 struct cifs_fid fid;
1575 struct cifs_open_parms oparms;
1553 int oplock, rc; 1576 int oplock, rc;
1554 1577
1555 tlink = cifs_sb_tlink(cifs_sb); 1578 tlink = cifs_sb_tlink(cifs_sb);
@@ -1576,17 +1599,23 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
1576 if (to_dentry->d_parent != from_dentry->d_parent) 1599 if (to_dentry->d_parent != from_dentry->d_parent)
1577 goto do_rename_exit; 1600 goto do_rename_exit;
1578 1601
1602 oparms.tcon = tcon;
1603 oparms.cifs_sb = cifs_sb;
1579 /* open the file to be renamed -- we need DELETE perms */ 1604 /* open the file to be renamed -- we need DELETE perms */
1580 rc = CIFSSMBOpen(xid, tcon, from_path, FILE_OPEN, DELETE, 1605 oparms.desired_access = DELETE;
1581 CREATE_NOT_DIR, &srcfid, &oplock, NULL, 1606 oparms.create_options = CREATE_NOT_DIR;
1582 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 1607 oparms.disposition = FILE_OPEN;
1583 CIFS_MOUNT_MAP_SPECIAL_CHR); 1608 oparms.path = from_path;
1609 oparms.fid = &fid;
1610 oparms.reconnect = false;
1611
1612 rc = CIFS_open(xid, &oparms, &oplock, NULL);
1584 if (rc == 0) { 1613 if (rc == 0) {
1585 rc = CIFSSMBRenameOpenFile(xid, tcon, srcfid, 1614 rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid,
1586 (const char *) to_dentry->d_name.name, 1615 (const char *) to_dentry->d_name.name,
1587 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 1616 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
1588 CIFS_MOUNT_MAP_SPECIAL_CHR); 1617 CIFS_MOUNT_MAP_SPECIAL_CHR);
1589 CIFSSMBClose(xid, tcon, srcfid); 1618 CIFSSMBClose(xid, tcon, fid.netfid);
1590 } 1619 }
1591do_rename_exit: 1620do_rename_exit:
1592 cifs_put_tlink(tlink); 1621 cifs_put_tlink(tlink);
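
A recurring pattern in this file: every converted call site replaces CIFSSMBOpen()'s long argument list with a struct cifs_open_parms that is filled field by field and handed to CIFS_open(xid, &oparms, &oplock, buf). The fields, as used in the hunks above (toy declarations, types approximated; the real definition lives in the cifs headers):

struct cifs_tcon;
struct cifs_sb_info;
struct cifs_fid;

struct cifs_open_parms {
        struct cifs_tcon        *tcon;          /* was: tcon argument */
        struct cifs_sb_info     *cifs_sb;       /* carries nls + mount flags */
        int                     desired_access; /* e.g. DELETE | FILE_WRITE_ATTRIBUTES */
        int                     create_options; /* e.g. CREATE_NOT_DIR */
        int                     disposition;    /* e.g. FILE_OPEN */
        const char              *path;
        struct cifs_fid         *fid;           /* out: fid.netfid replaces __u16 netfid */
        unsigned char           reconnect;      /* bool in the real struct */
};

Besides shortening the calls, this lets CIFS_open() derive the nls table and the CIFS_MOUNT_MAP_SPECIAL_CHR flag from cifs_sb instead of having every caller pass them along.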
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 92aee08483a5..264ece71bdb2 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -29,6 +29,10 @@
29#include "cifs_debug.h" 29#include "cifs_debug.h"
30#include "cifs_fs_sb.h" 30#include "cifs_fs_sb.h"
31 31
32/*
33 * M-F Symlink Functions - Begin
34 */
35
32#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1) 36#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
33#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1)) 37#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
34#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1)) 38#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
@@ -91,10 +95,8 @@ symlink_hash_err:
91} 95}
92 96
93static int 97static int
94CIFSParseMFSymlink(const u8 *buf, 98parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
95 unsigned int buf_len, 99 char **_link_str)
96 unsigned int *_link_len,
97 char **_link_str)
98{ 100{
99 int rc; 101 int rc;
100 unsigned int link_len; 102 unsigned int link_len;
@@ -137,7 +139,7 @@ CIFSParseMFSymlink(const u8 *buf,
137} 139}
138 140
139static int 141static int
140CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str) 142format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)
141{ 143{
142 int rc; 144 int rc;
143 unsigned int link_len; 145 unsigned int link_len;
@@ -180,190 +182,94 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
180 return 0; 182 return 0;
181} 183}
182 184
185bool
186couldbe_mf_symlink(const struct cifs_fattr *fattr)
187{
188 if (!S_ISREG(fattr->cf_mode))
189 /* it's not a symlink */
190 return false;
191
192 if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
193 /* it's not a symlink */
194 return false;
195
196 return true;
197}
198
183static int 199static int
184CIFSCreateMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, 200create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
185 const char *fromName, const char *toName, 201 struct cifs_sb_info *cifs_sb, const char *fromName,
186 struct cifs_sb_info *cifs_sb) 202 const char *toName)
187{ 203{
188 int rc; 204 int rc;
189 int oplock = 0;
190 int remap;
191 int create_options = CREATE_NOT_DIR;
192 __u16 netfid = 0;
193 u8 *buf; 205 u8 *buf;
194 unsigned int bytes_written = 0; 206 unsigned int bytes_written = 0;
195 struct cifs_io_parms io_parms;
196 struct nls_table *nls_codepage;
197
198 nls_codepage = cifs_sb->local_nls;
199 remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
200 207
201 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); 208 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
202 if (!buf) 209 if (!buf)
203 return -ENOMEM; 210 return -ENOMEM;
204 211
205 rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName); 212 rc = format_mf_symlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
206 if (rc != 0) { 213 if (rc)
207 kfree(buf); 214 goto out;
208 return rc;
209 }
210
211 if (backup_cred(cifs_sb))
212 create_options |= CREATE_OPEN_BACKUP_INTENT;
213
214 rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
215 create_options, &netfid, &oplock, NULL,
216 nls_codepage, remap);
217 if (rc != 0) {
218 kfree(buf);
219 return rc;
220 }
221
222 io_parms.netfid = netfid;
223 io_parms.pid = current->tgid;
224 io_parms.tcon = tcon;
225 io_parms.offset = 0;
226 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
227 215
228 rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, buf, NULL, 0); 216 rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb,
229 CIFSSMBClose(xid, tcon, netfid); 217 fromName, buf, &bytes_written);
230 kfree(buf); 218 if (rc)
231 if (rc != 0) 219 goto out;
232 return rc;
233 220
234 if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE) 221 if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
235 return -EIO; 222 rc = -EIO;
236 223out:
237 return 0; 224 kfree(buf);
225 return rc;
238} 226}
239 227
240static int 228static int
241CIFSQueryMFSymLink(const unsigned int xid, struct cifs_tcon *tcon, 229query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
242 const unsigned char *searchName, char **symlinkinfo, 230 struct cifs_sb_info *cifs_sb, const unsigned char *path,
243 const struct nls_table *nls_codepage, int remap) 231 char **symlinkinfo)
244{ 232{
245 int rc; 233 int rc;
246 int oplock = 0; 234 u8 *buf = NULL;
247 __u16 netfid = 0;
248 u8 *buf;
249 char *pbuf;
250 unsigned int bytes_read = 0;
251 int buf_type = CIFS_NO_BUFFER;
252 unsigned int link_len = 0; 235 unsigned int link_len = 0;
253 struct cifs_io_parms io_parms; 236 unsigned int bytes_read = 0;
254 FILE_ALL_INFO file_info;
255
256 rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
257 CREATE_NOT_DIR, &netfid, &oplock, &file_info,
258 nls_codepage, remap);
259 if (rc != 0)
260 return rc;
261
262 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
263 CIFSSMBClose(xid, tcon, netfid);
264 /* it's not a symlink */
265 return -EINVAL;
266 }
267 237
268 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); 238 buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
269 if (!buf) 239 if (!buf)
270 return -ENOMEM; 240 return -ENOMEM;
271 pbuf = buf;
272 io_parms.netfid = netfid;
273 io_parms.pid = current->tgid;
274 io_parms.tcon = tcon;
275 io_parms.offset = 0;
276 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
277
278 rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
279 CIFSSMBClose(xid, tcon, netfid);
280 if (rc != 0) {
281 kfree(buf);
282 return rc;
283 }
284
285 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo);
286 kfree(buf);
287 if (rc != 0)
288 return rc;
289
290 return 0;
291}
292
293bool
294CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
295{
296 if (!(fattr->cf_mode & S_IFREG))
297 /* it's not a symlink */
298 return false;
299 241
300 if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE) 242 if (tcon->ses->server->ops->query_mf_symlink)
301 /* it's not a symlink */ 243 rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon,
302 return false; 244 cifs_sb, path, buf, &bytes_read);
303 245 else
304 return true; 246 rc = -ENOSYS;
305}
306
307int
308open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
309 unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
310 unsigned int xid)
311{
312 int rc;
313 int oplock = 0;
314 __u16 netfid = 0;
315 struct tcon_link *tlink;
316 struct cifs_tcon *ptcon;
317 struct cifs_io_parms io_parms;
318 int buf_type = CIFS_NO_BUFFER;
319 FILE_ALL_INFO file_info;
320
321 tlink = cifs_sb_tlink(cifs_sb);
322 if (IS_ERR(tlink))
323 return PTR_ERR(tlink);
324 ptcon = tlink_tcon(tlink);
325 247
326 rc = CIFSSMBOpen(xid, ptcon, path, FILE_OPEN, GENERIC_READ, 248 if (rc)
327 CREATE_NOT_DIR, &netfid, &oplock, &file_info, 249 goto out;
328 cifs_sb->local_nls,
329 cifs_sb->mnt_cifs_flags &
330 CIFS_MOUNT_MAP_SPECIAL_CHR);
331 if (rc != 0) {
332 cifs_put_tlink(tlink);
333 return rc;
334 }
335 250
336 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { 251 if (bytes_read == 0) { /* not a symlink */
337 CIFSSMBClose(xid, ptcon, netfid); 252 rc = -EINVAL;
338 cifs_put_tlink(tlink); 253 goto out;
339 /* it's not a symlink */
340 return rc;
341 } 254 }
342 255
343 io_parms.netfid = netfid; 256 rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo);
344 io_parms.pid = current->tgid; 257out:
345 io_parms.tcon = ptcon; 258 kfree(buf);
346 io_parms.offset = 0;
347 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
348
349 rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
350 CIFSSMBClose(xid, ptcon, netfid);
351 cifs_put_tlink(tlink);
352 return rc; 259 return rc;
353} 260}
354 261
355
356int 262int
357CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, 263check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
358 struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, 264 struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
359 const unsigned char *path) 265 const unsigned char *path)
360{ 266{
361 int rc; 267 int rc;
362 u8 *buf = NULL; 268 u8 *buf = NULL;
363 unsigned int link_len = 0; 269 unsigned int link_len = 0;
364 unsigned int bytes_read = 0; 270 unsigned int bytes_read = 0;
365 271
366 if (!CIFSCouldBeMFSymlink(fattr)) 272 if (!couldbe_mf_symlink(fattr))
367 /* it's not a symlink */ 273 /* it's not a symlink */
368 return 0; 274 return 0;
369 275
@@ -372,8 +278,8 @@ CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon,
372 return -ENOMEM; 278 return -ENOMEM;
373 279
374 if (tcon->ses->server->ops->query_mf_symlink) 280 if (tcon->ses->server->ops->query_mf_symlink)
375 rc = tcon->ses->server->ops->query_mf_symlink(path, buf, 281 rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon,
376 &bytes_read, cifs_sb, xid); 282 cifs_sb, path, buf, &bytes_read);
377 else 283 else
378 rc = -ENOSYS; 284 rc = -ENOSYS;
379 285
@@ -383,7 +289,7 @@ CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon,
383 if (bytes_read == 0) /* not a symlink */ 289 if (bytes_read == 0) /* not a symlink */
384 goto out; 290 goto out;
385 291
386 rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL); 292 rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL);
387 if (rc == -EINVAL) { 293 if (rc == -EINVAL) {
388 /* it's not a symlink */ 294 /* it's not a symlink */
389 rc = 0; 295 rc = 0;
@@ -403,6 +309,95 @@ out:
403 return rc; 309 return rc;
404} 310}
405 311
312/*
313 * SMB 1.0 Protocol specific functions
314 */
315
316int
317cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
318 struct cifs_sb_info *cifs_sb, const unsigned char *path,
319 char *pbuf, unsigned int *pbytes_read)
320{
321 int rc;
322 int oplock = 0;
323 struct cifs_fid fid;
324 struct cifs_open_parms oparms;
325 struct cifs_io_parms io_parms;
326 int buf_type = CIFS_NO_BUFFER;
327 FILE_ALL_INFO file_info;
328
329 oparms.tcon = tcon;
330 oparms.cifs_sb = cifs_sb;
331 oparms.desired_access = GENERIC_READ;
332 oparms.create_options = CREATE_NOT_DIR;
333 oparms.disposition = FILE_OPEN;
334 oparms.path = path;
335 oparms.fid = &fid;
336 oparms.reconnect = false;
337
338 rc = CIFS_open(xid, &oparms, &oplock, &file_info);
339 if (rc)
340 return rc;
341
342 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE))
343 /* it's not a symlink */
344 goto out;
345
346 io_parms.netfid = fid.netfid;
347 io_parms.pid = current->tgid;
348 io_parms.tcon = tcon;
349 io_parms.offset = 0;
350 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
351
352 rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
353out:
354 CIFSSMBClose(xid, tcon, fid.netfid);
355 return rc;
356}
357
358int
359cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
360 struct cifs_sb_info *cifs_sb, const unsigned char *path,
361 char *pbuf, unsigned int *pbytes_written)
362{
363 int rc;
364 int oplock = 0;
365 struct cifs_fid fid;
366 struct cifs_open_parms oparms;
367 struct cifs_io_parms io_parms;
368 int create_options = CREATE_NOT_DIR;
369
370 if (backup_cred(cifs_sb))
371 create_options |= CREATE_OPEN_BACKUP_INTENT;
372
373 oparms.tcon = tcon;
374 oparms.cifs_sb = cifs_sb;
375 oparms.desired_access = GENERIC_WRITE;
376 oparms.create_options = create_options;
377 oparms.disposition = FILE_OPEN;
378 oparms.path = path;
379 oparms.fid = &fid;
380 oparms.reconnect = false;
381
382 rc = CIFS_open(xid, &oparms, &oplock, NULL);
383 if (rc)
384 return rc;
385
386 io_parms.netfid = fid.netfid;
387 io_parms.pid = current->tgid;
388 io_parms.tcon = tcon;
389 io_parms.offset = 0;
390 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
391
392 rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0);
393 CIFSSMBClose(xid, tcon, fid.netfid);
394 return rc;
395}
396
397/*
398 * M-F Symlink Functions - End
399 */
400
406int 401int
407cifs_hardlink(struct dentry *old_file, struct inode *inode, 402cifs_hardlink(struct dentry *old_file, struct inode *inode,
408 struct dentry *direntry) 403 struct dentry *direntry)
@@ -438,8 +433,10 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
438 CIFS_MOUNT_MAP_SPECIAL_CHR); 433 CIFS_MOUNT_MAP_SPECIAL_CHR);
439 else { 434 else {
440 server = tcon->ses->server; 435 server = tcon->ses->server;
441 if (!server->ops->create_hardlink) 436 if (!server->ops->create_hardlink) {
442 return -ENOSYS; 437 rc = -ENOSYS;
438 goto cifs_hl_exit;
439 }
443 rc = server->ops->create_hardlink(xid, tcon, from_name, to_name, 440 rc = server->ops->create_hardlink(xid, tcon, from_name, to_name,
444 cifs_sb); 441 cifs_sb);
445 if ((rc == -EIO) || (rc == -EINVAL)) 442 if ((rc == -EIO) || (rc == -EINVAL))
@@ -530,15 +527,10 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
530 * and fallback to UNIX Extensions Symlinks. 527 * and fallback to UNIX Extensions Symlinks.
531 */ 528 */
532 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) 529 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
533 rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path, 530 rc = query_mf_symlink(xid, tcon, cifs_sb, full_path,
534 cifs_sb->local_nls, 531 &target_path);
535 cifs_sb->mnt_cifs_flags &
536 CIFS_MOUNT_MAP_SPECIAL_CHR);
537 532
538 if ((rc != 0) && cap_unix(tcon->ses)) 533 if (rc != 0 && server->ops->query_symlink)
539 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
540 cifs_sb->local_nls);
541 else if (rc != 0 && server->ops->query_symlink)
542 rc = server->ops->query_symlink(xid, tcon, full_path, 534 rc = server->ops->query_symlink(xid, tcon, full_path,
543 &target_path, cifs_sb); 535 &target_path, cifs_sb);
544 536
@@ -587,8 +579,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
587 579
588 /* BB what if DFS and this volume is on different share? BB */ 580 /* BB what if DFS and this volume is on different share? BB */
589 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) 581 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
590 rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname, 582 rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
591 cifs_sb);
592 else if (pTcon->unix_ext) 583 else if (pTcon->unix_ext)
593 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, 584 rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
594 cifs_sb->local_nls); 585 cifs_sb->local_nls);
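
One subtle fix hiding among the renames in this file: the old CIFSCouldBeMFSymlink() tested fattr->cf_mode & S_IFREG, but the S_IFMT file-type values share bits, so that bit-test is also true for symlinks and sockets (S_IFREG is 0100000, S_IFLNK is 0120000, S_IFSOCK is 0140000). The new couldbe_mf_symlink() uses S_ISREG(), which compares the whole type field. A user-space demonstration:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        mode_t lnk = S_IFLNK | 0777;

        /* bit-test: wrongly "true", since S_IFLNK contains the S_IFREG bit */
        printf("mode & S_IFREG: %d\n", !!(lnk & S_IFREG));      /* prints 1 */

        /* field compare: correct */
        printf("S_ISREG(mode) : %d\n", !!S_ISREG(lnk));         /* prints 0 */
        return 0;
}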
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 5940ecabbe6a..b15862e0f68c 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -749,7 +749,7 @@ static int cifs_filldir(char *find_entry, struct file *file,
749 } 749 }
750 750
751 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) && 751 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
752 CIFSCouldBeMFSymlink(&fattr)) 752 couldbe_mf_symlink(&fattr))
753 /* 753 /*
754 * trying to get the type and mode can be slow, 754 * trying to get the type and mode can be slow,
755 * so just call those regular files for now, and mark 755 * so just call those regular files for now, and mark
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 5f5ba0dc2ee1..526fb89f9230 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -560,17 +560,24 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
560 if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) { 560 if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) {
561 int tmprc; 561 int tmprc;
562 int oplock = 0; 562 int oplock = 0;
563 __u16 netfid; 563 struct cifs_fid fid;
564 struct cifs_open_parms oparms;
565
566 oparms.tcon = tcon;
567 oparms.cifs_sb = cifs_sb;
568 oparms.desired_access = FILE_READ_ATTRIBUTES;
569 oparms.create_options = 0;
570 oparms.disposition = FILE_OPEN;
571 oparms.path = full_path;
572 oparms.fid = &fid;
573 oparms.reconnect = false;
564 574
565 /* Need to check if this is a symbolic link or not */ 575 /* Need to check if this is a symbolic link or not */
566 tmprc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, 576 tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
567 FILE_READ_ATTRIBUTES, 0, &netfid, &oplock,
568 NULL, cifs_sb->local_nls,
569 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
570 if (tmprc == -EOPNOTSUPP) 577 if (tmprc == -EOPNOTSUPP)
571 *symlink = true; 578 *symlink = true;
572 else 579 else
573 CIFSSMBClose(xid, tcon, netfid); 580 CIFSSMBClose(xid, tcon, fid.netfid);
574 } 581 }
575 582
576 return rc; 583 return rc;
@@ -705,12 +712,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
705 oparms->cifs_sb->local_nls, 712 oparms->cifs_sb->local_nls,
706 oparms->cifs_sb->mnt_cifs_flags 713 oparms->cifs_sb->mnt_cifs_flags
707 & CIFS_MOUNT_MAP_SPECIAL_CHR); 714 & CIFS_MOUNT_MAP_SPECIAL_CHR);
708 return CIFSSMBOpen(xid, oparms->tcon, oparms->path, 715 return CIFS_open(xid, oparms, oplock, buf);
709 oparms->disposition, oparms->desired_access,
710 oparms->create_options, &oparms->fid->netfid, oplock,
711 buf, oparms->cifs_sb->local_nls,
712 oparms->cifs_sb->mnt_cifs_flags &
713 CIFS_MOUNT_MAP_SPECIAL_CHR);
714} 716}
715 717
716static void 718static void
@@ -761,8 +763,9 @@ smb_set_file_info(struct inode *inode, const char *full_path,
761{ 763{
762 int oplock = 0; 764 int oplock = 0;
763 int rc; 765 int rc;
764 __u16 netfid;
765 __u32 netpid; 766 __u32 netpid;
767 struct cifs_fid fid;
768 struct cifs_open_parms oparms;
766 struct cifsFileInfo *open_file; 769 struct cifsFileInfo *open_file;
767 struct cifsInodeInfo *cinode = CIFS_I(inode); 770 struct cifsInodeInfo *cinode = CIFS_I(inode);
768 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 771 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -772,7 +775,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
772 /* if the file is already open for write, just use that fileid */ 775 /* if the file is already open for write, just use that fileid */
773 open_file = find_writable_file(cinode, true); 776 open_file = find_writable_file(cinode, true);
774 if (open_file) { 777 if (open_file) {
775 netfid = open_file->fid.netfid; 778 fid.netfid = open_file->fid.netfid;
776 netpid = open_file->pid; 779 netpid = open_file->pid;
777 tcon = tlink_tcon(open_file->tlink); 780 tcon = tlink_tcon(open_file->tlink);
778 goto set_via_filehandle; 781 goto set_via_filehandle;
@@ -796,12 +799,17 @@ smb_set_file_info(struct inode *inode, const char *full_path,
796 goto out; 799 goto out;
797 } 800 }
798 801
799 cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); 802 oparms.tcon = tcon;
800 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, 803 oparms.cifs_sb = cifs_sb;
801 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR, 804 oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES;
802 &netfid, &oplock, NULL, cifs_sb->local_nls, 805 oparms.create_options = CREATE_NOT_DIR;
803 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 806 oparms.disposition = FILE_OPEN;
807 oparms.path = full_path;
808 oparms.fid = &fid;
809 oparms.reconnect = false;
804 810
811 cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
812 rc = CIFS_open(xid, &oparms, &oplock, NULL);
805 if (rc != 0) { 813 if (rc != 0) {
806 if (rc == -EIO) 814 if (rc == -EIO)
807 rc = -EINVAL; 815 rc = -EINVAL;
@@ -811,12 +819,12 @@ smb_set_file_info(struct inode *inode, const char *full_path,
811 netpid = current->tgid; 819 netpid = current->tgid;
812 820
813set_via_filehandle: 821set_via_filehandle:
814 rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid); 822 rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid);
815 if (!rc) 823 if (!rc)
816 cinode->cifsAttrs = le32_to_cpu(buf->Attributes); 824 cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
817 825
818 if (open_file == NULL) 826 if (open_file == NULL)
819 CIFSSMBClose(xid, tcon, netfid); 827 CIFSSMBClose(xid, tcon, fid.netfid);
820 else 828 else
821 cifsFileInfo_put(open_file); 829 cifsFileInfo_put(open_file);
822out: 830out:
@@ -908,33 +916,80 @@ cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset,
908} 916}
909 917
910static int 918static int
919cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon,
920 const unsigned char *searchName, char **symlinkinfo,
921 const struct nls_table *nls_codepage)
922{
923#ifdef CONFIG_CIFS_DFS_UPCALL
924 int rc;
925 unsigned int num_referrals = 0;
926 struct dfs_info3_param *referrals = NULL;
927
928 rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage,
929 &num_referrals, &referrals, 0);
930
931 if (!rc && num_referrals > 0) {
932 *symlinkinfo = kstrndup(referrals->node_name,
933 strlen(referrals->node_name),
934 GFP_KERNEL);
935 if (!*symlinkinfo)
936 rc = -ENOMEM;
937 free_dfs_info_array(referrals, num_referrals);
938 }
939 return rc;
940#else /* No DFS support */
941 return -EREMOTE;
942#endif
943}
944
945static int
911cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, 946cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
912 const char *full_path, char **target_path, 947 const char *full_path, char **target_path,
913 struct cifs_sb_info *cifs_sb) 948 struct cifs_sb_info *cifs_sb)
914{ 949{
915 int rc; 950 int rc;
916 int oplock = 0; 951 int oplock = 0;
917 __u16 netfid; 952 struct cifs_fid fid;
953 struct cifs_open_parms oparms;
918 954
919 cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); 955 cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
920 956
921 rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, 957 /* Check for unix extensions */
922 FILE_READ_ATTRIBUTES, OPEN_REPARSE_POINT, &netfid, 958 if (cap_unix(tcon->ses)) {
923 &oplock, NULL, cifs_sb->local_nls, 959 rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path,
924 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 960 cifs_sb->local_nls);
961 if (rc == -EREMOTE)
962 rc = cifs_unix_dfs_readlink(xid, tcon, full_path,
963 target_path,
964 cifs_sb->local_nls);
965
966 goto out;
967 }
968
969 oparms.tcon = tcon;
970 oparms.cifs_sb = cifs_sb;
971 oparms.desired_access = FILE_READ_ATTRIBUTES;
972 oparms.create_options = OPEN_REPARSE_POINT;
973 oparms.disposition = FILE_OPEN;
974 oparms.path = full_path;
975 oparms.fid = &fid;
976 oparms.reconnect = false;
977
978 rc = CIFS_open(xid, &oparms, &oplock, NULL);
925 if (rc) 979 if (rc)
926 return rc; 980 goto out;
927 981
928 rc = CIFSSMBQuerySymLink(xid, tcon, netfid, target_path, 982 rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path,
929 cifs_sb->local_nls); 983 cifs_sb->local_nls);
930 if (rc) { 984 if (rc)
931 CIFSSMBClose(xid, tcon, netfid); 985 goto out_close;
932 return rc;
933 }
934 986
935 convert_delimiter(*target_path, '/'); 987 convert_delimiter(*target_path, '/');
936 CIFSSMBClose(xid, tcon, netfid); 988out_close:
937 cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); 989 CIFSSMBClose(xid, tcon, fid.netfid);
990out:
991 if (!rc)
992 cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
938 return rc; 993 return rc;
939} 994}
940 995
@@ -1009,8 +1064,18 @@ struct smb_version_operations smb1_operations = {
1009 .mand_lock = cifs_mand_lock, 1064 .mand_lock = cifs_mand_lock,
1010 .mand_unlock_range = cifs_unlock_range, 1065 .mand_unlock_range = cifs_unlock_range,
1011 .push_mand_locks = cifs_push_mandatory_locks, 1066 .push_mand_locks = cifs_push_mandatory_locks,
1012 .query_mf_symlink = open_query_close_cifs_symlink, 1067 .query_mf_symlink = cifs_query_mf_symlink,
1068 .create_mf_symlink = cifs_create_mf_symlink,
1013 .is_read_op = cifs_is_read_op, 1069 .is_read_op = cifs_is_read_op,
1070#ifdef CONFIG_CIFS_XATTR
1071 .query_all_EAs = CIFSSMBQAllEAs,
1072 .set_EA = CIFSSMBSetEA,
1073#endif /* CIFS_XATTR */
1074#ifdef CONFIG_CIFS_ACL
1075 .get_acl = get_cifs_acl,
1076 .get_acl_by_fid = get_cifs_acl_by_fid,
1077 .set_acl = set_cifs_acl,
1078#endif /* CIFS_ACL */
1014}; 1079};
1015 1080
1016struct smb_version_values smb1_values = { 1081struct smb_version_values smb1_values = {
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index c38350851b08..bc0bb9c34f72 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -57,4 +57,7 @@
57#define SMB2_CMACAES_SIZE (16) 57#define SMB2_CMACAES_SIZE (16)
58#define SMB3_SIGNKEY_SIZE (16) 58#define SMB3_SIGNKEY_SIZE (16)
59 59
60/* Maximum buffer size value we can send with 1 credit */
61#define SMB2_MAX_BUFFER_SIZE 65536
62
60#endif /* _SMB2_GLOB_H */ 63#endif /* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 757da3e54d3d..192f51a12cf1 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -182,11 +182,8 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
182 /* start with specified wsize, or default */ 182 /* start with specified wsize, or default */
183 wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; 183 wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
184 wsize = min_t(unsigned int, wsize, server->max_write); 184 wsize = min_t(unsigned int, wsize, server->max_write);
185 /* 185 /* set it to the maximum buffer size value we can send with 1 credit */
186 * limit write size to 2 ** 16, because we don't support multicredit 186 wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
187 * requests now.
188 */
189 wsize = min_t(unsigned int, wsize, 2 << 15);
190 187
191 return wsize; 188 return wsize;
192} 189}
@@ -200,11 +197,8 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
200 /* start with specified rsize, or default */ 197 /* start with specified rsize, or default */
201 rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; 198 rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
202 rsize = min_t(unsigned int, rsize, server->max_read); 199 rsize = min_t(unsigned int, rsize, server->max_read);
203 /* 200 /* set it to the maximum buffer size value we can send with 1 credit */
204 * limit write size to 2 ** 16, because we don't support multicredit 201 rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
205 * requests now.
206 */
207 rsize = min_t(unsigned int, rsize, 2 << 15);
208 202
209 return rsize; 203 return rsize;
210} 204}
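
The old magic number and the new macro are numerically identical, so these two hunks only name the limit; the behavioral change comes in smb2pdu.c below, where server->maxBuf is now clamped to the same single-credit bound. Quick check of the arithmetic:

#include <assert.h>

#define SMB2_MAX_BUFFER_SIZE 65536      /* from smb2glob.h above */

int main(void)
{
        assert((2 << 15) == SMB2_MAX_BUFFER_SIZE);      /* 2 * 32768 */
        return 0;
}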
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 2013234b73ad..860344701067 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -413,7 +413,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
413 413
414 /* SMB2 only has an extended negflavor */ 414 /* SMB2 only has an extended negflavor */
415 server->negflavor = CIFS_NEGFLAVOR_EXTENDED; 415 server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
416 server->maxBuf = le32_to_cpu(rsp->MaxTransactSize); 416 /* set it to the maximum buffer size value we can send with 1 credit */
417 server->maxBuf = min_t(unsigned int, le32_to_cpu(rsp->MaxTransactSize),
418 SMB2_MAX_BUFFER_SIZE);
417 server->max_read = le32_to_cpu(rsp->MaxReadSize); 419 server->max_read = le32_to_cpu(rsp->MaxReadSize);
418 server->max_write = le32_to_cpu(rsp->MaxWriteSize); 420 server->max_write = le32_to_cpu(rsp->MaxWriteSize);
419 /* BB Do we need to validate the SecurityMode? */ 421 /* BB Do we need to validate the SecurityMode? */
@@ -1890,7 +1892,8 @@ smb2_writev_callback(struct mid_q_entry *mid)
1890 1892
1891/* smb2_async_writev - send an async write, and set up mid to handle result */ 1893/* smb2_async_writev - send an async write, and set up mid to handle result */
1892int 1894int
1893smb2_async_writev(struct cifs_writedata *wdata) 1895smb2_async_writev(struct cifs_writedata *wdata,
1896 void (*release)(struct kref *kref))
1894{ 1897{
1895 int rc = -EACCES; 1898 int rc = -EACCES;
1896 struct smb2_write_req *req = NULL; 1899 struct smb2_write_req *req = NULL;
@@ -1938,7 +1941,7 @@ smb2_async_writev(struct cifs_writedata *wdata)
1938 smb2_writev_callback, wdata, 0); 1941 smb2_writev_callback, wdata, 0);
1939 1942
1940 if (rc) { 1943 if (rc) {
1941 kref_put(&wdata->refcount, cifs_writedata_release); 1944 kref_put(&wdata->refcount, release);
1942 cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); 1945 cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
1943 } 1946 }
1944 1947
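
The extra function pointer lets smb2_async_writev() stop hardcoding cifs_writedata_release: on a send failure it must drop the reference it was handed using the same destructor the caller's own paths use. A toy model of the pattern (struct ref and put_ref() stand in for the kernel's struct kref and kref_put()):

#include <stdio.h>

struct ref { int count; };

static void put_ref(struct ref *r, void (*release)(struct ref *))
{
        if (--r->count == 0)
                release(r);
}

static void writedata_release(struct ref *r)
{
        (void)r;
        printf("freed via the caller-supplied destructor\n");
}

/* the sender's error path releases with whatever dtor the caller uses */
static int async_writev(struct ref *wdata, void (*release)(struct ref *))
{
        int rc = -1;                    /* pretend the send failed */

        if (rc)
                put_ref(wdata, release);
        return rc;
}

int main(void)
{
        struct ref wdata = { 1 };
        return async_writev(&wdata, writedata_release) ? 1 : 0;
}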
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 93adc64666f3..0ce48db20a65 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -123,7 +123,8 @@ extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
123extern int smb2_async_readv(struct cifs_readdata *rdata); 123extern int smb2_async_readv(struct cifs_readdata *rdata);
124extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, 124extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
125 unsigned int *nbytes, char **buf, int *buf_type); 125 unsigned int *nbytes, char **buf, int *buf_type);
126extern int smb2_async_writev(struct cifs_writedata *wdata); 126extern int smb2_async_writev(struct cifs_writedata *wdata,
127 void (*release)(struct kref *kref));
127extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, 128extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
128 unsigned int *nbytes, struct kvec *iov, int n_vec); 129 unsigned int *nbytes, struct kvec *iov, int n_vec);
129extern int SMB2_echo(struct TCP_Server_Info *server); 130extern int SMB2_echo(struct TCP_Server_Info *server);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index b37570952846..18cd5650a5fc 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -270,6 +270,26 @@ cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
270 iov->iov_len = rqst->rq_pagesz; 270 iov->iov_len = rqst->rq_pagesz;
271} 271}
272 272
273static unsigned long
274rqst_len(struct smb_rqst *rqst)
275{
276 unsigned int i;
277 struct kvec *iov = rqst->rq_iov;
278 unsigned long buflen = 0;
279
280 /* total up iov array first */
281 for (i = 0; i < rqst->rq_nvec; i++)
282 buflen += iov[i].iov_len;
283
284 /* add in the page array if there is one */
285 if (rqst->rq_npages) {
286 buflen += rqst->rq_pagesz * (rqst->rq_npages - 1);
287 buflen += rqst->rq_tailsz;
288 }
289
290 return buflen;
291}
292
273static int 293static int
274smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) 294smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
275{ 295{
@@ -277,6 +297,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
277 struct kvec *iov = rqst->rq_iov; 297 struct kvec *iov = rqst->rq_iov;
278 int n_vec = rqst->rq_nvec; 298 int n_vec = rqst->rq_nvec;
279 unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base); 299 unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base);
300 unsigned long send_length;
280 unsigned int i; 301 unsigned int i;
281 size_t total_len = 0, sent; 302 size_t total_len = 0, sent;
282 struct socket *ssocket = server->ssocket; 303 struct socket *ssocket = server->ssocket;
@@ -285,6 +306,14 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
285 if (ssocket == NULL) 306 if (ssocket == NULL)
286 return -ENOTSOCK; 307 return -ENOTSOCK;
287 308
309 /* sanity check send length */
310 send_length = rqst_len(rqst);
311 if (send_length != smb_buf_length + 4) {
312 WARN(1, "Send length mismatch(send_length=%lu smb_buf_length=%u)\n",
313 send_length, smb_buf_length);
314 return -EIO;
315 }
316
288 cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length); 317 cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length);
289 dump_smb(iov[0].iov_base, iov[0].iov_len); 318 dump_smb(iov[0].iov_base, iov[0].iov_len);
290 319
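
The new guard computes the marshalled length the same way the send loop will consume it — every iovec, plus full pages for all but the last page, plus the tail — and insists it match the RFC1002 length field plus 4, since the 4-byte NetBIOS session header sits in iov[0] but is excluded from the length it announces. A worked example of the arithmetic (sizes invented for illustration):

#include <stdio.h>

int main(void)
{
        unsigned long iov_len = 4 + 68; /* RFC1002 header + SMB header/parms */
        unsigned long pagesz = 4096, npages = 2, tailsz = 100;

        /* mirrors rqst_len(): full pages except the last, plus the tail */
        unsigned long send_length = iov_len + pagesz * (npages - 1) + tailsz;

        /* the RFC1002 length field excludes its own 4-byte header */
        printf("send_length=%lu rfc1002_len=%lu\n",
               send_length, send_length - 4);   /* 4268 and 4264 */
        return 0;
}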
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 09afda4cc58e..5ac836a86b18 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -82,9 +82,11 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
82 goto remove_ea_exit; 82 goto remove_ea_exit;
83 83
84 ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ 84 ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
85 rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, 85 if (pTcon->ses->server->ops->set_EA)
86 (__u16)0, cifs_sb->local_nls, 86 rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
87 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 87 full_path, ea_name, NULL, (__u16)0,
88 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
89 CIFS_MOUNT_MAP_SPECIAL_CHR);
88 } 90 }
89remove_ea_exit: 91remove_ea_exit:
90 kfree(full_path); 92 kfree(full_path);
@@ -149,18 +151,22 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
149 cifs_dbg(FYI, "attempt to set cifs inode metadata\n"); 151 cifs_dbg(FYI, "attempt to set cifs inode metadata\n");
150 152
151 ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ 153 ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
152 rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, 154 if (pTcon->ses->server->ops->set_EA)
153 (__u16)value_size, cifs_sb->local_nls, 155 rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
154 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 156 full_path, ea_name, ea_value, (__u16)value_size,
157 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
158 CIFS_MOUNT_MAP_SPECIAL_CHR);
155 } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) 159 } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
156 == 0) { 160 == 0) {
157 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 161 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
158 goto set_ea_exit; 162 goto set_ea_exit;
159 163
160 ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ 164 ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
161 rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, 165 if (pTcon->ses->server->ops->set_EA)
162 (__u16)value_size, cifs_sb->local_nls, 166 rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
163 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 167 full_path, ea_name, ea_value, (__u16)value_size,
168 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
169 CIFS_MOUNT_MAP_SPECIAL_CHR);
164 } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, 170 } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
165 strlen(CIFS_XATTR_CIFS_ACL)) == 0) { 171 strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
166#ifdef CONFIG_CIFS_ACL 172#ifdef CONFIG_CIFS_ACL
@@ -170,8 +176,12 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
170 rc = -ENOMEM; 176 rc = -ENOMEM;
171 } else { 177 } else {
172 memcpy(pacl, ea_value, value_size); 178 memcpy(pacl, ea_value, value_size);
173 rc = set_cifs_acl(pacl, value_size, 179 if (pTcon->ses->server->ops->set_acl)
174 direntry->d_inode, full_path, CIFS_ACL_DACL); 180 rc = pTcon->ses->server->ops->set_acl(pacl,
181 value_size, direntry->d_inode,
182 full_path, CIFS_ACL_DACL);
183 else
184 rc = -EOPNOTSUPP;
175 if (rc == 0) /* force revalidate of the inode */ 185 if (rc == 0) /* force revalidate of the inode */
176 CIFS_I(direntry->d_inode)->time = 0; 186 CIFS_I(direntry->d_inode)->time = 0;
177 kfree(pacl); 187 kfree(pacl);
@@ -272,17 +282,21 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
272 /* revalidate/getattr then populate from inode */ 282 /* revalidate/getattr then populate from inode */
273 } /* BB add else when above is implemented */ 283 } /* BB add else when above is implemented */
274 ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ 284 ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
275 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, 285 if (pTcon->ses->server->ops->query_all_EAs)
276 buf_size, cifs_sb->local_nls, 286 rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
277 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 287 full_path, ea_name, ea_value, buf_size,
288 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
289 CIFS_MOUNT_MAP_SPECIAL_CHR);
278 } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { 290 } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
279 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 291 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
280 goto get_ea_exit; 292 goto get_ea_exit;
281 293
282 ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ 294 ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
283 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, 295 if (pTcon->ses->server->ops->query_all_EAs)
284 buf_size, cifs_sb->local_nls, 296 rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
285 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 297 full_path, ea_name, ea_value, buf_size,
298 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
299 CIFS_MOUNT_MAP_SPECIAL_CHR);
286 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, 300 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
287 strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { 301 strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
288#ifdef CONFIG_CIFS_POSIX 302#ifdef CONFIG_CIFS_POSIX
@@ -313,8 +327,11 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
313 u32 acllen; 327 u32 acllen;
314 struct cifs_ntsd *pacl; 328 struct cifs_ntsd *pacl;
315 329
316 pacl = get_cifs_acl(cifs_sb, direntry->d_inode, 330 if (pTcon->ses->server->ops->get_acl == NULL)
317 full_path, &acllen); 331 goto get_ea_exit; /* rc already EOPNOTSUPP */
332
333 pacl = pTcon->ses->server->ops->get_acl(cifs_sb,
334 direntry->d_inode, full_path, &acllen);
318 if (IS_ERR(pacl)) { 335 if (IS_ERR(pacl)) {
319 rc = PTR_ERR(pacl); 336 rc = PTR_ERR(pacl);
320 cifs_dbg(VFS, "%s: error %zd getting sec desc\n", 337 cifs_dbg(VFS, "%s: error %zd getting sec desc\n",
@@ -400,11 +417,12 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
400 /* if proc/fs/cifs/streamstoxattr is set then 417 /* if proc/fs/cifs/streamstoxattr is set then
401 search server for EAs or streams to 418 search server for EAs or streams to
402 returns as xattrs */ 419 returns as xattrs */
403 rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
404 buf_size, cifs_sb->local_nls,
405 cifs_sb->mnt_cifs_flags &
406 CIFS_MOUNT_MAP_SPECIAL_CHR);
407 420
421 if (pTcon->ses->server->ops->query_all_EAs)
422 rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
423 full_path, NULL, data, buf_size,
424 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
425 CIFS_MOUNT_MAP_SPECIAL_CHR);
408list_ea_exit: 426list_ea_exit:
409 kfree(full_path); 427 kfree(full_path);
410 free_xid(xid); 428 free_xid(xid);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index dc52e13d58e0..3881610b6438 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
680 struct i2c_msg __user *tmsgs; 680 struct i2c_msg __user *tmsgs;
681 struct i2c_msg32 __user *umsgs; 681 struct i2c_msg32 __user *umsgs;
682 compat_caddr_t datap; 682 compat_caddr_t datap;
683 int nmsgs, i; 683 u32 nmsgs;
684 int i;
684 685
685 if (get_user(nmsgs, &udata->nmsgs)) 686 if (get_user(nmsgs, &udata->nmsgs))
686 return -EFAULT; 687 return -EFAULT;
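
Reading nmsgs into a signed int was a signedness hazard: the field is a u32 supplied by user space, and a negative value would sail past a later "nmsgs > I2C_RDRW_IOCTL_MAX_MSGS"-style bounds check (the constant 42 below is the limit from i2c-dev, quoted from memory). A demonstration of the bypass:

#include <stdio.h>
#include <stdint.h>

#define I2C_RDRW_IOCTL_MAX_MSGS 42

int main(void)
{
        uint32_t raw = 0x80000001u;     /* hostile count from user space */
        int as_int = (int)raw;          /* -2147483647 on two's complement */
        uint32_t as_u32 = raw;

        printf("int rejected: %d\n", as_int > I2C_RDRW_IOCTL_MAX_MSGS); /* 0 (!) */
        printf("u32 rejected: %d\n", as_u32 > I2C_RDRW_IOCTL_MAX_MSGS); /* 1 */
        return 0;
}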
diff --git a/fs/coredump.c b/fs/coredump.c
index bc3fbcd32558..e3ad709a4232 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -40,7 +40,6 @@
40 40
41#include <trace/events/task.h> 41#include <trace/events/task.h>
42#include "internal.h" 42#include "internal.h"
43#include "coredump.h"
44 43
45#include <trace/events/sched.h> 44#include <trace/events/sched.h>
46 45
diff --git a/fs/coredump.h b/fs/coredump.h
deleted file mode 100644
index e39ff072110d..000000000000
--- a/fs/coredump.h
+++ /dev/null
@@ -1,6 +0,0 @@
1#ifndef _FS_COREDUMP_H
2#define _FS_COREDUMP_H
3
4extern int __get_dumpable(unsigned long mm_flags);
5
6#endif
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index e501ac3a49ff..06610cf94d57 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -17,14 +17,30 @@
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/cramfs_fs.h>
21#include <linux/slab.h> 20#include <linux/slab.h>
22#include <linux/cramfs_fs_sb.h>
23#include <linux/vfs.h> 21#include <linux/vfs.h>
24#include <linux/mutex.h> 22#include <linux/mutex.h>
25 23#include <uapi/linux/cramfs_fs.h>
26#include <asm/uaccess.h> 24#include <asm/uaccess.h>
27 25
26#include "internal.h"
27
28/*
29 * cramfs super-block data in memory
30 */
31struct cramfs_sb_info {
32 unsigned long magic;
33 unsigned long size;
34 unsigned long blocks;
35 unsigned long files;
36 unsigned long flags;
37};
38
39static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb)
40{
41 return sb->s_fs_info;
42}
43
28static const struct super_operations cramfs_ops; 44static const struct super_operations cramfs_ops;
29static const struct inode_operations cramfs_dir_inode_operations; 45static const struct inode_operations cramfs_dir_inode_operations;
30static const struct file_operations cramfs_directory_operations; 46static const struct file_operations cramfs_directory_operations;
@@ -219,10 +235,11 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
219 return read_buffers[buffer] + offset; 235 return read_buffers[buffer] + offset;
220} 236}
221 237
222static void cramfs_put_super(struct super_block *sb) 238static void cramfs_kill_sb(struct super_block *sb)
223{ 239{
224 kfree(sb->s_fs_info); 240 struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
225 sb->s_fs_info = NULL; 241 kill_block_super(sb);
242 kfree(sbi);
226} 243}
227 244
228static int cramfs_remount(struct super_block *sb, int *flags, char *data) 245static int cramfs_remount(struct super_block *sb, int *flags, char *data)
@@ -261,7 +278,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
261 if (super.magic == CRAMFS_MAGIC_WEND) { 278 if (super.magic == CRAMFS_MAGIC_WEND) {
262 if (!silent) 279 if (!silent)
263 printk(KERN_ERR "cramfs: wrong endianness\n"); 280 printk(KERN_ERR "cramfs: wrong endianness\n");
264 goto out; 281 return -EINVAL;
265 } 282 }
266 283
267 /* check at 512 byte offset */ 284 /* check at 512 byte offset */
@@ -273,20 +290,20 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
273 printk(KERN_ERR "cramfs: wrong endianness\n"); 290 printk(KERN_ERR "cramfs: wrong endianness\n");
274 else if (!silent) 291 else if (!silent)
275 printk(KERN_ERR "cramfs: wrong magic\n"); 292 printk(KERN_ERR "cramfs: wrong magic\n");
276 goto out; 293 return -EINVAL;
277 } 294 }
278 } 295 }
279 296
280 /* get feature flags first */ 297 /* get feature flags first */
281 if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { 298 if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) {
282 printk(KERN_ERR "cramfs: unsupported filesystem features\n"); 299 printk(KERN_ERR "cramfs: unsupported filesystem features\n");
283 goto out; 300 return -EINVAL;
284 } 301 }
285 302
286 /* Check that the root inode is in a sane state */ 303 /* Check that the root inode is in a sane state */
287 if (!S_ISDIR(super.root.mode)) { 304 if (!S_ISDIR(super.root.mode)) {
288 printk(KERN_ERR "cramfs: root is not a directory\n"); 305 printk(KERN_ERR "cramfs: root is not a directory\n");
289 goto out; 306 return -EINVAL;
290 } 307 }
291 /* correct strange, hard-coded permissions of mkcramfs */ 308 /* correct strange, hard-coded permissions of mkcramfs */
292 super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); 309 super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
@@ -310,22 +327,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
310 (root_offset != 512 + sizeof(struct cramfs_super)))) 327 (root_offset != 512 + sizeof(struct cramfs_super))))
311 { 328 {
312 printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); 329 printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset);
313 goto out; 330 return -EINVAL;
314 } 331 }
315 332
316 /* Set it all up.. */ 333 /* Set it all up.. */
317 sb->s_op = &cramfs_ops; 334 sb->s_op = &cramfs_ops;
318 root = get_cramfs_inode(sb, &super.root, 0); 335 root = get_cramfs_inode(sb, &super.root, 0);
319 if (IS_ERR(root)) 336 if (IS_ERR(root))
320 goto out; 337 return PTR_ERR(root);
321 sb->s_root = d_make_root(root); 338 sb->s_root = d_make_root(root);
322 if (!sb->s_root) 339 if (!sb->s_root)
323 goto out; 340 return -ENOMEM;
324 return 0; 341 return 0;
325out:
326 kfree(sbi);
327 sb->s_fs_info = NULL;
328 return -EINVAL;
329} 342}
330 343
331static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) 344static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -550,7 +563,6 @@ static const struct inode_operations cramfs_dir_inode_operations = {
550}; 563};
551 564
552static const struct super_operations cramfs_ops = { 565static const struct super_operations cramfs_ops = {
553 .put_super = cramfs_put_super,
554 .remount_fs = cramfs_remount, 566 .remount_fs = cramfs_remount,
555 .statfs = cramfs_statfs, 567 .statfs = cramfs_statfs,
556}; 568};
@@ -565,7 +577,7 @@ static struct file_system_type cramfs_fs_type = {
565 .owner = THIS_MODULE, 577 .owner = THIS_MODULE,
566 .name = "cramfs", 578 .name = "cramfs",
567 .mount = cramfs_mount, 579 .mount = cramfs_mount,
568 .kill_sb = kill_block_super, 580 .kill_sb = cramfs_kill_sb,
569 .fs_flags = FS_REQUIRES_DEV, 581 .fs_flags = FS_REQUIRES_DEV,
570}; 582};
571MODULE_ALIAS_FS("cramfs"); 583MODULE_ALIAS_FS("cramfs");
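
Moving the kfree() from .put_super to a cramfs-specific .kill_sb fixes two problems at once: the generic teardown in kill_block_super() may still look at sb->s_fs_info, so the private data must outlive it, and the fill_super error paths no longer need to free sbi themselves (kill_sb runs on those paths too, so the early "return -EINVAL"s above free it exactly once). The resulting pattern, restated with comments:

static void cramfs_kill_sb(struct super_block *sb)
{
        struct cramfs_sb_info *sbi = CRAMFS_SB(sb);

        kill_block_super(sb);   /* generic teardown; may still use s_fs_info */
        kfree(sbi);             /* safe now: nothing can reach the sb */
}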
diff --git a/fs/cramfs/internal.h b/fs/cramfs/internal.h
new file mode 100644
index 000000000000..349d71272157
--- /dev/null
+++ b/fs/cramfs/internal.h
@@ -0,0 +1,4 @@
1/* Uncompression interfaces to the underlying zlib */
2int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen);
3int cramfs_uncompress_init(void);
4void cramfs_uncompress_exit(void);
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index 023329800d2e..1760c1b84d97 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -19,7 +19,7 @@
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
21#include <linux/zlib.h> 21#include <linux/zlib.h>
22#include <linux/cramfs_fs.h> 22#include "internal.h"
23 23
24static z_stream stream; 24static z_stream stream;
25static int initialized; 25static int initialized;
diff --git a/fs/dcache.c b/fs/dcache.c
index 6055d61811d3..ca02c13a84aa 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2833,9 +2833,9 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
2833 u32 dlen = ACCESS_ONCE(name->len); 2833 u32 dlen = ACCESS_ONCE(name->len);
2834 char *p; 2834 char *p;
2835 2835
2836 if (*buflen < dlen + 1)
2837 return -ENAMETOOLONG;
2838 *buflen -= dlen + 1; 2836 *buflen -= dlen + 1;
2837 if (*buflen < 0)
2838 return -ENAMETOOLONG;
2839 p = *buffer -= dlen + 1; 2839 p = *buffer -= dlen + 1;
2840 *p++ = '/'; 2840 *p++ = '/';
2841 while (dlen--) { 2841 while (dlen--) {
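
The reordered check in prepend_name() is about mixed signed/unsigned comparison: dlen is u32, so in "*buflen < dlen + 1" a negative *buflen is converted to a huge unsigned value and the overflow is never reported — and with the restart logic in this file the function can genuinely be entered with a negative count. Subtracting first and then testing "*buflen < 0" keeps the comparison purely signed. The failure mode in miniature (two's complement assumed, as the kernel does):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int buflen = -5;        /* an earlier prepend already overflowed */
        uint32_t dlen = 3;

        /* old check: buflen is converted to unsigned (4294967291 < 4) */
        printf("old check fires: %d\n", buflen < dlen + 1);     /* 0 (!) */

        /* new order: subtract, then a purely signed test */
        buflen -= dlen + 1;
        printf("new check fires: %d\n", buflen < 0);            /* 1 */
        return 0;
}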
@@ -3061,8 +3061,13 @@ char *d_path(const struct path *path, char *buf, int buflen)
3061 * thus don't need to be hashed. They also don't need a name until a 3061 * thus don't need to be hashed. They also don't need a name until a
3062 * user wants to identify the object in /proc/pid/fd/. The little hack 3062 * user wants to identify the object in /proc/pid/fd/. The little hack
3063 * below allows us to generate a name for these objects on demand: 3063 * below allows us to generate a name for these objects on demand:
3064 *
3065 * Some pseudo inodes are mountable. When they are mounted
3066 * path->dentry == path->mnt->mnt_root. In that case don't call d_dname
3067 * and instead have d_path return the mounted path.
3064 */ 3068 */
3065 if (path->dentry->d_op && path->dentry->d_op->d_dname) 3069 if (path->dentry->d_op && path->dentry->d_op->d_dname &&
3070 (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
3066 return path->dentry->d_op->d_dname(path->dentry, buf, buflen); 3071 return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
3067 3072
3068 rcu_read_lock(); 3073 rcu_read_lock();
@@ -3111,26 +3116,28 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
3111/* 3116/*
3112 * Write full pathname from the root of the filesystem into the buffer. 3117 * Write full pathname from the root of the filesystem into the buffer.
3113 */ 3118 */
3114static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) 3119static char *__dentry_path(struct dentry *d, char *buf, int buflen)
3115{ 3120{
3121 struct dentry *dentry;
3116 char *end, *retval; 3122 char *end, *retval;
3117 int len, seq = 0; 3123 int len, seq = 0;
3118 int error = 0; 3124 int error = 0;
3119 3125
3126 if (buflen < 2)
3127 goto Elong;
3128
3120 rcu_read_lock(); 3129 rcu_read_lock();
3121restart: 3130restart:
3131 dentry = d;
3122 end = buf + buflen; 3132 end = buf + buflen;
3123 len = buflen; 3133 len = buflen;
3124 prepend(&end, &len, "\0", 1); 3134 prepend(&end, &len, "\0", 1);
3125 if (buflen < 1)
3126 goto Elong;
3127 /* Get '/' right */ 3135 /* Get '/' right */
3128 retval = end-1; 3136 retval = end-1;
3129 *retval = '/'; 3137 *retval = '/';
3130 read_seqbegin_or_lock(&rename_lock, &seq); 3138 read_seqbegin_or_lock(&rename_lock, &seq);
3131 while (!IS_ROOT(dentry)) { 3139 while (!IS_ROOT(dentry)) {
3132 struct dentry *parent = dentry->d_parent; 3140 struct dentry *parent = dentry->d_parent;
3133 int error;
3134 3141
3135 prefetch(parent); 3142 prefetch(parent);
3136 error = prepend_name(&end, &len, &dentry->d_name); 3143 error = prepend_name(&end, &len, &dentry->d_name);
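
The prepend_name() reordering matters because *buflen is a signed running budget that earlier prepend() calls may already have driven negative; the old test compared that signed value against the unsigned dlen + 1, which promotes the negative side to a huge unsigned number and misses the overflow. Subtracting first and then testing the sign keeps the arithmetic signed throughout. The companion __dentry_path() hunk adds an up-front buflen < 2 guard so a buffer too small for even "/" plus the NUL is rejected before any work is done. A runnable userspace demonstration of the promotion bug (all names here are ours):

#include <stdio.h>

int main(void)
{
	int buflen = -4;	/* budget already overdrawn by a caller */
	unsigned int dlen = 3;

	/* Old form: the signed buflen is converted to unsigned for the
	 * comparison, -4 becomes 0xfffffffc and the test is false. */
	if (buflen < dlen + 1)
		puts("overflow caught (not what happens)");
	else
		puts("overflow missed (the bug)");

	/* Fixed form: stay in signed arithmetic, then test the sign. */
	buflen -= dlen + 1;
	if (buflen < 0)
		puts("overflow caught by subtract-then-test");
	return 0;
}
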
diff --git a/fs/dcookies.c b/fs/dcookies.c
index ab5954b50267..ac44a69fbea9 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -204,7 +204,7 @@ out:
204} 204}
205 205
206#ifdef CONFIG_COMPAT 206#ifdef CONFIG_COMPAT
207COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len) 207COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len)
208{ 208{
209#ifdef __BIG_ENDIAN 209#ifdef __BIG_ENDIAN
210 return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); 210 return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len);
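
Two separate fixes hide in this one-liner: len is now typed compat_size_t, so the wrapper only trusts the 32 bits a compat caller can actually supply, and the 64-bit cookie arrives split across two 32-bit register words that must be rejoined (in the __BIG_ENDIAN branch shown, w0 carries the high half; the little-endian branch swaps the roles). A runnable sketch of the reassembly:

#include <stdio.h>
#include <stdint.h>

/* Rebuild a u64 from the two u32 words of a compat syscall. */
static uint64_t join_words(uint32_t hi, uint32_t lo)
{
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	printf("cookie = 0x%llx\n",	/* 0x123456789abcdef0 */
	       (unsigned long long)join_words(0x12345678u, 0x9abcdef0u));
	return 0;
}
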
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0e04142d5962..160a5489a939 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -375,7 +375,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
375 bio = bio_alloc(GFP_KERNEL, nr_vecs); 375 bio = bio_alloc(GFP_KERNEL, nr_vecs);
376 376
377 bio->bi_bdev = bdev; 377 bio->bi_bdev = bdev;
378 bio->bi_sector = first_sector; 378 bio->bi_iter.bi_sector = first_sector;
379 if (dio->is_async) 379 if (dio->is_async)
380 bio->bi_end_io = dio_bio_end_aio; 380 bio->bi_end_io = dio_bio_end_aio;
381 else 381 else
@@ -719,7 +719,7 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
719 if (sdio->bio) { 719 if (sdio->bio) {
720 loff_t cur_offset = sdio->cur_page_fs_offset; 720 loff_t cur_offset = sdio->cur_page_fs_offset;
721 loff_t bio_next_offset = sdio->logical_offset_in_bio + 721 loff_t bio_next_offset = sdio->logical_offset_in_bio +
722 sdio->bio->bi_size; 722 sdio->bio->bi_iter.bi_size;
723 723
724 /* 724 /*
725 * See whether this new request is contiguous with the old. 725 * See whether this new request is contiguous with the old.
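
Both direct-io.c hunks are mechanical conversions for the immutable-biovec work merged in 3.14: the fields that describe a bio's current position moved into the embedded struct bvec_iter at bio->bi_iter, so bi_sector and bi_size become bi_iter.bi_sector and bi_iter.bi_size. A before/after sketch of the rename, nothing more:

bio = bio_alloc(GFP_KERNEL, nr_vecs);
bio->bi_bdev = bdev;

/* <= 3.13: bio->bi_sector = first_sector;	*/
bio->bi_iter.bi_sector = first_sector;

/* <= 3.13: bytes = bio->bi_size;		*/
bytes = bio->bi_iter.bi_size;
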
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d90909ec6aa6..3190ca973dd6 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -649,6 +649,7 @@ static void process_sctp_notification(struct connection *con,
649 struct msghdr *msg, char *buf) 649 struct msghdr *msg, char *buf)
650{ 650{
651 union sctp_notification *sn = (union sctp_notification *)buf; 651 union sctp_notification *sn = (union sctp_notification *)buf;
652 struct linger linger;
652 653
653 switch (sn->sn_header.sn_type) { 654 switch (sn->sn_header.sn_type) {
654 case SCTP_SEND_FAILED: 655 case SCTP_SEND_FAILED:
@@ -713,11 +714,11 @@ static void process_sctp_notification(struct connection *con,
713 return; 714 return;
714 715
715 /* Peel off a new sock */ 716 /* Peel off a new sock */
716 sctp_lock_sock(con->sock->sk); 717 lock_sock(con->sock->sk);
717 ret = sctp_do_peeloff(con->sock->sk, 718 ret = sctp_do_peeloff(con->sock->sk,
718 sn->sn_assoc_change.sac_assoc_id, 719 sn->sn_assoc_change.sac_assoc_id,
719 &new_con->sock); 720 &new_con->sock);
720 sctp_release_sock(con->sock->sk); 721 release_sock(con->sock->sk);
721 if (ret < 0) { 722 if (ret < 0) {
722 log_print("Can't peel off a socket for " 723 log_print("Can't peel off a socket for "
723 "connection %d to node %d: err=%d", 724 "connection %d to node %d: err=%d",
@@ -727,6 +728,13 @@ static void process_sctp_notification(struct connection *con,
727 } 728 }
728 add_sock(new_con->sock, new_con); 729 add_sock(new_con->sock, new_con);
729 730
731 linger.l_onoff = 1;
732 linger.l_linger = 0;
733 ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER,
734 (char *)&linger, sizeof(linger));
735 if (ret < 0)
736 log_print("set socket option SO_LINGER failed");
737
730 log_print("connecting to %d sctp association %d", 738 log_print("connecting to %d sctp association %d",
731 nodeid, (int)sn->sn_assoc_change.sac_assoc_id); 739 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
732 740
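
Two independent cleanups share this hunk. The lock_sock()/release_sock() switch drops SCTP's private locking wrappers in favour of the generic socket ones. The new SO_LINGER block arms a zero-second linger on the freshly peeled-off socket, so a later close() aborts the association immediately instead of waiting out a graceful shutdown. The idiom on an in-kernel socket (error handling trimmed; kernel_setsockopt() is the kernel-side analogue of setsockopt(2)):

struct linger linger = {
	.l_onoff  = 1,	/* linger enabled ...				*/
	.l_linger = 0,	/* ... for zero seconds: close() aborts at once	*/
};
int ret;

ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
			(char *)&linger, sizeof(linger));
if (ret < 0)
	log_print("set socket option SO_LINGER failed");
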
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index c36c44824471..b167ca48b8ee 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -659,19 +659,17 @@ out_lock:
659 return rc; 659 return rc;
660} 660}
661 661
662static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, 662static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz)
663 size_t *bufsiz)
664{ 663{
665 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 664 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
666 char *lower_buf; 665 char *lower_buf;
666 char *buf;
667 mm_segment_t old_fs; 667 mm_segment_t old_fs;
668 int rc; 668 int rc;
669 669
670 lower_buf = kmalloc(PATH_MAX, GFP_KERNEL); 670 lower_buf = kmalloc(PATH_MAX, GFP_KERNEL);
671 if (!lower_buf) { 671 if (!lower_buf)
672 rc = -ENOMEM; 672 return ERR_PTR(-ENOMEM);
673 goto out;
674 }
675 old_fs = get_fs(); 673 old_fs = get_fs();
676 set_fs(get_ds()); 674 set_fs(get_ds());
677 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, 675 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
@@ -680,21 +678,18 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
680 set_fs(old_fs); 678 set_fs(old_fs);
681 if (rc < 0) 679 if (rc < 0)
682 goto out; 680 goto out;
683 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb, 681 rc = ecryptfs_decode_and_decrypt_filename(&buf, bufsiz, dentry->d_sb,
684 lower_buf, rc); 682 lower_buf, rc);
685out: 683out:
686 kfree(lower_buf); 684 kfree(lower_buf);
687 return rc; 685 return rc ? ERR_PTR(rc) : buf;
688} 686}
689 687
690static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) 688static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
691{ 689{
692 char *buf; 690 size_t len;
693 size_t len = PATH_MAX; 691 char *buf = ecryptfs_readlink_lower(dentry, &len);
694 int rc; 692 if (IS_ERR(buf))
695
696 rc = ecryptfs_readlink_lower(dentry, &buf, &len);
697 if (rc)
698 goto out; 693 goto out;
699 fsstack_copy_attr_atime(dentry->d_inode, 694 fsstack_copy_attr_atime(dentry->d_inode,
700 ecryptfs_dentry_to_lower(dentry)->d_inode); 695 ecryptfs_dentry_to_lower(dentry)->d_inode);
@@ -1003,10 +998,12 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
1003 char *target; 998 char *target;
1004 size_t targetsiz; 999 size_t targetsiz;
1005 1000
1006 rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz); 1001 target = ecryptfs_readlink_lower(dentry, &targetsiz);
1007 if (!rc) { 1002 if (!IS_ERR(target)) {
1008 kfree(target); 1003 kfree(target);
1009 stat->size = targetsiz; 1004 stat->size = targetsiz;
1005 } else {
1006 rc = PTR_ERR(target);
1010 } 1007 }
1011 } 1008 }
1012 return rc; 1009 return rc;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c6f57a74a559..50215bbd6463 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -26,11 +26,18 @@ static struct dentry *efs_mount(struct file_system_type *fs_type,
26 return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); 26 return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
27} 27}
28 28
29static void efs_kill_sb(struct super_block *s)
30{
31 struct efs_sb_info *sbi = SUPER_INFO(s);
32 kill_block_super(s);
33 kfree(sbi);
34}
35
29static struct file_system_type efs_fs_type = { 36static struct file_system_type efs_fs_type = {
30 .owner = THIS_MODULE, 37 .owner = THIS_MODULE,
31 .name = "efs", 38 .name = "efs",
32 .mount = efs_mount, 39 .mount = efs_mount,
33 .kill_sb = kill_block_super, 40 .kill_sb = efs_kill_sb,
34 .fs_flags = FS_REQUIRES_DEV, 41 .fs_flags = FS_REQUIRES_DEV,
35}; 42};
36MODULE_ALIAS_FS("efs"); 43MODULE_ALIAS_FS("efs");
@@ -105,12 +112,6 @@ static void destroy_inodecache(void)
105 kmem_cache_destroy(efs_inode_cachep); 112 kmem_cache_destroy(efs_inode_cachep);
106} 113}
107 114
108static void efs_put_super(struct super_block *s)
109{
110 kfree(s->s_fs_info);
111 s->s_fs_info = NULL;
112}
113
114static int efs_remount(struct super_block *sb, int *flags, char *data) 115static int efs_remount(struct super_block *sb, int *flags, char *data)
115{ 116{
116 *flags |= MS_RDONLY; 117 *flags |= MS_RDONLY;
@@ -120,7 +121,6 @@ static int efs_remount(struct super_block *sb, int *flags, char *data)
120static const struct super_operations efs_superblock_operations = { 121static const struct super_operations efs_superblock_operations = {
121 .alloc_inode = efs_alloc_inode, 122 .alloc_inode = efs_alloc_inode,
122 .destroy_inode = efs_destroy_inode, 123 .destroy_inode = efs_destroy_inode,
123 .put_super = efs_put_super,
124 .statfs = efs_statfs, 124 .statfs = efs_statfs,
125 .remount_fs = efs_remount, 125 .remount_fs = efs_remount,
126}; 126};
@@ -259,7 +259,6 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
259 struct efs_sb_info *sb; 259 struct efs_sb_info *sb;
260 struct buffer_head *bh; 260 struct buffer_head *bh;
261 struct inode *root; 261 struct inode *root;
262 int ret = -EINVAL;
263 262
264 sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); 263 sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
265 if (!sb) 264 if (!sb)
@@ -270,7 +269,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
270 if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { 269 if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
271 printk(KERN_ERR "EFS: device does not support %d byte blocks\n", 270 printk(KERN_ERR "EFS: device does not support %d byte blocks\n",
272 EFS_BLOCKSIZE); 271 EFS_BLOCKSIZE);
273 goto out_no_fs_ul; 272 return -EINVAL;
274 } 273 }
275 274
276 /* read the vh (volume header) block */ 275 /* read the vh (volume header) block */
@@ -278,7 +277,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
278 277
279 if (!bh) { 278 if (!bh) {
280 printk(KERN_ERR "EFS: cannot read volume header\n"); 279 printk(KERN_ERR "EFS: cannot read volume header\n");
281 goto out_no_fs_ul; 280 return -EINVAL;
282 } 281 }
283 282
284 /* 283 /*
@@ -290,13 +289,13 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
290 brelse(bh); 289 brelse(bh);
291 290
292 if (sb->fs_start == -1) { 291 if (sb->fs_start == -1) {
293 goto out_no_fs_ul; 292 return -EINVAL;
294 } 293 }
295 294
296 bh = sb_bread(s, sb->fs_start + EFS_SUPER); 295 bh = sb_bread(s, sb->fs_start + EFS_SUPER);
297 if (!bh) { 296 if (!bh) {
298 printk(KERN_ERR "EFS: cannot read superblock\n"); 297 printk(KERN_ERR "EFS: cannot read superblock\n");
299 goto out_no_fs_ul; 298 return -EINVAL;
300 } 299 }
301 300
302 if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { 301 if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) {
@@ -304,7 +303,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
304 printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); 303 printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER);
305#endif 304#endif
306 brelse(bh); 305 brelse(bh);
307 goto out_no_fs_ul; 306 return -EINVAL;
308 } 307 }
309 brelse(bh); 308 brelse(bh);
310 309
@@ -319,24 +318,16 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
319 root = efs_iget(s, EFS_ROOTINODE); 318 root = efs_iget(s, EFS_ROOTINODE);
320 if (IS_ERR(root)) { 319 if (IS_ERR(root)) {
321 printk(KERN_ERR "EFS: get root inode failed\n"); 320 printk(KERN_ERR "EFS: get root inode failed\n");
322 ret = PTR_ERR(root); 321 return PTR_ERR(root);
323 goto out_no_fs;
324 } 322 }
325 323
326 s->s_root = d_make_root(root); 324 s->s_root = d_make_root(root);
327 if (!(s->s_root)) { 325 if (!(s->s_root)) {
328 printk(KERN_ERR "EFS: get root dentry failed\n"); 326 printk(KERN_ERR "EFS: get root dentry failed\n");
329 ret = -ENOMEM; 327 return -ENOMEM;
330 goto out_no_fs;
331 } 328 }
332 329
333 return 0; 330 return 0;
334
335out_no_fs_ul:
336out_no_fs:
337 s->s_fs_info = NULL;
338 kfree(sb);
339 return ret;
340} 331}
341 332
342static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { 333static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
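
Freeing s_fs_info from ->put_super plus hand-rolled fill_super error labels left the cleanup split across paths that do not all run: put_super is only called once a root dentry exists, and the private info has to stay alive for everything kill_block_super() tears down. Moving the kfree() into a ->kill_sb callback, after kill_block_super() returns, covers every path, since the VFS invokes kill_sb even for a superblock whose fill_super failed partway; that is what lets the out_no_fs* labels above collapse into plain returns. The shape of the pattern, with a hypothetical filesystem name:

static void foofs_kill_sb(struct super_block *s)
{
	struct foofs_sb_info *sbi = s->s_fs_info;

	kill_block_super(s);	/* finish all VFS/bdev teardown first */
	kfree(sbi);		/* now nothing can reach the private info */
}

static struct file_system_type foofs_fs_type = {
	.owner    = THIS_MODULE,
	.name     = "foofs",
	.mount    = foofs_mount,
	.kill_sb  = foofs_kill_sb,
	.fs_flags = FS_REQUIRES_DEV,
};
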
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 35470d9b96e6..d6a88e7812f3 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -349,15 +349,12 @@ EXPORT_SYMBOL_GPL(eventfd_fget);
349 */ 349 */
350struct eventfd_ctx *eventfd_ctx_fdget(int fd) 350struct eventfd_ctx *eventfd_ctx_fdget(int fd)
351{ 351{
352 struct file *file;
353 struct eventfd_ctx *ctx; 352 struct eventfd_ctx *ctx;
354 353 struct fd f = fdget(fd);
355 file = eventfd_fget(fd); 354 if (!f.file)
356 if (IS_ERR(file)) 355 return ERR_PTR(-EBADF);
357 return (struct eventfd_ctx *) file; 356 ctx = eventfd_ctx_fileget(f.file);
358 ctx = eventfd_ctx_get(file->private_data); 357 fdput(f);
359 fput(file);
360
361 return ctx; 358 return ctx;
362} 359}
363EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); 360EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
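
fdget() is the light-weight successor to the fget() family: when the descriptor table is not shared it returns a borrowed reference without touching the file's refcount, and fdput() releases exactly what fdget() took (a flag inside struct fd records which case applied). Routing through eventfd_ctx_fileget() also means a descriptor that is not an eventfd now yields an error instead of blindly reinterpreting file->private_data. The resulting pattern:

struct fd f = fdget(fd);

if (!f.file)
	return ERR_PTR(-EBADF);		/* no such descriptor */
ctx = eventfd_ctx_fileget(f.file);	/* validates f_op, takes a ctx ref */
fdput(f);				/* drop the borrowed file reference */
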
diff --git a/fs/exec.c b/fs/exec.c
index 7ea097f6b341..3d78fccdd723 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -62,7 +62,6 @@
62 62
63#include <trace/events/task.h> 63#include <trace/events/task.h>
64#include "internal.h" 64#include "internal.h"
65#include "coredump.h"
66 65
67#include <trace/events/sched.h> 66#include <trace/events/sched.h>
68 67
@@ -749,11 +748,10 @@ EXPORT_SYMBOL(setup_arg_pages);
749 748
750#endif /* CONFIG_MMU */ 749#endif /* CONFIG_MMU */
751 750
752struct file *open_exec(const char *name) 751static struct file *do_open_exec(struct filename *name)
753{ 752{
754 struct file *file; 753 struct file *file;
755 int err; 754 int err;
756 struct filename tmp = { .name = name };
757 static const struct open_flags open_exec_flags = { 755 static const struct open_flags open_exec_flags = {
758 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 756 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
759 .acc_mode = MAY_EXEC | MAY_OPEN, 757 .acc_mode = MAY_EXEC | MAY_OPEN,
@@ -761,7 +759,7 @@ struct file *open_exec(const char *name)
761 .lookup_flags = LOOKUP_FOLLOW, 759 .lookup_flags = LOOKUP_FOLLOW,
762 }; 760 };
763 761
764 file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags); 762 file = do_filp_open(AT_FDCWD, name, &open_exec_flags);
765 if (IS_ERR(file)) 763 if (IS_ERR(file))
766 goto out; 764 goto out;
767 765
@@ -785,6 +783,12 @@ exit:
785 fput(file); 783 fput(file);
786 return ERR_PTR(err); 784 return ERR_PTR(err);
787} 785}
786
787struct file *open_exec(const char *name)
788{
789 struct filename tmp = { .name = name };
790 return do_open_exec(&tmp);
791}
788EXPORT_SYMBOL(open_exec); 792EXPORT_SYMBOL(open_exec);
789 793
790int kernel_read(struct file *file, loff_t offset, 794int kernel_read(struct file *file, loff_t offset,
@@ -843,7 +847,6 @@ static int exec_mmap(struct mm_struct *mm)
843 tsk->active_mm = mm; 847 tsk->active_mm = mm;
844 activate_mm(active_mm, mm); 848 activate_mm(active_mm, mm);
845 task_unlock(tsk); 849 task_unlock(tsk);
846 arch_pick_mmap_layout(mm);
847 if (old_mm) { 850 if (old_mm) {
848 up_read(&old_mm->mmap_sem); 851 up_read(&old_mm->mmap_sem);
849 BUG_ON(active_mm != old_mm); 852 BUG_ON(active_mm != old_mm);
@@ -1088,8 +1091,8 @@ int flush_old_exec(struct linux_binprm * bprm)
1088 bprm->mm = NULL; /* We're using it now */ 1091 bprm->mm = NULL; /* We're using it now */
1089 1092
1090 set_fs(USER_DS); 1093 set_fs(USER_DS);
1091 current->flags &= 1094 current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
1092 ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE); 1095 PF_NOFREEZE | PF_NO_SETAFFINITY);
1093 flush_thread(); 1096 flush_thread();
1094 current->personality &= ~bprm->per_clear; 1097 current->personality &= ~bprm->per_clear;
1095 1098
@@ -1139,9 +1142,7 @@ void setup_new_exec(struct linux_binprm * bprm)
1139 1142
1140 /* An exec changes our domain. We are no longer part of the thread 1143 /* An exec changes our domain. We are no longer part of the thread
1141 group */ 1144 group */
1142
1143 current->self_exec_id++; 1145 current->self_exec_id++;
1144
1145 flush_signal_handlers(current, 0); 1146 flush_signal_handlers(current, 0);
1146 do_close_on_exec(current->files); 1147 do_close_on_exec(current->files);
1147} 1148}
@@ -1166,13 +1167,17 @@ int prepare_bprm_creds(struct linux_binprm *bprm)
1166 return -ENOMEM; 1167 return -ENOMEM;
1167} 1168}
1168 1169
1169void free_bprm(struct linux_binprm *bprm) 1170static void free_bprm(struct linux_binprm *bprm)
1170{ 1171{
1171 free_arg_pages(bprm); 1172 free_arg_pages(bprm);
1172 if (bprm->cred) { 1173 if (bprm->cred) {
1173 mutex_unlock(&current->signal->cred_guard_mutex); 1174 mutex_unlock(&current->signal->cred_guard_mutex);
1174 abort_creds(bprm->cred); 1175 abort_creds(bprm->cred);
1175 } 1176 }
1177 if (bprm->file) {
1178 allow_write_access(bprm->file);
1179 fput(bprm->file);
1180 }
1176 /* If a binfmt changed the interp, free it. */ 1181 /* If a binfmt changed the interp, free it. */
1177 if (bprm->interp != bprm->filename) 1182 if (bprm->interp != bprm->filename)
1178 kfree(bprm->interp); 1183 kfree(bprm->interp);
@@ -1224,11 +1229,10 @@ EXPORT_SYMBOL(install_exec_creds);
1224 * - the caller must hold ->cred_guard_mutex to protect against 1229 * - the caller must hold ->cred_guard_mutex to protect against
1225 * PTRACE_ATTACH 1230 * PTRACE_ATTACH
1226 */ 1231 */
1227static int check_unsafe_exec(struct linux_binprm *bprm) 1232static void check_unsafe_exec(struct linux_binprm *bprm)
1228{ 1233{
1229 struct task_struct *p = current, *t; 1234 struct task_struct *p = current, *t;
1230 unsigned n_fs; 1235 unsigned n_fs;
1231 int res = 0;
1232 1236
1233 if (p->ptrace) { 1237 if (p->ptrace) {
1234 if (p->ptrace & PT_PTRACE_CAP) 1238 if (p->ptrace & PT_PTRACE_CAP)
@@ -1244,31 +1248,25 @@ static int check_unsafe_exec(struct linux_binprm *bprm)
1244 if (current->no_new_privs) 1248 if (current->no_new_privs)
1245 bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; 1249 bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
1246 1250
1251 t = p;
1247 n_fs = 1; 1252 n_fs = 1;
1248 spin_lock(&p->fs->lock); 1253 spin_lock(&p->fs->lock);
1249 rcu_read_lock(); 1254 rcu_read_lock();
1250 for (t = next_thread(p); t != p; t = next_thread(t)) { 1255 while_each_thread(p, t) {
1251 if (t->fs == p->fs) 1256 if (t->fs == p->fs)
1252 n_fs++; 1257 n_fs++;
1253 } 1258 }
1254 rcu_read_unlock(); 1259 rcu_read_unlock();
1255 1260
1256 if (p->fs->users > n_fs) { 1261 if (p->fs->users > n_fs)
1257 bprm->unsafe |= LSM_UNSAFE_SHARE; 1262 bprm->unsafe |= LSM_UNSAFE_SHARE;
1258 } else { 1263 else
1259 res = -EAGAIN; 1264 p->fs->in_exec = 1;
1260 if (!p->fs->in_exec) {
1261 p->fs->in_exec = 1;
1262 res = 1;
1263 }
1264 }
1265 spin_unlock(&p->fs->lock); 1265 spin_unlock(&p->fs->lock);
1266
1267 return res;
1268} 1266}
1269 1267
1270/* 1268/*
1271 * Fill the binprm structure from the inode. 1269 * Fill the binprm structure from the inode.
1272 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes 1270 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
1273 * 1271 *
1274 * This may be called multiple times for binary chains (scripts for example). 1272 * This may be called multiple times for binary chains (scripts for example).
@@ -1430,14 +1428,7 @@ static int exec_binprm(struct linux_binprm *bprm)
1430 audit_bprm(bprm); 1428 audit_bprm(bprm);
1431 trace_sched_process_exec(current, old_pid, bprm); 1429 trace_sched_process_exec(current, old_pid, bprm);
1432 ptrace_event(PTRACE_EVENT_EXEC, old_vpid); 1430 ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1433 current->did_exec = 1;
1434 proc_exec_connector(current); 1431 proc_exec_connector(current);
1435
1436 if (bprm->file) {
1437 allow_write_access(bprm->file);
1438 fput(bprm->file);
1439 bprm->file = NULL; /* to catch use-after-free */
1440 }
1441 } 1432 }
1442 1433
1443 return ret; 1434 return ret;
@@ -1446,16 +1437,18 @@ static int exec_binprm(struct linux_binprm *bprm)
1446/* 1437/*
1447 * sys_execve() executes a new program. 1438 * sys_execve() executes a new program.
1448 */ 1439 */
1449static int do_execve_common(const char *filename, 1440static int do_execve_common(struct filename *filename,
1450 struct user_arg_ptr argv, 1441 struct user_arg_ptr argv,
1451 struct user_arg_ptr envp) 1442 struct user_arg_ptr envp)
1452{ 1443{
1453 struct linux_binprm *bprm; 1444 struct linux_binprm *bprm;
1454 struct file *file; 1445 struct file *file;
1455 struct files_struct *displaced; 1446 struct files_struct *displaced;
1456 bool clear_in_exec;
1457 int retval; 1447 int retval;
1458 1448
1449 if (IS_ERR(filename))
1450 return PTR_ERR(filename);
1451
1459 /* 1452 /*
1460 * We move the actual failure in case of RLIMIT_NPROC excess from 1453 * We move the actual failure in case of RLIMIT_NPROC excess from
1461 * set*uid() to execve() because too many poorly written programs 1454 * set*uid() to execve() because too many poorly written programs
@@ -1485,13 +1478,10 @@ static int do_execve_common(const char *filename,
1485 if (retval) 1478 if (retval)
1486 goto out_free; 1479 goto out_free;
1487 1480
1488 retval = check_unsafe_exec(bprm); 1481 check_unsafe_exec(bprm);
1489 if (retval < 0)
1490 goto out_free;
1491 clear_in_exec = retval;
1492 current->in_execve = 1; 1482 current->in_execve = 1;
1493 1483
1494 file = open_exec(filename); 1484 file = do_open_exec(filename);
1495 retval = PTR_ERR(file); 1485 retval = PTR_ERR(file);
1496 if (IS_ERR(file)) 1486 if (IS_ERR(file))
1497 goto out_unmark; 1487 goto out_unmark;
@@ -1499,12 +1489,11 @@ static int do_execve_common(const char *filename,
1499 sched_exec(); 1489 sched_exec();
1500 1490
1501 bprm->file = file; 1491 bprm->file = file;
1502 bprm->filename = filename; 1492 bprm->filename = bprm->interp = filename->name;
1503 bprm->interp = filename;
1504 1493
1505 retval = bprm_mm_init(bprm); 1494 retval = bprm_mm_init(bprm);
1506 if (retval) 1495 if (retval)
1507 goto out_file; 1496 goto out_unmark;
1508 1497
1509 bprm->argc = count(argv, MAX_ARG_STRINGS); 1498 bprm->argc = count(argv, MAX_ARG_STRINGS);
1510 if ((retval = bprm->argc) < 0) 1499 if ((retval = bprm->argc) < 0)
@@ -1541,6 +1530,7 @@ static int do_execve_common(const char *filename,
1541 acct_update_integrals(current); 1530 acct_update_integrals(current);
1542 task_numa_free(current); 1531 task_numa_free(current);
1543 free_bprm(bprm); 1532 free_bprm(bprm);
1533 putname(filename);
1544 if (displaced) 1534 if (displaced)
1545 put_files_struct(displaced); 1535 put_files_struct(displaced);
1546 return retval; 1536 return retval;
@@ -1551,15 +1541,8 @@ out:
1551 mmput(bprm->mm); 1541 mmput(bprm->mm);
1552 } 1542 }
1553 1543
1554out_file:
1555 if (bprm->file) {
1556 allow_write_access(bprm->file);
1557 fput(bprm->file);
1558 }
1559
1560out_unmark: 1544out_unmark:
1561 if (clear_in_exec) 1545 current->fs->in_exec = 0;
1562 current->fs->in_exec = 0;
1563 current->in_execve = 0; 1546 current->in_execve = 0;
1564 1547
1565out_free: 1548out_free:
@@ -1569,10 +1552,11 @@ out_files:
1569 if (displaced) 1552 if (displaced)
1570 reset_files_struct(displaced); 1553 reset_files_struct(displaced);
1571out_ret: 1554out_ret:
1555 putname(filename);
1572 return retval; 1556 return retval;
1573} 1557}
1574 1558
1575int do_execve(const char *filename, 1559int do_execve(struct filename *filename,
1576 const char __user *const __user *__argv, 1560 const char __user *const __user *__argv,
1577 const char __user *const __user *__envp) 1561 const char __user *const __user *__envp)
1578{ 1562{
@@ -1582,7 +1566,7 @@ int do_execve(const char *filename,
1582} 1566}
1583 1567
1584#ifdef CONFIG_COMPAT 1568#ifdef CONFIG_COMPAT
1585static int compat_do_execve(const char *filename, 1569static int compat_do_execve(struct filename *filename,
1586 const compat_uptr_t __user *__argv, 1570 const compat_uptr_t __user *__argv,
1587 const compat_uptr_t __user *__envp) 1571 const compat_uptr_t __user *__envp)
1588{ 1572{
@@ -1609,67 +1593,22 @@ void set_binfmt(struct linux_binfmt *new)
1609 if (new) 1593 if (new)
1610 __module_get(new->module); 1594 __module_get(new->module);
1611} 1595}
1612
1613EXPORT_SYMBOL(set_binfmt); 1596EXPORT_SYMBOL(set_binfmt);
1614 1597
1615/* 1598/*
1616 * set_dumpable converts traditional three-value dumpable to two flags and 1599 * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
1617 * stores them into mm->flags. It modifies lower two bits of mm->flags, but
1618 * these bits are not changed atomically. So get_dumpable can observe the
1619 * intermediate state. To avoid unexpected behavior, get_dumpable
1620 * returns either the old dumpable or the new one by paying attention to the order of
1621 * modifying the bits.
1622 *
1623 * dumpable | mm->flags (binary)
1624 * old new | initial interim final
1625 * ---------+-----------------------
1626 * 0 1 | 00 01 01
1627 * 0 2 | 00 10(*) 11
1628 * 1 0 | 01 00 00
1629 * 1 2 | 01 11 11
1630 * 2 0 | 11 10(*) 00
1631 * 2 1 | 11 11 01
1632 *
1633 * (*) get_dumpable regards interim value of 10 as 11.
1634 */ 1600 */
1635void set_dumpable(struct mm_struct *mm, int value) 1601void set_dumpable(struct mm_struct *mm, int value)
1636{ 1602{
1637 switch (value) { 1603 unsigned long old, new;
1638 case SUID_DUMP_DISABLE:
1639 clear_bit(MMF_DUMPABLE, &mm->flags);
1640 smp_wmb();
1641 clear_bit(MMF_DUMP_SECURELY, &mm->flags);
1642 break;
1643 case SUID_DUMP_USER:
1644 set_bit(MMF_DUMPABLE, &mm->flags);
1645 smp_wmb();
1646 clear_bit(MMF_DUMP_SECURELY, &mm->flags);
1647 break;
1648 case SUID_DUMP_ROOT:
1649 set_bit(MMF_DUMP_SECURELY, &mm->flags);
1650 smp_wmb();
1651 set_bit(MMF_DUMPABLE, &mm->flags);
1652 break;
1653 }
1654}
1655 1604
1656int __get_dumpable(unsigned long mm_flags) 1605 if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
1657{ 1606 return;
1658 int ret;
1659
1660 ret = mm_flags & MMF_DUMPABLE_MASK;
1661 return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret;
1662}
1663 1607
1664/* 1608 do {
1665 * This returns the actual value of the suid_dumpable flag. For things 1609 old = ACCESS_ONCE(mm->flags);
1666 * that are using this for checking for privilege transitions, it must 1610 new = (old & ~MMF_DUMPABLE_MASK) | value;
1667 * test against SUID_DUMP_USER rather than treating it as a boolean 1611 } while (cmpxchg(&mm->flags, old, new) != old);
1668 * value.
1669 */
1670int get_dumpable(struct mm_struct *mm)
1671{
1672 return __get_dumpable(mm->flags);
1673} 1612}
1674 1613
1675SYSCALL_DEFINE3(execve, 1614SYSCALL_DEFINE3(execve,
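
The rewritten set_dumpable() replaces the carefully ordered set_bit/clear_bit sequence, and the table of intermediate states it had to document, with a cmpxchg() retry loop: both dumpable bits are swapped in a single atomic step, so a concurrent get_dumpable() can only ever observe the old value or the new one, and the helpers that papered over torn reads go away. A runnable userspace model of the loop using GCC's builtin atomics (names and mask are ours):

#include <stdio.h>

#define DUMPABLE_MASK 3UL	/* stands in for MMF_DUMPABLE_MASK */

static unsigned long flags;

static void set_dumpable_model(unsigned int value)
{
	unsigned long old, new;

	do {
		old = __atomic_load_n(&flags, __ATOMIC_RELAXED);
		new = (old & ~DUMPABLE_MASK) | value;
	} while (!__atomic_compare_exchange_n(&flags, &old, new, 0,
					      __ATOMIC_SEQ_CST,
					      __ATOMIC_SEQ_CST));
}

int main(void)
{
	flags = 0xf0;
	set_dumpable_model(2);
	printf("flags = 0x%lx\n", flags);	/* flags = 0xf2 */
	return 0;
}
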
@@ -1677,25 +1616,13 @@ SYSCALL_DEFINE3(execve,
1677 const char __user *const __user *, argv, 1616 const char __user *const __user *, argv,
1678 const char __user *const __user *, envp) 1617 const char __user *const __user *, envp)
1679{ 1618{
1680 struct filename *path = getname(filename); 1619 return do_execve(getname(filename), argv, envp);
1681 int error = PTR_ERR(path);
1682 if (!IS_ERR(path)) {
1683 error = do_execve(path->name, argv, envp);
1684 putname(path);
1685 }
1686 return error;
1687} 1620}
1688#ifdef CONFIG_COMPAT 1621#ifdef CONFIG_COMPAT
1689asmlinkage long compat_sys_execve(const char __user * filename, 1622asmlinkage long compat_sys_execve(const char __user * filename,
1690 const compat_uptr_t __user * argv, 1623 const compat_uptr_t __user * argv,
1691 const compat_uptr_t __user * envp) 1624 const compat_uptr_t __user * envp)
1692{ 1625{
1693 struct filename *path = getname(filename); 1626 return compat_do_execve(getname(filename), argv, envp);
1694 int error = PTR_ERR(path);
1695 if (!IS_ERR(path)) {
1696 error = compat_do_execve(path->name, argv, envp);
1697 putname(path);
1698 }
1699 return error;
1700} 1627}
1701#endif 1628#endif
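
The syscall wrappers shrink to one line because struct filename ownership now lives in a single place: getname() may hand back an ERR_PTR, do_execve_common() checks for that first, and putname() sits on both the success path and the common out_ret exit, so every outcome releases the name exactly once. The contract, trimmed to a skeleton (not compilable as shown):

static int do_execve_common(struct filename *filename, ...)
{
	if (IS_ERR(filename))		/* a failed getname() lands here */
		return PTR_ERR(filename);

	/* ... all of the actual exec work ... */

	putname(filename);		/* success and error paths alike
					 * drop the name exactly once */
	return retval;
}
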
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a52a5d23c30b..ee4317faccb1 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -577,7 +577,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
577 577
578 if (offset >= i_size) { 578 if (offset >= i_size) {
579 *uptodate = true; 579 *uptodate = true;
580 EXOFS_DBGMSG("offset >= i_size index=0x%lx\n", index); 580 EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index);
581 return ZERO_PAGE(0); 581 return ZERO_PAGE(0);
582 } 582 }
583 583
@@ -596,10 +596,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
596 *uptodate = true; 596 *uptodate = true;
597 else 597 else
598 *uptodate = PageUptodate(page); 598 *uptodate = PageUptodate(page);
599 EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); 599 EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
600 return page; 600 return page;
601 } else { 601 } else {
602 EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", 602 EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n",
603 pcol->that_locked_page->index); 603 pcol->that_locked_page->index);
604 *uptodate = true; 604 *uptodate = true;
605 return pcol->that_locked_page; 605 return pcol->that_locked_page;
@@ -611,11 +611,11 @@ static void __r4w_put_page(void *priv, struct page *page)
611 struct page_collect *pcol = priv; 611 struct page_collect *pcol = priv;
612 612
613 if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { 613 if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
614 EXOFS_DBGMSG("index=0x%lx\n", page->index); 614 EXOFS_DBGMSG2("index=0x%lx\n", page->index);
615 page_cache_release(page); 615 page_cache_release(page);
616 return; 616 return;
617 } 617 }
618 EXOFS_DBGMSG("that_locked_page index=0x%lx\n", 618 EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
619 ZERO_PAGE(0) == page ? -1 : page->index); 619 ZERO_PAGE(0) == page ? -1 : page->index);
620} 620}
621 621
@@ -961,6 +961,14 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset,
961 WARN_ON(1); 961 WARN_ON(1);
962} 962}
963 963
964
965 /* TODO: Should be easy enough to do properly */
966static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb,
967 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
968{
969 return 0;
970}
971
964const struct address_space_operations exofs_aops = { 972const struct address_space_operations exofs_aops = {
965 .readpage = exofs_readpage, 973 .readpage = exofs_readpage,
966 .readpages = exofs_readpages, 974 .readpages = exofs_readpages,
@@ -974,7 +982,7 @@ const struct address_space_operations exofs_aops = {
974 982
975 /* Not implemented Yet */ 983 /* Not implemented Yet */
976 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ 984 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
977 .direct_IO = NULL, /* TODO: Should be trivial to do */ 985 .direct_IO = exofs_direct_IO,
978 986
979 /* With these NULL has special meaning or default is not exported */ 987 /* With these NULL has special meaning or default is not exported */
980 .get_xip_mem = NULL, 988 .get_xip_mem = NULL,
@@ -1010,7 +1018,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
1010 if (likely(!ret)) 1018 if (likely(!ret))
1011 truncate_setsize(inode, newsize); 1019 truncate_setsize(inode, newsize);
1012 1020
1013 EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", 1021 EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n",
1014 inode->i_ino, newsize, ret); 1022 inode->i_ino, newsize, ret);
1015 return ret; 1023 return ret;
1016} 1024}
@@ -1094,14 +1102,13 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
1094 /* If object is lost on target we might as well enable its 1102 /* If object is lost on target we might as well enable its
1095 * delete. 1103 * delete.
1096 */ 1104 */
1097 if ((ret == -ENOENT) || (ret == -EINVAL)) 1105 ret = 0;
1098 ret = 0;
1099 goto out; 1106 goto out;
1100 } 1107 }
1101 1108
1102 ret = extract_attr_from_ios(ios, &attrs[0]); 1109 ret = extract_attr_from_ios(ios, &attrs[0]);
1103 if (ret) { 1110 if (ret) {
1104 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); 1111 EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__);
1105 goto out; 1112 goto out;
1106 } 1113 }
1107 WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); 1114 WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
@@ -1109,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
1109 1116
1110 ret = extract_attr_from_ios(ios, &attrs[1]); 1117 ret = extract_attr_from_ios(ios, &attrs[1]);
1111 if (ret) { 1118 if (ret) {
1112 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); 1119 EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__);
1113 goto out; 1120 goto out;
1114 } 1121 }
1115 if (attrs[1].len) { 1122 if (attrs[1].len) {
@@ -1124,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
1124 1131
1125 ret = extract_attr_from_ios(ios, &attrs[2]); 1132 ret = extract_attr_from_ios(ios, &attrs[2]);
1126 if (ret) { 1133 if (ret) {
1127 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); 1134 EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__);
1128 goto out; 1135 goto out;
1129 } 1136 }
1130 if (attrs[2].len) { 1137 if (attrs[2].len) {
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index b74422888604..dae884694bd9 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -103,7 +103,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
103 103
104 layout->max_io_length = 104 layout->max_io_length =
105 (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * 105 (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
106 layout->group_width; 106 (layout->group_width - layout->parity);
107 if (layout->parity) { 107 if (layout->parity) {
108 unsigned stripe_length = 108 unsigned stripe_length =
109 (layout->group_width - layout->parity) * 109 (layout->group_width - layout->parity) *
@@ -286,7 +286,8 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
286 if (length) { 286 if (length) {
287 ore_calc_stripe_info(layout, offset, length, &ios->si); 287 ore_calc_stripe_info(layout, offset, length, &ios->si);
288 ios->length = ios->si.length; 288 ios->length = ios->si.length;
289 ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; 289 ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
290 ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
290 if (layout->parity) 291 if (layout->parity)
291 _ore_post_alloc_raid_stuff(ios); 292 _ore_post_alloc_raid_stuff(ios);
292 } 293 }
@@ -430,8 +431,12 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
430 if (likely(!ret)) 431 if (likely(!ret))
431 continue; 432 continue;
432 433
433 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 434 if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) &&
434 /* start read offset past end of file */ 435 per_dev->bio) {
436 /* start read offset past end of file.
437 * Note: if we do not have a bio it means a read-attributes request.
438 * In this case we should return the error to the caller.
439 */
435 _clear_bio(per_dev->bio); 440 _clear_bio(per_dev->bio);
436 ORE_DBGMSG("start read offset passed end of file " 441 ORE_DBGMSG("start read offset passed end of file "
437 "offset=0x%llx, length=0x%llx\n", 442 "offset=0x%llx, length=0x%llx\n",
@@ -536,6 +541,7 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
536 u64 H = LmodS - G * T; 541 u64 H = LmodS - G * T;
537 542
538 u32 N = div_u64(H, U); 543 u32 N = div_u64(H, U);
544 u32 Nlast;
539 545
540 /* "H - (N * U)" is just "H % U" so it's bound to u32 */ 546 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
541 u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; 547 u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
@@ -568,6 +574,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
568 si->length = T - H; 574 si->length = T - H;
569 if (si->length > length) 575 if (si->length > length)
570 si->length = length; 576 si->length = length;
577
578 Nlast = div_u64(H + si->length + U - 1, U);
579 si->maxdevUnits = Nlast - N;
580
571 si->M = M; 581 si->M = M;
572} 582}
573EXPORT_SYMBOL(ore_calc_stripe_info); 583EXPORT_SYMBOL(ore_calc_stripe_info);
@@ -583,13 +593,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
583 int ret; 593 int ret;
584 594
585 if (per_dev->bio == NULL) { 595 if (per_dev->bio == NULL) {
586 unsigned pages_in_stripe = ios->layout->group_width * 596 unsigned bio_size;
587 (ios->layout->stripe_unit / PAGE_SIZE); 597
588 unsigned nr_pages = ios->nr_pages * ios->layout->group_width / 598 if (!ios->reading) {
589 (ios->layout->group_width - 599 bio_size = ios->si.maxdevUnits;
590 ios->layout->parity); 600 } else {
591 unsigned bio_size = (nr_pages + pages_in_stripe) / 601 bio_size = (ios->si.maxdevUnits + 1) *
592 ios->layout->group_width; 602 (ios->layout->group_width - ios->layout->parity) /
603 ios->layout->group_width;
604 }
605 bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);
593 606
594 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 607 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
595 if (unlikely(!per_dev->bio)) { 608 if (unlikely(!per_dev->bio)) {
@@ -609,8 +622,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
609 added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], 622 added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
610 pglen, pgbase); 623 pglen, pgbase);
611 if (unlikely(pglen != added_len)) { 624 if (unlikely(pglen != added_len)) {
612 ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", 625 /* If bi_vcnt == bi_max then this is a SW BUG */
613 per_dev->bio->bi_vcnt); 626 ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
627 "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
628 per_dev->bio->bi_vcnt,
629 per_dev->bio->bi_max_vecs,
630 BIO_MAX_PAGES_KMALLOC, cur_len);
614 ret = -ENOMEM; 631 ret = -ENOMEM;
615 goto out; 632 goto out;
616 } 633 }
@@ -1098,7 +1115,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
1098 size_attr->attr = g_attr_logical_length; 1115 size_attr->attr = g_attr_logical_length;
1099 size_attr->attr.val_ptr = &size_attr->newsize; 1116 size_attr->attr.val_ptr = &size_attr->newsize;
1100 1117
1101 ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", 1118 ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
1102 _LLU(oc->comps->obj.id), _LLU(obj_size), i); 1119 _LLU(oc->comps->obj.id), _LLU(obj_size), i);
1103 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, 1120 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
1104 &size_attr->attr); 1121 &size_attr->attr);
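
The new Nlast/maxdevUnits computation is the standard integer ceiling idiom: div_u64(H + length + U - 1, U) is ceil((H + length) / U), and subtracting the first unit index N yields how many stripe units of size U the request touches on its busiest device; the later _ore_add_stripe_unit() hunk sizes the per-device bio from that instead of the old group-wide over-estimate. The arithmetic, runnable in isolation:

#include <stdio.h>
#include <stdint.h>

/* ceil(x / u) built from integer operations, as div_u64 is used above. */
static uint32_t div_round_up(uint64_t x, uint32_t u)
{
	return (uint32_t)((x + u - 1) / u);
}

int main(void)
{
	uint64_t H = 4096, length = 10000;	/* start offset, byte count */
	uint32_t U = 8192;			/* stripe unit size */
	uint32_t N     = (uint32_t)(H / U);		/* first unit hit */
	uint32_t Nlast = div_round_up(H + length, U);	/* one past last */

	printf("units spanned = %u\n", Nlast - N);	/* 2 */
	return 0;
}
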
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 110b6b371a4e..1b8001bbe947 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -148,13 +148,6 @@ ext2_get_acl(struct inode *inode, int type)
148 struct posix_acl *acl; 148 struct posix_acl *acl;
149 int retval; 149 int retval;
150 150
151 if (!test_opt(inode->i_sb, POSIX_ACL))
152 return NULL;
153
154 acl = get_cached_acl(inode, type);
155 if (acl != ACL_NOT_CACHED)
156 return acl;
157
158 switch (type) { 151 switch (type) {
159 case ACL_TYPE_ACCESS: 152 case ACL_TYPE_ACCESS:
160 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; 153 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -189,19 +182,14 @@ ext2_get_acl(struct inode *inode, int type)
189/* 182/*
190 * inode->i_mutex: down 183 * inode->i_mutex: down
191 */ 184 */
192static int 185int
193ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) 186ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
194{ 187{
195 int name_index; 188 int name_index;
196 void *value = NULL; 189 void *value = NULL;
197 size_t size = 0; 190 size_t size = 0;
198 int error; 191 int error;
199 192
200 if (S_ISLNK(inode->i_mode))
201 return -EOPNOTSUPP;
202 if (!test_opt(inode->i_sb, POSIX_ACL))
203 return 0;
204
205 switch(type) { 193 switch(type) {
206 case ACL_TYPE_ACCESS: 194 case ACL_TYPE_ACCESS:
207 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; 195 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -250,169 +238,21 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
250int 238int
251ext2_init_acl(struct inode *inode, struct inode *dir) 239ext2_init_acl(struct inode *inode, struct inode *dir)
252{ 240{
253 struct posix_acl *acl = NULL; 241 struct posix_acl *default_acl, *acl;
254 int error = 0; 242 int error;
255
256 if (!S_ISLNK(inode->i_mode)) {
257 if (test_opt(dir->i_sb, POSIX_ACL)) {
258 acl = ext2_get_acl(dir, ACL_TYPE_DEFAULT);
259 if (IS_ERR(acl))
260 return PTR_ERR(acl);
261 }
262 if (!acl)
263 inode->i_mode &= ~current_umask();
264 }
265 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
266 if (S_ISDIR(inode->i_mode)) {
267 error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl);
268 if (error)
269 goto cleanup;
270 }
271 error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
272 if (error < 0)
273 return error;
274 if (error > 0) {
275 /* This is an extended ACL */
276 error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl);
277 }
278 }
279cleanup:
280 posix_acl_release(acl);
281 return error;
282}
283
284/*
285 * Does chmod for an inode that may have an Access Control List. The
286 * inode->i_mode field must be updated to the desired value by the caller
287 * before calling this function.
288 * Returns 0 on success, or a negative error number.
289 *
290 * We change the ACL rather than storing some ACL entries in the file
291 * mode permission bits (which would be more efficient), because that
292 * would break once additional permissions (like ACL_APPEND, ACL_DELETE
293 * for directories) are added. There are no more bits available in the
294 * file mode.
295 *
296 * inode->i_mutex: down
297 */
298int
299ext2_acl_chmod(struct inode *inode)
300{
301 struct posix_acl *acl;
302 int error;
303 243
304 if (!test_opt(inode->i_sb, POSIX_ACL)) 244 error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
305 return 0;
306 if (S_ISLNK(inode->i_mode))
307 return -EOPNOTSUPP;
308 acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
309 if (IS_ERR(acl) || !acl)
310 return PTR_ERR(acl);
311 error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
312 if (error) 245 if (error)
313 return error; 246 return error;
314 error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl);
315 posix_acl_release(acl);
316 return error;
317}
318 247
319/* 248 if (default_acl) {
320 * Extended attribute handlers 249 error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
321 */ 250 posix_acl_release(default_acl);
322static size_t 251 }
323ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size, 252 if (acl) {
324 const char *name, size_t name_len, int type) 253 if (!error)
325{ 254 error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS);
326 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); 255 posix_acl_release(acl);
327 256 }
328 if (!test_opt(dentry->d_sb, POSIX_ACL))
329 return 0;
330 if (list && size <= list_size)
331 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
332 return size;
333}
334
335static size_t
336ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size,
337 const char *name, size_t name_len, int type)
338{
339 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
340
341 if (!test_opt(dentry->d_sb, POSIX_ACL))
342 return 0;
343 if (list && size <= list_size)
344 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
345 return size;
346}
347
348static int
349ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
350 size_t size, int type)
351{
352 struct posix_acl *acl;
353 int error;
354
355 if (strcmp(name, "") != 0)
356 return -EINVAL;
357 if (!test_opt(dentry->d_sb, POSIX_ACL))
358 return -EOPNOTSUPP;
359
360 acl = ext2_get_acl(dentry->d_inode, type);
361 if (IS_ERR(acl))
362 return PTR_ERR(acl);
363 if (acl == NULL)
364 return -ENODATA;
365 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
366 posix_acl_release(acl);
367
368 return error;
369}
370
371static int
372ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
373 size_t size, int flags, int type)
374{
375 struct posix_acl *acl;
376 int error;
377
378 if (strcmp(name, "") != 0)
379 return -EINVAL;
380 if (!test_opt(dentry->d_sb, POSIX_ACL))
381 return -EOPNOTSUPP;
382 if (!inode_owner_or_capable(dentry->d_inode))
383 return -EPERM;
384
385 if (value) {
386 acl = posix_acl_from_xattr(&init_user_ns, value, size);
387 if (IS_ERR(acl))
388 return PTR_ERR(acl);
389 else if (acl) {
390 error = posix_acl_valid(acl);
391 if (error)
392 goto release_and_out;
393 }
394 } else
395 acl = NULL;
396
397 error = ext2_set_acl(dentry->d_inode, type, acl);
398
399release_and_out:
400 posix_acl_release(acl);
401 return error; 257 return error;
402} 258}
403
404const struct xattr_handler ext2_xattr_acl_access_handler = {
405 .prefix = POSIX_ACL_XATTR_ACCESS,
406 .flags = ACL_TYPE_ACCESS,
407 .list = ext2_xattr_list_acl_access,
408 .get = ext2_xattr_get_acl,
409 .set = ext2_xattr_set_acl,
410};
411
412const struct xattr_handler ext2_xattr_acl_default_handler = {
413 .prefix = POSIX_ACL_XATTR_DEFAULT,
414 .flags = ACL_TYPE_DEFAULT,
415 .list = ext2_xattr_list_acl_default,
416 .get = ext2_xattr_get_acl,
417 .set = ext2_xattr_set_acl,
418};
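
Most of the deleted ext2 code moved into the VFS for 3.14: the generic posix_acl_access_xattr_handler/posix_acl_default_xattr_handler pair replaces every filesystem's private list/get/set trio, the new ->set_acl inode operation lets posix_acl_chmod() and the xattr layer drive the filesystem, and posix_acl_create() returns both ACLs a new inode should start with while applying the umask when the parent has no default ACL. The resulting init pattern, which the ext3 diff below repeats almost verbatim (my_set_acl stands in for the filesystem's ->set_acl helper):

struct posix_acl *default_acl, *acl;
int error;

error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (error)
	return error;

if (default_acl) {
	error = my_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
	posix_acl_release(default_acl);
}
if (acl) {
	if (!error)
		error = my_set_acl(inode, acl, ACL_TYPE_ACCESS);
	posix_acl_release(acl);
}
return error;
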
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 503bfb0ed79b..44937f9fcf32 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -55,7 +55,7 @@ static inline int ext2_acl_count(size_t size)
55 55
56/* acl.c */ 56/* acl.c */
57extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); 57extern struct posix_acl *ext2_get_acl(struct inode *inode, int type);
58extern int ext2_acl_chmod (struct inode *); 58extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
59extern int ext2_init_acl (struct inode *, struct inode *); 59extern int ext2_init_acl (struct inode *, struct inode *);
60 60
61#else 61#else
@@ -63,12 +63,6 @@ extern int ext2_init_acl (struct inode *, struct inode *);
63#define ext2_get_acl NULL 63#define ext2_get_acl NULL
64#define ext2_set_acl NULL 64#define ext2_set_acl NULL
65 65
66static inline int
67ext2_acl_chmod (struct inode *inode)
68{
69 return 0;
70}
71
72static inline int ext2_init_acl (struct inode *inode, struct inode *dir) 66static inline int ext2_init_acl (struct inode *inode, struct inode *dir)
73{ 67{
74 return 0; 68 return 0;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a5b3a5db3120..44c36e590765 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -103,5 +103,6 @@ const struct inode_operations ext2_file_inode_operations = {
103#endif 103#endif
104 .setattr = ext2_setattr, 104 .setattr = ext2_setattr,
105 .get_acl = ext2_get_acl, 105 .get_acl = ext2_get_acl,
106 .set_acl = ext2_set_acl,
106 .fiemap = ext2_fiemap, 107 .fiemap = ext2_fiemap,
107}; 108};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 8a337640a46a..94ed36849b71 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1566,7 +1566,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1566 } 1566 }
1567 setattr_copy(inode, iattr); 1567 setattr_copy(inode, iattr);
1568 if (iattr->ia_valid & ATTR_MODE) 1568 if (iattr->ia_valid & ATTR_MODE)
1569 error = ext2_acl_chmod(inode); 1569 error = posix_acl_chmod(inode, inode->i_mode);
1570 mark_inode_dirty(inode); 1570 mark_inode_dirty(inode);
1571 1571
1572 return error; 1572 return error;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 256dd5f4c1c4..c268d0af1db9 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -421,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = {
421#endif 421#endif
422 .setattr = ext2_setattr, 422 .setattr = ext2_setattr,
423 .get_acl = ext2_get_acl, 423 .get_acl = ext2_get_acl,
424 .set_acl = ext2_set_acl,
424 .tmpfile = ext2_tmpfile, 425 .tmpfile = ext2_tmpfile,
425}; 426};
426 427
@@ -433,4 +434,5 @@ const struct inode_operations ext2_special_inode_operations = {
433#endif 434#endif
434 .setattr = ext2_setattr, 435 .setattr = ext2_setattr,
435 .get_acl = ext2_get_acl, 436 .get_acl = ext2_get_acl,
437 .set_acl = ext2_set_acl,
436}; 438};
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 2d7557db3ae8..91426141c33a 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -103,8 +103,8 @@ static struct mb_cache *ext2_xattr_cache;
103static const struct xattr_handler *ext2_xattr_handler_map[] = { 103static const struct xattr_handler *ext2_xattr_handler_map[] = {
104 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, 104 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
105#ifdef CONFIG_EXT2_FS_POSIX_ACL 105#ifdef CONFIG_EXT2_FS_POSIX_ACL
106 [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, 106 [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
107 [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext2_xattr_acl_default_handler, 107 [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
108#endif 108#endif
109 [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler, 109 [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler,
110#ifdef CONFIG_EXT2_FS_SECURITY 110#ifdef CONFIG_EXT2_FS_SECURITY
@@ -116,8 +116,8 @@ const struct xattr_handler *ext2_xattr_handlers[] = {
116 &ext2_xattr_user_handler, 116 &ext2_xattr_user_handler,
117 &ext2_xattr_trusted_handler, 117 &ext2_xattr_trusted_handler,
118#ifdef CONFIG_EXT2_FS_POSIX_ACL 118#ifdef CONFIG_EXT2_FS_POSIX_ACL
119 &ext2_xattr_acl_access_handler, 119 &posix_acl_access_xattr_handler,
120 &ext2_xattr_acl_default_handler, 120 &posix_acl_default_xattr_handler,
121#endif 121#endif
122#ifdef CONFIG_EXT2_FS_SECURITY 122#ifdef CONFIG_EXT2_FS_SECURITY
123 &ext2_xattr_security_handler, 123 &ext2_xattr_security_handler,
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 5e41cccff762..60edf298644e 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -57,8 +57,6 @@ struct ext2_xattr_entry {
57 57
58extern const struct xattr_handler ext2_xattr_user_handler; 58extern const struct xattr_handler ext2_xattr_user_handler;
59extern const struct xattr_handler ext2_xattr_trusted_handler; 59extern const struct xattr_handler ext2_xattr_trusted_handler;
60extern const struct xattr_handler ext2_xattr_acl_access_handler;
61extern const struct xattr_handler ext2_xattr_acl_default_handler;
62extern const struct xattr_handler ext2_xattr_security_handler; 60extern const struct xattr_handler ext2_xattr_security_handler;
63 61
64extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); 62extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index dbb5ad59a7fc..8bbaf5bcf982 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -145,13 +145,6 @@ ext3_get_acl(struct inode *inode, int type)
145 struct posix_acl *acl; 145 struct posix_acl *acl;
146 int retval; 146 int retval;
147 147
148 if (!test_opt(inode->i_sb, POSIX_ACL))
149 return NULL;
150
151 acl = get_cached_acl(inode, type);
152 if (acl != ACL_NOT_CACHED)
153 return acl;
154
155 switch (type) { 148 switch (type) {
156 case ACL_TYPE_ACCESS: 149 case ACL_TYPE_ACCESS:
157 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; 150 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -190,7 +183,7 @@ ext3_get_acl(struct inode *inode, int type)
190 * inode->i_mutex: down unless called from ext3_new_inode 183 * inode->i_mutex: down unless called from ext3_new_inode
191 */ 184 */
192static int 185static int
193ext3_set_acl(handle_t *handle, struct inode *inode, int type, 186__ext3_set_acl(handle_t *handle, struct inode *inode, int type,
194 struct posix_acl *acl) 187 struct posix_acl *acl)
195{ 188{
196 int name_index; 189 int name_index;
@@ -198,9 +191,6 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
198 size_t size = 0; 191 size_t size = 0;
199 int error; 192 int error;
200 193
201 if (S_ISLNK(inode->i_mode))
202 return -EOPNOTSUPP;
203
204 switch(type) { 194 switch(type) {
205 case ACL_TYPE_ACCESS: 195 case ACL_TYPE_ACCESS:
206 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; 196 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -243,204 +233,49 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
243 return error; 233 return error;
244} 234}
245 235
246/*
247 * Initialize the ACLs of a new inode. Called from ext3_new_inode.
248 *
249 * dir->i_mutex: down
250 * inode->i_mutex: up (access to inode is still exclusive)
251 */
252int 236int
253ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) 237ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
254{ 238{
255 struct posix_acl *acl = NULL;
256 int error = 0;
257
258 if (!S_ISLNK(inode->i_mode)) {
259 if (test_opt(dir->i_sb, POSIX_ACL)) {
260 acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT);
261 if (IS_ERR(acl))
262 return PTR_ERR(acl);
263 }
264 if (!acl)
265 inode->i_mode &= ~current_umask();
266 }
267 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
268 if (S_ISDIR(inode->i_mode)) {
269 error = ext3_set_acl(handle, inode,
270 ACL_TYPE_DEFAULT, acl);
271 if (error)
272 goto cleanup;
273 }
274 error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
275 if (error < 0)
276 return error;
277
278 if (error > 0) {
279 /* This is an extended ACL */
280 error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
281 }
282 }
283cleanup:
284 posix_acl_release(acl);
285 return error;
286}
287
288/*
289 * Does chmod for an inode that may have an Access Control List. The
290 * inode->i_mode field must be updated to the desired value by the caller
291 * before calling this function.
292 * Returns 0 on success, or a negative error number.
293 *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
-int
-ext3_acl_chmod(struct inode *inode)
-{
-	struct posix_acl *acl;
 	handle_t *handle;
-	int retries = 0;
-	int error;
+	int error, retries = 0;
 
-	if (S_ISLNK(inode->i_mode))
-		return -EOPNOTSUPP;
-	if (!test_opt(inode->i_sb, POSIX_ACL))
-		return 0;
-	acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
-	if (IS_ERR(acl) || !acl)
-		return PTR_ERR(acl);
-	error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
-	if (error)
-		return error;
 retry:
-	handle = ext3_journal_start(inode,
-			EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle)) {
-		error = PTR_ERR(handle);
-		ext3_std_error(inode->i_sb, error);
-		goto out;
-	}
-	error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	error = __ext3_set_acl(handle, inode, type, acl);
 	ext3_journal_stop(handle);
-	if (error == -ENOSPC &&
-	    ext3_should_retry_alloc(inode->i_sb, &retries))
+	if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
-out:
-	posix_acl_release(acl);
 	return error;
 }
 
 /*
- * Extended attribute handlers
+ * Initialize the ACLs of a new inode. Called from ext3_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
  */
-static size_t
-ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
-			   const char *name, size_t name_len, int type)
-{
-	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
-	if (!test_opt(dentry->d_sb, POSIX_ACL))
-		return 0;
-	if (list && size <= list_len)
-		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
-	return size;
-}
-
-static size_t
-ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
-			    const char *name, size_t name_len, int type)
-{
-	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
-	if (!test_opt(dentry->d_sb, POSIX_ACL))
-		return 0;
-	if (list && size <= list_len)
-		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
-	return size;
-}
-
-static int
-ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
-		   size_t size, int type)
+int
+ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 {
-	struct posix_acl *acl;
+	struct posix_acl *default_acl, *acl;
 	int error;
 
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!test_opt(dentry->d_sb, POSIX_ACL))
-		return -EOPNOTSUPP;
-
-	acl = ext3_get_acl(dentry->d_inode, type);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (acl == NULL)
-		return -ENODATA;
-	error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
-	posix_acl_release(acl);
-
-	return error;
-}
-
-static int
-ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
-		   size_t size, int flags, int type)
-{
-	struct inode *inode = dentry->d_inode;
-	handle_t *handle;
-	struct posix_acl *acl;
-	int error, retries = 0;
-
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!test_opt(inode->i_sb, POSIX_ACL))
-		return -EOPNOTSUPP;
-	if (!inode_owner_or_capable(inode))
-		return -EPERM;
-
-	if (value) {
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-		else if (acl) {
-			error = posix_acl_valid(acl);
-			if (error)
-				goto release_and_out;
-		}
-	} else
-		acl = NULL;
-
-retry:
-	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-	error = ext3_set_acl(handle, inode, type, acl);
-	ext3_journal_stop(handle);
-	if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
+	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	if (error)
+		return error;
 
-release_and_out:
-	posix_acl_release(acl);
+	if (default_acl) {
+		error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+				       default_acl);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		if (!error)
+			error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS,
+					       acl);
+		posix_acl_release(acl);
+	}
 	return error;
 }
-
-const struct xattr_handler ext3_xattr_acl_access_handler = {
-	.prefix	= POSIX_ACL_XATTR_ACCESS,
-	.flags	= ACL_TYPE_ACCESS,
-	.list	= ext3_xattr_list_acl_access,
-	.get	= ext3_xattr_get_acl,
-	.set	= ext3_xattr_set_acl,
-};
-
-const struct xattr_handler ext3_xattr_acl_default_handler = {
-	.prefix	= POSIX_ACL_XATTR_DEFAULT,
-	.flags	= ACL_TYPE_DEFAULT,
-	.list	= ext3_xattr_list_acl_default,
-	.get	= ext3_xattr_get_acl,
-	.set	= ext3_xattr_set_acl,
-};
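
A note on the ACL rework above: after this change the filesystem no longer carries its own "system.posix_acl_*" xattr handlers or chmod propagation; it only supplies ->get_acl and ->set_acl, and the shared posix_acl_access_xattr_handler / posix_acl_default_xattr_handler plus posix_acl_chmod() do the rest. A minimal sketch of the wiring for a hypothetical filesystem (the myfs_* names are illustrative, not from this commit):

	#include <linux/fs.h>
	#include <linux/posix_acl.h>
	#include <linux/posix_acl_xattr.h>
	#include <linux/xattr.h>

	static struct posix_acl *myfs_get_acl(struct inode *inode, int type);
	static int myfs_set_acl(struct inode *inode, struct posix_acl *acl,
				int type);

	/* The generic xattr handlers translate get/setxattr on
	 * "system.posix_acl_access"/"system.posix_acl_default" into the
	 * two methods below, handling caching and validation centrally. */
	const struct xattr_handler *myfs_xattr_handlers[] = {
		&posix_acl_access_xattr_handler,
		&posix_acl_default_xattr_handler,
		NULL,
	};

	const struct inode_operations myfs_file_inode_operations = {
		.get_acl = myfs_get_acl,	/* read ACL from backing store */
		.set_acl = myfs_set_acl,	/* persist ACL, journal as needed */
	};

chmod() support then comes from the setattr path calling posix_acl_chmod(inode, inode->i_mode), which rewrites the access ACL through ->set_acl; this is what replaces the per-filesystem *_acl_chmod() helpers deleted here.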
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index dbc921e458c5..ea1c69edab9e 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -55,18 +55,13 @@ static inline int ext3_acl_count(size_t size)
 
 /* acl.c */
 extern struct posix_acl *ext3_get_acl(struct inode *inode, int type);
-extern int ext3_acl_chmod (struct inode *);
+extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT3_FS_POSIX_ACL */
 #include <linux/sched.h>
 #define ext3_get_acl NULL
-
-static inline int
-ext3_acl_chmod(struct inode *inode)
-{
-	return 0;
-}
+#define ext3_set_acl NULL
 
 static inline int
 ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index bafdd48eefde..e66e4808719f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -309,43 +309,17 @@ struct fname {
  */
 static void free_rb_tree_fname(struct rb_root *root)
 {
-	struct rb_node	*n = root->rb_node;
-	struct rb_node	*parent;
-	struct fname	*fname;
-
-	while (n) {
-		/* Do the node's children first */
-		if (n->rb_left) {
-			n = n->rb_left;
-			continue;
-		}
-		if (n->rb_right) {
-			n = n->rb_right;
-			continue;
-		}
-		/*
-		 * The node has no children; free it, and then zero
-		 * out parent's link to it.  Finally go to the
-		 * beginning of the loop and try to free the parent
-		 * node.
-		 */
-		parent = rb_parent(n);
-		fname = rb_entry(n, struct fname, rb_hash);
-		while (fname) {
-			struct fname * old = fname;
+	struct fname *fname, *next;
+
+	rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
+		do {
+			struct fname *old = fname;
 			fname = fname->next;
-			kfree (old);
-		}
-		if (!parent)
-			*root = RB_ROOT;
-		else if (parent->rb_left == n)
-			parent->rb_left = NULL;
-		else if (parent->rb_right == n)
-			parent->rb_right = NULL;
-		n = parent;
-	}
-}
+			kfree(old);
+		} while (fname);
 
+	*root = RB_ROOT;
+}
 
 static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
 							   loff_t pos)
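
The conversion above (repeated for ext4 further down) replaces some thirty lines of hand-rolled destructive traversal with the rbtree_postorder_for_each_entry_safe() helper. A minimal kernel-style sketch of the idiom, assuming a simple entry type (struct item is illustrative):

	#include <linux/rbtree.h>
	#include <linux/slab.h>

	struct item {
		struct rb_node node;
		/* payload ... */
	};

	static void free_all(struct rb_root *root)
	{
		struct item *it, *next;

		/* Children are visited before their parent, and a node is
		 * never touched again after its entry is handed to the loop
		 * body, so kfree() here is safe. */
		rbtree_postorder_for_each_entry_safe(it, next, root, node)
			kfree(it);

		/* The iterator neither rebalances nor empties the tree. */
		*root = RB_ROOT;
	}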
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 25cb413277e9..aad05311392a 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -75,6 +75,7 @@ const struct inode_operations ext3_file_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.get_acl	= ext3_get_acl,
+	.set_acl	= ext3_set_acl,
 	.fiemap		= ext3_fiemap,
 };
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2bd85486b879..384b6ebb655f 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3365,7 +3365,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 	mark_inode_dirty(inode);
 
 	if (ia_valid & ATTR_MODE)
-		rc = ext3_acl_chmod(inode);
+		rc = posix_acl_chmod(inode, inode->i_mode);
 
 err_out:
 	ext3_std_error(inode->i_sb, error);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index f8cde46de9cd..f197736dccfa 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2569,6 +2569,7 @@ const struct inode_operations ext3_dir_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.get_acl	= ext3_get_acl,
+	.set_acl	= ext3_set_acl,
 };
 
 const struct inode_operations ext3_special_inode_operations = {
@@ -2580,4 +2581,5 @@ const struct inode_operations ext3_special_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.get_acl	= ext3_get_acl,
+	.set_acl	= ext3_set_acl,
 };
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index b1fc96383e08..c6874be6d58b 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -102,8 +102,8 @@ static struct mb_cache *ext3_xattr_cache;
 static const struct xattr_handler *ext3_xattr_handler_map[] = {
 	[EXT3_XATTR_INDEX_USER]		     = &ext3_xattr_user_handler,
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
-	[EXT3_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext3_xattr_acl_access_handler,
-	[EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler,
+	[EXT3_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler,
+	[EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
 #endif
 	[EXT3_XATTR_INDEX_TRUSTED]	     = &ext3_xattr_trusted_handler,
 #ifdef CONFIG_EXT3_FS_SECURITY
@@ -115,8 +115,8 @@ const struct xattr_handler *ext3_xattr_handlers[] = {
 	&ext3_xattr_user_handler,
 	&ext3_xattr_trusted_handler,
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
-	&ext3_xattr_acl_access_handler,
-	&ext3_xattr_acl_default_handler,
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
 #endif
 #ifdef CONFIG_EXT3_FS_SECURITY
 	&ext3_xattr_security_handler,
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 2be4f69bfa64..32e93ebf8031 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -60,8 +60,6 @@ struct ext3_xattr_entry {
 
 extern const struct xattr_handler ext3_xattr_user_handler;
 extern const struct xattr_handler ext3_xattr_trusted_handler;
-extern const struct xattr_handler ext3_xattr_acl_access_handler;
-extern const struct xattr_handler ext3_xattr_acl_default_handler;
 extern const struct xattr_handler ext3_xattr_security_handler;
 
 extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 39a54a0e9fe4..d40c8dbbb0d6 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -152,13 +152,6 @@ ext4_get_acl(struct inode *inode, int type)
 	struct posix_acl *acl;
 	int retval;
 
-	if (!test_opt(inode->i_sb, POSIX_ACL))
-		return NULL;
-
-	acl = get_cached_acl(inode, type);
-	if (acl != ACL_NOT_CACHED)
-		return acl;
-
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -196,7 +189,7 @@ ext4_get_acl(struct inode *inode, int type)
  * inode->i_mutex: down unless called from ext4_new_inode
  */
 static int
-ext4_set_acl(handle_t *handle, struct inode *inode, int type,
+__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	     struct posix_acl *acl)
 {
 	int name_index;
@@ -204,9 +197,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	size_t size = 0;
 	int error;
 
-	if (S_ISLNK(inode->i_mode))
-		return -EOPNOTSUPP;
-
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -248,208 +238,51 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	return error;
 }
 
-/*
- * Initialize the ACLs of a new inode. Called from ext4_new_inode.
- *
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
- */
 int
-ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
-	struct posix_acl *acl = NULL;
-	int error = 0;
-
-	if (!S_ISLNK(inode->i_mode)) {
-		if (test_opt(dir->i_sb, POSIX_ACL)) {
-			acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
-			if (IS_ERR(acl))
-				return PTR_ERR(acl);
-		}
-		if (!acl)
-			inode->i_mode &= ~current_umask();
-	}
-	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
-		if (S_ISDIR(inode->i_mode)) {
-			error = ext4_set_acl(handle, inode,
-					     ACL_TYPE_DEFAULT, acl);
-			if (error)
-				goto cleanup;
-		}
-		error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
-		if (error < 0)
-			return error;
-
-		if (error > 0) {
-			/* This is an extended ACL */
-			error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
-		}
-	}
-cleanup:
-	posix_acl_release(acl);
-	return error;
-}
-
-/*
- * Does chmod for an inode that may have an Access Control List. The
- * inode->i_mode field must be updated to the desired value by the caller
- * before calling this function.
- * Returns 0 on success, or a negative error number.
- *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
-int
-ext4_acl_chmod(struct inode *inode)
-{
-	struct posix_acl *acl;
 	handle_t *handle;
-	int retries = 0;
-	int error;
-
+	int error, retries = 0;
 
-	if (S_ISLNK(inode->i_mode))
-		return -EOPNOTSUPP;
-	if (!test_opt(inode->i_sb, POSIX_ACL))
-		return 0;
-	acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
-	if (IS_ERR(acl) || !acl)
-		return PTR_ERR(acl);
-	error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
-	if (error)
-		return error;
 retry:
 	handle = ext4_journal_start(inode, EXT4_HT_XATTR,
 				    ext4_jbd2_credits_xattr(inode));
-	if (IS_ERR(handle)) {
-		error = PTR_ERR(handle);
-		ext4_std_error(inode->i_sb, error);
-		goto out;
-	}
-	error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	error = __ext4_set_acl(handle, inode, type, acl);
 	ext4_journal_stop(handle);
-	if (error == -ENOSPC &&
-	    ext4_should_retry_alloc(inode->i_sb, &retries))
+	if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
-out:
-	posix_acl_release(acl);
 	return error;
 }
 
 /*
- * Extended attribute handlers
+ * Initialize the ACLs of a new inode. Called from ext4_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
  */
-static size_t
-ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
-			   const char *name, size_t name_len, int type)
-{
-	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
-	if (!test_opt(dentry->d_sb, POSIX_ACL))
-		return 0;
-	if (list && size <= list_len)
-		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
-	return size;
-}
-
-static size_t
-ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
-			    const char *name, size_t name_len, int type)
-{
-	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
-	if (!test_opt(dentry->d_sb, POSIX_ACL))
-		return 0;
-	if (list && size <= list_len)
-		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
-	return size;
-}
-
-static int
-ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
-		   size_t size, int type)
+int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 {
-	struct posix_acl *acl;
+	struct posix_acl *default_acl, *acl;
 	int error;
 
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!test_opt(dentry->d_sb, POSIX_ACL))
-		return -EOPNOTSUPP;
-
-	acl = ext4_get_acl(dentry->d_inode, type);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (acl == NULL)
-		return -ENODATA;
-	error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
-	posix_acl_release(acl);
-
-	return error;
-}
-
-static int
-ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
-		   size_t size, int flags, int type)
-{
-	struct inode *inode = dentry->d_inode;
-	handle_t *handle;
-	struct posix_acl *acl;
-	int error, retries = 0;
-
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!test_opt(inode->i_sb, POSIX_ACL))
-		return -EOPNOTSUPP;
-	if (!inode_owner_or_capable(inode))
-		return -EPERM;
-
-	if (value) {
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-		else if (acl) {
-			error = posix_acl_valid(acl);
-			if (error)
-				goto release_and_out;
-		}
-	} else
-		acl = NULL;
+	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+	if (error)
+		return error;
 
-retry:
-	handle = ext4_journal_start(inode, EXT4_HT_XATTR,
-				    ext4_jbd2_credits_xattr(inode));
-	if (IS_ERR(handle)) {
-		error = PTR_ERR(handle);
-		goto release_and_out;
+	if (default_acl) {
+		error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+				       default_acl);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		if (!error)
+			error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
+					       acl);
+		posix_acl_release(acl);
 	}
-	error = ext4_set_acl(handle, inode, type, acl);
-	ext4_journal_stop(handle);
-	if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-
-release_and_out:
-	posix_acl_release(acl);
 	return error;
 }
-
-const struct xattr_handler ext4_xattr_acl_access_handler = {
-	.prefix	= POSIX_ACL_XATTR_ACCESS,
-	.flags	= ACL_TYPE_ACCESS,
-	.list	= ext4_xattr_list_acl_access,
-	.get	= ext4_xattr_get_acl,
-	.set	= ext4_xattr_set_acl,
-};
-
-const struct xattr_handler ext4_xattr_acl_default_handler = {
-	.prefix	= POSIX_ACL_XATTR_DEFAULT,
-	.flags	= ACL_TYPE_DEFAULT,
-	.list	= ext4_xattr_list_acl_default,
-	.get	= ext4_xattr_get_acl,
-	.set	= ext4_xattr_set_acl,
-};
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 18cb39ed7c7b..da2c79577d72 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -55,18 +55,13 @@ static inline int ext4_acl_count(size_t size)
 
 /* acl.c */
 struct posix_acl *ext4_get_acl(struct inode *inode, int type);
-extern int ext4_acl_chmod(struct inode *);
+int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 
 #else  /* CONFIG_EXT4_FS_POSIX_ACL */
 #include <linux/sched.h>
 #define ext4_get_acl NULL
-
-static inline int
-ext4_acl_chmod(struct inode *inode)
-{
-	return 0;
-}
+#define ext4_set_acl NULL
 
 static inline int
 ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3f11656bd72e..41eb9dcfac7e 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -180,37 +180,12 @@ int ext4_setup_system_zone(struct super_block *sb)
 /* Called when the filesystem is unmounted */
 void ext4_release_system_zone(struct super_block *sb)
 {
-	struct rb_node	*n = EXT4_SB(sb)->system_blks.rb_node;
-	struct rb_node	*parent;
-	struct ext4_system_zone	*entry;
+	struct ext4_system_zone	*entry, *n;
 
-	while (n) {
-		/* Do the node's children first */
-		if (n->rb_left) {
-			n = n->rb_left;
-			continue;
-		}
-		if (n->rb_right) {
-			n = n->rb_right;
-			continue;
-		}
-		/*
-		 * The node has no children; free it, and then zero
-		 * out parent's link to it.  Finally go to the
-		 * beginning of the loop and try to free the parent
-		 * node.
-		 */
-		parent = rb_parent(n);
-		entry = rb_entry(n, struct ext4_system_zone, node);
+	rbtree_postorder_for_each_entry_safe(entry, n,
+			&EXT4_SB(sb)->system_blks, node)
 		kmem_cache_free(ext4_system_zone_cachep, entry);
-		if (!parent)
-			EXT4_SB(sb)->system_blks = RB_ROOT;
-		else if (parent->rb_left == n)
-			parent->rb_left = NULL;
-		else if (parent->rb_right == n)
-			parent->rb_right = NULL;
-		n = parent;
-	}
+
 	EXT4_SB(sb)->system_blks = RB_ROOT;
 }
 
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 680bb3388919..d638c57e996e 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -353,41 +353,16 @@ struct fname {
  */
 static void free_rb_tree_fname(struct rb_root *root)
 {
-	struct rb_node	*n = root->rb_node;
-	struct rb_node	*parent;
-	struct fname	*fname;
-
-	while (n) {
-		/* Do the node's children first */
-		if (n->rb_left) {
-			n = n->rb_left;
-			continue;
-		}
-		if (n->rb_right) {
-			n = n->rb_right;
-			continue;
-		}
-		/*
-		 * The node has no children; free it, and then zero
-		 * out parent's link to it.  Finally go to the
-		 * beginning of the loop and try to free the parent
-		 * node.
-		 */
-		parent = rb_parent(n);
-		fname = rb_entry(n, struct fname, rb_hash);
+	struct fname *fname, *next;
+
+	rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
 		while (fname) {
 			struct fname *old = fname;
 			fname = fname->next;
 			kfree(old);
 		}
-		if (!parent)
-			*root = RB_ROOT;
-		else if (parent->rb_left == n)
-			parent->rb_left = NULL;
-		else if (parent->rb_right == n)
-			parent->rb_right = NULL;
-		n = parent;
-	}
-}
 
+	*root = RB_ROOT;
+}
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ece55565b9cd..d3a534fdc5ff 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -771,6 +771,8 @@ do {								\
 	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))	\
 		(einode)->xtime.tv_sec = 			\
 			(signed)le32_to_cpu((raw_inode)->xtime); \
+	else							\
+		(einode)->xtime.tv_sec = 0;			\
 	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
 		ext4_decode_extra_time(&(einode)->xtime,	\
 				       raw_inode->xtime ## _extra); \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4410cc3d6ee2..74bc2d549c58 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3477,7 +3477,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	WARN_ON(map->m_lblk < ee_block);
 	/*
 	 * It is safe to convert extent to initialized via explicit
-	 * zeroout only if extent is fully insde i_size or new_size.
+	 * zeroout only if extent is fully inside i_size or new_size.
 	 */
 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
 
@@ -3906,6 +3906,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 	} else
 		err = ret;
 	map->m_flags |= EXT4_MAP_MAPPED;
+	map->m_pblk = newblock;
 	if (allocated > map->m_len)
 		allocated = map->m_len;
 	map->m_len = allocated;
@@ -4218,7 +4219,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 */
 	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
 	newex.ee_block = cpu_to_le32(map->m_lblk);
-	cluster_offset = EXT4_LBLK_CMASK(sbi, map->m_lblk);
+	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 
 	/*
 	 * If we are doing bigalloc, check to see if the extent returned
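
The last hunk fixes a thinko: cluster_offset needs the block's offset inside its bigalloc cluster, but EXT4_LBLK_CMASK yields the first block of the cluster instead. A stand-alone illustration, with the macro bodies paraphrased from fs/ext4/ext4.h (where the ratio is sbi->s_cluster_ratio, blocks per cluster):

	#include <stdio.h>

	#define LBLK_CMASK(ratio, lblk) ((lblk) & ~((unsigned)(ratio) - 1)) /* cluster start */
	#define LBLK_COFF(ratio, lblk)  ((lblk) &  ((unsigned)(ratio) - 1)) /* offset in cluster */

	int main(void)
	{
		unsigned ratio = 16, lblk = 35;	/* bigalloc: 16 blocks per cluster */

		printf("cluster start = %u\n", LBLK_CMASK(ratio, lblk)); /* 32 */
		printf("offset        = %u\n", LBLK_COFF(ratio, lblk));  /* 3  */
		return 0;
	}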
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3da21945ff1f..1a5073959f32 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -152,7 +152,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
 	if (ret > 0) {
 		ssize_t err;
 
-		err = generic_write_sync(file, pos, ret);
+		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
@@ -617,6 +617,7 @@ const struct inode_operations ext4_file_inode_operations = {
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 	.get_acl	= ext4_get_acl,
+	.set_acl	= ext4_set_acl,
 	.fiemap		= ext4_fiemap,
 };
 
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index bae987549dc3..82edf5b93352 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -849,15 +849,16 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
 	handle_t *handle;
 	struct page *page;
 	struct ext4_iloc iloc;
+	int retries;
 
 	ret = ext4_get_inode_loc(inode, &iloc);
 	if (ret)
 		return ret;
 
+retry_journal:
 	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
-		handle = NULL;
 		goto out;
 	}
 
@@ -867,7 +868,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
 	if (inline_size >= pos + len) {
 		ret = ext4_prepare_inline_data(handle, inode, pos + len);
 		if (ret && ret != -ENOSPC)
-			goto out;
+			goto out_journal;
 	}
 
 	if (ret == -ENOSPC) {
@@ -875,6 +876,10 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
 						 inode,
 						 flags,
 						 fsdata);
+		ext4_journal_stop(handle);
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry_journal;
 		goto out;
 	}
 
@@ -887,7 +892,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
 	page = grab_cache_page_write_begin(mapping, 0, flags);
 	if (!page) {
 		ret = -ENOMEM;
-		goto out;
+		goto out_journal;
 	}
 
 	down_read(&EXT4_I(inode)->xattr_sem);
@@ -904,16 +909,15 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
 
 	up_read(&EXT4_I(inode)->xattr_sem);
 	*pagep = page;
-	handle = NULL;
 	brelse(iloc.bh);
 	return 1;
 out_release_page:
 	up_read(&EXT4_I(inode)->xattr_sem);
 	unlock_page(page);
 	page_cache_release(page);
+out_journal:
+	ext4_journal_stop(handle);
 out:
-	if (handle)
-		ext4_journal_stop(handle);
 	brelse(iloc.bh);
 	return ret;
 }
@@ -1837,7 +1841,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle,
 {
 	int error;
 	struct ext4_xattr_entry *entry;
-	struct ext4_xattr_ibody_header *header;
 	struct ext4_inode *raw_inode;
 	struct ext4_iloc iloc;
 
@@ -1846,7 +1849,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle,
 		return error;
 
 	raw_inode = ext4_raw_inode(&iloc);
-	header = IHDR(inode, raw_inode);
 	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
 					    EXT4_I(inode)->i_inline_off);
 	if (EXT4_XATTR_LEN(entry->e_name_len) +
@@ -1924,9 +1926,11 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 	}
 
 	/* Clear the content within i_blocks. */
-	if (i_size < EXT4_MIN_INLINE_DATA_SIZE)
-		memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0,
-				EXT4_MIN_INLINE_DATA_SIZE - i_size);
+	if (i_size < EXT4_MIN_INLINE_DATA_SIZE) {
+		void *p = (void *) ext4_raw_inode(&is.iloc)->i_block;
+		memset(p + i_size, 0,
+		       EXT4_MIN_INLINE_DATA_SIZE - i_size);
+	}
 
 	EXT4_I(inode)->i_inline_size = i_size <
 			EXT4_MIN_INLINE_DATA_SIZE ?
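
The ext4_da_write_inline_data_begin() changes above restructure the error paths so the journal handle is stopped exactly once on every exit, and add the standard ENOSPC retry. A kernel-style fragment of the idiom, not compilable on its own (do_prepare() is a hypothetical placeholder for the work done under the handle):

	int retries = 0;
	handle_t *handle;
	int ret;

retry_journal:
	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	ret = do_prepare(handle, inode);	/* hypothetical */
	if (ret) {
		ext4_journal_stop(handle);	/* never leak the handle */
		if (ret == -ENOSPC &&
		    ext4_should_retry_alloc(inode->i_sb, &retries))
			goto retry_journal;	/* let the journal commit, retry */
		return ret;
	}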
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 61d49ff22c81..24bfd7ff3049 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,7 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/aio.h>
+#include <linux/bitops.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -144,8 +145,8 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
  */
 static int ext4_inode_is_fast_symlink(struct inode *inode)
 {
 	int ea_blocks = EXT4_I(inode)->i_file_acl ?
-		(inode->i_sb->s_blocksize >> 9) : 0;
+		EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
 
 	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
 }
@@ -1772,7 +1773,7 @@ static int __ext4_journalled_writepage(struct page *page,
 		ret = err;
 
 	if (!ext4_has_inline_data(inode))
-		ext4_walk_page_buffers(handle, page_bufs, 0, len,
+		ext4_walk_page_buffers(NULL, page_bufs, 0, len,
 				       NULL, bput_one);
 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 out:
@@ -3501,11 +3502,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;
 
-	if (EXT4_SB(sb)->s_cluster_ratio > 1) {
-		/* TODO: Add support for bigalloc file systems */
-		return -EOPNOTSUPP;
-	}
-
 	trace_ext4_punch_hole(inode, offset, length);
 
 	/*
@@ -3926,18 +3922,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 void ext4_set_inode_flags(struct inode *inode)
 {
 	unsigned int flags = EXT4_I(inode)->i_flags;
+	unsigned int new_fl = 0;
 
-	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
 	if (flags & EXT4_SYNC_FL)
-		inode->i_flags |= S_SYNC;
+		new_fl |= S_SYNC;
 	if (flags & EXT4_APPEND_FL)
-		inode->i_flags |= S_APPEND;
+		new_fl |= S_APPEND;
 	if (flags & EXT4_IMMUTABLE_FL)
-		inode->i_flags |= S_IMMUTABLE;
+		new_fl |= S_IMMUTABLE;
 	if (flags & EXT4_NOATIME_FL)
-		inode->i_flags |= S_NOATIME;
+		new_fl |= S_NOATIME;
 	if (flags & EXT4_DIRSYNC_FL)
-		inode->i_flags |= S_DIRSYNC;
+		new_fl |= S_DIRSYNC;
+	set_mask_bits(&inode->i_flags,
+		      S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl);
 }
 
 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4586,6 +4584,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		if (attr->ia_size > sbi->s_bitmap_maxbytes)
 			return -EFBIG;
 	}
+
+	if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+		inode_inc_iversion(inode);
+
 	if (S_ISREG(inode->i_mode) &&
 	    (attr->ia_size < inode->i_size)) {
 		if (ext4_should_order_data(inode)) {
@@ -4663,7 +4665,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		ext4_orphan_del(NULL, inode);
 
 	if (!rc && (ia_valid & ATTR_MODE))
-		rc = ext4_acl_chmod(inode);
+		rc = posix_acl_chmod(inode, inode->i_mode);
 
 err_out:
 	ext4_std_error(inode->i_sb, error);
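
ext4_set_inode_flags() above is rebuilt around set_mask_bits() (hence the new <linux/bitops.h> include): the old clear-then-set sequence could race with a concurrent i_flags update and drop bits. A user-space model of the helper's semantics, assuming the kernel's cmpxchg-loop behavior (replace the masked bits atomically, return the old value):

	#include <stdatomic.h>
	#include <stdio.h>

	static unsigned long set_mask_bits(_Atomic unsigned long *p,
					   unsigned long mask, unsigned long bits)
	{
		unsigned long old = atomic_load(p), new;

		do {
			/* keep bits outside the mask, replace the rest */
			new = (old & ~mask) | bits;
		} while (!atomic_compare_exchange_weak(p, &old, new));
		return old;
	}

	int main(void)
	{
		_Atomic unsigned long flags = 0xf0;

		set_mask_bits(&flags, 0x0f, 0x05);	/* only the low nibble changes */
		printf("%#lx\n", atomic_load(&flags));	/* prints 0xf5 */
		return 0;
	}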
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 60589b60e9b0..a2a837f00407 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -101,9 +101,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	handle_t *handle;
 	int err;
 	struct inode *inode_bl;
-	struct ext4_inode_info *ei;
 	struct ext4_inode_info *ei_bl;
-	struct ext4_sb_info *sbi;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
 		err = -EINVAL;
@@ -115,9 +114,6 @@ static long swap_inode_boot_loader(struct super_block *sb,
 		goto swap_boot_out;
 	}
 
-	sbi = EXT4_SB(sb);
-	ei = EXT4_I(inode);
-
 	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
 	if (IS_ERR(inode_bl)) {
 		err = PTR_ERR(inode_bl);
@@ -144,7 +140,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
 	if (IS_ERR(handle)) {
 		err = -EINVAL;
-		goto swap_boot_out;
+		goto journal_err_out;
 	}
 
 	/* Protect extent tree against block allocations via delalloc */
@@ -202,6 +198,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 
 	ext4_double_up_write_data_sem(inode, inode_bl);
 
+journal_err_out:
 	ext4_inode_resume_unlocked_dio(inode);
 	ext4_inode_resume_unlocked_dio(inode_bl);
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5a0408d7b114..d050e043e884 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1425,9 +1425,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 		return ERR_PTR(-EIO);
 	}
 	if (unlikely(ino == dir->i_ino)) {
-		EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir",
-				 dentry->d_name.len,
-				 dentry->d_name.name);
+		EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
+				 dentry);
 		return ERR_PTR(-EIO);
 	}
 	inode = ext4_iget(dir->i_sb, ino);
@@ -3225,6 +3224,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 	.get_acl	= ext4_get_acl,
+	.set_acl	= ext4_set_acl,
 	.fiemap		= ext4_fiemap,
 };
 
@@ -3235,4 +3235,5 @@ const struct inode_operations ext4_special_inode_operations = {
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 	.get_acl	= ext4_get_acl,
+	.set_acl	= ext4_set_acl,
 };
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index d488f80ee32d..ab95508e3d40 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -65,9 +65,9 @@ static void ext4_finish_bio(struct bio *bio)
 {
 	int i;
 	int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec;
 
-	for (i = 0; i < bio->bi_vcnt; i++) {
-		struct bio_vec *bvec = &bio->bi_io_vec[i];
+	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
 		struct buffer_head *bh, *head;
 		unsigned bio_start = bvec->bv_offset;
@@ -298,7 +298,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
 static void ext4_end_bio(struct bio *bio, int error)
 {
 	ext4_io_end_t *io_end = bio->bi_private;
-	sector_t bi_sector = bio->bi_sector;
+	sector_t bi_sector = bio->bi_iter.bi_sector;
 
 	BUG_ON(!io_end);
 	bio->bi_end_io = NULL;
@@ -366,7 +366,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
 	bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
 	if (!bio)
 		return -ENOMEM;
-	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
 	bio->bi_end_io = ext4_end_bio;
 	bio->bi_private = ext4_get_io_end(io->io_end);
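
These page-io.c hunks are ext4's share of the 3.14 immutable-biovec conversion: completion paths stop indexing bi_io_vec by hand and use bio_for_each_segment_all(), and the current sector moves from bio->bi_sector into the iterator at bio->bi_iter.bi_sector. A kernel-style fragment of the post-conversion idiom (not a compilable unit on its own):

	#include <linux/bio.h>

	static void count_bio_bytes(struct bio *bio)
	{
		struct bio_vec *bvec;
		unsigned bytes = 0;
		int i;

		/* walks every segment of the bio, valid after completion */
		bio_for_each_segment_all(bvec, bio, i)
			bytes += bvec->bv_len;

		pr_debug("bio at sector %llu carried %u bytes\n",
			 (unsigned long long)bio->bi_iter.bi_sector, bytes);
	}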
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c5adbb318a90..f3b84cd9de56 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -243,6 +243,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
 	ext4_group_t group;
 	ext4_group_t last_group;
 	unsigned overhead;
+	__u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0;
 
 	BUG_ON(flex_gd->count == 0 || group_data == NULL);
 
@@ -266,7 +267,7 @@ next_group:
 	src_group++;
 	for (; src_group <= last_group; src_group++) {
 		overhead = ext4_group_overhead_blocks(sb, src_group);
-		if (overhead != 0)
+		if (overhead == 0)
 			last_blk += group_data[src_group - group].blocks_count;
 		else
 			break;
@@ -280,8 +281,7 @@ next_group:
 		group = ext4_get_group_number(sb, start_blk - 1);
 		group -= group_data[0].group;
 		group_data[group].free_blocks_count--;
-		if (flexbg_size > 1)
-			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+		flex_gd->bg_flags[group] &= uninit_mask;
 	}
 
 	/* Allocate inode bitmaps */
@@ -292,22 +292,30 @@ next_group:
 		group = ext4_get_group_number(sb, start_blk - 1);
 		group -= group_data[0].group;
 		group_data[group].free_blocks_count--;
-		if (flexbg_size > 1)
-			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+		flex_gd->bg_flags[group] &= uninit_mask;
 	}
 
 	/* Allocate inode tables */
 	for (; it_index < flex_gd->count; it_index++) {
-		if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
+		unsigned int itb = EXT4_SB(sb)->s_itb_per_group;
+		ext4_fsblk_t next_group_start;
+
+		if (start_blk + itb > last_blk)
 			goto next_group;
 		group_data[it_index].inode_table = start_blk;
-		group = ext4_get_group_number(sb, start_blk - 1);
+		group = ext4_get_group_number(sb, start_blk);
+		next_group_start = ext4_group_first_block_no(sb, group + 1);
 		group -= group_data[0].group;
-		group_data[group].free_blocks_count -=
-					EXT4_SB(sb)->s_itb_per_group;
-		if (flexbg_size > 1)
-			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
 
+		if (start_blk + itb > next_group_start) {
+			flex_gd->bg_flags[group + 1] &= uninit_mask;
+			overhead = start_blk + itb - next_group_start;
+			group_data[group + 1].free_blocks_count -= overhead;
+			itb -= overhead;
+		}
+
+		group_data[group].free_blocks_count -= itb;
+		flex_gd->bg_flags[group] &= uninit_mask;
 		start_blk += EXT4_SB(sb)->s_itb_per_group;
 	}
 
@@ -401,7 +409,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
 		start = ext4_group_first_block_no(sb, group);
 		group -= flex_gd->groups[0].group;
 
-		count2 = sb->s_blocksize * 8 - (block - start);
+		count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start);
 		if (count2 > count)
 			count2 = count;
 
@@ -620,7 +628,7 @@ handle_ib:
 			if (err)
 				goto out;
 			count = group_table_count[j];
-			start = group_data[i].block_bitmap;
+			start = (&group_data[i].block_bitmap)[j];
 			block = start;
 		}
 
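
A side note on the uninit_mask introduced above: computing (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0 once turns every conditional flag-clear into an unconditional AND that is a no-op when there are no flex groups. A stand-alone illustration:

	#include <assert.h>

	#define BLOCK_UNINIT 0x1

	int main(void)
	{
		unsigned short flags1 = 0x3, flags2 = 0x3;
		unsigned short keep  = ~0;		/* flexbg_size <= 1: no-op  */
		unsigned short clear = ~BLOCK_UNINIT;	/* flexbg_size  > 1: clears */

		flags1 &= keep;
		flags2 &= clear;
		assert(flags1 == 0x3 && flags2 == 0x2);
		return 0;
	}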
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1f7784de05b6..710fed2377d4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3695,16 +3695,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	for (i = 0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
-	i = le32_to_cpu(es->s_flags);
-	if (i & EXT2_FLAGS_UNSIGNED_HASH)
-		sbi->s_hash_unsigned = 3;
-	else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+		i = le32_to_cpu(es->s_flags);
+		if (i & EXT2_FLAGS_UNSIGNED_HASH)
+			sbi->s_hash_unsigned = 3;
+		else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
 #ifdef __CHAR_UNSIGNED__
-		es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
-		sbi->s_hash_unsigned = 3;
+			if (!(sb->s_flags & MS_RDONLY))
+				es->s_flags |=
+					cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+			sbi->s_hash_unsigned = 3;
 #else
-		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+			if (!(sb->s_flags & MS_RDONLY))
+				es->s_flags |=
+					cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
 #endif
+		}
 	}
 
 	/* Handle clustersize */
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 1423c4816a47..e175e94116ac 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -95,8 +95,8 @@ static struct mb_cache *ext4_xattr_cache;
 static const struct xattr_handler *ext4_xattr_handler_map[] = {
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
-	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext4_xattr_acl_access_handler,
-	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
+	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &posix_acl_access_xattr_handler,
+	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
 #endif
 	[EXT4_XATTR_INDEX_TRUSTED]	     = &ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_SECURITY
@@ -108,8 +108,8 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
 	&ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
-	&ext4_xattr_acl_access_handler,
-	&ext4_xattr_acl_default_handler,
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
 #endif
 #ifdef CONFIG_EXT4_FS_SECURITY
 	&ext4_xattr_security_handler,
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index c767dbdd7fc4..819d6398833f 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -96,8 +96,6 @@ struct ext4_xattr_ibody_find {
 
 extern const struct xattr_handler ext4_xattr_user_handler;
 extern const struct xattr_handler ext4_xattr_trusted_handler;
-extern const struct xattr_handler ext4_xattr_acl_access_handler;
-extern const struct xattr_handler ext4_xattr_acl_default_handler;
 extern const struct xattr_handler ext4_xattr_security_handler;
 
 extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 27a0820340b9..2e35da12d292 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -1,6 +1,6 @@
 obj-$(CONFIG_F2FS_FS) += f2fs.o
 
-f2fs-y := dir.o file.o inode.o namei.o hash.o super.o
+f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o
 f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
 f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
 f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index d0fc287efeff..fa8da4cb8c4b 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -17,9 +17,6 @@
 #include "xattr.h"
 #include "acl.h"
 
-#define get_inode_mode(i)	((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
-					(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
-
 static inline size_t f2fs_acl_size(int count)
 {
 	if (count <= 4) {
@@ -167,19 +164,11 @@ fail:
 
 struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
 	void *value = NULL;
 	struct posix_acl *acl;
 	int retval;
 
-	if (!test_opt(sbi, POSIX_ACL))
-		return NULL;
-
-	acl = get_cached_acl(inode, type);
-	if (acl != ACL_NOT_CACHED)
-		return acl;
-
 	if (type == ACL_TYPE_ACCESS)
 		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
 
@@ -205,21 +194,15 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
 	return acl;
 }
 
-static int f2fs_set_acl(struct inode *inode, int type,
+static int __f2fs_set_acl(struct inode *inode, int type,
 			struct posix_acl *acl, struct page *ipage)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	int name_index;
 	void *value = NULL;
 	size_t size = 0;
 	int error;
 
-	if (!test_opt(sbi, POSIX_ACL))
-		return 0;
-	if (S_ISLNK(inode->i_mode))
-		return -EOPNOTSUPP;
-
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -261,154 +244,31 @@ static int f2fs_set_acl(struct inode *inode, int type,
 	return error;
 }
 
-int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage)
+int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
-	struct posix_acl *acl = NULL;
-	int error = 0;
-
-	if (!S_ISLNK(inode->i_mode)) {
-		if (test_opt(sbi, POSIX_ACL)) {
-			acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT);
-			if (IS_ERR(acl))
-				return PTR_ERR(acl);
-		}
-		if (!acl)
-			inode->i_mode &= ~current_umask();
-	}
-
-	if (!test_opt(sbi, POSIX_ACL) || !acl)
-		goto cleanup;
-
-	if (S_ISDIR(inode->i_mode)) {
-		error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl, ipage);
-		if (error)
-			goto cleanup;
-	}
-	error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
-	if (error < 0)
-		return error;
-	if (error > 0)
-		error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, ipage);
-cleanup:
-	posix_acl_release(acl);
-	return error;
+	return __f2fs_set_acl(inode, type, acl, NULL);
 }
 
-int f2fs_acl_chmod(struct inode *inode)
+int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	struct posix_acl *acl;
-	int error;
-	umode_t mode = get_inode_mode(inode);
-
-	if (!test_opt(sbi, POSIX_ACL))
-		return 0;
-	if (S_ISLNK(mode))
-		return -EOPNOTSUPP;
-
-	acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS);
-	if (IS_ERR(acl) || !acl)
-		return PTR_ERR(acl);
+	struct posix_acl *default_acl, *acl;
+	int error = 0;
 
-	error = posix_acl_chmod(&acl, GFP_KERNEL, mode);
+	error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
 	if (error)
 		return error;
 
-	error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, NULL);
-	posix_acl_release(acl);
-	return error;
-}
-
-static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list,
-		size_t list_size, const char *name, size_t name_len, int type)
-{
-	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
-	const char *xname = POSIX_ACL_XATTR_DEFAULT;
-	size_t size;
-
-	if (!test_opt(sbi, POSIX_ACL))
-		return 0;
-
-	if (type == ACL_TYPE_ACCESS)
-		xname = POSIX_ACL_XATTR_ACCESS;
-
-	size = strlen(xname) + 1;
-	if (list && size <= list_size)
-		memcpy(list, xname, size);
-	return size;
-}
-
-static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name,
-		void *buffer, size_t size, int type)
-{
-	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
-	struct posix_acl *acl;
-	int error;
-
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!test_opt(sbi, POSIX_ACL))
-		return -EOPNOTSUPP;
-
-	acl = f2fs_get_acl(dentry->d_inode, type);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (!acl)
-		return -ENODATA;
-	error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
-	posix_acl_release(acl);
-
-	return error;
-}
-
-static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags, int type)
-{
-	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
-	struct inode *inode = dentry->d_inode;
-	struct posix_acl *acl = NULL;
-	int error;
-
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!test_opt(sbi, POSIX_ACL))
-		return -EOPNOTSUPP;
-	if (!inode_owner_or_capable(inode))
-		return -EPERM;
-
-	if (value) {
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-		if (acl) {
-			error = posix_acl_valid(acl);
-			if (error)
-				goto release_and_out;
-		}
-	} else {
-		acl = NULL;
+	if (default_acl) {
+		error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
+				       ipage);
+		posix_acl_release(default_acl);
+	}
+	if (acl) {
+		if (error)
+			error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
+				       ipage);
+		posix_acl_release(acl);
 	}
 
-	error = f2fs_set_acl(inode, type, acl, NULL);
-
-release_and_out:
-	posix_acl_release(acl);
 	return error;
 }
-
-const struct xattr_handler f2fs_xattr_acl_default_handler = {
-	.prefix = POSIX_ACL_XATTR_DEFAULT,
-	.flags = ACL_TYPE_DEFAULT,
-	.list = f2fs_xattr_list_acl,
-	.get = f2fs_xattr_get_acl,
-	.set = f2fs_xattr_set_acl,
-};
-
-const struct xattr_handler f2fs_xattr_acl_access_handler = {
-	.prefix = POSIX_ACL_XATTR_ACCESS,
-	.flags = ACL_TYPE_ACCESS,
-	.list = f2fs_xattr_list_acl,
-	.get = f2fs_xattr_get_acl,
-	.set = f2fs_xattr_set_acl,
-};
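
Worth flagging in the f2fs conversion: the access-ACL branch in the new f2fs_init_acl() reads "if (error)" where the equivalent ext3 and ext4 code above uses "if (!error)", so as merged the condition looks inverted relative to its siblings. The shared contract all three filesystems now rely on is posix_acl_create(): it applies the umask or the parent directory's default ACL to i_mode and hands back both ACLs for the caller to store and release. A minimal sketch of that contract (myfs_* and __myfs_set_acl are hypothetical, mirroring the ext3/ext4 form):

	#include <linux/posix_acl.h>

	static int __myfs_set_acl(struct inode *inode, int type,
				  struct posix_acl *acl);

	int myfs_init_acl(struct inode *inode, struct inode *dir)
	{
		struct posix_acl *default_acl, *acl;
		int error;

		/* adjusts inode->i_mode and returns 0, 1 or 2 ACL objects */
		error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
		if (error)
			return error;

		if (default_acl) {	/* inherited by new children */
			error = __myfs_set_acl(inode, ACL_TYPE_DEFAULT,
					       default_acl);
			posix_acl_release(default_acl);
		}
		if (acl) {
			if (!error)	/* ext3/ext4 use this polarity */
				error = __myfs_set_acl(inode, ACL_TYPE_ACCESS,
						       acl);
			posix_acl_release(acl);
		}
		return error;
	}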
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 49633131e038..e0864651cdc1 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -37,18 +37,13 @@ struct f2fs_acl_header {
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
 
 extern struct posix_acl *f2fs_get_acl(struct inode *, int);
-extern int f2fs_acl_chmod(struct inode *);
+extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 extern int f2fs_init_acl(struct inode *, struct inode *, struct page *);
 #else
 #define f2fs_check_acl	NULL
 #define f2fs_get_acl	NULL
 #define f2fs_set_acl	NULL
 
-static inline int f2fs_acl_chmod(struct inode *inode)
-{
-	return 0;
-}
-
 static inline int f2fs_init_acl(struct inode *inode, struct inode *dir,
 						struct page *page)
 {
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 5716e5eb4e8e..293d0486a40f 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -30,7 +30,7 @@ static struct kmem_cache *inode_entry_slab;
30 */ 30 */
31struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) 31struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
32{ 32{
33 struct address_space *mapping = sbi->meta_inode->i_mapping; 33 struct address_space *mapping = META_MAPPING(sbi);
34 struct page *page = NULL; 34 struct page *page = NULL;
35repeat: 35repeat:
36 page = grab_cache_page(mapping, index); 36 page = grab_cache_page(mapping, index);
@@ -50,7 +50,7 @@ repeat:
50 */ 50 */
51struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) 51struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
52{ 52{
53 struct address_space *mapping = sbi->meta_inode->i_mapping; 53 struct address_space *mapping = META_MAPPING(sbi);
54 struct page *page; 54 struct page *page;
55repeat: 55repeat:
56 page = grab_cache_page(mapping, index); 56 page = grab_cache_page(mapping, index);
@@ -61,11 +61,12 @@ repeat:
61 if (PageUptodate(page)) 61 if (PageUptodate(page))
62 goto out; 62 goto out;
63 63
64 if (f2fs_readpage(sbi, page, index, READ_SYNC)) 64 if (f2fs_submit_page_bio(sbi, page, index,
65 READ_SYNC | REQ_META | REQ_PRIO))
65 goto repeat; 66 goto repeat;
66 67
67 lock_page(page); 68 lock_page(page);
68 if (page->mapping != mapping) { 69 if (unlikely(page->mapping != mapping)) {
69 f2fs_put_page(page, 1); 70 f2fs_put_page(page, 1);
70 goto repeat; 71 goto repeat;
71 } 72 }
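The lock_page()/page->mapping recheck in this hunk is the standard page-cache revalidation idiom: after sleeping, the page may have been truncated out of the mapping. A distilled sketch using only generic calls plus f2fs_put_page() (sketch_read_page is a made-up name):

	static struct page *sketch_read_page(struct address_space *mapping,
						pgoff_t index)
	{
		struct page *page;
	repeat:
		page = grab_cache_page(mapping, index);	/* returned locked */
		if (!page)
			return ERR_PTR(-ENOMEM);
		if (PageUptodate(page))
			return page;
		/* submit a read bio here; the end_io handler unlocks the page */
		lock_page(page);		/* wait for the read to finish */
		if (unlikely(page->mapping != mapping)) {
			/* truncated while we slept: drop it and start over */
			f2fs_put_page(page, 1);
			goto repeat;
		}
		return page;
	}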
@@ -81,13 +82,12 @@ static int f2fs_write_meta_page(struct page *page,
81 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 82 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
82 83
 83 /* Should not write any meta pages if any IO error occurred */ 84 /* Should not write any meta pages if any IO error occurred */
84 if (wbc->for_reclaim || sbi->por_doing || 85 if (unlikely(sbi->por_doing ||
85 is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) { 86 is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
86 dec_page_count(sbi, F2FS_DIRTY_META); 87 goto redirty_out;
87 wbc->pages_skipped++; 88
88 set_page_dirty(page); 89 if (wbc->for_reclaim)
89 return AOP_WRITEPAGE_ACTIVATE; 90 goto redirty_out;
90 }
91 91
92 wait_on_page_writeback(page); 92 wait_on_page_writeback(page);
93 93
@@ -95,24 +95,31 @@ static int f2fs_write_meta_page(struct page *page,
95 dec_page_count(sbi, F2FS_DIRTY_META); 95 dec_page_count(sbi, F2FS_DIRTY_META);
96 unlock_page(page); 96 unlock_page(page);
97 return 0; 97 return 0;
98
99redirty_out:
100 dec_page_count(sbi, F2FS_DIRTY_META);
101 wbc->pages_skipped++;
102 set_page_dirty(page);
103 return AOP_WRITEPAGE_ACTIVATE;
98} 104}
99 105
100static int f2fs_write_meta_pages(struct address_space *mapping, 106static int f2fs_write_meta_pages(struct address_space *mapping,
101 struct writeback_control *wbc) 107 struct writeback_control *wbc)
102{ 108{
103 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 109 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
104 struct block_device *bdev = sbi->sb->s_bdev; 110 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
105 long written; 111 long written;
106 112
107 if (wbc->for_kupdate) 113 if (wbc->for_kupdate)
108 return 0; 114 return 0;
109 115
 110 if (get_pages(sbi, F2FS_DIRTY_META) == 0) 116 /* collect a number of dirty meta pages and write them together */
117 if (get_pages(sbi, F2FS_DIRTY_META) < nrpages)
111 return 0; 118 return 0;
112 119
 113 /* if mounting failed, skip writing node pages */ 120 /* if mounting failed, skip writing node pages */
114 mutex_lock(&sbi->cp_mutex); 121 mutex_lock(&sbi->cp_mutex);
115 written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev)); 122 written = sync_meta_pages(sbi, META, nrpages);
116 mutex_unlock(&sbi->cp_mutex); 123 mutex_unlock(&sbi->cp_mutex);
117 wbc->nr_to_write -= written; 124 wbc->nr_to_write -= written;
118 return 0; 125 return 0;
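The rewrite above also changes when meta writeback kicks in: rather than writing as soon as anything is dirty, it waits until at least one full bio's worth of pages has accumulated (MAX_BIO_BLOCKS and max_hw_blocks are existing f2fs helpers). The policy in isolation:

	/* defer writeback until one bio can be filled, so meta pages
	 * reach disk in large merged bios instead of singletons */
	int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));

	if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nrpages)
		return 0;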
@@ -121,7 +128,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
121long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, 128long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
122 long nr_to_write) 129 long nr_to_write)
123{ 130{
124 struct address_space *mapping = sbi->meta_inode->i_mapping; 131 struct address_space *mapping = META_MAPPING(sbi);
125 pgoff_t index = 0, end = LONG_MAX; 132 pgoff_t index = 0, end = LONG_MAX;
126 struct pagevec pvec; 133 struct pagevec pvec;
127 long nwritten = 0; 134 long nwritten = 0;
@@ -136,7 +143,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
136 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 143 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
137 PAGECACHE_TAG_DIRTY, 144 PAGECACHE_TAG_DIRTY,
138 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 145 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
139 if (nr_pages == 0) 146 if (unlikely(nr_pages == 0))
140 break; 147 break;
141 148
142 for (i = 0; i < nr_pages; i++) { 149 for (i = 0; i < nr_pages; i++) {
@@ -149,7 +156,8 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
149 unlock_page(page); 156 unlock_page(page);
150 break; 157 break;
151 } 158 }
152 if (nwritten++ >= nr_to_write) 159 nwritten++;
160 if (unlikely(nwritten >= nr_to_write))
153 break; 161 break;
154 } 162 }
155 pagevec_release(&pvec); 163 pagevec_release(&pvec);
@@ -157,7 +165,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
157 } 165 }
158 166
159 if (nwritten) 167 if (nwritten)
160 f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX); 168 f2fs_submit_merged_bio(sbi, type, WRITE);
161 169
162 return nwritten; 170 return nwritten;
163} 171}
@@ -186,31 +194,24 @@ const struct address_space_operations f2fs_meta_aops = {
186 194
187int acquire_orphan_inode(struct f2fs_sb_info *sbi) 195int acquire_orphan_inode(struct f2fs_sb_info *sbi)
188{ 196{
189 unsigned int max_orphans;
190 int err = 0; 197 int err = 0;
191 198
192 /* 199 spin_lock(&sbi->orphan_inode_lock);
193 * considering 512 blocks in a segment 5 blocks are needed for cp 200 if (unlikely(sbi->n_orphans >= sbi->max_orphans))
194 * and log segment summaries. Remaining blocks are used to keep
195 * orphan entries with the limitation one reserved segment
196 * for cp pack we can have max 1020*507 orphan entries
197 */
198 max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK;
199 mutex_lock(&sbi->orphan_inode_mutex);
200 if (sbi->n_orphans >= max_orphans)
201 err = -ENOSPC; 201 err = -ENOSPC;
202 else 202 else
203 sbi->n_orphans++; 203 sbi->n_orphans++;
204 mutex_unlock(&sbi->orphan_inode_mutex); 204 spin_unlock(&sbi->orphan_inode_lock);
205
205 return err; 206 return err;
206} 207}
207 208
208void release_orphan_inode(struct f2fs_sb_info *sbi) 209void release_orphan_inode(struct f2fs_sb_info *sbi)
209{ 210{
210 mutex_lock(&sbi->orphan_inode_mutex); 211 spin_lock(&sbi->orphan_inode_lock);
211 f2fs_bug_on(sbi->n_orphans == 0); 212 f2fs_bug_on(sbi->n_orphans == 0);
212 sbi->n_orphans--; 213 sbi->n_orphans--;
213 mutex_unlock(&sbi->orphan_inode_mutex); 214 spin_unlock(&sbi->orphan_inode_lock);
214} 215}
215 216
216void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 217void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -218,27 +219,30 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
218 struct list_head *head, *this; 219 struct list_head *head, *this;
219 struct orphan_inode_entry *new = NULL, *orphan = NULL; 220 struct orphan_inode_entry *new = NULL, *orphan = NULL;
220 221
221 mutex_lock(&sbi->orphan_inode_mutex); 222 new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
223 new->ino = ino;
224
225 spin_lock(&sbi->orphan_inode_lock);
222 head = &sbi->orphan_inode_list; 226 head = &sbi->orphan_inode_list;
223 list_for_each(this, head) { 227 list_for_each(this, head) {
224 orphan = list_entry(this, struct orphan_inode_entry, list); 228 orphan = list_entry(this, struct orphan_inode_entry, list);
225 if (orphan->ino == ino) 229 if (orphan->ino == ino) {
226 goto out; 230 spin_unlock(&sbi->orphan_inode_lock);
231 kmem_cache_free(orphan_entry_slab, new);
232 return;
233 }
234
227 if (orphan->ino > ino) 235 if (orphan->ino > ino)
228 break; 236 break;
229 orphan = NULL; 237 orphan = NULL;
230 } 238 }
231 239
232 new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
233 new->ino = ino;
234
235 /* add new_oentry into list which is sorted by inode number */ 240 /* add new_oentry into list which is sorted by inode number */
236 if (orphan) 241 if (orphan)
237 list_add(&new->list, this->prev); 242 list_add(&new->list, this->prev);
238 else 243 else
239 list_add_tail(&new->list, head); 244 list_add_tail(&new->list, head);
240out: 245 spin_unlock(&sbi->orphan_inode_lock);
241 mutex_unlock(&sbi->orphan_inode_mutex);
242} 246}
243 247
244void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 248void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
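With orphan_inode_mutex replaced by a spinlock, add_orphan_inode() now allocates its entry before taking the lock: GFP_ATOMIC itself would not sleep, but doing the allocation outside keeps the critical section short, and a duplicate found during the scan is simply freed again. The shape of the pattern, using the names from the hunk above:

	new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
	new->ino = ino;

	spin_lock(&sbi->orphan_inode_lock);
	list_for_each_entry(orphan, &sbi->orphan_inode_list, list) {
		if (orphan->ino == ino) {
			/* already recorded: unlock, then discard ours */
			spin_unlock(&sbi->orphan_inode_lock);
			kmem_cache_free(orphan_entry_slab, new);
			return;
		}
	}
	/* otherwise insert 'new' at its sorted position, as above */
	spin_unlock(&sbi->orphan_inode_lock);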
@@ -246,7 +250,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
246 struct list_head *head; 250 struct list_head *head;
247 struct orphan_inode_entry *orphan; 251 struct orphan_inode_entry *orphan;
248 252
249 mutex_lock(&sbi->orphan_inode_mutex); 253 spin_lock(&sbi->orphan_inode_lock);
250 head = &sbi->orphan_inode_list; 254 head = &sbi->orphan_inode_list;
251 list_for_each_entry(orphan, head, list) { 255 list_for_each_entry(orphan, head, list) {
252 if (orphan->ino == ino) { 256 if (orphan->ino == ino) {
@@ -257,7 +261,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
257 break; 261 break;
258 } 262 }
259 } 263 }
260 mutex_unlock(&sbi->orphan_inode_mutex); 264 spin_unlock(&sbi->orphan_inode_lock);
261} 265}
262 266
263static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 267static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -270,12 +274,12 @@ static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
270 iput(inode); 274 iput(inode);
271} 275}
272 276
273int recover_orphan_inodes(struct f2fs_sb_info *sbi) 277void recover_orphan_inodes(struct f2fs_sb_info *sbi)
274{ 278{
275 block_t start_blk, orphan_blkaddr, i, j; 279 block_t start_blk, orphan_blkaddr, i, j;
276 280
277 if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) 281 if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
278 return 0; 282 return;
279 283
280 sbi->por_doing = true; 284 sbi->por_doing = true;
281 start_blk = __start_cp_addr(sbi) + 1; 285 start_blk = __start_cp_addr(sbi) + 1;
@@ -295,29 +299,39 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
295 /* clear Orphan Flag */ 299 /* clear Orphan Flag */
296 clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); 300 clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
297 sbi->por_doing = false; 301 sbi->por_doing = false;
298 return 0; 302 return;
299} 303}
300 304
301static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) 305static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
302{ 306{
303 struct list_head *head, *this, *next; 307 struct list_head *head;
304 struct f2fs_orphan_block *orphan_blk = NULL; 308 struct f2fs_orphan_block *orphan_blk = NULL;
305 struct page *page = NULL;
306 unsigned int nentries = 0; 309 unsigned int nentries = 0;
307 unsigned short index = 1; 310 unsigned short index;
308 unsigned short orphan_blocks; 311 unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans +
309
310 orphan_blocks = (unsigned short)((sbi->n_orphans +
311 (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); 312 (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
313 struct page *page = NULL;
314 struct orphan_inode_entry *orphan = NULL;
315
316 for (index = 0; index < orphan_blocks; index++)
317 grab_meta_page(sbi, start_blk + index);
312 318
313 mutex_lock(&sbi->orphan_inode_mutex); 319 index = 1;
320 spin_lock(&sbi->orphan_inode_lock);
314 head = &sbi->orphan_inode_list; 321 head = &sbi->orphan_inode_list;
315 322
 316 /* loop over each orphan inode entry and write them into the journal block */ 323 /* loop over each orphan inode entry and write them into the journal block */
317 list_for_each_safe(this, next, head) { 324 list_for_each_entry(orphan, head, list) {
318 struct orphan_inode_entry *orphan; 325 if (!page) {
326 page = find_get_page(META_MAPPING(sbi), start_blk++);
327 f2fs_bug_on(!page);
328 orphan_blk =
329 (struct f2fs_orphan_block *)page_address(page);
330 memset(orphan_blk, 0, sizeof(*orphan_blk));
331 f2fs_put_page(page, 0);
332 }
319 333
320 orphan = list_entry(this, struct orphan_inode_entry, list); 334 orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
321 335
322 if (nentries == F2FS_ORPHANS_PER_BLOCK) { 336 if (nentries == F2FS_ORPHANS_PER_BLOCK) {
323 /* 337 /*
@@ -331,29 +345,20 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
331 set_page_dirty(page); 345 set_page_dirty(page);
332 f2fs_put_page(page, 1); 346 f2fs_put_page(page, 1);
333 index++; 347 index++;
334 start_blk++;
335 nentries = 0; 348 nentries = 0;
336 page = NULL; 349 page = NULL;
337 } 350 }
338 if (page) 351 }
339 goto page_exist;
340 352
341 page = grab_meta_page(sbi, start_blk); 353 if (page) {
342 orphan_blk = (struct f2fs_orphan_block *)page_address(page); 354 orphan_blk->blk_addr = cpu_to_le16(index);
343 memset(orphan_blk, 0, sizeof(*orphan_blk)); 355 orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
344page_exist: 356 orphan_blk->entry_count = cpu_to_le32(nentries);
345 orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); 357 set_page_dirty(page);
358 f2fs_put_page(page, 1);
346 } 359 }
347 if (!page)
348 goto end;
349 360
350 orphan_blk->blk_addr = cpu_to_le16(index); 361 spin_unlock(&sbi->orphan_inode_lock);
351 orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
352 orphan_blk->entry_count = cpu_to_le32(nentries);
353 set_page_dirty(page);
354 f2fs_put_page(page, 1);
355end:
356 mutex_unlock(&sbi->orphan_inode_mutex);
357} 362}
358 363
359static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, 364static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
@@ -428,7 +433,8 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
428 cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); 433 cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
429 434
430 /* The second checkpoint pack should start at the next segment */ 435 /* The second checkpoint pack should start at the next segment */
431 cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); 436 cp_start_blk_no += ((unsigned long long)1) <<
437 le32_to_cpu(fsb->log_blocks_per_seg);
432 cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version); 438 cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
433 439
434 if (cp1 && cp2) { 440 if (cp1 && cp2) {
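The cast added in this hunk matters because the constant 1 has type int, so the shift would otherwise be performed in 32-bit arithmetic and only widened to block_t afterwards. A worked note:

	/*
	 * cp_start_blk_no += 1 << n;
	 *	32-bit shift; undefined behaviour once n >= 31, and the
	 *	result is truncated before the 64-bit addition.
	 *
	 * cp_start_blk_no += ((unsigned long long)1) << n;
	 *	the shift itself happens in 64 bits, safe for any
	 *	on-disk log_blocks_per_seg value.
	 *
	 * For the common log_blocks_per_seg == 9 (2MB segments of 4KB
	 * blocks) both forms give 512, so the bug only bites on hostile
	 * or corrupted superblock values.
	 */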
@@ -465,7 +471,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
465 list_for_each(this, head) { 471 list_for_each(this, head) {
466 struct dir_inode_entry *entry; 472 struct dir_inode_entry *entry;
467 entry = list_entry(this, struct dir_inode_entry, list); 473 entry = list_entry(this, struct dir_inode_entry, list);
468 if (entry->inode == inode) 474 if (unlikely(entry->inode == inode))
469 return -EEXIST; 475 return -EEXIST;
470 } 476 }
471 list_add_tail(&new->list, head); 477 list_add_tail(&new->list, head);
@@ -513,8 +519,8 @@ void add_dirty_dir_inode(struct inode *inode)
513void remove_dirty_dir_inode(struct inode *inode) 519void remove_dirty_dir_inode(struct inode *inode)
514{ 520{
515 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 521 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
516 struct list_head *head = &sbi->dir_inode_list; 522
517 struct list_head *this; 523 struct list_head *this, *head;
518 524
519 if (!S_ISDIR(inode->i_mode)) 525 if (!S_ISDIR(inode->i_mode))
520 return; 526 return;
@@ -525,6 +531,7 @@ void remove_dirty_dir_inode(struct inode *inode)
525 return; 531 return;
526 } 532 }
527 533
534 head = &sbi->dir_inode_list;
528 list_for_each(this, head) { 535 list_for_each(this, head) {
529 struct dir_inode_entry *entry; 536 struct dir_inode_entry *entry;
530 entry = list_entry(this, struct dir_inode_entry, list); 537 entry = list_entry(this, struct dir_inode_entry, list);
@@ -546,11 +553,13 @@ void remove_dirty_dir_inode(struct inode *inode)
546 553
547struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) 554struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
548{ 555{
549 struct list_head *head = &sbi->dir_inode_list; 556
550 struct list_head *this; 557 struct list_head *this, *head;
551 struct inode *inode = NULL; 558 struct inode *inode = NULL;
552 559
553 spin_lock(&sbi->dir_inode_lock); 560 spin_lock(&sbi->dir_inode_lock);
561
562 head = &sbi->dir_inode_list;
554 list_for_each(this, head) { 563 list_for_each(this, head) {
555 struct dir_inode_entry *entry; 564 struct dir_inode_entry *entry;
556 entry = list_entry(this, struct dir_inode_entry, list); 565 entry = list_entry(this, struct dir_inode_entry, list);
@@ -565,11 +574,13 @@ struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
565 574
566void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) 575void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
567{ 576{
568 struct list_head *head = &sbi->dir_inode_list; 577 struct list_head *head;
569 struct dir_inode_entry *entry; 578 struct dir_inode_entry *entry;
570 struct inode *inode; 579 struct inode *inode;
571retry: 580retry:
572 spin_lock(&sbi->dir_inode_lock); 581 spin_lock(&sbi->dir_inode_lock);
582
583 head = &sbi->dir_inode_list;
573 if (list_empty(head)) { 584 if (list_empty(head)) {
574 spin_unlock(&sbi->dir_inode_lock); 585 spin_unlock(&sbi->dir_inode_lock);
575 return; 586 return;
@@ -585,7 +596,7 @@ retry:
 585 * We should submit the bio, since several dentry pages 596 * We should submit the bio, since several dentry pages
 586 * under writeback exist in the inode being freed. 597 * under writeback exist in the inode being freed.
587 */ 598 */
588 f2fs_submit_bio(sbi, DATA, true); 599 f2fs_submit_merged_bio(sbi, DATA, WRITE);
589 } 600 }
590 goto retry; 601 goto retry;
591} 602}
@@ -760,8 +771,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
760 /* wait for previous submitted node/meta pages writeback */ 771 /* wait for previous submitted node/meta pages writeback */
761 wait_on_all_pages_writeback(sbi); 772 wait_on_all_pages_writeback(sbi);
762 773
763 filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); 774 filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
764 filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX); 775 filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
765 776
766 /* update user_block_counts */ 777 /* update user_block_counts */
767 sbi->last_valid_block_count = sbi->total_valid_block_count; 778 sbi->last_valid_block_count = sbi->total_valid_block_count;
@@ -770,7 +781,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
770 /* Here, we only have one bio having CP pack */ 781 /* Here, we only have one bio having CP pack */
771 sync_meta_pages(sbi, META_FLUSH, LONG_MAX); 782 sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
772 783
773 if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { 784 if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
774 clear_prefree_segments(sbi); 785 clear_prefree_segments(sbi);
775 F2FS_RESET_SB_DIRT(sbi); 786 F2FS_RESET_SB_DIRT(sbi);
776 } 787 }
@@ -791,9 +802,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
791 802
792 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); 803 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
793 804
794 f2fs_submit_bio(sbi, DATA, true); 805 f2fs_submit_merged_bio(sbi, DATA, WRITE);
795 f2fs_submit_bio(sbi, NODE, true); 806 f2fs_submit_merged_bio(sbi, NODE, WRITE);
796 f2fs_submit_bio(sbi, META, true); 807 f2fs_submit_merged_bio(sbi, META, WRITE);
797 808
798 /* 809 /*
799 * update checkpoint pack index 810 * update checkpoint pack index
@@ -818,20 +829,28 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
818 829
819void init_orphan_info(struct f2fs_sb_info *sbi) 830void init_orphan_info(struct f2fs_sb_info *sbi)
820{ 831{
821 mutex_init(&sbi->orphan_inode_mutex); 832 spin_lock_init(&sbi->orphan_inode_lock);
822 INIT_LIST_HEAD(&sbi->orphan_inode_list); 833 INIT_LIST_HEAD(&sbi->orphan_inode_list);
823 sbi->n_orphans = 0; 834 sbi->n_orphans = 0;
835 /*
 836 * Considering 512 blocks in a segment, 8 blocks are needed for cp
 837 * and log segment summaries. The remaining blocks are used to keep
 838 * orphan entries. With the limitation of one reserved segment
 839 * for the cp pack, we can have at most 1020 * 504 orphan entries.
840 */
841 sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE)
842 * F2FS_ORPHANS_PER_BLOCK;
824} 843}
825 844
826int __init create_checkpoint_caches(void) 845int __init create_checkpoint_caches(void)
827{ 846{
828 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", 847 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
829 sizeof(struct orphan_inode_entry), NULL); 848 sizeof(struct orphan_inode_entry), NULL);
830 if (unlikely(!orphan_entry_slab)) 849 if (!orphan_entry_slab)
831 return -ENOMEM; 850 return -ENOMEM;
832 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", 851 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
833 sizeof(struct dir_inode_entry), NULL); 852 sizeof(struct dir_inode_entry), NULL);
834 if (unlikely(!inode_entry_slab)) { 853 if (!inode_entry_slab) {
835 kmem_cache_destroy(orphan_entry_slab); 854 kmem_cache_destroy(orphan_entry_slab);
836 return -ENOMEM; 855 return -ENOMEM;
837 } 856 }
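For concreteness, the max_orphans bound set in init_orphan_info() works out as follows, assuming 4KB blocks (F2FS_ORPHANS_PER_BLOCK == 1020), 512 blocks per segment, and NR_CURSEG_TYPE == 6 (three data plus three node logs):

	/*
	 * orphan blocks = blocks_per_seg - 2 (cp pack blocks)
	 *				 - NR_CURSEG_TYPE (summary blocks)
	 *		 = 512 - 2 - 6 = 504
	 * max_orphans	 = 504 * 1020 = 514080 entries
	 */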
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index aa3438c571fa..2261ccdd0b5f 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -24,6 +24,188 @@
24#include "segment.h" 24#include "segment.h"
25#include <trace/events/f2fs.h> 25#include <trace/events/f2fs.h>
26 26
27static void f2fs_read_end_io(struct bio *bio, int err)
28{
29 struct bio_vec *bvec;
30 int i;
31
32 bio_for_each_segment_all(bvec, bio, i) {
33 struct page *page = bvec->bv_page;
34
35 if (!err) {
36 SetPageUptodate(page);
37 } else {
38 ClearPageUptodate(page);
39 SetPageError(page);
40 }
41 unlock_page(page);
42 }
43 bio_put(bio);
44}
45
46static void f2fs_write_end_io(struct bio *bio, int err)
47{
48 struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb);
49 struct bio_vec *bvec;
50 int i;
51
52 bio_for_each_segment_all(bvec, bio, i) {
53 struct page *page = bvec->bv_page;
54
55 if (unlikely(err)) {
56 SetPageError(page);
57 set_bit(AS_EIO, &page->mapping->flags);
58 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
59 sbi->sb->s_flags |= MS_RDONLY;
60 }
61 end_page_writeback(page);
62 dec_page_count(sbi, F2FS_WRITEBACK);
63 }
64
65 if (bio->bi_private)
66 complete(bio->bi_private);
67
68 if (!get_pages(sbi, F2FS_WRITEBACK) &&
69 !list_empty(&sbi->cp_wait.task_list))
70 wake_up(&sbi->cp_wait);
71
72 bio_put(bio);
73}
74
75/*
76 * Low-level block read/write IO operations.
77 */
78static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
79 int npages, bool is_read)
80{
81 struct bio *bio;
82
83 /* No failure on bio allocation */
84 bio = bio_alloc(GFP_NOIO, npages);
85
86 bio->bi_bdev = sbi->sb->s_bdev;
87 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
88 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
89
90 return bio;
91}
92
93static void __submit_merged_bio(struct f2fs_bio_info *io)
94{
95 struct f2fs_io_info *fio = &io->fio;
96 int rw;
97
98 if (!io->bio)
99 return;
100
101 rw = fio->rw;
102
103 if (is_read_io(rw)) {
104 trace_f2fs_submit_read_bio(io->sbi->sb, rw,
105 fio->type, io->bio);
106 submit_bio(rw, io->bio);
107 } else {
108 trace_f2fs_submit_write_bio(io->sbi->sb, rw,
109 fio->type, io->bio);
110 /*
111 * META_FLUSH is only from the checkpoint procedure, and we
 112 * should wait for this metadata bio to complete for FS consistency.
113 */
114 if (fio->type == META_FLUSH) {
115 DECLARE_COMPLETION_ONSTACK(wait);
116 io->bio->bi_private = &wait;
117 submit_bio(rw, io->bio);
118 wait_for_completion(&wait);
119 } else {
120 submit_bio(rw, io->bio);
121 }
122 }
123
124 io->bio = NULL;
125}
126
127void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
128 enum page_type type, int rw)
129{
130 enum page_type btype = PAGE_TYPE_OF_BIO(type);
131 struct f2fs_bio_info *io;
132
133 io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
134
135 mutex_lock(&io->io_mutex);
136
137 /* change META to META_FLUSH in the checkpoint procedure */
138 if (type >= META_FLUSH) {
139 io->fio.type = META_FLUSH;
140 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
141 }
142 __submit_merged_bio(io);
143 mutex_unlock(&io->io_mutex);
144}
145
146/*
147 * Fill the locked page with data located in the block address.
148 * Return unlocked page.
149 */
150int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
151 block_t blk_addr, int rw)
152{
153 struct bio *bio;
154
155 trace_f2fs_submit_page_bio(page, blk_addr, rw);
156
157 /* Allocate a new bio */
158 bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw));
159
160 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
161 bio_put(bio);
162 f2fs_put_page(page, 1);
163 return -EFAULT;
164 }
165
166 submit_bio(rw, bio);
167 return 0;
168}
169
170void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
171 block_t blk_addr, struct f2fs_io_info *fio)
172{
173 enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
174 struct f2fs_bio_info *io;
175 bool is_read = is_read_io(fio->rw);
176
177 io = is_read ? &sbi->read_io : &sbi->write_io[btype];
178
179 verify_block_addr(sbi, blk_addr);
180
181 mutex_lock(&io->io_mutex);
182
183 if (!is_read)
184 inc_page_count(sbi, F2FS_WRITEBACK);
185
186 if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
187 io->fio.rw != fio->rw))
188 __submit_merged_bio(io);
189alloc_new:
190 if (io->bio == NULL) {
191 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
192
193 io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
194 io->fio = *fio;
195 }
196
197 if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) <
198 PAGE_CACHE_SIZE) {
199 __submit_merged_bio(io);
200 goto alloc_new;
201 }
202
203 io->last_block_in_bio = blk_addr;
204
205 mutex_unlock(&io->io_mutex);
206 trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
207}
208
27/* 209/*
28 * Lock ordering for the change of data block address: 210 * Lock ordering for the change of data block address:
29 * ->data_page 211 * ->data_page
@@ -37,7 +219,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
37 struct page *node_page = dn->node_page; 219 struct page *node_page = dn->node_page;
38 unsigned int ofs_in_node = dn->ofs_in_node; 220 unsigned int ofs_in_node = dn->ofs_in_node;
39 221
40 f2fs_wait_on_page_writeback(node_page, NODE, false); 222 f2fs_wait_on_page_writeback(node_page, NODE);
41 223
42 rn = F2FS_NODE(node_page); 224 rn = F2FS_NODE(node_page);
43 225
@@ -51,19 +233,39 @@ int reserve_new_block(struct dnode_of_data *dn)
51{ 233{
52 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 234 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
53 235
54 if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) 236 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
55 return -EPERM; 237 return -EPERM;
56 if (!inc_valid_block_count(sbi, dn->inode, 1)) 238 if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
57 return -ENOSPC; 239 return -ENOSPC;
58 240
59 trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); 241 trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
60 242
61 __set_data_blkaddr(dn, NEW_ADDR); 243 __set_data_blkaddr(dn, NEW_ADDR);
62 dn->data_blkaddr = NEW_ADDR; 244 dn->data_blkaddr = NEW_ADDR;
245 mark_inode_dirty(dn->inode);
63 sync_inode_page(dn); 246 sync_inode_page(dn);
64 return 0; 247 return 0;
65} 248}
66 249
250int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
251{
252 bool need_put = dn->inode_page ? false : true;
253 int err;
254
255 /* if inode_page exists, index should be zero */
256 f2fs_bug_on(!need_put && index);
257
258 err = get_dnode_of_data(dn, index, ALLOC_NODE);
259 if (err)
260 return err;
261
262 if (dn->data_blkaddr == NULL_ADDR)
263 err = reserve_new_block(dn);
264 if (err || need_put)
265 f2fs_put_dnode(dn);
266 return err;
267}
268
67static int check_extent_cache(struct inode *inode, pgoff_t pgofs, 269static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
68 struct buffer_head *bh_result) 270 struct buffer_head *bh_result)
69{ 271{
@@ -71,6 +273,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
71 pgoff_t start_fofs, end_fofs; 273 pgoff_t start_fofs, end_fofs;
72 block_t start_blkaddr; 274 block_t start_blkaddr;
73 275
276 if (is_inode_flag_set(fi, FI_NO_EXTENT))
277 return 0;
278
74 read_lock(&fi->ext.ext_lock); 279 read_lock(&fi->ext.ext_lock);
75 if (fi->ext.len == 0) { 280 if (fi->ext.len == 0) {
76 read_unlock(&fi->ext.ext_lock); 281 read_unlock(&fi->ext.ext_lock);
@@ -109,6 +314,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
109 struct f2fs_inode_info *fi = F2FS_I(dn->inode); 314 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
110 pgoff_t fofs, start_fofs, end_fofs; 315 pgoff_t fofs, start_fofs, end_fofs;
111 block_t start_blkaddr, end_blkaddr; 316 block_t start_blkaddr, end_blkaddr;
317 int need_update = true;
112 318
113 f2fs_bug_on(blk_addr == NEW_ADDR); 319 f2fs_bug_on(blk_addr == NEW_ADDR);
114 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + 320 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
@@ -117,6 +323,9 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
117 /* Update the page address in the parent node */ 323 /* Update the page address in the parent node */
118 __set_data_blkaddr(dn, blk_addr); 324 __set_data_blkaddr(dn, blk_addr);
119 325
326 if (is_inode_flag_set(fi, FI_NO_EXTENT))
327 return;
328
120 write_lock(&fi->ext.ext_lock); 329 write_lock(&fi->ext.ext_lock);
121 330
122 start_fofs = fi->ext.fofs; 331 start_fofs = fi->ext.fofs;
@@ -163,14 +372,21 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
163 fofs - start_fofs + 1; 372 fofs - start_fofs + 1;
164 fi->ext.len -= fofs - start_fofs + 1; 373 fi->ext.len -= fofs - start_fofs + 1;
165 } 374 }
166 goto end_update; 375 } else {
376 need_update = false;
167 } 377 }
168 write_unlock(&fi->ext.ext_lock);
169 return;
170 378
379 /* Finally, if the extent is very fragmented, let's drop the cache. */
380 if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
381 fi->ext.len = 0;
382 set_inode_flag(fi, FI_NO_EXTENT);
383 need_update = true;
384 }
171end_update: 385end_update:
172 write_unlock(&fi->ext.ext_lock); 386 write_unlock(&fi->ext.ext_lock);
173 sync_inode_page(dn); 387 if (need_update)
388 sync_inode_page(dn);
389 return;
174} 390}
175 391
176struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) 392struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
@@ -196,7 +412,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
196 return ERR_PTR(-ENOENT); 412 return ERR_PTR(-ENOENT);
197 413
198 /* By fallocate(), there is no cached page, but with NEW_ADDR */ 414 /* By fallocate(), there is no cached page, but with NEW_ADDR */
199 if (dn.data_blkaddr == NEW_ADDR) 415 if (unlikely(dn.data_blkaddr == NEW_ADDR))
200 return ERR_PTR(-EINVAL); 416 return ERR_PTR(-EINVAL);
201 417
202 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); 418 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
@@ -208,11 +424,14 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
208 return page; 424 return page;
209 } 425 }
210 426
211 err = f2fs_readpage(sbi, page, dn.data_blkaddr, 427 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
212 sync ? READ_SYNC : READA); 428 sync ? READ_SYNC : READA);
429 if (err)
430 return ERR_PTR(err);
431
213 if (sync) { 432 if (sync) {
214 wait_on_page_locked(page); 433 wait_on_page_locked(page);
215 if (!PageUptodate(page)) { 434 if (unlikely(!PageUptodate(page))) {
216 f2fs_put_page(page, 0); 435 f2fs_put_page(page, 0);
217 return ERR_PTR(-EIO); 436 return ERR_PTR(-EIO);
218 } 437 }
@@ -246,7 +465,7 @@ repeat:
246 } 465 }
247 f2fs_put_dnode(&dn); 466 f2fs_put_dnode(&dn);
248 467
249 if (dn.data_blkaddr == NULL_ADDR) { 468 if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
250 f2fs_put_page(page, 1); 469 f2fs_put_page(page, 1);
251 return ERR_PTR(-ENOENT); 470 return ERR_PTR(-ENOENT);
252 } 471 }
@@ -266,16 +485,16 @@ repeat:
266 return page; 485 return page;
267 } 486 }
268 487
269 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 488 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
270 if (err) 489 if (err)
271 return ERR_PTR(err); 490 return ERR_PTR(err);
272 491
273 lock_page(page); 492 lock_page(page);
274 if (!PageUptodate(page)) { 493 if (unlikely(!PageUptodate(page))) {
275 f2fs_put_page(page, 1); 494 f2fs_put_page(page, 1);
276 return ERR_PTR(-EIO); 495 return ERR_PTR(-EIO);
277 } 496 }
278 if (page->mapping != mapping) { 497 if (unlikely(page->mapping != mapping)) {
279 f2fs_put_page(page, 1); 498 f2fs_put_page(page, 1);
280 goto repeat; 499 goto repeat;
281 } 500 }
@@ -286,12 +505,12 @@ repeat:
286 * Caller ensures that this data page is never allocated. 505 * Caller ensures that this data page is never allocated.
287 * A new zero-filled data page is allocated in the page cache. 506 * A new zero-filled data page is allocated in the page cache.
288 * 507 *
 289 * Also, caller should grab and release a mutex by calling mutex_lock_op() and 508 * Also, caller should grab and release an rwsem by calling f2fs_lock_op() and
 290 * mutex_unlock_op(). 509 * f2fs_unlock_op().
 291 * Note that, npage is set only by make_empty_dir. 510 * Note that ipage is set only by make_empty_dir.
292 */ 511 */
293struct page *get_new_data_page(struct inode *inode, 512struct page *get_new_data_page(struct inode *inode,
294 struct page *npage, pgoff_t index, bool new_i_size) 513 struct page *ipage, pgoff_t index, bool new_i_size)
295{ 514{
296 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 515 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
297 struct address_space *mapping = inode->i_mapping; 516 struct address_space *mapping = inode->i_mapping;
@@ -299,24 +518,16 @@ struct page *get_new_data_page(struct inode *inode,
299 struct dnode_of_data dn; 518 struct dnode_of_data dn;
300 int err; 519 int err;
301 520
302 set_new_dnode(&dn, inode, npage, npage, 0); 521 set_new_dnode(&dn, inode, ipage, NULL, 0);
303 err = get_dnode_of_data(&dn, index, ALLOC_NODE); 522 err = f2fs_reserve_block(&dn, index);
304 if (err) 523 if (err)
305 return ERR_PTR(err); 524 return ERR_PTR(err);
306
307 if (dn.data_blkaddr == NULL_ADDR) {
308 if (reserve_new_block(&dn)) {
309 if (!npage)
310 f2fs_put_dnode(&dn);
311 return ERR_PTR(-ENOSPC);
312 }
313 }
314 if (!npage)
315 f2fs_put_dnode(&dn);
316repeat: 525repeat:
317 page = grab_cache_page(mapping, index); 526 page = grab_cache_page(mapping, index);
318 if (!page) 527 if (!page) {
319 return ERR_PTR(-ENOMEM); 528 err = -ENOMEM;
529 goto put_err;
530 }
320 531
321 if (PageUptodate(page)) 532 if (PageUptodate(page))
322 return page; 533 return page;
@@ -325,15 +536,18 @@ repeat:
325 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 536 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
326 SetPageUptodate(page); 537 SetPageUptodate(page);
327 } else { 538 } else {
328 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 539 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
540 READ_SYNC);
329 if (err) 541 if (err)
330 return ERR_PTR(err); 542 goto put_err;
543
331 lock_page(page); 544 lock_page(page);
332 if (!PageUptodate(page)) { 545 if (unlikely(!PageUptodate(page))) {
333 f2fs_put_page(page, 1); 546 f2fs_put_page(page, 1);
334 return ERR_PTR(-EIO); 547 err = -EIO;
548 goto put_err;
335 } 549 }
336 if (page->mapping != mapping) { 550 if (unlikely(page->mapping != mapping)) {
337 f2fs_put_page(page, 1); 551 f2fs_put_page(page, 1);
338 goto repeat; 552 goto repeat;
339 } 553 }
@@ -344,140 +558,187 @@ repeat:
344 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); 558 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
345 /* Only the directory inode sets new_i_size */ 559 /* Only the directory inode sets new_i_size */
346 set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); 560 set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
347 mark_inode_dirty_sync(inode);
348 } 561 }
349 return page; 562 return page;
350}
351 563
352static void read_end_io(struct bio *bio, int err) 564put_err:
353{ 565 f2fs_put_dnode(&dn);
354 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 566 return ERR_PTR(err);
355 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
356
357 do {
358 struct page *page = bvec->bv_page;
359
360 if (--bvec >= bio->bi_io_vec)
361 prefetchw(&bvec->bv_page->flags);
362
363 if (uptodate) {
364 SetPageUptodate(page);
365 } else {
366 ClearPageUptodate(page);
367 SetPageError(page);
368 }
369 unlock_page(page);
370 } while (bvec >= bio->bi_io_vec);
371 bio_put(bio);
372} 567}
373 568
374/* 569static int __allocate_data_block(struct dnode_of_data *dn)
375 * Fill the locked page with data located in the block address.
376 * Return unlocked page.
377 */
378int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
379 block_t blk_addr, int type)
380{ 570{
381 struct block_device *bdev = sbi->sb->s_bdev; 571 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
382 struct bio *bio; 572 struct f2fs_summary sum;
573 block_t new_blkaddr;
574 struct node_info ni;
575 int type;
383 576
384 trace_f2fs_readpage(page, blk_addr, type); 577 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
578 return -EPERM;
579 if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
580 return -ENOSPC;
385 581
386 down_read(&sbi->bio_sem); 582 __set_data_blkaddr(dn, NEW_ADDR);
583 dn->data_blkaddr = NEW_ADDR;
387 584
388 /* Allocate a new bio */ 585 get_node_info(sbi, dn->nid, &ni);
389 bio = f2fs_bio_alloc(bdev, 1); 586 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
390 587
391 /* Initialize the bio */ 588 type = CURSEG_WARM_DATA;
392 bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
393 bio->bi_end_io = read_end_io;
394 589
395 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { 590 allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
396 bio_put(bio); 591
397 up_read(&sbi->bio_sem); 592 /* direct IO doesn't use extent cache to maximize the performance */
398 f2fs_put_page(page, 1); 593 set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
399 return -EFAULT; 594 update_extent_cache(new_blkaddr, dn);
400 } 595 clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
401 596
402 submit_bio(type, bio); 597 dn->data_blkaddr = new_blkaddr;
403 up_read(&sbi->bio_sem);
404 return 0; 598 return 0;
405} 599}
406 600
407/* 601/*
 408 * This function should be used by the data read flow only where it 602 * get_data_block() now supports readahead/bmap/rw direct_IO with mapped bh.
409 * does not check the "create" flag that indicates block allocation. 603 * If original data blocks are allocated, then give them to blockdev.
410 * The reason for this special functionality is to exploit VFS readahead 604 * Otherwise,
411 * mechanism. 605 * a. preallocate requested block addresses
606 * b. do not use extent cache for better performance
607 * c. give the block addresses to blockdev
412 */ 608 */
413static int get_data_block_ro(struct inode *inode, sector_t iblock, 609static int get_data_block(struct inode *inode, sector_t iblock,
414 struct buffer_head *bh_result, int create) 610 struct buffer_head *bh_result, int create)
415{ 611{
612 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
416 unsigned int blkbits = inode->i_sb->s_blocksize_bits; 613 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
417 unsigned maxblocks = bh_result->b_size >> blkbits; 614 unsigned maxblocks = bh_result->b_size >> blkbits;
418 struct dnode_of_data dn; 615 struct dnode_of_data dn;
419 pgoff_t pgofs; 616 int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
420 int err; 617 pgoff_t pgofs, end_offset;
618 int err = 0, ofs = 1;
619 bool allocated = false;
421 620
422 /* Get the page offset from the block offset(iblock) */ 621 /* Get the page offset from the block offset(iblock) */
423 pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); 622 pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
424 623
425 if (check_extent_cache(inode, pgofs, bh_result)) { 624 if (check_extent_cache(inode, pgofs, bh_result))
426 trace_f2fs_get_data_block(inode, iblock, bh_result, 0); 625 goto out;
427 return 0; 626
428 } 627 if (create)
628 f2fs_lock_op(sbi);
429 629
430 /* When reading holes, we need its node page */ 630 /* When reading holes, we need its node page */
431 set_new_dnode(&dn, inode, NULL, NULL, 0); 631 set_new_dnode(&dn, inode, NULL, NULL, 0);
432 err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); 632 err = get_dnode_of_data(&dn, pgofs, mode);
433 if (err) { 633 if (err) {
434 trace_f2fs_get_data_block(inode, iblock, bh_result, err); 634 if (err == -ENOENT)
435 return (err == -ENOENT) ? 0 : err; 635 err = 0;
636 goto unlock_out;
637 }
638 if (dn.data_blkaddr == NEW_ADDR)
639 goto put_out;
640
641 if (dn.data_blkaddr != NULL_ADDR) {
642 map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
643 } else if (create) {
644 err = __allocate_data_block(&dn);
645 if (err)
646 goto put_out;
647 allocated = true;
648 map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
649 } else {
650 goto put_out;
436 } 651 }
437 652
438 /* It does not support data allocation */ 653 end_offset = IS_INODE(dn.node_page) ?
439 f2fs_bug_on(create); 654 ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
655 bh_result->b_size = (((size_t)1) << blkbits);
656 dn.ofs_in_node++;
657 pgofs++;
658
659get_next:
660 if (dn.ofs_in_node >= end_offset) {
661 if (allocated)
662 sync_inode_page(&dn);
663 allocated = false;
664 f2fs_put_dnode(&dn);
440 665
441 if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { 666 set_new_dnode(&dn, inode, NULL, NULL, 0);
442 int i; 667 err = get_dnode_of_data(&dn, pgofs, mode);
443 unsigned int end_offset; 668 if (err) {
669 if (err == -ENOENT)
670 err = 0;
671 goto unlock_out;
672 }
673 if (dn.data_blkaddr == NEW_ADDR)
674 goto put_out;
444 675
445 end_offset = IS_INODE(dn.node_page) ? 676 end_offset = IS_INODE(dn.node_page) ?
446 ADDRS_PER_INODE(F2FS_I(inode)) : 677 ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
447 ADDRS_PER_BLOCK; 678 }
448
449 clear_buffer_new(bh_result);
450 679
680 if (maxblocks > (bh_result->b_size >> blkbits)) {
681 block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
682 if (blkaddr == NULL_ADDR && create) {
683 err = __allocate_data_block(&dn);
684 if (err)
685 goto sync_out;
686 allocated = true;
687 blkaddr = dn.data_blkaddr;
688 }
451 /* Give more consecutive addresses for the read ahead */ 689 /* Give more consecutive addresses for the read ahead */
452 for (i = 0; i < end_offset - dn.ofs_in_node; i++) 690 if (blkaddr == (bh_result->b_blocknr + ofs)) {
453 if (((datablock_addr(dn.node_page, 691 ofs++;
454 dn.ofs_in_node + i)) 692 dn.ofs_in_node++;
455 != (dn.data_blkaddr + i)) || maxblocks == i) 693 pgofs++;
456 break; 694 bh_result->b_size += (((size_t)1) << blkbits);
457 map_bh(bh_result, inode->i_sb, dn.data_blkaddr); 695 goto get_next;
458 bh_result->b_size = (i << blkbits); 696 }
459 } 697 }
698sync_out:
699 if (allocated)
700 sync_inode_page(&dn);
701put_out:
460 f2fs_put_dnode(&dn); 702 f2fs_put_dnode(&dn);
461 trace_f2fs_get_data_block(inode, iblock, bh_result, 0); 703unlock_out:
462 return 0; 704 if (create)
705 f2fs_unlock_op(sbi);
706out:
707 trace_f2fs_get_data_block(inode, iblock, bh_result, err);
708 return err;
463} 709}
464 710
465static int f2fs_read_data_page(struct file *file, struct page *page) 711static int f2fs_read_data_page(struct file *file, struct page *page)
466{ 712{
467 return mpage_readpage(page, get_data_block_ro); 713 struct inode *inode = page->mapping->host;
714 int ret;
715
 716 /* If the file has inline data, try to read it directly */
717 if (f2fs_has_inline_data(inode))
718 ret = f2fs_read_inline_data(inode, page);
719 else
720 ret = mpage_readpage(page, get_data_block);
721
722 return ret;
468} 723}
469 724
470static int f2fs_read_data_pages(struct file *file, 725static int f2fs_read_data_pages(struct file *file,
471 struct address_space *mapping, 726 struct address_space *mapping,
472 struct list_head *pages, unsigned nr_pages) 727 struct list_head *pages, unsigned nr_pages)
473{ 728{
474 return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); 729 struct inode *inode = file->f_mapping->host;
730
731 /* If the file has inline data, skip readpages */
732 if (f2fs_has_inline_data(inode))
733 return 0;
734
735 return mpage_readpages(mapping, pages, nr_pages, get_data_block);
475} 736}
476 737
477int do_write_data_page(struct page *page) 738int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
478{ 739{
479 struct inode *inode = page->mapping->host; 740 struct inode *inode = page->mapping->host;
480 block_t old_blk_addr, new_blk_addr; 741 block_t old_blkaddr, new_blkaddr;
481 struct dnode_of_data dn; 742 struct dnode_of_data dn;
482 int err = 0; 743 int err = 0;
483 744
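get_data_block() now follows the usual get_block_t contract that mpage_readpages() and blockdev_direct_IO() depend on: bh_result->b_size carries the maximum span the caller wants mapped and comes back as the span actually mapped, so one contiguous extent turns into one bio. A caller's-eye sketch (the block numbers are made up):

	/*
	 * Map up to 16 file blocks starting at block 100, where blocks
	 * 100-103 happen to be contiguous on disk:
	 *
	 *	bh.b_size = 16 << blkbits;
	 *	get_data_block(inode, 100, &bh, 0);
	 *
	 * On return: buffer_mapped(&bh) is true, bh.b_blocknr is the
	 * starting device block, and bh.b_size == 4 << blkbits.  The
	 * caller issues one bio for those four blocks, then calls
	 * again at block 104 for the remainder.
	 */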
@@ -486,10 +747,10 @@ int do_write_data_page(struct page *page)
486 if (err) 747 if (err)
487 return err; 748 return err;
488 749
489 old_blk_addr = dn.data_blkaddr; 750 old_blkaddr = dn.data_blkaddr;
490 751
491 /* This page is already truncated */ 752 /* This page is already truncated */
492 if (old_blk_addr == NULL_ADDR) 753 if (old_blkaddr == NULL_ADDR)
493 goto out_writepage; 754 goto out_writepage;
494 755
495 set_page_writeback(page); 756 set_page_writeback(page);
@@ -498,15 +759,13 @@ int do_write_data_page(struct page *page)
498 * If current allocation needs SSR, 759 * If current allocation needs SSR,
 499 * it is better to do in-place writes for updated data. 760 * it is better to do in-place writes for updated data.
500 */ 761 */
501 if (unlikely(old_blk_addr != NEW_ADDR && 762 if (unlikely(old_blkaddr != NEW_ADDR &&
502 !is_cold_data(page) && 763 !is_cold_data(page) &&
503 need_inplace_update(inode))) { 764 need_inplace_update(inode))) {
504 rewrite_data_page(F2FS_SB(inode->i_sb), page, 765 rewrite_data_page(page, old_blkaddr, fio);
505 old_blk_addr);
506 } else { 766 } else {
507 write_data_page(inode, page, &dn, 767 write_data_page(page, &dn, &new_blkaddr, fio);
508 old_blk_addr, &new_blk_addr); 768 update_extent_cache(new_blkaddr, &dn);
509 update_extent_cache(new_blk_addr, &dn);
510 } 769 }
511out_writepage: 770out_writepage:
512 f2fs_put_dnode(&dn); 771 f2fs_put_dnode(&dn);
@@ -521,9 +780,13 @@ static int f2fs_write_data_page(struct page *page,
521 loff_t i_size = i_size_read(inode); 780 loff_t i_size = i_size_read(inode);
522 const pgoff_t end_index = ((unsigned long long) i_size) 781 const pgoff_t end_index = ((unsigned long long) i_size)
523 >> PAGE_CACHE_SHIFT; 782 >> PAGE_CACHE_SHIFT;
524 unsigned offset; 783 unsigned offset = 0;
525 bool need_balance_fs = false; 784 bool need_balance_fs = false;
526 int err = 0; 785 int err = 0;
786 struct f2fs_io_info fio = {
787 .type = DATA,
788 .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
789 };
527 790
528 if (page->index < end_index) 791 if (page->index < end_index)
529 goto write; 792 goto write;
@@ -543,7 +806,7 @@ static int f2fs_write_data_page(struct page *page,
543 806
544 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 807 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
545write: 808write:
546 if (sbi->por_doing) { 809 if (unlikely(sbi->por_doing)) {
547 err = AOP_WRITEPAGE_ACTIVATE; 810 err = AOP_WRITEPAGE_ACTIVATE;
548 goto redirty_out; 811 goto redirty_out;
549 } 812 }
@@ -552,10 +815,18 @@ write:
552 if (S_ISDIR(inode->i_mode)) { 815 if (S_ISDIR(inode->i_mode)) {
553 dec_page_count(sbi, F2FS_DIRTY_DENTS); 816 dec_page_count(sbi, F2FS_DIRTY_DENTS);
554 inode_dec_dirty_dents(inode); 817 inode_dec_dirty_dents(inode);
555 err = do_write_data_page(page); 818 err = do_write_data_page(page, &fio);
556 } else { 819 } else {
557 f2fs_lock_op(sbi); 820 f2fs_lock_op(sbi);
558 err = do_write_data_page(page); 821
822 if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) {
823 err = f2fs_write_inline_data(inode, page, offset);
824 f2fs_unlock_op(sbi);
825 goto out;
826 } else {
827 err = do_write_data_page(page, &fio);
828 }
829
559 f2fs_unlock_op(sbi); 830 f2fs_unlock_op(sbi);
560 need_balance_fs = true; 831 need_balance_fs = true;
561 } 832 }
@@ -564,8 +835,10 @@ write:
564 else if (err) 835 else if (err)
565 goto redirty_out; 836 goto redirty_out;
566 837
567 if (wbc->for_reclaim) 838 if (wbc->for_reclaim) {
568 f2fs_submit_bio(sbi, DATA, true); 839 f2fs_submit_merged_bio(sbi, DATA, WRITE);
840 need_balance_fs = false;
841 }
569 842
570 clear_cold_data(page); 843 clear_cold_data(page);
571out: 844out:
@@ -617,7 +890,8 @@ static int f2fs_write_data_pages(struct address_space *mapping,
617 ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); 890 ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
618 if (locked) 891 if (locked)
619 mutex_unlock(&sbi->writepages); 892 mutex_unlock(&sbi->writepages);
620 f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); 893
894 f2fs_submit_merged_bio(sbi, DATA, WRITE);
621 895
622 remove_dirty_dir_inode(inode); 896 remove_dirty_dir_inode(inode);
623 897
@@ -638,27 +912,28 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
638 912
639 f2fs_balance_fs(sbi); 913 f2fs_balance_fs(sbi);
640repeat: 914repeat:
915 err = f2fs_convert_inline_data(inode, pos + len);
916 if (err)
917 return err;
918
641 page = grab_cache_page_write_begin(mapping, index, flags); 919 page = grab_cache_page_write_begin(mapping, index, flags);
642 if (!page) 920 if (!page)
643 return -ENOMEM; 921 return -ENOMEM;
644 *pagep = page; 922 *pagep = page;
645 923
646 f2fs_lock_op(sbi); 924 if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA)
925 goto inline_data;
647 926
927 f2fs_lock_op(sbi);
648 set_new_dnode(&dn, inode, NULL, NULL, 0); 928 set_new_dnode(&dn, inode, NULL, NULL, 0);
649 err = get_dnode_of_data(&dn, index, ALLOC_NODE); 929 err = f2fs_reserve_block(&dn, index);
650 if (err)
651 goto err;
652
653 if (dn.data_blkaddr == NULL_ADDR)
654 err = reserve_new_block(&dn);
655
656 f2fs_put_dnode(&dn);
657 if (err)
658 goto err;
659
660 f2fs_unlock_op(sbi); 930 f2fs_unlock_op(sbi);
661 931
932 if (err) {
933 f2fs_put_page(page, 1);
934 return err;
935 }
936inline_data:
662 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) 937 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
663 return 0; 938 return 0;
664 939
@@ -674,15 +949,19 @@ repeat:
674 if (dn.data_blkaddr == NEW_ADDR) { 949 if (dn.data_blkaddr == NEW_ADDR) {
675 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 950 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
676 } else { 951 } else {
677 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); 952 if (f2fs_has_inline_data(inode))
953 err = f2fs_read_inline_data(inode, page);
954 else
955 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
956 READ_SYNC);
678 if (err) 957 if (err)
679 return err; 958 return err;
680 lock_page(page); 959 lock_page(page);
681 if (!PageUptodate(page)) { 960 if (unlikely(!PageUptodate(page))) {
682 f2fs_put_page(page, 1); 961 f2fs_put_page(page, 1);
683 return -EIO; 962 return -EIO;
684 } 963 }
685 if (page->mapping != mapping) { 964 if (unlikely(page->mapping != mapping)) {
686 f2fs_put_page(page, 1); 965 f2fs_put_page(page, 1);
687 goto repeat; 966 goto repeat;
688 } 967 }
@@ -691,11 +970,6 @@ out:
691 SetPageUptodate(page); 970 SetPageUptodate(page);
692 clear_cold_data(page); 971 clear_cold_data(page);
693 return 0; 972 return 0;
694
695err:
696 f2fs_unlock_op(sbi);
697 f2fs_put_page(page, 1);
698 return err;
699} 973}
700 974
701static int f2fs_write_end(struct file *file, 975static int f2fs_write_end(struct file *file,
@@ -714,23 +988,43 @@ static int f2fs_write_end(struct file *file,
714 update_inode_page(inode); 988 update_inode_page(inode);
715 } 989 }
716 990
717 unlock_page(page); 991 f2fs_put_page(page, 1);
718 page_cache_release(page);
719 return copied; 992 return copied;
720} 993}
721 994
995static int check_direct_IO(struct inode *inode, int rw,
996 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
997{
998 unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
999 int i;
1000
1001 if (rw == READ)
1002 return 0;
1003
1004 if (offset & blocksize_mask)
1005 return -EINVAL;
1006
1007 for (i = 0; i < nr_segs; i++)
1008 if (iov[i].iov_len & blocksize_mask)
1009 return -EINVAL;
1010 return 0;
1011}
1012
722static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, 1013static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
723 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 1014 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
724{ 1015{
725 struct file *file = iocb->ki_filp; 1016 struct file *file = iocb->ki_filp;
726 struct inode *inode = file->f_mapping->host; 1017 struct inode *inode = file->f_mapping->host;
727 1018
728 if (rw == WRITE) 1019 /* Let buffer I/O handle the inline data case. */
1020 if (f2fs_has_inline_data(inode))
1021 return 0;
1022
1023 if (check_direct_IO(inode, rw, iov, offset, nr_segs))
729 return 0; 1024 return 0;
730 1025
731 /* Needs synchronization with the cleaner */
732 return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, 1026 return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
733 get_data_block_ro); 1027 get_data_block);
734} 1028}
735 1029
736static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, 1030static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
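check_direct_IO() enforces alignment only for writes; reads always pass, and a rejected write makes f2fs_direct_IO() return 0 so the request quietly falls back to buffered IO. A worked example assuming 4KB blocks (blocksize_mask == 0xfff):

	/*
	 * write, offset 8192, iov_len 4096 -> both masked values are 0,
	 *					real direct IO proceeds
	 * write, offset 8200, iov_len 4096 -> 8200 & 0xfff == 8, the check
	 *					fails and f2fs_direct_IO()
	 *					returns 0 (buffered fallback)
	 * any read			    -> allowed through unchanged
	 */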
@@ -759,6 +1053,8 @@ static int f2fs_set_data_page_dirty(struct page *page)
759 trace_f2fs_set_page_dirty(page, DATA); 1053 trace_f2fs_set_page_dirty(page, DATA);
760 1054
761 SetPageUptodate(page); 1055 SetPageUptodate(page);
1056 mark_inode_dirty(inode);
1057
762 if (!PageDirty(page)) { 1058 if (!PageDirty(page)) {
763 __set_page_dirty_nobuffers(page); 1059 __set_page_dirty_nobuffers(page);
764 set_dirty_dir_page(inode, page); 1060 set_dirty_dir_page(inode, page);
@@ -769,7 +1065,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
769 1065
770static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) 1066static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
771{ 1067{
772 return generic_block_bmap(mapping, block, get_data_block_ro); 1068 return generic_block_bmap(mapping, block, get_data_block);
773} 1069}
774 1070
775const struct address_space_operations f2fs_dblock_aops = { 1071const struct address_space_operations f2fs_dblock_aops = {
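Putting the new data.c primitives together: a writer fills an f2fs_io_info describing the IO class, hands pages to f2fs_submit_page_mbio() (which keeps appending to the per-type bio while block addresses stay consecutive), and flushes whatever is still pending with f2fs_submit_merged_bio(). A usage sketch with the names from this diff:

	struct f2fs_io_info fio = {
		.type = DATA,
		.rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
	};

	/* queued, and merged with the previous page when contiguous */
	f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);

	/* later, e.g. at the end of ->writepages: flush the partial bio */
	f2fs_submit_merged_bio(sbi, DATA, WRITE);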
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a84b0a8e6854..3de9d20d0c14 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -24,7 +24,7 @@
24#include "gc.h" 24#include "gc.h"
25 25
26static LIST_HEAD(f2fs_stat_list); 26static LIST_HEAD(f2fs_stat_list);
27static struct dentry *debugfs_root; 27static struct dentry *f2fs_debugfs_root;
28static DEFINE_MUTEX(f2fs_stat_mutex); 28static DEFINE_MUTEX(f2fs_stat_mutex);
29 29
30static void update_general_status(struct f2fs_sb_info *sbi) 30static void update_general_status(struct f2fs_sb_info *sbi)
@@ -45,14 +45,15 @@ static void update_general_status(struct f2fs_sb_info *sbi)
45 si->valid_count = valid_user_blocks(sbi); 45 si->valid_count = valid_user_blocks(sbi);
46 si->valid_node_count = valid_node_count(sbi); 46 si->valid_node_count = valid_node_count(sbi);
47 si->valid_inode_count = valid_inode_count(sbi); 47 si->valid_inode_count = valid_inode_count(sbi);
48 si->inline_inode = sbi->inline_inode;
48 si->utilization = utilization(sbi); 49 si->utilization = utilization(sbi);
49 50
50 si->free_segs = free_segments(sbi); 51 si->free_segs = free_segments(sbi);
51 si->free_secs = free_sections(sbi); 52 si->free_secs = free_sections(sbi);
52 si->prefree_count = prefree_segments(sbi); 53 si->prefree_count = prefree_segments(sbi);
53 si->dirty_count = dirty_segments(sbi); 54 si->dirty_count = dirty_segments(sbi);
54 si->node_pages = sbi->node_inode->i_mapping->nrpages; 55 si->node_pages = NODE_MAPPING(sbi)->nrpages;
55 si->meta_pages = sbi->meta_inode->i_mapping->nrpages; 56 si->meta_pages = META_MAPPING(sbi)->nrpages;
56 si->nats = NM_I(sbi)->nat_cnt; 57 si->nats = NM_I(sbi)->nat_cnt;
57 si->sits = SIT_I(sbi)->dirty_sentries; 58 si->sits = SIT_I(sbi)->dirty_sentries;
58 si->fnids = NM_I(sbi)->fcnt; 59 si->fnids = NM_I(sbi)->fcnt;
@@ -165,9 +166,9 @@ get_cache:
165 /* free nids */ 166 /* free nids */
166 si->cache_mem = NM_I(sbi)->fcnt; 167 si->cache_mem = NM_I(sbi)->fcnt;
167 si->cache_mem += NM_I(sbi)->nat_cnt; 168 si->cache_mem += NM_I(sbi)->nat_cnt;
168 npages = sbi->node_inode->i_mapping->nrpages; 169 npages = NODE_MAPPING(sbi)->nrpages;
169 si->cache_mem += npages << PAGE_CACHE_SHIFT; 170 si->cache_mem += npages << PAGE_CACHE_SHIFT;
170 npages = sbi->meta_inode->i_mapping->nrpages; 171 npages = META_MAPPING(sbi)->nrpages;
171 si->cache_mem += npages << PAGE_CACHE_SHIFT; 172 si->cache_mem += npages << PAGE_CACHE_SHIFT;
172 si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); 173 si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry);
173 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); 174 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
@@ -200,6 +201,8 @@ static int stat_show(struct seq_file *s, void *v)
200 seq_printf(s, "Other: %u)\n - Data: %u\n", 201 seq_printf(s, "Other: %u)\n - Data: %u\n",
201 si->valid_node_count - si->valid_inode_count, 202 si->valid_node_count - si->valid_inode_count,
202 si->valid_count - si->valid_node_count); 203 si->valid_count - si->valid_node_count);
204 seq_printf(s, " - Inline_data Inode: %u\n",
205 si->inline_inode);
203 seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", 206 seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
204 si->main_area_segs, si->main_area_sections, 207 si->main_area_segs, si->main_area_sections,
205 si->main_area_zones); 208 si->main_area_zones);
@@ -242,14 +245,14 @@ static int stat_show(struct seq_file *s, void *v)
242 seq_printf(s, " - node blocks : %d\n", si->node_blks); 245 seq_printf(s, " - node blocks : %d\n", si->node_blks);
243 seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", 246 seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
244 si->hit_ext, si->total_ext); 247 si->hit_ext, si->total_ext);
245 seq_printf(s, "\nBalancing F2FS Async:\n"); 248 seq_puts(s, "\nBalancing F2FS Async:\n");
246 seq_printf(s, " - nodes %4d in %4d\n", 249 seq_printf(s, " - nodes: %4d in %4d\n",
247 si->ndirty_node, si->node_pages); 250 si->ndirty_node, si->node_pages);
248 seq_printf(s, " - dents %4d in dirs:%4d\n", 251 seq_printf(s, " - dents: %4d in dirs:%4d\n",
249 si->ndirty_dent, si->ndirty_dirs); 252 si->ndirty_dent, si->ndirty_dirs);
250 seq_printf(s, " - meta %4d in %4d\n", 253 seq_printf(s, " - meta: %4d in %4d\n",
251 si->ndirty_meta, si->meta_pages); 254 si->ndirty_meta, si->meta_pages);
252 seq_printf(s, " - NATs %5d > %lu\n", 255 seq_printf(s, " - NATs: %5d > %lu\n",
253 si->nats, NM_WOUT_THRESHOLD); 256 si->nats, NM_WOUT_THRESHOLD);
254 seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", 257 seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
255 si->sits, si->fnids); 258 si->sits, si->fnids);
@@ -340,14 +343,32 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
340 343
341void __init f2fs_create_root_stats(void) 344void __init f2fs_create_root_stats(void)
342{ 345{
343 debugfs_root = debugfs_create_dir("f2fs", NULL); 346 struct dentry *file;
344 if (debugfs_root) 347
345 debugfs_create_file("status", S_IRUGO, debugfs_root, 348 f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
346 NULL, &stat_fops); 349 if (!f2fs_debugfs_root)
350 goto bail;
351
352 file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
353 NULL, &stat_fops);
354 if (!file)
355 goto free_debugfs_dir;
356
357 return;
358
359free_debugfs_dir:
360 debugfs_remove(f2fs_debugfs_root);
361
362bail:
363 f2fs_debugfs_root = NULL;
364 return;
347} 365}
348 366
349void f2fs_destroy_root_stats(void) 367void f2fs_destroy_root_stats(void)
350{ 368{
351 debugfs_remove_recursive(debugfs_root); 369 if (!f2fs_debugfs_root)
352 debugfs_root = NULL; 370 return;
371
372 debugfs_remove_recursive(f2fs_debugfs_root);
373 f2fs_debugfs_root = NULL;
353} 374}
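Taken together, the debug.c hunks above rename the root dentry to the less collision-prone f2fs_debugfs_root, unwind when the status file cannot be created, and make teardown a no-op when setup never completed. The same create/unwind/teardown pattern, consolidated into one kernel-context sketch (the myfs_* names and stat_fops are placeholders, not f2fs symbols):

        static struct dentry *myfs_debugfs_root;

        void __init myfs_create_root_stats(void)
        {
                struct dentry *file;

                myfs_debugfs_root = debugfs_create_dir("myfs", NULL);
                if (!myfs_debugfs_root)
                        return;

                file = debugfs_create_file("status", S_IRUGO, myfs_debugfs_root,
                                NULL, &stat_fops);
                if (!file) {
                        /* partial failure: remove what was created */
                        debugfs_remove(myfs_debugfs_root);
                        myfs_debugfs_root = NULL;
                }
        }

        void myfs_destroy_root_stats(void)
        {
                /* a NULL root means setup failed or never ran */
                if (!myfs_debugfs_root)
                        return;
                debugfs_remove_recursive(myfs_debugfs_root);
                myfs_debugfs_root = NULL;
        }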
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 594fc1bb64ef..2b7c255bcbdf 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -190,9 +190,6 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
190 unsigned int max_depth; 190 unsigned int max_depth;
191 unsigned int level; 191 unsigned int level;
192 192
193 if (namelen > F2FS_NAME_LEN)
194 return NULL;
195
196 if (npages == 0) 193 if (npages == 0)
197 return NULL; 194 return NULL;
198 195
@@ -259,20 +256,17 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
259 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 256 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
260 mark_inode_dirty(dir); 257 mark_inode_dirty(dir);
261 258
262 /* update parent inode number before releasing dentry page */
263 F2FS_I(inode)->i_pino = dir->i_ino;
264
265 f2fs_put_page(page, 1); 259 f2fs_put_page(page, 1);
266} 260}
267 261
268static void init_dent_inode(const struct qstr *name, struct page *ipage) 262static void init_dent_inode(const struct qstr *name, struct page *ipage)
269{ 263{
270 struct f2fs_node *rn; 264 struct f2fs_inode *ri;
271 265
272 /* copy name info. to this inode page */ 266 /* copy name info. to this inode page */
273 rn = F2FS_NODE(ipage); 267 ri = F2FS_INODE(ipage);
274 rn->i.i_namelen = cpu_to_le32(name->len); 268 ri->i_namelen = cpu_to_le32(name->len);
275 memcpy(rn->i.i_name, name->name, name->len); 269 memcpy(ri->i_name, name->name, name->len);
276 set_page_dirty(ipage); 270 set_page_dirty(ipage);
277} 271}
278 272
@@ -348,11 +342,11 @@ static struct page *init_inode_metadata(struct inode *inode,
348 342
349 err = f2fs_init_acl(inode, dir, page); 343 err = f2fs_init_acl(inode, dir, page);
350 if (err) 344 if (err)
351 goto error; 345 goto put_error;
352 346
353 err = f2fs_init_security(inode, dir, name, page); 347 err = f2fs_init_security(inode, dir, name, page);
354 if (err) 348 if (err)
355 goto error; 349 goto put_error;
356 350
357 wait_on_page_writeback(page); 351 wait_on_page_writeback(page);
358 } else { 352 } else {
@@ -376,8 +370,9 @@ static struct page *init_inode_metadata(struct inode *inode,
376 } 370 }
377 return page; 371 return page;
378 372
379error: 373put_error:
380 f2fs_put_page(page, 1); 374 f2fs_put_page(page, 1);
375error:
381 remove_inode_page(inode); 376 remove_inode_page(inode);
382 return ERR_PTR(err); 377 return ERR_PTR(err);
383} 378}
@@ -393,6 +388,8 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
393 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); 388 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
394 } 389 }
395 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 390 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
391 mark_inode_dirty(dir);
392
396 if (F2FS_I(dir)->i_current_depth != current_depth) { 393 if (F2FS_I(dir)->i_current_depth != current_depth) {
397 F2FS_I(dir)->i_current_depth = current_depth; 394 F2FS_I(dir)->i_current_depth = current_depth;
398 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); 395 set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
@@ -400,8 +397,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
400 397
401 if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) 398 if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
402 update_inode_page(dir); 399 update_inode_page(dir);
403 else
404 mark_inode_dirty(dir);
405 400
406 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) 401 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
407 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 402 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
@@ -432,10 +427,11 @@ next:
432} 427}
433 428
434/* 429/*
435 * Caller should grab and release a mutex by calling mutex_lock_op() and 430 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
436 * mutex_unlock_op(). 431 * f2fs_unlock_op().
437 */ 432 */
438int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) 433int __f2fs_add_link(struct inode *dir, const struct qstr *name,
434 struct inode *inode)
439{ 435{
440 unsigned int bit_pos; 436 unsigned int bit_pos;
441 unsigned int level; 437 unsigned int level;
@@ -461,7 +457,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
461 } 457 }
462 458
463start: 459start:
464 if (current_depth == MAX_DIR_HASH_DEPTH) 460 if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
465 return -ENOSPC; 461 return -ENOSPC;
466 462
467 /* Increase the depth, if required */ 463 /* Increase the depth, if required */
@@ -554,14 +550,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
554 550
555 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 551 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
556 552
557 if (inode && S_ISDIR(inode->i_mode)) {
558 drop_nlink(dir);
559 update_inode_page(dir);
560 } else {
561 mark_inode_dirty(dir);
562 }
563
564 if (inode) { 553 if (inode) {
554 if (S_ISDIR(inode->i_mode)) {
555 drop_nlink(dir);
556 update_inode_page(dir);
557 }
565 inode->i_ctime = CURRENT_TIME; 558 inode->i_ctime = CURRENT_TIME;
566 drop_nlink(inode); 559 drop_nlink(inode);
567 if (S_ISDIR(inode->i_mode)) { 560 if (S_ISDIR(inode->i_mode)) {
@@ -636,7 +629,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
636 629
637 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); 630 bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
638 631
639 for ( ; n < npages; n++) { 632 for (; n < npages; n++) {
640 dentry_page = get_lock_data_page(inode, n); 633 dentry_page = get_lock_data_page(inode, n);
641 if (IS_ERR(dentry_page)) 634 if (IS_ERR(dentry_page))
642 continue; 635 continue;
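In init_inode_metadata above, the single error label becomes a put_error/error pair: f2fs_put_page() runs only on paths that actually hold the page, then control falls through to the common remove_inode_page() rollback. A runnable userspace distillation of that layered-unwind idiom (every helper here is a stand-in, not f2fs code):

        #include <stdio.h>
        #include <stdlib.h>

        static void *page;

        static int acquire_page(void)  { page = malloc(64); return page ? 0 : -1; }
        static int init_acl(void)      { return -1; /* simulate a failure */ }
        static void release_page(void) { free(page); page = NULL; }
        static void rollback(void)     { puts("common rollback ran"); }

        static int setup_object(void)
        {
                int err;

                err = acquire_page();
                if (err)
                        goto error;        /* page not held: skip the put */

                err = init_acl();
                if (err)
                        goto put_error;    /* page held: release it first */

                return 0;

        put_error:
                release_page();
                /* fall through to the shared rollback, as in the patch */
        error:
                rollback();
                return err;
        }

        int main(void)
        {
                return setup_object() ? EXIT_FAILURE : EXIT_SUCCESS;
        }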
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 89dc7508faf2..fc3c558cb4f3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -22,8 +22,10 @@
22 22
23#ifdef CONFIG_F2FS_CHECK_FS 23#ifdef CONFIG_F2FS_CHECK_FS
24#define f2fs_bug_on(condition) BUG_ON(condition) 24#define f2fs_bug_on(condition) BUG_ON(condition)
25#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
25#else 26#else
26#define f2fs_bug_on(condition) 27#define f2fs_bug_on(condition)
28#define f2fs_down_write(x, y) down_write(x)
27#endif 29#endif
28 30
29/* 31/*
@@ -37,6 +39,7 @@
37#define F2FS_MOUNT_POSIX_ACL 0x00000020 39#define F2FS_MOUNT_POSIX_ACL 0x00000020
38#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 40#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
39#define F2FS_MOUNT_INLINE_XATTR 0x00000080 41#define F2FS_MOUNT_INLINE_XATTR 0x00000080
42#define F2FS_MOUNT_INLINE_DATA 0x00000100
40 43
41#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) 44#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
42#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) 45#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -97,6 +100,13 @@ struct dir_inode_entry {
97 struct inode *inode; /* vfs inode pointer */ 100 struct inode *inode; /* vfs inode pointer */
98}; 101};
99 102
103/* for the list of block addresses to be discarded */

104struct discard_entry {
105 struct list_head list; /* list head */
106 block_t blkaddr; /* block address to be discarded */
107 int len; /* # of consecutive blocks of the discard */
108};
109
100/* for the list of fsync inodes, used only during recovery */ 110/* for the list of fsync inodes, used only during recovery */
101struct fsync_inode_entry { 111struct fsync_inode_entry {
102 struct list_head list; /* list head */ 112 struct list_head list; /* list head */
@@ -155,13 +165,15 @@ enum {
155 LOOKUP_NODE, /* look up a node without readahead */ 165 LOOKUP_NODE, /* look up a node without readahead */
156 LOOKUP_NODE_RA, /* 166 LOOKUP_NODE_RA, /*
157 * look up a node with readahead called 167 * look up a node with readahead called
158 * by get_datablock_ro. 168 * by get_data_block.
159 */ 169 */
160}; 170};
161 171
162#define F2FS_LINK_MAX 32000 /* maximum link count per file */ 172#define F2FS_LINK_MAX 32000 /* maximum link count per file */
163 173
164/* for in-memory extent cache entry */ 174/* for in-memory extent cache entry */
175#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */
176
165struct extent_info { 177struct extent_info {
166 rwlock_t ext_lock; /* rwlock for consistency */ 178 rwlock_t ext_lock; /* rwlock for consistency */
167 unsigned int fofs; /* start offset in a file */ 179 unsigned int fofs; /* start offset in a file */
@@ -308,6 +320,14 @@ struct f2fs_sm_info {
308 320
309 /* a threshold to reclaim prefree segments */ 321 /* a threshold to reclaim prefree segments */
310 unsigned int rec_prefree_segments; 322 unsigned int rec_prefree_segments;
323
324 /* for small discard management */
325 struct list_head discard_list; /* 4KB discard list */
326 int nr_discards; /* # of discards in the list */
327 int max_discards; /* max. discards to be issued */
328
329 unsigned int ipu_policy; /* in-place-update policy */
330 unsigned int min_ipu_util; /* in-place-update threshold */
311}; 331};
312 332
313/* 333/*
@@ -338,6 +358,7 @@ enum count_type {
338 * with waiting the bio's completion 358 * with waiting the bio's completion
339 * ... Only can be used with META. 359 * ... Only can be used with META.
340 */ 360 */
361#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type))
341enum page_type { 362enum page_type {
342 DATA, 363 DATA,
343 NODE, 364 NODE,
@@ -346,6 +367,20 @@ enum page_type {
346 META_FLUSH, 367 META_FLUSH,
347}; 368};
348 369
370struct f2fs_io_info {
371 enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
372 int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
373};
374
375#define is_read_io(rw) (((rw) & 1) == READ)
376struct f2fs_bio_info {
377 struct f2fs_sb_info *sbi; /* f2fs superblock */
378 struct bio *bio; /* bios to merge */
379 sector_t last_block_in_bio; /* last block number */
380 struct f2fs_io_info fio; /* store buffered io info. */
381 struct mutex io_mutex; /* mutex for bio */
382};
383
349struct f2fs_sb_info { 384struct f2fs_sb_info {
350 struct super_block *sb; /* pointer to VFS super block */ 385 struct super_block *sb; /* pointer to VFS super block */
351 struct proc_dir_entry *s_proc; /* proc entry */ 386 struct proc_dir_entry *s_proc; /* proc entry */
@@ -359,9 +394,10 @@ struct f2fs_sb_info {
359 394
360 /* for segment-related operations */ 395 /* for segment-related operations */
361 struct f2fs_sm_info *sm_info; /* segment manager */ 396 struct f2fs_sm_info *sm_info; /* segment manager */
362 struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */ 397
363 sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */ 398 /* for bio operations */
364 struct rw_semaphore bio_sem; /* IO semaphore */ 399 struct f2fs_bio_info read_io; /* for read bios */
400 struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */
365 401
366 /* for checkpoint */ 402 /* for checkpoint */
367 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ 403 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
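Replacing the raw bio array, last_block_in_bio, and the single bio_sem with per-type f2fs_bio_info bundles each merge buffer with its own io_mutex, so DATA, NODE, and META writes batch independently instead of contending on one semaphore. The coalescing behaviour those fields imply, reduced to a runnable userspace analogue in which plain block numbers stand in for bios and sectors:

        #include <stdio.h>

        /* Consecutive block numbers are batched into one "bio";
         * a discontiguity forces a flush, as last_block_in_bio does. */
        struct merge_buf {
                long start;     /* first block of the pending batch, -1 if empty */
                long next;      /* block number expected to extend the batch */
        };

        static void flush(struct merge_buf *b)
        {
                if (b->start < 0)
                        return;
                printf("submit blocks [%ld, %ld)\n", b->start, b->next);
                b->start = -1;
        }

        static void submit_block(struct merge_buf *b, long blk)
        {
                if (b->start >= 0 && blk != b->next)
                        flush(b);          /* not contiguous: submit the batch */
                if (b->start < 0)
                        b->start = blk;    /* open a new batch */
                b->next = blk + 1;
        }

        int main(void)
        {
                struct merge_buf b = { .start = -1 };
                long blks[] = { 10, 11, 12, 40, 41, 7 };

                for (unsigned int i = 0; i < sizeof(blks) / sizeof(blks[0]); i++)
                        submit_block(&b, blks[i]);
                flush(&b);                 /* drain the final batch */
                return 0;
        }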
@@ -376,8 +412,9 @@ struct f2fs_sb_info {
376 412
377 /* for orphan inode management */ 413 /* for orphan inode management */
378 struct list_head orphan_inode_list; /* orphan inode list */ 414 struct list_head orphan_inode_list; /* orphan inode list */
379 struct mutex orphan_inode_mutex; /* for orphan inode list */ 415 spinlock_t orphan_inode_lock; /* for orphan inode list */
380 unsigned int n_orphans; /* # of orphan inodes */ 416 unsigned int n_orphans; /* # of orphan inodes */
417 unsigned int max_orphans; /* max orphan inodes */
381 418
382 /* for directory inode management */ 419 /* for directory inode management */
383 struct list_head dir_inode_list; /* dir inode list */ 420 struct list_head dir_inode_list; /* dir inode list */
@@ -414,6 +451,9 @@ struct f2fs_sb_info {
414 struct f2fs_gc_kthread *gc_thread; /* GC thread */ 451 struct f2fs_gc_kthread *gc_thread; /* GC thread */
415 unsigned int cur_victim_sec; /* current victim section num */ 452 unsigned int cur_victim_sec; /* current victim section num */
416 453
454 /* maximum # of trials to find a victim segment for SSR and GC */
455 unsigned int max_victim_search;
456
417 /* 457 /*
418 * for stat information. 458 * for stat information.
419 * one is for the LFS mode, and the other is for the SSR mode. 459 * one is for the LFS mode, and the other is for the SSR mode.
@@ -423,6 +463,7 @@ struct f2fs_sb_info {
423 unsigned int segment_count[2]; /* # of allocated segments */ 463 unsigned int segment_count[2]; /* # of allocated segments */
424 unsigned int block_count[2]; /* # of allocated blocks */ 464 unsigned int block_count[2]; /* # of allocated blocks */
425 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ 465 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
466 int inline_inode; /* # of inline_data inodes */
426 int bg_gc; /* background gc calls */ 467 int bg_gc; /* background gc calls */
427 unsigned int n_dirty_dirs; /* # of dir inodes */ 468 unsigned int n_dirty_dirs; /* # of dir inodes */
428#endif 469#endif
@@ -462,6 +503,11 @@ static inline struct f2fs_node *F2FS_NODE(struct page *page)
462 return (struct f2fs_node *)page_address(page); 503 return (struct f2fs_node *)page_address(page);
463} 504}
464 505
506static inline struct f2fs_inode *F2FS_INODE(struct page *page)
507{
508 return &((struct f2fs_node *)page_address(page))->i;
509}
510
465static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) 511static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
466{ 512{
467 return (struct f2fs_nm_info *)(sbi->nm_info); 513 return (struct f2fs_nm_info *)(sbi->nm_info);
@@ -487,6 +533,16 @@ static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi)
487 return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); 533 return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info);
488} 534}
489 535
536static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi)
537{
538 return sbi->meta_inode->i_mapping;
539}
540
541static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
542{
543 return sbi->node_inode->i_mapping;
544}
545
490static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) 546static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
491{ 547{
492 sbi->s_dirty = 1; 548 sbi->s_dirty = 1;
@@ -534,7 +590,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
534 590
535static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) 591static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
536{ 592{
537 down_write_nest_lock(&sbi->cp_rwsem, &sbi->cp_mutex); 593 f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
538} 594}
539 595
540static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) 596static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
@@ -548,7 +604,7 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
548static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) 604static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
549{ 605{
550 WARN_ON((nid >= NM_I(sbi)->max_nid)); 606 WARN_ON((nid >= NM_I(sbi)->max_nid));
551 if (nid >= NM_I(sbi)->max_nid) 607 if (unlikely(nid >= NM_I(sbi)->max_nid))
552 return -EINVAL; 608 return -EINVAL;
553 return 0; 609 return 0;
554} 610}
@@ -561,9 +617,9 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
561static inline int F2FS_HAS_BLOCKS(struct inode *inode) 617static inline int F2FS_HAS_BLOCKS(struct inode *inode)
562{ 618{
563 if (F2FS_I(inode)->i_xattr_nid) 619 if (F2FS_I(inode)->i_xattr_nid)
564 return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1); 620 return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1;
565 else 621 else
566 return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS); 622 return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS;
567} 623}
568 624
569static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, 625static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
@@ -574,7 +630,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
574 spin_lock(&sbi->stat_lock); 630 spin_lock(&sbi->stat_lock);
575 valid_block_count = 631 valid_block_count =
576 sbi->total_valid_block_count + (block_t)count; 632 sbi->total_valid_block_count + (block_t)count;
577 if (valid_block_count > sbi->user_block_count) { 633 if (unlikely(valid_block_count > sbi->user_block_count)) {
578 spin_unlock(&sbi->stat_lock); 634 spin_unlock(&sbi->stat_lock);
579 return false; 635 return false;
580 } 636 }
@@ -585,7 +641,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
585 return true; 641 return true;
586} 642}
587 643
588static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, 644static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
589 struct inode *inode, 645 struct inode *inode,
590 blkcnt_t count) 646 blkcnt_t count)
591{ 647{
@@ -595,7 +651,6 @@ static inline int dec_valid_block_count(struct f2fs_sb_info *sbi,
595 inode->i_blocks -= count; 651 inode->i_blocks -= count;
596 sbi->total_valid_block_count -= (block_t)count; 652 sbi->total_valid_block_count -= (block_t)count;
597 spin_unlock(&sbi->stat_lock); 653 spin_unlock(&sbi->stat_lock);
598 return 0;
599} 654}
600 655
601static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) 656static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -686,50 +741,48 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
686} 741}
687 742
688static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, 743static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
689 struct inode *inode, 744 struct inode *inode)
690 unsigned int count)
691{ 745{
692 block_t valid_block_count; 746 block_t valid_block_count;
693 unsigned int valid_node_count; 747 unsigned int valid_node_count;
694 748
695 spin_lock(&sbi->stat_lock); 749 spin_lock(&sbi->stat_lock);
696 750
697 valid_block_count = sbi->total_valid_block_count + (block_t)count; 751 valid_block_count = sbi->total_valid_block_count + 1;
698 sbi->alloc_valid_block_count += (block_t)count; 752 if (unlikely(valid_block_count > sbi->user_block_count)) {
699 valid_node_count = sbi->total_valid_node_count + count;
700
701 if (valid_block_count > sbi->user_block_count) {
702 spin_unlock(&sbi->stat_lock); 753 spin_unlock(&sbi->stat_lock);
703 return false; 754 return false;
704 } 755 }
705 756
706 if (valid_node_count > sbi->total_node_count) { 757 valid_node_count = sbi->total_valid_node_count + 1;
758 if (unlikely(valid_node_count > sbi->total_node_count)) {
707 spin_unlock(&sbi->stat_lock); 759 spin_unlock(&sbi->stat_lock);
708 return false; 760 return false;
709 } 761 }
710 762
711 if (inode) 763 if (inode)
712 inode->i_blocks += count; 764 inode->i_blocks++;
713 sbi->total_valid_node_count = valid_node_count; 765
714 sbi->total_valid_block_count = valid_block_count; 766 sbi->alloc_valid_block_count++;
767 sbi->total_valid_node_count++;
768 sbi->total_valid_block_count++;
715 spin_unlock(&sbi->stat_lock); 769 spin_unlock(&sbi->stat_lock);
716 770
717 return true; 771 return true;
718} 772}
719 773
720static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, 774static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
721 struct inode *inode, 775 struct inode *inode)
722 unsigned int count)
723{ 776{
724 spin_lock(&sbi->stat_lock); 777 spin_lock(&sbi->stat_lock);
725 778
726 f2fs_bug_on(sbi->total_valid_block_count < count); 779 f2fs_bug_on(!sbi->total_valid_block_count);
727 f2fs_bug_on(sbi->total_valid_node_count < count); 780 f2fs_bug_on(!sbi->total_valid_node_count);
728 f2fs_bug_on(inode->i_blocks < count); 781 f2fs_bug_on(!inode->i_blocks);
729 782
730 inode->i_blocks -= count; 783 inode->i_blocks--;
731 sbi->total_valid_node_count -= count; 784 sbi->total_valid_node_count--;
732 sbi->total_valid_block_count -= (block_t)count; 785 sbi->total_valid_block_count--;
733 786
734 spin_unlock(&sbi->stat_lock); 787 spin_unlock(&sbi->stat_lock);
735} 788}
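The rewritten inc_valid_node_count() reserves exactly one node and one block: each limit is checked under stat_lock before any counter moves, and a failed check unlocks and returns with nothing half-applied; the matching dec path now simply asserts the counters are non-zero. That check-then-commit shape as a runnable userspace sketch, with a pthread mutex standing in for the spinlock:

        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>

        static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
        static unsigned long valid_blocks, valid_nodes;
        static const unsigned long user_blocks = 100, total_nodes = 10;

        /* Reserve one node + one block atomically: verify both limits,
         * then commit both counters, or touch neither. */
        static bool inc_valid_node_count(void)
        {
                bool ok = false;

                pthread_mutex_lock(&stat_lock);
                if (valid_blocks + 1 <= user_blocks &&
                    valid_nodes + 1 <= total_nodes) {
                        valid_blocks++;
                        valid_nodes++;
                        ok = true;
                }
                pthread_mutex_unlock(&stat_lock);
                return ok;
        }

        int main(void)
        {
                int granted = 0;

                for (int i = 0; i < 20; i++)
                        if (inc_valid_node_count())
                                granted++;
                printf("granted %d of 20 reservations\n", granted);  /* 10 */
                return 0;
        }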
@@ -751,13 +804,12 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
751 spin_unlock(&sbi->stat_lock); 804 spin_unlock(&sbi->stat_lock);
752} 805}
753 806
754static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi) 807static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
755{ 808{
756 spin_lock(&sbi->stat_lock); 809 spin_lock(&sbi->stat_lock);
757 f2fs_bug_on(!sbi->total_valid_inode_count); 810 f2fs_bug_on(!sbi->total_valid_inode_count);
758 sbi->total_valid_inode_count--; 811 sbi->total_valid_inode_count--;
759 spin_unlock(&sbi->stat_lock); 812 spin_unlock(&sbi->stat_lock);
760 return 0;
761} 813}
762 814
763static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) 815static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
@@ -771,7 +823,7 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
771 823
772static inline void f2fs_put_page(struct page *page, int unlock) 824static inline void f2fs_put_page(struct page *page, int unlock)
773{ 825{
774 if (!page || IS_ERR(page)) 826 if (!page)
775 return; 827 return;
776 828
777 if (unlock) { 829 if (unlock) {
@@ -876,7 +928,9 @@ enum {
876 FI_NO_ALLOC, /* should not allocate any blocks */ 928 FI_NO_ALLOC, /* should not allocate any blocks */
877 FI_UPDATE_DIR, /* should update inode block for consistency */ 929 FI_UPDATE_DIR, /* should update inode block for consistency */
878 FI_DELAY_IPUT, /* used for the recovery */ 930 FI_DELAY_IPUT, /* used for the recovery */
931 FI_NO_EXTENT, /* not to use the extent cache */
879 FI_INLINE_XATTR, /* used for inline xattr */ 932 FI_INLINE_XATTR, /* used for inline xattr */
933 FI_INLINE_DATA, /* used for inline data */

880}; 934};
881 935
882static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) 936static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -914,6 +968,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi,
914{ 968{
915 if (ri->i_inline & F2FS_INLINE_XATTR) 969 if (ri->i_inline & F2FS_INLINE_XATTR)
916 set_inode_flag(fi, FI_INLINE_XATTR); 970 set_inode_flag(fi, FI_INLINE_XATTR);
971 if (ri->i_inline & F2FS_INLINE_DATA)
972 set_inode_flag(fi, FI_INLINE_DATA);
917} 973}
918 974
919static inline void set_raw_inline(struct f2fs_inode_info *fi, 975static inline void set_raw_inline(struct f2fs_inode_info *fi,
@@ -923,6 +979,8 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
923 979
924 if (is_inode_flag_set(fi, FI_INLINE_XATTR)) 980 if (is_inode_flag_set(fi, FI_INLINE_XATTR))
925 ri->i_inline |= F2FS_INLINE_XATTR; 981 ri->i_inline |= F2FS_INLINE_XATTR;
982 if (is_inode_flag_set(fi, FI_INLINE_DATA))
983 ri->i_inline |= F2FS_INLINE_DATA;
926} 984}
927 985
928static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) 986static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
@@ -948,16 +1006,33 @@ static inline int inline_xattr_size(struct inode *inode)
948 return 0; 1006 return 0;
949} 1007}
950 1008
1009static inline int f2fs_has_inline_data(struct inode *inode)
1010{
1011 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
1012}
1013
1014static inline void *inline_data_addr(struct page *page)
1015{
1016 struct f2fs_inode *ri;
1017 ri = (struct f2fs_inode *)page_address(page);
1018 return (void *)&(ri->i_addr[1]);
1019}
1020
951static inline int f2fs_readonly(struct super_block *sb) 1021static inline int f2fs_readonly(struct super_block *sb)
952{ 1022{
953 return sb->s_flags & MS_RDONLY; 1023 return sb->s_flags & MS_RDONLY;
954} 1024}
955 1025
1026#define get_inode_mode(i) \
1027 ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
1028 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
1029
956/* 1030/*
957 * file.c 1031 * file.c
958 */ 1032 */
959int f2fs_sync_file(struct file *, loff_t, loff_t, int); 1033int f2fs_sync_file(struct file *, loff_t, loff_t, int);
960void truncate_data_blocks(struct dnode_of_data *); 1034void truncate_data_blocks(struct dnode_of_data *);
1035int truncate_blocks(struct inode *, u64);
961void f2fs_truncate(struct inode *); 1036void f2fs_truncate(struct inode *);
962int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 1037int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
963int f2fs_setattr(struct dentry *, struct iattr *); 1038int f2fs_setattr(struct dentry *, struct iattr *);
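inline_data_addr() returning &ri->i_addr[1] pins down the on-disk layout: the inline payload starts at the second block-address slot, and inline.c (added later in this diff) relies on i_addr[0] staying free so a real block can be reserved without clobbering the data. Roughly what that implies for capacity; the constants below are assumptions for illustration, not quoted from f2fs_fs.h:

        #include <stdint.h>
        #include <stdio.h>

        /* Assumed values: i_addr[] slots per inode block, and slots the
         * inline-xattr area may reserve at the tail. */
        #define ADDRS_PER_INODE         923
        #define INLINE_XATTR_ADDRS       50

        int main(void)
        {
                /* i_addr[0] is skipped, so the payload spans the remaining
                 * 32-bit slots between it and any inline-xattr area. */
                size_t max_inline = sizeof(uint32_t) *
                                (ADDRS_PER_INODE - INLINE_XATTR_ADDRS - 1);
                printf("inline capacity: %zu bytes\n", max_inline);
                return 0;
        }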
@@ -1027,7 +1102,7 @@ int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
1027int truncate_inode_blocks(struct inode *, pgoff_t); 1102int truncate_inode_blocks(struct inode *, pgoff_t);
1028int truncate_xattr_node(struct inode *, struct page *); 1103int truncate_xattr_node(struct inode *, struct page *);
1029int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); 1104int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
1030int remove_inode_page(struct inode *); 1105void remove_inode_page(struct inode *);
1031struct page *new_inode_page(struct inode *, const struct qstr *); 1106struct page *new_inode_page(struct inode *, const struct qstr *);
1032struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); 1107struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
1033void ra_node_page(struct f2fs_sb_info *, nid_t); 1108void ra_node_page(struct f2fs_sb_info *, nid_t);
@@ -1059,19 +1134,19 @@ void clear_prefree_segments(struct f2fs_sb_info *);
1059int npages_for_summary_flush(struct f2fs_sb_info *); 1134int npages_for_summary_flush(struct f2fs_sb_info *);
1060void allocate_new_segments(struct f2fs_sb_info *); 1135void allocate_new_segments(struct f2fs_sb_info *);
1061struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); 1136struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
1062struct bio *f2fs_bio_alloc(struct block_device *, int);
1063void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool);
1064void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
1065void write_meta_page(struct f2fs_sb_info *, struct page *); 1137void write_meta_page(struct f2fs_sb_info *, struct page *);
1066void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, 1138void write_node_page(struct f2fs_sb_info *, struct page *,
1067 block_t, block_t *); 1139 struct f2fs_io_info *, unsigned int, block_t, block_t *);
1068void write_data_page(struct inode *, struct page *, struct dnode_of_data*, 1140void write_data_page(struct page *, struct dnode_of_data *, block_t *,
1069 block_t, block_t *); 1141 struct f2fs_io_info *);
1070void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t); 1142void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
1071void recover_data_page(struct f2fs_sb_info *, struct page *, 1143void recover_data_page(struct f2fs_sb_info *, struct page *,
1072 struct f2fs_summary *, block_t, block_t); 1144 struct f2fs_summary *, block_t, block_t);
1073void rewrite_node_page(struct f2fs_sb_info *, struct page *, 1145void rewrite_node_page(struct f2fs_sb_info *, struct page *,
1074 struct f2fs_summary *, block_t, block_t); 1146 struct f2fs_summary *, block_t, block_t);
1147void allocate_data_block(struct f2fs_sb_info *, struct page *,
1148 block_t, block_t *, struct f2fs_summary *, int);
1149void f2fs_wait_on_page_writeback(struct page *, enum page_type);
1075void write_data_summaries(struct f2fs_sb_info *, block_t); 1150void write_data_summaries(struct f2fs_sb_info *, block_t);
1076void write_node_summaries(struct f2fs_sb_info *, block_t); 1151void write_node_summaries(struct f2fs_sb_info *, block_t);
1077int lookup_journal_in_cursum(struct f2fs_summary_block *, 1152int lookup_journal_in_cursum(struct f2fs_summary_block *,
@@ -1079,6 +1154,8 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *,
1079void flush_sit_entries(struct f2fs_sb_info *); 1154void flush_sit_entries(struct f2fs_sb_info *);
1080int build_segment_manager(struct f2fs_sb_info *); 1155int build_segment_manager(struct f2fs_sb_info *);
1081void destroy_segment_manager(struct f2fs_sb_info *); 1156void destroy_segment_manager(struct f2fs_sb_info *);
1157int __init create_segment_manager_caches(void);
1158void destroy_segment_manager_caches(void);
1082 1159
1083/* 1160/*
1084 * checkpoint.c 1161 * checkpoint.c
@@ -1090,7 +1167,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *);
1090void release_orphan_inode(struct f2fs_sb_info *); 1167void release_orphan_inode(struct f2fs_sb_info *);
1091void add_orphan_inode(struct f2fs_sb_info *, nid_t); 1168void add_orphan_inode(struct f2fs_sb_info *, nid_t);
1092void remove_orphan_inode(struct f2fs_sb_info *, nid_t); 1169void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
1093int recover_orphan_inodes(struct f2fs_sb_info *); 1170void recover_orphan_inodes(struct f2fs_sb_info *);
1094int get_valid_checkpoint(struct f2fs_sb_info *); 1171int get_valid_checkpoint(struct f2fs_sb_info *);
1095void set_dirty_dir_page(struct inode *, struct page *); 1172void set_dirty_dir_page(struct inode *, struct page *);
1096void add_dirty_dir_inode(struct inode *); 1173void add_dirty_dir_inode(struct inode *);
@@ -1105,13 +1182,17 @@ void destroy_checkpoint_caches(void);
1105/* 1182/*
1106 * data.c 1183 * data.c
1107 */ 1184 */
1185void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
1186int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int);
1187void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
1188 struct f2fs_io_info *);
1108int reserve_new_block(struct dnode_of_data *); 1189int reserve_new_block(struct dnode_of_data *);
1190int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
1109void update_extent_cache(block_t, struct dnode_of_data *); 1191void update_extent_cache(block_t, struct dnode_of_data *);
1110struct page *find_data_page(struct inode *, pgoff_t, bool); 1192struct page *find_data_page(struct inode *, pgoff_t, bool);
1111struct page *get_lock_data_page(struct inode *, pgoff_t); 1193struct page *get_lock_data_page(struct inode *, pgoff_t);
1112struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); 1194struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
1113int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); 1195int do_write_data_page(struct page *, struct f2fs_io_info *);
1114int do_write_data_page(struct page *);
1115 1196
1116/* 1197/*
1117 * gc.c 1198 * gc.c
@@ -1144,7 +1225,7 @@ struct f2fs_stat_info {
1144 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; 1225 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
1145 int nats, sits, fnids; 1226 int nats, sits, fnids;
1146 int total_count, utilization; 1227 int total_count, utilization;
1147 int bg_gc; 1228 int bg_gc, inline_inode;
1148 unsigned int valid_count, valid_node_count, valid_inode_count; 1229 unsigned int valid_count, valid_node_count, valid_inode_count;
1149 unsigned int bimodal, avg_vblocks; 1230 unsigned int bimodal, avg_vblocks;
1150 int util_free, util_valid, util_invalid; 1231 int util_free, util_valid, util_invalid;
@@ -1164,7 +1245,7 @@ struct f2fs_stat_info {
1164 1245
1165static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) 1246static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1166{ 1247{
1167 return (struct f2fs_stat_info*)sbi->stat_info; 1248 return (struct f2fs_stat_info *)sbi->stat_info;
1168} 1249}
1169 1250
1170#define stat_inc_call_count(si) ((si)->call_count++) 1251#define stat_inc_call_count(si) ((si)->call_count++)
@@ -1173,6 +1254,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1173#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) 1254#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--)
1174#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) 1255#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++)
1175#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) 1256#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++)
1257#define stat_inc_inline_inode(inode) \
1258 do { \
1259 if (f2fs_has_inline_data(inode)) \
1260 ((F2FS_SB(inode->i_sb))->inline_inode++); \
1261 } while (0)
1262#define stat_dec_inline_inode(inode) \
1263 do { \
1264 if (f2fs_has_inline_data(inode)) \
1265 ((F2FS_SB(inode->i_sb))->inline_inode--); \
1266 } while (0)
1267
1176#define stat_inc_seg_type(sbi, curseg) \ 1268#define stat_inc_seg_type(sbi, curseg) \
1177 ((sbi)->segment_count[(curseg)->alloc_type]++) 1269 ((sbi)->segment_count[(curseg)->alloc_type]++)
1178#define stat_inc_block_count(sbi, curseg) \ 1270#define stat_inc_block_count(sbi, curseg) \
@@ -1216,6 +1308,8 @@ void f2fs_destroy_root_stats(void);
1216#define stat_dec_dirty_dir(sbi) 1308#define stat_dec_dirty_dir(sbi)
1217#define stat_inc_total_hit(sb) 1309#define stat_inc_total_hit(sb)
1218#define stat_inc_read_hit(sb) 1310#define stat_inc_read_hit(sb)
1311#define stat_inc_inline_inode(inode)
1312#define stat_dec_inline_inode(inode)
1219#define stat_inc_seg_type(sbi, curseg) 1313#define stat_inc_seg_type(sbi, curseg)
1220#define stat_inc_block_count(sbi, curseg) 1314#define stat_inc_block_count(sbi, curseg)
1221#define stat_inc_seg_count(si, type) 1315#define stat_inc_seg_count(si, type)
@@ -1238,4 +1332,13 @@ extern const struct address_space_operations f2fs_meta_aops;
1238extern const struct inode_operations f2fs_dir_inode_operations; 1332extern const struct inode_operations f2fs_dir_inode_operations;
1239extern const struct inode_operations f2fs_symlink_inode_operations; 1333extern const struct inode_operations f2fs_symlink_inode_operations;
1240extern const struct inode_operations f2fs_special_inode_operations; 1334extern const struct inode_operations f2fs_special_inode_operations;
1335
1336/*
1337 * inline.c
1338 */
1339bool f2fs_may_inline(struct inode *);
1340int f2fs_read_inline_data(struct inode *, struct page *);
1341int f2fs_convert_inline_data(struct inode *, pgoff_t);
1342int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
1343int recover_inline_data(struct inode *, struct page *);
1241#endif 1344#endif
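Among the f2fs.h additions above, stat_inc_inline_inode() and stat_dec_inline_inode() wrap a guarded statement in do { ... } while (0), the idiom that makes a multi-statement or conditional macro expand to exactly one statement. A runnable demonstration of why the wrapper matters:

        #include <stdio.h>

        /* Same shape as the stat macros above: the guarded increment is
         * wrapped so the expansion behaves as one single statement. */
        #define STAT_INC(cond, x)       do { if (cond) (x)++; } while (0)

        int main(void)
        {
                int hits = 0, misses = 0;

                /* Safe under an unbraced if/else.  Were the macro a bare
                 * "if (cond) (x)++", the else below would bind to the
                 * macro's inner if (dangling else) and miscount. */
                if (hits == 0)
                        STAT_INC(1, hits);
                else
                        misses++;

                printf("hits=%d misses=%d\n", hits, misses);  /* 1 and 0 */
                return 0;
        }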
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 7d714f4972d5..0dfcef53a6ed 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,7 +33,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
33 struct page *page = vmf->page; 33 struct page *page = vmf->page;
34 struct inode *inode = file_inode(vma->vm_file); 34 struct inode *inode = file_inode(vma->vm_file);
35 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 35 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
36 block_t old_blk_addr;
37 struct dnode_of_data dn; 36 struct dnode_of_data dn;
38 int err; 37 int err;
39 38
@@ -44,30 +43,16 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
44 /* block allocation */ 43 /* block allocation */
45 f2fs_lock_op(sbi); 44 f2fs_lock_op(sbi);
46 set_new_dnode(&dn, inode, NULL, NULL, 0); 45 set_new_dnode(&dn, inode, NULL, NULL, 0);
47 err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); 46 err = f2fs_reserve_block(&dn, page->index);
48 if (err) {
49 f2fs_unlock_op(sbi);
50 goto out;
51 }
52
53 old_blk_addr = dn.data_blkaddr;
54
55 if (old_blk_addr == NULL_ADDR) {
56 err = reserve_new_block(&dn);
57 if (err) {
58 f2fs_put_dnode(&dn);
59 f2fs_unlock_op(sbi);
60 goto out;
61 }
62 }
63 f2fs_put_dnode(&dn);
64 f2fs_unlock_op(sbi); 47 f2fs_unlock_op(sbi);
48 if (err)
49 goto out;
65 50
66 file_update_time(vma->vm_file); 51 file_update_time(vma->vm_file);
67 lock_page(page); 52 lock_page(page);
68 if (page->mapping != inode->i_mapping || 53 if (unlikely(page->mapping != inode->i_mapping ||
69 page_offset(page) > i_size_read(inode) || 54 page_offset(page) > i_size_read(inode) ||
70 !PageUptodate(page)) { 55 !PageUptodate(page))) {
71 unlock_page(page); 56 unlock_page(page);
72 err = -EFAULT; 57 err = -EFAULT;
73 goto out; 58 goto out;
@@ -130,12 +115,12 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
130 int ret = 0; 115 int ret = 0;
131 bool need_cp = false; 116 bool need_cp = false;
132 struct writeback_control wbc = { 117 struct writeback_control wbc = {
133 .sync_mode = WB_SYNC_ALL, 118 .sync_mode = WB_SYNC_NONE,
134 .nr_to_write = LONG_MAX, 119 .nr_to_write = LONG_MAX,
135 .for_reclaim = 0, 120 .for_reclaim = 0,
136 }; 121 };
137 122
138 if (f2fs_readonly(inode->i_sb)) 123 if (unlikely(f2fs_readonly(inode->i_sb)))
139 return 0; 124 return 0;
140 125
141 trace_f2fs_sync_file_enter(inode); 126 trace_f2fs_sync_file_enter(inode);
@@ -217,7 +202,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
217 raw_node = F2FS_NODE(dn->node_page); 202 raw_node = F2FS_NODE(dn->node_page);
218 addr = blkaddr_in_node(raw_node) + ofs; 203 addr = blkaddr_in_node(raw_node) + ofs;
219 204
220 for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { 205 for (; count > 0; count--, addr++, dn->ofs_in_node++) {
221 block_t blkaddr = le32_to_cpu(*addr); 206 block_t blkaddr = le32_to_cpu(*addr);
222 if (blkaddr == NULL_ADDR) 207 if (blkaddr == NULL_ADDR)
223 continue; 208 continue;
@@ -256,7 +241,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
256 return; 241 return;
257 242
258 lock_page(page); 243 lock_page(page);
259 if (page->mapping != inode->i_mapping) { 244 if (unlikely(page->mapping != inode->i_mapping)) {
260 f2fs_put_page(page, 1); 245 f2fs_put_page(page, 1);
261 return; 246 return;
262 } 247 }
@@ -266,21 +251,24 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
266 f2fs_put_page(page, 1); 251 f2fs_put_page(page, 1);
267} 252}
268 253
269static int truncate_blocks(struct inode *inode, u64 from) 254int truncate_blocks(struct inode *inode, u64 from)
270{ 255{
271 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 256 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
272 unsigned int blocksize = inode->i_sb->s_blocksize; 257 unsigned int blocksize = inode->i_sb->s_blocksize;
273 struct dnode_of_data dn; 258 struct dnode_of_data dn;
274 pgoff_t free_from; 259 pgoff_t free_from;
275 int count = 0; 260 int count = 0, err = 0;
276 int err;
277 261
278 trace_f2fs_truncate_blocks_enter(inode, from); 262 trace_f2fs_truncate_blocks_enter(inode, from);
279 263
264 if (f2fs_has_inline_data(inode))
265 goto done;
266
280 free_from = (pgoff_t) 267 free_from = (pgoff_t)
281 ((from + blocksize - 1) >> (sbi->log_blocksize)); 268 ((from + blocksize - 1) >> (sbi->log_blocksize));
282 269
283 f2fs_lock_op(sbi); 270 f2fs_lock_op(sbi);
271
284 set_new_dnode(&dn, inode, NULL, NULL, 0); 272 set_new_dnode(&dn, inode, NULL, NULL, 0);
285 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); 273 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
286 if (err) { 274 if (err) {
@@ -308,7 +296,7 @@ static int truncate_blocks(struct inode *inode, u64 from)
308free_next: 296free_next:
309 err = truncate_inode_blocks(inode, free_from); 297 err = truncate_inode_blocks(inode, free_from);
310 f2fs_unlock_op(sbi); 298 f2fs_unlock_op(sbi);
311 299done:
312 /* lastly zero out the first data page */ 300 /* lastly zero out the first data page */
313 truncate_partial_data_page(inode, from); 301 truncate_partial_data_page(inode, from);
314 302
@@ -382,6 +370,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
382 370
383 if ((attr->ia_valid & ATTR_SIZE) && 371 if ((attr->ia_valid & ATTR_SIZE) &&
384 attr->ia_size != i_size_read(inode)) { 372 attr->ia_size != i_size_read(inode)) {
373 err = f2fs_convert_inline_data(inode, attr->ia_size);
374 if (err)
375 return err;
376
385 truncate_setsize(inode, attr->ia_size); 377 truncate_setsize(inode, attr->ia_size);
386 f2fs_truncate(inode); 378 f2fs_truncate(inode);
387 f2fs_balance_fs(F2FS_SB(inode->i_sb)); 379 f2fs_balance_fs(F2FS_SB(inode->i_sb));
@@ -390,7 +382,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
390 __setattr_copy(inode, attr); 382 __setattr_copy(inode, attr);
391 383
392 if (attr->ia_valid & ATTR_MODE) { 384 if (attr->ia_valid & ATTR_MODE) {
393 err = f2fs_acl_chmod(inode); 385 err = posix_acl_chmod(inode, get_inode_mode(inode));
394 if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { 386 if (err || is_inode_flag_set(fi, FI_ACL_MODE)) {
395 inode->i_mode = fi->i_acl_mode; 387 inode->i_mode = fi->i_acl_mode;
396 clear_inode_flag(fi, FI_ACL_MODE); 388 clear_inode_flag(fi, FI_ACL_MODE);
@@ -405,6 +397,7 @@ const struct inode_operations f2fs_file_inode_operations = {
405 .getattr = f2fs_getattr, 397 .getattr = f2fs_getattr,
406 .setattr = f2fs_setattr, 398 .setattr = f2fs_setattr,
407 .get_acl = f2fs_get_acl, 399 .get_acl = f2fs_get_acl,
400 .set_acl = f2fs_set_acl,
408#ifdef CONFIG_F2FS_FS_XATTR 401#ifdef CONFIG_F2FS_FS_XATTR
409 .setxattr = generic_setxattr, 402 .setxattr = generic_setxattr,
410 .getxattr = generic_getxattr, 403 .getxattr = generic_getxattr,
@@ -459,12 +452,16 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
459 return 0; 452 return 0;
460} 453}
461 454
462static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) 455static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
463{ 456{
464 pgoff_t pg_start, pg_end; 457 pgoff_t pg_start, pg_end;
465 loff_t off_start, off_end; 458 loff_t off_start, off_end;
466 int ret = 0; 459 int ret = 0;
467 460
461 ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1);
462 if (ret)
463 return ret;
464
468 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; 465 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
469 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; 466 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
470 467
@@ -499,12 +496,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
499 } 496 }
500 } 497 }
501 498
502 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
503 i_size_read(inode) <= (offset + len)) {
504 i_size_write(inode, offset);
505 mark_inode_dirty(inode);
506 }
507
508 return ret; 499 return ret;
509} 500}
510 501
@@ -521,6 +512,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
521 if (ret) 512 if (ret)
522 return ret; 513 return ret;
523 514
515 ret = f2fs_convert_inline_data(inode, offset + len);
516 if (ret)
517 return ret;
518
524 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; 519 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
525 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; 520 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
526 521
@@ -532,22 +527,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
532 527
533 f2fs_lock_op(sbi); 528 f2fs_lock_op(sbi);
534 set_new_dnode(&dn, inode, NULL, NULL, 0); 529 set_new_dnode(&dn, inode, NULL, NULL, 0);
535 ret = get_dnode_of_data(&dn, index, ALLOC_NODE); 530 ret = f2fs_reserve_block(&dn, index);
536 if (ret) {
537 f2fs_unlock_op(sbi);
538 break;
539 }
540
541 if (dn.data_blkaddr == NULL_ADDR) {
542 ret = reserve_new_block(&dn);
543 if (ret) {
544 f2fs_put_dnode(&dn);
545 f2fs_unlock_op(sbi);
546 break;
547 }
548 }
549 f2fs_put_dnode(&dn);
550 f2fs_unlock_op(sbi); 531 f2fs_unlock_op(sbi);
532 if (ret)
533 break;
551 534
552 if (pg_start == pg_end) 535 if (pg_start == pg_end)
553 new_size = offset + len; 536 new_size = offset + len;
@@ -578,7 +561,7 @@ static long f2fs_fallocate(struct file *file, int mode,
578 return -EOPNOTSUPP; 561 return -EOPNOTSUPP;
579 562
580 if (mode & FALLOC_FL_PUNCH_HOLE) 563 if (mode & FALLOC_FL_PUNCH_HOLE)
581 ret = punch_hole(inode, offset, len, mode); 564 ret = punch_hole(inode, offset, len);
582 else 565 else
583 ret = expand_inode_data(inode, offset, len, mode); 566 ret = expand_inode_data(inode, offset, len, mode);
584 567
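Both f2fs_vm_page_mkwrite() and expand_inode_data() above collapse the open-coded get_dnode_of_data(ALLOC_NODE) + conditional reserve_new_block() + f2fs_put_dnode() sequence into one f2fs_reserve_block() call, declared in the f2fs.h hunk and implemented in data.c (not shown in this section). Reconstructed from these call sites, the helper plausibly reads as below; note that the real version must also avoid dropping a caller-supplied inode page, since inline.c passes one in and releases it itself; that wrinkle is elided here:

        /* Kernel-context sketch reconstructed from the converted call
         * sites, not the literal data.c implementation. */
        int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
        {
                int err;

                err = get_dnode_of_data(dn, index, ALLOC_NODE);
                if (err)
                        return err;

                /* reserve only if no block is mapped at this index yet */
                if (dn->data_blkaddr == NULL_ADDR)
                        err = reserve_new_block(dn);

                f2fs_put_dnode(dn);
                return err;
        }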
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index b7ad1ec7e4cc..ea0371e854b4 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -119,7 +119,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
119 kfree(gc_th); 119 kfree(gc_th);
120 sbi->gc_thread = NULL; 120 sbi->gc_thread = NULL;
121 } 121 }
122
123out: 122out:
124 return err; 123 return err;
125} 124}
@@ -164,8 +163,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
164 p->ofs_unit = sbi->segs_per_sec; 163 p->ofs_unit = sbi->segs_per_sec;
165 } 164 }
166 165
167 if (p->max_search > MAX_VICTIM_SEARCH) 166 if (p->max_search > sbi->max_victim_search)
168 p->max_search = MAX_VICTIM_SEARCH; 167 p->max_search = sbi->max_victim_search;
169 168
170 p->offset = sbi->last_victim[p->gc_mode]; 169 p->offset = sbi->last_victim[p->gc_mode];
171} 170}
@@ -429,7 +428,7 @@ next_step:
429 428
430 /* set page dirty and write it */ 429 /* set page dirty and write it */
431 if (gc_type == FG_GC) { 430 if (gc_type == FG_GC) {
432 f2fs_wait_on_page_writeback(node_page, NODE, true); 431 f2fs_wait_on_page_writeback(node_page, NODE);
433 set_page_dirty(node_page); 432 set_page_dirty(node_page);
434 } else { 433 } else {
435 if (!PageWriteback(node_page)) 434 if (!PageWriteback(node_page))
@@ -521,6 +520,11 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
521 520
522static void move_data_page(struct inode *inode, struct page *page, int gc_type) 521static void move_data_page(struct inode *inode, struct page *page, int gc_type)
523{ 522{
523 struct f2fs_io_info fio = {
524 .type = DATA,
525 .rw = WRITE_SYNC,
526 };
527
524 if (gc_type == BG_GC) { 528 if (gc_type == BG_GC) {
525 if (PageWriteback(page)) 529 if (PageWriteback(page))
526 goto out; 530 goto out;
@@ -529,7 +533,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
529 } else { 533 } else {
530 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 534 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
531 535
532 f2fs_wait_on_page_writeback(page, DATA, true); 536 f2fs_wait_on_page_writeback(page, DATA);
533 537
534 if (clear_page_dirty_for_io(page) && 538 if (clear_page_dirty_for_io(page) &&
535 S_ISDIR(inode->i_mode)) { 539 S_ISDIR(inode->i_mode)) {
@@ -537,7 +541,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
537 inode_dec_dirty_dents(inode); 541 inode_dec_dirty_dents(inode);
538 } 542 }
539 set_cold_data(page); 543 set_cold_data(page);
540 do_write_data_page(page); 544 do_write_data_page(page, &fio);
541 clear_cold_data(page); 545 clear_cold_data(page);
542 } 546 }
543out: 547out:
@@ -631,7 +635,7 @@ next_iput:
631 goto next_step; 635 goto next_step;
632 636
633 if (gc_type == FG_GC) { 637 if (gc_type == FG_GC) {
634 f2fs_submit_bio(sbi, DATA, true); 638 f2fs_submit_merged_bio(sbi, DATA, WRITE);
635 639
636 /* 640 /*
637 * In the case of FG_GC, it'd be better to reclaim this victim 641 * In the case of FG_GC, it'd be better to reclaim this victim
@@ -664,8 +668,6 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
664 668
665 /* read segment summary of victim */ 669 /* read segment summary of victim */
666 sum_page = get_sum_page(sbi, segno); 670 sum_page = get_sum_page(sbi, segno);
667 if (IS_ERR(sum_page))
668 return;
669 671
670 blk_start_plug(&plug); 672 blk_start_plug(&plug);
671 673
@@ -697,7 +699,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
697 699
698 INIT_LIST_HEAD(&ilist); 700 INIT_LIST_HEAD(&ilist);
699gc_more: 701gc_more:
700 if (!(sbi->sb->s_flags & MS_ACTIVE)) 702 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
701 goto stop; 703 goto stop;
702 704
703 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { 705 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 507056d22205..5d5eb6047bf4 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -20,7 +20,7 @@
20#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ 20#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
21 21
22/* Search max. number of dirty segments to select a victim segment */ 22/* Search max. number of dirty segments to select a victim segment */
23#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */ 23#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
24 24
25struct f2fs_gc_kthread { 25struct f2fs_gc_kthread {
26 struct task_struct *f2fs_gc_task; 26 struct task_struct *f2fs_gc_task;
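gc.h now carries only the default (DEF_MAX_VICTIM_SEARCH), while the live limit moves into the new f2fs_sb_info.max_victim_search field clamped in select_policy() above, turning a hard-coded cap into a per-mount tunable (presumably adjustable via sysfs, though the knob itself is outside this section). The compile-time-default-plus-runtime-override pattern, as a runnable sketch:

        #include <stdio.h>

        #define DEF_MAX_VICTIM_SEARCH 4096      /* compile-time default */

        struct sb_info {
                unsigned int max_victim_search; /* runtime tunable */
        };

        static unsigned int clamp_search(struct sb_info *sbi, unsigned int want)
        {
                return want > sbi->max_victim_search ?
                                sbi->max_victim_search : want;
        }

        int main(void)
        {
                struct sb_info sbi = { .max_victim_search = DEF_MAX_VICTIM_SEARCH };

                printf("depth: %u\n", clamp_search(&sbi, 10000)); /* 4096 */
                sbi.max_victim_search = 64;     /* lowered at runtime */
                printf("depth: %u\n", clamp_search(&sbi, 10000)); /* 64 */
                return 0;
        }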
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
new file mode 100644
index 000000000000..31ee5b164ff9
--- /dev/null
+++ b/fs/f2fs/inline.c
@@ -0,0 +1,222 @@
1/*
2 * fs/f2fs/inline.c
3 * Copyright (c) 2013, Intel Corporation
4 * Authors: Huajun Li <huajun.li@intel.com>
5 * Haicheng Li <haicheng.li@intel.com>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13
14#include "f2fs.h"
15
16bool f2fs_may_inline(struct inode *inode)
17{
18 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
19 block_t nr_blocks;
20 loff_t i_size;
21
22 if (!test_opt(sbi, INLINE_DATA))
23 return false;
24
25 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
26 if (inode->i_blocks > nr_blocks)
27 return false;
28
29 i_size = i_size_read(inode);
30 if (i_size > MAX_INLINE_DATA)
31 return false;
32
33 return true;
34}
35
36int f2fs_read_inline_data(struct inode *inode, struct page *page)
37{
38 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
39 struct page *ipage;
40 void *src_addr, *dst_addr;
41
42 if (page->index) {
43 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
44 goto out;
45 }
46
47 ipage = get_node_page(sbi, inode->i_ino);
48 if (IS_ERR(ipage))
49 return PTR_ERR(ipage);
50
51 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
52
53 /* Copy the whole inline data block */
54 src_addr = inline_data_addr(ipage);
55 dst_addr = kmap(page);
56 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
57 kunmap(page);
58 f2fs_put_page(ipage, 1);
59
60out:
61 SetPageUptodate(page);
62 unlock_page(page);
63
64 return 0;
65}
66
67static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
68{
69 int err;
70 struct page *ipage;
71 struct dnode_of_data dn;
72 void *src_addr, *dst_addr;
73 block_t new_blk_addr;
74 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
75 struct f2fs_io_info fio = {
76 .type = DATA,
77 .rw = WRITE_SYNC | REQ_PRIO,
78 };
79
80 f2fs_lock_op(sbi);
81 ipage = get_node_page(sbi, inode->i_ino);
82 if (IS_ERR(ipage))
83 return PTR_ERR(ipage);
84
85 /*
86 * i_addr[0] is not used for inline data,
87 * so reserving a new block will not destroy inline data
88 */
89 set_new_dnode(&dn, inode, ipage, NULL, 0);
90 err = f2fs_reserve_block(&dn, 0);
91 if (err) {
92 f2fs_unlock_op(sbi);
93 return err;
94 }
95
96 zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
97
98 /* Copy the whole inline data block */
99 src_addr = inline_data_addr(ipage);
100 dst_addr = kmap(page);
101 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
102 kunmap(page);
103 SetPageUptodate(page);
104
105 /* write the data page out to keep the data consistent */
106 set_page_writeback(page);
107 write_data_page(page, &dn, &new_blk_addr, &fio);
108 update_extent_cache(new_blk_addr, &dn);
109 f2fs_wait_on_page_writeback(page, DATA);
110
111 /* clear inline data and flag after data writeback */
112 zero_user_segment(ipage, INLINE_DATA_OFFSET,
113 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
114 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
115 stat_dec_inline_inode(inode);
116
117 sync_inode_page(&dn);
118 f2fs_put_dnode(&dn);
119 f2fs_unlock_op(sbi);
120 return err;
121}
122
123int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
124{
125 struct page *page;
126 int err;
127
128 if (!f2fs_has_inline_data(inode))
129 return 0;
130 else if (to_size <= MAX_INLINE_DATA)
131 return 0;
132
133 page = grab_cache_page_write_begin(inode->i_mapping, 0, AOP_FLAG_NOFS);
134 if (!page)
135 return -ENOMEM;
136
137 err = __f2fs_convert_inline_data(inode, page);
138 f2fs_put_page(page, 1);
139 return err;
140}
141
142int f2fs_write_inline_data(struct inode *inode,
143 struct page *page, unsigned size)
144{
145 void *src_addr, *dst_addr;
146 struct page *ipage;
147 struct dnode_of_data dn;
148 int err;
149
150 set_new_dnode(&dn, inode, NULL, NULL, 0);
151 err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
152 if (err)
153 return err;
154 ipage = dn.inode_page;
155
156 zero_user_segment(ipage, INLINE_DATA_OFFSET,
157 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
158 src_addr = kmap(page);
159 dst_addr = inline_data_addr(ipage);
160 memcpy(dst_addr, src_addr, size);
161 kunmap(page);
162
163 /* Release the first data block if it is allocated */
164 if (!f2fs_has_inline_data(inode)) {
165 truncate_data_blocks_range(&dn, 1);
166 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
167 stat_inc_inline_inode(inode);
168 }
169
170 sync_inode_page(&dn);
171 f2fs_put_dnode(&dn);
172
173 return 0;
174}
175
176int recover_inline_data(struct inode *inode, struct page *npage)
177{
178 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
179 struct f2fs_inode *ri = NULL;
180 void *src_addr, *dst_addr;
181 struct page *ipage;
182
183 /*
184 * The inline_data recovery policy is as follows.
185 * [prev.] [next] of inline_data flag
186 * o o -> recover inline_data
187 * o x -> remove inline_data, and then recover data blocks
188 * x o -> remove inline_data, and then recover inline_data
189 * x x -> recover data blocks
190 */
191 if (IS_INODE(npage))
192 ri = F2FS_INODE(npage);
193
194 if (f2fs_has_inline_data(inode) &&
195 ri && ri->i_inline & F2FS_INLINE_DATA) {
196process_inline:
197 ipage = get_node_page(sbi, inode->i_ino);
198 f2fs_bug_on(IS_ERR(ipage));
199
200 src_addr = inline_data_addr(npage);
201 dst_addr = inline_data_addr(ipage);
202 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
203 update_inode(inode, ipage);
204 f2fs_put_page(ipage, 1);
205 return -1;
206 }
207
208 if (f2fs_has_inline_data(inode)) {
209 ipage = get_node_page(sbi, inode->i_ino);
210 f2fs_bug_on(IS_ERR(ipage));
211 zero_user_segment(ipage, INLINE_DATA_OFFSET,
212 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
213 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
214 update_inode(inode, ipage);
215 f2fs_put_page(ipage, 1);
216 } else if (ri && ri->i_inline & F2FS_INLINE_DATA) {
217 truncate_blocks(inode, 0);
218 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
219 goto process_inline;
220 }
221 return 0;
222}
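The four-case table in the comment above can be restated as a small decision helper. This sketch is illustrative only (the enum and function are hypothetical); prev is the inline_data flag already set on the inode, next is the flag carried by the node page being replayed.

enum inline_recovery { RECOVER_INLINE, DROP_THEN_RECOVER_BLOCKS,
			DROP_THEN_RECOVER_INLINE, RECOVER_BLOCKS };

static enum inline_recovery inline_recovery_policy(bool prev, bool next)
{
	if (prev && next)	/* o o */
		return RECOVER_INLINE;
	if (prev)		/* o x */
		return DROP_THEN_RECOVER_BLOCKS;
	if (next)		/* x o */
		return DROP_THEN_RECOVER_INLINE;
	return RECOVER_BLOCKS;	/* x x */
}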
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index d0eaa9faeca0..4d67ed736dca 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -42,9 +42,11 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
42 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 42 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
43 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 43 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
44 if (ri->i_addr[0]) 44 if (ri->i_addr[0])
45 inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); 45 inode->i_rdev =
46 old_decode_dev(le32_to_cpu(ri->i_addr[0]));
46 else 47 else
47 inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); 48 inode->i_rdev =
49 new_decode_dev(le32_to_cpu(ri->i_addr[1]));
48 } 50 }
49} 51}
50 52
@@ -52,11 +54,13 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
52{ 54{
53 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 55 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
54 if (old_valid_dev(inode->i_rdev)) { 56 if (old_valid_dev(inode->i_rdev)) {
55 ri->i_addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); 57 ri->i_addr[0] =
58 cpu_to_le32(old_encode_dev(inode->i_rdev));
56 ri->i_addr[1] = 0; 59 ri->i_addr[1] = 0;
57 } else { 60 } else {
58 ri->i_addr[0] = 0; 61 ri->i_addr[0] = 0;
59 ri->i_addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); 62 ri->i_addr[1] =
63 cpu_to_le32(new_encode_dev(inode->i_rdev));
60 ri->i_addr[2] = 0; 64 ri->i_addr[2] = 0;
61 } 65 }
62 } 66 }
@@ -67,7 +71,6 @@ static int do_read_inode(struct inode *inode)
67 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 71 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
68 struct f2fs_inode_info *fi = F2FS_I(inode); 72 struct f2fs_inode_info *fi = F2FS_I(inode);
69 struct page *node_page; 73 struct page *node_page;
70 struct f2fs_node *rn;
71 struct f2fs_inode *ri; 74 struct f2fs_inode *ri;
72 75
73 /* Check if ino is within scope */ 76 /* Check if ino is within scope */
@@ -81,8 +84,7 @@ static int do_read_inode(struct inode *inode)
81 if (IS_ERR(node_page)) 84 if (IS_ERR(node_page))
82 return PTR_ERR(node_page); 85 return PTR_ERR(node_page);
83 86
84 rn = F2FS_NODE(node_page); 87 ri = F2FS_INODE(node_page);
85 ri = &(rn->i);
86 88
87 inode->i_mode = le16_to_cpu(ri->i_mode); 89 inode->i_mode = le16_to_cpu(ri->i_mode);
88 i_uid_write(inode, le32_to_cpu(ri->i_uid)); 90 i_uid_write(inode, le32_to_cpu(ri->i_uid));
@@ -175,13 +177,11 @@ bad_inode:
175 177
176void update_inode(struct inode *inode, struct page *node_page) 178void update_inode(struct inode *inode, struct page *node_page)
177{ 179{
178 struct f2fs_node *rn;
179 struct f2fs_inode *ri; 180 struct f2fs_inode *ri;
180 181
181 f2fs_wait_on_page_writeback(node_page, NODE, false); 182 f2fs_wait_on_page_writeback(node_page, NODE);
182 183
183 rn = F2FS_NODE(node_page); 184 ri = F2FS_INODE(node_page);
184 ri = &(rn->i);
185 185
186 ri->i_mode = cpu_to_le16(inode->i_mode); 186 ri->i_mode = cpu_to_le16(inode->i_mode);
187 ri->i_advise = F2FS_I(inode)->i_advise; 187 ri->i_advise = F2FS_I(inode)->i_advise;
@@ -281,6 +281,7 @@ void f2fs_evict_inode(struct inode *inode)
281 281
282 f2fs_lock_op(sbi); 282 f2fs_lock_op(sbi);
283 remove_inode_page(inode); 283 remove_inode_page(inode);
284 stat_dec_inline_inode(inode);
284 f2fs_unlock_op(sbi); 285 f2fs_unlock_op(sbi);
285 286
286 sb_end_intwrite(inode->i_sb); 287 sb_end_intwrite(inode->i_sb);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 575adac17f8b..397d459e97bf 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -424,11 +424,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
424 } 424 }
425 425
426 f2fs_set_link(new_dir, new_entry, new_page, old_inode); 426 f2fs_set_link(new_dir, new_entry, new_page, old_inode);
427 F2FS_I(old_inode)->i_pino = new_dir->i_ino;
427 428
428 new_inode->i_ctime = CURRENT_TIME; 429 new_inode->i_ctime = CURRENT_TIME;
429 if (old_dir_entry) 430 if (old_dir_entry)
430 drop_nlink(new_inode); 431 drop_nlink(new_inode);
431 drop_nlink(new_inode); 432 drop_nlink(new_inode);
433 mark_inode_dirty(new_inode);
432 434
433 if (!new_inode->i_nlink) 435 if (!new_inode->i_nlink)
434 add_orphan_inode(sbi, new_inode->i_ino); 436 add_orphan_inode(sbi, new_inode->i_ino);
@@ -457,11 +459,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
457 if (old_dir != new_dir) { 459 if (old_dir != new_dir) {
458 f2fs_set_link(old_inode, old_dir_entry, 460 f2fs_set_link(old_inode, old_dir_entry,
459 old_dir_page, new_dir); 461 old_dir_page, new_dir);
462 F2FS_I(old_inode)->i_pino = new_dir->i_ino;
463 update_inode_page(old_inode);
460 } else { 464 } else {
461 kunmap(old_dir_page); 465 kunmap(old_dir_page);
462 f2fs_put_page(old_dir_page, 0); 466 f2fs_put_page(old_dir_page, 0);
463 } 467 }
464 drop_nlink(old_dir); 468 drop_nlink(old_dir);
469 mark_inode_dirty(old_dir);
465 update_inode_page(old_dir); 470 update_inode_page(old_dir);
466 } 471 }
467 472
@@ -496,6 +501,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
496 .getattr = f2fs_getattr, 501 .getattr = f2fs_getattr,
497 .setattr = f2fs_setattr, 502 .setattr = f2fs_setattr,
498 .get_acl = f2fs_get_acl, 503 .get_acl = f2fs_get_acl,
504 .set_acl = f2fs_set_acl,
499#ifdef CONFIG_F2FS_FS_XATTR 505#ifdef CONFIG_F2FS_FS_XATTR
500 .setxattr = generic_setxattr, 506 .setxattr = generic_setxattr,
501 .getxattr = generic_getxattr, 507 .getxattr = generic_getxattr,
@@ -522,6 +528,7 @@ const struct inode_operations f2fs_special_inode_operations = {
522 .getattr = f2fs_getattr, 528 .getattr = f2fs_getattr,
523 .setattr = f2fs_setattr, 529 .setattr = f2fs_setattr,
524 .get_acl = f2fs_get_acl, 530 .get_acl = f2fs_get_acl,
531 .set_acl = f2fs_set_acl,
525#ifdef CONFIG_F2FS_FS_XATTR 532#ifdef CONFIG_F2FS_FS_XATTR
526 .setxattr = generic_setxattr, 533 .setxattr = generic_setxattr,
527 .getxattr = generic_getxattr, 534 .getxattr = generic_getxattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4ac4150d421d..b0649b76eb4f 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -87,17 +87,19 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
87 */ 87 */
88static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) 88static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
89{ 89{
90 struct address_space *mapping = sbi->meta_inode->i_mapping; 90 struct address_space *mapping = META_MAPPING(sbi);
91 struct f2fs_nm_info *nm_i = NM_I(sbi); 91 struct f2fs_nm_info *nm_i = NM_I(sbi);
92 struct blk_plug plug;
93 struct page *page; 92 struct page *page;
94 pgoff_t index; 93 pgoff_t index;
95 int i; 94 int i;
95 struct f2fs_io_info fio = {
96 .type = META,
97 .rw = READ_SYNC | REQ_META | REQ_PRIO
98 };
96 99
97 blk_start_plug(&plug);
98 100
99 for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { 101 for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
100 if (nid >= nm_i->max_nid) 102 if (unlikely(nid >= nm_i->max_nid))
101 nid = 0; 103 nid = 0;
102 index = current_nat_addr(sbi, nid); 104 index = current_nat_addr(sbi, nid);
103 105
@@ -105,15 +107,15 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
105 if (!page) 107 if (!page)
106 continue; 108 continue;
107 if (PageUptodate(page)) { 109 if (PageUptodate(page)) {
110 mark_page_accessed(page);
108 f2fs_put_page(page, 1); 111 f2fs_put_page(page, 1);
109 continue; 112 continue;
110 } 113 }
111 if (f2fs_readpage(sbi, page, index, READ)) 114 f2fs_submit_page_mbio(sbi, page, index, &fio);
112 continue; 115 mark_page_accessed(page);
113
114 f2fs_put_page(page, 0); 116 f2fs_put_page(page, 0);
115 } 117 }
116 blk_finish_plug(&plug); 118 f2fs_submit_merged_bio(sbi, META, READ);
117} 119}
118 120
119static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) 121static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
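The readahead above now follows the merged-bio pattern used throughout this series: describe the I/O once in an f2fs_io_info, queue each page with f2fs_submit_page_mbio(), then flush whatever is still pending with f2fs_submit_merged_bio(). A condensed sketch of the pattern (illustration only, variable names assumed):

struct page *page;
struct f2fs_io_info fio = {
	.type = META,
	.rw = READ_SYNC | REQ_META | REQ_PRIO
};
int i;

for (i = 0; i < nrpages; i++) {
	page = grab_cache_page(META_MAPPING(sbi), index + i);
	if (!page)
		continue;
	/* queued in the shared per-type bio, merged with contiguous I/O */
	f2fs_submit_page_mbio(sbi, page, index + i, &fio);
	f2fs_put_page(page, 0);
}
/* flush whatever is still sitting in the META read bio */
f2fs_submit_merged_bio(sbi, META, READ);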
@@ -391,8 +393,8 @@ got:
391 393
392/* 394/*
393 * Caller should call f2fs_put_dnode(dn). 395 * Caller should call f2fs_put_dnode(dn).
394 * Also, it should grab and release a mutex by calling mutex_lock_op() and 396 * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
 395 * mutex_unlock_op() only if ro is not set RDONLY_NODE. 397 * f2fs_unlock_op() only if the mode is not set to RDONLY_NODE.
396 * In the case of RDONLY_NODE, we don't need to care about mutex. 398 * In the case of RDONLY_NODE, we don't need to care about mutex.
397 */ 399 */
398int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) 400int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
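A typical caller honoring this contract has the shape below (sketch; it mirrors the do_recover_data() hunk later in this patch):

struct dnode_of_data dn;
int err;

f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, index, ALLOC_NODE);
if (err) {
	f2fs_unlock_op(sbi);
	return err;
}
/* ... use dn.node_page and dn.data_blkaddr under the rwsem ... */
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);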
@@ -502,7 +504,7 @@ static void truncate_node(struct dnode_of_data *dn)
502 504
503 /* Deallocate node address */ 505 /* Deallocate node address */
504 invalidate_blocks(sbi, ni.blk_addr); 506 invalidate_blocks(sbi, ni.blk_addr);
505 dec_valid_node_count(sbi, dn->inode, 1); 507 dec_valid_node_count(sbi, dn->inode);
506 set_node_addr(sbi, &ni, NULL_ADDR); 508 set_node_addr(sbi, &ni, NULL_ADDR);
507 509
508 if (dn->nid == dn->inode->i_ino) { 510 if (dn->nid == dn->inode->i_ino) {
@@ -516,6 +518,10 @@ invalidate:
516 F2FS_SET_SB_DIRT(sbi); 518 F2FS_SET_SB_DIRT(sbi);
517 519
518 f2fs_put_page(dn->node_page, 1); 520 f2fs_put_page(dn->node_page, 1);
521
522 invalidate_mapping_pages(NODE_MAPPING(sbi),
523 dn->node_page->index, dn->node_page->index);
524
519 dn->node_page = NULL; 525 dn->node_page = NULL;
520 trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); 526 trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
521} 527}
@@ -631,19 +637,19 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
631 return 0; 637 return 0;
632 638
633 /* get indirect nodes in the path */ 639 /* get indirect nodes in the path */
634 for (i = 0; i < depth - 1; i++) { 640 for (i = 0; i < idx + 1; i++) {
 635 /* reference count will be increased */ 641 /* reference count will be increased */
636 pages[i] = get_node_page(sbi, nid[i]); 642 pages[i] = get_node_page(sbi, nid[i]);
637 if (IS_ERR(pages[i])) { 643 if (IS_ERR(pages[i])) {
638 depth = i + 1;
639 err = PTR_ERR(pages[i]); 644 err = PTR_ERR(pages[i]);
645 idx = i - 1;
640 goto fail; 646 goto fail;
641 } 647 }
642 nid[i + 1] = get_nid(pages[i], offset[i + 1], false); 648 nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
643 } 649 }
644 650
645 /* free direct nodes linked to a partial indirect node */ 651 /* free direct nodes linked to a partial indirect node */
646 for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) { 652 for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
647 child_nid = get_nid(pages[idx], i, false); 653 child_nid = get_nid(pages[idx], i, false);
648 if (!child_nid) 654 if (!child_nid)
649 continue; 655 continue;
@@ -654,7 +660,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
654 set_nid(pages[idx], i, 0, false); 660 set_nid(pages[idx], i, 0, false);
655 } 661 }
656 662
657 if (offset[depth - 1] == 0) { 663 if (offset[idx + 1] == 0) {
658 dn->node_page = pages[idx]; 664 dn->node_page = pages[idx];
659 dn->nid = nid[idx]; 665 dn->nid = nid[idx];
660 truncate_node(dn); 666 truncate_node(dn);
@@ -662,9 +668,10 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
662 f2fs_put_page(pages[idx], 1); 668 f2fs_put_page(pages[idx], 1);
663 } 669 }
664 offset[idx]++; 670 offset[idx]++;
665 offset[depth - 1] = 0; 671 offset[idx + 1] = 0;
672 idx--;
666fail: 673fail:
667 for (i = depth - 3; i >= 0; i--) 674 for (i = idx; i >= 0; i--)
668 f2fs_put_page(pages[i], 1); 675 f2fs_put_page(pages[i], 1);
669 676
670 trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); 677 trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
@@ -678,11 +685,10 @@ fail:
678int truncate_inode_blocks(struct inode *inode, pgoff_t from) 685int truncate_inode_blocks(struct inode *inode, pgoff_t from)
679{ 686{
680 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 687 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
681 struct address_space *node_mapping = sbi->node_inode->i_mapping;
682 int err = 0, cont = 1; 688 int err = 0, cont = 1;
683 int level, offset[4], noffset[4]; 689 int level, offset[4], noffset[4];
684 unsigned int nofs = 0; 690 unsigned int nofs = 0;
685 struct f2fs_node *rn; 691 struct f2fs_inode *ri;
686 struct dnode_of_data dn; 692 struct dnode_of_data dn;
687 struct page *page; 693 struct page *page;
688 694
@@ -699,7 +705,7 @@ restart:
699 set_new_dnode(&dn, inode, page, NULL, 0); 705 set_new_dnode(&dn, inode, page, NULL, 0);
700 unlock_page(page); 706 unlock_page(page);
701 707
702 rn = F2FS_NODE(page); 708 ri = F2FS_INODE(page);
703 switch (level) { 709 switch (level) {
704 case 0: 710 case 0:
705 case 1: 711 case 1:
@@ -709,7 +715,7 @@ restart:
709 nofs = noffset[1]; 715 nofs = noffset[1];
710 if (!offset[level - 1]) 716 if (!offset[level - 1])
711 goto skip_partial; 717 goto skip_partial;
712 err = truncate_partial_nodes(&dn, &rn->i, offset, level); 718 err = truncate_partial_nodes(&dn, ri, offset, level);
713 if (err < 0 && err != -ENOENT) 719 if (err < 0 && err != -ENOENT)
714 goto fail; 720 goto fail;
715 nofs += 1 + NIDS_PER_BLOCK; 721 nofs += 1 + NIDS_PER_BLOCK;
@@ -718,7 +724,7 @@ restart:
718 nofs = 5 + 2 * NIDS_PER_BLOCK; 724 nofs = 5 + 2 * NIDS_PER_BLOCK;
719 if (!offset[level - 1]) 725 if (!offset[level - 1])
720 goto skip_partial; 726 goto skip_partial;
721 err = truncate_partial_nodes(&dn, &rn->i, offset, level); 727 err = truncate_partial_nodes(&dn, ri, offset, level);
722 if (err < 0 && err != -ENOENT) 728 if (err < 0 && err != -ENOENT)
723 goto fail; 729 goto fail;
724 break; 730 break;
@@ -728,7 +734,7 @@ restart:
728 734
729skip_partial: 735skip_partial:
730 while (cont) { 736 while (cont) {
731 dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]); 737 dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
732 switch (offset[0]) { 738 switch (offset[0]) {
733 case NODE_DIR1_BLOCK: 739 case NODE_DIR1_BLOCK:
734 case NODE_DIR2_BLOCK: 740 case NODE_DIR2_BLOCK:
@@ -751,14 +757,14 @@ skip_partial:
751 if (err < 0 && err != -ENOENT) 757 if (err < 0 && err != -ENOENT)
752 goto fail; 758 goto fail;
753 if (offset[1] == 0 && 759 if (offset[1] == 0 &&
754 rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { 760 ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
755 lock_page(page); 761 lock_page(page);
756 if (page->mapping != node_mapping) { 762 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
757 f2fs_put_page(page, 1); 763 f2fs_put_page(page, 1);
758 goto restart; 764 goto restart;
759 } 765 }
760 wait_on_page_writeback(page); 766 wait_on_page_writeback(page);
761 rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; 767 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
762 set_page_dirty(page); 768 set_page_dirty(page);
763 unlock_page(page); 769 unlock_page(page);
764 } 770 }
@@ -794,38 +800,34 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
794 set_new_dnode(&dn, inode, page, npage, nid); 800 set_new_dnode(&dn, inode, page, npage, nid);
795 801
796 if (page) 802 if (page)
797 dn.inode_page_locked = 1; 803 dn.inode_page_locked = true;
798 truncate_node(&dn); 804 truncate_node(&dn);
799 return 0; 805 return 0;
800} 806}
801 807
802/* 808/*
803 * Caller should grab and release a mutex by calling mutex_lock_op() and 809 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
804 * mutex_unlock_op(). 810 * f2fs_unlock_op().
805 */ 811 */
806int remove_inode_page(struct inode *inode) 812void remove_inode_page(struct inode *inode)
807{ 813{
808 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 814 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
809 struct page *page; 815 struct page *page;
810 nid_t ino = inode->i_ino; 816 nid_t ino = inode->i_ino;
811 struct dnode_of_data dn; 817 struct dnode_of_data dn;
812 int err;
813 818
814 page = get_node_page(sbi, ino); 819 page = get_node_page(sbi, ino);
815 if (IS_ERR(page)) 820 if (IS_ERR(page))
816 return PTR_ERR(page); 821 return;
817 822
818 err = truncate_xattr_node(inode, page); 823 if (truncate_xattr_node(inode, page)) {
819 if (err) {
820 f2fs_put_page(page, 1); 824 f2fs_put_page(page, 1);
821 return err; 825 return;
822 } 826 }
823
 824 /* 0 is possible, after f2fs_new_inode() fails */ 827 /* 0 is possible, after f2fs_new_inode() fails */
825 f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); 828 f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
826 set_new_dnode(&dn, inode, page, page, ino); 829 set_new_dnode(&dn, inode, page, page, ino);
827 truncate_node(&dn); 830 truncate_node(&dn);
828 return 0;
829} 831}
830 832
831struct page *new_inode_page(struct inode *inode, const struct qstr *name) 833struct page *new_inode_page(struct inode *inode, const struct qstr *name)
@@ -843,19 +845,18 @@ struct page *new_node_page(struct dnode_of_data *dn,
843 unsigned int ofs, struct page *ipage) 845 unsigned int ofs, struct page *ipage)
844{ 846{
845 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 847 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
846 struct address_space *mapping = sbi->node_inode->i_mapping;
847 struct node_info old_ni, new_ni; 848 struct node_info old_ni, new_ni;
848 struct page *page; 849 struct page *page;
849 int err; 850 int err;
850 851
851 if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) 852 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
852 return ERR_PTR(-EPERM); 853 return ERR_PTR(-EPERM);
853 854
854 page = grab_cache_page(mapping, dn->nid); 855 page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
855 if (!page) 856 if (!page)
856 return ERR_PTR(-ENOMEM); 857 return ERR_PTR(-ENOMEM);
857 858
858 if (!inc_valid_node_count(sbi, dn->inode, 1)) { 859 if (unlikely(!inc_valid_node_count(sbi, dn->inode))) {
859 err = -ENOSPC; 860 err = -ENOSPC;
860 goto fail; 861 goto fail;
861 } 862 }
@@ -898,14 +899,14 @@ fail:
898 * LOCKED_PAGE: f2fs_put_page(page, 1) 899 * LOCKED_PAGE: f2fs_put_page(page, 1)
899 * error: nothing 900 * error: nothing
900 */ 901 */
901static int read_node_page(struct page *page, int type) 902static int read_node_page(struct page *page, int rw)
902{ 903{
903 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 904 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
904 struct node_info ni; 905 struct node_info ni;
905 906
906 get_node_info(sbi, page->index, &ni); 907 get_node_info(sbi, page->index, &ni);
907 908
908 if (ni.blk_addr == NULL_ADDR) { 909 if (unlikely(ni.blk_addr == NULL_ADDR)) {
909 f2fs_put_page(page, 1); 910 f2fs_put_page(page, 1);
910 return -ENOENT; 911 return -ENOENT;
911 } 912 }
@@ -913,7 +914,7 @@ static int read_node_page(struct page *page, int type)
913 if (PageUptodate(page)) 914 if (PageUptodate(page))
914 return LOCKED_PAGE; 915 return LOCKED_PAGE;
915 916
916 return f2fs_readpage(sbi, page, ni.blk_addr, type); 917 return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw);
917} 918}
918 919
919/* 920/*
@@ -921,18 +922,17 @@ static int read_node_page(struct page *page, int type)
921 */ 922 */
922void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) 923void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
923{ 924{
924 struct address_space *mapping = sbi->node_inode->i_mapping;
925 struct page *apage; 925 struct page *apage;
926 int err; 926 int err;
927 927
928 apage = find_get_page(mapping, nid); 928 apage = find_get_page(NODE_MAPPING(sbi), nid);
929 if (apage && PageUptodate(apage)) { 929 if (apage && PageUptodate(apage)) {
930 f2fs_put_page(apage, 0); 930 f2fs_put_page(apage, 0);
931 return; 931 return;
932 } 932 }
933 f2fs_put_page(apage, 0); 933 f2fs_put_page(apage, 0);
934 934
935 apage = grab_cache_page(mapping, nid); 935 apage = grab_cache_page(NODE_MAPPING(sbi), nid);
936 if (!apage) 936 if (!apage)
937 return; 937 return;
938 938
@@ -945,11 +945,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
945 945
946struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) 946struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
947{ 947{
948 struct address_space *mapping = sbi->node_inode->i_mapping;
949 struct page *page; 948 struct page *page;
950 int err; 949 int err;
951repeat: 950repeat:
952 page = grab_cache_page(mapping, nid); 951 page = grab_cache_page(NODE_MAPPING(sbi), nid);
953 if (!page) 952 if (!page)
954 return ERR_PTR(-ENOMEM); 953 return ERR_PTR(-ENOMEM);
955 954
@@ -960,11 +959,11 @@ repeat:
960 goto got_it; 959 goto got_it;
961 960
962 lock_page(page); 961 lock_page(page);
963 if (!PageUptodate(page)) { 962 if (unlikely(!PageUptodate(page))) {
964 f2fs_put_page(page, 1); 963 f2fs_put_page(page, 1);
965 return ERR_PTR(-EIO); 964 return ERR_PTR(-EIO);
966 } 965 }
967 if (page->mapping != mapping) { 966 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
968 f2fs_put_page(page, 1); 967 f2fs_put_page(page, 1);
969 goto repeat; 968 goto repeat;
970 } 969 }
@@ -981,7 +980,6 @@ got_it:
981struct page *get_node_page_ra(struct page *parent, int start) 980struct page *get_node_page_ra(struct page *parent, int start)
982{ 981{
983 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); 982 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
984 struct address_space *mapping = sbi->node_inode->i_mapping;
985 struct blk_plug plug; 983 struct blk_plug plug;
986 struct page *page; 984 struct page *page;
987 int err, i, end; 985 int err, i, end;
@@ -992,7 +990,7 @@ struct page *get_node_page_ra(struct page *parent, int start)
992 if (!nid) 990 if (!nid)
993 return ERR_PTR(-ENOENT); 991 return ERR_PTR(-ENOENT);
994repeat: 992repeat:
995 page = grab_cache_page(mapping, nid); 993 page = grab_cache_page(NODE_MAPPING(sbi), nid);
996 if (!page) 994 if (!page)
997 return ERR_PTR(-ENOMEM); 995 return ERR_PTR(-ENOMEM);
998 996
@@ -1017,12 +1015,12 @@ repeat:
1017 blk_finish_plug(&plug); 1015 blk_finish_plug(&plug);
1018 1016
1019 lock_page(page); 1017 lock_page(page);
1020 if (page->mapping != mapping) { 1018 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1021 f2fs_put_page(page, 1); 1019 f2fs_put_page(page, 1);
1022 goto repeat; 1020 goto repeat;
1023 } 1021 }
1024page_hit: 1022page_hit:
1025 if (!PageUptodate(page)) { 1023 if (unlikely(!PageUptodate(page))) {
1026 f2fs_put_page(page, 1); 1024 f2fs_put_page(page, 1);
1027 return ERR_PTR(-EIO); 1025 return ERR_PTR(-EIO);
1028 } 1026 }
@@ -1048,7 +1046,6 @@ void sync_inode_page(struct dnode_of_data *dn)
1048int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, 1046int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
1049 struct writeback_control *wbc) 1047 struct writeback_control *wbc)
1050{ 1048{
1051 struct address_space *mapping = sbi->node_inode->i_mapping;
1052 pgoff_t index, end; 1049 pgoff_t index, end;
1053 struct pagevec pvec; 1050 struct pagevec pvec;
1054 int step = ino ? 2 : 0; 1051 int step = ino ? 2 : 0;
@@ -1062,7 +1059,7 @@ next_step:
1062 1059
1063 while (index <= end) { 1060 while (index <= end) {
1064 int i, nr_pages; 1061 int i, nr_pages;
1065 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1062 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1066 PAGECACHE_TAG_DIRTY, 1063 PAGECACHE_TAG_DIRTY,
1067 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 1064 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1068 if (nr_pages == 0) 1065 if (nr_pages == 0)
@@ -1095,7 +1092,7 @@ next_step:
1095 else if (!trylock_page(page)) 1092 else if (!trylock_page(page))
1096 continue; 1093 continue;
1097 1094
1098 if (unlikely(page->mapping != mapping)) { 1095 if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1099continue_unlock: 1096continue_unlock:
1100 unlock_page(page); 1097 unlock_page(page);
1101 continue; 1098 continue;
@@ -1122,7 +1119,7 @@ continue_unlock:
1122 set_fsync_mark(page, 0); 1119 set_fsync_mark(page, 0);
1123 set_dentry_mark(page, 0); 1120 set_dentry_mark(page, 0);
1124 } 1121 }
1125 mapping->a_ops->writepage(page, wbc); 1122 NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
1126 wrote++; 1123 wrote++;
1127 1124
1128 if (--wbc->nr_to_write == 0) 1125 if (--wbc->nr_to_write == 0)
@@ -1143,31 +1140,31 @@ continue_unlock:
1143 } 1140 }
1144 1141
1145 if (wrote) 1142 if (wrote)
1146 f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL); 1143 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1147
1148 return nwritten; 1144 return nwritten;
1149} 1145}
1150 1146
1151int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) 1147int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1152{ 1148{
1153 struct address_space *mapping = sbi->node_inode->i_mapping;
1154 pgoff_t index = 0, end = LONG_MAX; 1149 pgoff_t index = 0, end = LONG_MAX;
1155 struct pagevec pvec; 1150 struct pagevec pvec;
1156 int nr_pages;
1157 int ret2 = 0, ret = 0; 1151 int ret2 = 0, ret = 0;
1158 1152
1159 pagevec_init(&pvec, 0); 1153 pagevec_init(&pvec, 0);
1160 while ((index <= end) && 1154
1161 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 1155 while (index <= end) {
1162 PAGECACHE_TAG_WRITEBACK, 1156 int i, nr_pages;
1163 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { 1157 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1164 unsigned i; 1158 PAGECACHE_TAG_WRITEBACK,
1159 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1160 if (nr_pages == 0)
1161 break;
1165 1162
1166 for (i = 0; i < nr_pages; i++) { 1163 for (i = 0; i < nr_pages; i++) {
1167 struct page *page = pvec.pages[i]; 1164 struct page *page = pvec.pages[i];
1168 1165
1169 /* until radix tree lookup accepts end_index */ 1166 /* until radix tree lookup accepts end_index */
1170 if (page->index > end) 1167 if (unlikely(page->index > end))
1171 continue; 1168 continue;
1172 1169
1173 if (ino && ino_of_node(page) == ino) { 1170 if (ino && ino_of_node(page) == ino) {
@@ -1180,9 +1177,9 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1180 cond_resched(); 1177 cond_resched();
1181 } 1178 }
1182 1179
1183 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 1180 if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags)))
1184 ret2 = -ENOSPC; 1181 ret2 = -ENOSPC;
1185 if (test_and_clear_bit(AS_EIO, &mapping->flags)) 1182 if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags)))
1186 ret2 = -EIO; 1183 ret2 = -EIO;
1187 if (!ret) 1184 if (!ret)
1188 ret = ret2; 1185 ret = ret2;
@@ -1196,8 +1193,12 @@ static int f2fs_write_node_page(struct page *page,
1196 nid_t nid; 1193 nid_t nid;
1197 block_t new_addr; 1194 block_t new_addr;
1198 struct node_info ni; 1195 struct node_info ni;
1196 struct f2fs_io_info fio = {
1197 .type = NODE,
1198 .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
1199 };
1199 1200
1200 if (sbi->por_doing) 1201 if (unlikely(sbi->por_doing))
1201 goto redirty_out; 1202 goto redirty_out;
1202 1203
1203 wait_on_page_writeback(page); 1204 wait_on_page_writeback(page);
@@ -1209,7 +1210,7 @@ static int f2fs_write_node_page(struct page *page,
1209 get_node_info(sbi, nid, &ni); 1210 get_node_info(sbi, nid, &ni);
1210 1211
1211 /* This page is already truncated */ 1212 /* This page is already truncated */
1212 if (ni.blk_addr == NULL_ADDR) { 1213 if (unlikely(ni.blk_addr == NULL_ADDR)) {
1213 dec_page_count(sbi, F2FS_DIRTY_NODES); 1214 dec_page_count(sbi, F2FS_DIRTY_NODES);
1214 unlock_page(page); 1215 unlock_page(page);
1215 return 0; 1216 return 0;
@@ -1220,7 +1221,7 @@ static int f2fs_write_node_page(struct page *page,
1220 1221
1221 mutex_lock(&sbi->node_write); 1222 mutex_lock(&sbi->node_write);
1222 set_page_writeback(page); 1223 set_page_writeback(page);
1223 write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); 1224 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
1224 set_node_addr(sbi, &ni, new_addr); 1225 set_node_addr(sbi, &ni, new_addr);
1225 dec_page_count(sbi, F2FS_DIRTY_NODES); 1226 dec_page_count(sbi, F2FS_DIRTY_NODES);
1226 mutex_unlock(&sbi->node_write); 1227 mutex_unlock(&sbi->node_write);
@@ -1255,6 +1256,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
1255 1256
 1256 /* if mounting failed, skip writing node pages */ 1257 /* if mounting failed, skip writing node pages */
1257 wbc->nr_to_write = 3 * max_hw_blocks(sbi); 1258 wbc->nr_to_write = 3 * max_hw_blocks(sbi);
1259 wbc->sync_mode = WB_SYNC_NONE;
1258 sync_node_pages(sbi, 0, wbc); 1260 sync_node_pages(sbi, 0, wbc);
1259 wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - 1261 wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) -
1260 wbc->nr_to_write); 1262 wbc->nr_to_write);
@@ -1333,7 +1335,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1333 return -1; 1335 return -1;
1334 1336
1335 /* 0 nid should not be used */ 1337 /* 0 nid should not be used */
1336 if (nid == 0) 1338 if (unlikely(nid == 0))
1337 return 0; 1339 return 0;
1338 1340
1339 if (build) { 1341 if (build) {
@@ -1386,7 +1388,7 @@ static void scan_nat_page(struct f2fs_nm_info *nm_i,
1386 1388
1387 for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { 1389 for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
1388 1390
1389 if (start_nid >= nm_i->max_nid) 1391 if (unlikely(start_nid >= nm_i->max_nid))
1390 break; 1392 break;
1391 1393
1392 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); 1394 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
@@ -1420,7 +1422,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
1420 f2fs_put_page(page, 1); 1422 f2fs_put_page(page, 1);
1421 1423
1422 nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); 1424 nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
1423 if (nid >= nm_i->max_nid) 1425 if (unlikely(nid >= nm_i->max_nid))
1424 nid = 0; 1426 nid = 0;
1425 1427
1426 if (i++ == FREE_NID_PAGES) 1428 if (i++ == FREE_NID_PAGES)
@@ -1454,7 +1456,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
1454 struct free_nid *i = NULL; 1456 struct free_nid *i = NULL;
1455 struct list_head *this; 1457 struct list_head *this;
1456retry: 1458retry:
1457 if (sbi->total_valid_node_count + 1 >= nm_i->max_nid) 1459 if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid))
1458 return false; 1460 return false;
1459 1461
1460 spin_lock(&nm_i->free_nid_list_lock); 1462 spin_lock(&nm_i->free_nid_list_lock);
@@ -1535,13 +1537,12 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
1535 1537
1536int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) 1538int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1537{ 1539{
1538 struct address_space *mapping = sbi->node_inode->i_mapping; 1540 struct f2fs_inode *src, *dst;
1539 struct f2fs_node *src, *dst;
1540 nid_t ino = ino_of_node(page); 1541 nid_t ino = ino_of_node(page);
1541 struct node_info old_ni, new_ni; 1542 struct node_info old_ni, new_ni;
1542 struct page *ipage; 1543 struct page *ipage;
1543 1544
1544 ipage = grab_cache_page(mapping, ino); 1545 ipage = grab_cache_page(NODE_MAPPING(sbi), ino);
1545 if (!ipage) 1546 if (!ipage)
1546 return -ENOMEM; 1547 return -ENOMEM;
1547 1548
@@ -1552,19 +1553,19 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1552 SetPageUptodate(ipage); 1553 SetPageUptodate(ipage);
1553 fill_node_footer(ipage, ino, ino, 0, true); 1554 fill_node_footer(ipage, ino, ino, 0, true);
1554 1555
1555 src = F2FS_NODE(page); 1556 src = F2FS_INODE(page);
1556 dst = F2FS_NODE(ipage); 1557 dst = F2FS_INODE(ipage);
1557 1558
1558 memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); 1559 memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
1559 dst->i.i_size = 0; 1560 dst->i_size = 0;
1560 dst->i.i_blocks = cpu_to_le64(1); 1561 dst->i_blocks = cpu_to_le64(1);
1561 dst->i.i_links = cpu_to_le32(1); 1562 dst->i_links = cpu_to_le32(1);
1562 dst->i.i_xattr_nid = 0; 1563 dst->i_xattr_nid = 0;
1563 1564
1564 new_ni = old_ni; 1565 new_ni = old_ni;
1565 new_ni.ino = ino; 1566 new_ni.ino = ino;
1566 1567
1567 if (!inc_valid_node_count(sbi, NULL, 1)) 1568 if (unlikely(!inc_valid_node_count(sbi, NULL)))
1568 WARN_ON(1); 1569 WARN_ON(1);
1569 set_node_addr(sbi, &new_ni, NEW_ADDR); 1570 set_node_addr(sbi, &new_ni, NEW_ADDR);
1570 inc_valid_inode_count(sbi); 1571 inc_valid_inode_count(sbi);
@@ -1572,47 +1573,88 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1572 return 0; 1573 return 0;
1573} 1574}
1574 1575
1576/*
 1577 * ra_sum_pages() merges contiguous pages into one bio and submits it.
 1578 * These pre-read pages are linked in the pages list.
1579 */
1580static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
1581 int start, int nrpages)
1582{
1583 struct page *page;
1584 int page_idx = start;
1585 struct f2fs_io_info fio = {
1586 .type = META,
1587 .rw = READ_SYNC | REQ_META | REQ_PRIO
1588 };
1589
1590 for (; page_idx < start + nrpages; page_idx++) {
 1591 /* allocate a temporary page to read node summary info */
1592 page = alloc_page(GFP_F2FS_ZERO);
1593 if (!page) {
1594 struct page *tmp;
1595 list_for_each_entry_safe(page, tmp, pages, lru) {
1596 list_del(&page->lru);
1597 unlock_page(page);
1598 __free_pages(page, 0);
1599 }
1600 return -ENOMEM;
1601 }
1602
1603 lock_page(page);
1604 page->index = page_idx;
1605 list_add_tail(&page->lru, pages);
1606 }
1607
1608 list_for_each_entry(page, pages, lru)
1609 f2fs_submit_page_mbio(sbi, page, page->index, &fio);
1610
1611 f2fs_submit_merged_bio(sbi, META, READ);
1612 return 0;
1613}
1614
1575int restore_node_summary(struct f2fs_sb_info *sbi, 1615int restore_node_summary(struct f2fs_sb_info *sbi,
1576 unsigned int segno, struct f2fs_summary_block *sum) 1616 unsigned int segno, struct f2fs_summary_block *sum)
1577{ 1617{
1578 struct f2fs_node *rn; 1618 struct f2fs_node *rn;
1579 struct f2fs_summary *sum_entry; 1619 struct f2fs_summary *sum_entry;
1580 struct page *page; 1620 struct page *page, *tmp;
1581 block_t addr; 1621 block_t addr;
1582 int i, last_offset; 1622 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
1583 1623 int i, last_offset, nrpages, err = 0;
1584 /* alloc temporal page for read node */ 1624 LIST_HEAD(page_list);
1585 page = alloc_page(GFP_NOFS | __GFP_ZERO);
1586 if (!page)
1587 return -ENOMEM;
1588 lock_page(page);
1589 1625
1590 /* scan the node segment */ 1626 /* scan the node segment */
1591 last_offset = sbi->blocks_per_seg; 1627 last_offset = sbi->blocks_per_seg;
1592 addr = START_BLOCK(sbi, segno); 1628 addr = START_BLOCK(sbi, segno);
1593 sum_entry = &sum->entries[0]; 1629 sum_entry = &sum->entries[0];
1594 1630
1595 for (i = 0; i < last_offset; i++, sum_entry++) { 1631 for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
1596 /* 1632 nrpages = min(last_offset - i, bio_blocks);
1597 * In order to read next node page,
1598 * we must clear PageUptodate flag.
1599 */
1600 ClearPageUptodate(page);
1601 1633
1602 if (f2fs_readpage(sbi, page, addr, READ_SYNC)) 1634 /* read ahead node pages */
1603 goto out; 1635 err = ra_sum_pages(sbi, &page_list, addr, nrpages);
1636 if (err)
1637 return err;
1604 1638
1605 lock_page(page); 1639 list_for_each_entry_safe(page, tmp, &page_list, lru) {
1606 rn = F2FS_NODE(page); 1640
1607 sum_entry->nid = rn->footer.nid; 1641 lock_page(page);
1608 sum_entry->version = 0; 1642 if (unlikely(!PageUptodate(page))) {
1609 sum_entry->ofs_in_node = 0; 1643 err = -EIO;
1610 addr++; 1644 } else {
1645 rn = F2FS_NODE(page);
1646 sum_entry->nid = rn->footer.nid;
1647 sum_entry->version = 0;
1648 sum_entry->ofs_in_node = 0;
1649 sum_entry++;
1650 }
1651
1652 list_del(&page->lru);
1653 unlock_page(page);
1654 __free_pages(page, 0);
1655 }
1611 } 1656 }
1612 unlock_page(page); 1657 return err;
1613out:
1614 __free_pages(page, 0);
1615 return 0;
1616} 1658}
1617 1659
1618static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) 1660static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 3496bb3e15dc..c4c79885c993 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -224,7 +224,13 @@ static inline block_t next_blkaddr_of_node(struct page *node_page)
224 * | `- direct node (5 + N => 5 + 2N - 1) 224 * | `- direct node (5 + N => 5 + 2N - 1)
225 * `- double indirect node (5 + 2N) 225 * `- double indirect node (5 + 2N)
226 * `- indirect node (6 + 2N) 226 * `- indirect node (6 + 2N)
227 * `- direct node (x(N + 1)) 227 * `- direct node
228 * ......
229 * `- indirect node ((6 + 2N) + x(N + 1))
230 * `- direct node
231 * ......
232 * `- indirect node ((6 + 2N) + (N - 1)(N + 1))
233 * `- direct node
228 */ 234 */
229static inline bool IS_DNODE(struct page *node_page) 235static inline bool IS_DNODE(struct page *node_page)
230{ 236{
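With N standing for NIDS_PER_BLOCK, the offsets enumerated in the expanded comment classify as below. The helper is purely illustrative (not in the patch) and simply restates the comment's arithmetic, including the repeat step (6 + 2N) + x(N + 1).

/* Hypothetical classifier for the node offsets listed above. */
static bool offset_is_indirect(unsigned int ofs, unsigned int N)
{
	if (ofs == 3 || ofs == 4 + N)	/* the two single-indirect nodes */
		return true;
	if (ofs == 5 + 2 * N)		/* the double-indirect node */
		return true;
	if (ofs > 5 + 2 * N)		/* its indirect children repeat every N + 1 */
		return (ofs - (6 + 2 * N)) % (N + 1) == 0;
	return false;			/* offset 0 is the inode, the rest are direct nodes */
}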
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index fdc81161f254..976a7a934db5 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -40,8 +40,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
40 40
41static int recover_dentry(struct page *ipage, struct inode *inode) 41static int recover_dentry(struct page *ipage, struct inode *inode)
42{ 42{
43 struct f2fs_node *raw_node = F2FS_NODE(ipage); 43 struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
44 struct f2fs_inode *raw_inode = &(raw_node->i);
45 nid_t pino = le32_to_cpu(raw_inode->i_pino); 44 nid_t pino = le32_to_cpu(raw_inode->i_pino);
46 struct f2fs_dir_entry *de; 45 struct f2fs_dir_entry *de;
47 struct qstr name; 46 struct qstr name;
@@ -62,6 +61,12 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
62 61
63 name.len = le32_to_cpu(raw_inode->i_namelen); 62 name.len = le32_to_cpu(raw_inode->i_namelen);
64 name.name = raw_inode->i_name; 63 name.name = raw_inode->i_name;
64
65 if (unlikely(name.len > F2FS_NAME_LEN)) {
66 WARN_ON(1);
67 err = -ENAMETOOLONG;
68 goto out;
69 }
65retry: 70retry:
66 de = f2fs_find_entry(dir, &name, &page); 71 de = f2fs_find_entry(dir, &name, &page);
67 if (de && inode->i_ino == le32_to_cpu(de->ino)) 72 if (de && inode->i_ino == le32_to_cpu(de->ino))
@@ -90,17 +95,16 @@ out_unmap_put:
90 kunmap(page); 95 kunmap(page);
91 f2fs_put_page(page, 0); 96 f2fs_put_page(page, 0);
92out: 97out:
93 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " 98 f2fs_msg(inode->i_sb, KERN_NOTICE,
94 "ino = %x, name = %s, dir = %lx, err = %d", 99 "%s: ino = %x, name = %s, dir = %lx, err = %d",
95 ino_of_node(ipage), raw_inode->i_name, 100 __func__, ino_of_node(ipage), raw_inode->i_name,
96 IS_ERR(dir) ? 0 : dir->i_ino, err); 101 IS_ERR(dir) ? 0 : dir->i_ino, err);
97 return err; 102 return err;
98} 103}
99 104
100static int recover_inode(struct inode *inode, struct page *node_page) 105static int recover_inode(struct inode *inode, struct page *node_page)
101{ 106{
102 struct f2fs_node *raw_node = F2FS_NODE(node_page); 107 struct f2fs_inode *raw_inode = F2FS_INODE(node_page);
103 struct f2fs_inode *raw_inode = &(raw_node->i);
104 108
105 if (!IS_INODE(node_page)) 109 if (!IS_INODE(node_page))
106 return 0; 110 return 0;
@@ -143,9 +147,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
143 while (1) { 147 while (1) {
144 struct fsync_inode_entry *entry; 148 struct fsync_inode_entry *entry;
145 149
146 err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); 150 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
147 if (err) 151 if (err)
148 goto out; 152 return err;
149 153
150 lock_page(page); 154 lock_page(page);
151 155
@@ -191,9 +195,10 @@ next:
191 /* check next segment */ 195 /* check next segment */
192 blkaddr = next_blkaddr_of_node(page); 196 blkaddr = next_blkaddr_of_node(page);
193 } 197 }
198
194 unlock_page(page); 199 unlock_page(page);
195out:
196 __free_pages(page, 0); 200 __free_pages(page, 0);
201
197 return err; 202 return err;
198} 203}
199 204
@@ -293,6 +298,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
293 struct node_info ni; 298 struct node_info ni;
294 int err = 0, recovered = 0; 299 int err = 0, recovered = 0;
295 300
301 if (recover_inline_data(inode, page))
302 goto out;
303
296 start = start_bidx_of_node(ofs_of_node(page), fi); 304 start = start_bidx_of_node(ofs_of_node(page), fi);
297 if (IS_INODE(page)) 305 if (IS_INODE(page))
298 end = start + ADDRS_PER_INODE(fi); 306 end = start + ADDRS_PER_INODE(fi);
@@ -300,12 +308,13 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
300 end = start + ADDRS_PER_BLOCK; 308 end = start + ADDRS_PER_BLOCK;
301 309
302 f2fs_lock_op(sbi); 310 f2fs_lock_op(sbi);
311
303 set_new_dnode(&dn, inode, NULL, NULL, 0); 312 set_new_dnode(&dn, inode, NULL, NULL, 0);
304 313
305 err = get_dnode_of_data(&dn, start, ALLOC_NODE); 314 err = get_dnode_of_data(&dn, start, ALLOC_NODE);
306 if (err) { 315 if (err) {
307 f2fs_unlock_op(sbi); 316 f2fs_unlock_op(sbi);
308 return err; 317 goto out;
309 } 318 }
310 319
311 wait_on_page_writeback(dn.node_page); 320 wait_on_page_writeback(dn.node_page);
@@ -356,10 +365,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
356err: 365err:
357 f2fs_put_dnode(&dn); 366 f2fs_put_dnode(&dn);
358 f2fs_unlock_op(sbi); 367 f2fs_unlock_op(sbi);
359 368out:
360 f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " 369 f2fs_msg(sbi->sb, KERN_NOTICE,
361 "recovered_data = %d blocks, err = %d", 370 "recover_data: ino = %lx, recovered = %d blocks, err = %d",
362 inode->i_ino, recovered, err); 371 inode->i_ino, recovered, err);
363 return err; 372 return err;
364} 373}
365 374
@@ -377,7 +386,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
377 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 386 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
378 387
379 /* read node page */ 388 /* read node page */
380 page = alloc_page(GFP_NOFS | __GFP_ZERO); 389 page = alloc_page(GFP_F2FS_ZERO);
381 if (!page) 390 if (!page)
382 return -ENOMEM; 391 return -ENOMEM;
383 392
@@ -386,9 +395,9 @@ static int recover_data(struct f2fs_sb_info *sbi,
386 while (1) { 395 while (1) {
387 struct fsync_inode_entry *entry; 396 struct fsync_inode_entry *entry;
388 397
389 err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); 398 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
390 if (err) 399 if (err)
391 goto out; 400 return err;
392 401
393 lock_page(page); 402 lock_page(page);
394 403
@@ -412,8 +421,8 @@ next:
412 /* check next segment */ 421 /* check next segment */
413 blkaddr = next_blkaddr_of_node(page); 422 blkaddr = next_blkaddr_of_node(page);
414 } 423 }
424
415 unlock_page(page); 425 unlock_page(page);
416out:
417 __free_pages(page, 0); 426 __free_pages(page, 0);
418 427
419 if (!err) 428 if (!err)
@@ -429,7 +438,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
429 438
430 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", 439 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
431 sizeof(struct fsync_inode_entry), NULL); 440 sizeof(struct fsync_inode_entry), NULL);
432 if (unlikely(!fsync_entry_slab)) 441 if (!fsync_entry_slab)
433 return -ENOMEM; 442 return -ENOMEM;
434 443
435 INIT_LIST_HEAD(&inode_list); 444 INIT_LIST_HEAD(&inode_list);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index fa284d397199..7caac5f2ca9e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -14,12 +14,163 @@
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/prefetch.h> 15#include <linux/prefetch.h>
16#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
17#include <linux/swap.h>
17 18
18#include "f2fs.h" 19#include "f2fs.h"
19#include "segment.h" 20#include "segment.h"
20#include "node.h" 21#include "node.h"
21#include <trace/events/f2fs.h> 22#include <trace/events/f2fs.h>
22 23
24#define __reverse_ffz(x) __reverse_ffs(~(x))
25
26static struct kmem_cache *discard_entry_slab;
27
28/*
29 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
30 * MSB and LSB are reversed in a byte by f2fs_set_bit.
31 */
32static inline unsigned long __reverse_ffs(unsigned long word)
33{
34 int num = 0;
35
36#if BITS_PER_LONG == 64
37 if ((word & 0xffffffff) == 0) {
38 num += 32;
39 word >>= 32;
40 }
41#endif
42 if ((word & 0xffff) == 0) {
43 num += 16;
44 word >>= 16;
45 }
46 if ((word & 0xff) == 0) {
47 num += 8;
48 word >>= 8;
49 }
50 if ((word & 0xf0) == 0)
51 num += 4;
52 else
53 word >>= 4;
54 if ((word & 0xc) == 0)
55 num += 2;
56 else
57 word >>= 2;
58 if ((word & 0x2) == 0)
59 num += 1;
60 return num;
61}
62
63/*
 64 * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
65 * f2fs_set_bit makes MSB and LSB reversed in a byte.
66 * Example:
67 * LSB <--> MSB
68 * f2fs_set_bit(0, bitmap) => 0000 0001
69 * f2fs_set_bit(7, bitmap) => 1000 0000
70 */
71static unsigned long __find_rev_next_bit(const unsigned long *addr,
72 unsigned long size, unsigned long offset)
73{
74 const unsigned long *p = addr + BIT_WORD(offset);
75 unsigned long result = offset & ~(BITS_PER_LONG - 1);
76 unsigned long tmp;
77 unsigned long mask, submask;
78 unsigned long quot, rest;
79
80 if (offset >= size)
81 return size;
82
83 size -= result;
84 offset %= BITS_PER_LONG;
85 if (!offset)
86 goto aligned;
87
88 tmp = *(p++);
89 quot = (offset >> 3) << 3;
90 rest = offset & 0x7;
91 mask = ~0UL << quot;
92 submask = (unsigned char)(0xff << rest) >> rest;
93 submask <<= quot;
94 mask &= submask;
95 tmp &= mask;
96 if (size < BITS_PER_LONG)
97 goto found_first;
98 if (tmp)
99 goto found_middle;
100
101 size -= BITS_PER_LONG;
102 result += BITS_PER_LONG;
103aligned:
104 while (size & ~(BITS_PER_LONG-1)) {
105 tmp = *(p++);
106 if (tmp)
107 goto found_middle;
108 result += BITS_PER_LONG;
109 size -= BITS_PER_LONG;
110 }
111 if (!size)
112 return result;
113 tmp = *p;
114found_first:
115 tmp &= (~0UL >> (BITS_PER_LONG - size));
116 if (tmp == 0UL) /* Are any bits set? */
117 return result + size; /* Nope. */
118found_middle:
119 return result + __reverse_ffs(tmp);
120}
121
122static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
123 unsigned long size, unsigned long offset)
124{
125 const unsigned long *p = addr + BIT_WORD(offset);
126 unsigned long result = offset & ~(BITS_PER_LONG - 1);
127 unsigned long tmp;
128 unsigned long mask, submask;
129 unsigned long quot, rest;
130
131 if (offset >= size)
132 return size;
133
134 size -= result;
135 offset %= BITS_PER_LONG;
136 if (!offset)
137 goto aligned;
138
139 tmp = *(p++);
140 quot = (offset >> 3) << 3;
141 rest = offset & 0x7;
142 mask = ~(~0UL << quot);
143 submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest);
144 submask <<= quot;
145 mask += submask;
146 tmp |= mask;
147 if (size < BITS_PER_LONG)
148 goto found_first;
149 if (~tmp)
150 goto found_middle;
151
152 size -= BITS_PER_LONG;
153 result += BITS_PER_LONG;
154aligned:
155 while (size & ~(BITS_PER_LONG - 1)) {
156 tmp = *(p++);
157 if (~tmp)
158 goto found_middle;
159 result += BITS_PER_LONG;
160 size -= BITS_PER_LONG;
161 }
162 if (!size)
163 return result;
164 tmp = *p;
165
166found_first:
167 tmp |= ~0UL << size;
168 if (tmp == ~0UL) /* Are any bits zero? */
169 return result + size; /* Nope. */
170found_middle:
171 return result + __reverse_ffz(tmp);
172}
173
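A worked illustration of the reversed search order (assumed little-endian test harness, not part of the patch): f2fs_set_bit(0, map) sets the most significant bit of byte 0, which these helpers report as position 0.

unsigned long map[1] = { 0 };

((unsigned char *)map)[0] = 0x80;	/* what f2fs_set_bit(0, map) stores */
/* __find_rev_next_bit(map, BITS_PER_LONG, 0) -> 0 */

((unsigned char *)map)[0] = 0x01;	/* what f2fs_set_bit(7, map) stores */
/* __find_rev_next_bit(map, BITS_PER_LONG, 0) -> 7 */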
23/* 174/*
24 * This function balances dirty node and dentry pages. 175 * This function balances dirty node and dentry pages.
25 * In addition, it controls garbage collection. 176 * In addition, it controls garbage collection.
@@ -116,6 +267,56 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
116 mutex_unlock(&dirty_i->seglist_lock); 267 mutex_unlock(&dirty_i->seglist_lock);
117} 268}
118 269
270static void f2fs_issue_discard(struct f2fs_sb_info *sbi,
271 block_t blkstart, block_t blklen)
272{
273 sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart);
274 sector_t len = SECTOR_FROM_BLOCK(sbi, blklen);
275 blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
276 trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
277}
278
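Worked example of the block-to-sector conversion (assuming 4 KiB blocks over 512-byte sectors, i.e. log_sectors_per_block == 3):

/* SECTOR_FROM_BLOCK(sbi, 100) == 100 << 3 == 800, so discarding
 * blocks [100, 116) issues a discard for sectors [800, 928). */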
279static void add_discard_addrs(struct f2fs_sb_info *sbi,
280 unsigned int segno, struct seg_entry *se)
281{
282 struct list_head *head = &SM_I(sbi)->discard_list;
283 struct discard_entry *new;
284 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
285 int max_blocks = sbi->blocks_per_seg;
286 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
287 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
288 unsigned long dmap[entries];
289 unsigned int start = 0, end = -1;
290 int i;
291
292 if (!test_opt(sbi, DISCARD))
293 return;
294
295 /* zero block will be discarded through the prefree list */
296 if (!se->valid_blocks || se->valid_blocks == max_blocks)
297 return;
298
299 /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
300 for (i = 0; i < entries; i++)
301 dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
302
303 while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
304 start = __find_rev_next_bit(dmap, max_blocks, end + 1);
305 if (start >= max_blocks)
306 break;
307
308 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
309
310 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
311 INIT_LIST_HEAD(&new->list);
312 new->blkaddr = START_BLOCK(sbi, segno) + start;
313 new->len = end - start;
314
315 list_add_tail(&new->list, head);
316 SM_I(sbi)->nr_discards += end - start;
317 }
318}
319
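The candidate map built above keeps exactly the blocks that were valid at the last checkpoint but have been invalidated since. Per-block truth table (illustration):

/* dmap = (cur ^ ckpt) & ckpt, per block:
 *   cur=1 ckpt=1 -> 0	still valid, keep
 *   cur=0 ckpt=1 -> 1	freed since the checkpoint: discard candidate
 *   cur=1 ckpt=0 -> 0	newly written, keep
 *   cur=0 ckpt=0 -> 0	was never valid here
 */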
119/* 320/*
120 * Should call clear_prefree_segments after checkpoint is done. 321 * Should call clear_prefree_segments after checkpoint is done.
121 */ 322 */
@@ -138,6 +339,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
138 339
139void clear_prefree_segments(struct f2fs_sb_info *sbi) 340void clear_prefree_segments(struct f2fs_sb_info *sbi)
140{ 341{
342 struct list_head *head = &(SM_I(sbi)->discard_list);
343 struct list_head *this, *next;
344 struct discard_entry *entry;
141 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 345 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
142 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; 346 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
143 unsigned int total_segs = TOTAL_SEGS(sbi); 347 unsigned int total_segs = TOTAL_SEGS(sbi);
@@ -160,14 +364,19 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
160 if (!test_opt(sbi, DISCARD)) 364 if (!test_opt(sbi, DISCARD))
161 continue; 365 continue;
162 366
163 blkdev_issue_discard(sbi->sb->s_bdev, 367 f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
164 START_BLOCK(sbi, start) << 368 (end - start) << sbi->log_blocks_per_seg);
165 sbi->log_sectors_per_block,
166 (1 << (sbi->log_sectors_per_block +
167 sbi->log_blocks_per_seg)) * (end - start),
168 GFP_NOFS, 0);
169 } 369 }
170 mutex_unlock(&dirty_i->seglist_lock); 370 mutex_unlock(&dirty_i->seglist_lock);
371
372 /* send small discards */
373 list_for_each_safe(this, next, head) {
374 entry = list_entry(this, struct discard_entry, list);
375 f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
376 list_del(&entry->list);
377 SM_I(sbi)->nr_discards -= entry->len;
378 kmem_cache_free(discard_entry_slab, entry);
379 }
171} 380}
172 381
173static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) 382static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
@@ -459,13 +668,18 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi,
459 struct curseg_info *seg, block_t start) 668 struct curseg_info *seg, block_t start)
460{ 669{
461 struct seg_entry *se = get_seg_entry(sbi, seg->segno); 670 struct seg_entry *se = get_seg_entry(sbi, seg->segno);
462 block_t ofs; 671 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
463 for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) { 672 unsigned long target_map[entries];
464 if (!f2fs_test_bit(ofs, se->ckpt_valid_map) 673 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
465 && !f2fs_test_bit(ofs, se->cur_valid_map)) 674 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
466 break; 675 int i, pos;
467 } 676
468 seg->next_blkoff = ofs; 677 for (i = 0; i < entries; i++)
678 target_map[i] = ckpt_map[i] | cur_map[i];
679
680 pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
681
682 seg->next_blkoff = pos;
469} 683}
470 684
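The rewrite replaces the per-bit loop: a block offset is free only when it is clear in both the checkpoint map and the current map, so OR-ing the two and finding the first zero yields next_blkoff in one pass. Worked example (illustration):

/* ckpt_map: 1 1 0 1 0 ...
 * cur_map:  1 0 1 0 0 ...
 * OR:       1 1 1 1 0 ...  -> first zero at offset 4 becomes next_blkoff
 */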
471/* 685/*
@@ -573,148 +787,6 @@ static const struct segment_allocation default_salloc_ops = {
573 .allocate_segment = allocate_segment_by_default, 787 .allocate_segment = allocate_segment_by_default,
574}; 788};
575 789
576static void f2fs_end_io_write(struct bio *bio, int err)
577{
578 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
579 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
580 struct bio_private *p = bio->bi_private;
581
582 do {
583 struct page *page = bvec->bv_page;
584
585 if (--bvec >= bio->bi_io_vec)
586 prefetchw(&bvec->bv_page->flags);
587 if (!uptodate) {
588 SetPageError(page);
589 if (page->mapping)
590 set_bit(AS_EIO, &page->mapping->flags);
591 set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
592 p->sbi->sb->s_flags |= MS_RDONLY;
593 }
594 end_page_writeback(page);
595 dec_page_count(p->sbi, F2FS_WRITEBACK);
596 } while (bvec >= bio->bi_io_vec);
597
598 if (p->is_sync)
599 complete(p->wait);
600
601 if (!get_pages(p->sbi, F2FS_WRITEBACK) &&
602 !list_empty(&p->sbi->cp_wait.task_list))
603 wake_up(&p->sbi->cp_wait);
604
605 kfree(p);
606 bio_put(bio);
607}
608
609struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
610{
611 struct bio *bio;
612
613 /* No failure on bio allocation */
614 bio = bio_alloc(GFP_NOIO, npages);
615 bio->bi_bdev = bdev;
616 bio->bi_private = NULL;
617
618 return bio;
619}
620
621static void do_submit_bio(struct f2fs_sb_info *sbi,
622 enum page_type type, bool sync)
623{
624 int rw = sync ? WRITE_SYNC : WRITE;
625 enum page_type btype = type > META ? META : type;
626
627 if (type >= META_FLUSH)
628 rw = WRITE_FLUSH_FUA;
629
630 if (btype == META)
631 rw |= REQ_META;
632
633 if (sbi->bio[btype]) {
634 struct bio_private *p = sbi->bio[btype]->bi_private;
635 p->sbi = sbi;
636 sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
637
638 trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]);
639
640 if (type == META_FLUSH) {
641 DECLARE_COMPLETION_ONSTACK(wait);
642 p->is_sync = true;
643 p->wait = &wait;
644 submit_bio(rw, sbi->bio[btype]);
645 wait_for_completion(&wait);
646 } else {
647 p->is_sync = false;
648 submit_bio(rw, sbi->bio[btype]);
649 }
650 sbi->bio[btype] = NULL;
651 }
652}
653
654void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync)
655{
656 down_write(&sbi->bio_sem);
657 do_submit_bio(sbi, type, sync);
658 up_write(&sbi->bio_sem);
659}
660
661static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
662 block_t blk_addr, enum page_type type)
663{
664 struct block_device *bdev = sbi->sb->s_bdev;
665 int bio_blocks;
666
667 verify_block_addr(sbi, blk_addr);
668
669 down_write(&sbi->bio_sem);
670
671 inc_page_count(sbi, F2FS_WRITEBACK);
672
673 if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1)
674 do_submit_bio(sbi, type, false);
675alloc_new:
676 if (sbi->bio[type] == NULL) {
677 struct bio_private *priv;
678retry:
679 priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
680 if (!priv) {
681 cond_resched();
682 goto retry;
683 }
684
685 bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
686 sbi->bio[type] = f2fs_bio_alloc(bdev, bio_blocks);
687 sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
688 sbi->bio[type]->bi_private = priv;
689 /*
690 * The end_io will be assigned at the sumbission phase.
691 * Until then, let bio_add_page() merge consecutive IOs as much
692 * as possible.
693 */
694 }
695
696 if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) <
697 PAGE_CACHE_SIZE) {
698 do_submit_bio(sbi, type, false);
699 goto alloc_new;
700 }
701
702 sbi->last_block_in_bio[type] = blk_addr;
703
704 up_write(&sbi->bio_sem);
705 trace_f2fs_submit_write_page(page, blk_addr, type);
706}
707
708void f2fs_wait_on_page_writeback(struct page *page,
709 enum page_type type, bool sync)
710{
711 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
712 if (PageWriteback(page)) {
713 f2fs_submit_bio(sbi, type, sync);
714 wait_on_page_writeback(page);
715 }
716}
717
718static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) 790static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
719{ 791{
720 struct curseg_info *curseg = CURSEG_I(sbi, type); 792 struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -782,16 +854,14 @@ static int __get_segment_type(struct page *page, enum page_type p_type)
782 return __get_segment_type_6(page, p_type); 854 return __get_segment_type_6(page, p_type);
783} 855}
784 856
785static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, 857void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
786 block_t old_blkaddr, block_t *new_blkaddr, 858 block_t old_blkaddr, block_t *new_blkaddr,
787 struct f2fs_summary *sum, enum page_type p_type) 859 struct f2fs_summary *sum, int type)
788{ 860{
789 struct sit_info *sit_i = SIT_I(sbi); 861 struct sit_info *sit_i = SIT_I(sbi);
790 struct curseg_info *curseg; 862 struct curseg_info *curseg;
791 unsigned int old_cursegno; 863 unsigned int old_cursegno;
792 int type;
793 864
794 type = __get_segment_type(page, p_type);
795 curseg = CURSEG_I(sbi, type); 865 curseg = CURSEG_I(sbi, type);
796 866
797 mutex_lock(&curseg->curseg_mutex); 867 mutex_lock(&curseg->curseg_mutex);
@@ -824,49 +894,64 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
824 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 894 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
825 mutex_unlock(&sit_i->sentry_lock); 895 mutex_unlock(&sit_i->sentry_lock);
826 896
827 if (p_type == NODE) 897 if (page && IS_NODESEG(type))
828 fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); 898 fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
829 899
830 /* writeout dirty page into bdev */
831 submit_write_page(sbi, page, *new_blkaddr, p_type);
832
833 mutex_unlock(&curseg->curseg_mutex); 900 mutex_unlock(&curseg->curseg_mutex);
834} 901}
835 902
903static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
904 block_t old_blkaddr, block_t *new_blkaddr,
905 struct f2fs_summary *sum, struct f2fs_io_info *fio)
906{
907 int type = __get_segment_type(page, fio->type);
908
909 allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type);
910
911 /* writeout dirty page into bdev */
912 f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio);
913}
914
836void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) 915void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
837{ 916{
917 struct f2fs_io_info fio = {
918 .type = META,
919 .rw = WRITE_SYNC | REQ_META | REQ_PRIO
920 };
921
838 set_page_writeback(page); 922 set_page_writeback(page);
839 submit_write_page(sbi, page, page->index, META); 923 f2fs_submit_page_mbio(sbi, page, page->index, &fio);
840} 924}
841 925
842void write_node_page(struct f2fs_sb_info *sbi, struct page *page, 926void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
927 struct f2fs_io_info *fio,
843 unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) 928 unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
844{ 929{
845 struct f2fs_summary sum; 930 struct f2fs_summary sum;
846 set_summary(&sum, nid, 0, 0); 931 set_summary(&sum, nid, 0, 0);
847 do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE); 932 do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio);
848} 933}
849 934
850void write_data_page(struct inode *inode, struct page *page, 935void write_data_page(struct page *page, struct dnode_of_data *dn,
851 struct dnode_of_data *dn, block_t old_blkaddr, 936 block_t *new_blkaddr, struct f2fs_io_info *fio)
852 block_t *new_blkaddr)
853{ 937{
854 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 938 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
855 struct f2fs_summary sum; 939 struct f2fs_summary sum;
856 struct node_info ni; 940 struct node_info ni;
857 941
858 f2fs_bug_on(old_blkaddr == NULL_ADDR); 942 f2fs_bug_on(dn->data_blkaddr == NULL_ADDR);
859 get_node_info(sbi, dn->nid, &ni); 943 get_node_info(sbi, dn->nid, &ni);
860 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 944 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
861 945
862 do_write_page(sbi, page, old_blkaddr, 946 do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio);
863 new_blkaddr, &sum, DATA);
864} 947}
865 948
866void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, 949void rewrite_data_page(struct page *page, block_t old_blkaddr,
867 block_t old_blk_addr) 950 struct f2fs_io_info *fio)
868{ 951{
869 submit_write_page(sbi, page, old_blk_addr, DATA); 952 struct inode *inode = page->mapping->host;
953 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
954 f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio);
870} 955}
871 956
872void recover_data_page(struct f2fs_sb_info *sbi, 957void recover_data_page(struct f2fs_sb_info *sbi,
@@ -925,6 +1010,10 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
925 unsigned int segno, old_cursegno; 1010 unsigned int segno, old_cursegno;
926 block_t next_blkaddr = next_blkaddr_of_node(page); 1011 block_t next_blkaddr = next_blkaddr_of_node(page);
927 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); 1012 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
1013 struct f2fs_io_info fio = {
1014 .type = NODE,
1015 .rw = WRITE_SYNC,
1016 };
928 1017
929 curseg = CURSEG_I(sbi, type); 1018 curseg = CURSEG_I(sbi, type);
930 1019
@@ -953,8 +1042,8 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
953 1042
954 /* rewrite node page */ 1043 /* rewrite node page */
955 set_page_writeback(page); 1044 set_page_writeback(page);
956 submit_write_page(sbi, page, new_blkaddr, NODE); 1045 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
957 f2fs_submit_bio(sbi, NODE, true); 1046 f2fs_submit_merged_bio(sbi, NODE, WRITE);
958 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 1047 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
959 1048
960 locate_dirty_segment(sbi, old_cursegno); 1049 locate_dirty_segment(sbi, old_cursegno);
@@ -964,6 +1053,16 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
964 mutex_unlock(&curseg->curseg_mutex); 1053 mutex_unlock(&curseg->curseg_mutex);
965} 1054}
966 1055
1056void f2fs_wait_on_page_writeback(struct page *page,
1057 enum page_type type)
1058{
1059 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1060 if (PageWriteback(page)) {
1061 f2fs_submit_merged_bio(sbi, type, WRITE);
1062 wait_on_page_writeback(page);
1063 }
1064}
1065
967static int read_compacted_summaries(struct f2fs_sb_info *sbi) 1066static int read_compacted_summaries(struct f2fs_sb_info *sbi)
968{ 1067{
969 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 1068 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1314,6 +1413,10 @@ void flush_sit_entries(struct f2fs_sb_info *sbi)
1314 1413
1315 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); 1414 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
1316 1415
1416 /* add discard candidates */
1417 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards)
1418 add_discard_addrs(sbi, segno, se);
1419
1317 if (flushed) 1420 if (flushed)
1318 goto to_sit_page; 1421 goto to_sit_page;
1319 1422
@@ -1480,41 +1583,94 @@ static int build_curseg(struct f2fs_sb_info *sbi)
1480 return restore_curseg_summaries(sbi); 1583 return restore_curseg_summaries(sbi);
1481} 1584}
1482 1585
1586static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages)
1587{
1588 struct address_space *mapping = META_MAPPING(sbi);
1589 struct page *page;
1590 block_t blk_addr, prev_blk_addr = 0;
1591 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1592 int blkno = start;
1593 struct f2fs_io_info fio = {
1594 .type = META,
1595 .rw = READ_SYNC | REQ_META | REQ_PRIO
1596 };
1597
1598 for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) {
1599
1600 blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK);
1601
1602 if (blkno != start && prev_blk_addr + 1 != blk_addr)
1603 break;
1604 prev_blk_addr = blk_addr;
1605repeat:
1606 page = grab_cache_page(mapping, blk_addr);
1607 if (!page) {
1608 cond_resched();
1609 goto repeat;
1610 }
1611 if (PageUptodate(page)) {
1612 mark_page_accessed(page);
1613 f2fs_put_page(page, 1);
1614 continue;
1615 }
1616
1617 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
1618
1619 mark_page_accessed(page);
1620 f2fs_put_page(page, 0);
1621 }
1622
1623 f2fs_submit_merged_bio(sbi, META, READ);
1624 return blkno - start;
1625}
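
ra_sit_pages() reads SIT blocks ahead in batches, but a SIT block can live at either of two alternating on-disk locations, so current_sit_addr() can jump; the loop therefore stops the batch at the first discontiguous address and reports how far it got. A sketch of that early-exit shape, with an invented address table in place of current_sit_addr():

/* Sketch of the "read ahead while contiguous" loop; addrs[] is made up. */
#include <stdio.h>

int main(void)
{
	unsigned long addrs[] = { 40, 41, 42, 90, 91 }; /* pretend SIT addrs */
	int nrpages = 5, start = 0, blkno;
	unsigned long prev = 0;

	for (blkno = start; blkno < start + nrpages && blkno < 5; blkno++) {
		unsigned long blk_addr = addrs[blkno];

		/* same break condition as ra_sit_pages() */
		if (blkno != start && prev + 1 != blk_addr)
			break;
		prev = blk_addr;
		printf("read ahead block %lu\n", blk_addr);
	}
	printf("readed = %d\n", blkno - start); /* caller advances by this */
	return 0;
}
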
1626
1483static void build_sit_entries(struct f2fs_sb_info *sbi) 1627static void build_sit_entries(struct f2fs_sb_info *sbi)
1484{ 1628{
1485 struct sit_info *sit_i = SIT_I(sbi); 1629 struct sit_info *sit_i = SIT_I(sbi);
1486 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1630 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1487 struct f2fs_summary_block *sum = curseg->sum_blk; 1631 struct f2fs_summary_block *sum = curseg->sum_blk;
1488 unsigned int start; 1632 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1633 unsigned int i, start, end;
1634 unsigned int readed, start_blk = 0;
1635 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
1489 1636
1490 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 1637 do {
1491 struct seg_entry *se = &sit_i->sentries[start]; 1638 readed = ra_sit_pages(sbi, start_blk, nrpages);
1492 struct f2fs_sit_block *sit_blk; 1639
1493 struct f2fs_sit_entry sit; 1640 start = start_blk * sit_i->sents_per_block;
1494 struct page *page; 1641 end = (start_blk + readed) * sit_i->sents_per_block;
1495 int i; 1642
1496 1643 for (; start < end && start < TOTAL_SEGS(sbi); start++) {
1497 mutex_lock(&curseg->curseg_mutex); 1644 struct seg_entry *se = &sit_i->sentries[start];
1498 for (i = 0; i < sits_in_cursum(sum); i++) { 1645 struct f2fs_sit_block *sit_blk;
1499 if (le32_to_cpu(segno_in_journal(sum, i)) == start) { 1646 struct f2fs_sit_entry sit;
1500 sit = sit_in_journal(sum, i); 1647 struct page *page;
1501 mutex_unlock(&curseg->curseg_mutex); 1648
1502 goto got_it; 1649 mutex_lock(&curseg->curseg_mutex);
1650 for (i = 0; i < sits_in_cursum(sum); i++) {
1651 if (le32_to_cpu(segno_in_journal(sum, i))
1652 == start) {
1653 sit = sit_in_journal(sum, i);
1654 mutex_unlock(&curseg->curseg_mutex);
1655 goto got_it;
1656 }
1503 } 1657 }
1504 } 1658 mutex_unlock(&curseg->curseg_mutex);
1505 mutex_unlock(&curseg->curseg_mutex); 1659
1506 page = get_current_sit_page(sbi, start); 1660 page = get_current_sit_page(sbi, start);
1507 sit_blk = (struct f2fs_sit_block *)page_address(page); 1661 sit_blk = (struct f2fs_sit_block *)page_address(page);
1508 sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; 1662 sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
1509 f2fs_put_page(page, 1); 1663 f2fs_put_page(page, 1);
1510got_it: 1664got_it:
1511 check_block_count(sbi, start, &sit); 1665 check_block_count(sbi, start, &sit);
1512 seg_info_from_raw_sit(se, &sit); 1666 seg_info_from_raw_sit(se, &sit);
1513 if (sbi->segs_per_sec > 1) { 1667 if (sbi->segs_per_sec > 1) {
1514 struct sec_entry *e = get_sec_entry(sbi, start); 1668 struct sec_entry *e = get_sec_entry(sbi, start);
1515 e->valid_blocks += se->valid_blocks; 1669 e->valid_blocks += se->valid_blocks;
1670 }
1516 } 1671 }
1517 } 1672 start_blk += readed;
1673 } while (start_blk < sit_blk_cnt);
1518} 1674}
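
build_sit_entries() then consumes segments in windows sized by whatever ra_sit_pages() managed to read: segments [start_blk * sents_per_block, (start_blk + readed) * sents_per_block), bounded by TOTAL_SEGS(). The window arithmetic, modelled with assumed counts (55 entries per block mirrors SIT_ENTRY_PER_BLOCK; the other numbers are invented):

#include <stdio.h>

int main(void)
{
	int sents_per_block = 55;	/* SIT_ENTRY_PER_BLOCK in f2fs */
	int total_segs = 300;
	int sit_blk_cnt = (total_segs + sents_per_block - 1) / sents_per_block;
	int nrpages = 4, start_blk = 0;

	do {
		int readed = nrpages;	/* pretend ra_sit_pages() got them all */
		if (start_blk + readed > sit_blk_cnt)
			readed = sit_blk_cnt - start_blk;

		int start = start_blk * sents_per_block;
		int end = (start_blk + readed) * sents_per_block;

		if (end > total_segs)
			end = total_segs; /* kernel loop bounds by TOTAL_SEGS() */
		printf("window: segments [%d, %d)\n", start, end);
		start_blk += readed;
	} while (start_blk < sit_blk_cnt);
	return 0;
}
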
1519 1675
1520static void init_free_segmap(struct f2fs_sb_info *sbi) 1676static void init_free_segmap(struct f2fs_sb_info *sbi)
@@ -1644,6 +1800,12 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1644 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); 1800 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
1645 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); 1801 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1646 sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; 1802 sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS;
1803 sm_info->ipu_policy = F2FS_IPU_DISABLE;
1804 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
1805
1806 INIT_LIST_HEAD(&sm_info->discard_list);
1807 sm_info->nr_discards = 0;
1808 sm_info->max_discards = 0;
1647 1809
1648 err = build_sit_info(sbi); 1810 err = build_sit_info(sbi);
1649 if (err) 1811 if (err)
@@ -1760,3 +1922,17 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
1760 sbi->sm_info = NULL; 1922 sbi->sm_info = NULL;
1761 kfree(sm_info); 1923 kfree(sm_info);
1762} 1924}
1925
1926int __init create_segment_manager_caches(void)
1927{
1928 discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
1929 sizeof(struct discard_entry), NULL);
1930 if (!discard_entry_slab)
1931 return -ENOMEM;
1932 return 0;
1933}
1934
1935void destroy_segment_manager_caches(void)
1936{
1937 kmem_cache_destroy(discard_entry_slab);
1938}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 269f690b4e24..5731682d7516 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -20,13 +20,8 @@
20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) 20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
21#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) 21#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
22 22
23#define IS_DATASEG(t) \ 23#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA)
24 ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \ 24#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE)
25 (t == CURSEG_WARM_DATA))
26
27#define IS_NODESEG(t) \
28 ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \
29 (t == CURSEG_WARM_NODE))
30 25
31#define IS_CURSEG(sbi, seg) \ 26#define IS_CURSEG(sbi, seg) \
32 ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ 27 ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
@@ -83,25 +78,20 @@
83 (segno / SIT_ENTRY_PER_BLOCK) 78 (segno / SIT_ENTRY_PER_BLOCK)
84#define START_SEGNO(sit_i, segno) \ 79#define START_SEGNO(sit_i, segno) \
85 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) 80 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
81#define SIT_BLK_CNT(sbi) \
82 ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
86#define f2fs_bitmap_size(nr) \ 83#define f2fs_bitmap_size(nr) \
87 (BITS_TO_LONGS(nr) * sizeof(unsigned long)) 84 (BITS_TO_LONGS(nr) * sizeof(unsigned long))
88#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) 85#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
89#define TOTAL_SECS(sbi) (sbi->total_sections) 86#define TOTAL_SECS(sbi) (sbi->total_sections)
90 87
91#define SECTOR_FROM_BLOCK(sbi, blk_addr) \ 88#define SECTOR_FROM_BLOCK(sbi, blk_addr) \
92 (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) 89 (((sector_t)blk_addr) << (sbi)->log_sectors_per_block)
93#define SECTOR_TO_BLOCK(sbi, sectors) \ 90#define SECTOR_TO_BLOCK(sbi, sectors) \
94 (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) 91 (sectors >> (sbi)->log_sectors_per_block)
95#define MAX_BIO_BLOCKS(max_hw_blocks) \ 92#define MAX_BIO_BLOCKS(max_hw_blocks) \
96 (min((int)max_hw_blocks, BIO_MAX_PAGES)) 93 (min((int)max_hw_blocks, BIO_MAX_PAGES))
97 94
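
The SECTOR_FROM_BLOCK() change is not just a rename: block_t is 32 bits wide, so shifting a large block address left by log_sectors_per_block overflows before the result is widened. Casting to sector_t first performs the shift in 64 bits. A small demonstration of the overflow class being fixed (the sample values are illustrative):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t blk_addr = 0x40000000;	/* a block address beyond 2^30 */
	int log_sectors_per_block = 3;	/* 4KB blocks, 512B sectors */

	uint32_t bad = blk_addr << log_sectors_per_block;	   /* wraps */
	uint64_t good = ((uint64_t)blk_addr) << log_sectors_per_block;

	printf("32-bit shift: %u\n", bad);	/* truncated result */
	printf("64-bit shift: %llu\n", (unsigned long long)good);
	return 0;
}
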
98/* during checkpoint, bio_private is used to synchronize the last bio */
99struct bio_private {
100 struct f2fs_sb_info *sbi;
101 bool is_sync;
102 void *wait;
103};
104
105/* 95/*
106 * indicate a block allocation direction: RIGHT and LEFT. 96 * indicate a block allocation direction: RIGHT and LEFT.
107 * RIGHT means allocating new sections towards the end of volume. 97 * RIGHT means allocating new sections towards the end of volume.
@@ -458,8 +448,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
458 448
459static inline bool need_SSR(struct f2fs_sb_info *sbi) 449static inline bool need_SSR(struct f2fs_sb_info *sbi)
460{ 450{
461 return ((prefree_segments(sbi) / sbi->segs_per_sec) 451 return (prefree_segments(sbi) / sbi->segs_per_sec)
462 + free_sections(sbi) < overprovision_sections(sbi)); 452 + free_sections(sbi) < overprovision_sections(sbi);
463} 453}
464 454
465static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) 455static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -467,38 +457,71 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
467 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); 457 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
468 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); 458 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
469 459
470 if (sbi->por_doing) 460 if (unlikely(sbi->por_doing))
471 return false; 461 return false;
472 462
473 return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + 463 return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
474 reserved_sections(sbi))); 464 reserved_sections(sbi));
475} 465}
476 466
477static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) 467static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
478{ 468{
479 return (prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments); 469 return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments;
480} 470}
481 471
482static inline int utilization(struct f2fs_sb_info *sbi) 472static inline int utilization(struct f2fs_sb_info *sbi)
483{ 473{
484 return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); 474 return div_u64((u64)valid_user_blocks(sbi) * 100,
475 sbi->user_block_count);
485} 476}
486 477
487/* 478/*
 488 * Sometimes it is better for f2fs to drop the out-of-place update policy.	 479 * Sometimes it is better for f2fs to drop the out-of-place update policy.
489 * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write 480 * And, users can control the policy through sysfs entries.
490 * data in the original place likewise other traditional file systems. 481 * There are five policies with triggering conditions as follows.
491 * But, currently set 100 in percentage, which means it is disabled. 482 * F2FS_IPU_FORCE - all the time,
492 * See below need_inplace_update(). 483 * F2FS_IPU_SSR - if SSR mode is activated,
 484 * F2FS_IPU_UTIL - if FS utilization is over threshold,
 485 * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over
 486 * threshold,
 487 * F2FS_IPU_DISABLE - disable IPU (the default option).
493 */ 488 */
494#define MIN_IPU_UTIL 100 489#define DEF_MIN_IPU_UTIL 70
490
491enum {
492 F2FS_IPU_FORCE,
493 F2FS_IPU_SSR,
494 F2FS_IPU_UTIL,
495 F2FS_IPU_SSR_UTIL,
496 F2FS_IPU_DISABLE,
497};
498
495static inline bool need_inplace_update(struct inode *inode) 499static inline bool need_inplace_update(struct inode *inode)
496{ 500{
497 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 501 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
502
503 /* IPU can be done only for the user data */
498 if (S_ISDIR(inode->i_mode)) 504 if (S_ISDIR(inode->i_mode))
499 return false; 505 return false;
500 if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL) 506
507 switch (SM_I(sbi)->ipu_policy) {
508 case F2FS_IPU_FORCE:
501 return true; 509 return true;
510 case F2FS_IPU_SSR:
511 if (need_SSR(sbi))
512 return true;
513 break;
514 case F2FS_IPU_UTIL:
515 if (utilization(sbi) > SM_I(sbi)->min_ipu_util)
516 return true;
517 break;
518 case F2FS_IPU_SSR_UTIL:
519 if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util)
520 return true;
521 break;
522 case F2FS_IPU_DISABLE:
523 break;
524 }
502 return false; 525 return false;
503} 526}
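
The rewritten need_inplace_update() is a straight policy table: each sysfs-selectable mode names the condition under which f2fs writes user data in place rather than out of place. The same decision restated as a self-contained function (need_ssr and util are stubbed inputs here, not f2fs calls):

#include <stdbool.h>
#include <stdio.h>

enum { IPU_FORCE, IPU_SSR, IPU_UTIL, IPU_SSR_UTIL, IPU_DISABLE };

static bool need_ipu(int policy, bool need_ssr, int util, int min_util)
{
	switch (policy) {
	case IPU_FORCE:    return true;
	case IPU_SSR:      return need_ssr;
	case IPU_UTIL:     return util > min_util;
	case IPU_SSR_UTIL: return need_ssr && util > min_util;
	default:           return false; /* IPU_DISABLE */
	}
}

int main(void)
{
	printf("%d\n", need_ipu(IPU_SSR_UTIL, true, 80, 70)); /* 1 */
	printf("%d\n", need_ipu(IPU_DISABLE, true, 99, 70));  /* 0 */
	return 0;
}
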
504 527
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index bafff72de8e8..1a85f83abd53 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -50,6 +50,7 @@ enum {
50 Opt_active_logs, 50 Opt_active_logs,
51 Opt_disable_ext_identify, 51 Opt_disable_ext_identify,
52 Opt_inline_xattr, 52 Opt_inline_xattr,
53 Opt_inline_data,
53 Opt_err, 54 Opt_err,
54}; 55};
55 56
@@ -65,6 +66,7 @@ static match_table_t f2fs_tokens = {
65 {Opt_active_logs, "active_logs=%u"}, 66 {Opt_active_logs, "active_logs=%u"},
66 {Opt_disable_ext_identify, "disable_ext_identify"}, 67 {Opt_disable_ext_identify, "disable_ext_identify"},
67 {Opt_inline_xattr, "inline_xattr"}, 68 {Opt_inline_xattr, "inline_xattr"},
69 {Opt_inline_data, "inline_data"},
68 {Opt_err, NULL}, 70 {Opt_err, NULL},
69}; 71};
70 72
@@ -72,6 +74,7 @@ static match_table_t f2fs_tokens = {
72enum { 74enum {
73 GC_THREAD, /* struct f2fs_gc_thread */ 75 GC_THREAD, /* struct f2fs_gc_thread */
74 SM_INFO, /* struct f2fs_sm_info */ 76 SM_INFO, /* struct f2fs_sm_info */
77 F2FS_SBI, /* struct f2fs_sb_info */
75}; 78};
76 79
77struct f2fs_attr { 80struct f2fs_attr {
@@ -89,6 +92,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
89 return (unsigned char *)sbi->gc_thread; 92 return (unsigned char *)sbi->gc_thread;
90 else if (struct_type == SM_INFO) 93 else if (struct_type == SM_INFO)
91 return (unsigned char *)SM_I(sbi); 94 return (unsigned char *)SM_I(sbi);
95 else if (struct_type == F2FS_SBI)
96 return (unsigned char *)sbi;
92 return NULL; 97 return NULL;
93} 98}
94 99
@@ -175,6 +180,10 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
175F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); 180F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
176F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); 181F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
177F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); 182F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
183F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
184F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
185F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
186F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
178 187
179#define ATTR_LIST(name) (&f2fs_attr_##name.attr) 188#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
180static struct attribute *f2fs_attrs[] = { 189static struct attribute *f2fs_attrs[] = {
@@ -183,6 +192,10 @@ static struct attribute *f2fs_attrs[] = {
183 ATTR_LIST(gc_no_gc_sleep_time), 192 ATTR_LIST(gc_no_gc_sleep_time),
184 ATTR_LIST(gc_idle), 193 ATTR_LIST(gc_idle),
185 ATTR_LIST(reclaim_segments), 194 ATTR_LIST(reclaim_segments),
195 ATTR_LIST(max_small_discards),
196 ATTR_LIST(ipu_policy),
197 ATTR_LIST(min_ipu_util),
198 ATTR_LIST(max_victim_search),
186 NULL, 199 NULL,
187}; 200};
188 201
@@ -311,6 +324,9 @@ static int parse_options(struct super_block *sb, char *options)
311 case Opt_disable_ext_identify: 324 case Opt_disable_ext_identify:
312 set_opt(sbi, DISABLE_EXT_IDENTIFY); 325 set_opt(sbi, DISABLE_EXT_IDENTIFY);
313 break; 326 break;
327 case Opt_inline_data:
328 set_opt(sbi, INLINE_DATA);
329 break;
314 default: 330 default:
315 f2fs_msg(sb, KERN_ERR, 331 f2fs_msg(sb, KERN_ERR,
316 "Unrecognized mount option \"%s\" or missing value", 332 "Unrecognized mount option \"%s\" or missing value",
@@ -325,7 +341,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
325{ 341{
326 struct f2fs_inode_info *fi; 342 struct f2fs_inode_info *fi;
327 343
328 fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO); 344 fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO);
329 if (!fi) 345 if (!fi)
330 return NULL; 346 return NULL;
331 347
@@ -508,7 +524,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
508#endif 524#endif
509 if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) 525 if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
510 seq_puts(seq, ",disable_ext_identify"); 526 seq_puts(seq, ",disable_ext_identify");
511 527 if (test_opt(sbi, INLINE_DATA))
528 seq_puts(seq, ",inline_data");
512 seq_printf(seq, ",active_logs=%u", sbi->active_logs); 529 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
513 530
514 return 0; 531 return 0;
@@ -518,7 +535,8 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
518{ 535{
519 struct super_block *sb = seq->private; 536 struct super_block *sb = seq->private;
520 struct f2fs_sb_info *sbi = F2FS_SB(sb); 537 struct f2fs_sb_info *sbi = F2FS_SB(sb);
521 unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); 538 unsigned int total_segs =
539 le32_to_cpu(sbi->raw_super->segment_count_main);
522 int i; 540 int i;
523 541
524 for (i = 0; i < total_segs; i++) { 542 for (i = 0; i < total_segs; i++) {
@@ -618,7 +636,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
618 struct f2fs_sb_info *sbi = F2FS_SB(sb); 636 struct f2fs_sb_info *sbi = F2FS_SB(sb);
619 struct inode *inode; 637 struct inode *inode;
620 638
621 if (ino < F2FS_ROOT_INO(sbi)) 639 if (unlikely(ino < F2FS_ROOT_INO(sbi)))
622 return ERR_PTR(-ESTALE); 640 return ERR_PTR(-ESTALE);
623 641
624 /* 642 /*
@@ -629,7 +647,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
629 inode = f2fs_iget(sb, ino); 647 inode = f2fs_iget(sb, ino);
630 if (IS_ERR(inode)) 648 if (IS_ERR(inode))
631 return ERR_CAST(inode); 649 return ERR_CAST(inode);
632 if (generation && inode->i_generation != generation) { 650 if (unlikely(generation && inode->i_generation != generation)) {
633 /* we didn't find the right inode.. */ 651 /* we didn't find the right inode.. */
634 iput(inode); 652 iput(inode);
635 return ERR_PTR(-ESTALE); 653 return ERR_PTR(-ESTALE);
@@ -732,10 +750,10 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
732 fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); 750 fsmeta += le32_to_cpu(ckpt->rsvd_segment_count);
733 fsmeta += le32_to_cpu(raw_super->segment_count_ssa); 751 fsmeta += le32_to_cpu(raw_super->segment_count_ssa);
734 752
735 if (fsmeta >= total) 753 if (unlikely(fsmeta >= total))
736 return 1; 754 return 1;
737 755
738 if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { 756 if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
739 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); 757 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
740 return 1; 758 return 1;
741 } 759 }
@@ -763,6 +781,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
763 sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); 781 sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
764 sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); 782 sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
765 sbi->cur_victim_sec = NULL_SECNO; 783 sbi->cur_victim_sec = NULL_SECNO;
784 sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
766 785
767 for (i = 0; i < NR_COUNT_TYPE; i++) 786 for (i = 0; i < NR_COUNT_TYPE; i++)
768 atomic_set(&sbi->nr_pages[i], 0); 787 atomic_set(&sbi->nr_pages[i], 0);
@@ -798,9 +817,10 @@ retry:
798 /* sanity checking of raw super */ 817 /* sanity checking of raw super */
799 if (sanity_check_raw_super(sb, *raw_super)) { 818 if (sanity_check_raw_super(sb, *raw_super)) {
800 brelse(*raw_super_buf); 819 brelse(*raw_super_buf);
801 f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " 820 f2fs_msg(sb, KERN_ERR,
802 "in %dth superblock", block + 1); 821 "Can't find valid F2FS filesystem in %dth superblock",
803 if(block == 0) { 822 block + 1);
823 if (block == 0) {
804 block++; 824 block++;
805 goto retry; 825 goto retry;
806 } else { 826 } else {
@@ -818,6 +838,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
818 struct buffer_head *raw_super_buf; 838 struct buffer_head *raw_super_buf;
819 struct inode *root; 839 struct inode *root;
820 long err = -EINVAL; 840 long err = -EINVAL;
841 int i;
821 842
822 /* allocate memory for f2fs-specific super block info */ 843 /* allocate memory for f2fs-specific super block info */
823 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); 844 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
@@ -825,7 +846,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
825 return -ENOMEM; 846 return -ENOMEM;
826 847
827 /* set a block size */ 848 /* set a block size */
828 if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { 849 if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) {
829 f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); 850 f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
830 goto free_sbi; 851 goto free_sbi;
831 } 852 }
@@ -874,7 +895,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
874 mutex_init(&sbi->node_write); 895 mutex_init(&sbi->node_write);
875 sbi->por_doing = false; 896 sbi->por_doing = false;
876 spin_lock_init(&sbi->stat_lock); 897 spin_lock_init(&sbi->stat_lock);
877 init_rwsem(&sbi->bio_sem); 898
899 mutex_init(&sbi->read_io.io_mutex);
900 sbi->read_io.sbi = sbi;
901 sbi->read_io.bio = NULL;
902 for (i = 0; i < NR_PAGE_TYPE; i++) {
903 mutex_init(&sbi->write_io[i].io_mutex);
904 sbi->write_io[i].sbi = sbi;
905 sbi->write_io[i].bio = NULL;
906 }
907
878 init_rwsem(&sbi->cp_rwsem); 908 init_rwsem(&sbi->cp_rwsem);
879 init_waitqueue_head(&sbi->cp_wait); 909 init_waitqueue_head(&sbi->cp_wait);
880 init_sb_info(sbi); 910 init_sb_info(sbi);
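
The single bio_sem is replaced here by one merge point for reads plus one per page type for writes, so unrelated page types no longer serialize on the same lock. A sketch of the per-type channel being initialized above — the struct layout is inferred from the patch context, not taken from the f2fs headers (build with -pthread):

#include <pthread.h>
#include <stdio.h>

#define NR_PAGE_TYPE 3 /* DATA, NODE, META */

/* Inferred shape of the per-type merged-bio channel replacing bio_sem. */
struct io_channel {
	pthread_mutex_t io_mutex; /* serializes merge vs. submit per type */
	void *bio;                /* bio open for merging; NULL when idle */
};

int main(void)
{
	struct io_channel read_io, write_io[NR_PAGE_TYPE];
	int i;

	pthread_mutex_init(&read_io.io_mutex, NULL);
	read_io.bio = NULL;
	for (i = 0; i < NR_PAGE_TYPE; i++) {
		pthread_mutex_init(&write_io[i].io_mutex, NULL);
		write_io[i].bio = NULL;
	}
	printf("initialized 1 read + %d write channels\n", NR_PAGE_TYPE);
	return 0;
}
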
@@ -939,9 +969,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
939 } 969 }
940 970
 941 /* if there are any orphan nodes, free them */	 972 recover_orphan_inodes(sbi);
942 err = -EINVAL; 972 recover_orphan_inodes(sbi);
943 if (recover_orphan_inodes(sbi))
944 goto free_node_inode;
945 973
946 /* read root inode and dentry */ 974 /* read root inode and dentry */
947 root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); 975 root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
@@ -950,8 +978,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
950 err = PTR_ERR(root); 978 err = PTR_ERR(root);
951 goto free_node_inode; 979 goto free_node_inode;
952 } 980 }
953 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) 981 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
982 err = -EINVAL;
954 goto free_root_inode; 983 goto free_root_inode;
984 }
955 985
956 sb->s_root = d_make_root(root); /* allocate root dentry */ 986 sb->s_root = d_make_root(root); /* allocate root dentry */
957 if (!sb->s_root) { 987 if (!sb->s_root) {
@@ -1053,7 +1083,7 @@ static int __init init_inodecache(void)
1053{ 1083{
1054 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", 1084 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
1055 sizeof(struct f2fs_inode_info), NULL); 1085 sizeof(struct f2fs_inode_info), NULL);
1056 if (f2fs_inode_cachep == NULL) 1086 if (!f2fs_inode_cachep)
1057 return -ENOMEM; 1087 return -ENOMEM;
1058 return 0; 1088 return 0;
1059} 1089}
@@ -1078,9 +1108,12 @@ static int __init init_f2fs_fs(void)
1078 err = create_node_manager_caches(); 1108 err = create_node_manager_caches();
1079 if (err) 1109 if (err)
1080 goto free_inodecache; 1110 goto free_inodecache;
1081 err = create_gc_caches(); 1111 err = create_segment_manager_caches();
1082 if (err) 1112 if (err)
1083 goto free_node_manager_caches; 1113 goto free_node_manager_caches;
1114 err = create_gc_caches();
1115 if (err)
1116 goto free_segment_manager_caches;
1084 err = create_checkpoint_caches(); 1117 err = create_checkpoint_caches();
1085 if (err) 1118 if (err)
1086 goto free_gc_caches; 1119 goto free_gc_caches;
@@ -1102,6 +1135,8 @@ free_checkpoint_caches:
1102 destroy_checkpoint_caches(); 1135 destroy_checkpoint_caches();
1103free_gc_caches: 1136free_gc_caches:
1104 destroy_gc_caches(); 1137 destroy_gc_caches();
1138free_segment_manager_caches:
1139 destroy_segment_manager_caches();
1105free_node_manager_caches: 1140free_node_manager_caches:
1106 destroy_node_manager_caches(); 1141 destroy_node_manager_caches();
1107free_inodecache: 1142free_inodecache:
@@ -1117,6 +1152,7 @@ static void __exit exit_f2fs_fs(void)
1117 unregister_filesystem(&f2fs_fs_type); 1152 unregister_filesystem(&f2fs_fs_type);
1118 destroy_checkpoint_caches(); 1153 destroy_checkpoint_caches();
1119 destroy_gc_caches(); 1154 destroy_gc_caches();
1155 destroy_segment_manager_caches();
1120 destroy_node_manager_caches(); 1156 destroy_node_manager_caches();
1121 destroy_inodecache(); 1157 destroy_inodecache();
1122 kset_unregister(f2fs_kset); 1158 kset_unregister(f2fs_kset);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index aa7a3f139fe5..89d0422a91a8 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -21,6 +21,7 @@
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/f2fs_fs.h> 22#include <linux/f2fs_fs.h>
23#include <linux/security.h> 23#include <linux/security.h>
24#include <linux/posix_acl_xattr.h>
24#include "f2fs.h" 25#include "f2fs.h"
25#include "xattr.h" 26#include "xattr.h"
26 27
@@ -216,8 +217,8 @@ const struct xattr_handler f2fs_xattr_security_handler = {
216static const struct xattr_handler *f2fs_xattr_handler_map[] = { 217static const struct xattr_handler *f2fs_xattr_handler_map[] = {
217 [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, 218 [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
218#ifdef CONFIG_F2FS_FS_POSIX_ACL 219#ifdef CONFIG_F2FS_FS_POSIX_ACL
219 [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler, 220 [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
220 [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, 221 [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
221#endif 222#endif
222 [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, 223 [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
223#ifdef CONFIG_F2FS_FS_SECURITY 224#ifdef CONFIG_F2FS_FS_SECURITY
@@ -229,8 +230,8 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
229const struct xattr_handler *f2fs_xattr_handlers[] = { 230const struct xattr_handler *f2fs_xattr_handlers[] = {
230 &f2fs_xattr_user_handler, 231 &f2fs_xattr_user_handler,
231#ifdef CONFIG_F2FS_FS_POSIX_ACL 232#ifdef CONFIG_F2FS_FS_POSIX_ACL
232 &f2fs_xattr_acl_access_handler, 233 &posix_acl_access_xattr_handler,
233 &f2fs_xattr_acl_default_handler, 234 &posix_acl_default_xattr_handler,
234#endif 235#endif
235 &f2fs_xattr_trusted_handler, 236 &f2fs_xattr_trusted_handler,
236#ifdef CONFIG_F2FS_FS_SECURITY 237#ifdef CONFIG_F2FS_FS_SECURITY
@@ -522,7 +523,7 @@ static int __f2fs_setxattr(struct inode *inode, int name_index,
522 if (found) 523 if (found)
523 free = free + ENTRY_SIZE(here); 524 free = free + ENTRY_SIZE(here);
524 525
525 if (free < newsize) { 526 if (unlikely(free < newsize)) {
526 error = -ENOSPC; 527 error = -ENOSPC;
527 goto exit; 528 goto exit;
528 } 529 }
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 02a08fb88a15..b21d9ebdeff3 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -108,8 +108,6 @@ struct f2fs_xattr_entry {
108#ifdef CONFIG_F2FS_FS_XATTR 108#ifdef CONFIG_F2FS_FS_XATTR
109extern const struct xattr_handler f2fs_xattr_user_handler; 109extern const struct xattr_handler f2fs_xattr_user_handler;
110extern const struct xattr_handler f2fs_xattr_trusted_handler; 110extern const struct xattr_handler f2fs_xattr_trusted_handler;
111extern const struct xattr_handler f2fs_xattr_acl_access_handler;
112extern const struct xattr_handler f2fs_xattr_acl_default_handler;
113extern const struct xattr_handler f2fs_xattr_advise_handler; 111extern const struct xattr_handler f2fs_xattr_advise_handler;
114extern const struct xattr_handler f2fs_xattr_security_handler; 112extern const struct xattr_handler f2fs_xattr_security_handler;
115 113
diff --git a/fs/file.c b/fs/file.c
index 4a78f981557a..eb56a13dab3e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -34,7 +34,7 @@ static void *alloc_fdmem(size_t size)
34 * vmalloc() if the allocation size will be considered "large" by the VM. 34 * vmalloc() if the allocation size will be considered "large" by the VM.
35 */ 35 */
36 if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 36 if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
37 void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); 37 void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
38 if (data != NULL) 38 if (data != NULL)
39 return data; 39 return data;
40 } 40 }
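
With __GFP_NORETRY, a "small" fd-table allocation that cannot be satisfied cheaply now fails fast instead of invoking heavy reclaim, and the caller falls through to vmalloc(). A user-space analogue of the two-tier strategy (malloc and calloc stand in for kmalloc and vmalloc; the limit models PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER):

#include <stdio.h>
#include <stdlib.h>

#define CHEAP_LIMIT (4096 * 8) /* PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER */

static void *alloc_fdmem_model(size_t size)
{
	if (size <= CHEAP_LIMIT) {
		void *data = malloc(size); /* kmalloc(..., NOWARN|NORETRY) */
		if (data)
			return data;
		/* don't retry hard: fall through to the "vmalloc" path */
	}
	return calloc(1, size); /* stand-in for vmalloc() */
}

int main(void)
{
	void *p = alloc_fdmem_model(1024);

	printf("%s\n", p ? "allocated" : "failed");
	free(p);
	return 0;
}
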
@@ -348,21 +348,16 @@ out:
348 return NULL; 348 return NULL;
349} 349}
350 350
351static void close_files(struct files_struct * files) 351static struct fdtable *close_files(struct files_struct * files)
352{ 352{
353 int i, j;
354 struct fdtable *fdt;
355
356 j = 0;
357
358 /* 353 /*
359 * It is safe to dereference the fd table without RCU or 354 * It is safe to dereference the fd table without RCU or
360 * ->file_lock because this is the last reference to the 355 * ->file_lock because this is the last reference to the
361 * files structure. But use RCU to shut RCU-lockdep up. 356 * files structure.
362 */ 357 */
363 rcu_read_lock(); 358 struct fdtable *fdt = rcu_dereference_raw(files->fdt);
364 fdt = files_fdtable(files); 359 int i, j = 0;
365 rcu_read_unlock(); 360
366 for (;;) { 361 for (;;) {
367 unsigned long set; 362 unsigned long set;
368 i = j * BITS_PER_LONG; 363 i = j * BITS_PER_LONG;
@@ -381,6 +376,8 @@ static void close_files(struct files_struct * files)
381 set >>= 1; 376 set >>= 1;
382 } 377 }
383 } 378 }
379
380 return fdt;
384} 381}
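
close_files() walks the open_fds bitmap one long at a time: grab a word, test the low bit, shift, with the fd number recovered as j * BITS_PER_LONG + bit. The same walk as a standalone program with an invented bitmap:

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

int main(void)
{
	/* pretend open_fds: fds 0, 1 and 70 are open */
	unsigned long open_fds[2] = { 0x3UL, 1UL << (70 % BITS_PER_LONG) };
	unsigned max_fds = 2 * BITS_PER_LONG;
	unsigned i, j = 0;

	for (;;) {
		unsigned long set;

		i = j * BITS_PER_LONG;
		if (i >= max_fds)
			break;
		set = open_fds[j++];
		while (set) {
			if (set & 1)
				printf("closing fd %u\n", i);
			i++;
			set >>= 1;
		}
	}
	return 0;
}
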
385 382
386struct files_struct *get_files_struct(struct task_struct *task) 383struct files_struct *get_files_struct(struct task_struct *task)
@@ -398,14 +395,9 @@ struct files_struct *get_files_struct(struct task_struct *task)
398 395
399void put_files_struct(struct files_struct *files) 396void put_files_struct(struct files_struct *files)
400{ 397{
401 struct fdtable *fdt;
402
403 if (atomic_dec_and_test(&files->count)) { 398 if (atomic_dec_and_test(&files->count)) {
404 close_files(files); 399 struct fdtable *fdt = close_files(files);
405 /* not really needed, since nobody can see us */ 400
406 rcu_read_lock();
407 fdt = files_fdtable(files);
408 rcu_read_unlock();
409 /* free the arrays if they are not embedded */ 401 /* free the arrays if they are not embedded */
410 if (fdt != &files->fdtab) 402 if (fdt != &files->fdtab)
411 __free_fdtable(fdt); 403 __free_fdtable(fdt);
@@ -645,16 +637,16 @@ void do_close_on_exec(struct files_struct *files)
645 spin_unlock(&files->file_lock); 637 spin_unlock(&files->file_lock);
646} 638}
647 639
648struct file *fget(unsigned int fd) 640static struct file *__fget(unsigned int fd, fmode_t mask)
649{ 641{
650 struct file *file;
651 struct files_struct *files = current->files; 642 struct files_struct *files = current->files;
643 struct file *file;
652 644
653 rcu_read_lock(); 645 rcu_read_lock();
654 file = fcheck_files(files, fd); 646 file = fcheck_files(files, fd);
655 if (file) { 647 if (file) {
656 /* File object ref couldn't be taken */ 648 /* File object ref couldn't be taken */
657 if (file->f_mode & FMODE_PATH || 649 if ((file->f_mode & mask) ||
658 !atomic_long_inc_not_zero(&file->f_count)) 650 !atomic_long_inc_not_zero(&file->f_count))
659 file = NULL; 651 file = NULL;
660 } 652 }
@@ -663,25 +655,16 @@ struct file *fget(unsigned int fd)
663 return file; 655 return file;
664} 656}
665 657
658struct file *fget(unsigned int fd)
659{
660 return __fget(fd, FMODE_PATH);
661}
666EXPORT_SYMBOL(fget); 662EXPORT_SYMBOL(fget);
667 663
668struct file *fget_raw(unsigned int fd) 664struct file *fget_raw(unsigned int fd)
669{ 665{
670 struct file *file; 666 return __fget(fd, 0);
671 struct files_struct *files = current->files;
672
673 rcu_read_lock();
674 file = fcheck_files(files, fd);
675 if (file) {
676 /* File object ref couldn't be taken */
677 if (!atomic_long_inc_not_zero(&file->f_count))
678 file = NULL;
679 }
680 rcu_read_unlock();
681
682 return file;
683} 667}
684
685EXPORT_SYMBOL(fget_raw); 668EXPORT_SYMBOL(fget_raw);
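
Folding fget() and fget_raw() into __fget() works because the only difference between them was whether FMODE_PATH files are rejected: pass FMODE_PATH as the mask to filter them out, or 0 to accept everything. A minimal model of the mask test (0x4000 matches the kernel's FMODE_PATH value at the time of this merge; the struct is simplified):

#include <stdio.h>

#define FMODE_PATH 0x4000

struct file { unsigned f_mode; };

static struct file *fget_model(struct file *f, unsigned mask)
{
	if (!f || (f->f_mode & mask))
		return NULL; /* filtered out, as __fget() does */
	return f;
}

int main(void)
{
	struct file pathonly = { FMODE_PATH }, normal = { 0 };

	/* fget(): mask = FMODE_PATH rejects O_PATH files */
	printf("%p\n", (void *)fget_model(&pathonly, FMODE_PATH)); /* NULL */
	/* fget_raw(): mask = 0 accepts them */
	printf("%p\n", (void *)fget_model(&pathonly, 0));
	printf("%p\n", (void *)fget_model(&normal, FMODE_PATH));
	return 0;
}
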
686 669
687/* 670/*
@@ -700,58 +683,54 @@ EXPORT_SYMBOL(fget_raw);
700 * The fput_needed flag returned by fget_light should be passed to the 683 * The fput_needed flag returned by fget_light should be passed to the
701 * corresponding fput_light. 684 * corresponding fput_light.
702 */ 685 */
703struct file *fget_light(unsigned int fd, int *fput_needed) 686static unsigned long __fget_light(unsigned int fd, fmode_t mask)
704{ 687{
705 struct file *file;
706 struct files_struct *files = current->files; 688 struct files_struct *files = current->files;
689 struct file *file;
707 690
708 *fput_needed = 0;
709 if (atomic_read(&files->count) == 1) { 691 if (atomic_read(&files->count) == 1) {
710 file = fcheck_files(files, fd); 692 file = __fcheck_files(files, fd);
711 if (file && (file->f_mode & FMODE_PATH)) 693 if (!file || unlikely(file->f_mode & mask))
712 file = NULL; 694 return 0;
695 return (unsigned long)file;
713 } else { 696 } else {
714 rcu_read_lock(); 697 file = __fget(fd, mask);
715 file = fcheck_files(files, fd); 698 if (!file)
716 if (file) { 699 return 0;
717 if (!(file->f_mode & FMODE_PATH) && 700 return FDPUT_FPUT | (unsigned long)file;
718 atomic_long_inc_not_zero(&file->f_count))
719 *fput_needed = 1;
720 else
721 /* Didn't get the reference, someone's freed */
722 file = NULL;
723 }
724 rcu_read_unlock();
725 } 701 }
702}
703unsigned long __fdget(unsigned int fd)
704{
705 return __fget_light(fd, FMODE_PATH);
706}
707EXPORT_SYMBOL(__fdget);
726 708
727 return file; 709unsigned long __fdget_raw(unsigned int fd)
710{
711 return __fget_light(fd, 0);
728} 712}
729EXPORT_SYMBOL(fget_light);
730 713
731struct file *fget_raw_light(unsigned int fd, int *fput_needed) 714unsigned long __fdget_pos(unsigned int fd)
732{ 715{
733 struct file *file; 716 unsigned long v = __fdget(fd);
734 struct files_struct *files = current->files; 717 struct file *file = (struct file *)(v & ~3);
735 718
736 *fput_needed = 0; 719 if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
737 if (atomic_read(&files->count) == 1) { 720 if (file_count(file) > 1) {
738 file = fcheck_files(files, fd); 721 v |= FDPUT_POS_UNLOCK;
739 } else { 722 mutex_lock(&file->f_pos_lock);
740 rcu_read_lock();
741 file = fcheck_files(files, fd);
742 if (file) {
743 if (atomic_long_inc_not_zero(&file->f_count))
744 *fput_needed = 1;
745 else
746 /* Didn't get the reference, someone's freed */
747 file = NULL;
748 } 723 }
749 rcu_read_unlock();
750 } 724 }
751 725 return v;
752 return file;
753} 726}
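
__fget_light() can return the struct file pointer with status packed into its low two bits, because files are allocated with at least 4-byte alignment: FDPUT_FPUT records that a reference was taken, FDPUT_POS_UNLOCK that f_pos_lock is held. A sketch of the encode/decode behind the `v & ~3` above (the flag values mirror fdtable.h; the struct is a stand-in):

#include <stdio.h>

#define FDPUT_FPUT       1UL
#define FDPUT_POS_UNLOCK 2UL

struct file { long dummy; } f; /* alignment >= 4 keeps the low bits free */

int main(void)
{
	unsigned long v = (unsigned long)&f | FDPUT_FPUT | FDPUT_POS_UNLOCK;

	struct file *file = (struct file *)(v & ~3UL); /* strip flag bits */
	printf("file recovered: %s\n", file == &f ? "yes" : "no");
	printf("need fput: %lu, need pos unlock: %lu\n",
	       v & FDPUT_FPUT, (v & FDPUT_POS_UNLOCK) >> 1);
	return 0;
}
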
754 727
728/*
729 * We only lock f_pos if we have threads or if the file might be
730 * shared with another process. In both cases we'll have an elevated
731 * file count (done either by fdget() or by fork()).
732 */
733
755void set_close_on_exec(unsigned int fd, int flag) 734void set_close_on_exec(unsigned int fd, int flag)
756{ 735{
757 struct files_struct *files = current->files; 736 struct files_struct *files = current->files;
diff --git a/fs/file_table.c b/fs/file_table.c
index 5fff9030be34..5b24008ea4f6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -135,6 +135,7 @@ struct file *get_empty_filp(void)
135 atomic_long_set(&f->f_count, 1); 135 atomic_long_set(&f->f_count, 1);
136 rwlock_init(&f->f_owner.lock); 136 rwlock_init(&f->f_owner.lock);
137 spin_lock_init(&f->f_lock); 137 spin_lock_init(&f->f_lock);
138 mutex_init(&f->f_pos_lock);
138 eventpoll_init_file(f); 139 eventpoll_init_file(f);
139 /* f->f_version: 0 */ 140 /* f->f_version: 0 */
140 return f; 141 return f;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1f4a10ece2f1..d754e3cf99a8 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -40,18 +40,13 @@
40struct wb_writeback_work { 40struct wb_writeback_work {
41 long nr_pages; 41 long nr_pages;
42 struct super_block *sb; 42 struct super_block *sb;
43 /* 43 unsigned long *older_than_this;
44 * Write only inodes dirtied before this time. Don't forget to set
45 * older_than_this_is_set when you set this.
46 */
47 unsigned long older_than_this;
48 enum writeback_sync_modes sync_mode; 44 enum writeback_sync_modes sync_mode;
49 unsigned int tagged_writepages:1; 45 unsigned int tagged_writepages:1;
50 unsigned int for_kupdate:1; 46 unsigned int for_kupdate:1;
51 unsigned int range_cyclic:1; 47 unsigned int range_cyclic:1;
52 unsigned int for_background:1; 48 unsigned int for_background:1;
53 unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ 49 unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
54 unsigned int older_than_this_is_set:1;
55 enum wb_reason reason; /* why was writeback initiated? */ 50 enum wb_reason reason; /* why was writeback initiated? */
56 51
57 struct list_head list; /* pending work list */ 52 struct list_head list; /* pending work list */
@@ -252,10 +247,10 @@ static int move_expired_inodes(struct list_head *delaying_queue,
252 int do_sb_sort = 0; 247 int do_sb_sort = 0;
253 int moved = 0; 248 int moved = 0;
254 249
255 WARN_ON_ONCE(!work->older_than_this_is_set);
256 while (!list_empty(delaying_queue)) { 250 while (!list_empty(delaying_queue)) {
257 inode = wb_inode(delaying_queue->prev); 251 inode = wb_inode(delaying_queue->prev);
258 if (inode_dirtied_after(inode, work->older_than_this)) 252 if (work->older_than_this &&
253 inode_dirtied_after(inode, *work->older_than_this))
259 break; 254 break;
260 list_move(&inode->i_wb_list, &tmp); 255 list_move(&inode->i_wb_list, &tmp);
261 moved++; 256 moved++;
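
Replacing the older_than_this value plus its _is_set flag with a pointer makes "no cutoff" simply NULL: wb_writeback() points it at a stack variable it can retune each pass, and move_expired_inodes() checks the pointer before dereferencing it. The pattern in miniature:

#include <stdio.h>
#include <stddef.h>

/* NULL pointer == "no cutoff", replacing a value + is_set flag pair */
static int expired(unsigned long dirtied, const unsigned long *older_than_this)
{
	if (older_than_this && dirtied > *older_than_this)
		return 0; /* dirtied after the cutoff: not expired yet */
	return 1;
}

int main(void)
{
	unsigned long cutoff = 1000;

	printf("%d\n", expired(900, &cutoff));  /* 1: before the cutoff */
	printf("%d\n", expired(1100, &cutoff)); /* 0: too new */
	printf("%d\n", expired(1100, NULL));    /* 1: no cutoff at all */
	return 0;
}
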
@@ -516,13 +511,16 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
516 } 511 }
517 WARN_ON(inode->i_state & I_SYNC); 512 WARN_ON(inode->i_state & I_SYNC);
518 /* 513 /*
519 * Skip inode if it is clean. We don't want to mess with writeback 514 * Skip inode if it is clean and we have no outstanding writeback in
520 * lists in this function since flusher thread may be doing for example 515 * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
521 * sync in parallel and if we move the inode, it could get skipped. So 516 * function since flusher thread may be doing for example sync in
522 * here we make sure inode is on some writeback list and leave it there 517 * parallel and if we move the inode, it could get skipped. So here we
523 * unless we have completely cleaned the inode. 518 * make sure inode is on some writeback list and leave it there unless
519 * we have completely cleaned the inode.
524 */ 520 */
525 if (!(inode->i_state & I_DIRTY)) 521 if (!(inode->i_state & I_DIRTY) &&
522 (wbc->sync_mode != WB_SYNC_ALL ||
523 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
526 goto out; 524 goto out;
527 inode->i_state |= I_SYNC; 525 inode->i_state |= I_SYNC;
528 spin_unlock(&inode->i_lock); 526 spin_unlock(&inode->i_lock);
@@ -739,8 +737,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
739 .sync_mode = WB_SYNC_NONE, 737 .sync_mode = WB_SYNC_NONE,
740 .range_cyclic = 1, 738 .range_cyclic = 1,
741 .reason = reason, 739 .reason = reason,
742 .older_than_this = jiffies,
743 .older_than_this_is_set = 1,
744 }; 740 };
745 741
746 spin_lock(&wb->list_lock); 742 spin_lock(&wb->list_lock);
@@ -799,13 +795,12 @@ static long wb_writeback(struct bdi_writeback *wb,
799{ 795{
800 unsigned long wb_start = jiffies; 796 unsigned long wb_start = jiffies;
801 long nr_pages = work->nr_pages; 797 long nr_pages = work->nr_pages;
798 unsigned long oldest_jif;
802 struct inode *inode; 799 struct inode *inode;
803 long progress; 800 long progress;
804 801
805 if (!work->older_than_this_is_set) { 802 oldest_jif = jiffies;
806 work->older_than_this = jiffies; 803 work->older_than_this = &oldest_jif;
807 work->older_than_this_is_set = 1;
808 }
809 804
810 spin_lock(&wb->list_lock); 805 spin_lock(&wb->list_lock);
811 for (;;) { 806 for (;;) {
@@ -839,10 +834,10 @@ static long wb_writeback(struct bdi_writeback *wb,
839 * safe. 834 * safe.
840 */ 835 */
841 if (work->for_kupdate) { 836 if (work->for_kupdate) {
842 work->older_than_this = jiffies - 837 oldest_jif = jiffies -
843 msecs_to_jiffies(dirty_expire_interval * 10); 838 msecs_to_jiffies(dirty_expire_interval * 10);
844 } else if (work->for_background) 839 } else if (work->for_background)
845 work->older_than_this = jiffies; 840 oldest_jif = jiffies;
846 841
847 trace_writeback_start(wb->bdi, work); 842 trace_writeback_start(wb->bdi, work);
848 if (list_empty(&wb->b_io)) 843 if (list_empty(&wb->b_io))
@@ -1354,21 +1349,18 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
1354 1349
1355/** 1350/**
1356 * sync_inodes_sb - sync sb inode pages 1351 * sync_inodes_sb - sync sb inode pages
1357 * @sb: the superblock 1352 * @sb: the superblock
1358 * @older_than_this: timestamp
1359 * 1353 *
1360 * This function writes and waits on any dirty inode belonging to this 1354 * This function writes and waits on any dirty inode belonging to this
1361 * superblock that has been dirtied before given timestamp. 1355 * super_block.
1362 */ 1356 */
1363void sync_inodes_sb(struct super_block *sb, unsigned long older_than_this) 1357void sync_inodes_sb(struct super_block *sb)
1364{ 1358{
1365 DECLARE_COMPLETION_ONSTACK(done); 1359 DECLARE_COMPLETION_ONSTACK(done);
1366 struct wb_writeback_work work = { 1360 struct wb_writeback_work work = {
1367 .sb = sb, 1361 .sb = sb,
1368 .sync_mode = WB_SYNC_ALL, 1362 .sync_mode = WB_SYNC_ALL,
1369 .nr_pages = LONG_MAX, 1363 .nr_pages = LONG_MAX,
1370 .older_than_this = older_than_this,
1371 .older_than_this_is_set = 1,
1372 .range_cyclic = 0, 1364 .range_cyclic = 0,
1373 .done = &done, 1365 .done = &done,
1374 .reason = WB_REASON_SYNC, 1366 .reason = WB_REASON_SYNC,
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index e1959efad64f..b5ebc2d7d80d 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -50,6 +50,8 @@ void fscache_objlist_add(struct fscache_object *obj)
50 struct fscache_object *xobj; 50 struct fscache_object *xobj;
51 struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL; 51 struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL;
52 52
53 ASSERT(RB_EMPTY_NODE(&obj->objlist_link));
54
53 write_lock(&fscache_object_list_lock); 55 write_lock(&fscache_object_list_lock);
54 56
55 while (*p) { 57 while (*p) {
@@ -75,6 +77,9 @@ void fscache_objlist_add(struct fscache_object *obj)
75 */ 77 */
76void fscache_objlist_remove(struct fscache_object *obj) 78void fscache_objlist_remove(struct fscache_object *obj)
77{ 79{
80 if (RB_EMPTY_NODE(&obj->objlist_link))
81 return;
82
78 write_lock(&fscache_object_list_lock); 83 write_lock(&fscache_object_list_lock);
79 84
80 BUG_ON(RB_EMPTY_ROOT(&fscache_object_list)); 85 BUG_ON(RB_EMPTY_ROOT(&fscache_object_list));
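
The fscache fix pairs RB_CLEAR_NODE() at object-init time with an RB_EMPTY_NODE() test in the remove path, so removing an object that was never added to the list becomes a harmless no-op rather than rbtree corruption. The idiom in miniature, with a flag standing in for the rbtree's empty-node encoding:

#include <stdio.h>
#include <stdbool.h>

struct node { bool linked; }; /* models the RB_EMPTY_NODE() state */

static void objlist_add(struct node *n)    { n->linked = true; }

static void objlist_remove(struct node *n)
{
	if (!n->linked)		/* RB_EMPTY_NODE(&obj->objlist_link) */
		return;		/* never added: nothing to unlink */
	n->linked = false;	/* real code would rb_erase() here */
}

int main(void)
{
	struct node n = { false }; /* RB_CLEAR_NODE() at init time */

	objlist_remove(&n); /* now a safe no-op */
	objlist_add(&n);
	objlist_remove(&n);
	printf("ok\n");
	return 0;
}
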
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 53d35c504240..d3b4539f1651 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -314,6 +314,9 @@ void fscache_object_init(struct fscache_object *object,
314 object->cache = cache; 314 object->cache = cache;
315 object->cookie = cookie; 315 object->cookie = cookie;
316 object->parent = NULL; 316 object->parent = NULL;
317#ifdef CONFIG_FSCACHE_OBJECT_LIST
318 RB_CLEAR_NODE(&object->objlist_link);
319#endif
317 320
318 object->oob_event_mask = 0; 321 object->oob_event_mask = 0;
319 for (t = object->oob_table; t->events; t++) 322 for (t = object->oob_table; t->events; t++)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ef74ad5fd362..0a648bb455ae 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1296,22 +1296,6 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1296 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); 1296 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
1297} 1297}
1298 1298
1299static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
1300 struct pipe_buffer *buf)
1301{
1302 return 1;
1303}
1304
1305static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
1306 .can_merge = 0,
1307 .map = generic_pipe_buf_map,
1308 .unmap = generic_pipe_buf_unmap,
1309 .confirm = generic_pipe_buf_confirm,
1310 .release = generic_pipe_buf_release,
1311 .steal = fuse_dev_pipe_buf_steal,
1312 .get = generic_pipe_buf_get,
1313};
1314
1315static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, 1299static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1316 struct pipe_inode_info *pipe, 1300 struct pipe_inode_info *pipe,
1317 size_t len, unsigned int flags) 1301 size_t len, unsigned int flags)
@@ -1358,7 +1342,11 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1358 buf->page = bufs[page_nr].page; 1342 buf->page = bufs[page_nr].page;
1359 buf->offset = bufs[page_nr].offset; 1343 buf->offset = bufs[page_nr].offset;
1360 buf->len = bufs[page_nr].len; 1344 buf->len = bufs[page_nr].len;
1361 buf->ops = &fuse_dev_pipe_buf_ops; 1345 /*
1346 * Need to be careful about this. Having buf->ops in module
1347 * code can Oops if the buffer persists after module unload.
1348 */
1349 buf->ops = &nosteal_pipe_buf_ops;
1362 1350
1363 pipe->nrbufs++; 1351 pipe->nrbufs++;
1364 page_nr++; 1352 page_nr++;
@@ -1599,7 +1587,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
1599 1587
1600 this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); 1588 this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
1601 err = fuse_copy_page(cs, &page, offset, this_num, 0); 1589 err = fuse_copy_page(cs, &page, offset, this_num, 0);
1602 if (!err && offset == 0 && (num != 0 || file_size == end)) 1590 if (!err && offset == 0 &&
1591 (this_num == PAGE_CACHE_SIZE || file_size == end))
1603 SetPageUptodate(page); 1592 SetPageUptodate(page);
1604 unlock_page(page); 1593 unlock_page(page);
1605 page_cache_release(page); 1594 page_cache_release(page);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index c3eb2c46c8f1..1d1292c581c3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -112,6 +112,16 @@ void fuse_invalidate_attr(struct inode *inode)
112 get_fuse_inode(inode)->i_time = 0; 112 get_fuse_inode(inode)->i_time = 0;
113} 113}
114 114
115/**
116 * Mark the attributes as stale due to an atime change. Avoid the invalidate if
117 * atime is not used.
118 */
119void fuse_invalidate_atime(struct inode *inode)
120{
121 if (!IS_RDONLY(inode))
122 fuse_invalidate_attr(inode);
123}
124
115/* 125/*
116 * Just mark the entry as stale, so that a next attempt to look it up 126 * Just mark the entry as stale, so that a next attempt to look it up
117 * will result in a new lookup call to userspace 127 * will result in a new lookup call to userspace
@@ -1371,7 +1381,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
1371 } 1381 }
1372 1382
1373 __free_page(page); 1383 __free_page(page);
1374 fuse_invalidate_attr(inode); /* atime changed */ 1384 fuse_invalidate_atime(inode);
1375 return err; 1385 return err;
1376} 1386}
1377 1387
@@ -1404,7 +1414,7 @@ static char *read_link(struct dentry *dentry)
1404 link[req->out.args[0].size] = '\0'; 1414 link[req->out.args[0].size] = '\0';
1405 out: 1415 out:
1406 fuse_put_request(fc, req); 1416 fuse_put_request(fc, req);
1407 fuse_invalidate_attr(inode); /* atime changed */ 1417 fuse_invalidate_atime(inode);
1408 return link; 1418 return link;
1409} 1419}
1410 1420
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 7e70506297bc..77bcc303c3ae 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -127,7 +127,15 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
127 if (atomic_dec_and_test(&ff->count)) { 127 if (atomic_dec_and_test(&ff->count)) {
128 struct fuse_req *req = ff->reserved_req; 128 struct fuse_req *req = ff->reserved_req;
129 129
130 if (sync) { 130 if (ff->fc->no_open) {
131 /*
132 * Drop the release request when client does not
133 * implement 'open'
134 */
135 req->background = 0;
136 path_put(&req->misc.release.path);
137 fuse_put_request(ff->fc, req);
138 } else if (sync) {
131 req->background = 0; 139 req->background = 0;
132 fuse_request_send(ff->fc, req); 140 fuse_request_send(ff->fc, req);
133 path_put(&req->misc.release.path); 141 path_put(&req->misc.release.path);
@@ -144,27 +152,36 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
144int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 152int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
145 bool isdir) 153 bool isdir)
146{ 154{
147 struct fuse_open_out outarg;
148 struct fuse_file *ff; 155 struct fuse_file *ff;
149 int err;
150 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; 156 int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
151 157
152 ff = fuse_file_alloc(fc); 158 ff = fuse_file_alloc(fc);
153 if (!ff) 159 if (!ff)
154 return -ENOMEM; 160 return -ENOMEM;
155 161
156 err = fuse_send_open(fc, nodeid, file, opcode, &outarg); 162 ff->fh = 0;
157 if (err) { 163 ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
158 fuse_file_free(ff); 164 if (!fc->no_open || isdir) {
159 return err; 165 struct fuse_open_out outarg;
166 int err;
167
168 err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
169 if (!err) {
170 ff->fh = outarg.fh;
171 ff->open_flags = outarg.open_flags;
172
173 } else if (err != -ENOSYS || isdir) {
174 fuse_file_free(ff);
175 return err;
176 } else {
177 fc->no_open = 1;
178 }
160 } 179 }
161 180
162 if (isdir) 181 if (isdir)
163 outarg.open_flags &= ~FOPEN_DIRECT_IO; 182 ff->open_flags &= ~FOPEN_DIRECT_IO;
164 183
165 ff->fh = outarg.fh;
166 ff->nodeid = nodeid; 184 ff->nodeid = nodeid;
167 ff->open_flags = outarg.open_flags;
168 file->private_data = fuse_file_get(ff); 185 file->private_data = fuse_file_get(ff);
169 186
170 return 0; 187 return 0;
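
fuse's new no_open bit caches an -ENOSYS reply: the first FUSE_OPEN the server fails with ENOSYS sets fc->no_open, and every later open skips the round trip and takes the FOPEN_KEEP_CACHE defaults. A condensed model of that negotiate-once pattern (the errno values are real; everything else is simplified):

#include <stdio.h>
#include <errno.h>

static int no_open; /* models fc->no_open */

static int send_open(void) { return -ENOSYS; } /* fs lacks an open method */

static int do_open(void)
{
	if (!no_open) {
		int err = send_open();

		if (!err)
			return 0;	/* use the server's reply */
		if (err != -ENOSYS)
			return err;	/* a real failure */
		no_open = 1;		/* remember: stop asking */
	}
	return 0; /* default open, FOPEN_KEEP_CACHE behaviour */
}

int main(void)
{
	do_open();	/* pays the ENOSYS round trip once */
	do_open();	/* short-circuits from now on */
	printf("no_open = %d\n", no_open);
	return 0;
}
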
@@ -687,7 +704,7 @@ static int fuse_readpage(struct file *file, struct page *page)
687 SetPageUptodate(page); 704 SetPageUptodate(page);
688 } 705 }
689 706
690 fuse_invalidate_attr(inode); /* atime changed */ 707 fuse_invalidate_atime(inode);
691 out: 708 out:
692 unlock_page(page); 709 unlock_page(page);
693 return err; 710 return err;
@@ -716,7 +733,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
716 fuse_read_update_size(inode, pos, 733 fuse_read_update_size(inode, pos,
717 req->misc.read.attr_ver); 734 req->misc.read.attr_ver);
718 } 735 }
719 fuse_invalidate_attr(inode); /* atime changed */ 736 fuse_invalidate_atime(inode);
720 } 737 }
721 738
722 for (i = 0; i < req->num_pages; i++) { 739 for (i = 0; i < req->num_pages; i++) {
@@ -2710,6 +2727,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2710 inode = file->f_mapping->host; 2727 inode = file->f_mapping->host;
2711 i_size = i_size_read(inode); 2728 i_size = i_size_read(inode);
2712 2729
2730 if ((rw == READ) && (offset > i_size))
2731 return 0;
2732
2713 /* optimization for short read */ 2733 /* optimization for short read */
2714 if (async_dio && rw != WRITE && offset + count > i_size) { 2734 if (async_dio && rw != WRITE && offset + count > i_size) {
2715 if (offset >= i_size) 2735 if (offset >= i_size)
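
The fuse_direct_IO hunk adds an early return for reads that start strictly beyond i_size; the offset == i_size case is already handled by the short-read optimization visible just below the added lines. A generic userspace sketch of the EOF clamp, folding both cases into one >= check for illustration:

#include <stdio.h>
#include <sys/types.h>

/* Generic EOF clamp for a direct read; illustrative only. */
static ssize_t direct_read(off_t offset, size_t count, off_t i_size)
{
	if (offset >= i_size)			/* nothing to read past EOF */
		return 0;
	if (offset + (off_t)count > i_size)	/* short read up to EOF */
		count = (size_t)(i_size - offset);
	return (ssize_t)count;	/* a real implementation would now do the I/O */
}

int main(void)
{
	printf("%zd\n", direct_read(100, 50, 80));	/* 0: starts past EOF */
	printf("%zd\n", direct_read(60, 50, 80));	/* 20: clamped to EOF */
	return 0;
}
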
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7d2730912667..2da5db2c8bdb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -485,6 +485,9 @@ struct fuse_conn {
485 * and hence races in setting them will not cause malfunction 485 * and hence races in setting them will not cause malfunction
486 */ 486 */
487 487
488 /** Is open/release not implemented by fs? */
489 unsigned no_open:1;
490
488 /** Is fsync not implemented by fs? */ 491 /** Is fsync not implemented by fs? */
489 unsigned no_fsync:1; 492 unsigned no_fsync:1;
490 493
@@ -788,6 +791,8 @@ void fuse_invalidate_attr(struct inode *inode);
788 791
789void fuse_invalidate_entry_cache(struct dentry *entry); 792void fuse_invalidate_entry_cache(struct dentry *entry);
790 793
794void fuse_invalidate_atime(struct inode *inode);
795
791/** 796/**
792 * Acquire reference to fuse_conn 797 * Acquire reference to fuse_conn
793 */ 798 */
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
deleted file mode 100644
index b3f3676796d3..000000000000
--- a/fs/generic_acl.c
+++ /dev/null
@@ -1,184 +0,0 @@
1/*
2 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
3 *
4 * This file is released under the GPL.
5 *
6 * Generic ACL support for in-memory filesystems.
7 */
8
9#include <linux/sched.h>
10#include <linux/gfp.h>
11#include <linux/fs.h>
12#include <linux/generic_acl.h>
13#include <linux/posix_acl.h>
14#include <linux/posix_acl_xattr.h>
15
16
17static size_t
18generic_acl_list(struct dentry *dentry, char *list, size_t list_size,
19 const char *name, size_t name_len, int type)
20{
21 struct posix_acl *acl;
22 const char *xname;
23 size_t size;
24
25 acl = get_cached_acl(dentry->d_inode, type);
26 if (!acl)
27 return 0;
28 posix_acl_release(acl);
29
30 switch (type) {
31 case ACL_TYPE_ACCESS:
32 xname = POSIX_ACL_XATTR_ACCESS;
33 break;
34 case ACL_TYPE_DEFAULT:
35 xname = POSIX_ACL_XATTR_DEFAULT;
36 break;
37 default:
38 return 0;
39 }
40 size = strlen(xname) + 1;
41 if (list && size <= list_size)
42 memcpy(list, xname, size);
43 return size;
44}
45
46static int
47generic_acl_get(struct dentry *dentry, const char *name, void *buffer,
48 size_t size, int type)
49{
50 struct posix_acl *acl;
51 int error;
52
53 if (strcmp(name, "") != 0)
54 return -EINVAL;
55
56 acl = get_cached_acl(dentry->d_inode, type);
57 if (!acl)
58 return -ENODATA;
59 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
60 posix_acl_release(acl);
61
62 return error;
63}
64
65static int
66generic_acl_set(struct dentry *dentry, const char *name, const void *value,
67 size_t size, int flags, int type)
68{
69 struct inode *inode = dentry->d_inode;
70 struct posix_acl *acl = NULL;
71 int error;
72
73 if (strcmp(name, "") != 0)
74 return -EINVAL;
75 if (S_ISLNK(inode->i_mode))
76 return -EOPNOTSUPP;
77 if (!inode_owner_or_capable(inode))
78 return -EPERM;
79 if (value) {
80 acl = posix_acl_from_xattr(&init_user_ns, value, size);
81 if (IS_ERR(acl))
82 return PTR_ERR(acl);
83 }
84 if (acl) {
85 error = posix_acl_valid(acl);
86 if (error)
87 goto failed;
88 switch (type) {
89 case ACL_TYPE_ACCESS:
90 error = posix_acl_equiv_mode(acl, &inode->i_mode);
91 if (error < 0)
92 goto failed;
93 inode->i_ctime = CURRENT_TIME;
94 if (error == 0) {
95 posix_acl_release(acl);
96 acl = NULL;
97 }
98 break;
99 case ACL_TYPE_DEFAULT:
100 if (!S_ISDIR(inode->i_mode)) {
101 error = -EINVAL;
102 goto failed;
103 }
104 break;
105 }
106 }
107 set_cached_acl(inode, type, acl);
108 error = 0;
109failed:
110 posix_acl_release(acl);
111 return error;
112}
113
114/**
115 * generic_acl_init - Take care of acl inheritance at @inode create time
116 *
117 * Files created inside a directory with a default ACL inherit the
118 * directory's default ACL.
119 */
120int
121generic_acl_init(struct inode *inode, struct inode *dir)
122{
123 struct posix_acl *acl = NULL;
124 int error;
125
126 if (!S_ISLNK(inode->i_mode))
127 acl = get_cached_acl(dir, ACL_TYPE_DEFAULT);
128 if (acl) {
129 if (S_ISDIR(inode->i_mode))
130 set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
131 error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
132 if (error < 0)
133 return error;
134 if (error > 0)
135 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
136 } else {
137 inode->i_mode &= ~current_umask();
138 }
139 error = 0;
140
141 posix_acl_release(acl);
142 return error;
143}
144
145/**
146 * generic_acl_chmod - change the access acl of @inode upon chmod()
147 *
148 * A chmod also changes the permissions of the owner, group/mask, and
149 * other ACL entries.
150 */
151int
152generic_acl_chmod(struct inode *inode)
153{
154 struct posix_acl *acl;
155 int error = 0;
156
157 if (S_ISLNK(inode->i_mode))
158 return -EOPNOTSUPP;
159 acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
160 if (acl) {
161 error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
162 if (error)
163 return error;
164 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
165 posix_acl_release(acl);
166 }
167 return error;
168}
169
170const struct xattr_handler generic_acl_access_handler = {
171 .prefix = POSIX_ACL_XATTR_ACCESS,
172 .flags = ACL_TYPE_ACCESS,
173 .list = generic_acl_list,
174 .get = generic_acl_get,
175 .set = generic_acl_set,
176};
177
178const struct xattr_handler generic_acl_default_handler = {
179 .prefix = POSIX_ACL_XATTR_DEFAULT,
180 .flags = ACL_TYPE_DEFAULT,
181 .list = generic_acl_list,
182 .get = generic_acl_get,
183 .set = generic_acl_set,
184};
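
fs/generic_acl.c is removed as part of the 3.14 consolidation of POSIX ACL handling into generic VFS helpers; the gfs2 hunks later in this diff show the replacement pattern (get_acl/set_acl wired into inode_operations plus posix_acl_chmod()). One rule from generic_acl_set() above carries over: an ACL_TYPE_ACCESS ACL that is equivalent to plain mode bits is folded into i_mode and not stored. A small userspace model of that decision, where equiv_mode() stands in for posix_acl_equiv_mode():

#include <stdio.h>

typedef unsigned short umode_t;
struct acl { int extended; };	/* stand-in for struct posix_acl */

/* Stand-in for posix_acl_equiv_mode(): 0 if the ACL reduces to mode
 * bits, 1 if it carries extra (named user/group) entries. */
static int equiv_mode(const struct acl *a, umode_t *mode)
{
	(void)mode;
	return a->extended ? 1 : 0;
}

/* Mirrors the ACL_TYPE_ACCESS branch above: an ACL equivalent to the
 * mode is not cached; only i_mode is kept. */
static const struct acl *set_access_acl(const struct acl *a, umode_t *i_mode)
{
	if (equiv_mode(a, i_mode) == 0)
		return NULL;	/* drop the ACL, the mode says it all */
	return a;		/* keep the extended ACL cached */
}

int main(void)
{
	umode_t mode = 0644;
	struct acl plain = { .extended = 0 }, rich = { .extended = 1 };

	printf("plain -> %s\n", set_access_acl(&plain, &mode) ? "kept" : "dropped");
	printf("rich  -> %s\n", set_access_acl(&rich, &mode) ? "kept" : "dropped");
	return 0;
}
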
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index f69ac0af5496..ba9456685f47 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -49,10 +49,6 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
49 if (!ip->i_eattr) 49 if (!ip->i_eattr)
50 return NULL; 50 return NULL;
51 51
52 acl = get_cached_acl(&ip->i_inode, type);
53 if (acl != ACL_NOT_CACHED)
54 return acl;
55
56 name = gfs2_acl_name(type); 52 name = gfs2_acl_name(type);
57 if (name == NULL) 53 if (name == NULL)
58 return ERR_PTR(-EINVAL); 54 return ERR_PTR(-EINVAL);
@@ -80,7 +76,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode)
80 return error; 76 return error;
81} 77}
82 78
83static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) 79int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
84{ 80{
85 int error; 81 int error;
86 int len; 82 int len;
@@ -88,219 +84,49 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
88 const char *name = gfs2_acl_name(type); 84 const char *name = gfs2_acl_name(type);
89 85
90 BUG_ON(name == NULL); 86 BUG_ON(name == NULL);
91 len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
92 if (len == 0)
93 return 0;
94 data = kmalloc(len, GFP_NOFS);
95 if (data == NULL)
96 return -ENOMEM;
97 error = posix_acl_to_xattr(&init_user_ns, acl, data, len);
98 if (error < 0)
99 goto out;
100 error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
101 if (!error)
102 set_cached_acl(inode, type, acl);
103out:
104 kfree(data);
105 return error;
106}
107
108int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
109{
110 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
111 struct posix_acl *acl;
112 umode_t mode = inode->i_mode;
113 int error = 0;
114
115 if (!sdp->sd_args.ar_posix_acl)
116 return 0;
117 if (S_ISLNK(inode->i_mode))
118 return 0;
119
120 acl = gfs2_get_acl(&dip->i_inode, ACL_TYPE_DEFAULT);
121 if (IS_ERR(acl))
122 return PTR_ERR(acl);
123 if (!acl) {
124 mode &= ~current_umask();
125 return gfs2_set_mode(inode, mode);
126 }
127
128 if (S_ISDIR(inode->i_mode)) {
129 error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl);
130 if (error)
131 goto out;
132 }
133
134 error = posix_acl_create(&acl, GFP_NOFS, &mode);
135 if (error < 0)
136 return error;
137 87
138 if (error == 0)
139 goto munge;
140
141 error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl);
142 if (error)
143 goto out;
144munge:
145 error = gfs2_set_mode(inode, mode);
146out:
147 posix_acl_release(acl);
148 return error;
149}
150
151int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
152{
153 struct inode *inode = &ip->i_inode;
154 struct posix_acl *acl;
155 char *data;
156 unsigned int len;
157 int error;
158
159 acl = gfs2_get_acl(&ip->i_inode, ACL_TYPE_ACCESS);
160 if (IS_ERR(acl))
161 return PTR_ERR(acl);
162 if (!acl)
163 return gfs2_setattr_simple(inode, attr);
164
165 error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode);
166 if (error)
167 return error;
168
169 len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
170 data = kmalloc(len, GFP_NOFS);
171 error = -ENOMEM;
172 if (data == NULL)
173 goto out;
174 posix_acl_to_xattr(&init_user_ns, acl, data, len);
175 error = gfs2_xattr_acl_chmod(ip, attr, data);
176 kfree(data);
177 set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl);
178
179out:
180 posix_acl_release(acl);
181 return error;
182}
183
184static int gfs2_acl_type(const char *name)
185{
186 if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0)
187 return ACL_TYPE_ACCESS;
188 if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0)
189 return ACL_TYPE_DEFAULT;
190 return -EINVAL;
191}
192
193static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
194 void *buffer, size_t size, int xtype)
195{
196 struct inode *inode = dentry->d_inode;
197 struct gfs2_sbd *sdp = GFS2_SB(inode);
198 struct posix_acl *acl;
199 int type;
200 int error;
201
202 if (!sdp->sd_args.ar_posix_acl)
203 return -EOPNOTSUPP;
204
205 type = gfs2_acl_type(name);
206 if (type < 0)
207 return type;
208
209 acl = gfs2_get_acl(inode, type);
210 if (IS_ERR(acl))
211 return PTR_ERR(acl);
212 if (acl == NULL)
213 return -ENODATA;
214
215 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
216 posix_acl_release(acl);
217
218 return error;
219}
220
221static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
222 const void *value, size_t size, int flags,
223 int xtype)
224{
225 struct inode *inode = dentry->d_inode;
226 struct gfs2_sbd *sdp = GFS2_SB(inode);
227 struct posix_acl *acl = NULL;
228 int error = 0, type;
229
230 if (!sdp->sd_args.ar_posix_acl)
231 return -EOPNOTSUPP;
232
233 type = gfs2_acl_type(name);
234 if (type < 0)
235 return type;
236 if (flags & XATTR_CREATE)
237 return -EINVAL;
238 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
239 return value ? -EACCES : 0;
240 if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER))
241 return -EPERM;
242 if (S_ISLNK(inode->i_mode))
243 return -EOPNOTSUPP;
244
245 if (!value)
246 goto set_acl;
247
248 acl = posix_acl_from_xattr(&init_user_ns, value, size);
249 if (!acl) {
250 /*
251 * acl_set_file(3) may request that we set default ACLs with
252 * zero length -- defend (gracefully) against that here.
253 */
254 goto out;
255 }
256 if (IS_ERR(acl)) {
257 error = PTR_ERR(acl);
258 goto out;
259 }
260
261 error = posix_acl_valid(acl);
262 if (error)
263 goto out_release;
264
265 error = -EINVAL;
266 if (acl->a_count > GFS2_ACL_MAX_ENTRIES) 88 if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
267 goto out_release; 89 return -EINVAL;
268 90
269 if (type == ACL_TYPE_ACCESS) { 91 if (type == ACL_TYPE_ACCESS) {
270 umode_t mode = inode->i_mode; 92 umode_t mode = inode->i_mode;
93
271 error = posix_acl_equiv_mode(acl, &mode); 94 error = posix_acl_equiv_mode(acl, &mode);
95 if (error < 0)
96 return error;
272 97
273 if (error <= 0) { 98 if (error == 0)
274 posix_acl_release(acl);
275 acl = NULL; 99 acl = NULL;
276 100
277 if (error < 0)
278 return error;
279 }
280
281 error = gfs2_set_mode(inode, mode); 101 error = gfs2_set_mode(inode, mode);
282 if (error) 102 if (error)
283 goto out_release; 103 return error;
284 } 104 }
285 105
286set_acl: 106 if (acl) {
287 error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS); 107 len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
288 if (!error) { 108 if (len == 0)
289 if (acl) 109 return 0;
290 set_cached_acl(inode, type, acl); 110 data = kmalloc(len, GFP_NOFS);
291 else 111 if (data == NULL)
292 forget_cached_acl(inode, type); 112 return -ENOMEM;
113 error = posix_acl_to_xattr(&init_user_ns, acl, data, len);
114 if (error < 0)
115 goto out;
116 } else {
117 data = NULL;
118 len = 0;
293 } 119 }
294out_release: 120
295 posix_acl_release(acl); 121 error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
122 if (error)
123 goto out;
124
125 if (acl)
126 set_cached_acl(inode, type, acl);
127 else
128 forget_cached_acl(inode, type);
296out: 129out:
130 kfree(data);
297 return error; 131 return error;
298} 132}
299
300const struct xattr_handler gfs2_xattr_system_handler = {
301 .prefix = XATTR_SYSTEM_PREFIX,
302 .flags = GFS2_EATYPE_SYS,
303 .get = gfs2_xattr_system_get,
304 .set = gfs2_xattr_system_set,
305};
306
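
The rewritten gfs2_set_acl() keeps the classic two-call serialization: posix_acl_to_xattr() with a NULL buffer reports the required size, the caller allocates, and a second call fills the buffer. A self-contained sketch of the same pattern, with a stand-in serializer instead of the real posix_acl_to_xattr():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in serializer: "encode" a string; NULL buffer means size probe. */
static long to_xattr(const char *acl, void *buf, size_t size)
{
	size_t need = strlen(acl) + 1;

	if (buf == NULL)
		return (long)need;
	if (size < need)
		return -1;		/* -ERANGE in the real API */
	memcpy(buf, acl, need);
	return (long)need;
}

int main(void)
{
	const char *acl = "u::rwx,g::r-x,o::r--";
	long len = to_xattr(acl, NULL, 0);	/* first call: size probe */
	char *data = malloc((size_t)len);

	if (data == NULL)
		return 1;
	to_xattr(acl, data, (size_t)len);	/* second call: fill buffer */
	printf("stored %ld bytes: %s\n", len, data);
	free(data);
	return 0;
}
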
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 0da38dc7efec..301260c999ba 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -17,8 +17,6 @@
17#define GFS2_ACL_MAX_ENTRIES 25 17#define GFS2_ACL_MAX_ENTRIES 25
18 18
19extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); 19extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); 20extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
22extern const struct xattr_handler gfs2_xattr_system_handler;
23 21
24#endif /* __ACL_DOT_H__ */ 22#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 73f3e4ee4037..49436fa7cd4f 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1032,8 +1032,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
1032 unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); 1032 unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
1033 rv = filemap_write_and_wait_range(mapping, lstart, end); 1033 rv = filemap_write_and_wait_range(mapping, lstart, end);
1034 if (rv) 1034 if (rv)
1035 return rv; 1035 goto out;
1036 truncate_inode_pages_range(mapping, lstart, end); 1036 if (rw == WRITE)
1037 truncate_inode_pages_range(mapping, lstart, end);
1037 } 1038 }
1038 1039
1039 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 1040 rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
@@ -1080,30 +1081,22 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
1080 bh = bh->b_this_page; 1081 bh = bh->b_this_page;
1081 } while(bh != head); 1082 } while(bh != head);
1082 spin_unlock(&sdp->sd_ail_lock); 1083 spin_unlock(&sdp->sd_ail_lock);
1083 gfs2_log_unlock(sdp);
1084 1084
1085 head = bh = page_buffers(page); 1085 head = bh = page_buffers(page);
1086 do { 1086 do {
1087 gfs2_log_lock(sdp);
1088 bd = bh->b_private; 1087 bd = bh->b_private;
1089 if (bd) { 1088 if (bd) {
1090 gfs2_assert_warn(sdp, bd->bd_bh == bh); 1089 gfs2_assert_warn(sdp, bd->bd_bh == bh);
1091 if (!list_empty(&bd->bd_list)) { 1090 if (!list_empty(&bd->bd_list))
1092 if (!buffer_pinned(bh)) 1091 list_del_init(&bd->bd_list);
1093 list_del_init(&bd->bd_list); 1092 bd->bd_bh = NULL;
1094 else
1095 bd = NULL;
1096 }
1097 if (bd)
1098 bd->bd_bh = NULL;
1099 bh->b_private = NULL; 1093 bh->b_private = NULL;
1100 }
1101 gfs2_log_unlock(sdp);
1102 if (bd)
1103 kmem_cache_free(gfs2_bufdata_cachep, bd); 1094 kmem_cache_free(gfs2_bufdata_cachep, bd);
1095 }
1104 1096
1105 bh = bh->b_this_page; 1097 bh = bh->b_this_page;
1106 } while (bh != head); 1098 } while (bh != head);
1099 gfs2_log_unlock(sdp);
1107 1100
1108 return try_to_free_buffers(page); 1101 return try_to_free_buffers(page);
1109 1102
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2e5fc268d324..fa32655449c8 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -834,6 +834,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
834 struct gfs2_leaf *leaf; 834 struct gfs2_leaf *leaf;
835 struct gfs2_dirent *dent; 835 struct gfs2_dirent *dent;
836 struct qstr name = { .name = "" }; 836 struct qstr name = { .name = "" };
837 struct timespec tv = CURRENT_TIME;
837 838
838 error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); 839 error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
839 if (error) 840 if (error)
@@ -850,7 +851,11 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
850 leaf->lf_entries = 0; 851 leaf->lf_entries = 0;
851 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); 852 leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
852 leaf->lf_next = 0; 853 leaf->lf_next = 0;
853 memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); 854 leaf->lf_inode = cpu_to_be64(ip->i_no_addr);
855 leaf->lf_dist = cpu_to_be32(1);
856 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
857 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
858 memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2));
854 dent = (struct gfs2_dirent *)(leaf+1); 859 dent = (struct gfs2_dirent *)(leaf+1);
855 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); 860 gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
856 *pbh = bh; 861 *pbh = bh;
@@ -1612,11 +1617,31 @@ out:
1612 return ret; 1617 return ret;
1613} 1618}
1614 1619
1620/**
1621 * dir_new_leaf - Add a new leaf onto hash chain
1622 * @inode: The directory
1623 * @name: The name we are adding
1624 *
1625 * This adds a new dir leaf onto an existing leaf when there is not
1626 * enough space to add a new dir entry. This is a last resort after
1627 * we've expanded the hash table to max size and also split existing
1628 * leaf blocks, so it will only occur for very large directories.
1629 *
1630 * The dist parameter is set to 1 for leaf blocks directly attached
1631 * to the hash table, 2 for one layer of indirection, 3 for two layers
1632 * etc. We are thus able to tell the difference between an old leaf
1633 * with dist set to zero (i.e. "don't know") and a new one where we
1634 * set this information for debug/fsck purposes.
1635 *
1636 * Returns: 0 on success, or -ve on error
1637 */
1638
1615static int dir_new_leaf(struct inode *inode, const struct qstr *name) 1639static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1616{ 1640{
1617 struct buffer_head *bh, *obh; 1641 struct buffer_head *bh, *obh;
1618 struct gfs2_inode *ip = GFS2_I(inode); 1642 struct gfs2_inode *ip = GFS2_I(inode);
1619 struct gfs2_leaf *leaf, *oleaf; 1643 struct gfs2_leaf *leaf, *oleaf;
1644 u32 dist = 1;
1620 int error; 1645 int error;
1621 u32 index; 1646 u32 index;
1622 u64 bn; 1647 u64 bn;
@@ -1626,6 +1651,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1626 if (error) 1651 if (error)
1627 return error; 1652 return error;
1628 do { 1653 do {
1654 dist++;
1629 oleaf = (struct gfs2_leaf *)obh->b_data; 1655 oleaf = (struct gfs2_leaf *)obh->b_data;
1630 bn = be64_to_cpu(oleaf->lf_next); 1656 bn = be64_to_cpu(oleaf->lf_next);
1631 if (!bn) 1657 if (!bn)
@@ -1643,6 +1669,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1643 brelse(obh); 1669 brelse(obh);
1644 return -ENOSPC; 1670 return -ENOSPC;
1645 } 1671 }
1672 leaf->lf_dist = cpu_to_be32(dist);
1646 oleaf->lf_next = cpu_to_be64(bh->b_blocknr); 1673 oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
1647 brelse(bh); 1674 brelse(bh);
1648 brelse(obh); 1675 brelse(obh);
@@ -1659,39 +1686,53 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1659 1686
1660/** 1687/**
1661 * gfs2_dir_add - Add new filename into directory 1688 * gfs2_dir_add - Add new filename into directory
1662 * @dip: The GFS2 inode 1689 * @inode: The directory inode
1663 * @filename: The new name 1690 * @name: The new name
1664 * @inode: The inode number of the entry 1691 * @nip: The GFS2 inode to be linked in to the directory
1665 * @type: The type of the entry 1692 * @da: The directory addition info
1693 *
1694 * If the call to gfs2_diradd_alloc_required resulted in there being
1695 * no need to allocate any new directory blocks, then it will contain
1696 * a pointer to the directory entry and the bh in which it resides. We
1697 * can use that without having to repeat the search. If there was no
1698 * free space, then we must now create more space.
1666 * 1699 *
1667 * Returns: 0 on success, error code on failure 1700 * Returns: 0 on success, error code on failure
1668 */ 1701 */
1669 1702
1670int gfs2_dir_add(struct inode *inode, const struct qstr *name, 1703int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1671 const struct gfs2_inode *nip) 1704 const struct gfs2_inode *nip, struct gfs2_diradd *da)
1672{ 1705{
1673 struct gfs2_inode *ip = GFS2_I(inode); 1706 struct gfs2_inode *ip = GFS2_I(inode);
1674 struct buffer_head *bh; 1707 struct buffer_head *bh = da->bh;
1675 struct gfs2_dirent *dent; 1708 struct gfs2_dirent *dent = da->dent;
1709 struct timespec tv;
1676 struct gfs2_leaf *leaf; 1710 struct gfs2_leaf *leaf;
1677 int error; 1711 int error;
1678 1712
1679 while(1) { 1713 while(1) {
1680 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, 1714 if (da->bh == NULL) {
1681 &bh); 1715 dent = gfs2_dirent_search(inode, name,
1716 gfs2_dirent_find_space, &bh);
1717 }
1682 if (dent) { 1718 if (dent) {
1683 if (IS_ERR(dent)) 1719 if (IS_ERR(dent))
1684 return PTR_ERR(dent); 1720 return PTR_ERR(dent);
1685 dent = gfs2_init_dirent(inode, dent, name, bh); 1721 dent = gfs2_init_dirent(inode, dent, name, bh);
1686 gfs2_inum_out(nip, dent); 1722 gfs2_inum_out(nip, dent);
1687 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); 1723 dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
1724 tv = CURRENT_TIME;
1688 if (ip->i_diskflags & GFS2_DIF_EXHASH) { 1725 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1689 leaf = (struct gfs2_leaf *)bh->b_data; 1726 leaf = (struct gfs2_leaf *)bh->b_data;
1690 be16_add_cpu(&leaf->lf_entries, 1); 1727 be16_add_cpu(&leaf->lf_entries, 1);
1728 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
1729 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
1691 } 1730 }
1731 da->dent = NULL;
1732 da->bh = NULL;
1692 brelse(bh); 1733 brelse(bh);
1693 ip->i_entries++; 1734 ip->i_entries++;
1694 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1735 ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv;
1695 if (S_ISDIR(nip->i_inode.i_mode)) 1736 if (S_ISDIR(nip->i_inode.i_mode))
1696 inc_nlink(&ip->i_inode); 1737 inc_nlink(&ip->i_inode);
1697 mark_inode_dirty(inode); 1738 mark_inode_dirty(inode);
@@ -1742,6 +1783,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1742 const struct qstr *name = &dentry->d_name; 1783 const struct qstr *name = &dentry->d_name;
1743 struct gfs2_dirent *dent, *prev = NULL; 1784 struct gfs2_dirent *dent, *prev = NULL;
1744 struct buffer_head *bh; 1785 struct buffer_head *bh;
1786 struct timespec tv = CURRENT_TIME;
1745 1787
1746 /* Returns _either_ the entry (if its first in block) or the 1788 /* Returns _either_ the entry (if its first in block) or the
1747 previous entry otherwise */ 1789 previous entry otherwise */
@@ -1767,13 +1809,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
1767 if (!entries) 1809 if (!entries)
1768 gfs2_consist_inode(dip); 1810 gfs2_consist_inode(dip);
1769 leaf->lf_entries = cpu_to_be16(--entries); 1811 leaf->lf_entries = cpu_to_be16(--entries);
1812 leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
1813 leaf->lf_sec = cpu_to_be64(tv.tv_sec);
1770 } 1814 }
1771 brelse(bh); 1815 brelse(bh);
1772 1816
1773 if (!dip->i_entries) 1817 if (!dip->i_entries)
1774 gfs2_consist_inode(dip); 1818 gfs2_consist_inode(dip);
1775 dip->i_entries--; 1819 dip->i_entries--;
1776 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; 1820 dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv;
1777 if (S_ISDIR(dentry->d_inode->i_mode)) 1821 if (S_ISDIR(dentry->d_inode->i_mode))
1778 drop_nlink(&dip->i_inode); 1822 drop_nlink(&dip->i_inode);
1779 mark_inode_dirty(&dip->i_inode); 1823 mark_inode_dirty(&dip->i_inode);
@@ -2017,22 +2061,36 @@ out:
2017 * gfs2_diradd_alloc_required - find if adding entry will require an allocation 2061 * gfs2_diradd_alloc_required - find if adding entry will require an allocation
2018 * @ip: the file being written to 2062 * @ip: the file being written to
2019 * @filename: the filename that's going to be added 2063
2064 * @da: The structure to return dir alloc info
2020 * 2065 *
2021 * Returns: 1 if alloc required, 0 if not, -ve on error 2066 * Returns: 0 if ok, -ve on error
2022 */ 2067 */
2023 2068
2024int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) 2069int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
2070 struct gfs2_diradd *da)
2025{ 2071{
2072 struct gfs2_inode *ip = GFS2_I(inode);
2073 struct gfs2_sbd *sdp = GFS2_SB(inode);
2074 const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf);
2026 struct gfs2_dirent *dent; 2075 struct gfs2_dirent *dent;
2027 struct buffer_head *bh; 2076 struct buffer_head *bh;
2028 2077
2078 da->nr_blocks = 0;
2079 da->bh = NULL;
2080 da->dent = NULL;
2081
2029 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); 2082 dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
2030 if (!dent) { 2083 if (!dent) {
2031 return 1; 2084 da->nr_blocks = sdp->sd_max_dirres;
2085 if (!(ip->i_diskflags & GFS2_DIF_EXHASH) &&
2086 (GFS2_DIRENT_SIZE(name->len) < extra))
2087 da->nr_blocks = 1;
2088 return 0;
2032 } 2089 }
2033 if (IS_ERR(dent)) 2090 if (IS_ERR(dent))
2034 return PTR_ERR(dent); 2091 return PTR_ERR(dent);
2035 brelse(bh); 2092 da->bh = bh;
2093 da->dent = dent;
2036 return 0; 2094 return 0;
2037} 2095}
2038 2096
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f03bbd1873f..126c65dda028 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -16,6 +16,14 @@
16struct inode; 16struct inode;
17struct gfs2_inode; 17struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19struct buffer_head;
20struct gfs2_dirent;
21
22struct gfs2_diradd {
23 unsigned nr_blocks;
24 struct gfs2_dirent *dent;
25 struct buffer_head *bh;
26};
19 27
20extern struct inode *gfs2_dir_search(struct inode *dir, 28extern struct inode *gfs2_dir_search(struct inode *dir,
21 const struct qstr *filename, 29 const struct qstr *filename,
@@ -23,7 +31,13 @@ extern struct inode *gfs2_dir_search(struct inode *dir,
23extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 31extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
24 const struct gfs2_inode *ip); 32 const struct gfs2_inode *ip);
25extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 33extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
26 const struct gfs2_inode *ip); 34 const struct gfs2_inode *ip, struct gfs2_diradd *da);
35static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
36{
37 if (da->bh)
38 brelse(da->bh);
39 da->bh = NULL;
40}
27extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); 41extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
28extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, 42extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
29 struct file_ra_state *f_ra); 43 struct file_ra_state *f_ra);
@@ -33,7 +47,8 @@ extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
33extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 47extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
34 48
35extern int gfs2_diradd_alloc_required(struct inode *dir, 49extern int gfs2_diradd_alloc_required(struct inode *dir,
36 const struct qstr *filename); 50 const struct qstr *filename,
51 struct gfs2_diradd *da);
37extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 52extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
38 struct buffer_head **bhp); 53 struct buffer_head **bhp);
39extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); 54extern void gfs2_dir_hash_inval(struct gfs2_inode *ip);
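
The new struct gfs2_diradd lets gfs2_diradd_alloc_required() hand its search result (the dirent plus the buffer head holding it) to gfs2_dir_add(), which then skips a second directory search; gfs2_dir_no_add() releases the cached buffer on error paths. A userspace model of this search-once, commit-later pattern; names are illustrative, not GFS2 code:

#include <stdio.h>

struct diradd { int slot; };	/* -1 == no cached result */

static const char *table[4] = { "a", NULL, "c", NULL };

/* Pre-flight: find a free slot and remember it
 * (plays the role of gfs2_diradd_alloc_required()). */
static int add_required(struct diradd *da)
{
	da->slot = -1;
	for (int i = 0; i < 4; i++)
		if (table[i] == NULL) { da->slot = i; return 0; }
	return 0;	/* no slot: the real code sets da->nr_blocks instead */
}

/* Commit: reuse the cached slot (plays the role of gfs2_dir_add()). */
static int dir_add(struct diradd *da, const char *name)
{
	if (da->slot < 0)
		return -1;	/* would need to allocate more space */
	table[da->slot] = name;
	da->slot = -1;		/* consume the cached result */
	return 0;
}

int main(void)
{
	struct diradd da;

	add_required(&da);
	printf("add: %d (cached slot reused, no second search)\n",
	       dir_add(&da, "b"));
	return 0;
}
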
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6f7a47c05259..ca0be6c69a26 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1552,13 +1552,11 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1552 glock_hash_walk(thaw_glock, sdp); 1552 glock_hash_walk(thaw_glock, sdp);
1553} 1553}
1554 1554
1555static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) 1555static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1556{ 1556{
1557 int ret;
1558 spin_lock(&gl->gl_spin); 1557 spin_lock(&gl->gl_spin);
1559 ret = gfs2_dump_glock(seq, gl); 1558 gfs2_dump_glock(seq, gl);
1560 spin_unlock(&gl->gl_spin); 1559 spin_unlock(&gl->gl_spin);
1561 return ret;
1562} 1560}
1563 1561
1564static void dump_glock_func(struct gfs2_glock *gl) 1562static void dump_glock_func(struct gfs2_glock *gl)
@@ -1647,10 +1645,9 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
1647 * @seq: the seq_file struct 1645 * @seq: the seq_file struct
1648 * @gh: the glock holder 1646 * @gh: the glock holder
1649 * 1647 *
1650 * Returns: 0 on success, -ENOBUFS when we run out of space
1651 */ 1648 */
1652 1649
1653static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) 1650static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1654{ 1651{
1655 struct task_struct *gh_owner = NULL; 1652 struct task_struct *gh_owner = NULL;
1656 char flags_buf[32]; 1653 char flags_buf[32];
@@ -1666,7 +1663,6 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
1666 gh_owner ? gh_owner->comm : "(ended)", 1663 gh_owner ? gh_owner->comm : "(ended)",
1667 (void *)gh->gh_ip); 1664 (void *)gh->gh_ip);
1668 rcu_read_unlock(); 1665 rcu_read_unlock();
1669 return 0;
1670} 1666}
1671 1667
1672static const char *gflags2str(char *buf, const struct gfs2_glock *gl) 1668static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
@@ -1721,16 +1717,14 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
1721 * example. The fields are n = number (id of the object), f = flags, 1717
1722 * t = type, s = state, r = refcount, e = error, p = pid. 1718 * t = type, s = state, r = refcount, e = error, p = pid.
1723 * 1719 *
1724 * Returns: 0 on success, -ENOBUFS when we run out of space
1725 */ 1720 */
1726 1721
1727int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) 1722void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1728{ 1723{
1729 const struct gfs2_glock_operations *glops = gl->gl_ops; 1724 const struct gfs2_glock_operations *glops = gl->gl_ops;
1730 unsigned long long dtime; 1725 unsigned long long dtime;
1731 const struct gfs2_holder *gh; 1726 const struct gfs2_holder *gh;
1732 char gflags_buf[32]; 1727 char gflags_buf[32];
1733 int error = 0;
1734 1728
1735 dtime = jiffies - gl->gl_demote_time; 1729 dtime = jiffies - gl->gl_demote_time;
1736 dtime *= 1000000/HZ; /* demote time in uSec */ 1730 dtime *= 1000000/HZ; /* demote time in uSec */
@@ -1747,15 +1741,11 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
1747 atomic_read(&gl->gl_revokes), 1741 atomic_read(&gl->gl_revokes),
1748 (int)gl->gl_lockref.count, gl->gl_hold_time); 1742 (int)gl->gl_lockref.count, gl->gl_hold_time);
1749 1743
1750 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 1744 list_for_each_entry(gh, &gl->gl_holders, gh_list)
1751 error = dump_holder(seq, gh); 1745 dump_holder(seq, gh);
1752 if (error) 1746
1753 goto out;
1754 }
1755 if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) 1747 if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
1756 error = glops->go_dump(seq, gl); 1748 glops->go_dump(seq, gl);
1757out:
1758 return error;
1759} 1749}
1760 1750
1761static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) 1751static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -1953,7 +1943,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
1953 1943
1954static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) 1944static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
1955{ 1945{
1956 return dump_glock(seq, iter_ptr); 1946 dump_glock(seq, iter_ptr);
1947 return 0;
1957} 1948}
1958 1949
1959static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) 1950static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
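
The int-to-void conversion of dump_glock(), dump_holder() and the ->go_dump() method works because seq_file detects buffer overflow itself and re-invokes ->show() with a larger buffer, so propagating -ENOBUFS through every dump helper carried no information. A compact model of a show-style callback that ignores overflow and leaves the retry to the core; the seq struct and seq_puts_() are stand-ins, not the kernel seq_file API:

#include <stdio.h>
#include <string.h>

struct seq { char buf[16]; size_t len; int overflow; };

/* Stand-in for seq_puts(): on overflow it just sets a flag. */
static void seq_puts_(struct seq *s, const char *str)
{
	size_t n = strlen(str);

	if (s->len + n > sizeof(s->buf)) { s->overflow = 1; return; }
	memcpy(s->buf + s->len, str, n);
	s->len += n;
}

/* ->show()-style callback: returns nothing; overflow is the core's problem. */
static void dump(struct seq *s)
{
	seq_puts_(s, "G: state=EX holders=2\n");
}

int main(void)
{
	struct seq s = { .len = 0, .overflow = 0 };

	dump(&s);
	if (s.overflow)
		printf("core would retry with a bigger buffer\n");
	else
		printf("%.*s", (int)s.len, s.buf);
	return 0;
}
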
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 6647d77366ba..32572f71f027 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -199,7 +199,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
199 struct gfs2_holder *gh); 199 struct gfs2_holder *gh);
200extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); 200extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
201extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); 201extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
202extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); 202extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
203#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) 203#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
204extern __printf(2, 3) 204extern __printf(2, 3)
205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 205void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index f88dcd925010..3bf0631b5d56 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -133,7 +133,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
133 133
134static void rgrp_go_sync(struct gfs2_glock *gl) 134static void rgrp_go_sync(struct gfs2_glock *gl)
135{ 135{
136 struct address_space *metamapping = gfs2_glock2aspace(gl); 136 struct gfs2_sbd *sdp = gl->gl_sbd;
137 struct address_space *mapping = &sdp->sd_aspace;
137 struct gfs2_rgrpd *rgd; 138 struct gfs2_rgrpd *rgd;
138 int error; 139 int error;
139 140
@@ -141,10 +142,10 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
141 return; 142 return;
142 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); 143 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
143 144
144 gfs2_log_flush(gl->gl_sbd, gl); 145 gfs2_log_flush(sdp, gl);
145 filemap_fdatawrite(metamapping); 146 filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
146 error = filemap_fdatawait(metamapping); 147 error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
147 mapping_set_error(metamapping, error); 148 mapping_set_error(mapping, error);
148 gfs2_ail_empty_gl(gl); 149 gfs2_ail_empty_gl(gl);
149 150
150 spin_lock(&gl->gl_spin); 151 spin_lock(&gl->gl_spin);
@@ -166,11 +167,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
166 167
167static void rgrp_go_inval(struct gfs2_glock *gl, int flags) 168static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
168{ 169{
169 struct address_space *mapping = gfs2_glock2aspace(gl); 170 struct gfs2_sbd *sdp = gl->gl_sbd;
171 struct address_space *mapping = &sdp->sd_aspace;
170 172
171 WARN_ON_ONCE(!(flags & DIO_METADATA)); 173 WARN_ON_ONCE(!(flags & DIO_METADATA));
172 gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); 174 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
173 truncate_inode_pages(mapping, 0); 175 truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
174 176
175 if (gl->gl_object) { 177 if (gl->gl_object) {
176 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; 178 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
@@ -435,21 +437,19 @@ static int inode_go_lock(struct gfs2_holder *gh)
435 * @seq: The iterator 437 * @seq: The iterator
436 * @ip: the inode 438 * @ip: the inode
437 * 439 *
438 * Returns: 0 on success, -ENOBUFS when we run out of space
439 */ 440 */
440 441
441static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) 442static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
442{ 443{
443 const struct gfs2_inode *ip = gl->gl_object; 444 const struct gfs2_inode *ip = gl->gl_object;
444 if (ip == NULL) 445 if (ip == NULL)
445 return 0; 446 return;
446 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", 447 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
447 (unsigned long long)ip->i_no_formal_ino, 448 (unsigned long long)ip->i_no_formal_ino,
448 (unsigned long long)ip->i_no_addr, 449 (unsigned long long)ip->i_no_addr,
449 IF2DT(ip->i_inode.i_mode), ip->i_flags, 450 IF2DT(ip->i_inode.i_mode), ip->i_flags,
450 (unsigned int)ip->i_diskflags, 451 (unsigned int)ip->i_diskflags,
451 (unsigned long long)i_size_read(&ip->i_inode)); 452 (unsigned long long)i_size_read(&ip->i_inode));
452 return 0;
453} 453}
454 454
455/** 455/**
@@ -558,7 +558,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
558 .go_unlock = gfs2_rgrp_go_unlock, 558 .go_unlock = gfs2_rgrp_go_unlock,
559 .go_dump = gfs2_rgrp_dump, 559 .go_dump = gfs2_rgrp_dump,
560 .go_type = LM_TYPE_RGRP, 560 .go_type = LM_TYPE_RGRP,
561 .go_flags = GLOF_ASPACE | GLOF_LVB, 561 .go_flags = GLOF_LVB,
562}; 562};
563 563
564const struct gfs2_glock_operations gfs2_trans_glops = { 564const struct gfs2_glock_operations gfs2_trans_glops = {
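
These glops.c hunks follow from the incore.h change below: rgrp glocks lose their private address_space (GLOF_ASPACE is dropped) and share the per-superblock sd_aspace, so sync and invalidation switch to the ranged variants, filemap_fdatawrite_range() and truncate_inode_pages_range(), bounded by the new gl_vm window. A toy model of per-lock byte ranges over one shared mapping, purely for illustration:

#include <stdio.h>

struct window { long start, end; };

/* Stand-in for the ranged write/wait pair: each lock touches only
 * its own slice of the shared mapping. */
static void sync_range(const char *who, const struct window *w)
{
	printf("%s syncs bytes [%ld, %ld] of the shared mapping\n",
	       who, w->start, w->end);
}

int main(void)
{
	struct window rgrp_a = { 0, 4095 }, rgrp_b = { 4096, 8191 };

	sync_range("rgrp A", &rgrp_a);
	sync_range("rgrp B", &rgrp_b);
	return 0;
}
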
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index ba1ea67f4eeb..cf0e34400f71 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -93,6 +93,7 @@ struct gfs2_rgrpd {
93 struct gfs2_rgrp_lvb *rd_rgl; 93 struct gfs2_rgrp_lvb *rd_rgl;
94 u32 rd_last_alloc; 94 u32 rd_last_alloc;
95 u32 rd_flags; 95 u32 rd_flags;
96 u32 rd_extfail_pt; /* extent failure point */
96#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ 97#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
97#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ 98#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
98#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ 99#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
@@ -217,7 +218,7 @@ struct gfs2_glock_operations {
217 int (*go_demote_ok) (const struct gfs2_glock *gl); 218 int (*go_demote_ok) (const struct gfs2_glock *gl);
218 int (*go_lock) (struct gfs2_holder *gh); 219 int (*go_lock) (struct gfs2_holder *gh);
219 void (*go_unlock) (struct gfs2_holder *gh); 220 void (*go_unlock) (struct gfs2_holder *gh);
220 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); 221 void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
221 void (*go_callback)(struct gfs2_glock *gl, bool remote); 222 void (*go_callback)(struct gfs2_glock *gl, bool remote);
222 const int go_type; 223 const int go_type;
223 const unsigned long go_flags; 224 const unsigned long go_flags;
@@ -350,7 +351,15 @@ struct gfs2_glock {
350 atomic_t gl_ail_count; 351 atomic_t gl_ail_count;
351 atomic_t gl_revokes; 352 atomic_t gl_revokes;
352 struct delayed_work gl_work; 353 struct delayed_work gl_work;
353 struct work_struct gl_delete; 354 union {
355 /* For inode and iopen glocks only */
356 struct work_struct gl_delete;
357 /* For rgrp glocks only */
358 struct {
359 loff_t start;
360 loff_t end;
361 } gl_vm;
362 };
354 struct rcu_head gl_rcu; 363 struct rcu_head gl_rcu;
355}; 364};
356 365
@@ -419,10 +428,13 @@ enum {
419}; 428};
420 429
421struct gfs2_quota_data { 430struct gfs2_quota_data {
431 struct hlist_bl_node qd_hlist;
422 struct list_head qd_list; 432 struct list_head qd_list;
423 struct kqid qd_id; 433 struct kqid qd_id;
434 struct gfs2_sbd *qd_sbd;
424 struct lockref qd_lockref; 435 struct lockref qd_lockref;
425 struct list_head qd_lru; 436 struct list_head qd_lru;
437 unsigned qd_hash;
426 438
427 unsigned long qd_flags; /* QDF_... */ 439 unsigned long qd_flags; /* QDF_... */
428 440
@@ -441,6 +453,7 @@ struct gfs2_quota_data {
441 453
442 u64 qd_sync_gen; 454 u64 qd_sync_gen;
443 unsigned long qd_last_warn; 455 unsigned long qd_last_warn;
456 struct rcu_head qd_rcu;
444}; 457};
445 458
446struct gfs2_trans { 459struct gfs2_trans {
@@ -720,13 +733,15 @@ struct gfs2_sbd {
720 spinlock_t sd_trunc_lock; 733 spinlock_t sd_trunc_lock;
721 734
722 unsigned int sd_quota_slots; 735 unsigned int sd_quota_slots;
723 unsigned int sd_quota_chunks; 736 unsigned long *sd_quota_bitmap;
724 unsigned char **sd_quota_bitmap; 737 spinlock_t sd_bitmap_lock;
725 738
726 u64 sd_quota_sync_gen; 739 u64 sd_quota_sync_gen;
727 740
728 /* Log stuff */ 741 /* Log stuff */
729 742
743 struct address_space sd_aspace;
744
730 spinlock_t sd_log_lock; 745 spinlock_t sd_log_lock;
731 746
732 struct gfs2_trans *sd_log_tr; 747 struct gfs2_trans *sd_log_tr;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7119504159f1..5c524180c98e 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -149,7 +149,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
149 ip = GFS2_I(inode); 149 ip = GFS2_I(inode);
150 150
151 if (!inode) 151 if (!inode)
152 return ERR_PTR(-ENOBUFS); 152 return ERR_PTR(-ENOMEM);
153 153
154 if (inode->i_state & I_NEW) { 154 if (inode->i_state & I_NEW) {
155 struct gfs2_sbd *sdp = GFS2_SB(inode); 155 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -469,14 +469,36 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
469 brelse(dibh); 469 brelse(dibh);
470} 470}
471 471
472/**
473 * gfs2_trans_da_blocks - Calculate number of blocks to link inode
474 * @dip: The directory we are linking into
475 * @da: The dir add information
476 * @nr_inodes: The number of inodes involved
477 *
 478 * This calculates the number of blocks we need to reserve in a
479 * transaction to link @nr_inodes into a directory. In most cases
480 * @nr_inodes will be 2 (the directory plus the inode being linked in)
481 * but in case of rename, 4 may be required.
482 *
483 * Returns: Number of blocks
484 */
485
486static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip,
487 const struct gfs2_diradd *da,
488 unsigned nr_inodes)
489{
490 return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) +
491 (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS;
492}
493
472static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, 494static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
473 struct gfs2_inode *ip, int arq) 495 struct gfs2_inode *ip, struct gfs2_diradd *da)
474{ 496{
475 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 497 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
476 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 498 struct gfs2_alloc_parms ap = { .target = da->nr_blocks, };
477 int error; 499 int error;
478 500
479 if (arq) { 501 if (da->nr_blocks) {
480 error = gfs2_quota_lock_check(dip); 502 error = gfs2_quota_lock_check(dip);
481 if (error) 503 if (error)
482 goto fail_quota_locks; 504 goto fail_quota_locks;
@@ -485,10 +507,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
485 if (error) 507 if (error)
486 goto fail_quota_locks; 508 goto fail_quota_locks;
487 509
488 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 510 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0);
489 dip->i_rgd->rd_length +
490 2 * RES_DINODE +
491 RES_STATFS + RES_QUOTA, 0);
492 if (error) 511 if (error)
493 goto fail_ipreserv; 512 goto fail_ipreserv;
494 } else { 513 } else {
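
gfs2_trans_da_blks() centralizes reservation arithmetic that was previously open-coded at each call site: da->nr_blocks, plus the resource-group blocks those allocations may dirty, plus RES_DINODE per inode touched, plus quota and statfs blocks. A worked example with assumed RES_* values (not GFS2's actual constants) and a fixed stand-in for gfs2_rg_blocks():

#include <stdio.h>

enum { RES_DINODE = 1, RES_QUOTA = 1, RES_STATFS = 1 };	/* assumed values */

static unsigned trans_da_blks(unsigned nr_blocks, unsigned rg_blocks,
			      unsigned nr_inodes)
{
	return nr_blocks + rg_blocks + nr_inodes * RES_DINODE
	       + RES_QUOTA + RES_STATFS;
}

int main(void)
{
	/* link: directory plus the inode being linked -> nr_inodes == 2 */
	printf("link  : %u blocks\n", trans_da_blks(3, 2, 2));
	/* rename worst case -> nr_inodes == 4, as the kernel-doc notes */
	printf("rename: %u blocks\n", trans_da_blks(3, 2, 4));
	return 0;
}
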
@@ -497,7 +516,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
497 goto fail_quota_locks; 516 goto fail_quota_locks;
498 } 517 }
499 518
500 error = gfs2_dir_add(&dip->i_inode, name, ip); 519 error = gfs2_dir_add(&dip->i_inode, name, ip, da);
501 if (error) 520 if (error)
502 goto fail_end_trans; 521 goto fail_end_trans;
503 522
@@ -552,6 +571,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
552 unsigned int size, int excl, int *opened) 571 unsigned int size, int excl, int *opened)
553{ 572{
554 const struct qstr *name = &dentry->d_name; 573 const struct qstr *name = &dentry->d_name;
574 struct posix_acl *default_acl, *acl;
555 struct gfs2_holder ghs[2]; 575 struct gfs2_holder ghs[2];
556 struct inode *inode = NULL; 576 struct inode *inode = NULL;
557 struct gfs2_inode *dip = GFS2_I(dir), *ip; 577 struct gfs2_inode *dip = GFS2_I(dir), *ip;
@@ -560,7 +580,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
560 struct dentry *d; 580 struct dentry *d;
561 int error; 581 int error;
562 u32 aflags = 0; 582 u32 aflags = 0;
563 int arq; 583 struct gfs2_diradd da = { .bh = NULL, };
564 584
565 if (!name->len || name->len > GFS2_FNAMESIZE) 585 if (!name->len || name->len > GFS2_FNAMESIZE)
566 return -ENAMETOOLONG; 586 return -ENAMETOOLONG;
@@ -585,6 +605,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
585 error = PTR_ERR(inode); 605 error = PTR_ERR(inode);
586 if (!IS_ERR(inode)) { 606 if (!IS_ERR(inode)) {
587 d = d_splice_alias(inode, dentry); 607 d = d_splice_alias(inode, dentry);
608 error = PTR_ERR(d);
609 if (IS_ERR(d))
610 goto fail_gunlock;
588 error = 0; 611 error = 0;
589 if (file) { 612 if (file) {
590 if (S_ISREG(inode->i_mode)) { 613 if (S_ISREG(inode->i_mode)) {
@@ -602,7 +625,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
602 goto fail_gunlock; 625 goto fail_gunlock;
603 } 626 }
604 627
605 arq = error = gfs2_diradd_alloc_required(dir, name); 628 error = gfs2_diradd_alloc_required(dir, name, &da);
606 if (error < 0) 629 if (error < 0)
607 goto fail_gunlock; 630 goto fail_gunlock;
608 631
@@ -611,10 +634,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
611 if (!inode) 634 if (!inode)
612 goto fail_gunlock; 635 goto fail_gunlock;
613 636
637 error = posix_acl_create(dir, &mode, &default_acl, &acl);
638 if (error)
639 goto fail_free_vfs_inode;
640
614 ip = GFS2_I(inode); 641 ip = GFS2_I(inode);
615 error = gfs2_rs_alloc(ip); 642 error = gfs2_rs_alloc(ip);
616 if (error) 643 if (error)
617 goto fail_free_inode; 644 goto fail_free_acls;
618 645
619 inode->i_mode = mode; 646 inode->i_mode = mode;
620 set_nlink(inode, S_ISDIR(mode) ? 2 : 1); 647 set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
@@ -682,7 +709,16 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
682 gfs2_set_iop(inode); 709 gfs2_set_iop(inode);
683 insert_inode_hash(inode); 710 insert_inode_hash(inode);
684 711
685 error = gfs2_acl_create(dip, inode); 712 if (default_acl) {
713 error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
714 posix_acl_release(default_acl);
715 }
716 if (acl) {
717 if (!error)
718 error = gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
719 posix_acl_release(acl);
720 }
721
686 if (error) 722 if (error)
687 goto fail_gunlock3; 723 goto fail_gunlock3;
688 724
@@ -690,7 +726,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
690 if (error) 726 if (error)
691 goto fail_gunlock3; 727 goto fail_gunlock3;
692 728
693 error = link_dinode(dip, name, ip, arq); 729 error = link_dinode(dip, name, ip, &da);
694 if (error) 730 if (error)
695 goto fail_gunlock3; 731 goto fail_gunlock3;
696 732
@@ -716,9 +752,16 @@ fail_free_inode:
716 if (ip->i_gl) 752 if (ip->i_gl)
717 gfs2_glock_put(ip->i_gl); 753 gfs2_glock_put(ip->i_gl);
718 gfs2_rs_delete(ip, NULL); 754 gfs2_rs_delete(ip, NULL);
755fail_free_acls:
756 if (default_acl)
757 posix_acl_release(default_acl);
758 if (acl)
759 posix_acl_release(acl);
760fail_free_vfs_inode:
719 free_inode_nonrcu(inode); 761 free_inode_nonrcu(inode);
720 inode = NULL; 762 inode = NULL;
721fail_gunlock: 763fail_gunlock:
764 gfs2_dir_no_add(&da);
722 gfs2_glock_dq_uninit(ghs); 765 gfs2_glock_dq_uninit(ghs);
723 if (inode && !IS_ERR(inode)) { 766 if (inode && !IS_ERR(inode)) {
724 clear_nlink(inode); 767 clear_nlink(inode);
@@ -779,6 +822,11 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
779 } 822 }
780 823
781 d = d_splice_alias(inode, dentry); 824 d = d_splice_alias(inode, dentry);
825 if (IS_ERR(d)) {
826 iput(inode);
827 gfs2_glock_dq_uninit(&gh);
828 return d;
829 }
782 if (file && S_ISREG(inode->i_mode)) 830 if (file && S_ISREG(inode->i_mode))
783 error = finish_open(file, dentry, gfs2_open_common, opened); 831 error = finish_open(file, dentry, gfs2_open_common, opened);
784 832
@@ -817,7 +865,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
817 struct gfs2_inode *ip = GFS2_I(inode); 865 struct gfs2_inode *ip = GFS2_I(inode);
818 struct gfs2_holder ghs[2]; 866 struct gfs2_holder ghs[2];
819 struct buffer_head *dibh; 867 struct buffer_head *dibh;
820 int alloc_required; 868 struct gfs2_diradd da = { .bh = NULL, };
821 int error; 869 int error;
822 870
823 if (S_ISDIR(inode->i_mode)) 871 if (S_ISDIR(inode->i_mode))
@@ -872,13 +920,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
872 if (ip->i_inode.i_nlink == (u32)-1) 920 if (ip->i_inode.i_nlink == (u32)-1)
873 goto out_gunlock; 921 goto out_gunlock;
874 922
875 alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); 923 error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da);
876 if (error < 0) 924 if (error < 0)
877 goto out_gunlock; 925 goto out_gunlock;
878 error = 0;
879 926
880 if (alloc_required) { 927 if (da.nr_blocks) {
881 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 928 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
882 error = gfs2_quota_lock_check(dip); 929 error = gfs2_quota_lock_check(dip);
883 if (error) 930 if (error)
884 goto out_gunlock; 931 goto out_gunlock;
@@ -887,10 +934,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
887 if (error) 934 if (error)
888 goto out_gunlock_q; 935 goto out_gunlock_q;
889 936
890 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 937 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0);
891 gfs2_rg_blocks(dip, sdp->sd_max_dirres) +
892 2 * RES_DINODE + RES_STATFS +
893 RES_QUOTA, 0);
894 if (error) 938 if (error)
895 goto out_ipres; 939 goto out_ipres;
896 } else { 940 } else {
@@ -903,7 +947,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
903 if (error) 947 if (error)
904 goto out_end_trans; 948 goto out_end_trans;
905 949
906 error = gfs2_dir_add(dir, &dentry->d_name, ip); 950 error = gfs2_dir_add(dir, &dentry->d_name, ip, &da);
907 if (error) 951 if (error)
908 goto out_brelse; 952 goto out_brelse;
909 953
@@ -919,12 +963,13 @@ out_brelse:
919out_end_trans: 963out_end_trans:
920 gfs2_trans_end(sdp); 964 gfs2_trans_end(sdp);
921out_ipres: 965out_ipres:
922 if (alloc_required) 966 if (da.nr_blocks)
923 gfs2_inplace_release(dip); 967 gfs2_inplace_release(dip);
924out_gunlock_q: 968out_gunlock_q:
925 if (alloc_required) 969 if (da.nr_blocks)
926 gfs2_quota_unlock(dip); 970 gfs2_quota_unlock(dip);
927out_gunlock: 971out_gunlock:
972 gfs2_dir_no_add(&da);
928 gfs2_glock_dq(ghs + 1); 973 gfs2_glock_dq(ghs + 1);
929out_child: 974out_child:
930 gfs2_glock_dq(ghs); 975 gfs2_glock_dq(ghs);
@@ -1254,7 +1299,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1254 struct gfs2_rgrpd *nrgd; 1299 struct gfs2_rgrpd *nrgd;
1255 unsigned int num_gh; 1300 unsigned int num_gh;
1256 int dir_rename = 0; 1301 int dir_rename = 0;
1257 int alloc_required = 0; 1302 struct gfs2_diradd da = { .nr_blocks = 0, };
1258 unsigned int x; 1303 unsigned int x;
1259 int error; 1304 int error;
1260 1305
@@ -1388,14 +1433,14 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1388 goto out_gunlock; 1433 goto out_gunlock;
1389 } 1434 }
1390 1435
1391 if (nip == NULL) 1436 if (nip == NULL) {
1392 alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); 1437 error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da);
1393 error = alloc_required; 1438 if (error)
1394 if (error < 0) 1439 goto out_gunlock;
1395 goto out_gunlock; 1440 }
1396 1441
1397 if (alloc_required) { 1442 if (da.nr_blocks) {
1398 struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; 1443 struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
1399 error = gfs2_quota_lock_check(ndip); 1444 error = gfs2_quota_lock_check(ndip);
1400 if (error) 1445 if (error)
1401 goto out_gunlock; 1446 goto out_gunlock;
@@ -1404,10 +1449,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1404 if (error) 1449 if (error)
1405 goto out_gunlock_q; 1450 goto out_gunlock_q;
1406 1451
1407 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 1452 error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) +
1408 gfs2_rg_blocks(ndip, sdp->sd_max_dirres) + 1453 4 * RES_LEAF + 4, 0);
1409 4 * RES_DINODE + 4 * RES_LEAF +
1410 RES_STATFS + RES_QUOTA + 4, 0);
1411 if (error) 1454 if (error)
1412 goto out_ipreserv; 1455 goto out_ipreserv;
1413 } else { 1456 } else {
@@ -1441,19 +1484,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1441 if (error) 1484 if (error)
1442 goto out_end_trans; 1485 goto out_end_trans;
1443 1486
1444 error = gfs2_dir_add(ndir, &ndentry->d_name, ip); 1487 error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da);
1445 if (error) 1488 if (error)
1446 goto out_end_trans; 1489 goto out_end_trans;
1447 1490
1448out_end_trans: 1491out_end_trans:
1449 gfs2_trans_end(sdp); 1492 gfs2_trans_end(sdp);
1450out_ipreserv: 1493out_ipreserv:
1451 if (alloc_required) 1494 if (da.nr_blocks)
1452 gfs2_inplace_release(ndip); 1495 gfs2_inplace_release(ndip);
1453out_gunlock_q: 1496out_gunlock_q:
1454 if (alloc_required) 1497 if (da.nr_blocks)
1455 gfs2_quota_unlock(ndip); 1498 gfs2_quota_unlock(ndip);
1456out_gunlock: 1499out_gunlock:
1500 gfs2_dir_no_add(&da);
1457 while (x--) { 1501 while (x--) {
1458 gfs2_glock_dq(ghs + x); 1502 gfs2_glock_dq(ghs + x);
1459 gfs2_holder_uninit(ghs + x); 1503 gfs2_holder_uninit(ghs + x);
@@ -1607,10 +1651,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
1607 if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) 1651 if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
1608 ogid = ngid = NO_GID_QUOTA_CHANGE; 1652 ogid = ngid = NO_GID_QUOTA_CHANGE;
1609 1653
1610 error = gfs2_quota_lock(ip, nuid, ngid); 1654 error = get_write_access(inode);
1611 if (error) 1655 if (error)
1612 return error; 1656 return error;
1613 1657
1658 error = gfs2_rs_alloc(ip);
1659 if (error)
1660 goto out;
1661
1662 error = gfs2_rindex_update(sdp);
1663 if (error)
1664 goto out;
1665
1666 error = gfs2_quota_lock(ip, nuid, ngid);
1667 if (error)
1668 goto out;
1669
1614 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || 1670 if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
1615 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { 1671 !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
1616 error = gfs2_quota_check(ip, nuid, ngid); 1672 error = gfs2_quota_check(ip, nuid, ngid);
@@ -1637,6 +1693,8 @@ out_end_trans:
1637 gfs2_trans_end(sdp); 1693 gfs2_trans_end(sdp);
1638out_gunlock_q: 1694out_gunlock_q:
1639 gfs2_quota_unlock(ip); 1695 gfs2_quota_unlock(ip);
1696out:
1697 put_write_access(inode);
1640 return error; 1698 return error;
1641} 1699}
1642 1700
@@ -1678,10 +1736,11 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
1678 error = gfs2_setattr_size(inode, attr->ia_size); 1736 error = gfs2_setattr_size(inode, attr->ia_size);
1679 else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) 1737 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1680 error = setattr_chown(inode, attr); 1738 error = setattr_chown(inode, attr);
1681 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) 1739 else {
1682 error = gfs2_acl_chmod(ip, attr);
1683 else
1684 error = gfs2_setattr_simple(inode, attr); 1740 error = gfs2_setattr_simple(inode, attr);
1741 if (!error && attr->ia_valid & ATTR_MODE)
1742 error = posix_acl_chmod(inode, inode->i_mode);
1743 }
1685 1744
1686out: 1745out:
1687 if (!error) 1746 if (!error)
@@ -1841,6 +1900,7 @@ const struct inode_operations gfs2_file_iops = {
1841 .removexattr = gfs2_removexattr, 1900 .removexattr = gfs2_removexattr,
1842 .fiemap = gfs2_fiemap, 1901 .fiemap = gfs2_fiemap,
1843 .get_acl = gfs2_get_acl, 1902 .get_acl = gfs2_get_acl,
1903 .set_acl = gfs2_set_acl,
1844}; 1904};
1845 1905
1846const struct inode_operations gfs2_dir_iops = { 1906const struct inode_operations gfs2_dir_iops = {
@@ -1862,6 +1922,7 @@ const struct inode_operations gfs2_dir_iops = {
1862 .removexattr = gfs2_removexattr, 1922 .removexattr = gfs2_removexattr,
1863 .fiemap = gfs2_fiemap, 1923 .fiemap = gfs2_fiemap,
1864 .get_acl = gfs2_get_acl, 1924 .get_acl = gfs2_get_acl,
1925 .set_acl = gfs2_set_acl,
1865 .atomic_open = gfs2_atomic_open, 1926 .atomic_open = gfs2_atomic_open,
1866}; 1927};
1867 1928
@@ -1877,6 +1938,5 @@ const struct inode_operations gfs2_symlink_iops = {
1877 .listxattr = gfs2_listxattr, 1938 .listxattr = gfs2_listxattr,
1878 .removexattr = gfs2_removexattr, 1939 .removexattr = gfs2_removexattr,
1879 .fiemap = gfs2_fiemap, 1940 .fiemap = gfs2_fiemap,
1880 .get_acl = gfs2_get_acl,
1881}; 1941};
1882 1942
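
These inode_operations changes are gfs2's side of the generic POSIX ACL rework merged in v3.14: ->get_acl stays, ->set_acl is added, the private gfs2_acl_chmod path is replaced by the VFS helper posix_acl_chmod() (see the gfs2_setattr hunk above), and symlinks, which cannot carry ACLs, lose ->get_acl entirely. A sketch of the generic wiring for a hypothetical filesystem; the myfs_* names are placeholders, not gfs2 functions:

    #include <linux/fs.h>
    #include <linux/posix_acl.h>
    #include <linux/posix_acl_xattr.h>
    #include <linux/xattr.h>

    static int myfs_setattr(struct dentry *dentry, struct iattr *attr);
    static struct posix_acl *myfs_get_acl(struct inode *inode, int type);
    static int myfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);

    static const struct inode_operations myfs_file_iops = {
        .setattr = myfs_setattr,
        .get_acl = myfs_get_acl,    /* read the ACL from fs-specific storage */
        .set_acl = myfs_set_acl,    /* called by the VFS to write it back */
    };

    /* The system.posix_acl_{access,default} xattrs no longer need a
     * fs-specific handler; the generic ones route through ->get_acl and
     * ->set_acl (compare the fs/gfs2/xattr.c hunk later in this diff): */
    static const struct xattr_handler *myfs_xattr_handlers[] = {
        &posix_acl_access_xattr_handler,
        &posix_acl_default_xattr_handler,
        NULL,
    };
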
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 010b9fb9fec6..76693793cedd 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
83 bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); 83 bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
84 clear_bit(GBF_FULL, &bi->bi_flags); 84 clear_bit(GBF_FULL, &bi->bi_flags);
85 rgd->rd_free_clone = rgd->rd_free; 85 rgd->rd_free_clone = rgd->rd_free;
86 rgd->rd_extfail_pt = rgd->rd_free;
86} 87}
87 88
88/** 89/**
@@ -272,7 +273,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno)
272 nrvecs = max(nrvecs/2, 1U); 273 nrvecs = max(nrvecs/2, 1U);
273 } 274 }
274 275
275 bio->bi_sector = blkno * (sb->s_blocksize >> 9); 276 bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
276 bio->bi_bdev = sb->s_bdev; 277 bio->bi_bdev = sb->s_bdev;
277 bio->bi_end_io = gfs2_end_log_write; 278 bio->bi_end_io = gfs2_end_log_write;
278 bio->bi_private = sdp; 279 bio->bi_private = sdp;
@@ -588,8 +589,12 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
588static void gfs2_meta_sync(struct gfs2_glock *gl) 589static void gfs2_meta_sync(struct gfs2_glock *gl)
589{ 590{
590 struct address_space *mapping = gfs2_glock2aspace(gl); 591 struct address_space *mapping = gfs2_glock2aspace(gl);
592 struct gfs2_sbd *sdp = gl->gl_sbd;
591 int error; 593 int error;
592 594
595 if (mapping == NULL)
596 mapping = &sdp->sd_aspace;
597
593 filemap_fdatawrite(mapping); 598 filemap_fdatawrite(mapping);
594 error = filemap_fdatawait(mapping); 599 error = filemap_fdatawait(mapping);
595 600
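
Two independent changes meet in this file: the 3.14 immutable-biovec work moved a bio's starting sector into the embedded struct bvec_iter (hence bio->bi_iter.bi_sector), and gfs2_meta_sync now falls back to the superblock-wide sd_aspace mapping for glocks that carry no address space of their own (see the init_sbd hunk in ops_fstype.c below). A sketch of bio setup on the new field layout; log_bio_prep is a hypothetical name, and the shift assumes the kernel's fixed 512-byte sector unit:

    #include <linux/bio.h>
    #include <linux/fs.h>

    static struct bio *log_bio_prep(struct super_block *sb, u64 blkno)
    {
        /* allocations from the fs bio set with GFP_NOFS do not fail */
        struct bio *bio = bio_alloc(GFP_NOFS, 1);

        /* fs blocks -> 512-byte sectors; bi_sector lives inside bi_iter
         * since the immutable-biovec conversion */
        bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
        bio->bi_bdev = sb->s_bdev;
        return bio;
    }
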
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 0650db2541ef..c272e73063de 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void)
76 76
77 gfs2_str2qstr(&gfs2_qdot, "."); 77 gfs2_str2qstr(&gfs2_qdot, ".");
78 gfs2_str2qstr(&gfs2_qdotdot, ".."); 78 gfs2_str2qstr(&gfs2_qdotdot, "..");
79 gfs2_quota_hash_init();
79 80
80 error = gfs2_sys_init(); 81 error = gfs2_sys_init();
81 if (error) 82 if (error)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 52f177be3bf8..c7f24690ed05 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -116,6 +116,9 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
116 unsigned long index; 116 unsigned long index;
117 unsigned int bufnum; 117 unsigned int bufnum;
118 118
119 if (mapping == NULL)
120 mapping = &sdp->sd_aspace;
121
119 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; 122 shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
120 index = blkno >> shift; /* convert block to page */ 123 index = blkno >> shift; /* convert block to page */
121 bufnum = blkno - (index << shift); /* block buf index within page */ 124 bufnum = blkno - (index << shift); /* block buf index within page */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 52fa88314f5c..c6872d09561a 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -36,6 +36,7 @@
36#include "log.h" 36#include "log.h"
37#include "quota.h" 37#include "quota.h"
38#include "dir.h" 38#include "dir.h"
39#include "meta_io.h"
39#include "trace_gfs2.h" 40#include "trace_gfs2.h"
40 41
41#define DO 0 42#define DO 0
@@ -62,6 +63,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
62static struct gfs2_sbd *init_sbd(struct super_block *sb) 63static struct gfs2_sbd *init_sbd(struct super_block *sb)
63{ 64{
64 struct gfs2_sbd *sdp; 65 struct gfs2_sbd *sdp;
66 struct address_space *mapping;
65 67
66 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); 68 sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
67 if (!sdp) 69 if (!sdp)
@@ -97,6 +99,18 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
97 init_waitqueue_head(&sdp->sd_quota_wait); 99 init_waitqueue_head(&sdp->sd_quota_wait);
98 INIT_LIST_HEAD(&sdp->sd_trunc_list); 100 INIT_LIST_HEAD(&sdp->sd_trunc_list);
99 spin_lock_init(&sdp->sd_trunc_lock); 101 spin_lock_init(&sdp->sd_trunc_lock);
102 spin_lock_init(&sdp->sd_bitmap_lock);
103
104 mapping = &sdp->sd_aspace;
105
106 address_space_init_once(mapping);
107 mapping->a_ops = &gfs2_meta_aops;
108 mapping->host = sb->s_bdev->bd_inode;
109 mapping->flags = 0;
110 mapping_set_gfp_mask(mapping, GFP_NOFS);
111 mapping->private_data = NULL;
112 mapping->backing_dev_info = sb->s_bdi;
113 mapping->writeback_index = 0;
100 114
101 spin_lock_init(&sdp->sd_log_lock); 115 spin_lock_init(&sdp->sd_log_lock);
102 atomic_set(&sdp->sd_log_pinned, 0); 116 atomic_set(&sdp->sd_log_pinned, 0);
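
init_sbd now embeds a single shared address_space (sd_aspace) in the superblock data for metadata buffers, which the NULL-mapping fallbacks added in lops.c and meta_io.c pick up. Since there is no inode behind it, every field is filled in by hand; the sequence below mirrors the hunk, using only helpers present on this kernel (backing_dev_info was still an address_space member at the time):

    #include <linux/fs.h>
    #include <linux/pagemap.h>

    static void meta_mapping_init(struct address_space *mapping,
                                  struct super_block *sb,
                                  const struct address_space_operations *aops)
    {
        address_space_init_once(mapping);       /* locks, lists, radix tree */
        mapping->a_ops = aops;
        mapping->host = sb->s_bdev->bd_inode;   /* metadata pages belong to the bdev */
        mapping->flags = 0;
        mapping_set_gfp_mask(mapping, GFP_NOFS); /* no fs recursion on page alloc */
        mapping->private_data = NULL;
        mapping->backing_dev_info = sb->s_bdi;
        mapping->writeback_index = 0;
    }
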
@@ -217,14 +231,14 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
217 231
218 page = alloc_page(GFP_NOFS); 232 page = alloc_page(GFP_NOFS);
219 if (unlikely(!page)) 233 if (unlikely(!page))
220 return -ENOBUFS; 234 return -ENOMEM;
221 235
222 ClearPageUptodate(page); 236 ClearPageUptodate(page);
223 ClearPageDirty(page); 237 ClearPageDirty(page);
224 lock_page(page); 238 lock_page(page);
225 239
226 bio = bio_alloc(GFP_NOFS, 1); 240 bio = bio_alloc(GFP_NOFS, 1);
227 bio->bi_sector = sector * (sb->s_blocksize >> 9); 241 bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
228 bio->bi_bdev = sb->s_bdev; 242 bio->bi_bdev = sb->s_bdev;
229 bio_add_page(bio, page, PAGE_SIZE, 0); 243 bio_add_page(bio, page, PAGE_SIZE, 0);
230 244
@@ -956,40 +970,6 @@ fail:
956 return error; 970 return error;
957} 971}
958 972
959static int init_threads(struct gfs2_sbd *sdp, int undo)
960{
961 struct task_struct *p;
962 int error = 0;
963
964 if (undo)
965 goto fail_quotad;
966
967 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
968 if (IS_ERR(p)) {
969 error = PTR_ERR(p);
970 fs_err(sdp, "can't start logd thread: %d\n", error);
971 return error;
972 }
973 sdp->sd_logd_process = p;
974
975 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
976 if (IS_ERR(p)) {
977 error = PTR_ERR(p);
978 fs_err(sdp, "can't start quotad thread: %d\n", error);
979 goto fail;
980 }
981 sdp->sd_quotad_process = p;
982
983 return 0;
984
985
986fail_quotad:
987 kthread_stop(sdp->sd_quotad_process);
988fail:
989 kthread_stop(sdp->sd_logd_process);
990 return error;
991}
992
993static const match_table_t nolock_tokens = { 973static const match_table_t nolock_tokens = {
994 { Opt_jid, "jid=%d\n", }, 974 { Opt_jid, "jid=%d\n", },
995 { Opt_err, NULL }, 975 { Opt_err, NULL },
@@ -1254,15 +1234,11 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1254 goto fail_per_node; 1234 goto fail_per_node;
1255 } 1235 }
1256 1236
1257 error = init_threads(sdp, DO);
1258 if (error)
1259 goto fail_per_node;
1260
1261 if (!(sb->s_flags & MS_RDONLY)) { 1237 if (!(sb->s_flags & MS_RDONLY)) {
1262 error = gfs2_make_fs_rw(sdp); 1238 error = gfs2_make_fs_rw(sdp);
1263 if (error) { 1239 if (error) {
1264 fs_err(sdp, "can't make FS RW: %d\n", error); 1240 fs_err(sdp, "can't make FS RW: %d\n", error);
1265 goto fail_threads; 1241 goto fail_per_node;
1266 } 1242 }
1267 } 1243 }
1268 1244
@@ -1270,8 +1246,6 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1270 gfs2_online_uevent(sdp); 1246 gfs2_online_uevent(sdp);
1271 return 0; 1247 return 0;
1272 1248
1273fail_threads:
1274 init_threads(sdp, UNDO);
1275fail_per_node: 1249fail_per_node:
1276 init_per_node(sdp, UNDO); 1250 init_per_node(sdp, UNDO);
1277fail_inodes: 1251fail_inodes:
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 98236d0df3ca..8bec0e3192dd 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -52,6 +52,11 @@
52#include <linux/dqblk_xfs.h> 52#include <linux/dqblk_xfs.h>
53#include <linux/lockref.h> 53#include <linux/lockref.h>
54#include <linux/list_lru.h> 54#include <linux/list_lru.h>
55#include <linux/rcupdate.h>
56#include <linux/rculist_bl.h>
57#include <linux/bit_spinlock.h>
58#include <linux/jhash.h>
59#include <linux/vmalloc.h>
55 60
56#include "gfs2.h" 61#include "gfs2.h"
57#include "incore.h" 62#include "incore.h"
@@ -67,16 +72,44 @@
67#include "inode.h" 72#include "inode.h"
68#include "util.h" 73#include "util.h"
69 74
70struct gfs2_quota_change_host { 75#define GFS2_QD_HASH_SHIFT 12
71 u64 qc_change; 76#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT)
72 u32 qc_flags; /* GFS2_QCF_... */ 77#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1)
73 struct kqid qc_id;
74};
75 78
76/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */ 79/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
80/* -> sd_bitmap_lock */
77static DEFINE_SPINLOCK(qd_lock); 81static DEFINE_SPINLOCK(qd_lock);
78struct list_lru gfs2_qd_lru; 82struct list_lru gfs2_qd_lru;
79 83
84static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE];
85
86static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp,
87 const struct kqid qid)
88{
89 unsigned int h;
90
91 h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0);
92 h = jhash(&qid, sizeof(struct kqid), h);
93
94 return h & GFS2_QD_HASH_MASK;
95}
96
97static inline void spin_lock_bucket(unsigned int hash)
98{
99 hlist_bl_lock(&qd_hash_table[hash]);
100}
101
102static inline void spin_unlock_bucket(unsigned int hash)
103{
104 hlist_bl_unlock(&qd_hash_table[hash]);
105}
106
107static void gfs2_qd_dealloc(struct rcu_head *rcu)
108{
109 struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
110 kmem_cache_free(gfs2_quotad_cachep, qd);
111}
112
80static void gfs2_qd_dispose(struct list_head *list) 113static void gfs2_qd_dispose(struct list_head *list)
81{ 114{
82 struct gfs2_quota_data *qd; 115 struct gfs2_quota_data *qd;
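
Quota lookups move from a linear walk of sd_quota_list under qd_lock to a global 4096-bucket RCU hash table. Two details are worth spelling out: the key is the (superblock pointer, kqid) pair, mixed by chaining two jhash() passes, and each hlist_bl_head is a single pointer whose low bit doubles as a per-bucket lock, which is why the spin_lock_bucket() wrappers above are just hlist_bl_lock()/hlist_bl_unlock(). A reduced sketch of lookup and insert under that discipline; struct qitem and its fields are hypothetical stand-ins for gfs2_quota_data:

    #include <linux/jhash.h>
    #include <linux/list_bl.h>
    #include <linux/rculist_bl.h>
    #include <linux/rcupdate.h>

    #define HASH_BITS 12
    static struct hlist_bl_head table[1 << HASH_BITS];

    struct qitem {
        const void *owner;              /* stands in for the sbd pointer */
        u32 id;                         /* stands in for the kqid */
        struct hlist_bl_node node;
    };

    static unsigned int bucket_of(const void *owner, u32 id)
    {
        unsigned int h = jhash(&owner, sizeof(owner), 0); /* hash the pointer value */
        h = jhash(&id, sizeof(id), h);                    /* chain in the id */
        return h & ((1 << HASH_BITS) - 1);
    }

    /* Lookup: caller holds rcu_read_lock(); no bucket lock is taken. */
    static struct qitem *lookup(const void *owner, u32 id)
    {
        struct qitem *q;
        struct hlist_bl_node *pos;
        unsigned int h = bucket_of(owner, id);

        hlist_bl_for_each_entry_rcu(q, pos, &table[h], node)
            if (q->owner == owner && q->id == id)
                return q;
        return NULL;
    }

    /* Insert: the bucket head's low bit is the lock. */
    static void insert(struct qitem *q)
    {
        unsigned int h = bucket_of(q->owner, q->id);

        hlist_bl_lock(&table[h]);
        hlist_bl_add_head_rcu(&q->node, &table[h]);
        hlist_bl_unlock(&table[h]);
    }
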
@@ -93,6 +126,10 @@ static void gfs2_qd_dispose(struct list_head *list)
93 list_del(&qd->qd_list); 126 list_del(&qd->qd_list);
94 spin_unlock(&qd_lock); 127 spin_unlock(&qd_lock);
95 128
129 spin_lock_bucket(qd->qd_hash);
130 hlist_bl_del_rcu(&qd->qd_hlist);
131 spin_unlock_bucket(qd->qd_hash);
132
96 gfs2_assert_warn(sdp, !qd->qd_change); 133 gfs2_assert_warn(sdp, !qd->qd_change);
97 gfs2_assert_warn(sdp, !qd->qd_slot_count); 134 gfs2_assert_warn(sdp, !qd->qd_slot_count);
98 gfs2_assert_warn(sdp, !qd->qd_bh_count); 135 gfs2_assert_warn(sdp, !qd->qd_bh_count);
@@ -101,7 +138,7 @@ static void gfs2_qd_dispose(struct list_head *list)
101 atomic_dec(&sdp->sd_quota_count); 138 atomic_dec(&sdp->sd_quota_count);
102 139
103 /* Delete it from the common reclaim list */ 140 /* Delete it from the common reclaim list */
104 kmem_cache_free(gfs2_quotad_cachep, qd); 141 call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
105 } 142 }
106} 143}
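
Because lookups now traverse the buckets under rcu_read_lock(), an entry removed with hlist_bl_del_rcu() may still be visible to concurrent readers, so the final kmem_cache_free becomes an RCU callback (gfs2_qd_dealloc above, fired through the new qd_rcu head). The generic pattern, with struct obj as a placeholder:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct obj {
        int payload;
        struct rcu_head rcu;    /* storage for the deferred free */
    };

    static void obj_free_rcu(struct rcu_head *head)
    {
        /* runs after a grace period: no reader can still see the object */
        kfree(container_of(head, struct obj, rcu));
    }

    static void obj_release(struct obj *o)
    {
        /* ...unlink o from every RCU-visible structure first... */
        call_rcu(&o->rcu, obj_free_rcu);
    }
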
107 144
@@ -171,83 +208,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd)
171 return offset; 208 return offset;
172} 209}
173 210
174static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, 211static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid)
175 struct gfs2_quota_data **qdp)
176{ 212{
177 struct gfs2_quota_data *qd; 213 struct gfs2_quota_data *qd;
178 int error; 214 int error;
179 215
180 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); 216 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
181 if (!qd) 217 if (!qd)
182 return -ENOMEM; 218 return NULL;
183 219
220 qd->qd_sbd = sdp;
184 qd->qd_lockref.count = 1; 221 qd->qd_lockref.count = 1;
185 spin_lock_init(&qd->qd_lockref.lock); 222 spin_lock_init(&qd->qd_lockref.lock);
186 qd->qd_id = qid; 223 qd->qd_id = qid;
187 qd->qd_slot = -1; 224 qd->qd_slot = -1;
188 INIT_LIST_HEAD(&qd->qd_lru); 225 INIT_LIST_HEAD(&qd->qd_lru);
226 qd->qd_hash = hash;
189 227
190 error = gfs2_glock_get(sdp, qd2index(qd), 228 error = gfs2_glock_get(sdp, qd2index(qd),
191 &gfs2_quota_glops, CREATE, &qd->qd_gl); 229 &gfs2_quota_glops, CREATE, &qd->qd_gl);
192 if (error) 230 if (error)
193 goto fail; 231 goto fail;
194 232
195 *qdp = qd; 233 return qd;
196
197 return 0;
198 234
199fail: 235fail:
200 kmem_cache_free(gfs2_quotad_cachep, qd); 236 kmem_cache_free(gfs2_quotad_cachep, qd);
201 return error; 237 return NULL;
202} 238}
203 239
204static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, 240static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
205 struct gfs2_quota_data **qdp) 241 const struct gfs2_sbd *sdp,
242 struct kqid qid)
206{ 243{
207 struct gfs2_quota_data *qd = NULL, *new_qd = NULL; 244 struct gfs2_quota_data *qd;
208 int error, found; 245 struct hlist_bl_node *h;
209
210 *qdp = NULL;
211 246
212 for (;;) { 247 hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {
213 found = 0; 248 if (!qid_eq(qd->qd_id, qid))
214 spin_lock(&qd_lock); 249 continue;
215 list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { 250 if (qd->qd_sbd != sdp)
216 if (qid_eq(qd->qd_id, qid) && 251 continue;
217 lockref_get_not_dead(&qd->qd_lockref)) { 252 if (lockref_get_not_dead(&qd->qd_lockref)) {
218 list_lru_del(&gfs2_qd_lru, &qd->qd_lru); 253 list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
219 found = 1; 254 return qd;
220 break;
221 }
222 } 255 }
256 }
223 257
224 if (!found) 258 return NULL;
225 qd = NULL; 259}
226 260
227 if (!qd && new_qd) {
228 qd = new_qd;
229 list_add(&qd->qd_list, &sdp->sd_quota_list);
230 atomic_inc(&sdp->sd_quota_count);
231 new_qd = NULL;
232 }
233 261
234 spin_unlock(&qd_lock); 262static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
263 struct gfs2_quota_data **qdp)
264{
265 struct gfs2_quota_data *qd, *new_qd;
266 unsigned int hash = gfs2_qd_hash(sdp, qid);
235 267
236 if (qd) { 268 rcu_read_lock();
237 if (new_qd) { 269 *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
238 gfs2_glock_put(new_qd->qd_gl); 270 rcu_read_unlock();
239 kmem_cache_free(gfs2_quotad_cachep, new_qd);
240 }
241 *qdp = qd;
242 return 0;
243 }
244 271
245 error = qd_alloc(sdp, qid, &new_qd); 272 if (qd)
246 if (error) 273 return 0;
247 return error; 274
275 new_qd = qd_alloc(hash, sdp, qid);
276 if (!new_qd)
277 return -ENOMEM;
278
279 spin_lock(&qd_lock);
280 spin_lock_bucket(hash);
281 *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
282 if (qd == NULL) {
283 *qdp = new_qd;
284 list_add(&new_qd->qd_list, &sdp->sd_quota_list);
285 hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
286 atomic_inc(&sdp->sd_quota_count);
248 } 287 }
288 spin_unlock_bucket(hash);
289 spin_unlock(&qd_lock);
290
291 if (qd) {
292 gfs2_glock_put(new_qd->qd_gl);
293 kmem_cache_free(gfs2_quotad_cachep, new_qd);
294 }
295
296 return 0;
249} 297}
250 298
299
251static void qd_hold(struct gfs2_quota_data *qd) 300static void qd_hold(struct gfs2_quota_data *qd)
252{ 301{
253 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 302 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
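
The rewritten qd_get follows the standard optimistic-lookup pattern: try an unlocked RCU search, allocate a candidate outside all locks, then repeat the search under the bucket lock and discard the candidate if another CPU inserted first. A condensed skeleton, reusing the hypothetical helpers from the hash-table sketch above; the global qd_lock ordering and the lockref acquisition of the real code are elided:

    static int get_obj(const void *owner, u32 id, struct qitem **out)
    {
        struct qitem *q, *new;
        unsigned int h = bucket_of(owner, id);

        rcu_read_lock();
        *out = q = lookup(owner, id);       /* fast path, lock-free */
        rcu_read_unlock();
        if (q)
            return 0;

        new = alloc_obj(owner, id);         /* may sleep; no locks held */
        if (!new)
            return -ENOMEM;

        hlist_bl_lock(&table[h]);
        /* re-check: the bucket lock excludes concurrent inserters */
        *out = q = lookup(owner, id);
        if (!q) {
            hlist_bl_add_head_rcu(&new->node, &table[h]);
            *out = new;
        }
        hlist_bl_unlock(&table[h]);

        if (q)
            free_obj(new);                  /* lost the race; drop the spare */
        return 0;
    }
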
@@ -268,88 +317,48 @@ static void qd_put(struct gfs2_quota_data *qd)
268 317
269static int slot_get(struct gfs2_quota_data *qd) 318static int slot_get(struct gfs2_quota_data *qd)
270{ 319{
271 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 320 struct gfs2_sbd *sdp = qd->qd_sbd;
272 unsigned int c, o = 0, b; 321 unsigned int bit;
273 unsigned char byte = 0; 322 int error = 0;
274 323
275 spin_lock(&qd_lock); 324 spin_lock(&sdp->sd_bitmap_lock);
325 if (qd->qd_slot_count != 0)
326 goto out;
276 327
277 if (qd->qd_slot_count++) { 328 error = -ENOSPC;
278 spin_unlock(&qd_lock); 329 bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots);
279 return 0; 330 if (bit < sdp->sd_quota_slots) {
331 set_bit(bit, sdp->sd_quota_bitmap);
332 qd->qd_slot = bit;
333out:
334 qd->qd_slot_count++;
280 } 335 }
336 spin_unlock(&sdp->sd_bitmap_lock);
281 337
282 for (c = 0; c < sdp->sd_quota_chunks; c++) 338 return error;
283 for (o = 0; o < PAGE_SIZE; o++) {
284 byte = sdp->sd_quota_bitmap[c][o];
285 if (byte != 0xFF)
286 goto found;
287 }
288
289 goto fail;
290
291found:
292 for (b = 0; b < 8; b++)
293 if (!(byte & (1 << b)))
294 break;
295 qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
296
297 if (qd->qd_slot >= sdp->sd_quota_slots)
298 goto fail;
299
300 sdp->sd_quota_bitmap[c][o] |= 1 << b;
301
302 spin_unlock(&qd_lock);
303
304 return 0;
305
306fail:
307 qd->qd_slot_count--;
308 spin_unlock(&qd_lock);
309 return -ENOSPC;
310} 339}
311 340
312static void slot_hold(struct gfs2_quota_data *qd) 341static void slot_hold(struct gfs2_quota_data *qd)
313{ 342{
314 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 343 struct gfs2_sbd *sdp = qd->qd_sbd;
315 344
316 spin_lock(&qd_lock); 345 spin_lock(&sdp->sd_bitmap_lock);
317 gfs2_assert(sdp, qd->qd_slot_count); 346 gfs2_assert(sdp, qd->qd_slot_count);
318 qd->qd_slot_count++; 347 qd->qd_slot_count++;
319 spin_unlock(&qd_lock); 348 spin_unlock(&sdp->sd_bitmap_lock);
320}
321
322static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
323 unsigned int bit, int new_value)
324{
325 unsigned int c, o, b = bit;
326 int old_value;
327
328 c = b / (8 * PAGE_SIZE);
329 b %= 8 * PAGE_SIZE;
330 o = b / 8;
331 b %= 8;
332
333 old_value = (bitmap[c][o] & (1 << b));
334 gfs2_assert_withdraw(sdp, !old_value != !new_value);
335
336 if (new_value)
337 bitmap[c][o] |= 1 << b;
338 else
339 bitmap[c][o] &= ~(1 << b);
340} 349}
341 350
342static void slot_put(struct gfs2_quota_data *qd) 351static void slot_put(struct gfs2_quota_data *qd)
343{ 352{
344 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 353 struct gfs2_sbd *sdp = qd->qd_sbd;
345 354
346 spin_lock(&qd_lock); 355 spin_lock(&sdp->sd_bitmap_lock);
347 gfs2_assert(sdp, qd->qd_slot_count); 356 gfs2_assert(sdp, qd->qd_slot_count);
348 if (!--qd->qd_slot_count) { 357 if (!--qd->qd_slot_count) {
349 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); 358 BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap));
350 qd->qd_slot = -1; 359 qd->qd_slot = -1;
351 } 360 }
352 spin_unlock(&qd_lock); 361 spin_unlock(&sdp->sd_bitmap_lock);
353} 362}
354 363
355static int bh_get(struct gfs2_quota_data *qd) 364static int bh_get(struct gfs2_quota_data *qd)
@@ -427,8 +436,7 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
427 list_move_tail(&qd->qd_list, &sdp->sd_quota_list); 436 list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
428 set_bit(QDF_LOCKED, &qd->qd_flags); 437 set_bit(QDF_LOCKED, &qd->qd_flags);
429 qd->qd_change_sync = qd->qd_change; 438 qd->qd_change_sync = qd->qd_change;
430 gfs2_assert_warn(sdp, qd->qd_slot_count); 439 slot_hold(qd);
431 qd->qd_slot_count++;
432 return 1; 440 return 1;
433} 441}
434 442
@@ -1214,17 +1222,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
1214 return error; 1222 return error;
1215} 1223}
1216 1224
1217static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf)
1218{
1219 const struct gfs2_quota_change *str = buf;
1220
1221 qc->qc_change = be64_to_cpu(str->qc_change);
1222 qc->qc_flags = be32_to_cpu(str->qc_flags);
1223 qc->qc_id = make_kqid(&init_user_ns,
1224 (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA,
1225 be32_to_cpu(str->qc_id));
1226}
1227
1228int gfs2_quota_init(struct gfs2_sbd *sdp) 1225int gfs2_quota_init(struct gfs2_sbd *sdp)
1229{ 1226{
1230 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1227 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
@@ -1232,6 +1229,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1232 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; 1229 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
1233 unsigned int x, slot = 0; 1230 unsigned int x, slot = 0;
1234 unsigned int found = 0; 1231 unsigned int found = 0;
1232 unsigned int hash;
1233 unsigned int bm_size;
1235 u64 dblock; 1234 u64 dblock;
1236 u32 extlen = 0; 1235 u32 extlen = 0;
1237 int error; 1236 int error;
@@ -1240,23 +1239,20 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1240 return -EIO; 1239 return -EIO;
1241 1240
1242 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; 1241 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1243 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); 1242 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long));
1244 1243 bm_size *= sizeof(unsigned long);
1245 error = -ENOMEM; 1244 error = -ENOMEM;
1246 1245 sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN);
1247 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, 1246 if (sdp->sd_quota_bitmap == NULL)
1248 sizeof(unsigned char *), GFP_NOFS); 1247 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL);
1249 if (!sdp->sd_quota_bitmap) 1248 if (!sdp->sd_quota_bitmap)
1250 return error; 1249 return error;
1251 1250
1252 for (x = 0; x < sdp->sd_quota_chunks; x++) { 1251 memset(sdp->sd_quota_bitmap, 0, bm_size);
1253 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS);
1254 if (!sdp->sd_quota_bitmap[x])
1255 goto fail;
1256 }
1257 1252
1258 for (x = 0; x < blocks; x++) { 1253 for (x = 0; x < blocks; x++) {
1259 struct buffer_head *bh; 1254 struct buffer_head *bh;
1255 const struct gfs2_quota_change *qc;
1260 unsigned int y; 1256 unsigned int y;
1261 1257
1262 if (!extlen) { 1258 if (!extlen) {
@@ -1274,34 +1270,42 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1274 goto fail; 1270 goto fail;
1275 } 1271 }
1276 1272
1273 qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header));
1277 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; 1274 for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
1278 y++, slot++) { 1275 y++, slot++) {
1279 struct gfs2_quota_change_host qc;
1280 struct gfs2_quota_data *qd; 1276 struct gfs2_quota_data *qd;
1281 1277 s64 qc_change = be64_to_cpu(qc->qc_change);
1282 gfs2_quota_change_in(&qc, bh->b_data + 1278 u32 qc_flags = be32_to_cpu(qc->qc_flags);
1283 sizeof(struct gfs2_meta_header) + 1279 enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ?
1284 y * sizeof(struct gfs2_quota_change)); 1280 USRQUOTA : GRPQUOTA;
1285 if (!qc.qc_change) 1281 struct kqid qc_id = make_kqid(&init_user_ns, qtype,
1282 be32_to_cpu(qc->qc_id));
1283 qc++;
1284 if (!qc_change)
1286 continue; 1285 continue;
1287 1286
1288 error = qd_alloc(sdp, qc.qc_id, &qd); 1287 hash = gfs2_qd_hash(sdp, qc_id);
1289 if (error) { 1288 qd = qd_alloc(hash, sdp, qc_id);
1289 if (qd == NULL) {
1290 brelse(bh); 1290 brelse(bh);
1291 goto fail; 1291 goto fail;
1292 } 1292 }
1293 1293
1294 set_bit(QDF_CHANGE, &qd->qd_flags); 1294 set_bit(QDF_CHANGE, &qd->qd_flags);
1295 qd->qd_change = qc.qc_change; 1295 qd->qd_change = qc_change;
1296 qd->qd_slot = slot; 1296 qd->qd_slot = slot;
1297 qd->qd_slot_count = 1; 1297 qd->qd_slot_count = 1;
1298 1298
1299 spin_lock(&qd_lock); 1299 spin_lock(&qd_lock);
1300 gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); 1300 BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap));
1301 list_add(&qd->qd_list, &sdp->sd_quota_list); 1301 list_add(&qd->qd_list, &sdp->sd_quota_list);
1302 atomic_inc(&sdp->sd_quota_count); 1302 atomic_inc(&sdp->sd_quota_count);
1303 spin_unlock(&qd_lock); 1303 spin_unlock(&qd_lock);
1304 1304
1305 spin_lock_bucket(hash);
1306 hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
1307 spin_unlock_bucket(hash);
1308
1305 found++; 1309 found++;
1306 } 1310 }
1307 1311
@@ -1324,44 +1328,28 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1324{ 1328{
1325 struct list_head *head = &sdp->sd_quota_list; 1329 struct list_head *head = &sdp->sd_quota_list;
1326 struct gfs2_quota_data *qd; 1330 struct gfs2_quota_data *qd;
1327 unsigned int x;
1328 1331
1329 spin_lock(&qd_lock); 1332 spin_lock(&qd_lock);
1330 while (!list_empty(head)) { 1333 while (!list_empty(head)) {
1331 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); 1334 qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
1332 1335
1333 /*
1334 * To be removed in due course... we should be able to
1335 * ensure that all refs to the qd have done by this point
1336 * so that this rather odd test is not required
1337 */
1338 spin_lock(&qd->qd_lockref.lock);
1339 if (qd->qd_lockref.count > 1 ||
1340 (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
1341 spin_unlock(&qd->qd_lockref.lock);
1342 list_move(&qd->qd_list, head);
1343 spin_unlock(&qd_lock);
1344 schedule();
1345 spin_lock(&qd_lock);
1346 continue;
1347 }
1348 spin_unlock(&qd->qd_lockref.lock);
1349
1350 list_del(&qd->qd_list); 1336 list_del(&qd->qd_list);
1337
1351 /* Also remove if this qd exists in the reclaim list */ 1338 /* Also remove if this qd exists in the reclaim list */
1352 list_lru_del(&gfs2_qd_lru, &qd->qd_lru); 1339 list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
1353 atomic_dec(&sdp->sd_quota_count); 1340 atomic_dec(&sdp->sd_quota_count);
1354 spin_unlock(&qd_lock); 1341 spin_unlock(&qd_lock);
1355 1342
1356 if (!qd->qd_lockref.count) { 1343 spin_lock_bucket(qd->qd_hash);
1357 gfs2_assert_warn(sdp, !qd->qd_change); 1344 hlist_bl_del_rcu(&qd->qd_hlist);
1358 gfs2_assert_warn(sdp, !qd->qd_slot_count); 1345 spin_unlock_bucket(qd->qd_hash);
1359 } else 1346
1360 gfs2_assert_warn(sdp, qd->qd_slot_count == 1); 1347 gfs2_assert_warn(sdp, !qd->qd_change);
1348 gfs2_assert_warn(sdp, !qd->qd_slot_count);
1361 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1349 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1362 1350
1363 gfs2_glock_put(qd->qd_gl); 1351 gfs2_glock_put(qd->qd_gl);
1364 kmem_cache_free(gfs2_quotad_cachep, qd); 1352 call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
1365 1353
1366 spin_lock(&qd_lock); 1354 spin_lock(&qd_lock);
1367 } 1355 }
@@ -1370,9 +1358,11 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1370 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); 1358 gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
1371 1359
1372 if (sdp->sd_quota_bitmap) { 1360 if (sdp->sd_quota_bitmap) {
1373 for (x = 0; x < sdp->sd_quota_chunks; x++) 1361 if (is_vmalloc_addr(sdp->sd_quota_bitmap))
1374 kfree(sdp->sd_quota_bitmap[x]); 1362 vfree(sdp->sd_quota_bitmap);
1375 kfree(sdp->sd_quota_bitmap); 1363 else
1364 kfree(sdp->sd_quota_bitmap);
1365 sdp->sd_quota_bitmap = NULL;
1376 } 1366 }
1377} 1367}
1378 1368
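
Two details of the new bitmap allocation: bm_size is rounded up to whole unsigned longs, which find_first_zero_bit and friends require, and the buffer is requested with kmalloc plus __GFP_NOWARN first, falling back to __vmalloc when the slot count makes the slab path impractical. The matching free keys off is_vmalloc_addr(); kvfree() did not exist yet on this kernel. The pattern in isolation, under those assumptions:

    #include <linux/bitops.h>
    #include <linux/mm.h>
    #include <linux/slab.h>
    #include <linux/string.h>
    #include <linux/vmalloc.h>

    static unsigned long *bitmap_buf_alloc(unsigned int nbits)
    {
        size_t size = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
        unsigned long *p;

        p = kmalloc(size, GFP_NOFS | __GFP_NOWARN);  /* stay quiet on failure */
        if (!p)
            p = __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
        if (p)
            memset(p, 0, size);
        return p;
    }

    static void bitmap_buf_free(unsigned long *p)
    {
        if (is_vmalloc_addr(p))
            vfree(p);
        else
            kfree(p);
    }
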
@@ -1656,3 +1646,11 @@ const struct quotactl_ops gfs2_quotactl_ops = {
1656 .get_dqblk = gfs2_get_dqblk, 1646 .get_dqblk = gfs2_get_dqblk,
1657 .set_dqblk = gfs2_set_dqblk, 1647 .set_dqblk = gfs2_set_dqblk,
1658}; 1648};
1649
1650void __init gfs2_quota_hash_init(void)
1651{
1652 unsigned i;
1653
1654 for(i = 0; i < GFS2_QD_HASH_SIZE; i++)
1655 INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
1656}
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 96e4f34a03b0..55d506eb3c4a 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
57extern const struct quotactl_ops gfs2_quotactl_ops; 57extern const struct quotactl_ops gfs2_quotactl_ops;
58extern struct shrinker gfs2_qd_shrinker; 58extern struct shrinker gfs2_qd_shrinker;
59extern struct list_lru gfs2_qd_lru; 59extern struct list_lru gfs2_qd_lru;
60extern void __init gfs2_quota_hash_init(void);
60 61
61#endif /* __QUOTA_DOT_H__ */ 62#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c8d6161bd682..a1da21349235 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -57,6 +57,11 @@
57 * 3 = Used (metadata) 57 * 3 = Used (metadata)
58 */ 58 */
59 59
60struct gfs2_extent {
61 struct gfs2_rbm rbm;
62 u32 len;
63};
64
60static const char valid_change[16] = { 65static const char valid_change[16] = {
61 /* current */ 66 /* current */
62 /* n */ 0, 1, 1, 1, 67 /* n */ 0, 1, 1, 1,
@@ -65,8 +70,9 @@ static const char valid_change[16] = {
65 1, 0, 0, 0 70 1, 0, 0, 0
66}; 71};
67 72
68static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, 73static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
69 const struct gfs2_inode *ip, bool nowrap); 74 const struct gfs2_inode *ip, bool nowrap,
75 const struct gfs2_alloc_parms *ap);
70 76
71 77
72/** 78/**
@@ -635,9 +641,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
635 /* return reserved blocks to the rgrp */ 641 /* return reserved blocks to the rgrp */
636 BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); 642 BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
637 rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; 643 rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
644 /* The rgrp extent failure point is likely not to increase;
645 it will only do so if the freed blocks are somehow
646 contiguous with a span of free blocks that follows. Still,
647 it will force the number to be recalculated later. */
648 rgd->rd_extfail_pt += rs->rs_free;
638 rs->rs_free = 0; 649 rs->rs_free = 0;
639 clear_bit(GBF_FULL, &bi->bi_flags); 650 clear_bit(GBF_FULL, &bi->bi_flags);
640 smp_mb__after_clear_bit();
641 } 651 }
642} 652}
643 653
@@ -876,6 +886,7 @@ static int rgd_insert(struct gfs2_rgrpd *rgd)
876static int read_rindex_entry(struct gfs2_inode *ip) 886static int read_rindex_entry(struct gfs2_inode *ip)
877{ 887{
878 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 888 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
889 const unsigned bsize = sdp->sd_sb.sb_bsize;
879 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); 890 loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
880 struct gfs2_rindex buf; 891 struct gfs2_rindex buf;
881 int error; 892 int error;
@@ -913,6 +924,8 @@ static int read_rindex_entry(struct gfs2_inode *ip)
913 goto fail; 924 goto fail;
914 925
915 rgd->rd_gl->gl_object = rgd; 926 rgd->rd_gl->gl_object = rgd;
927 rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
928 rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
916 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; 929 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
917 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 930 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
918 if (rgd->rd_data > sdp->sd_max_rg_data) 931 if (rgd->rd_data > sdp->sd_max_rg_data)
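
read_rindex_entry now records, on each resource group's glock, the byte range that rgrp occupies in the shared metadata mapping (gl_vm.start/end), so invalidation on lock demotion can target just those pages instead of the whole mapping. The arithmetic is plain block-to-byte conversion with an inclusive end:

    u64 start, end;
    const unsigned bsize = sdp->sd_sb.sb_bsize;     /* fs block size in bytes */

    start = rgd->rd_addr * bsize;                   /* first byte of the rgrp */
    end   = start + (rgd->rd_length * bsize) - 1;   /* last byte, inclusive */
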
@@ -1126,6 +1139,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1126 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); 1139 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1127 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); 1140 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1128 rgd->rd_free_clone = rgd->rd_free; 1141 rgd->rd_free_clone = rgd->rd_free;
1142 /* max out the rgrp allocation failure point */
1143 rgd->rd_extfail_pt = rgd->rd_free;
1129 } 1144 }
1130 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { 1145 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1131 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); 1146 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
@@ -1184,7 +1199,7 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1184 1199
1185 if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) 1200 if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1186 return 0; 1201 return 0;
1187 return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object); 1202 return gfs2_rgrp_bh_get(rgd);
1188} 1203}
1189 1204
1190/** 1205/**
@@ -1455,7 +1470,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1455 if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) 1470 if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1456 return; 1471 return;
1457 1472
1458 ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true); 1473 ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap);
1459 if (ret == 0) { 1474 if (ret == 0) {
1460 rs->rs_rbm = rbm; 1475 rs->rs_rbm = rbm;
1461 rs->rs_free = extlen; 1476 rs->rs_free = extlen;
@@ -1520,6 +1535,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1520 * @rbm: The current position in the resource group 1535 * @rbm: The current position in the resource group
1521 * @ip: The inode for which we are searching for blocks 1536 * @ip: The inode for which we are searching for blocks
1522 * @minext: The minimum extent length 1537 * @minext: The minimum extent length
1538 * @maxext: A pointer to the maximum extent structure
1523 * 1539 *
1524 * This checks the current position in the rgrp to see whether there is 1540 * This checks the current position in the rgrp to see whether there is
1525 * a reservation covering this block. If not then this function is a 1541 * a reservation covering this block. If not then this function is a
@@ -1532,7 +1548,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1532 1548
1533static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, 1549static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1534 const struct gfs2_inode *ip, 1550 const struct gfs2_inode *ip,
1535 u32 minext) 1551 u32 minext,
1552 struct gfs2_extent *maxext)
1536{ 1553{
1537 u64 block = gfs2_rbm_to_block(rbm); 1554 u64 block = gfs2_rbm_to_block(rbm);
1538 u32 extlen = 1; 1555 u32 extlen = 1;
@@ -1545,8 +1562,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1545 */ 1562 */
1546 if (minext) { 1563 if (minext) {
1547 extlen = gfs2_free_extlen(rbm, minext); 1564 extlen = gfs2_free_extlen(rbm, minext);
1548 nblock = block + extlen; 1565 if (extlen <= maxext->len)
1549 if (extlen < minext)
1550 goto fail; 1566 goto fail;
1551 } 1567 }
1552 1568
@@ -1555,9 +1571,17 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1555 * and skip if parts of it are already reserved 1571 * and skip if parts of it are already reserved
1556 */ 1572 */
1557 nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); 1573 nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1558 if (nblock == block) 1574 if (nblock == block) {
1559 return 0; 1575 if (!minext || extlen >= minext)
1576 return 0;
1577
1578 if (extlen > maxext->len) {
1579 maxext->len = extlen;
1580 maxext->rbm = *rbm;
1581 }
1560fail: 1582fail:
1583 nblock = block + extlen;
1584 }
1561 ret = gfs2_rbm_from_block(rbm, nblock); 1585 ret = gfs2_rbm_from_block(rbm, nblock);
1562 if (ret < 0) 1586 if (ret < 0)
1563 return ret; 1587 return ret;
@@ -1568,30 +1592,38 @@ fail:
1568 * gfs2_rbm_find - Look for blocks of a particular state 1592 * gfs2_rbm_find - Look for blocks of a particular state
1569 * @rbm: Value/result starting position and final position 1593 * @rbm: Value/result starting position and final position
1570 * @state: The state which we want to find 1594 * @state: The state which we want to find
1571 * @minext: The requested extent length (0 for a single block) 1595 * @minext: Pointer to the requested extent length (NULL for a single block)
1596 * This is updated to be the actual reservation size.
1572 * @ip: If set, check for reservations 1597 * @ip: If set, check for reservations
1573 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping 1598 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1574 * around until we've reached the starting point. 1599 * around until we've reached the starting point.
1600 * @ap: the allocation parameters
1575 * 1601 *
1576 * Side effects: 1602 * Side effects:
1577 * - If looking for free blocks, we set GBF_FULL on each bitmap which 1603 * - If looking for free blocks, we set GBF_FULL on each bitmap which
1578 * has no free blocks in it. 1604 * has no free blocks in it.
1605 * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1606 * has come up short on a free block search.
1579 * 1607 *
1580 * Returns: 0 on success, -ENOSPC if there is no block of the requested state 1608 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1581 */ 1609 */
1582 1610
1583static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, 1611static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1584 const struct gfs2_inode *ip, bool nowrap) 1612 const struct gfs2_inode *ip, bool nowrap,
1613 const struct gfs2_alloc_parms *ap)
1585{ 1614{
1586 struct buffer_head *bh; 1615 struct buffer_head *bh;
1587 int initial_bii; 1616 int initial_bii;
1588 u32 initial_offset; 1617 u32 initial_offset;
1618 int first_bii = rbm->bii;
1619 u32 first_offset = rbm->offset;
1589 u32 offset; 1620 u32 offset;
1590 u8 *buffer; 1621 u8 *buffer;
1591 int n = 0; 1622 int n = 0;
1592 int iters = rbm->rgd->rd_length; 1623 int iters = rbm->rgd->rd_length;
1593 int ret; 1624 int ret;
1594 struct gfs2_bitmap *bi; 1625 struct gfs2_bitmap *bi;
1626 struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1595 1627
1596 /* If we are not starting at the beginning of a bitmap, then we 1628 /* If we are not starting at the beginning of a bitmap, then we
1597 * need to add one to the bitmap count to ensure that we search 1629 * need to add one to the bitmap count to ensure that we search
@@ -1620,7 +1652,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
1620 return 0; 1652 return 0;
1621 1653
1622 initial_bii = rbm->bii; 1654 initial_bii = rbm->bii;
1623 ret = gfs2_reservation_check_and_update(rbm, ip, minext); 1655 ret = gfs2_reservation_check_and_update(rbm, ip,
1656 minext ? *minext : 0,
1657 &maxext);
1624 if (ret == 0) 1658 if (ret == 0)
1625 return 0; 1659 return 0;
1626 if (ret > 0) { 1660 if (ret > 0) {
@@ -1655,6 +1689,24 @@ next_iter:
1655 break; 1689 break;
1656 } 1690 }
1657 1691
1692 if (minext == NULL || state != GFS2_BLKST_FREE)
1693 return -ENOSPC;
1694
1695 /* If the extent was too small, and it's smaller than the smallest
1696 to have failed before, remember for future reference that it's
1697 useless to search this rgrp again for this amount or more. */
1698 if ((first_offset == 0) && (first_bii == 0) &&
1699 (*minext < rbm->rgd->rd_extfail_pt))
1700 rbm->rgd->rd_extfail_pt = *minext;
1701
1702 /* If the maximum extent we found is big enough to fulfill the
1703 minimum requirements, use it anyway. */
1704 if (maxext.len) {
1705 *rbm = maxext.rbm;
1706 *minext = maxext.len;
1707 return 0;
1708 }
1709
1658 return -ENOSPC; 1710 return -ENOSPC;
1659} 1711}
1660 1712
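
The multi-block search now degrades gracefully instead of failing hard. gfs2_reservation_check_and_update remembers the largest free extent it sees (maxext); when the requested minimum was never met, gfs2_rbm_find first records the failed size in rd_extfail_pt (only if the scan covered the rgrp from bit zero, so the result is authoritative), then falls back to the best extent found, shrinking *minext to tell the caller what it actually got. gfs2_inplace_reserve uses rd_extfail_pt to skip rgrps whose recorded failure point is below the allocation target. The tail of the search, condensed; scanned_from_start stands for (first_bii == 0 && first_offset == 0) and rgd for rbm->rgd:

    /* Condensed shape of the gfs2_rbm_find fallback above. */
    if (minext == NULL || state != GFS2_BLKST_FREE)
        return -ENOSPC;                  /* single-block or non-free search */

    if (scanned_from_start && *minext < rgd->rd_extfail_pt)
        rgd->rd_extfail_pt = *minext;    /* this size is now known to fail */

    if (best.len) {                      /* under-sized, but usable */
        *rbm = best.rbm;
        *minext = best.len;              /* caller sees the reduced length */
        return 0;
    }
    return -ENOSPC;
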
@@ -1680,7 +1732,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
1680 1732
1681 while (1) { 1733 while (1) {
1682 down_write(&sdp->sd_log_flush_lock); 1734 down_write(&sdp->sd_log_flush_lock);
1683 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true); 1735 error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1736 true, NULL);
1684 up_write(&sdp->sd_log_flush_lock); 1737 up_write(&sdp->sd_log_flush_lock);
1685 if (error == -ENOSPC) 1738 if (error == -ENOSPC)
1686 break; 1739 break;
@@ -1891,7 +1944,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1891 } 1944 }
1892 1945
1893 /* Skip unusable resource groups */ 1946 /* Skip unusable resource groups */
1894 if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) 1947 if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
1948 GFS2_RDF_ERROR)) ||
1949 (ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
1895 goto skip_rgrp; 1950 goto skip_rgrp;
1896 1951
1897 if (sdp->sd_args.ar_rgrplvb) 1952 if (sdp->sd_args.ar_rgrplvb)
@@ -1911,15 +1966,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1911 return 0; 1966 return 0;
1912 } 1967 }
1913 1968
1914 /* Drop reservation, if we couldn't use reserved rgrp */
1915 if (gfs2_rs_active(rs))
1916 gfs2_rs_deltree(rs);
1917check_rgrp: 1969check_rgrp:
1918 /* Check for unlinked inodes which can be reclaimed */ 1970 /* Check for unlinked inodes which can be reclaimed */
1919 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) 1971 if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
1920 try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, 1972 try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
1921 ip->i_no_addr); 1973 ip->i_no_addr);
1922skip_rgrp: 1974skip_rgrp:
1975 /* Drop reservation, if we couldn't use reserved rgrp */
1976 if (gfs2_rs_active(rs))
1977 gfs2_rs_deltree(rs);
1978
1923 /* Unlock rgrp if required */ 1979 /* Unlock rgrp if required */
1924 if (!rg_locked) 1980 if (!rg_locked)
1925 gfs2_glock_dq_uninit(&rs->rs_rgd_gh); 1981 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
@@ -2064,25 +2120,24 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2064 * 2120 *
2065 */ 2121 */
2066 2122
2067int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) 2123void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
2068{ 2124{
2069 struct gfs2_rgrpd *rgd = gl->gl_object; 2125 struct gfs2_rgrpd *rgd = gl->gl_object;
2070 struct gfs2_blkreserv *trs; 2126 struct gfs2_blkreserv *trs;
2071 const struct rb_node *n; 2127 const struct rb_node *n;
2072 2128
2073 if (rgd == NULL) 2129 if (rgd == NULL)
2074 return 0; 2130 return;
2075 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n", 2131 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2076 (unsigned long long)rgd->rd_addr, rgd->rd_flags, 2132 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2077 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, 2133 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2078 rgd->rd_reserved); 2134 rgd->rd_reserved, rgd->rd_extfail_pt);
2079 spin_lock(&rgd->rd_rsspin); 2135 spin_lock(&rgd->rd_rsspin);
2080 for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { 2136 for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2081 trs = rb_entry(n, struct gfs2_blkreserv, rs_node); 2137 trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2082 dump_rs(seq, trs); 2138 dump_rs(seq, trs);
2083 } 2139 }
2084 spin_unlock(&rgd->rd_rsspin); 2140 spin_unlock(&rgd->rd_rsspin);
2085 return 0;
2086} 2141}
2087 2142
2088static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) 2143static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
@@ -2184,18 +2239,20 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2184 int error; 2239 int error;
2185 2240
2186 gfs2_set_alloc_start(&rbm, ip, dinode); 2241 gfs2_set_alloc_start(&rbm, ip, dinode);
2187 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false); 2242 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL);
2188 2243
2189 if (error == -ENOSPC) { 2244 if (error == -ENOSPC) {
2190 gfs2_set_alloc_start(&rbm, ip, dinode); 2245 gfs2_set_alloc_start(&rbm, ip, dinode);
2191 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false); 2246 error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false,
2247 NULL);
2192 } 2248 }
2193 2249
2194 /* Since all blocks are reserved in advance, this shouldn't happen */ 2250 /* Since all blocks are reserved in advance, this shouldn't happen */
2195 if (error) { 2251 if (error) {
2196 fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n", 2252 fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2197 (unsigned long long)ip->i_no_addr, error, *nblocks, 2253 (unsigned long long)ip->i_no_addr, error, *nblocks,
2198 test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags)); 2254 test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2255 rbm.rgd->rd_extfail_pt);
2199 goto rgrp_error; 2256 goto rgrp_error;
2200 } 2257 }
2201 2258
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3a10d2ffbbe7..463ab2e95d1c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -68,7 +68,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
68extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); 68extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
69extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); 69extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
70extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); 70extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
71extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); 71extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
72extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, 72extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
73 struct buffer_head *bh, 73 struct buffer_head *bh,
74 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); 74 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 35da5b19c0de..60f60f6181f3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -369,6 +369,33 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
369 return 0; 369 return 0;
370} 370}
371 371
372static int init_threads(struct gfs2_sbd *sdp)
373{
374 struct task_struct *p;
375 int error = 0;
376
377 p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
378 if (IS_ERR(p)) {
379 error = PTR_ERR(p);
380 fs_err(sdp, "can't start logd thread: %d\n", error);
381 return error;
382 }
383 sdp->sd_logd_process = p;
384
385 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
386 if (IS_ERR(p)) {
387 error = PTR_ERR(p);
388 fs_err(sdp, "can't start quotad thread: %d\n", error);
389 goto fail;
390 }
391 sdp->sd_quotad_process = p;
392 return 0;
393
394fail:
395 kthread_stop(sdp->sd_logd_process);
396 return error;
397}
398
372/** 399/**
373 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one 400 * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
374 * @sdp: the filesystem 401 * @sdp: the filesystem
@@ -384,10 +411,14 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
384 struct gfs2_log_header_host head; 411 struct gfs2_log_header_host head;
385 int error; 412 int error;
386 413
387 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); 414 error = init_threads(sdp);
388 if (error) 415 if (error)
389 return error; 416 return error;
390 417
418 error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
419 if (error)
420 goto fail_threads;
421
391 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); 422 j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
392 423
393 error = gfs2_find_jhead(sdp->sd_jdesc, &head); 424 error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -417,7 +448,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
417fail: 448fail:
418 t_gh.gh_flags |= GL_NOCACHE; 449 t_gh.gh_flags |= GL_NOCACHE;
419 gfs2_glock_dq_uninit(&t_gh); 450 gfs2_glock_dq_uninit(&t_gh);
420 451fail_threads:
452 kthread_stop(sdp->sd_quotad_process);
453 kthread_stop(sdp->sd_logd_process);
421 return error; 454 return error;
422} 455}
423 456
@@ -800,6 +833,9 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
800 struct gfs2_holder t_gh; 833 struct gfs2_holder t_gh;
801 int error; 834 int error;
802 835
836 kthread_stop(sdp->sd_quotad_process);
837 kthread_stop(sdp->sd_logd_process);
838
803 flush_workqueue(gfs2_delete_workqueue); 839 flush_workqueue(gfs2_delete_workqueue);
804 gfs2_quota_sync(sdp->sd_vfs, 0); 840 gfs2_quota_sync(sdp->sd_vfs, 0);
805 gfs2_statfs_sync(sdp->sd_vfs, 0); 841 gfs2_statfs_sync(sdp->sd_vfs, 0);
@@ -857,9 +893,6 @@ restart:
857 } 893 }
858 spin_unlock(&sdp->sd_jindex_spin); 894 spin_unlock(&sdp->sd_jindex_spin);
859 895
860 kthread_stop(sdp->sd_quotad_process);
861 kthread_stop(sdp->sd_logd_process);
862
863 if (!(sb->s_flags & MS_RDONLY)) { 896 if (!(sb->s_flags & MS_RDONLY)) {
864 error = gfs2_make_fs_ro(sdp); 897 error = gfs2_make_fs_ro(sdp);
865 if (error) 898 if (error)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 8c6a6f6bdba9..0b81f783f787 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -13,6 +13,7 @@
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/xattr.h> 14#include <linux/xattr.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/posix_acl_xattr.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17 18
18#include "gfs2.h" 19#include "gfs2.h"
@@ -1500,7 +1501,8 @@ static const struct xattr_handler gfs2_xattr_security_handler = {
1500const struct xattr_handler *gfs2_xattr_handlers[] = { 1501const struct xattr_handler *gfs2_xattr_handlers[] = {
1501 &gfs2_xattr_user_handler, 1502 &gfs2_xattr_user_handler,
1502 &gfs2_xattr_security_handler, 1503 &gfs2_xattr_security_handler,
1503 &gfs2_xattr_system_handler, 1504 &posix_acl_access_xattr_handler,
1505 &posix_acl_default_xattr_handler,
1504 NULL, 1506 NULL,
1505}; 1507};
1506 1508
diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h
index 07c0d4947527..95c8ed9ec17f 100644
--- a/fs/hfsplus/acl.h
+++ b/fs/hfsplus/acl.h
@@ -12,16 +12,13 @@
12 12
13/* posix_acl.c */ 13/* posix_acl.c */
14struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type); 14struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type);
15extern int hfsplus_posix_acl_chmod(struct inode *); 15int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
16 int type);
16extern int hfsplus_init_posix_acl(struct inode *, struct inode *); 17extern int hfsplus_init_posix_acl(struct inode *, struct inode *);
17 18
18#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */ 19#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */
19#define hfsplus_get_posix_acl NULL 20#define hfsplus_get_posix_acl NULL
20 21#define hfsplus_set_posix_acl NULL
21static inline int hfsplus_posix_acl_chmod(struct inode *inode)
22{
23 return 0;
24}
25 22
26static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) 23static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
27{ 24{
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 968ce411db53..32602c667b4a 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -103,6 +103,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
103 folder = &entry->folder; 103 folder = &entry->folder;
104 memset(folder, 0, sizeof(*folder)); 104 memset(folder, 0, sizeof(*folder));
105 folder->type = cpu_to_be16(HFSPLUS_FOLDER); 105 folder->type = cpu_to_be16(HFSPLUS_FOLDER);
106 if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags))
107 folder->flags |= cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT);
106 folder->id = cpu_to_be32(inode->i_ino); 108 folder->id = cpu_to_be32(inode->i_ino);
107 HFSPLUS_I(inode)->create_date = 109 HFSPLUS_I(inode)->create_date =
108 folder->create_date = 110 folder->create_date =
@@ -203,6 +205,36 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
203 return hfs_brec_find(fd, hfs_find_rec_by_key); 205 return hfs_brec_find(fd, hfs_find_rec_by_key);
204} 206}
205 207
208static void hfsplus_subfolders_inc(struct inode *dir)
209{
210 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
211
212 if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
213 /*
214 * Increment subfolder count. Note, the value is only meaningful
215 * for folders with HFSPLUS_HAS_FOLDER_COUNT flag set.
216 */
217 HFSPLUS_I(dir)->subfolders++;
218 }
219}
220
221static void hfsplus_subfolders_dec(struct inode *dir)
222{
223 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
224
225 if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
226 /*
227 * Decrement subfolder count. Note, the value is only meaningful
228 * for folders with HFSPLUS_HAS_FOLDER_COUNT flag set.
229 *
230 * Check for zero. Some subfolders may have been created
231 * by an implementation ignorant of this counter.
232 */
233 if (HFSPLUS_I(dir)->subfolders)
234 HFSPLUS_I(dir)->subfolders--;
235 }
236}
237
206int hfsplus_create_cat(u32 cnid, struct inode *dir, 238int hfsplus_create_cat(u32 cnid, struct inode *dir,
207 struct qstr *str, struct inode *inode) 239 struct qstr *str, struct inode *inode)
208{ 240{
@@ -247,6 +279,8 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
247 goto err1; 279 goto err1;
248 280
249 dir->i_size++; 281 dir->i_size++;
282 if (S_ISDIR(inode->i_mode))
283 hfsplus_subfolders_inc(dir);
250 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 284 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
251 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); 285 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
252 286
@@ -336,6 +370,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
336 goto out; 370 goto out;
337 371
338 dir->i_size--; 372 dir->i_size--;
373 if (type == HFSPLUS_FOLDER)
374 hfsplus_subfolders_dec(dir);
339 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; 375 dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
340 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); 376 hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
341 377
@@ -380,6 +416,7 @@ int hfsplus_rename_cat(u32 cnid,
380 416
381 hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset, 417 hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset,
382 src_fd.entrylength); 418 src_fd.entrylength);
419 type = be16_to_cpu(entry.type);
383 420
384 /* create new dir entry with the data from the old entry */ 421 /* create new dir entry with the data from the old entry */
385 hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); 422 hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name);
@@ -394,6 +431,8 @@ int hfsplus_rename_cat(u32 cnid,
394 if (err) 431 if (err)
395 goto out; 432 goto out;
396 dst_dir->i_size++; 433 dst_dir->i_size++;
434 if (type == HFSPLUS_FOLDER)
435 hfsplus_subfolders_inc(dst_dir);
397 dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; 436 dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
398 437
399 /* finally remove the old entry */ 438 /* finally remove the old entry */
@@ -405,6 +444,8 @@ int hfsplus_rename_cat(u32 cnid,
405 if (err) 444 if (err)
406 goto out; 445 goto out;
407 src_dir->i_size--; 446 src_dir->i_size--;
447 if (type == HFSPLUS_FOLDER)
448 hfsplus_subfolders_dec(src_dir);
408 src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; 449 src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
409 450
410 /* remove old thread entry */ 451 /* remove old thread entry */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 4a4fea002673..bdec66522de3 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -529,9 +529,10 @@ const struct inode_operations hfsplus_dir_inode_operations = {
529 .setxattr = generic_setxattr, 529 .setxattr = generic_setxattr,
530 .getxattr = generic_getxattr, 530 .getxattr = generic_getxattr,
531 .listxattr = hfsplus_listxattr, 531 .listxattr = hfsplus_listxattr,
532 .removexattr = hfsplus_removexattr, 532 .removexattr = generic_removexattr,
533#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL 533#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
534 .get_acl = hfsplus_get_posix_acl, 534 .get_acl = hfsplus_get_posix_acl,
535 .set_acl = hfsplus_set_posix_acl,
535#endif 536#endif
536}; 537};
537 538
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 08846425b67f..62d571eb69ba 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -242,6 +242,7 @@ struct hfsplus_inode_info {
242 */ 242 */
243 sector_t fs_blocks; 243 sector_t fs_blocks;
244 u8 userflags; /* BSD user file flags */ 244 u8 userflags; /* BSD user file flags */
245 u32 subfolders; /* Subfolder count (HFSX only) */
245 struct list_head open_dir_list; 246 struct list_head open_dir_list;
246 loff_t phys_size; 247 loff_t phys_size;
247 248
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 8ffb3a8ffe75..5a126828d85e 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -261,7 +261,7 @@ struct hfsplus_cat_folder {
261 struct DInfo user_info; 261 struct DInfo user_info;
262 struct DXInfo finder_info; 262 struct DXInfo finder_info;
263 __be32 text_encoding; 263 __be32 text_encoding;
264 u32 reserved; 264 __be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. */
265} __packed; 265} __packed;
266 266
267/* HFS file info (stolen from hfs.h) */ 267/* HFS file info (stolen from hfs.h) */
@@ -301,11 +301,13 @@ struct hfsplus_cat_file {
301 struct hfsplus_fork_raw rsrc_fork; 301 struct hfsplus_fork_raw rsrc_fork;
302} __packed; 302} __packed;
303 303
304/* File attribute bits */ 304/* File and folder flag bits */
305#define HFSPLUS_FILE_LOCKED 0x0001 305#define HFSPLUS_FILE_LOCKED 0x0001
306#define HFSPLUS_FILE_THREAD_EXISTS 0x0002 306#define HFSPLUS_FILE_THREAD_EXISTS 0x0002
307#define HFSPLUS_XATTR_EXISTS 0x0004 307#define HFSPLUS_XATTR_EXISTS 0x0004
308#define HFSPLUS_ACL_EXISTS 0x0008 308#define HFSPLUS_ACL_EXISTS 0x0008
309#define HFSPLUS_HAS_FOLDER_COUNT 0x0010 /* Folder has subfolder count
310 * (HFSX only) */
309 311
310/* HFS+ catalog thread (part of a cat_entry) */ 312/* HFS+ catalog thread (part of a cat_entry) */
311struct hfsplus_cat_thread { 313struct hfsplus_cat_thread {
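
HFSX repurposes the folder record's reserved word as a big-endian subfolder count, and HFSPLUS_HAS_FOLDER_COUNT marks folders that actually maintain it (set at creation time in the catalog.c hunk above); on plain HFS+ the word stays reserved and the in-memory counter is never consulted. Updating any such on-disk __be32 goes through the usual endian helpers, sketched here against the hfsplus_cat_folder layout shown above:

    #include <linux/types.h>
    #include <asm/byteorder.h>

    /* folder points at an hfsplus_cat_folder read from the catalog tree */
    static u32 folder_count_load(const struct hfsplus_cat_folder *folder)
    {
        return be32_to_cpu(folder->subfolders);
    }

    static void folder_count_store(struct hfsplus_cat_folder *folder, u32 count)
    {
        folder->subfolders = cpu_to_be32(count);
    }
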
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 37213d075f3c..a4f45bd88a63 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -178,64 +178,6 @@ const struct dentry_operations hfsplus_dentry_operations = {
178 .d_compare = hfsplus_compare_dentry, 178 .d_compare = hfsplus_compare_dentry,
179}; 179};
180 180
181static struct dentry *hfsplus_file_lookup(struct inode *dir,
182 struct dentry *dentry, unsigned int flags)
183{
184 struct hfs_find_data fd;
185 struct super_block *sb = dir->i_sb;
186 struct inode *inode = NULL;
187 struct hfsplus_inode_info *hip;
188 int err;
189
190 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
191 goto out;
192
193 inode = HFSPLUS_I(dir)->rsrc_inode;
194 if (inode)
195 goto out;
196
197 inode = new_inode(sb);
198 if (!inode)
199 return ERR_PTR(-ENOMEM);
200
201 hip = HFSPLUS_I(inode);
202 inode->i_ino = dir->i_ino;
203 INIT_LIST_HEAD(&hip->open_dir_list);
204 mutex_init(&hip->extents_lock);
205 hip->extent_state = 0;
206 hip->flags = 0;
207 hip->userflags = 0;
208 set_bit(HFSPLUS_I_RSRC, &hip->flags);
209
210 err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
211 if (!err) {
212 err = hfsplus_find_cat(sb, dir->i_ino, &fd);
213 if (!err)
214 err = hfsplus_cat_read_inode(inode, &fd);
215 hfs_find_exit(&fd);
216 }
217 if (err) {
218 iput(inode);
219 return ERR_PTR(err);
220 }
221 hip->rsrc_inode = dir;
222 HFSPLUS_I(dir)->rsrc_inode = inode;
223 igrab(dir);
224
225 /*
226 * __mark_inode_dirty expects inodes to be hashed. Since we don't
227 * want resource fork inodes in the regular inode space, we make them
228 * appear hashed, but do not put on any lists. hlist_del()
229 * will work fine and require no locking.
230 */
231 hlist_add_fake(&inode->i_hash);
232
233 mark_inode_dirty(inode);
234out:
235 d_add(dentry, inode);
236 return NULL;
237}
238
239static void hfsplus_get_perms(struct inode *inode, 181static void hfsplus_get_perms(struct inode *inode,
240 struct hfsplus_perm *perms, int dir) 182 struct hfsplus_perm *perms, int dir)
241{ 183{
@@ -319,7 +261,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
319 mark_inode_dirty(inode); 261 mark_inode_dirty(inode);
320 262
321 if (attr->ia_valid & ATTR_MODE) { 263 if (attr->ia_valid & ATTR_MODE) {
322 error = hfsplus_posix_acl_chmod(inode); 264 error = posix_acl_chmod(inode, inode->i_mode);
323 if (unlikely(error)) 265 if (unlikely(error))
324 return error; 266 return error;
325 } 267 }
@@ -385,14 +327,14 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
385} 327}
386 328
387static const struct inode_operations hfsplus_file_inode_operations = { 329static const struct inode_operations hfsplus_file_inode_operations = {
388 .lookup = hfsplus_file_lookup,
389 .setattr = hfsplus_setattr, 330 .setattr = hfsplus_setattr,
390 .setxattr = generic_setxattr, 331 .setxattr = generic_setxattr,
391 .getxattr = generic_getxattr, 332 .getxattr = generic_getxattr,
392 .listxattr = hfsplus_listxattr, 333 .listxattr = hfsplus_listxattr,
393 .removexattr = hfsplus_removexattr, 334 .removexattr = generic_removexattr,
394#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL 335#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
395 .get_acl = hfsplus_get_posix_acl, 336 .get_acl = hfsplus_get_posix_acl,
337 .set_acl = hfsplus_set_posix_acl,
396#endif 338#endif
397}; 339};
398 340
@@ -433,6 +375,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
433 hip->extent_state = 0; 375 hip->extent_state = 0;
434 hip->flags = 0; 376 hip->flags = 0;
435 hip->userflags = 0; 377 hip->userflags = 0;
378 hip->subfolders = 0;
436 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); 379 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
437 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); 380 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
438 hip->alloc_blocks = 0; 381 hip->alloc_blocks = 0;
@@ -552,6 +495,10 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
552 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); 495 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
553 HFSPLUS_I(inode)->create_date = folder->create_date; 496 HFSPLUS_I(inode)->create_date = folder->create_date;
554 HFSPLUS_I(inode)->fs_blocks = 0; 497 HFSPLUS_I(inode)->fs_blocks = 0;
498 if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
499 HFSPLUS_I(inode)->subfolders =
500 be32_to_cpu(folder->subfolders);
501 }
555 inode->i_op = &hfsplus_dir_inode_operations; 502 inode->i_op = &hfsplus_dir_inode_operations;
556 inode->i_fop = &hfsplus_dir_operations; 503 inode->i_fop = &hfsplus_dir_operations;
557 } else if (type == HFSPLUS_FILE) { 504 } else if (type == HFSPLUS_FILE) {
@@ -624,6 +571,10 @@ int hfsplus_cat_write_inode(struct inode *inode)
624 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); 571 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
625 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 572 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
626 folder->valence = cpu_to_be32(inode->i_size - 2); 573 folder->valence = cpu_to_be32(inode->i_size - 2);
574 if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
575 folder->subfolders =
576 cpu_to_be32(HFSPLUS_I(inode)->subfolders);
577 }
627 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, 578 hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
628 sizeof(struct hfsplus_cat_folder)); 579 sizeof(struct hfsplus_cat_folder));
629 } else if (HFSPLUS_IS_RSRC(inode)) { 580 } else if (HFSPLUS_IS_RSRC(inode)) {
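
The hfsplus hunks above only read and write the on-disk subfolders field when the HFSPLUS_HAS_FOLDER_COUNT flag is set, and they test the flag by byte-swapping the constant rather than the disk value: folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT). Below is a minimal userspace analogue of that idiom, assuming glibc's <endian.h>; htobe16()/be32toh() stand in for the kernel's cpu_to_be16()/be32_to_cpu(), and struct cat_folder is a cut-down hypothetical stand-in for hfsplus_cat_folder.

/*
 * Userspace analogue of the flag test used above. Testing the flag as
 * (disk_flags & htobe16(FLAG)) swaps the constant once, at compile
 * time, instead of swapping the on-disk value on every check.
 */
#define _DEFAULT_SOURCE
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

#define HFSPLUS_HAS_FOLDER_COUNT 0x0010

struct cat_folder {            /* cut-down stand-in for hfsplus_cat_folder */
	uint16_t flags;        /* big-endian on disk */
	uint32_t subfolders;   /* big-endian on disk */
};

int main(void)
{
	struct cat_folder folder = {
		.flags = htobe16(HFSPLUS_HAS_FOLDER_COUNT),
		.subfolders = htobe32(42),
	};
	uint32_t subfolders = 0;

	if (folder.flags & htobe16(HFSPLUS_HAS_FOLDER_COUNT))
		subfolders = be32toh(folder.subfolders);

	printf("subfolders = %u\n", subfolders);   /* prints 42 */
	return 0;
}
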
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 968eab5bc1f5..68537e8b7a09 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -75,7 +75,7 @@ int hfsplus_parse_options_remount(char *input, int *force)
75 int token; 75 int token;
76 76
77 if (!input) 77 if (!input)
78 return 0; 78 return 1;
79 79
80 while ((p = strsep(&input, ",")) != NULL) { 80 while ((p = strsep(&input, ",")) != NULL) {
81 if (!*p) 81 if (!*p)
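
The one-line options.c fix above restores this parser's convention that nonzero means success: a NULL remount string has nothing to parse and must not be reported as a failure. A minimal userspace sketch of the same strsep()-based comma walk follows; the "force" token mirrors the hfsplus mount option of that name, but the handler body here is illustrative only.

/*
 * Minimal userspace sketch of the strsep() option walk used by
 * hfsplus_parse_options_remount(). Returns 1 on success, 0 on error,
 * so an empty option string counts as success.
 */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

static int parse_options(char *input, int *force)
{
	char *p;

	if (!input)
		return 1;	/* nothing to parse is not an error */

	while ((p = strsep(&input, ",")) != NULL) {
		if (!*p)
			continue;
		if (strcmp(p, "force") == 0)
			*force = 1;
		/* unknown tokens are ignored in this sketch */
	}
	return 1;
}

int main(void)
{
	char opts[] = "ro,force";
	int force = 0;

	parse_options(opts, &force);
	printf("force = %d\n", force);	/* prints 1 */
	return 0;
}
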
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
index b609cc14c72e..df0c9af68d05 100644
--- a/fs/hfsplus/posix_acl.c
+++ b/fs/hfsplus/posix_acl.c
@@ -17,9 +17,7 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
17 char *value = NULL; 17 char *value = NULL;
18 ssize_t size; 18 ssize_t size;
19 19
20 acl = get_cached_acl(inode, type); 20 hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
21 if (acl != ACL_NOT_CACHED)
22 return acl;
23 21
24 switch (type) { 22 switch (type) {
25 case ACL_TYPE_ACCESS: 23 case ACL_TYPE_ACCESS:
@@ -56,17 +54,15 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
56 return acl; 54 return acl;
57} 55}
58 56
59static int hfsplus_set_posix_acl(struct inode *inode, 57int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
60 int type, 58 int type)
61 struct posix_acl *acl)
62{ 59{
63 int err; 60 int err;
64 char *xattr_name; 61 char *xattr_name;
65 size_t size = 0; 62 size_t size = 0;
66 char *value = NULL; 63 char *value = NULL;
67 64
68 if (S_ISLNK(inode->i_mode)) 65 hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
69 return -EOPNOTSUPP;
70 66
71 switch (type) { 67 switch (type) {
72 case ACL_TYPE_ACCESS: 68 case ACL_TYPE_ACCESS:
@@ -115,7 +111,7 @@ end_set_acl:
115int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) 111int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
116{ 112{
117 int err = 0; 113 int err = 0;
118 struct posix_acl *acl = NULL; 114 struct posix_acl *default_acl, *acl;
119 115
120 hfs_dbg(ACL_MOD, 116 hfs_dbg(ACL_MOD,
121 "[%s]: ino %lu, dir->ino %lu\n", 117 "[%s]: ino %lu, dir->ino %lu\n",
@@ -124,151 +120,21 @@ int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
124 if (S_ISLNK(inode->i_mode)) 120 if (S_ISLNK(inode->i_mode))
125 return 0; 121 return 0;
126 122
127 acl = hfsplus_get_posix_acl(dir, ACL_TYPE_DEFAULT); 123 err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
128 if (IS_ERR(acl)) 124 if (err)
129 return PTR_ERR(acl);
130
131 if (acl) {
132 if (S_ISDIR(inode->i_mode)) {
133 err = hfsplus_set_posix_acl(inode,
134 ACL_TYPE_DEFAULT,
135 acl);
136 if (unlikely(err))
137 goto init_acl_cleanup;
138 }
139
140 err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
141 if (unlikely(err < 0))
142 return err;
143
144 if (err > 0)
145 err = hfsplus_set_posix_acl(inode,
146 ACL_TYPE_ACCESS,
147 acl);
148 } else
149 inode->i_mode &= ~current_umask();
150
151init_acl_cleanup:
152 posix_acl_release(acl);
153 return err;
154}
155
156int hfsplus_posix_acl_chmod(struct inode *inode)
157{
158 int err;
159 struct posix_acl *acl;
160
161 hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
162
163 if (S_ISLNK(inode->i_mode))
164 return -EOPNOTSUPP;
165
166 acl = hfsplus_get_posix_acl(inode, ACL_TYPE_ACCESS);
167 if (IS_ERR(acl) || !acl)
168 return PTR_ERR(acl);
169
170 err = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
171 if (unlikely(err))
172 return err; 125 return err;
173 126
174 err = hfsplus_set_posix_acl(inode, ACL_TYPE_ACCESS, acl); 127 if (default_acl) {
175 posix_acl_release(acl); 128 err = hfsplus_set_posix_acl(inode, default_acl,
176 return err; 129 ACL_TYPE_DEFAULT);
177} 130 posix_acl_release(default_acl);
178
179static int hfsplus_xattr_get_posix_acl(struct dentry *dentry,
180 const char *name,
181 void *buffer,
182 size_t size,
183 int type)
184{
185 int err = 0;
186 struct posix_acl *acl;
187
188 hfs_dbg(ACL_MOD,
189 "[%s]: ino %lu, buffer %p, size %zu, type %#x\n",
190 __func__, dentry->d_inode->i_ino, buffer, size, type);
191
192 if (strcmp(name, "") != 0)
193 return -EINVAL;
194
195 acl = hfsplus_get_posix_acl(dentry->d_inode, type);
196 if (IS_ERR(acl))
197 return PTR_ERR(acl);
198 if (acl == NULL)
199 return -ENODATA;
200
201 err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
202 posix_acl_release(acl);
203
204 return err;
205}
206
207static int hfsplus_xattr_set_posix_acl(struct dentry *dentry,
208 const char *name,
209 const void *value,
210 size_t size,
211 int flags,
212 int type)
213{
214 int err = 0;
215 struct inode *inode = dentry->d_inode;
216 struct posix_acl *acl = NULL;
217
218 hfs_dbg(ACL_MOD,
219 "[%s]: ino %lu, value %p, size %zu, flags %#x, type %#x\n",
220 __func__, inode->i_ino, value, size, flags, type);
221
222 if (strcmp(name, "") != 0)
223 return -EINVAL;
224
225 if (!inode_owner_or_capable(inode))
226 return -EPERM;
227
228 if (value) {
229 acl = posix_acl_from_xattr(&init_user_ns, value, size);
230 if (IS_ERR(acl))
231 return PTR_ERR(acl);
232 else if (acl) {
233 err = posix_acl_valid(acl);
234 if (err)
235 goto end_xattr_set_acl;
236 }
237 } 131 }
238 132
239 err = hfsplus_set_posix_acl(inode, type, acl); 133 if (acl) {
240 134 if (!err)
241end_xattr_set_acl: 135 err = hfsplus_set_posix_acl(inode, acl,
242 posix_acl_release(acl); 136 ACL_TYPE_ACCESS);
137 posix_acl_release(acl);
138 }
243 return err; 139 return err;
244} 140}
245
246static size_t hfsplus_xattr_list_posix_acl(struct dentry *dentry,
247 char *list,
248 size_t list_size,
249 const char *name,
250 size_t name_len,
251 int type)
252{
253 /*
254 * This method is not used.
255 * It is used hfsplus_listxattr() instead of generic_listxattr().
256 */
257 return -EOPNOTSUPP;
258}
259
260const struct xattr_handler hfsplus_xattr_acl_access_handler = {
261 .prefix = POSIX_ACL_XATTR_ACCESS,
262 .flags = ACL_TYPE_ACCESS,
263 .list = hfsplus_xattr_list_posix_acl,
264 .get = hfsplus_xattr_get_posix_acl,
265 .set = hfsplus_xattr_set_posix_acl,
266};
267
268const struct xattr_handler hfsplus_xattr_acl_default_handler = {
269 .prefix = POSIX_ACL_XATTR_DEFAULT,
270 .flags = ACL_TYPE_DEFAULT,
271 .list = hfsplus_xattr_list_posix_acl,
272 .get = hfsplus_xattr_get_posix_acl,
273 .set = hfsplus_xattr_set_posix_acl,
274};
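
The posix_acl.c rewrite above moves hfsplus onto the generic POSIX ACL helpers of this kernel generation: posix_acl_create() now adjusts the inode mode itself and hands back both a default and an access ACL, so the filesystem only stores whichever is non-NULL. A sketch of that init pattern follows; kernel context is assumed and myfs_set_posix_acl() is a hypothetical stand-in for the filesystem's own setter.

/*
 * Sketch of the v3.14-era ACL init pattern adopted above; kernel
 * context assumed, myfs_set_posix_acl() is hypothetical.
 */
int myfs_init_acl(struct inode *inode, struct inode *dir)
{
	struct posix_acl *default_acl, *acl;
	int err;

	/* Computes inode->i_mode and returns the ACLs to store (or NULL). */
	err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
	if (err)
		return err;

	if (default_acl) {
		err = myfs_set_posix_acl(inode, default_acl, ACL_TYPE_DEFAULT);
		posix_acl_release(default_acl);
	}
	if (acl) {
		if (!err)
			err = myfs_set_posix_acl(inode, acl, ACL_TYPE_ACCESS);
		posix_acl_release(acl);
	}
	return err;
}
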
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index e9a97a0d4314..3f999649587f 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -63,7 +63,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
63 sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); 63 sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
64 64
65 bio = bio_alloc(GFP_NOIO, 1); 65 bio = bio_alloc(GFP_NOIO, 1);
66 bio->bi_sector = sector; 66 bio->bi_iter.bi_sector = sector;
67 bio->bi_bdev = sb->s_bdev; 67 bio->bi_bdev = sb->s_bdev;
68 68
69 if (!(rw & WRITE) && data) 69 if (!(rw & WRITE) && data)
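
The wrapper.c one-liner is part of the v3.14 immutable-biovec conversion: the submission sector moved from bio->bi_sector into the iterator, bio->bi_iter.bi_sector. A kernel-context sketch of single-sector I/O under that layout follows; error handling is trimmed and the function is illustrative, not taken from this patch.

/*
 * Kernel-context sketch of one-sector I/O under the v3.14 bio layout
 * converted to above; bio_add_page()'s return is ignored for brevity.
 */
static int read_one_sector(struct super_block *sb, sector_t sector,
			   struct page *page)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int err;

	if (!bio)
		return -ENOMEM;
	bio->bi_iter.bi_sector = sector;   /* was bio->bi_sector pre-3.14 */
	bio->bi_bdev = sb->s_bdev;
	bio_add_page(bio, page, 512, 0);
	err = submit_bio_wait(READ, bio);  /* v3.14 signature: (rw, bio) */
	bio_put(bio);
	return err;
}
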
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 3c6136f98c73..4e27edc082a4 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -7,16 +7,19 @@
7 */ 7 */
8 8
9#include "hfsplus_fs.h" 9#include "hfsplus_fs.h"
10#include <linux/posix_acl_xattr.h>
10#include "xattr.h" 11#include "xattr.h"
11#include "acl.h" 12#include "acl.h"
12 13
14static int hfsplus_removexattr(struct inode *inode, const char *name);
15
13const struct xattr_handler *hfsplus_xattr_handlers[] = { 16const struct xattr_handler *hfsplus_xattr_handlers[] = {
14 &hfsplus_xattr_osx_handler, 17 &hfsplus_xattr_osx_handler,
15 &hfsplus_xattr_user_handler, 18 &hfsplus_xattr_user_handler,
16 &hfsplus_xattr_trusted_handler, 19 &hfsplus_xattr_trusted_handler,
17#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL 20#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
18 &hfsplus_xattr_acl_access_handler, 21 &posix_acl_access_xattr_handler,
19 &hfsplus_xattr_acl_default_handler, 22 &posix_acl_default_xattr_handler,
20#endif 23#endif
21 &hfsplus_xattr_security_handler, 24 &hfsplus_xattr_security_handler,
22 NULL 25 NULL
@@ -51,82 +54,6 @@ static inline int is_known_namespace(const char *name)
51 return true; 54 return true;
52} 55}
53 56
54static int can_set_system_xattr(struct inode *inode, const char *name,
55 const void *value, size_t size)
56{
57#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
58 struct posix_acl *acl;
59 int err;
60
61 if (!inode_owner_or_capable(inode))
62 return -EPERM;
63
64 /*
65 * POSIX_ACL_XATTR_ACCESS is tied to i_mode
66 */
67 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) {
68 acl = posix_acl_from_xattr(&init_user_ns, value, size);
69 if (IS_ERR(acl))
70 return PTR_ERR(acl);
71 if (acl) {
72 err = posix_acl_equiv_mode(acl, &inode->i_mode);
73 posix_acl_release(acl);
74 if (err < 0)
75 return err;
76 mark_inode_dirty(inode);
77 }
78 /*
79 * We're changing the ACL. Get rid of the cached one
80 */
81 forget_cached_acl(inode, ACL_TYPE_ACCESS);
82
83 return 0;
84 } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
85 acl = posix_acl_from_xattr(&init_user_ns, value, size);
86 if (IS_ERR(acl))
87 return PTR_ERR(acl);
88 posix_acl_release(acl);
89
90 /*
91 * We're changing the default ACL. Get rid of the cached one
92 */
93 forget_cached_acl(inode, ACL_TYPE_DEFAULT);
94
95 return 0;
96 }
97#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */
98 return -EOPNOTSUPP;
99}
100
101static int can_set_xattr(struct inode *inode, const char *name,
102 const void *value, size_t value_len)
103{
104 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
105 return can_set_system_xattr(inode, name, value, value_len);
106
107 if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) {
108 /*
109 * This makes sure that we aren't trying to set an
110 * attribute in a different namespace by prefixing it
111 * with "osx."
112 */
113 if (is_known_namespace(name + XATTR_MAC_OSX_PREFIX_LEN))
114 return -EOPNOTSUPP;
115
116 return 0;
117 }
118
119 /*
120 * Don't allow setting an attribute in an unknown namespace.
121 */
122 if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
123 strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
124 strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
125 return -EOPNOTSUPP;
126
127 return 0;
128}
129
130static void hfsplus_init_header_node(struct inode *attr_file, 57static void hfsplus_init_header_node(struct inode *attr_file,
131 u32 clump_size, 58 u32 clump_size,
132 char *buf, u16 node_size) 59 char *buf, u16 node_size)
@@ -349,18 +276,8 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
349 HFSPLUS_IS_RSRC(inode)) 276 HFSPLUS_IS_RSRC(inode))
350 return -EOPNOTSUPP; 277 return -EOPNOTSUPP;
351 278
352 err = can_set_xattr(inode, name, value, size); 279 if (value == NULL)
353 if (err) 280 return hfsplus_removexattr(inode, name);
354 return err;
355
356 if (strncmp(name, XATTR_MAC_OSX_PREFIX,
357 XATTR_MAC_OSX_PREFIX_LEN) == 0)
358 name += XATTR_MAC_OSX_PREFIX_LEN;
359
360 if (value == NULL) {
361 value = "";
362 size = 0;
363 }
364 281
365 err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); 282 err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd);
366 if (err) { 283 if (err) {
@@ -478,16 +395,11 @@ end_setxattr:
478 return err; 395 return err;
479} 396}
480 397
481static inline int is_osx_xattr(const char *xattr_name)
482{
483 return !is_known_namespace(xattr_name);
484}
485
486static int name_len(const char *xattr_name, int xattr_name_len) 398static int name_len(const char *xattr_name, int xattr_name_len)
487{ 399{
488 int len = xattr_name_len + 1; 400 int len = xattr_name_len + 1;
489 401
490 if (is_osx_xattr(xattr_name)) 402 if (!is_known_namespace(xattr_name))
491 len += XATTR_MAC_OSX_PREFIX_LEN; 403 len += XATTR_MAC_OSX_PREFIX_LEN;
492 404
493 return len; 405 return len;
@@ -498,7 +410,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len)
498 int len = name_len; 410 int len = name_len;
499 int offset = 0; 411 int offset = 0;
500 412
501 if (is_osx_xattr(xattr_name)) { 413 if (!is_known_namespace(xattr_name)) {
502 strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN); 414 strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN);
503 offset += XATTR_MAC_OSX_PREFIX_LEN; 415 offset += XATTR_MAC_OSX_PREFIX_LEN;
504 len += XATTR_MAC_OSX_PREFIX_LEN; 416 len += XATTR_MAC_OSX_PREFIX_LEN;
@@ -576,18 +488,6 @@ ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
576 HFSPLUS_IS_RSRC(inode)) 488 HFSPLUS_IS_RSRC(inode))
577 return -EOPNOTSUPP; 489 return -EOPNOTSUPP;
578 490
579 if (strncmp(name, XATTR_MAC_OSX_PREFIX,
580 XATTR_MAC_OSX_PREFIX_LEN) == 0) {
581 /* skip "osx." prefix */
582 name += XATTR_MAC_OSX_PREFIX_LEN;
583 /*
584 * Don't allow retrieving properly prefixed attributes
585 * by prepending them with "osx."
586 */
587 if (is_known_namespace(name))
588 return -EOPNOTSUPP;
589 }
590
591 if (!strcmp_xattr_finder_info(name)) 491 if (!strcmp_xattr_finder_info(name))
592 return hfsplus_getxattr_finder_info(inode, value, size); 492 return hfsplus_getxattr_finder_info(inode, value, size);
593 493
@@ -822,32 +722,18 @@ end_listxattr:
822 return res; 722 return res;
823} 723}
824 724
825int hfsplus_removexattr(struct dentry *dentry, const char *name) 725static int hfsplus_removexattr(struct inode *inode, const char *name)
826{ 726{
827 int err = 0; 727 int err = 0;
828 struct inode *inode = dentry->d_inode;
829 struct hfs_find_data cat_fd; 728 struct hfs_find_data cat_fd;
830 u16 flags; 729 u16 flags;
831 u16 cat_entry_type; 730 u16 cat_entry_type;
832 int is_xattr_acl_deleted = 0; 731 int is_xattr_acl_deleted = 0;
833 int is_all_xattrs_deleted = 0; 732 int is_all_xattrs_deleted = 0;
834 733
835 if ((!S_ISREG(inode->i_mode) &&
836 !S_ISDIR(inode->i_mode)) ||
837 HFSPLUS_IS_RSRC(inode))
838 return -EOPNOTSUPP;
839
840 if (!HFSPLUS_SB(inode->i_sb)->attr_tree) 734 if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
841 return -EOPNOTSUPP; 735 return -EOPNOTSUPP;
842 736
843 err = can_set_xattr(inode, name, NULL, 0);
844 if (err)
845 return err;
846
847 if (strncmp(name, XATTR_MAC_OSX_PREFIX,
848 XATTR_MAC_OSX_PREFIX_LEN) == 0)
849 name += XATTR_MAC_OSX_PREFIX_LEN;
850
851 if (!strcmp_xattr_finder_info(name)) 737 if (!strcmp_xattr_finder_info(name))
852 return -EOPNOTSUPP; 738 return -EOPNOTSUPP;
853 739
@@ -921,8 +807,12 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
921 if (len > HFSPLUS_ATTR_MAX_STRLEN) 807 if (len > HFSPLUS_ATTR_MAX_STRLEN)
922 return -EOPNOTSUPP; 808 return -EOPNOTSUPP;
923 809
924 strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); 810 /*
925 strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); 811 * Don't allow retrieving properly prefixed attributes
812 * by prepending them with "osx."
813 */
814 if (is_known_namespace(name))
815 return -EOPNOTSUPP;
926 816
927 return hfsplus_getxattr(dentry, xattr_name, buffer, size); 817 return hfsplus_getxattr(dentry, xattr_name, buffer, size);
928} 818}
@@ -940,8 +830,12 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
940 if (len > HFSPLUS_ATTR_MAX_STRLEN) 830 if (len > HFSPLUS_ATTR_MAX_STRLEN)
941 return -EOPNOTSUPP; 831 return -EOPNOTSUPP;
942 832
943 strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); 833 /*
944 strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); 834 * Don't allow setting properly prefixed attributes
835 * by prepending them with "osx."
836 */
837 if (is_known_namespace(name))
838 return -EOPNOTSUPP;
945 839
946 return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); 840 return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
947} 841}
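
With can_set_xattr() gone, both hfsplus_osx_getxattr() and hfsplus_osx_setxattr() now carry the namespace guard themselves: a name arriving through the "osx." handler is rejected if the remainder already names a known namespace, so "osx.user.foo" can no longer alias "user.foo". A runnable userspace analogue of the guard follows; the prefix list mirrors the kernel's standard xattr namespaces and -1 stands in for -EOPNOTSUPP.

/*
 * Userspace analogue of the namespace guard added above: an "osx."
 * request is refused when the remainder already carries a known
 * prefix, so properly prefixed attributes cannot be smuggled in twice.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool is_known_namespace(const char *name)
{
	return !strncmp(name, "system.", 7) ||
	       !strncmp(name, "user.", 5) ||
	       !strncmp(name, "security.", 9) ||
	       !strncmp(name, "trusted.", 8);
}

static int osx_xattr_check(const char *name)
{
	return is_known_namespace(name) ? -1 /* -EOPNOTSUPP */ : 0;
}

int main(void)
{
	printf("%d\n", osx_xattr_check("com.apple.FinderInfo")); /* 0 */
	printf("%d\n", osx_xattr_check("user.foo"));             /* -1 */
	return 0;
}
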
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index 841b5698c0fc..288530cf80b5 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -14,8 +14,6 @@
14extern const struct xattr_handler hfsplus_xattr_osx_handler; 14extern const struct xattr_handler hfsplus_xattr_osx_handler;
15extern const struct xattr_handler hfsplus_xattr_user_handler; 15extern const struct xattr_handler hfsplus_xattr_user_handler;
16extern const struct xattr_handler hfsplus_xattr_trusted_handler; 16extern const struct xattr_handler hfsplus_xattr_trusted_handler;
17extern const struct xattr_handler hfsplus_xattr_acl_access_handler;
18extern const struct xattr_handler hfsplus_xattr_acl_default_handler;
19extern const struct xattr_handler hfsplus_xattr_security_handler; 17extern const struct xattr_handler hfsplus_xattr_security_handler;
20 18
21extern const struct xattr_handler *hfsplus_xattr_handlers[]; 19extern const struct xattr_handler *hfsplus_xattr_handlers[];
@@ -42,8 +40,6 @@ static inline ssize_t hfsplus_getxattr(struct dentry *dentry,
42 40
43ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); 41ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
44 42
45int hfsplus_removexattr(struct dentry *dentry, const char *name);
46
47int hfsplus_init_security(struct inode *inode, struct inode *dir, 43int hfsplus_init_security(struct inode *inode, struct inode *dir,
48 const struct qstr *qstr); 44 const struct qstr *qstr);
49 45
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index db23ce1bd903..fe649d325b1f 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -186,7 +186,7 @@ static struct inode *hostfs_iget(struct super_block *sb)
186 return inode; 186 return inode;
187} 187}
188 188
189int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) 189static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
190{ 190{
191 /* 191 /*
192 * do_statfs uses struct statfs64 internally, but the linux kernel 192 * do_statfs uses struct statfs64 internally, but the linux kernel
@@ -268,7 +268,7 @@ static const struct super_operations hostfs_sbops = {
268 .show_options = hostfs_show_options, 268 .show_options = hostfs_show_options,
269}; 269};
270 270
271int hostfs_readdir(struct file *file, struct dir_context *ctx) 271static int hostfs_readdir(struct file *file, struct dir_context *ctx)
272{ 272{
273 void *dir; 273 void *dir;
274 char *name; 274 char *name;
@@ -293,7 +293,7 @@ int hostfs_readdir(struct file *file, struct dir_context *ctx)
293 return 0; 293 return 0;
294} 294}
295 295
296int hostfs_file_open(struct inode *ino, struct file *file) 296static int hostfs_file_open(struct inode *ino, struct file *file)
297{ 297{
298 static DEFINE_MUTEX(open_mutex); 298 static DEFINE_MUTEX(open_mutex);
299 char *name; 299 char *name;
@@ -359,7 +359,8 @@ static int hostfs_file_release(struct inode *inode, struct file *file)
359 return 0; 359 return 0;
360} 360}
361 361
362int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) 362static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
363 int datasync)
363{ 364{
364 struct inode *inode = file->f_mapping->host; 365 struct inode *inode = file->f_mapping->host;
365 int ret; 366 int ret;
@@ -394,7 +395,7 @@ static const struct file_operations hostfs_dir_fops = {
394 .read = generic_read_dir, 395 .read = generic_read_dir,
395}; 396};
396 397
397int hostfs_writepage(struct page *page, struct writeback_control *wbc) 398static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
398{ 399{
399 struct address_space *mapping = page->mapping; 400 struct address_space *mapping = page->mapping;
400 struct inode *inode = mapping->host; 401 struct inode *inode = mapping->host;
@@ -430,7 +431,7 @@ int hostfs_writepage(struct page *page, struct writeback_control *wbc)
430 return err; 431 return err;
431} 432}
432 433
433int hostfs_readpage(struct file *file, struct page *page) 434static int hostfs_readpage(struct file *file, struct page *page)
434{ 435{
435 char *buffer; 436 char *buffer;
436 long long start; 437 long long start;
@@ -455,9 +456,9 @@ int hostfs_readpage(struct file *file, struct page *page)
455 return err; 456 return err;
456} 457}
457 458
458int hostfs_write_begin(struct file *file, struct address_space *mapping, 459static int hostfs_write_begin(struct file *file, struct address_space *mapping,
459 loff_t pos, unsigned len, unsigned flags, 460 loff_t pos, unsigned len, unsigned flags,
460 struct page **pagep, void **fsdata) 461 struct page **pagep, void **fsdata)
461{ 462{
462 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 463 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
463 464
@@ -467,9 +468,9 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping,
467 return 0; 468 return 0;
468} 469}
469 470
470int hostfs_write_end(struct file *file, struct address_space *mapping, 471static int hostfs_write_end(struct file *file, struct address_space *mapping,
471 loff_t pos, unsigned len, unsigned copied, 472 loff_t pos, unsigned len, unsigned copied,
472 struct page *page, void *fsdata) 473 struct page *page, void *fsdata)
473{ 474{
474 struct inode *inode = mapping->host; 475 struct inode *inode = mapping->host;
475 void *buffer; 476 void *buffer;
@@ -549,8 +550,8 @@ static int read_name(struct inode *ino, char *name)
549 return 0; 550 return 0;
550} 551}
551 552
552int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 553static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
553 bool excl) 554 bool excl)
554{ 555{
555 struct inode *inode; 556 struct inode *inode;
556 char *name; 557 char *name;
@@ -591,8 +592,8 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
591 return error; 592 return error;
592} 593}
593 594
594struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, 595static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
595 unsigned int flags) 596 unsigned int flags)
596{ 597{
597 struct inode *inode; 598 struct inode *inode;
598 char *name; 599 char *name;
@@ -628,7 +629,8 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
628 return ERR_PTR(err); 629 return ERR_PTR(err);
629} 630}
630 631
631int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) 632static int hostfs_link(struct dentry *to, struct inode *ino,
633 struct dentry *from)
632{ 634{
633 char *from_name, *to_name; 635 char *from_name, *to_name;
634 int err; 636 int err;
@@ -646,7 +648,7 @@ int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from)
646 return err; 648 return err;
647} 649}
648 650
649int hostfs_unlink(struct inode *ino, struct dentry *dentry) 651static int hostfs_unlink(struct inode *ino, struct dentry *dentry)
650{ 652{
651 char *file; 653 char *file;
652 int err; 654 int err;
@@ -662,7 +664,8 @@ int hostfs_unlink(struct inode *ino, struct dentry *dentry)
662 return err; 664 return err;
663} 665}
664 666
665int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) 667static int hostfs_symlink(struct inode *ino, struct dentry *dentry,
668 const char *to)
666{ 669{
667 char *file; 670 char *file;
668 int err; 671 int err;
@@ -674,7 +677,7 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
674 return err; 677 return err;
675} 678}
676 679
677int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) 680static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
678{ 681{
679 char *file; 682 char *file;
680 int err; 683 int err;
@@ -686,7 +689,7 @@ int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
686 return err; 689 return err;
687} 690}
688 691
689int hostfs_rmdir(struct inode *ino, struct dentry *dentry) 692static int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
690{ 693{
691 char *file; 694 char *file;
692 int err; 695 int err;
@@ -738,8 +741,8 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
738 return err; 741 return err;
739} 742}
740 743
741int hostfs_rename(struct inode *from_ino, struct dentry *from, 744static int hostfs_rename(struct inode *from_ino, struct dentry *from,
742 struct inode *to_ino, struct dentry *to) 745 struct inode *to_ino, struct dentry *to)
743{ 746{
744 char *from_name, *to_name; 747 char *from_name, *to_name;
745 int err; 748 int err;
@@ -756,7 +759,7 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
756 return err; 759 return err;
757} 760}
758 761
759int hostfs_permission(struct inode *ino, int desired) 762static int hostfs_permission(struct inode *ino, int desired)
760{ 763{
761 char *name; 764 char *name;
762 int r = 0, w = 0, x = 0, err; 765 int r = 0, w = 0, x = 0, err;
@@ -782,7 +785,7 @@ int hostfs_permission(struct inode *ino, int desired)
782 return err; 785 return err;
783} 786}
784 787
785int hostfs_setattr(struct dentry *dentry, struct iattr *attr) 788static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
786{ 789{
787 struct inode *inode = dentry->d_inode; 790 struct inode *inode = dentry->d_inode;
788 struct hostfs_iattr attrs; 791 struct hostfs_iattr attrs;
diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c
index cdb84a838068..58b5106186d0 100644
--- a/fs/hpfs/alloc.c
+++ b/fs/hpfs/alloc.c
@@ -8,6 +8,58 @@
8 8
9#include "hpfs_fn.h" 9#include "hpfs_fn.h"
10 10
11static void hpfs_claim_alloc(struct super_block *s, secno sec)
12{
13 struct hpfs_sb_info *sbi = hpfs_sb(s);
14 if (sbi->sb_n_free != (unsigned)-1) {
15 if (unlikely(!sbi->sb_n_free)) {
16 hpfs_error(s, "free count underflow, allocating sector %08x", sec);
17 sbi->sb_n_free = -1;
18 return;
19 }
20 sbi->sb_n_free--;
21 }
22}
23
24static void hpfs_claim_free(struct super_block *s, secno sec)
25{
26 struct hpfs_sb_info *sbi = hpfs_sb(s);
27 if (sbi->sb_n_free != (unsigned)-1) {
28 if (unlikely(sbi->sb_n_free >= sbi->sb_fs_size)) {
29 hpfs_error(s, "free count overflow, freeing sector %08x", sec);
30 sbi->sb_n_free = -1;
31 return;
32 }
33 sbi->sb_n_free++;
34 }
35}
36
37static void hpfs_claim_dirband_alloc(struct super_block *s, secno sec)
38{
39 struct hpfs_sb_info *sbi = hpfs_sb(s);
40 if (sbi->sb_n_free_dnodes != (unsigned)-1) {
41 if (unlikely(!sbi->sb_n_free_dnodes)) {
42 hpfs_error(s, "dirband free count underflow, allocating sector %08x", sec);
43 sbi->sb_n_free_dnodes = -1;
44 return;
45 }
46 sbi->sb_n_free_dnodes--;
47 }
48}
49
50static void hpfs_claim_dirband_free(struct super_block *s, secno sec)
51{
52 struct hpfs_sb_info *sbi = hpfs_sb(s);
53 if (sbi->sb_n_free_dnodes != (unsigned)-1) {
54 if (unlikely(sbi->sb_n_free_dnodes >= sbi->sb_dirband_size / 4)) {
55 hpfs_error(s, "dirband free count overflow, freeing sector %08x", sec);
56 sbi->sb_n_free_dnodes = -1;
57 return;
58 }
59 sbi->sb_n_free_dnodes++;
60 }
61}
62
11/* 63/*
12 * Check if a sector is allocated in bitmap 64 * Check if a sector is allocated in bitmap
13 * This is really slow. Turned on only if chk==2 65 * This is really slow. Turned on only if chk==2
@@ -203,9 +255,15 @@ secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forwa
203 } 255 }
204 sec = 0; 256 sec = 0;
205 ret: 257 ret:
258 if (sec) {
259 i = 0;
260 do
261 hpfs_claim_alloc(s, sec + i);
262 while (unlikely(++i < n));
263 }
206 if (sec && f_p) { 264 if (sec && f_p) {
207 for (i = 0; i < forward; i++) { 265 for (i = 0; i < forward; i++) {
208 if (!hpfs_alloc_if_possible(s, sec + i + 1)) { 266 if (!hpfs_alloc_if_possible(s, sec + n + i)) {
209 hpfs_error(s, "Prealloc doesn't work! Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i); 267 hpfs_error(s, "Prealloc doesn't work! Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i);
210 sec = 0; 268 sec = 0;
211 break; 269 break;
@@ -228,6 +286,7 @@ static secno alloc_in_dirband(struct super_block *s, secno near)
228 nr >>= 2; 286 nr >>= 2;
229 sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0); 287 sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0);
230 if (!sec) return 0; 288 if (!sec) return 0;
289 hpfs_claim_dirband_alloc(s, sec);
231 return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start; 290 return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start;
232} 291}
233 292
@@ -242,6 +301,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec)
242 bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f))); 301 bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f)));
243 hpfs_mark_4buffers_dirty(&qbh); 302 hpfs_mark_4buffers_dirty(&qbh);
244 hpfs_brelse4(&qbh); 303 hpfs_brelse4(&qbh);
304 hpfs_claim_alloc(s, sec);
245 return 1; 305 return 1;
246 } 306 }
247 hpfs_brelse4(&qbh); 307 hpfs_brelse4(&qbh);
@@ -275,6 +335,7 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n)
275 return; 335 return;
276 } 336 }
277 bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f)); 337 bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f));
338 hpfs_claim_free(s, sec);
278 if (!--n) { 339 if (!--n) {
279 hpfs_mark_4buffers_dirty(&qbh); 340 hpfs_mark_4buffers_dirty(&qbh);
280 hpfs_brelse4(&qbh); 341 hpfs_brelse4(&qbh);
@@ -359,6 +420,7 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno)
359 bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f)); 420 bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f));
360 hpfs_mark_4buffers_dirty(&qbh); 421 hpfs_mark_4buffers_dirty(&qbh);
361 hpfs_brelse4(&qbh); 422 hpfs_brelse4(&qbh);
423 hpfs_claim_dirband_free(s, dno);
362 } 424 }
363} 425}
364 426
@@ -366,7 +428,7 @@ struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near,
366 dnode_secno *dno, struct quad_buffer_head *qbh) 428 dnode_secno *dno, struct quad_buffer_head *qbh)
367{ 429{
368 struct dnode *d; 430 struct dnode *d;
369 if (hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_dmap) > FREE_DNODES_ADD) { 431 if (hpfs_get_free_dnodes(s) > FREE_DNODES_ADD) {
370 if (!(*dno = alloc_in_dirband(s, near))) 432 if (!(*dno = alloc_in_dirband(s, near)))
371 if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL; 433 if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL;
372 } else { 434 } else {
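
The new hpfs_claim_*() helpers above keep the cached free counts honest: (unsigned)-1 means "count unknown", and any impossible transition (decrementing zero, growing past the filesystem size) poisons the cache back to unknown instead of letting it wrap. A userspace sketch of that discipline follows; the recount body is a stand-in for the real bitmap scan.

/*
 * Userspace sketch of the cached-counter discipline above: UINT_MAX is
 * the "count unknown" sentinel, and an impossible decrement invalidates
 * the cache rather than corrupting it.
 */
#include <limits.h>
#include <stdio.h>

#define COUNT_UNKNOWN UINT_MAX

static unsigned n_free = COUNT_UNKNOWN;

static void claim_alloc(void)
{
	if (n_free == COUNT_UNKNOWN)
		return;			/* nothing cached to maintain */
	if (n_free == 0) {
		fprintf(stderr, "free count underflow\n");
		n_free = COUNT_UNKNOWN;	/* force a recount later */
		return;
	}
	n_free--;
}

static unsigned get_free(void)
{
	if (n_free == COUNT_UNKNOWN)
		n_free = 100;		/* stand-in for a bitmap recount */
	return n_free;
}

int main(void)
{
	printf("%u\n", get_free());	/* 100: recounted on demand */
	claim_alloc();
	printf("%u\n", get_free());	/* 99: cache maintained */
	return 0;
}
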
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index 4d0a1afa058c..139ef1684d07 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -86,7 +86,6 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head
86void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh, 86void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh,
87 int ahead) 87 int ahead)
88{ 88{
89 struct buffer_head *bh;
90 char *data; 89 char *data;
91 90
92 hpfs_lock_assert(s); 91 hpfs_lock_assert(s);
@@ -100,34 +99,32 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
100 99
101 hpfs_prefetch_sectors(s, secno, 4 + ahead); 100 hpfs_prefetch_sectors(s, secno, 4 + ahead);
102 101
102 if (!(qbh->bh[0] = sb_bread(s, secno + 0))) goto bail0;
103 if (!(qbh->bh[1] = sb_bread(s, secno + 1))) goto bail1;
104 if (!(qbh->bh[2] = sb_bread(s, secno + 2))) goto bail2;
105 if (!(qbh->bh[3] = sb_bread(s, secno + 3))) goto bail3;
106
107 if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) &&
108 likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) &&
109 likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) {
110 return qbh->data = qbh->bh[0]->b_data;
111 }
112
103 qbh->data = data = kmalloc(2048, GFP_NOFS); 113 qbh->data = data = kmalloc(2048, GFP_NOFS);
104 if (!data) { 114 if (!data) {
105 printk("HPFS: hpfs_map_4sectors: out of memory\n"); 115 printk("HPFS: hpfs_map_4sectors: out of memory\n");
106 goto bail; 116 goto bail4;
107 } 117 }
108 118
109 qbh->bh[0] = bh = sb_bread(s, secno); 119 memcpy(data + 0 * 512, qbh->bh[0]->b_data, 512);
110 if (!bh) 120 memcpy(data + 1 * 512, qbh->bh[1]->b_data, 512);
111 goto bail0; 121 memcpy(data + 2 * 512, qbh->bh[2]->b_data, 512);
112 memcpy(data, bh->b_data, 512); 122 memcpy(data + 3 * 512, qbh->bh[3]->b_data, 512);
113
114 qbh->bh[1] = bh = sb_bread(s, secno + 1);
115 if (!bh)
116 goto bail1;
117 memcpy(data + 512, bh->b_data, 512);
118
119 qbh->bh[2] = bh = sb_bread(s, secno + 2);
120 if (!bh)
121 goto bail2;
122 memcpy(data + 2 * 512, bh->b_data, 512);
123
124 qbh->bh[3] = bh = sb_bread(s, secno + 3);
125 if (!bh)
126 goto bail3;
127 memcpy(data + 3 * 512, bh->b_data, 512);
128 123
129 return data; 124 return data;
130 125
126 bail4:
127 brelse(qbh->bh[3]);
131 bail3: 128 bail3:
132 brelse(qbh->bh[2]); 129 brelse(qbh->bh[2]);
133 bail2: 130 bail2:
@@ -135,9 +132,6 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
135 bail1: 132 bail1:
136 brelse(qbh->bh[0]); 133 brelse(qbh->bh[0]);
137 bail0: 134 bail0:
138 kfree(data);
139 printk("HPFS: hpfs_map_4sectors: read error\n");
140 bail:
141 return NULL; 135 return NULL;
142} 136}
143 137
@@ -155,44 +149,54 @@ void *hpfs_get_4sectors(struct super_block *s, unsigned secno,
155 return NULL; 149 return NULL;
156 } 150 }
157 151
158 /*return hpfs_map_4sectors(s, secno, qbh, 0);*/ 152 if (!hpfs_get_sector(s, secno + 0, &qbh->bh[0])) goto bail0;
153 if (!hpfs_get_sector(s, secno + 1, &qbh->bh[1])) goto bail1;
154 if (!hpfs_get_sector(s, secno + 2, &qbh->bh[2])) goto bail2;
155 if (!hpfs_get_sector(s, secno + 3, &qbh->bh[3])) goto bail3;
156
157 if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) &&
158 likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) &&
159 likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) {
160 return qbh->data = qbh->bh[0]->b_data;
161 }
162
159 if (!(qbh->data = kmalloc(2048, GFP_NOFS))) { 163 if (!(qbh->data = kmalloc(2048, GFP_NOFS))) {
160 printk("HPFS: hpfs_get_4sectors: out of memory\n"); 164 printk("HPFS: hpfs_get_4sectors: out of memory\n");
161 return NULL; 165 goto bail4;
162 } 166 }
163 if (!(hpfs_get_sector(s, secno, &qbh->bh[0]))) goto bail0;
164 if (!(hpfs_get_sector(s, secno + 1, &qbh->bh[1]))) goto bail1;
165 if (!(hpfs_get_sector(s, secno + 2, &qbh->bh[2]))) goto bail2;
166 if (!(hpfs_get_sector(s, secno + 3, &qbh->bh[3]))) goto bail3;
167 memcpy(qbh->data, qbh->bh[0]->b_data, 512);
168 memcpy(qbh->data + 512, qbh->bh[1]->b_data, 512);
169 memcpy(qbh->data + 2*512, qbh->bh[2]->b_data, 512);
170 memcpy(qbh->data + 3*512, qbh->bh[3]->b_data, 512);
171 return qbh->data; 167 return qbh->data;
172 168
173 bail3: brelse(qbh->bh[2]); 169bail4:
174 bail2: brelse(qbh->bh[1]); 170 brelse(qbh->bh[3]);
175 bail1: brelse(qbh->bh[0]); 171bail3:
176 bail0: 172 brelse(qbh->bh[2]);
173bail2:
174 brelse(qbh->bh[1]);
175bail1:
176 brelse(qbh->bh[0]);
177bail0:
177 return NULL; 178 return NULL;
178} 179}
179 180
180 181
181void hpfs_brelse4(struct quad_buffer_head *qbh) 182void hpfs_brelse4(struct quad_buffer_head *qbh)
182{ 183{
183 brelse(qbh->bh[3]); 184 if (unlikely(qbh->data != qbh->bh[0]->b_data))
184 brelse(qbh->bh[2]); 185 kfree(qbh->data);
185 brelse(qbh->bh[1]);
186 brelse(qbh->bh[0]); 186 brelse(qbh->bh[0]);
187 kfree(qbh->data); 187 brelse(qbh->bh[1]);
188 brelse(qbh->bh[2]);
189 brelse(qbh->bh[3]);
188} 190}
189 191
190void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh) 192void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh)
191{ 193{
192 memcpy(qbh->bh[0]->b_data, qbh->data, 512); 194 if (unlikely(qbh->data != qbh->bh[0]->b_data)) {
193 memcpy(qbh->bh[1]->b_data, qbh->data + 512, 512); 195 memcpy(qbh->bh[0]->b_data, qbh->data + 0 * 512, 512);
194 memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512); 196 memcpy(qbh->bh[1]->b_data, qbh->data + 1 * 512, 512);
195 memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512); 197 memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512);
198 memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512);
199 }
196 mark_buffer_dirty(qbh->bh[0]); 200 mark_buffer_dirty(qbh->bh[0]);
197 mark_buffer_dirty(qbh->bh[1]); 201 mark_buffer_dirty(qbh->bh[1]);
198 mark_buffer_dirty(qbh->bh[2]); 202 mark_buffer_dirty(qbh->bh[2]);
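
Both hpfs_map_4sectors() and hpfs_get_4sectors() now test whether the four 512-byte buffer heads landed back to back in memory; if so they expose bh[0]->b_data directly and skip the 2 KiB bounce buffer, and hpfs_brelse4()/hpfs_mark_4buffers_dirty() detect the same case by comparing qbh->data against bh[0]->b_data. A runnable userspace sketch of the fast-path test follows; struct quad is a hypothetical stand-in for quad_buffer_head.

/*
 * Userspace sketch of the contiguity fast path above: when four
 * 512-byte pieces are already adjacent, hand them out in place;
 * otherwise fall back to an allocated bounce buffer and copy.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct quad {
	char *piece[4];		/* stand-ins for bh[i]->b_data */
	char *data;		/* what callers actually read */
};

static char *map4(struct quad *q)
{
	if (q->piece[1] == q->piece[0] + 1 * 512 &&
	    q->piece[2] == q->piece[0] + 2 * 512 &&
	    q->piece[3] == q->piece[0] + 3 * 512)
		return q->data = q->piece[0];	/* zero-copy fast path */

	q->data = malloc(2048);
	if (!q->data)
		return NULL;
	for (int i = 0; i < 4; i++)
		memcpy(q->data + i * 512, q->piece[i], 512);
	return q->data;
}

static void release4(struct quad *q)
{
	if (q->data != q->piece[0])	/* only the slow path allocated */
		free(q->data);
}

int main(void)
{
	static char contiguous[2048];
	struct quad q = { { contiguous, contiguous + 512,
			    contiguous + 1024, contiguous + 1536 } };

	printf("fast path: %s\n", map4(&q) == contiguous ? "yes" : "no");
	release4(&q);
	return 0;
}
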
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 6797bf80f6e2..3ba49c080e42 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -312,7 +312,7 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb)
312__printf(2, 3) 312__printf(2, 3)
313void hpfs_error(struct super_block *, const char *, ...); 313void hpfs_error(struct super_block *, const char *, ...);
314int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); 314int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *);
315unsigned hpfs_count_one_bitmap(struct super_block *, secno); 315unsigned hpfs_get_free_dnodes(struct super_block *);
316 316
317/* 317/*
318 * local time (HPFS) to GMT (Unix) 318 * local time (HPFS) to GMT (Unix)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index b8d01ef6f531..4534ff688b76 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -121,7 +121,7 @@ static void hpfs_put_super(struct super_block *s)
121 call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi); 121 call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi);
122} 122}
123 123
124unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) 124static unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
125{ 125{
126 struct quad_buffer_head qbh; 126 struct quad_buffer_head qbh;
127 unsigned long *bits; 127 unsigned long *bits;
@@ -129,7 +129,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
129 129
130 bits = hpfs_map_4sectors(s, secno, &qbh, 0); 130 bits = hpfs_map_4sectors(s, secno, &qbh, 0);
131 if (!bits) 131 if (!bits)
132 return 0; 132 return (unsigned)-1;
133 count = bitmap_weight(bits, 2048 * BITS_PER_BYTE); 133 count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
134 hpfs_brelse4(&qbh); 134 hpfs_brelse4(&qbh);
135 return count; 135 return count;
@@ -144,30 +144,45 @@ static unsigned count_bitmaps(struct super_block *s)
144 hpfs_prefetch_bitmap(s, n); 144 hpfs_prefetch_bitmap(s, n);
145 } 145 }
146 for (n = 0; n < n_bands; n++) { 146 for (n = 0; n < n_bands; n++) {
147 unsigned c;
147 hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD); 148 hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD);
148 count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n])); 149 c = hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
150 if (c != (unsigned)-1)
151 count += c;
149 } 152 }
150 return count; 153 return count;
151} 154}
152 155
156unsigned hpfs_get_free_dnodes(struct super_block *s)
157{
158 struct hpfs_sb_info *sbi = hpfs_sb(s);
159 if (sbi->sb_n_free_dnodes == (unsigned)-1) {
160 unsigned c = hpfs_count_one_bitmap(s, sbi->sb_dmap);
161 if (c == (unsigned)-1)
162 return 0;
163 sbi->sb_n_free_dnodes = c;
164 }
165 return sbi->sb_n_free_dnodes;
166}
167
153static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) 168static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
154{ 169{
155 struct super_block *s = dentry->d_sb; 170 struct super_block *s = dentry->d_sb;
156 struct hpfs_sb_info *sbi = hpfs_sb(s); 171 struct hpfs_sb_info *sbi = hpfs_sb(s);
157 u64 id = huge_encode_dev(s->s_bdev->bd_dev); 172 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
173
158 hpfs_lock(s); 174 hpfs_lock(s);
159 175
160 /*if (sbi->sb_n_free == -1) {*/ 176 if (sbi->sb_n_free == (unsigned)-1)
161 sbi->sb_n_free = count_bitmaps(s); 177 sbi->sb_n_free = count_bitmaps(s);
162 sbi->sb_n_free_dnodes = hpfs_count_one_bitmap(s, sbi->sb_dmap); 178
163 /*}*/
164 buf->f_type = s->s_magic; 179 buf->f_type = s->s_magic;
165 buf->f_bsize = 512; 180 buf->f_bsize = 512;
166 buf->f_blocks = sbi->sb_fs_size; 181 buf->f_blocks = sbi->sb_fs_size;
167 buf->f_bfree = sbi->sb_n_free; 182 buf->f_bfree = sbi->sb_n_free;
168 buf->f_bavail = sbi->sb_n_free; 183 buf->f_bavail = sbi->sb_n_free;
169 buf->f_files = sbi->sb_dirband_size / 4; 184 buf->f_files = sbi->sb_dirband_size / 4;
170 buf->f_ffree = sbi->sb_n_free_dnodes; 185 buf->f_ffree = hpfs_get_free_dnodes(s);
171 buf->f_fsid.val[0] = (u32)id; 186 buf->f_fsid.val[0] = (u32)id;
172 buf->f_fsid.val[1] = (u32)(id >> 32); 187 buf->f_fsid.val[1] = (u32)(id >> 32);
173 buf->f_namelen = 254; 188 buf->f_namelen = 254;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 2d04f9afafd7..06fe11e0abfa 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -573,7 +573,7 @@ int log_wait_commit(journal_t *journal, tid_t tid)
573#ifdef CONFIG_JBD_DEBUG 573#ifdef CONFIG_JBD_DEBUG
574 spin_lock(&journal->j_state_lock); 574 spin_lock(&journal->j_state_lock);
575 if (!tid_geq(journal->j_commit_request, tid)) { 575 if (!tid_geq(journal->j_commit_request, tid)) {
576 printk(KERN_EMERG 576 printk(KERN_ERR
577 "%s: error: j_commit_request=%d, tid=%d\n", 577 "%s: error: j_commit_request=%d, tid=%d\n",
578 __func__, journal->j_commit_request, tid); 578 __func__, journal->j_commit_request, tid);
579 } 579 }
@@ -604,10 +604,8 @@ int log_wait_commit(journal_t *journal, tid_t tid)
604out_unlock: 604out_unlock:
605 spin_unlock(&journal->j_state_lock); 605 spin_unlock(&journal->j_state_lock);
606 606
607 if (unlikely(is_journal_aborted(journal))) { 607 if (unlikely(is_journal_aborted(journal)))
608 printk(KERN_EMERG "journal commit I/O error\n");
609 err = -EIO; 608 err = -EIO;
610 }
611 return err; 609 return err;
612} 610}
613 611
@@ -2136,7 +2134,7 @@ static void __exit journal_exit(void)
2136#ifdef CONFIG_JBD_DEBUG 2134#ifdef CONFIG_JBD_DEBUG
2137 int n = atomic_read(&nr_journal_heads); 2135 int n = atomic_read(&nr_journal_heads);
2138 if (n) 2136 if (n)
2139 printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); 2137 printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n);
2140#endif 2138#endif
2141 jbd_remove_debugfs_entry(); 2139 jbd_remove_debugfs_entry();
2142 journal_destroy_caches(); 2140 journal_destroy_caches();
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index aa603e017d22..1695ba8334a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -675,7 +675,7 @@ repeat:
675 jbd_alloc(jh2bh(jh)->b_size, 675 jbd_alloc(jh2bh(jh)->b_size,
676 GFP_NOFS); 676 GFP_NOFS);
677 if (!frozen_buffer) { 677 if (!frozen_buffer) {
678 printk(KERN_EMERG 678 printk(KERN_ERR
679 "%s: OOM for frozen_buffer\n", 679 "%s: OOM for frozen_buffer\n",
680 __func__); 680 __func__);
681 JBUFFER_TRACE(jh, "oom!"); 681 JBUFFER_TRACE(jh, "oom!");
@@ -898,7 +898,7 @@ repeat:
898 if (!jh->b_committed_data) { 898 if (!jh->b_committed_data) {
899 committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); 899 committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
900 if (!committed_data) { 900 if (!committed_data) {
901 printk(KERN_EMERG "%s: No memory for committed data\n", 901 printk(KERN_ERR "%s: No memory for committed data\n",
902 __func__); 902 __func__);
903 err = -ENOMEM; 903 err = -ENOMEM;
904 goto out; 904 goto out;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 8360674c85bc..60bb365f54a5 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -514,11 +514,13 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
514 * similarly constrained call sites 514 * similarly constrained call sites
515 */ 515 */
516 ret = start_this_handle(journal, handle, GFP_NOFS); 516 ret = start_this_handle(journal, handle, GFP_NOFS);
517 if (ret < 0) 517 if (ret < 0) {
518 jbd2_journal_free_reserved(handle); 518 jbd2_journal_free_reserved(handle);
519 return ret;
520 }
519 handle->h_type = type; 521 handle->h_type = type;
520 handle->h_line_no = line_no; 522 handle->h_line_no = line_no;
521 return ret; 523 return 0;
522} 524}
523EXPORT_SYMBOL(jbd2_journal_start_reserved); 525EXPORT_SYMBOL(jbd2_journal_start_reserved);
524 526
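
The jbd2 hunk fixes a use-after-free class of bug: once start_this_handle() fails, jbd2_journal_free_reserved() has torn the handle down, so the function must return at once instead of falling through and writing h_type/h_line_no into freed memory. A compact userspace sketch of the corrected control flow follows; the types and failure are contrived for illustration.

/*
 * Userspace sketch of the control-flow bug fixed above: after the
 * resource is freed on failure, any fall-through that still touches
 * it is a use-after-free, so the error path must return immediately.
 */
#include <stdlib.h>

struct handle { int type; };

static int start(struct handle *h) { (void)h; return -1; /* contrived */ }

static int start_reserved(struct handle *h, int type)
{
	int ret = start(h);

	if (ret < 0) {
		free(h);	/* handle is gone ... */
		return ret;	/* ... so bail out before the writes below */
	}
	h->type = type;
	return 0;
}

int main(void)
{
	struct handle *h = malloc(sizeof(*h));

	if (!h)
		return 1;
	return start_reserved(h, 1) ? 1 : 0;
}
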
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 223283c30111..009ec0b5993d 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -178,10 +178,6 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
178 char *value = NULL; 178 char *value = NULL;
179 int rc, xprefix; 179 int rc, xprefix;
180 180
181 acl = get_cached_acl(inode, type);
182 if (acl != ACL_NOT_CACHED)
183 return acl;
184
185 switch (type) { 181 switch (type) {
186 case ACL_TYPE_ACCESS: 182 case ACL_TYPE_ACCESS:
187 xprefix = JFFS2_XPREFIX_ACL_ACCESS; 183 xprefix = JFFS2_XPREFIX_ACL_ACCESS;
@@ -232,13 +228,10 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a
232 return rc; 228 return rc;
233} 229}
234 230
235static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) 231int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
236{ 232{
237 int rc, xprefix; 233 int rc, xprefix;
238 234
239 if (S_ISLNK(inode->i_mode))
240 return -EOPNOTSUPP;
241
242 switch (type) { 235 switch (type) {
243 case ACL_TYPE_ACCESS: 236 case ACL_TYPE_ACCESS:
244 xprefix = JFFS2_XPREFIX_ACL_ACCESS; 237 xprefix = JFFS2_XPREFIX_ACL_ACCESS;
@@ -277,30 +270,21 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
277 270
278int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode) 271int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode)
279{ 272{
280 struct posix_acl *acl; 273 struct posix_acl *default_acl, *acl;
281 int rc; 274 int rc;
282 275
283 cache_no_acl(inode); 276 cache_no_acl(inode);
284 277
285 if (S_ISLNK(*i_mode)) 278 rc = posix_acl_create(dir_i, i_mode, &default_acl, &acl);
286 return 0; /* Symlink always has no-ACL */ 279 if (rc)
287 280 return rc;
288 acl = jffs2_get_acl(dir_i, ACL_TYPE_DEFAULT);
289 if (IS_ERR(acl))
290 return PTR_ERR(acl);
291
292 if (!acl) {
293 *i_mode &= ~current_umask();
294 } else {
295 if (S_ISDIR(*i_mode))
296 set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
297
298 rc = posix_acl_create(&acl, GFP_KERNEL, i_mode);
299 if (rc < 0)
300 return rc;
301 if (rc > 0)
302 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
303 281
282 if (default_acl) {
283 set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl);
284 posix_acl_release(default_acl);
285 }
286 if (acl) {
287 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
304 posix_acl_release(acl); 288 posix_acl_release(acl);
305 } 289 }
306 return 0; 290 return 0;
@@ -324,106 +308,3 @@ int jffs2_init_acl_post(struct inode *inode)
324 308
325 return 0; 309 return 0;
326} 310}
327
328int jffs2_acl_chmod(struct inode *inode)
329{
330 struct posix_acl *acl;
331 int rc;
332
333 if (S_ISLNK(inode->i_mode))
334 return -EOPNOTSUPP;
335 acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
336 if (IS_ERR(acl) || !acl)
337 return PTR_ERR(acl);
338 rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
339 if (rc)
340 return rc;
341 rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, acl);
342 posix_acl_release(acl);
343 return rc;
344}
345
346static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list,
347 size_t list_size, const char *name, size_t name_len, int type)
348{
349 const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS);
350
351 if (list && retlen <= list_size)
352 strcpy(list, POSIX_ACL_XATTR_ACCESS);
353 return retlen;
354}
355
356static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list,
357 size_t list_size, const char *name, size_t name_len, int type)
358{
359 const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT);
360
361 if (list && retlen <= list_size)
362 strcpy(list, POSIX_ACL_XATTR_DEFAULT);
363 return retlen;
364}
365
366static int jffs2_acl_getxattr(struct dentry *dentry, const char *name,
367 void *buffer, size_t size, int type)
368{
369 struct posix_acl *acl;
370 int rc;
371
372 if (name[0] != '\0')
373 return -EINVAL;
374
375 acl = jffs2_get_acl(dentry->d_inode, type);
376 if (IS_ERR(acl))
377 return PTR_ERR(acl);
378 if (!acl)
379 return -ENODATA;
380 rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
381 posix_acl_release(acl);
382
383 return rc;
384}
385
386static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
387 const void *value, size_t size, int flags, int type)
388{
389 struct posix_acl *acl;
390 int rc;
391
392 if (name[0] != '\0')
393 return -EINVAL;
394 if (!inode_owner_or_capable(dentry->d_inode))
395 return -EPERM;
396
397 if (value) {
398 acl = posix_acl_from_xattr(&init_user_ns, value, size);
399 if (IS_ERR(acl))
400 return PTR_ERR(acl);
401 if (acl) {
402 rc = posix_acl_valid(acl);
403 if (rc)
404 goto out;
405 }
406 } else {
407 acl = NULL;
408 }
409 rc = jffs2_set_acl(dentry->d_inode, type, acl);
410 out:
411 posix_acl_release(acl);
412 return rc;
413}
414
415const struct xattr_handler jffs2_acl_access_xattr_handler = {
416 .prefix = POSIX_ACL_XATTR_ACCESS,
417 .flags = ACL_TYPE_DEFAULT,
418 .list = jffs2_acl_access_listxattr,
419 .get = jffs2_acl_getxattr,
420 .set = jffs2_acl_setxattr,
421};
422
423const struct xattr_handler jffs2_acl_default_xattr_handler = {
424 .prefix = POSIX_ACL_XATTR_DEFAULT,
425 .flags = ACL_TYPE_DEFAULT,
426 .list = jffs2_acl_default_listxattr,
427 .get = jffs2_acl_getxattr,
428 .set = jffs2_acl_setxattr,
429};
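
With jffs2_set_acl() made public and the per-filesystem ACL xattr handlers deleted, the generic layer takes over: the shared posix_acl_{access,default}_xattr_handler entries translate system.posix_acl_* xattr calls into the inode's .get_acl/.set_acl methods. Below is a sketch of the wiring the jffs2 and hfsplus changes both converge on; kernel context is assumed and the myfs_* names are hypothetical stand-ins.

/*
 * Sketch of the v3.14 ACL wiring adopted above; kernel context
 * assumed, myfs_* are hypothetical. The generic xattr handlers route
 * system.posix_acl_{access,default} through .get_acl/.set_acl, so the
 * filesystem keeps no ACL xattr code of its own.
 */
struct posix_acl *myfs_get_acl(struct inode *inode, int type);
int myfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);

const struct xattr_handler *myfs_xattr_handlers[] = {
	&posix_acl_access_xattr_handler,
	&posix_acl_default_xattr_handler,
	NULL,
};

const struct inode_operations myfs_file_inode_operations = {
	.get_acl	= myfs_get_acl,
	.set_acl	= myfs_set_acl,
	.setxattr	= generic_setxattr,
	.getxattr	= generic_getxattr,
	.removexattr	= generic_removexattr,
};
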
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 9b477246f2a6..2e2b5745c3b7 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -27,17 +27,14 @@ struct jffs2_acl_header {
27#ifdef CONFIG_JFFS2_FS_POSIX_ACL 27#ifdef CONFIG_JFFS2_FS_POSIX_ACL
28 28
29struct posix_acl *jffs2_get_acl(struct inode *inode, int type); 29struct posix_acl *jffs2_get_acl(struct inode *inode, int type);
30extern int jffs2_acl_chmod(struct inode *); 30int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
35extern const struct xattr_handler jffs2_acl_default_xattr_handler;
36
37#else 34#else
38 35
39#define jffs2_get_acl (NULL) 36#define jffs2_get_acl (NULL)
40#define jffs2_acl_chmod(inode) (0) 37#define jffs2_set_acl (NULL)
41#define jffs2_init_acl_pre(dir_i,inode,mode) (0) 38#define jffs2_init_acl_pre(dir_i,inode,mode) (0)
42#define jffs2_init_acl_post(inode) (0) 39#define jffs2_init_acl_post(inode) (0)
43 40
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index e3aac222472e..938556025d64 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -59,6 +59,7 @@ const struct inode_operations jffs2_dir_inode_operations =
59 .mknod = jffs2_mknod, 59 .mknod = jffs2_mknod,
60 .rename = jffs2_rename, 60 .rename = jffs2_rename,
61 .get_acl = jffs2_get_acl, 61 .get_acl = jffs2_get_acl,
62 .set_acl = jffs2_set_acl,
62 .setattr = jffs2_setattr, 63 .setattr = jffs2_setattr,
63 .setxattr = jffs2_setxattr, 64 .setxattr = jffs2_setxattr,
64 .getxattr = jffs2_getxattr, 65 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 1506673c087e..256cd19a3b78 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -66,6 +66,7 @@ const struct file_operations jffs2_file_operations =
66const struct inode_operations jffs2_file_inode_operations = 66const struct inode_operations jffs2_file_inode_operations =
67{ 67{
68 .get_acl = jffs2_get_acl, 68 .get_acl = jffs2_get_acl,
69 .set_acl = jffs2_set_acl,
69 .setattr = jffs2_setattr, 70 .setattr = jffs2_setattr,
70 .setxattr = jffs2_setxattr, 71 .setxattr = jffs2_setxattr,
71 .getxattr = jffs2_getxattr, 72 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 09b3ed455724..a69e426435dd 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -190,15 +190,16 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
190 190
191int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) 191int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
192{ 192{
193 struct inode *inode = dentry->d_inode;
193 int rc; 194 int rc;
194 195
195 rc = inode_change_ok(dentry->d_inode, iattr); 196 rc = inode_change_ok(inode, iattr);
196 if (rc) 197 if (rc)
197 return rc; 198 return rc;
198 199
199 rc = jffs2_do_setattr(dentry->d_inode, iattr); 200 rc = jffs2_do_setattr(inode, iattr);
200 if (!rc && (iattr->ia_valid & ATTR_MODE)) 201 if (!rc && (iattr->ia_valid & ATTR_MODE))
201 rc = jffs2_acl_chmod(dentry->d_inode); 202 rc = posix_acl_chmod(inode, inode->i_mode);
202 203
203 return rc; 204 return rc;
204} 205}
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 4f47aa24b556..b8fd651307a4 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -288,6 +288,8 @@ struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
288 struct jffs2_xattr_datum *xd; 288 struct jffs2_xattr_datum *xd;
289 xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); 289 xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
290 dbg_memalloc("%p\n", xd); 290 dbg_memalloc("%p\n", xd);
291 if (!xd)
292 return NULL;
291 293
292 xd->class = RAWNODE_CLASS_XATTR_DATUM; 294 xd->class = RAWNODE_CLASS_XATTR_DATUM;
293 xd->node = (void *)xd; 295 xd->node = (void *)xd;
@@ -306,6 +308,8 @@ struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
306 struct jffs2_xattr_ref *ref; 308 struct jffs2_xattr_ref *ref;
307 ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); 309 ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
308 dbg_memalloc("%p\n", ref); 310 dbg_memalloc("%p\n", ref);
311 if (!ref)
312 return NULL;
309 313
310 ref->class = RAWNODE_CLASS_XATTR_REF; 314 ref->class = RAWNODE_CLASS_XATTR_REF;
311 ref->node = (void *)ref; 315 ref->node = (void *)ref;
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 975a1f562c10..9a5449bc3afb 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -564,25 +564,10 @@ struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_
564 they're killed. */ 564 they're killed. */
565void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) 565void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
566{ 566{
567 struct jffs2_node_frag *frag; 567 struct jffs2_node_frag *frag, *next;
568 struct jffs2_node_frag *parent;
569
570 if (!root->rb_node)
571 return;
572 568
573 dbg_fragtree("killing\n"); 569 dbg_fragtree("killing\n");
574 570 rbtree_postorder_for_each_entry_safe(frag, next, root, rb) {
575 frag = (rb_entry(root->rb_node, struct jffs2_node_frag, rb));
576 while(frag) {
577 if (frag->rb.rb_left) {
578 frag = frag_left(frag);
579 continue;
580 }
581 if (frag->rb.rb_right) {
582 frag = frag_right(frag);
583 continue;
584 }
585
586 if (frag->node && !(--frag->node->frags)) { 571 if (frag->node && !(--frag->node->frags)) {
587 /* Not a hole, and it's the final remaining frag 572 /* Not a hole, and it's the final remaining frag
588 of this node. Free the node */ 573 of this node. Free the node */
@@ -591,17 +576,8 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
591 576
592 jffs2_free_full_dnode(frag->node); 577 jffs2_free_full_dnode(frag->node);
593 } 578 }
594 parent = frag_parent(frag);
595 if (parent) {
596 if (frag_left(parent) == frag)
597 parent->rb.rb_left = NULL;
598 else
599 parent->rb.rb_right = NULL;
600 }
601 579
602 jffs2_free_node_frag(frag); 580 jffs2_free_node_frag(frag);
603 frag = parent;
604
605 cond_resched(); 581 cond_resched();
606 } 582 }
607} 583}
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index ae81b01e6fd7..386303dca382 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -543,33 +543,13 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
543 543
544static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) 544static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
545{ 545{
546 struct rb_node *this; 546 struct jffs2_tmp_dnode_info *tn, *next;
547 struct jffs2_tmp_dnode_info *tn;
548
549 this = list->rb_node;
550 547
551 /* Now at bottom of tree */ 548 rbtree_postorder_for_each_entry_safe(tn, next, list, rb) {
552 while (this) {
553 if (this->rb_left)
554 this = this->rb_left;
555 else if (this->rb_right)
556 this = this->rb_right;
557 else {
558 tn = rb_entry(this, struct jffs2_tmp_dnode_info, rb);
559 jffs2_free_full_dnode(tn->fn); 549 jffs2_free_full_dnode(tn->fn);
560 jffs2_free_tmp_dnode_info(tn); 550 jffs2_free_tmp_dnode_info(tn);
561
562 this = rb_parent(this);
563 if (!this)
564 break;
565
566 if (this->rb_left == &tn->rb)
567 this->rb_left = NULL;
568 else if (this->rb_right == &tn->rb)
569 this->rb_right = NULL;
570 else BUG();
571 }
572 } 551 }
552
573 *list = RB_ROOT; 553 *list = RB_ROOT;
574} 554}
575 555
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 6e563332bb24..c7c77b0dfccd 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -22,7 +22,6 @@ const struct inode_operations jffs2_symlink_inode_operations =
22{ 22{
23 .readlink = generic_readlink, 23 .readlink = generic_readlink,
24 .follow_link = jffs2_follow_link, 24 .follow_link = jffs2_follow_link,
25 .get_acl = jffs2_get_acl,
26 .setattr = jffs2_setattr, 25 .setattr = jffs2_setattr,
27 .setxattr = jffs2_setxattr, 26 .setxattr = jffs2_setxattr,
28 .getxattr = jffs2_getxattr, 27 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 3034e970eb9a..ad0f2e2a1700 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -22,6 +22,7 @@
22#include <linux/crc32.h> 22#include <linux/crc32.h>
23#include <linux/jffs2.h> 23#include <linux/jffs2.h>
24#include <linux/xattr.h> 24#include <linux/xattr.h>
25#include <linux/posix_acl_xattr.h>
25#include <linux/mtd/mtd.h> 26#include <linux/mtd/mtd.h>
26#include "nodelist.h" 27#include "nodelist.h"
27/* -------- xdatum related functions ---------------- 28/* -------- xdatum related functions ----------------
@@ -921,8 +922,8 @@ const struct xattr_handler *jffs2_xattr_handlers[] = {
921 &jffs2_security_xattr_handler, 922 &jffs2_security_xattr_handler,
922#endif 923#endif
923#ifdef CONFIG_JFFS2_FS_POSIX_ACL 924#ifdef CONFIG_JFFS2_FS_POSIX_ACL
924 &jffs2_acl_access_xattr_handler, 925 &posix_acl_access_xattr_handler,
925 &jffs2_acl_default_xattr_handler, 926 &posix_acl_default_xattr_handler,
926#endif 927#endif
927 &jffs2_trusted_xattr_handler, 928 &jffs2_trusted_xattr_handler,
928 NULL 929 NULL
@@ -942,10 +943,10 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) {
942#endif 943#endif
943#ifdef CONFIG_JFFS2_FS_POSIX_ACL 944#ifdef CONFIG_JFFS2_FS_POSIX_ACL
944 case JFFS2_XPREFIX_ACL_ACCESS: 945 case JFFS2_XPREFIX_ACL_ACCESS:
945 ret = &jffs2_acl_access_xattr_handler; 946 ret = &posix_acl_access_xattr_handler;
946 break; 947 break;
947 case JFFS2_XPREFIX_ACL_DEFAULT: 948 case JFFS2_XPREFIX_ACL_DEFAULT:
948 ret = &jffs2_acl_default_xattr_handler; 949 ret = &posix_acl_default_xattr_handler;
949 break; 950 break;
950#endif 951#endif
951 case JFFS2_XPREFIX_TRUSTED: 952 case JFFS2_XPREFIX_TRUSTED:
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d254d6d35995..5a8ea16eedbc 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -72,7 +72,7 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type)
72 return acl; 72 return acl;
73} 73}
74 74
75static int jfs_set_acl(tid_t tid, struct inode *inode, int type, 75static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
76 struct posix_acl *acl) 76 struct posix_acl *acl)
77{ 77{
78 char *ea_name; 78 char *ea_name;
@@ -80,21 +80,24 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
80 int size = 0; 80 int size = 0;
81 char *value = NULL; 81 char *value = NULL;
82 82
83 if (S_ISLNK(inode->i_mode)) 83 switch (type) {
84 return -EOPNOTSUPP; 84 case ACL_TYPE_ACCESS:
85 85 ea_name = POSIX_ACL_XATTR_ACCESS;
86 switch(type) { 86 rc = posix_acl_equiv_mode(acl, &inode->i_mode);
87 case ACL_TYPE_ACCESS: 87 if (rc < 0)
88 ea_name = POSIX_ACL_XATTR_ACCESS; 88 return rc;
89 break; 89 inode->i_ctime = CURRENT_TIME;
90 case ACL_TYPE_DEFAULT: 90 mark_inode_dirty(inode);
91 ea_name = POSIX_ACL_XATTR_DEFAULT; 91 if (rc == 0)
92 if (!S_ISDIR(inode->i_mode)) 92 acl = NULL;
93 return acl ? -EACCES : 0; 93 break;
94 break; 94 case ACL_TYPE_DEFAULT:
95 default: 95 ea_name = POSIX_ACL_XATTR_DEFAULT;
96 return -EINVAL; 96 break;
97 default:
98 return -EINVAL;
97 } 99 }
100
98 if (acl) { 101 if (acl) {
99 size = posix_acl_xattr_size(acl->a_count); 102 size = posix_acl_xattr_size(acl->a_count);
100 value = kmalloc(size, GFP_KERNEL); 103 value = kmalloc(size, GFP_KERNEL);
@@ -114,65 +117,43 @@ out:
114 return rc; 117 return rc;
115} 118}
116 119
120int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
121{
122 int rc;
123 tid_t tid;
124
125 tid = txBegin(inode->i_sb, 0);
126 mutex_lock(&JFS_IP(inode)->commit_mutex);
127 rc = __jfs_set_acl(tid, inode, type, acl);
128 if (!rc)
129 rc = txCommit(tid, 1, &inode, 0);
130 txEnd(tid);
131 mutex_unlock(&JFS_IP(inode)->commit_mutex);
132 return rc;
133}
134
117int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) 135int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
118{ 136{
119 struct posix_acl *acl = NULL; 137 struct posix_acl *default_acl, *acl;
120 int rc = 0; 138 int rc = 0;
121 139
122 if (S_ISLNK(inode->i_mode)) 140 rc = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
123 return 0; 141 if (rc)
142 return rc;
124 143
125 acl = jfs_get_acl(dir, ACL_TYPE_DEFAULT); 144 if (default_acl) {
126 if (IS_ERR(acl)) 145 rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl);
127 return PTR_ERR(acl); 146 posix_acl_release(default_acl);
147 }
128 148
129 if (acl) { 149 if (acl) {
130 if (S_ISDIR(inode->i_mode)) { 150 if (!rc)
131 rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); 151 rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
132 if (rc)
133 goto cleanup;
134 }
135 rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
136 if (rc < 0)
137 goto cleanup; /* posix_acl_release(NULL) is no-op */
138 if (rc > 0)
139 rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
140cleanup:
141 posix_acl_release(acl); 152 posix_acl_release(acl);
142 } else 153 }
143 inode->i_mode &= ~current_umask();
144 154
145 JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | 155 JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
146 inode->i_mode; 156 inode->i_mode;
147 157
148 return rc; 158 return rc;
149} 159}
150
151int jfs_acl_chmod(struct inode *inode)
152{
153 struct posix_acl *acl;
154 int rc;
155 tid_t tid;
156
157 if (S_ISLNK(inode->i_mode))
158 return -EOPNOTSUPP;
159
160 acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
161 if (IS_ERR(acl) || !acl)
162 return PTR_ERR(acl);
163
164 rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
165 if (rc)
166 return rc;
167
168 tid = txBegin(inode->i_sb, 0);
169 mutex_lock(&JFS_IP(inode)->commit_mutex);
170 rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
171 if (!rc)
172 rc = txCommit(tid, 1, &inode, 0);
173 txEnd(tid);
174 mutex_unlock(&JFS_IP(inode)->commit_mutex);
175
176 posix_acl_release(acl);
177 return rc;
178}
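
[Editor's note: one subtlety in the ACL_TYPE_ACCESS branch of __jfs_set_acl() above is the posix_acl_equiv_mode() convention, which folds the ACL into i_mode and reports whether anything is left over. In outline:]

	/* sketch of the ACCESS-type convention used above */
	rc = posix_acl_equiv_mode(acl, &inode->i_mode);
	if (rc < 0)
		return rc;	/* malformed ACL */
	if (rc == 0)
		acl = NULL;	/* fully expressible as mode bits: the xattr
				 * is deleted rather than stored */
	/* rc > 0: extra entries remain (e.g. named users), store the xattr */
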
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index dd7442c58358..794da944d5cd 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/posix_acl.h>
22#include <linux/quotaops.h> 23#include <linux/quotaops.h>
23#include "jfs_incore.h" 24#include "jfs_incore.h"
24#include "jfs_inode.h" 25#include "jfs_inode.h"
@@ -131,7 +132,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
131 mark_inode_dirty(inode); 132 mark_inode_dirty(inode);
132 133
133 if (iattr->ia_valid & ATTR_MODE) 134 if (iattr->ia_valid & ATTR_MODE)
134 rc = jfs_acl_chmod(inode); 135 rc = posix_acl_chmod(inode, inode->i_mode);
135 return rc; 136 return rc;
136} 137}
137 138
@@ -143,6 +144,7 @@ const struct inode_operations jfs_file_inode_operations = {
143 .setattr = jfs_setattr, 144 .setattr = jfs_setattr,
144#ifdef CONFIG_JFS_POSIX_ACL 145#ifdef CONFIG_JFS_POSIX_ACL
145 .get_acl = jfs_get_acl, 146 .get_acl = jfs_get_acl,
147 .set_acl = jfs_set_acl,
146#endif 148#endif
147}; 149};
148 150
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index ad84fe50ca9e..489f993b7b13 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -21,8 +21,8 @@
21#ifdef CONFIG_JFS_POSIX_ACL 21#ifdef CONFIG_JFS_POSIX_ACL
22 22
23struct posix_acl *jfs_get_acl(struct inode *inode, int type); 23struct posix_acl *jfs_get_acl(struct inode *inode, int type);
24int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 25int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_acl_chmod(struct inode *inode);
26 26
27#else 27#else
28 28
@@ -32,10 +32,5 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode,
32 return 0; 32 return 0;
33} 33}
34 34
35static inline int jfs_acl_chmod(struct inode *inode)
36{
37 return 0;
38}
39
40#endif 35#endif
41#endif /* _H_JFS_ACL */ 36#endif /* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 360d27c48887..8d811e02b4b9 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1998,20 +1998,20 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
1998 1998
1999 bio = bio_alloc(GFP_NOFS, 1); 1999 bio = bio_alloc(GFP_NOFS, 1);
2000 2000
2001 bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); 2001 bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
2002 bio->bi_bdev = log->bdev; 2002 bio->bi_bdev = log->bdev;
2003 bio->bi_io_vec[0].bv_page = bp->l_page; 2003 bio->bi_io_vec[0].bv_page = bp->l_page;
2004 bio->bi_io_vec[0].bv_len = LOGPSIZE; 2004 bio->bi_io_vec[0].bv_len = LOGPSIZE;
2005 bio->bi_io_vec[0].bv_offset = bp->l_offset; 2005 bio->bi_io_vec[0].bv_offset = bp->l_offset;
2006 2006
2007 bio->bi_vcnt = 1; 2007 bio->bi_vcnt = 1;
2008 bio->bi_size = LOGPSIZE; 2008 bio->bi_iter.bi_size = LOGPSIZE;
2009 2009
2010 bio->bi_end_io = lbmIODone; 2010 bio->bi_end_io = lbmIODone;
2011 bio->bi_private = bp; 2011 bio->bi_private = bp;
2012 /*check if journaling to disk has been disabled*/ 2012 /*check if journaling to disk has been disabled*/
2013 if (log->no_integrity) { 2013 if (log->no_integrity) {
2014 bio->bi_size = 0; 2014 bio->bi_iter.bi_size = 0;
2015 lbmIODone(bio, 0); 2015 lbmIODone(bio, 0);
2016 } else { 2016 } else {
2017 submit_bio(READ_SYNC, bio); 2017 submit_bio(READ_SYNC, bio);
@@ -2144,21 +2144,21 @@ static void lbmStartIO(struct lbuf * bp)
2144 jfs_info("lbmStartIO\n"); 2144 jfs_info("lbmStartIO\n");
2145 2145
2146 bio = bio_alloc(GFP_NOFS, 1); 2146 bio = bio_alloc(GFP_NOFS, 1);
2147 bio->bi_sector = bp->l_blkno << (log->l2bsize - 9); 2147 bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
2148 bio->bi_bdev = log->bdev; 2148 bio->bi_bdev = log->bdev;
2149 bio->bi_io_vec[0].bv_page = bp->l_page; 2149 bio->bi_io_vec[0].bv_page = bp->l_page;
2150 bio->bi_io_vec[0].bv_len = LOGPSIZE; 2150 bio->bi_io_vec[0].bv_len = LOGPSIZE;
2151 bio->bi_io_vec[0].bv_offset = bp->l_offset; 2151 bio->bi_io_vec[0].bv_offset = bp->l_offset;
2152 2152
2153 bio->bi_vcnt = 1; 2153 bio->bi_vcnt = 1;
2154 bio->bi_size = LOGPSIZE; 2154 bio->bi_iter.bi_size = LOGPSIZE;
2155 2155
2156 bio->bi_end_io = lbmIODone; 2156 bio->bi_end_io = lbmIODone;
2157 bio->bi_private = bp; 2157 bio->bi_private = bp;
2158 2158
2159 /* check if journaling to disk has been disabled */ 2159 /* check if journaling to disk has been disabled */
2160 if (log->no_integrity) { 2160 if (log->no_integrity) {
2161 bio->bi_size = 0; 2161 bio->bi_iter.bi_size = 0;
2162 lbmIODone(bio, 0); 2162 lbmIODone(bio, 0);
2163 } else { 2163 } else {
2164 submit_bio(WRITE_SYNC, bio); 2164 submit_bio(WRITE_SYNC, bio);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index d165cde0c68d..49ba7ff1bbb9 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -416,7 +416,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
416 * count from hitting zero before we're through 416 * count from hitting zero before we're through
417 */ 417 */
418 inc_io(page); 418 inc_io(page);
419 if (!bio->bi_size) 419 if (!bio->bi_iter.bi_size)
420 goto dump_bio; 420 goto dump_bio;
421 submit_bio(WRITE, bio); 421 submit_bio(WRITE, bio);
422 nr_underway++; 422 nr_underway++;
@@ -438,7 +438,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
438 438
439 bio = bio_alloc(GFP_NOFS, 1); 439 bio = bio_alloc(GFP_NOFS, 1);
440 bio->bi_bdev = inode->i_sb->s_bdev; 440 bio->bi_bdev = inode->i_sb->s_bdev;
441 bio->bi_sector = pblock << (inode->i_blkbits - 9); 441 bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9);
442 bio->bi_end_io = metapage_write_end_io; 442 bio->bi_end_io = metapage_write_end_io;
443 bio->bi_private = page; 443 bio->bi_private = page;
444 444
@@ -452,7 +452,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
452 if (bio) { 452 if (bio) {
453 if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes) 453 if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes)
454 goto add_failed; 454 goto add_failed;
455 if (!bio->bi_size) 455 if (!bio->bi_iter.bi_size)
456 goto dump_bio; 456 goto dump_bio;
457 457
458 submit_bio(WRITE, bio); 458 submit_bio(WRITE, bio);
@@ -517,7 +517,8 @@ static int metapage_readpage(struct file *fp, struct page *page)
517 517
518 bio = bio_alloc(GFP_NOFS, 1); 518 bio = bio_alloc(GFP_NOFS, 1);
519 bio->bi_bdev = inode->i_sb->s_bdev; 519 bio->bi_bdev = inode->i_sb->s_bdev;
520 bio->bi_sector = pblock << (inode->i_blkbits - 9); 520 bio->bi_iter.bi_sector =
521 pblock << (inode->i_blkbits - 9);
521 bio->bi_end_io = metapage_read_end_io; 522 bio->bi_end_io = metapage_read_end_io;
522 bio->bi_private = page; 523 bio->bi_private = page;
523 len = xlen << inode->i_blkbits; 524 len = xlen << inode->i_blkbits;
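
[Editor's note: the mechanical renames in jfs_logmgr.c and jfs_metapage.c above come from the 3.14 immutable-biovec series: a bio's current sector and remaining byte count moved from top-level fields into the embedded struct bvec_iter at bio->bi_iter. A minimal sketch of new-style setup, with a hypothetical submit_one_page() helper for illustration only:]

	#include <linux/bio.h>

	static void submit_one_page(struct block_device *bdev, struct page *page,
				    sector_t sector, unsigned int len,
				    bio_end_io_t *end_io, void *private)
	{
		struct bio *bio = bio_alloc(GFP_NOFS, 1);

		/* pre-3.14 this was bio->bi_sector / bio->bi_size */
		bio->bi_iter.bi_sector = sector;
		bio->bi_bdev = bdev;
		bio_add_page(bio, page, len, 0);
		/* bio_add_page() grows bio->bi_iter.bi_size for us */
		bio->bi_end_io = end_io;
		bio->bi_private = private;
		submit_bio(WRITE, bio);
	}
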
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index e9e100fd7c09..e8d717dabca3 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -61,6 +61,8 @@ extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
61extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); 61extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
62extern int jfs_removexattr(struct dentry *, const char *); 62extern int jfs_removexattr(struct dentry *, const char *);
63 63
64extern const struct xattr_handler *jfs_xattr_handlers[];
65
64#ifdef CONFIG_JFS_SECURITY 66#ifdef CONFIG_JFS_SECURITY
65extern int jfs_init_security(tid_t, struct inode *, struct inode *, 67extern int jfs_init_security(tid_t, struct inode *, struct inode *,
66 const struct qstr *); 68 const struct qstr *);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index aa8a3370631b..d59c7defb1ef 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1524,6 +1524,7 @@ const struct inode_operations jfs_dir_inode_operations = {
1524 .setattr = jfs_setattr, 1524 .setattr = jfs_setattr,
1525#ifdef CONFIG_JFS_POSIX_ACL 1525#ifdef CONFIG_JFS_POSIX_ACL
1526 .get_acl = jfs_get_acl, 1526 .get_acl = jfs_get_acl,
1527 .set_acl = jfs_set_acl,
1527#endif 1528#endif
1528}; 1529};
1529 1530
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 6669aa2042c3..e2b7483444fd 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -44,6 +44,7 @@
44#include "jfs_imap.h" 44#include "jfs_imap.h"
45#include "jfs_acl.h" 45#include "jfs_acl.h"
46#include "jfs_debug.h" 46#include "jfs_debug.h"
47#include "jfs_xattr.h"
47 48
48MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); 49MODULE_DESCRIPTION("The Journaled Filesystem (JFS)");
49MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); 50MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM");
@@ -522,6 +523,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
522 */ 523 */
523 sb->s_op = &jfs_super_operations; 524 sb->s_op = &jfs_super_operations;
524 sb->s_export_op = &jfs_export_operations; 525 sb->s_export_op = &jfs_export_operations;
526 sb->s_xattr = jfs_xattr_handlers;
525#ifdef CONFIG_QUOTA 527#ifdef CONFIG_QUOTA
526 sb->dq_op = &dquot_operations; 528 sb->dq_op = &dquot_operations;
527 sb->s_qcop = &dquot_quotactl_ops; 529 sb->s_qcop = &dquot_quotactl_ops;
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index d3472f4cd530..46325d5c34fc 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -666,81 +666,12 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
666} 666}
667 667
668/* 668/*
669 * can_set_system_xattr
670 *
671 * This code is specific to the system.* namespace. It contains policy
672 * which doesn't belong in the main xattr codepath.
673 */
674static int can_set_system_xattr(struct inode *inode, const char *name,
675 const void *value, size_t value_len)
676{
677#ifdef CONFIG_JFS_POSIX_ACL
678 struct posix_acl *acl;
679 int rc;
680
681 if (!inode_owner_or_capable(inode))
682 return -EPERM;
683
684 /*
685 * POSIX_ACL_XATTR_ACCESS is tied to i_mode
686 */
687 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) {
688 acl = posix_acl_from_xattr(&init_user_ns, value, value_len);
689 if (IS_ERR(acl)) {
690 rc = PTR_ERR(acl);
691 printk(KERN_ERR "posix_acl_from_xattr returned %d\n",
692 rc);
693 return rc;
694 }
695 if (acl) {
696 rc = posix_acl_equiv_mode(acl, &inode->i_mode);
697 posix_acl_release(acl);
698 if (rc < 0) {
699 printk(KERN_ERR
700 "posix_acl_equiv_mode returned %d\n",
701 rc);
702 return rc;
703 }
704 mark_inode_dirty(inode);
705 }
706 /*
707 * We're changing the ACL. Get rid of the cached one
708 */
709 forget_cached_acl(inode, ACL_TYPE_ACCESS);
710
711 return 0;
712 } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
713 acl = posix_acl_from_xattr(&init_user_ns, value, value_len);
714 if (IS_ERR(acl)) {
715 rc = PTR_ERR(acl);
716 printk(KERN_ERR "posix_acl_from_xattr returned %d\n",
717 rc);
718 return rc;
719 }
720 posix_acl_release(acl);
721
722 /*
723 * We're changing the default ACL. Get rid of the cached one
724 */
725 forget_cached_acl(inode, ACL_TYPE_DEFAULT);
726
727 return 0;
728 }
729#endif /* CONFIG_JFS_POSIX_ACL */
730 return -EOPNOTSUPP;
731}
732
733/*
734 * Most of the permission checking is done by xattr_permission in the vfs. 669 * Most of the permission checking is done by xattr_permission in the vfs.
735 * The local file system is responsible for handling the system.* namespace.
736 * We also need to verify that this is a namespace that we recognize. 670 * We also need to verify that this is a namespace that we recognize.
737 */ 671 */
738static int can_set_xattr(struct inode *inode, const char *name, 672static int can_set_xattr(struct inode *inode, const char *name,
739 const void *value, size_t value_len) 673 const void *value, size_t value_len)
740{ 674{
741 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
742 return can_set_system_xattr(inode, name, value, value_len);
743
744 if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { 675 if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) {
745 /* 676 /*
746 * This makes sure that we aren't trying to set an 677 * This makes sure that we aren't trying to set an
@@ -748,7 +679,7 @@ static int can_set_xattr(struct inode *inode, const char *name,
748 * with "os2." 679 * with "os2."
749 */ 680 */
750 if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) 681 if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN))
751 return -EOPNOTSUPP; 682 return -EOPNOTSUPP;
752 return 0; 683 return 0;
753 } 684 }
754 685
@@ -860,6 +791,19 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
860 /* Completely new ea list */ 791 /* Completely new ea list */
861 xattr_size = sizeof (struct jfs_ea_list); 792 xattr_size = sizeof (struct jfs_ea_list);
862 793
794 /*
 795 * The size of an EA value is limited by the on-disk format to
 796 * a __le16, so there would be an overflow if the size were
 797 * equal to XATTR_SIZE_MAX (65536). To avoid this, pre-check
 798 * the value size against USHRT_MAX and return -E2BIG in
 799 * that case, which is consistent with the VFS setxattr
 800 * interface.
801 */
802 if (value_len >= USHRT_MAX) {
803 rc = -E2BIG;
804 goto release;
805 }
806
863 ea = (struct jfs_ea *) ((char *) ealist + xattr_size); 807 ea = (struct jfs_ea *) ((char *) ealist + xattr_size);
864 ea->flag = 0; 808 ea->flag = 0;
865 ea->namelen = namelen; 809 ea->namelen = namelen;
@@ -874,7 +818,7 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
874 /* DEBUG - If we did this right, these number match */ 818 /* DEBUG - If we did this right, these number match */
875 if (xattr_size != new_size) { 819 if (xattr_size != new_size) {
876 printk(KERN_ERR 820 printk(KERN_ERR
877 "jfs_xsetattr: xattr_size = %d, new_size = %d\n", 821 "__jfs_setxattr: xattr_size = %d, new_size = %d\n",
878 xattr_size, new_size); 822 xattr_size, new_size);
879 823
880 rc = -EINVAL; 824 rc = -EINVAL;
@@ -910,6 +854,14 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
910 int rc; 854 int rc;
911 tid_t tid; 855 tid_t tid;
912 856
857 /*
858 * If this is a request for a synthetic attribute in the system.*
859 * namespace use the generic infrastructure to resolve a handler
860 * for it via sb->s_xattr.
861 */
862 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
863 return generic_setxattr(dentry, name, value, value_len, flags);
864
913 if ((rc = can_set_xattr(inode, name, value, value_len))) 865 if ((rc = can_set_xattr(inode, name, value, value_len)))
914 return rc; 866 return rc;
915 867
@@ -986,6 +938,14 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
986{ 938{
987 int err; 939 int err;
988 940
941 /*
942 * If this is a request for a synthetic attribute in the system.*
943 * namespace use the generic infrastructure to resolve a handler
944 * for it via sb->s_xattr.
945 */
946 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
947 return generic_getxattr(dentry, name, data, buf_size);
948
989 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { 949 if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
990 /* 950 /*
991 * skip past "os2." prefix 951 * skip past "os2." prefix
@@ -1074,6 +1034,14 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
1074 int rc; 1034 int rc;
1075 tid_t tid; 1035 tid_t tid;
1076 1036
1037 /*
1038 * If this is a request for a synthetic attribute in the system.*
1039 * namespace use the generic infrastructure to resolve a handler
1040 * for it via sb->s_xattr.
1041 */
1042 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
1043 return generic_removexattr(dentry, name);
1044
1077 if ((rc = can_set_xattr(inode, name, NULL, 0))) 1045 if ((rc = can_set_xattr(inode, name, NULL, 0)))
1078 return rc; 1046 return rc;
1079 1047
@@ -1088,6 +1056,19 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
1088 return rc; 1056 return rc;
1089} 1057}
1090 1058
1059/*
1060 * List of handlers for synthetic system.* attributes. All real ondisk
1061 * attributes are handled directly.
1062 */
1063const struct xattr_handler *jfs_xattr_handlers[] = {
1064#ifdef CONFIG_JFS_POSIX_ACL
1065 &posix_acl_access_xattr_handler,
1066 &posix_acl_default_xattr_handler,
1067#endif
1068 NULL,
1069};
1070
1071
1091#ifdef CONFIG_JFS_SECURITY 1072#ifdef CONFIG_JFS_SECURITY
1092static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, 1073static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
1093 void *fs_info) 1074 void *fs_info)
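
[Editor's note: taken together, the jfs_setxattr/jfs_getxattr/jfs_removexattr hunks and the new jfs_xattr_handlers[] table implement one pattern, shared with the jffs2 changes earlier in this diff: system.* attributes are synthetic, so they are bounced to the generic_*xattr() helpers, which walk sb->s_xattr and land in the shared POSIX ACL handlers. A condensed sketch of the read-side dispatch, with the os2 prefix handling and on-disk path elided:]

	#include <linux/xattr.h>
	#include <linux/posix_acl_xattr.h>

	/* walked by generic_{get,set,remove}xattr() via sb->s_xattr;
	 * registered in jfs_fill_super(): sb->s_xattr = jfs_xattr_handlers; */
	const struct xattr_handler *jfs_xattr_handlers[] = {
	#ifdef CONFIG_JFS_POSIX_ACL
		&posix_acl_access_xattr_handler,	/* system.posix_acl_access */
		&posix_acl_default_xattr_handler,	/* system.posix_acl_default */
	#endif
		NULL,
	};

	ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
			     size_t buf_size)
	{
		/* system.* attrs are synthetic: let the VFS resolve a handler,
		 * which ends up calling ->get_acl/->set_acl underneath */
		if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
			return generic_getxattr(dentry, name, data, buf_size);

		return __jfs_getxattr(dentry->d_inode, name, data, buf_size);
	}
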
diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile
new file mode 100644
index 000000000000..674337c76673
--- /dev/null
+++ b/fs/kernfs/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the kernfs pseudo filesystem
3#
4
5obj-y := mount.o inode.o dir.o file.o symlink.o
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
new file mode 100644
index 000000000000..bd6e18be6e1a
--- /dev/null
+++ b/fs/kernfs/dir.c
@@ -0,0 +1,1077 @@
1/*
2 * fs/kernfs/dir.c - kernfs directory implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/namei.h>
13#include <linux/idr.h>
14#include <linux/slab.h>
15#include <linux/security.h>
16#include <linux/hash.h>
17
18#include "kernfs-internal.h"
19
20DEFINE_MUTEX(kernfs_mutex);
21
22#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
23
24/**
25 * kernfs_name_hash
26 * @name: Null terminated string to hash
27 * @ns: Namespace tag to hash
28 *
 29 * Returns a 31-bit hash of ns + name (so it fits in an off_t)
30 */
31static unsigned int kernfs_name_hash(const char *name, const void *ns)
32{
33 unsigned long hash = init_name_hash();
34 unsigned int len = strlen(name);
35 while (len--)
36 hash = partial_name_hash(*name++, hash);
37 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
38 hash &= 0x7fffffffU;
39 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
40 if (hash < 1)
41 hash += 2;
42 if (hash >= INT_MAX)
43 hash = INT_MAX - 1;
44 return hash;
45}
46
47static int kernfs_name_compare(unsigned int hash, const char *name,
48 const void *ns, const struct kernfs_node *kn)
49{
50 if (hash != kn->hash)
51 return hash - kn->hash;
52 if (ns != kn->ns)
53 return ns - kn->ns;
54 return strcmp(name, kn->name);
55}
56
57static int kernfs_sd_compare(const struct kernfs_node *left,
58 const struct kernfs_node *right)
59{
60 return kernfs_name_compare(left->hash, left->name, left->ns, right);
61}
62
63/**
64 * kernfs_link_sibling - link kernfs_node into sibling rbtree
65 * @kn: kernfs_node of interest
66 *
67 * Link @kn into its sibling rbtree which starts from
68 * @kn->parent->dir.children.
69 *
70 * Locking:
71 * mutex_lock(kernfs_mutex)
72 *
73 * RETURNS:
 74 * 0 on success, -EEXIST on failure.
75 */
76static int kernfs_link_sibling(struct kernfs_node *kn)
77{
78 struct rb_node **node = &kn->parent->dir.children.rb_node;
79 struct rb_node *parent = NULL;
80
81 if (kernfs_type(kn) == KERNFS_DIR)
82 kn->parent->dir.subdirs++;
83
84 while (*node) {
85 struct kernfs_node *pos;
86 int result;
87
88 pos = rb_to_kn(*node);
89 parent = *node;
90 result = kernfs_sd_compare(kn, pos);
91 if (result < 0)
92 node = &pos->rb.rb_left;
93 else if (result > 0)
94 node = &pos->rb.rb_right;
95 else
96 return -EEXIST;
97 }
98 /* add new node and rebalance the tree */
99 rb_link_node(&kn->rb, parent, node);
100 rb_insert_color(&kn->rb, &kn->parent->dir.children);
101 return 0;
102}
103
104/**
105 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
106 * @kn: kernfs_node of interest
107 *
108 * Unlink @kn from its sibling rbtree which starts from
109 * kn->parent->dir.children.
110 *
111 * Locking:
112 * mutex_lock(kernfs_mutex)
113 */
114static void kernfs_unlink_sibling(struct kernfs_node *kn)
115{
116 if (kernfs_type(kn) == KERNFS_DIR)
117 kn->parent->dir.subdirs--;
118
119 rb_erase(&kn->rb, &kn->parent->dir.children);
120}
121
122/**
123 * kernfs_get_active - get an active reference to kernfs_node
124 * @kn: kernfs_node to get an active reference to
125 *
 126 * Get an active reference to @kn. This function is a noop if @kn
127 * is NULL.
128 *
129 * RETURNS:
130 * Pointer to @kn on success, NULL on failure.
131 */
132struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
133{
134 if (unlikely(!kn))
135 return NULL;
136
137 if (!atomic_inc_unless_negative(&kn->active))
138 return NULL;
139
140 if (kn->flags & KERNFS_LOCKDEP)
141 rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
142 return kn;
143}
144
145/**
146 * kernfs_put_active - put an active reference to kernfs_node
147 * @kn: kernfs_node to put an active reference to
148 *
 149 * Put an active reference to @kn. This function is a noop if @kn
150 * is NULL.
151 */
152void kernfs_put_active(struct kernfs_node *kn)
153{
154 int v;
155
156 if (unlikely(!kn))
157 return;
158
159 if (kn->flags & KERNFS_LOCKDEP)
160 rwsem_release(&kn->dep_map, 1, _RET_IP_);
161 v = atomic_dec_return(&kn->active);
162 if (likely(v != KN_DEACTIVATED_BIAS))
163 return;
164
165 /*
166 * atomic_dec_return() is a mb(), we'll always see the updated
167 * kn->u.completion.
168 */
169 complete(kn->u.completion);
170}
171
172/**
173 * kernfs_deactivate - deactivate kernfs_node
174 * @kn: kernfs_node to deactivate
175 *
176 * Deny new active references and drain existing ones.
177 */
178static void kernfs_deactivate(struct kernfs_node *kn)
179{
180 DECLARE_COMPLETION_ONSTACK(wait);
181 int v;
182
183 BUG_ON(!(kn->flags & KERNFS_REMOVED));
184
185 if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF))
186 return;
187
188 kn->u.completion = (void *)&wait;
189
190 if (kn->flags & KERNFS_LOCKDEP)
191 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
192 /* atomic_add_return() is a mb(), put_active() will always see
193 * the updated kn->u.completion.
194 */
195 v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active);
196
197 if (v != KN_DEACTIVATED_BIAS) {
198 if (kn->flags & KERNFS_LOCKDEP)
199 lock_contended(&kn->dep_map, _RET_IP_);
200 wait_for_completion(&wait);
201 }
202
203 if (kn->flags & KERNFS_LOCKDEP) {
204 lock_acquired(&kn->dep_map, _RET_IP_);
205 rwsem_release(&kn->dep_map, 1, _RET_IP_);
206 }
207}
208
209/**
210 * kernfs_get - get a reference count on a kernfs_node
211 * @kn: the target kernfs_node
212 */
213void kernfs_get(struct kernfs_node *kn)
214{
215 if (kn) {
216 WARN_ON(!atomic_read(&kn->count));
217 atomic_inc(&kn->count);
218 }
219}
220EXPORT_SYMBOL_GPL(kernfs_get);
221
222/**
223 * kernfs_put - put a reference count on a kernfs_node
224 * @kn: the target kernfs_node
225 *
226 * Put a reference count of @kn and destroy it if it reached zero.
227 */
228void kernfs_put(struct kernfs_node *kn)
229{
230 struct kernfs_node *parent;
231 struct kernfs_root *root;
232
233 if (!kn || !atomic_dec_and_test(&kn->count))
234 return;
235 root = kernfs_root(kn);
236 repeat:
237 /* Moving/renaming is always done while holding reference.
238 * kn->parent won't change beneath us.
239 */
240 parent = kn->parent;
241
242 WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n",
243 parent ? parent->name : "", kn->name);
244
245 if (kernfs_type(kn) == KERNFS_LINK)
246 kernfs_put(kn->symlink.target_kn);
247 if (!(kn->flags & KERNFS_STATIC_NAME))
248 kfree(kn->name);
249 if (kn->iattr) {
250 if (kn->iattr->ia_secdata)
251 security_release_secctx(kn->iattr->ia_secdata,
252 kn->iattr->ia_secdata_len);
253 simple_xattrs_free(&kn->iattr->xattrs);
254 }
255 kfree(kn->iattr);
256 ida_simple_remove(&root->ino_ida, kn->ino);
257 kmem_cache_free(kernfs_node_cache, kn);
258
259 kn = parent;
260 if (kn) {
261 if (atomic_dec_and_test(&kn->count))
262 goto repeat;
263 } else {
264 /* just released the root kn, free @root too */
265 ida_destroy(&root->ino_ida);
266 kfree(root);
267 }
268}
269EXPORT_SYMBOL_GPL(kernfs_put);
270
271static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
272{
273 struct kernfs_node *kn;
274
275 if (flags & LOOKUP_RCU)
276 return -ECHILD;
277
278 /* Always perform fresh lookup for negatives */
279 if (!dentry->d_inode)
280 goto out_bad_unlocked;
281
282 kn = dentry->d_fsdata;
283 mutex_lock(&kernfs_mutex);
284
285 /* The kernfs node has been deleted */
286 if (kn->flags & KERNFS_REMOVED)
287 goto out_bad;
288
289 /* The kernfs node has been moved? */
290 if (dentry->d_parent->d_fsdata != kn->parent)
291 goto out_bad;
292
293 /* The kernfs node has been renamed */
294 if (strcmp(dentry->d_name.name, kn->name) != 0)
295 goto out_bad;
296
297 /* The kernfs node has been moved to a different namespace */
298 if (kn->parent && kernfs_ns_enabled(kn->parent) &&
299 kernfs_info(dentry->d_sb)->ns != kn->ns)
300 goto out_bad;
301
302 mutex_unlock(&kernfs_mutex);
303out_valid:
304 return 1;
305out_bad:
306 mutex_unlock(&kernfs_mutex);
307out_bad_unlocked:
308 /*
309 * @dentry doesn't match the underlying kernfs node, drop the
310 * dentry and force lookup. If we have submounts we must allow the
311 * vfs caches to lie about the state of the filesystem to prevent
312 * leaks and other nasty things, so use check_submounts_and_drop()
313 * instead of d_drop().
314 */
315 if (check_submounts_and_drop(dentry) != 0)
316 goto out_valid;
317
318 return 0;
319}
320
321static void kernfs_dop_release(struct dentry *dentry)
322{
323 kernfs_put(dentry->d_fsdata);
324}
325
326const struct dentry_operations kernfs_dops = {
327 .d_revalidate = kernfs_dop_revalidate,
328 .d_release = kernfs_dop_release,
329};
330
331static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
332 const char *name, umode_t mode,
333 unsigned flags)
334{
335 char *dup_name = NULL;
336 struct kernfs_node *kn;
337 int ret;
338
339 if (!(flags & KERNFS_STATIC_NAME)) {
340 name = dup_name = kstrdup(name, GFP_KERNEL);
341 if (!name)
342 return NULL;
343 }
344
345 kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
346 if (!kn)
347 goto err_out1;
348
349 ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
350 if (ret < 0)
351 goto err_out2;
352 kn->ino = ret;
353
354 atomic_set(&kn->count, 1);
355 atomic_set(&kn->active, 0);
356
357 kn->name = name;
358 kn->mode = mode;
359 kn->flags = flags | KERNFS_REMOVED;
360
361 return kn;
362
363 err_out2:
364 kmem_cache_free(kernfs_node_cache, kn);
365 err_out1:
366 kfree(dup_name);
367 return NULL;
368}
369
370struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
371 const char *name, umode_t mode,
372 unsigned flags)
373{
374 struct kernfs_node *kn;
375
376 kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
377 if (kn) {
378 kernfs_get(parent);
379 kn->parent = parent;
380 }
381 return kn;
382}
383
384/**
385 * kernfs_addrm_start - prepare for kernfs_node add/remove
386 * @acxt: pointer to kernfs_addrm_cxt to be used
387 *
388 * This function is called when the caller is about to add or remove
 389 * a kernfs_node. This function acquires kernfs_mutex. @acxt is used
390 * to keep and pass context to other addrm functions.
391 *
392 * LOCKING:
393 * Kernel thread context (may sleep). kernfs_mutex is locked on
394 * return.
395 */
396void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt)
397 __acquires(kernfs_mutex)
398{
399 memset(acxt, 0, sizeof(*acxt));
400
401 mutex_lock(&kernfs_mutex);
402}
403
404/**
405 * kernfs_add_one - add kernfs_node to parent without warning
406 * @acxt: addrm context to use
407 * @kn: kernfs_node to be added
408 *
409 * The caller must already have initialized @kn->parent. This
410 * function increments nlink of the parent's inode if @kn is a
 411 * directory and links @kn into the parent's children list.
412 *
413 * This function should be called between calls to
414 * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed
415 * the same @acxt as passed to kernfs_addrm_start().
416 *
417 * LOCKING:
418 * Determined by kernfs_addrm_start().
419 *
420 * RETURNS:
421 * 0 on success, -EEXIST if entry with the given name already
422 * exists.
423 */
424int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn)
425{
426 struct kernfs_node *parent = kn->parent;
427 bool has_ns = kernfs_ns_enabled(parent);
428 struct kernfs_iattrs *ps_iattr;
429 int ret;
430
431 if (has_ns != (bool)kn->ns) {
432 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
433 has_ns ? "required" : "invalid", parent->name, kn->name);
434 return -EINVAL;
435 }
436
437 if (kernfs_type(parent) != KERNFS_DIR)
438 return -EINVAL;
439
440 if (parent->flags & KERNFS_REMOVED)
441 return -ENOENT;
442
443 kn->hash = kernfs_name_hash(kn->name, kn->ns);
444
445 ret = kernfs_link_sibling(kn);
446 if (ret)
447 return ret;
448
449 /* Update timestamps on the parent */
450 ps_iattr = parent->iattr;
451 if (ps_iattr) {
452 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
453 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
454 }
455
456 /* Mark the entry added into directory tree */
457 kn->flags &= ~KERNFS_REMOVED;
458
459 return 0;
460}
461
462/**
463 * kernfs_remove_one - remove kernfs_node from parent
464 * @acxt: addrm context to use
465 * @kn: kernfs_node to be removed
466 *
467 * Mark @kn removed and drop nlink of parent inode if @kn is a
468 * directory. @kn is unlinked from the children list.
469 *
470 * This function should be called between calls to
471 * kernfs_addrm_start() and kernfs_addrm_finish() and should be
472 * passed the same @acxt as passed to kernfs_addrm_start().
473 *
474 * LOCKING:
475 * Determined by kernfs_addrm_start().
476 */
477static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt,
478 struct kernfs_node *kn)
479{
480 struct kernfs_iattrs *ps_iattr;
481
482 /*
483 * Removal can be called multiple times on the same node. Only the
484 * first invocation is effective and puts the base ref.
485 */
486 if (kn->flags & KERNFS_REMOVED)
487 return;
488
489 if (kn->parent) {
490 kernfs_unlink_sibling(kn);
491
492 /* Update timestamps on the parent */
493 ps_iattr = kn->parent->iattr;
494 if (ps_iattr) {
495 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
496 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
497 }
498 }
499
500 kn->flags |= KERNFS_REMOVED;
501 kn->u.removed_list = acxt->removed;
502 acxt->removed = kn;
503}
504
505/**
506 * kernfs_addrm_finish - finish up kernfs_node add/remove
507 * @acxt: addrm context to finish up
508 *
509 * Finish up kernfs_node add/remove. Resources acquired by
510 * kernfs_addrm_start() are released and removed kernfs_nodes are
511 * cleaned up.
512 *
513 * LOCKING:
514 * kernfs_mutex is released.
515 */
516void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt)
517 __releases(kernfs_mutex)
518{
519 /* release resources acquired by kernfs_addrm_start() */
520 mutex_unlock(&kernfs_mutex);
521
522 /* kill removed kernfs_nodes */
523 while (acxt->removed) {
524 struct kernfs_node *kn = acxt->removed;
525
526 acxt->removed = kn->u.removed_list;
527
528 kernfs_deactivate(kn);
529 kernfs_unmap_bin_file(kn);
530 kernfs_put(kn);
531 }
532}
533
534/**
535 * kernfs_find_ns - find kernfs_node with the given name
536 * @parent: kernfs_node to search under
537 * @name: name to look for
538 * @ns: the namespace tag to use
539 *
540 * Look for kernfs_node with name @name under @parent. Returns pointer to
541 * the found kernfs_node on success, %NULL on failure.
542 */
543static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
544 const unsigned char *name,
545 const void *ns)
546{
547 struct rb_node *node = parent->dir.children.rb_node;
548 bool has_ns = kernfs_ns_enabled(parent);
549 unsigned int hash;
550
551 lockdep_assert_held(&kernfs_mutex);
552
553 if (has_ns != (bool)ns) {
554 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
555 has_ns ? "required" : "invalid", parent->name, name);
556 return NULL;
557 }
558
559 hash = kernfs_name_hash(name, ns);
560 while (node) {
561 struct kernfs_node *kn;
562 int result;
563
564 kn = rb_to_kn(node);
565 result = kernfs_name_compare(hash, name, ns, kn);
566 if (result < 0)
567 node = node->rb_left;
568 else if (result > 0)
569 node = node->rb_right;
570 else
571 return kn;
572 }
573 return NULL;
574}
575
576/**
577 * kernfs_find_and_get_ns - find and get kernfs_node with the given name
578 * @parent: kernfs_node to search under
579 * @name: name to look for
580 * @ns: the namespace tag to use
581 *
582 * Look for kernfs_node with name @name under @parent and get a reference
583 * if found. This function may sleep and returns pointer to the found
584 * kernfs_node on success, %NULL on failure.
585 */
586struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
587 const char *name, const void *ns)
588{
589 struct kernfs_node *kn;
590
591 mutex_lock(&kernfs_mutex);
592 kn = kernfs_find_ns(parent, name, ns);
593 kernfs_get(kn);
594 mutex_unlock(&kernfs_mutex);
595
596 return kn;
597}
598EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
599
600/**
601 * kernfs_create_root - create a new kernfs hierarchy
602 * @kdops: optional directory syscall operations for the hierarchy
603 * @priv: opaque data associated with the new directory
604 *
605 * Returns the root of the new hierarchy on success, ERR_PTR() value on
606 * failure.
607 */
608struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv)
609{
610 struct kernfs_root *root;
611 struct kernfs_node *kn;
612
613 root = kzalloc(sizeof(*root), GFP_KERNEL);
614 if (!root)
615 return ERR_PTR(-ENOMEM);
616
617 ida_init(&root->ino_ida);
618
619 kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
620 KERNFS_DIR);
621 if (!kn) {
622 ida_destroy(&root->ino_ida);
623 kfree(root);
624 return ERR_PTR(-ENOMEM);
625 }
626
627 kn->flags &= ~KERNFS_REMOVED;
628 kn->priv = priv;
629 kn->dir.root = root;
630
631 root->dir_ops = kdops;
632 root->kn = kn;
633
634 return root;
635}
636
637/**
638 * kernfs_destroy_root - destroy a kernfs hierarchy
639 * @root: root of the hierarchy to destroy
640 *
641 * Destroy the hierarchy anchored at @root by removing all existing
642 * directories and destroying @root.
643 */
644void kernfs_destroy_root(struct kernfs_root *root)
645{
646 kernfs_remove(root->kn); /* will also free @root */
647}
648
649/**
650 * kernfs_create_dir_ns - create a directory
651 * @parent: parent in which to create a new directory
652 * @name: name of the new directory
653 * @mode: mode of the new directory
654 * @priv: opaque data associated with the new directory
655 * @ns: optional namespace tag of the directory
656 *
657 * Returns the created node on success, ERR_PTR() value on failure.
658 */
659struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
660 const char *name, umode_t mode,
661 void *priv, const void *ns)
662{
663 struct kernfs_addrm_cxt acxt;
664 struct kernfs_node *kn;
665 int rc;
666
667 /* allocate */
668 kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
669 if (!kn)
670 return ERR_PTR(-ENOMEM);
671
672 kn->dir.root = parent->dir.root;
673 kn->ns = ns;
674 kn->priv = priv;
675
676 /* link in */
677 kernfs_addrm_start(&acxt);
678 rc = kernfs_add_one(&acxt, kn);
679 kernfs_addrm_finish(&acxt);
680
681 if (!rc)
682 return kn;
683
684 kernfs_put(kn);
685 return ERR_PTR(rc);
686}
687
688static struct dentry *kernfs_iop_lookup(struct inode *dir,
689 struct dentry *dentry,
690 unsigned int flags)
691{
692 struct dentry *ret;
693 struct kernfs_node *parent = dentry->d_parent->d_fsdata;
694 struct kernfs_node *kn;
695 struct inode *inode;
696 const void *ns = NULL;
697
698 mutex_lock(&kernfs_mutex);
699
700 if (kernfs_ns_enabled(parent))
701 ns = kernfs_info(dir->i_sb)->ns;
702
703 kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
704
705 /* no such entry */
706 if (!kn) {
707 ret = NULL;
708 goto out_unlock;
709 }
710 kernfs_get(kn);
711 dentry->d_fsdata = kn;
712
713 /* attach dentry and inode */
714 inode = kernfs_get_inode(dir->i_sb, kn);
715 if (!inode) {
716 ret = ERR_PTR(-ENOMEM);
717 goto out_unlock;
718 }
719
720 /* instantiate and hash dentry */
721 ret = d_materialise_unique(dentry, inode);
722 out_unlock:
723 mutex_unlock(&kernfs_mutex);
724 return ret;
725}
726
727static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
728 umode_t mode)
729{
730 struct kernfs_node *parent = dir->i_private;
731 struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops;
732
733 if (!kdops || !kdops->mkdir)
734 return -EPERM;
735
736 return kdops->mkdir(parent, dentry->d_name.name, mode);
737}
738
739static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
740{
741 struct kernfs_node *kn = dentry->d_fsdata;
742 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops;
743
744 if (!kdops || !kdops->rmdir)
745 return -EPERM;
746
747 return kdops->rmdir(kn);
748}
749
750static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
751 struct inode *new_dir, struct dentry *new_dentry)
752{
753 struct kernfs_node *kn = old_dentry->d_fsdata;
754 struct kernfs_node *new_parent = new_dir->i_private;
755 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops;
756
757 if (!kdops || !kdops->rename)
758 return -EPERM;
759
760 return kdops->rename(kn, new_parent, new_dentry->d_name.name);
761}
762
763const struct inode_operations kernfs_dir_iops = {
764 .lookup = kernfs_iop_lookup,
765 .permission = kernfs_iop_permission,
766 .setattr = kernfs_iop_setattr,
767 .getattr = kernfs_iop_getattr,
768 .setxattr = kernfs_iop_setxattr,
769 .removexattr = kernfs_iop_removexattr,
770 .getxattr = kernfs_iop_getxattr,
771 .listxattr = kernfs_iop_listxattr,
772
773 .mkdir = kernfs_iop_mkdir,
774 .rmdir = kernfs_iop_rmdir,
775 .rename = kernfs_iop_rename,
776};
777
778static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
779{
780 struct kernfs_node *last;
781
782 while (true) {
783 struct rb_node *rbn;
784
785 last = pos;
786
787 if (kernfs_type(pos) != KERNFS_DIR)
788 break;
789
790 rbn = rb_first(&pos->dir.children);
791 if (!rbn)
792 break;
793
794 pos = rb_to_kn(rbn);
795 }
796
797 return last;
798}
799
800/**
801 * kernfs_next_descendant_post - find the next descendant for post-order walk
802 * @pos: the current position (%NULL to initiate traversal)
803 * @root: kernfs_node whose descendants to walk
804 *
805 * Find the next descendant to visit for post-order traversal of @root's
 806 * descendants. @root is included in the iteration and is the last node to be
807 * visited.
808 */
809static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
810 struct kernfs_node *root)
811{
812 struct rb_node *rbn;
813
814 lockdep_assert_held(&kernfs_mutex);
815
816 /* if first iteration, visit leftmost descendant which may be root */
817 if (!pos)
818 return kernfs_leftmost_descendant(root);
819
820 /* if we visited @root, we're done */
821 if (pos == root)
822 return NULL;
823
824 /* if there's an unvisited sibling, visit its leftmost descendant */
825 rbn = rb_next(&pos->rb);
826 if (rbn)
827 return kernfs_leftmost_descendant(rb_to_kn(rbn));
828
829 /* no sibling left, visit parent */
830 return pos->parent;
831}
832
833static void __kernfs_remove(struct kernfs_addrm_cxt *acxt,
834 struct kernfs_node *kn)
835{
836 struct kernfs_node *pos, *next;
837
838 if (!kn)
839 return;
840
841 pr_debug("kernfs %s: removing\n", kn->name);
842
843 next = NULL;
844 do {
845 pos = next;
846 next = kernfs_next_descendant_post(pos, kn);
847 if (pos)
848 kernfs_remove_one(acxt, pos);
849 } while (next);
850}
851
852/**
853 * kernfs_remove - remove a kernfs_node recursively
854 * @kn: the kernfs_node to remove
855 *
856 * Remove @kn along with all its subdirectories and files.
857 */
858void kernfs_remove(struct kernfs_node *kn)
859{
860 struct kernfs_addrm_cxt acxt;
861
862 kernfs_addrm_start(&acxt);
863 __kernfs_remove(&acxt, kn);
864 kernfs_addrm_finish(&acxt);
865}
866
867/**
868 * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
869 * @parent: parent of the target
870 * @name: name of the kernfs_node to remove
871 * @ns: namespace tag of the kernfs_node to remove
872 *
873 * Look for the kernfs_node with @name and @ns under @parent and remove it.
874 * Returns 0 on success, -ENOENT if such entry doesn't exist.
875 */
876int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
877 const void *ns)
878{
879 struct kernfs_addrm_cxt acxt;
880 struct kernfs_node *kn;
881
882 if (!parent) {
 883 WARN(1, KERN_WARNING "kernfs: cannot remove '%s', no directory\n",
884 name);
885 return -ENOENT;
886 }
887
888 kernfs_addrm_start(&acxt);
889
890 kn = kernfs_find_ns(parent, name, ns);
891 if (kn)
892 __kernfs_remove(&acxt, kn);
893
894 kernfs_addrm_finish(&acxt);
895
896 if (kn)
897 return 0;
898 else
899 return -ENOENT;
900}
901
902/**
903 * kernfs_rename_ns - move and rename a kernfs_node
904 * @kn: target node
905 * @new_parent: new parent to put @sd under
906 * @new_name: new name
907 * @new_ns: new namespace tag
908 */
909int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
910 const char *new_name, const void *new_ns)
911{
912 int error;
913
914 mutex_lock(&kernfs_mutex);
915
916 error = -ENOENT;
917 if ((kn->flags | new_parent->flags) & KERNFS_REMOVED)
918 goto out;
919
920 error = 0;
921 if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
922 (strcmp(kn->name, new_name) == 0))
923 goto out; /* nothing to rename */
924
925 error = -EEXIST;
926 if (kernfs_find_ns(new_parent, new_name, new_ns))
927 goto out;
928
929 /* rename kernfs_node */
930 if (strcmp(kn->name, new_name) != 0) {
931 error = -ENOMEM;
932 new_name = kstrdup(new_name, GFP_KERNEL);
933 if (!new_name)
934 goto out;
935
936 if (kn->flags & KERNFS_STATIC_NAME)
937 kn->flags &= ~KERNFS_STATIC_NAME;
938 else
939 kfree(kn->name);
940
941 kn->name = new_name;
942 }
943
944 /*
945 * Move to the appropriate place in the appropriate directories rbtree.
946 */
947 kernfs_unlink_sibling(kn);
948 kernfs_get(new_parent);
949 kernfs_put(kn->parent);
950 kn->ns = new_ns;
951 kn->hash = kernfs_name_hash(kn->name, kn->ns);
952 kn->parent = new_parent;
953 kernfs_link_sibling(kn);
954
955 error = 0;
956 out:
957 mutex_unlock(&kernfs_mutex);
958 return error;
959}
960
961/* Relationship between s_mode and the DT_xxx types */
962static inline unsigned char dt_type(struct kernfs_node *kn)
963{
964 return (kn->mode >> 12) & 15;
965}
966
967static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
968{
969 kernfs_put(filp->private_data);
970 return 0;
971}
972
973static struct kernfs_node *kernfs_dir_pos(const void *ns,
974 struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
975{
976 if (pos) {
977 int valid = !(pos->flags & KERNFS_REMOVED) &&
978 pos->parent == parent && hash == pos->hash;
979 kernfs_put(pos);
980 if (!valid)
981 pos = NULL;
982 }
983 if (!pos && (hash > 1) && (hash < INT_MAX)) {
984 struct rb_node *node = parent->dir.children.rb_node;
985 while (node) {
986 pos = rb_to_kn(node);
987
988 if (hash < pos->hash)
989 node = node->rb_left;
990 else if (hash > pos->hash)
991 node = node->rb_right;
992 else
993 break;
994 }
995 }
996 /* Skip over entries in the wrong namespace */
997 while (pos && pos->ns != ns) {
998 struct rb_node *node = rb_next(&pos->rb);
999 if (!node)
1000 pos = NULL;
1001 else
1002 pos = rb_to_kn(node);
1003 }
1004 return pos;
1005}
1006
1007static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
1008 struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
1009{
1010 pos = kernfs_dir_pos(ns, parent, ino, pos);
1011 if (pos)
1012 do {
1013 struct rb_node *node = rb_next(&pos->rb);
1014 if (!node)
1015 pos = NULL;
1016 else
1017 pos = rb_to_kn(node);
1018 } while (pos && pos->ns != ns);
1019 return pos;
1020}
1021
1022static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
1023{
1024 struct dentry *dentry = file->f_path.dentry;
1025 struct kernfs_node *parent = dentry->d_fsdata;
1026 struct kernfs_node *pos = file->private_data;
1027 const void *ns = NULL;
1028
1029 if (!dir_emit_dots(file, ctx))
1030 return 0;
1031 mutex_lock(&kernfs_mutex);
1032
1033 if (kernfs_ns_enabled(parent))
1034 ns = kernfs_info(dentry->d_sb)->ns;
1035
1036 for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
1037 pos;
1038 pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
1039 const char *name = pos->name;
1040 unsigned int type = dt_type(pos);
1041 int len = strlen(name);
1042 ino_t ino = pos->ino;
1043
1044 ctx->pos = pos->hash;
1045 file->private_data = pos;
1046 kernfs_get(pos);
1047
1048 mutex_unlock(&kernfs_mutex);
1049 if (!dir_emit(ctx, name, len, ino, type))
1050 return 0;
1051 mutex_lock(&kernfs_mutex);
1052 }
1053 mutex_unlock(&kernfs_mutex);
1054 file->private_data = NULL;
1055 ctx->pos = INT_MAX;
1056 return 0;
1057}
1058
1059static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
1060 int whence)
1061{
1062 struct inode *inode = file_inode(file);
1063 loff_t ret;
1064
1065 mutex_lock(&inode->i_mutex);
1066 ret = generic_file_llseek(file, offset, whence);
1067 mutex_unlock(&inode->i_mutex);
1068
1069 return ret;
1070}
1071
1072const struct file_operations kernfs_dir_fops = {
1073 .read = generic_read_dir,
1074 .iterate = kernfs_fop_readdir,
1075 .release = kernfs_dir_fop_release,
1076 .llseek = kernfs_dir_fop_llseek,
1077};
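
[Editor's note: kernfs_create_root(), kernfs_create_dir_ns() and kernfs_remove()/kernfs_destroy_root() above form the minimal lifecycle of a hierarchy. A hypothetical consumer (not part of this diff) would use the new API roughly like this:]

	#include <linux/kernfs.h>

	static struct kernfs_root *example_root;

	static int example_init(void)
	{
		struct kernfs_node *dir;

		/* no mkdir/rmdir/rename callbacks: directory syscalls get -EPERM */
		example_root = kernfs_create_root(NULL, NULL);
		if (IS_ERR(example_root))
			return PTR_ERR(example_root);

		dir = kernfs_create_dir_ns(example_root->kn, "example",
					   0755, NULL /* priv */,
					   NULL /* no namespace tag */);
		if (IS_ERR(dir)) {
			kernfs_destroy_root(example_root);
			return PTR_ERR(dir);
		}
		return 0;
	}

	static void example_exit(void)
	{
		/* removes every node under the root, then frees the root */
		kernfs_destroy_root(example_root);
	}
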
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
new file mode 100644
index 000000000000..dbf397bfdff2
--- /dev/null
+++ b/fs/kernfs/file.c
@@ -0,0 +1,867 @@
1/*
2 * fs/kernfs/file.c - kernfs file implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/poll.h>
15#include <linux/pagemap.h>
16#include <linux/sched.h>
17
18#include "kernfs-internal.h"
19
20/*
21 * There's one kernfs_open_file for each open file and one kernfs_open_node
22 * for each kernfs_node with one or more open files.
23 *
24 * kernfs_node->attr.open points to kernfs_open_node. attr.open is
25 * protected by kernfs_open_node_lock.
26 *
27 * filp->private_data points to seq_file whose ->private points to
28 * kernfs_open_file. kernfs_open_files are chained at
29 * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
30 */
31static DEFINE_SPINLOCK(kernfs_open_node_lock);
32static DEFINE_MUTEX(kernfs_open_file_mutex);
33
34struct kernfs_open_node {
35 atomic_t refcnt;
36 atomic_t event;
37 wait_queue_head_t poll;
38 struct list_head files; /* goes through kernfs_open_file.list */
39};
40
41static struct kernfs_open_file *kernfs_of(struct file *file)
42{
43 return ((struct seq_file *)file->private_data)->private;
44}
45
46/*
47 * Determine the kernfs_ops for the given kernfs_node. This function must
48 * be called while holding an active reference.
49 */
50static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
51{
52 if (kn->flags & KERNFS_LOCKDEP)
53 lockdep_assert_held(kn);
54 return kn->attr.ops;
55}
56
57/*
58 * As kernfs_seq_stop() is also called after kernfs_seq_start() or
59 * kernfs_seq_next() failure, it needs to distinguish whether it's stopping
60 * a seq_file iteration which is fully initialized with an active reference
61 * or an aborted kernfs_seq_start() due to get_active failure. The
62 * position pointer is the only context for each seq_file iteration and
63 * thus the stop condition should be encoded in it. As the return value is
64 * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable
65 * choice to indicate get_active failure.
66 *
67 * Unfortunately, this is complicated due to the optional custom seq_file
68 * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop()
69 * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or
70 * custom seq_file operations and thus can't decide whether put_active
71 * should be performed or not only on ERR_PTR(-ENODEV).
72 *
 73 * This is worked around by factoring out the custom seq_stop() and
 74 * put_active part into kernfs_seq_stop_active(): kernfs_seq_stop() skips
 75 * it when handed ERR_PTR(-ENODEV), while start/next invoke it directly
 76 * whenever a custom seq_file operation fails with ERR_PTR(-ENODEV). This
 77 * ensures kernfs_seq_stop_active() is skipped only after get_active failure.
78 */
79static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
80{
81 struct kernfs_open_file *of = sf->private;
82 const struct kernfs_ops *ops = kernfs_ops(of->kn);
83
84 if (ops->seq_stop)
85 ops->seq_stop(sf, v);
86 kernfs_put_active(of->kn);
87}
88
89static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
90{
91 struct kernfs_open_file *of = sf->private;
92 const struct kernfs_ops *ops;
93
94 /*
95 * @of->mutex nests outside active ref and is just to ensure that
96 * the ops aren't called concurrently for the same open file.
97 */
98 mutex_lock(&of->mutex);
99 if (!kernfs_get_active(of->kn))
100 return ERR_PTR(-ENODEV);
101
102 ops = kernfs_ops(of->kn);
103 if (ops->seq_start) {
104 void *next = ops->seq_start(sf, ppos);
105 /* see the comment above kernfs_seq_stop_active() */
106 if (next == ERR_PTR(-ENODEV))
107 kernfs_seq_stop_active(sf, next);
108 return next;
109 } else {
110 /*
111 * The same behavior and code as single_open(). Returns
112 * !NULL if pos is at the beginning; otherwise, NULL.
113 */
114 return NULL + !*ppos;
115 }
116}
117
118static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
119{
120 struct kernfs_open_file *of = sf->private;
121 const struct kernfs_ops *ops = kernfs_ops(of->kn);
122
123 if (ops->seq_next) {
124 void *next = ops->seq_next(sf, v, ppos);
125 /* see the comment above kernfs_seq_stop_active() */
126 if (next == ERR_PTR(-ENODEV))
127 kernfs_seq_stop_active(sf, next);
128 return next;
129 } else {
130 /*
 131 * The same behavior and code as single_open(): always
 132 * terminate after the initial read.
133 */
134 ++*ppos;
135 return NULL;
136 }
137}
138
139static void kernfs_seq_stop(struct seq_file *sf, void *v)
140{
141 struct kernfs_open_file *of = sf->private;
142
143 if (v != ERR_PTR(-ENODEV))
144 kernfs_seq_stop_active(sf, v);
145 mutex_unlock(&of->mutex);
146}
147
148static int kernfs_seq_show(struct seq_file *sf, void *v)
149{
150 struct kernfs_open_file *of = sf->private;
151
152 of->event = atomic_read(&of->kn->attr.open->event);
153
154 return of->kn->attr.ops->seq_show(sf, v);
155}
156
157static const struct seq_operations kernfs_seq_ops = {
158 .start = kernfs_seq_start,
159 .next = kernfs_seq_next,
160 .stop = kernfs_seq_stop,
161 .show = kernfs_seq_show,
162};
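/*
 * Editor's sketch, not part of this diff: a minimal kernfs user that
 * only implements ->seq_show. With seq_start/next/stop left NULL, the
 * defaults above emulate single_open(), so a single-record show
 * callback is all a simple attribute needs. example_show, example_ops
 * and the use of ->priv as a string are hypothetical.
 */
static int example_show(struct seq_file *sf, void *v)
{
	struct kernfs_open_file *of = sf->private;

	/* ->priv is whatever the creator passed at file creation time */
	seq_printf(sf, "%s\n", (const char *)of->kn->priv);
	return 0;
}

static const struct kernfs_ops example_ops = {
	.seq_show	= example_show,
};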
163
164/*
 165 * As reading a bin file can have side-effects, the exact offset and bytes
 166 * specified in the read(2) call should be passed to the read callback,
 167 * making it difficult to use seq_file. Implement simplistic custom
 168 * buffering for bin files.
169 */
170static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
171 char __user *user_buf, size_t count,
172 loff_t *ppos)
173{
174 ssize_t len = min_t(size_t, count, PAGE_SIZE);
175 const struct kernfs_ops *ops;
176 char *buf;
177
178 buf = kmalloc(len, GFP_KERNEL);
179 if (!buf)
180 return -ENOMEM;
181
182 /*
183 * @of->mutex nests outside active ref and is just to ensure that
184 * the ops aren't called concurrently for the same open file.
185 */
186 mutex_lock(&of->mutex);
187 if (!kernfs_get_active(of->kn)) {
188 len = -ENODEV;
189 mutex_unlock(&of->mutex);
190 goto out_free;
191 }
192
193 ops = kernfs_ops(of->kn);
194 if (ops->read)
195 len = ops->read(of, buf, len, *ppos);
196 else
197 len = -EINVAL;
198
199 kernfs_put_active(of->kn);
200 mutex_unlock(&of->mutex);
201
202 if (len < 0)
203 goto out_free;
204
205 if (copy_to_user(user_buf, buf, len)) {
206 len = -EFAULT;
207 goto out_free;
208 }
209
210 *ppos += len;
211
212 out_free:
213 kfree(buf);
214 return len;
215}
216
217/**
218 * kernfs_fop_read - kernfs vfs read callback
219 * @file: file pointer
 220 * @user_buf: buffer to copy the read data into
221 * @count: number of bytes
222 * @ppos: starting offset
223 */
224static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf,
225 size_t count, loff_t *ppos)
226{
227 struct kernfs_open_file *of = kernfs_of(file);
228
229 if (of->kn->flags & KERNFS_HAS_SEQ_SHOW)
230 return seq_read(file, user_buf, count, ppos);
231 else
232 return kernfs_file_direct_read(of, user_buf, count, ppos);
233}
234
235/**
236 * kernfs_fop_write - kernfs vfs write callback
237 * @file: file pointer
238 * @user_buf: data to write
239 * @count: number of bytes
240 * @ppos: starting offset
241 *
242 * Copy data in from userland and pass it to the matching kernfs write
243 * operation.
244 *
 245 * There is no easy way for us to know if userspace is only doing a partial
 246 * write, so we don't support partial writes. We expect the entire buffer
 247 * to come on the first write. Hint: if you're writing a value, first read
 248 * the file, modify only the value you're changing, then write the entire
 249 * buffer back.
250 */
251static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
252 size_t count, loff_t *ppos)
253{
254 struct kernfs_open_file *of = kernfs_of(file);
255 ssize_t len = min_t(size_t, count, PAGE_SIZE);
256 const struct kernfs_ops *ops;
257 char *buf;
258
259 buf = kmalloc(len + 1, GFP_KERNEL);
260 if (!buf)
261 return -ENOMEM;
262
263 if (copy_from_user(buf, user_buf, len)) {
264 len = -EFAULT;
265 goto out_free;
266 }
267 buf[len] = '\0'; /* guarantee string termination */
268
269 /*
270 * @of->mutex nests outside active ref and is just to ensure that
271 * the ops aren't called concurrently for the same open file.
272 */
273 mutex_lock(&of->mutex);
274 if (!kernfs_get_active(of->kn)) {
275 mutex_unlock(&of->mutex);
276 len = -ENODEV;
277 goto out_free;
278 }
279
280 ops = kernfs_ops(of->kn);
281 if (ops->write)
282 len = ops->write(of, buf, len, *ppos);
283 else
284 len = -EINVAL;
285
286 kernfs_put_active(of->kn);
287 mutex_unlock(&of->mutex);
288
289 if (len > 0)
290 *ppos += len;
291out_free:
292 kfree(buf);
293 return len;
294}
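/*
 * Editor's sketch, not part of this diff: the userspace side of the
 * full-buffer protocol described above - format the complete value and
 * submit it in a single write(2). The path and value are illustrative.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int set_attr(const char *path, const char *value)
{
	char buf[64];
	int fd, len;

	len = snprintf(buf, sizeof(buf), "%s\n", value);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	/* the entire buffer must arrive in one write(2) */
	if (write(fd, buf, len) != len) {
		close(fd);
		return -1;
	}
	return close(fd);
}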
295
296static void kernfs_vma_open(struct vm_area_struct *vma)
297{
298 struct file *file = vma->vm_file;
299 struct kernfs_open_file *of = kernfs_of(file);
300
301 if (!of->vm_ops)
302 return;
303
304 if (!kernfs_get_active(of->kn))
305 return;
306
307 if (of->vm_ops->open)
308 of->vm_ops->open(vma);
309
310 kernfs_put_active(of->kn);
311}
312
313static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
314{
315 struct file *file = vma->vm_file;
316 struct kernfs_open_file *of = kernfs_of(file);
317 int ret;
318
319 if (!of->vm_ops)
320 return VM_FAULT_SIGBUS;
321
322 if (!kernfs_get_active(of->kn))
323 return VM_FAULT_SIGBUS;
324
325 ret = VM_FAULT_SIGBUS;
326 if (of->vm_ops->fault)
327 ret = of->vm_ops->fault(vma, vmf);
328
329 kernfs_put_active(of->kn);
330 return ret;
331}
332
333static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma,
334 struct vm_fault *vmf)
335{
336 struct file *file = vma->vm_file;
337 struct kernfs_open_file *of = kernfs_of(file);
338 int ret;
339
340 if (!of->vm_ops)
341 return VM_FAULT_SIGBUS;
342
343 if (!kernfs_get_active(of->kn))
344 return VM_FAULT_SIGBUS;
345
346 ret = 0;
347 if (of->vm_ops->page_mkwrite)
348 ret = of->vm_ops->page_mkwrite(vma, vmf);
349 else
350 file_update_time(file);
351
352 kernfs_put_active(of->kn);
353 return ret;
354}
355
356static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
357 void *buf, int len, int write)
358{
359 struct file *file = vma->vm_file;
360 struct kernfs_open_file *of = kernfs_of(file);
361 int ret;
362
363 if (!of->vm_ops)
364 return -EINVAL;
365
366 if (!kernfs_get_active(of->kn))
367 return -EINVAL;
368
369 ret = -EINVAL;
370 if (of->vm_ops->access)
371 ret = of->vm_ops->access(vma, addr, buf, len, write);
372
373 kernfs_put_active(of->kn);
374 return ret;
375}
376
377#ifdef CONFIG_NUMA
378static int kernfs_vma_set_policy(struct vm_area_struct *vma,
379 struct mempolicy *new)
380{
381 struct file *file = vma->vm_file;
382 struct kernfs_open_file *of = kernfs_of(file);
383 int ret;
384
385 if (!of->vm_ops)
386 return 0;
387
388 if (!kernfs_get_active(of->kn))
389 return -EINVAL;
390
391 ret = 0;
392 if (of->vm_ops->set_policy)
393 ret = of->vm_ops->set_policy(vma, new);
394
395 kernfs_put_active(of->kn);
396 return ret;
397}
398
399static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
400 unsigned long addr)
401{
402 struct file *file = vma->vm_file;
403 struct kernfs_open_file *of = kernfs_of(file);
404 struct mempolicy *pol;
405
406 if (!of->vm_ops)
407 return vma->vm_policy;
408
409 if (!kernfs_get_active(of->kn))
410 return vma->vm_policy;
411
412 pol = vma->vm_policy;
413 if (of->vm_ops->get_policy)
414 pol = of->vm_ops->get_policy(vma, addr);
415
416 kernfs_put_active(of->kn);
417 return pol;
418}
419
420static int kernfs_vma_migrate(struct vm_area_struct *vma,
421 const nodemask_t *from, const nodemask_t *to,
422 unsigned long flags)
423{
424 struct file *file = vma->vm_file;
425 struct kernfs_open_file *of = kernfs_of(file);
426 int ret;
427
428 if (!of->vm_ops)
429 return 0;
430
431 if (!kernfs_get_active(of->kn))
432 return 0;
433
434 ret = 0;
435 if (of->vm_ops->migrate)
436 ret = of->vm_ops->migrate(vma, from, to, flags);
437
438 kernfs_put_active(of->kn);
439 return ret;
440}
441#endif
442
443static const struct vm_operations_struct kernfs_vm_ops = {
444 .open = kernfs_vma_open,
445 .fault = kernfs_vma_fault,
446 .page_mkwrite = kernfs_vma_page_mkwrite,
447 .access = kernfs_vma_access,
448#ifdef CONFIG_NUMA
449 .set_policy = kernfs_vma_set_policy,
450 .get_policy = kernfs_vma_get_policy,
451 .migrate = kernfs_vma_migrate,
452#endif
453};
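/*
 * Editor's note: each wrapper above follows the same shape - bail out
 * with a safe default when the file was never mmapped (!of->vm_ops),
 * take an active reference so the node can't be deactivated mid-call,
 * delegate to the wrapped vm_ops callback if it exists, then drop the
 * reference.
 */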
454
455static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
456{
457 struct kernfs_open_file *of = kernfs_of(file);
458 const struct kernfs_ops *ops;
459 int rc;
460
461 /*
462 * mmap path and of->mutex are prone to triggering spurious lockdep
463 * warnings and we don't want to add spurious locking dependency
464 * between the two. Check whether mmap is actually implemented
465 * without grabbing @of->mutex by testing HAS_MMAP flag. See the
466 * comment in kernfs_file_open() for more details.
467 */
468 if (!(of->kn->flags & KERNFS_HAS_MMAP))
469 return -ENODEV;
470
471 mutex_lock(&of->mutex);
472
473 rc = -ENODEV;
474 if (!kernfs_get_active(of->kn))
475 goto out_unlock;
476
477 ops = kernfs_ops(of->kn);
478 rc = ops->mmap(of, vma);
479
480 /*
481 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
482 * to satisfy versions of X which crash if the mmap fails: that
483 * substitutes a new vm_file, and we don't then want bin_vm_ops.
484 */
485 if (vma->vm_file != file)
486 goto out_put;
487
488 rc = -EINVAL;
489 if (of->mmapped && of->vm_ops != vma->vm_ops)
490 goto out_put;
491
492 /*
493 * It is not possible to successfully wrap close.
494 * So error if someone is trying to use close.
495 */
496 rc = -EINVAL;
497 if (vma->vm_ops && vma->vm_ops->close)
498 goto out_put;
499
500 rc = 0;
501 of->mmapped = 1;
502 of->vm_ops = vma->vm_ops;
503 vma->vm_ops = &kernfs_vm_ops;
504out_put:
505 kernfs_put_active(of->kn);
506out_unlock:
507 mutex_unlock(&of->mutex);
508
509 return rc;
510}
511
512/**
513 * kernfs_get_open_node - get or create kernfs_open_node
514 * @kn: target kernfs_node
515 * @of: kernfs_open_file for this instance of open
516 *
517 * If @kn->attr.open exists, increment its reference count; otherwise,
518 * create one. @of is chained to the files list.
519 *
520 * LOCKING:
521 * Kernel thread context (may sleep).
522 *
523 * RETURNS:
524 * 0 on success, -errno on failure.
525 */
526static int kernfs_get_open_node(struct kernfs_node *kn,
527 struct kernfs_open_file *of)
528{
529 struct kernfs_open_node *on, *new_on = NULL;
530
531 retry:
532 mutex_lock(&kernfs_open_file_mutex);
533 spin_lock_irq(&kernfs_open_node_lock);
534
535 if (!kn->attr.open && new_on) {
536 kn->attr.open = new_on;
537 new_on = NULL;
538 }
539
540 on = kn->attr.open;
541 if (on) {
542 atomic_inc(&on->refcnt);
543 list_add_tail(&of->list, &on->files);
544 }
545
546 spin_unlock_irq(&kernfs_open_node_lock);
547 mutex_unlock(&kernfs_open_file_mutex);
548
549 if (on) {
550 kfree(new_on);
551 return 0;
552 }
553
554 /* not there, initialize a new one and retry */
555 new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
556 if (!new_on)
557 return -ENOMEM;
558
559 atomic_set(&new_on->refcnt, 0);
560 atomic_set(&new_on->event, 1);
561 init_waitqueue_head(&new_on->poll);
562 INIT_LIST_HEAD(&new_on->files);
563 goto retry;
564}
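/*
 * Editor's note: this is the usual optimistic-allocation pattern. The
 * first pass merely observes that @kn has no kernfs_open_node; @new_on
 * is then allocated with GFP_KERNEL outside both locks (the allocation
 * may sleep), and the retry either installs it or, if another opener
 * won the race, releases it through the kfree(new_on) path above.
 */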
565
566/**
567 * kernfs_put_open_node - put kernfs_open_node
 568 * @kn: target kernfs_node
 569 * @of: associated kernfs_open_file
 570 *
 571 * Put @kn->attr.open and unlink @of from the files list. If the
 572 * reference count reaches zero, disassociate and free it.
573 *
574 * LOCKING:
575 * None.
576 */
577static void kernfs_put_open_node(struct kernfs_node *kn,
578 struct kernfs_open_file *of)
579{
580 struct kernfs_open_node *on = kn->attr.open;
581 unsigned long flags;
582
583 mutex_lock(&kernfs_open_file_mutex);
584 spin_lock_irqsave(&kernfs_open_node_lock, flags);
585
586 if (of)
587 list_del(&of->list);
588
589 if (atomic_dec_and_test(&on->refcnt))
590 kn->attr.open = NULL;
591 else
592 on = NULL;
593
594 spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
595 mutex_unlock(&kernfs_open_file_mutex);
596
597 kfree(on);
598}
599
600static int kernfs_fop_open(struct inode *inode, struct file *file)
601{
602 struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
603 const struct kernfs_ops *ops;
604 struct kernfs_open_file *of;
605 bool has_read, has_write, has_mmap;
606 int error = -EACCES;
607
608 if (!kernfs_get_active(kn))
609 return -ENODEV;
610
611 ops = kernfs_ops(kn);
612
613 has_read = ops->seq_show || ops->read || ops->mmap;
614 has_write = ops->write || ops->mmap;
615 has_mmap = ops->mmap;
616
617 /* check perms and supported operations */
618 if ((file->f_mode & FMODE_WRITE) &&
619 (!(inode->i_mode & S_IWUGO) || !has_write))
620 goto err_out;
621
622 if ((file->f_mode & FMODE_READ) &&
623 (!(inode->i_mode & S_IRUGO) || !has_read))
624 goto err_out;
625
626 /* allocate a kernfs_open_file for the file */
627 error = -ENOMEM;
628 of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
629 if (!of)
630 goto err_out;
631
632 /*
633 * The following is done to give a different lockdep key to
634 * @of->mutex for files which implement mmap. This is a rather
635 * crude way to avoid false positive lockdep warning around
636 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
637 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
638 * which mm->mmap_sem nests, while holding @of->mutex. As each
639 * open file has a separate mutex, it's okay as long as those don't
640 * happen on the same file. At this point, we can't easily give
641 * each file a separate locking class. Let's differentiate on
642 * whether the file has mmap or not for now.
643 *
644 * Both paths of the branch look the same. They're supposed to
645 * look that way and give @of->mutex different static lockdep keys.
646 */
647 if (has_mmap)
648 mutex_init(&of->mutex);
649 else
650 mutex_init(&of->mutex);
651
652 of->kn = kn;
653 of->file = file;
654
655 /*
656 * Always instantiate seq_file even if read access doesn't use
 657 * seq_file or is not requested. This unifies private data access,
 658 * and readable regular files are the vast majority anyway.
659 */
660 if (ops->seq_show)
661 error = seq_open(file, &kernfs_seq_ops);
662 else
663 error = seq_open(file, NULL);
664 if (error)
665 goto err_free;
666
667 ((struct seq_file *)file->private_data)->private = of;
668
669 /* seq_file clears PWRITE unconditionally, restore it if WRITE */
670 if (file->f_mode & FMODE_WRITE)
671 file->f_mode |= FMODE_PWRITE;
672
673 /* make sure we have open node struct */
674 error = kernfs_get_open_node(kn, of);
675 if (error)
676 goto err_close;
677
678 /* open succeeded, put active references */
679 kernfs_put_active(kn);
680 return 0;
681
682err_close:
683 seq_release(inode, file);
684err_free:
685 kfree(of);
686err_out:
687 kernfs_put_active(kn);
688 return error;
689}
690
691static int kernfs_fop_release(struct inode *inode, struct file *filp)
692{
693 struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
694 struct kernfs_open_file *of = kernfs_of(filp);
695
696 kernfs_put_open_node(kn, of);
697 seq_release(inode, filp);
698 kfree(of);
699
700 return 0;
701}
702
703void kernfs_unmap_bin_file(struct kernfs_node *kn)
704{
705 struct kernfs_open_node *on;
706 struct kernfs_open_file *of;
707
708 if (!(kn->flags & KERNFS_HAS_MMAP))
709 return;
710
711 spin_lock_irq(&kernfs_open_node_lock);
712 on = kn->attr.open;
713 if (on)
714 atomic_inc(&on->refcnt);
715 spin_unlock_irq(&kernfs_open_node_lock);
716 if (!on)
717 return;
718
719 mutex_lock(&kernfs_open_file_mutex);
720 list_for_each_entry(of, &on->files, list) {
721 struct inode *inode = file_inode(of->file);
722 unmap_mapping_range(inode->i_mapping, 0, 0, 1);
723 }
724 mutex_unlock(&kernfs_open_file_mutex);
725
726 kernfs_put_open_node(kn, NULL);
727}
728
729/*
730 * Kernfs attribute files are pollable. The idea is that you read
731 * the content and then you use 'poll' or 'select' to wait for
732 * the content to change. When the content changes (assuming the
733 * manager for the kobject supports notification), poll will
734 * return POLLERR|POLLPRI, and select will return the fd whether
735 * it is waiting for read, write, or exceptions.
736 * Once poll/select indicates that the value has changed, you
737 * need to close and re-open the file, or seek to 0 and read again.
738 * Reminder: this only works for attributes which actively support
739 * it, and it is not possible to test an attribute from userspace
 740 * to see if it supports poll (neither 'poll' nor 'select' returns
 741 * an appropriate error code). When in doubt, set a suitable timeout value.
742 */
743static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
744{
745 struct kernfs_open_file *of = kernfs_of(filp);
746 struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
747 struct kernfs_open_node *on = kn->attr.open;
748
 749	/* grab an active reference so the node can't go away under us */
750 if (!kernfs_get_active(kn))
751 goto trigger;
752
753 poll_wait(filp, &on->poll, wait);
754
755 kernfs_put_active(kn);
756
757 if (of->event != atomic_read(&on->event))
758 goto trigger;
759
760 return DEFAULT_POLLMASK;
761
762 trigger:
763 return DEFAULT_POLLMASK|POLLERR|POLLPRI;
764}
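/*
 * Editor's sketch, not part of this diff: the userspace loop implied by
 * the comment above kernfs_fop_poll() - read the attribute, poll until
 * it is notified, then seek back to zero and read it again. The path
 * handling is illustrative only.
 */
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

static void watch_attr(const char *path)
{
	char buf[4096];
	struct pollfd pfd = { .events = POLLERR | POLLPRI };

	pfd.fd = open(path, O_RDONLY);
	if (pfd.fd < 0)
		return;
	for (;;) {
		read(pfd.fd, buf, sizeof(buf));	/* consume current value */
		poll(&pfd, 1, -1);		/* sleeps until kernfs_notify() */
		lseek(pfd.fd, 0, SEEK_SET);	/* rewind for the re-read */
	}
}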
765
766/**
767 * kernfs_notify - notify a kernfs file
768 * @kn: file to notify
769 *
770 * Notify @kn such that poll(2) on @kn wakes up.
771 */
772void kernfs_notify(struct kernfs_node *kn)
773{
774 struct kernfs_open_node *on;
775 unsigned long flags;
776
777 spin_lock_irqsave(&kernfs_open_node_lock, flags);
778
779 if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) {
780 on = kn->attr.open;
781 if (on) {
782 atomic_inc(&on->event);
783 wake_up_interruptible(&on->poll);
784 }
785 }
786
787 spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
788}
789EXPORT_SYMBOL_GPL(kernfs_notify);
790
791const struct file_operations kernfs_file_fops = {
792 .read = kernfs_fop_read,
793 .write = kernfs_fop_write,
794 .llseek = generic_file_llseek,
795 .mmap = kernfs_fop_mmap,
796 .open = kernfs_fop_open,
797 .release = kernfs_fop_release,
798 .poll = kernfs_fop_poll,
799};
800
801/**
802 * __kernfs_create_file - kernfs internal function to create a file
803 * @parent: directory to create the file in
804 * @name: name of the file
805 * @mode: mode of the file
806 * @size: size of the file
807 * @ops: kernfs operations for the file
808 * @priv: private data for the file
809 * @ns: optional namespace tag of the file
 810 * @name_is_static: don't copy the file name; use the caller's copy
811 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
812 *
813 * Returns the created node on success, ERR_PTR() value on error.
814 */
815struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
816 const char *name,
817 umode_t mode, loff_t size,
818 const struct kernfs_ops *ops,
819 void *priv, const void *ns,
820 bool name_is_static,
821 struct lock_class_key *key)
822{
823 struct kernfs_addrm_cxt acxt;
824 struct kernfs_node *kn;
825 unsigned flags;
826 int rc;
827
828 flags = KERNFS_FILE;
829 if (name_is_static)
830 flags |= KERNFS_STATIC_NAME;
831
832 kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
833 if (!kn)
834 return ERR_PTR(-ENOMEM);
835
836 kn->attr.ops = ops;
837 kn->attr.size = size;
838 kn->ns = ns;
839 kn->priv = priv;
840
841#ifdef CONFIG_DEBUG_LOCK_ALLOC
842 if (key) {
843 lockdep_init_map(&kn->dep_map, "s_active", key, 0);
844 kn->flags |= KERNFS_LOCKDEP;
845 }
846#endif
847
848 /*
 849 * kn->attr.ops is accessible only while holding an active ref. We
850 * need to know whether some ops are implemented outside active
851 * ref. Cache their existence in flags.
852 */
853 if (ops->seq_show)
854 kn->flags |= KERNFS_HAS_SEQ_SHOW;
855 if (ops->mmap)
856 kn->flags |= KERNFS_HAS_MMAP;
857
858 kernfs_addrm_start(&acxt);
859 rc = kernfs_add_one(&acxt, kn);
860 kernfs_addrm_finish(&acxt);
861
862 if (rc) {
863 kernfs_put(kn);
864 return ERR_PTR(rc);
865 }
866 return kn;
867}
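/*
 * Editor's sketch, not part of this diff: a caller creating a read-only
 * file backed by the hypothetical example_ops shown earlier. The
 * kernfs_create_file() wrapper assumed here is one of the helpers
 * <linux/kernfs.h> layers on top of __kernfs_create_file(); it supplies
 * the lockdep key and a NULL namespace tag on the caller's behalf.
 */
static int example_add_file(struct kernfs_node *parent, void *priv)
{
	struct kernfs_node *kn;

	kn = kernfs_create_file(parent, "example", 0444, 0,
				&example_ops, priv);
	return PTR_ERR_OR_ZERO(kn);
}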
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
new file mode 100644
index 000000000000..e55126f85bd2
--- /dev/null
+++ b/fs/kernfs/inode.c
@@ -0,0 +1,377 @@
1/*
2 * fs/kernfs/inode.c - kernfs inode implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/pagemap.h>
12#include <linux/backing-dev.h>
13#include <linux/capability.h>
14#include <linux/errno.h>
15#include <linux/slab.h>
16#include <linux/xattr.h>
17#include <linux/security.h>
18
19#include "kernfs-internal.h"
20
21static const struct address_space_operations kernfs_aops = {
22 .readpage = simple_readpage,
23 .write_begin = simple_write_begin,
24 .write_end = simple_write_end,
25};
26
27static struct backing_dev_info kernfs_bdi = {
28 .name = "kernfs",
29 .ra_pages = 0, /* No readahead */
30 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
31};
32
33static const struct inode_operations kernfs_iops = {
34 .permission = kernfs_iop_permission,
35 .setattr = kernfs_iop_setattr,
36 .getattr = kernfs_iop_getattr,
37 .setxattr = kernfs_iop_setxattr,
38 .removexattr = kernfs_iop_removexattr,
39 .getxattr = kernfs_iop_getxattr,
40 .listxattr = kernfs_iop_listxattr,
41};
42
43void __init kernfs_inode_init(void)
44{
45 if (bdi_init(&kernfs_bdi))
46 panic("failed to init kernfs_bdi");
47}
48
49static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
50{
51 struct iattr *iattrs;
52
53 if (kn->iattr)
54 return kn->iattr;
55
56 kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
57 if (!kn->iattr)
58 return NULL;
59 iattrs = &kn->iattr->ia_iattr;
60
61 /* assign default attributes */
62 iattrs->ia_mode = kn->mode;
63 iattrs->ia_uid = GLOBAL_ROOT_UID;
64 iattrs->ia_gid = GLOBAL_ROOT_GID;
65 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
66
67 simple_xattrs_init(&kn->iattr->xattrs);
68
69 return kn->iattr;
70}
71
72static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
73{
74 struct kernfs_iattrs *attrs;
75 struct iattr *iattrs;
76 unsigned int ia_valid = iattr->ia_valid;
77
78 attrs = kernfs_iattrs(kn);
79 if (!attrs)
80 return -ENOMEM;
81
82 iattrs = &attrs->ia_iattr;
83
84 if (ia_valid & ATTR_UID)
85 iattrs->ia_uid = iattr->ia_uid;
86 if (ia_valid & ATTR_GID)
87 iattrs->ia_gid = iattr->ia_gid;
88 if (ia_valid & ATTR_ATIME)
89 iattrs->ia_atime = iattr->ia_atime;
90 if (ia_valid & ATTR_MTIME)
91 iattrs->ia_mtime = iattr->ia_mtime;
92 if (ia_valid & ATTR_CTIME)
93 iattrs->ia_ctime = iattr->ia_ctime;
94 if (ia_valid & ATTR_MODE) {
95 umode_t mode = iattr->ia_mode;
96 iattrs->ia_mode = kn->mode = mode;
97 }
98 return 0;
99}
100
101/**
102 * kernfs_setattr - set iattr on a node
103 * @kn: target node
104 * @iattr: iattr to set
105 *
106 * Returns 0 on success, -errno on failure.
107 */
108int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
109{
110 int ret;
111
112 mutex_lock(&kernfs_mutex);
113 ret = __kernfs_setattr(kn, iattr);
114 mutex_unlock(&kernfs_mutex);
115 return ret;
116}
117
118int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
119{
120 struct inode *inode = dentry->d_inode;
121 struct kernfs_node *kn = dentry->d_fsdata;
122 int error;
123
124 if (!kn)
125 return -EINVAL;
126
127 mutex_lock(&kernfs_mutex);
128 error = inode_change_ok(inode, iattr);
129 if (error)
130 goto out;
131
132 error = __kernfs_setattr(kn, iattr);
133 if (error)
134 goto out;
135
136 /* this ignores size changes */
137 setattr_copy(inode, iattr);
138
139out:
140 mutex_unlock(&kernfs_mutex);
141 return error;
142}
143
144static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
145 u32 *secdata_len)
146{
147 struct kernfs_iattrs *attrs;
148 void *old_secdata;
149 size_t old_secdata_len;
150
151 attrs = kernfs_iattrs(kn);
152 if (!attrs)
153 return -ENOMEM;
154
155 old_secdata = attrs->ia_secdata;
156 old_secdata_len = attrs->ia_secdata_len;
157
158 attrs->ia_secdata = *secdata;
159 attrs->ia_secdata_len = *secdata_len;
160
161 *secdata = old_secdata;
162 *secdata_len = old_secdata_len;
163 return 0;
164}
165
166int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
167 const void *value, size_t size, int flags)
168{
169 struct kernfs_node *kn = dentry->d_fsdata;
170 struct kernfs_iattrs *attrs;
171 void *secdata;
172 int error;
173 u32 secdata_len = 0;
174
175 attrs = kernfs_iattrs(kn);
176 if (!attrs)
177 return -ENOMEM;
178
179 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
180 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
181 error = security_inode_setsecurity(dentry->d_inode, suffix,
182 value, size, flags);
183 if (error)
184 return error;
185 error = security_inode_getsecctx(dentry->d_inode,
186 &secdata, &secdata_len);
187 if (error)
188 return error;
189
190 mutex_lock(&kernfs_mutex);
191 error = kernfs_node_setsecdata(kn, &secdata, &secdata_len);
192 mutex_unlock(&kernfs_mutex);
193
194 if (secdata)
195 security_release_secctx(secdata, secdata_len);
196 return error;
197 } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
198 return simple_xattr_set(&attrs->xattrs, name, value, size,
199 flags);
200 }
201
202 return -EINVAL;
203}
204
205int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
206{
207 struct kernfs_node *kn = dentry->d_fsdata;
208 struct kernfs_iattrs *attrs;
209
210 attrs = kernfs_iattrs(kn);
211 if (!attrs)
212 return -ENOMEM;
213
214 return simple_xattr_remove(&attrs->xattrs, name);
215}
216
217ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
218 size_t size)
219{
220 struct kernfs_node *kn = dentry->d_fsdata;
221 struct kernfs_iattrs *attrs;
222
223 attrs = kernfs_iattrs(kn);
224 if (!attrs)
225 return -ENOMEM;
226
227 return simple_xattr_get(&attrs->xattrs, name, buf, size);
228}
229
230ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
231{
232 struct kernfs_node *kn = dentry->d_fsdata;
233 struct kernfs_iattrs *attrs;
234
235 attrs = kernfs_iattrs(kn);
236 if (!attrs)
237 return -ENOMEM;
238
239 return simple_xattr_list(&attrs->xattrs, buf, size);
240}
241
242static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
243{
244 inode->i_mode = mode;
245 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
246}
247
248static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
249{
250 inode->i_uid = iattr->ia_uid;
251 inode->i_gid = iattr->ia_gid;
252 inode->i_atime = iattr->ia_atime;
253 inode->i_mtime = iattr->ia_mtime;
254 inode->i_ctime = iattr->ia_ctime;
255}
256
257static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
258{
259 struct kernfs_iattrs *attrs = kn->iattr;
260
261 inode->i_mode = kn->mode;
262 if (attrs) {
263 /*
 264 * kernfs_node has non-default attributes; get them from the
 265 * persistent copy in kernfs_node.
266 */
267 set_inode_attr(inode, &attrs->ia_iattr);
268 security_inode_notifysecctx(inode, attrs->ia_secdata,
269 attrs->ia_secdata_len);
270 }
271
272 if (kernfs_type(kn) == KERNFS_DIR)
 273		set_nlink(inode, kn->dir.subdirs + 2); /* subdirs, ".", and the entry in the parent */
274}
275
276int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
277 struct kstat *stat)
278{
279 struct kernfs_node *kn = dentry->d_fsdata;
280 struct inode *inode = dentry->d_inode;
281
282 mutex_lock(&kernfs_mutex);
283 kernfs_refresh_inode(kn, inode);
284 mutex_unlock(&kernfs_mutex);
285
286 generic_fillattr(inode, stat);
287 return 0;
288}
289
290static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
291{
292 kernfs_get(kn);
293 inode->i_private = kn;
294 inode->i_mapping->a_ops = &kernfs_aops;
295 inode->i_mapping->backing_dev_info = &kernfs_bdi;
296 inode->i_op = &kernfs_iops;
297
298 set_default_inode_attr(inode, kn->mode);
299 kernfs_refresh_inode(kn, inode);
300
301 /* initialize inode according to type */
302 switch (kernfs_type(kn)) {
303 case KERNFS_DIR:
304 inode->i_op = &kernfs_dir_iops;
305 inode->i_fop = &kernfs_dir_fops;
306 break;
307 case KERNFS_FILE:
308 inode->i_size = kn->attr.size;
309 inode->i_fop = &kernfs_file_fops;
310 break;
311 case KERNFS_LINK:
312 inode->i_op = &kernfs_symlink_iops;
313 break;
314 default:
315 BUG();
316 }
317
318 unlock_new_inode(inode);
319}
320
321/**
322 * kernfs_get_inode - get inode for kernfs_node
323 * @sb: super block
324 * @kn: kernfs_node to allocate inode for
325 *
 326 * Get the inode for @kn. If no such inode exists, a new one is
 327 * allocated and its basics are initialized. A new inode is
 328 * returned locked.
329 *
330 * LOCKING:
331 * Kernel thread context (may sleep).
332 *
333 * RETURNS:
334 * Pointer to allocated inode on success, NULL on failure.
335 */
336struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
337{
338 struct inode *inode;
339
340 inode = iget_locked(sb, kn->ino);
341 if (inode && (inode->i_state & I_NEW))
342 kernfs_init_inode(kn, inode);
343
344 return inode;
345}
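/*
 * Editor's note: iget_locked() returns either the cached inode or a
 * freshly allocated one with I_NEW set; only the latter needs
 * kernfs_init_inode(), which also clears the new state via
 * unlock_new_inode() once initialization is complete.
 */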
346
347/*
348 * The kernfs_node serves as both an inode and a directory entry for
349 * kernfs. To prevent the kernfs inode numbers from being freed
350 * prematurely we take a reference to kernfs_node from the kernfs inode. A
351 * super_operations.evict_inode() implementation is needed to drop that
352 * reference upon inode destruction.
353 */
354void kernfs_evict_inode(struct inode *inode)
355{
356 struct kernfs_node *kn = inode->i_private;
357
358 truncate_inode_pages(&inode->i_data, 0);
359 clear_inode(inode);
360 kernfs_put(kn);
361}
362
363int kernfs_iop_permission(struct inode *inode, int mask)
364{
365 struct kernfs_node *kn;
366
367 if (mask & MAY_NOT_BLOCK)
368 return -ECHILD;
369
370 kn = inode->i_private;
371
372 mutex_lock(&kernfs_mutex);
373 kernfs_refresh_inode(kn, inode);
374 mutex_unlock(&kernfs_mutex);
375
376 return generic_permission(inode, mask);
377}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
new file mode 100644
index 000000000000..eb536b76374a
--- /dev/null
+++ b/fs/kernfs/kernfs-internal.h
@@ -0,0 +1,122 @@
1/*
2 * fs/kernfs/kernfs-internal.h - kernfs internal header file
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#ifndef __KERNFS_INTERNAL_H
12#define __KERNFS_INTERNAL_H
13
14#include <linux/lockdep.h>
15#include <linux/fs.h>
16#include <linux/mutex.h>
17#include <linux/xattr.h>
18
19#include <linux/kernfs.h>
20
21struct kernfs_iattrs {
22 struct iattr ia_iattr;
23 void *ia_secdata;
24 u32 ia_secdata_len;
25
26 struct simple_xattrs xattrs;
27};
28
29#define KN_DEACTIVATED_BIAS INT_MIN
30
31/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
32
33/**
34 * kernfs_root - find out the kernfs_root a kernfs_node belongs to
35 * @kn: kernfs_node of interest
36 *
37 * Return the kernfs_root @kn belongs to.
38 */
39static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
40{
 41	/* if parent exists, it's always a dir; otherwise, @kn is a dir */
42 if (kn->parent)
43 kn = kn->parent;
44 return kn->dir.root;
45}
46
47/*
48 * Context structure to be used while adding/removing nodes.
49 */
50struct kernfs_addrm_cxt {
51 struct kernfs_node *removed;
52};
53
54/*
55 * mount.c
56 */
57struct kernfs_super_info {
58 /*
59 * The root associated with this super_block. Each super_block is
60 * identified by the root and ns it's associated with.
61 */
62 struct kernfs_root *root;
63
64 /*
65 * Each sb is associated with one namespace tag, currently the
66 * network namespace of the task which mounted this kernfs
67 * instance. If multiple tags become necessary, make the following
68 * an array and compare kernfs_node tag against every entry.
69 */
70 const void *ns;
71};
72#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
73
74extern struct kmem_cache *kernfs_node_cache;
75
76/*
77 * inode.c
78 */
79struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
80void kernfs_evict_inode(struct inode *inode);
81int kernfs_iop_permission(struct inode *inode, int mask);
82int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
83int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
84 struct kstat *stat);
85int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
86 size_t size, int flags);
87int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
88ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
89 size_t size);
90ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
91void kernfs_inode_init(void);
92
93/*
94 * dir.c
95 */
96extern struct mutex kernfs_mutex;
97extern const struct dentry_operations kernfs_dops;
98extern const struct file_operations kernfs_dir_fops;
99extern const struct inode_operations kernfs_dir_iops;
100
101struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
102void kernfs_put_active(struct kernfs_node *kn);
103void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt);
104int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn);
105void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt);
106struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
107 const char *name, umode_t mode,
108 unsigned flags);
109
110/*
111 * file.c
112 */
113extern const struct file_operations kernfs_file_fops;
114
115void kernfs_unmap_bin_file(struct kernfs_node *kn);
116
117/*
118 * symlink.c
119 */
120extern const struct inode_operations kernfs_symlink_iops;
121
122#endif /* __KERNFS_INTERNAL_H */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
new file mode 100644
index 000000000000..0f4152defe7b
--- /dev/null
+++ b/fs/kernfs/mount.c
@@ -0,0 +1,171 @@
1/*
2 * fs/kernfs/mount.c - kernfs mount implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/mount.h>
13#include <linux/init.h>
14#include <linux/magic.h>
15#include <linux/slab.h>
16#include <linux/pagemap.h>
17
18#include "kernfs-internal.h"
19
20struct kmem_cache *kernfs_node_cache;
21
22static const struct super_operations kernfs_sops = {
23 .statfs = simple_statfs,
24 .drop_inode = generic_delete_inode,
25 .evict_inode = kernfs_evict_inode,
26};
27
28static int kernfs_fill_super(struct super_block *sb)
29{
30 struct kernfs_super_info *info = kernfs_info(sb);
31 struct inode *inode;
32 struct dentry *root;
33
34 sb->s_blocksize = PAGE_CACHE_SIZE;
35 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
36 sb->s_magic = SYSFS_MAGIC;
37 sb->s_op = &kernfs_sops;
38 sb->s_time_gran = 1;
39
40 /* get root inode, initialize and unlock it */
41 mutex_lock(&kernfs_mutex);
42 inode = kernfs_get_inode(sb, info->root->kn);
43 mutex_unlock(&kernfs_mutex);
44 if (!inode) {
45 pr_debug("kernfs: could not get root inode\n");
46 return -ENOMEM;
47 }
48
49 /* instantiate and link root dentry */
50 root = d_make_root(inode);
51 if (!root) {
52 pr_debug("%s: could not get root dentry!\n", __func__);
53 return -ENOMEM;
54 }
55 kernfs_get(info->root->kn);
56 root->d_fsdata = info->root->kn;
57 sb->s_root = root;
58 sb->s_d_op = &kernfs_dops;
59 return 0;
60}
61
62static int kernfs_test_super(struct super_block *sb, void *data)
63{
64 struct kernfs_super_info *sb_info = kernfs_info(sb);
65 struct kernfs_super_info *info = data;
66
67 return sb_info->root == info->root && sb_info->ns == info->ns;
68}
69
70static int kernfs_set_super(struct super_block *sb, void *data)
71{
72 int error;
73 error = set_anon_super(sb, data);
74 if (!error)
75 sb->s_fs_info = data;
76 return error;
77}
78
79/**
80 * kernfs_super_ns - determine the namespace tag of a kernfs super_block
81 * @sb: super_block of interest
82 *
83 * Return the namespace tag associated with kernfs super_block @sb.
84 */
85const void *kernfs_super_ns(struct super_block *sb)
86{
87 struct kernfs_super_info *info = kernfs_info(sb);
88
89 return info->ns;
90}
91
92/**
93 * kernfs_mount_ns - kernfs mount helper
94 * @fs_type: file_system_type of the fs being mounted
95 * @flags: mount flags specified for the mount
96 * @root: kernfs_root of the hierarchy being mounted
97 * @new_sb_created: tell the caller if we allocated a new superblock
98 * @ns: optional namespace tag of the mount
99 *
100 * This is to be called from each kernfs user's file_system_type->mount()
101 * implementation, which should pass through the specified @fs_type and
102 * @flags, and specify the hierarchy and namespace tag to mount via @root
103 * and @ns, respectively.
104 *
105 * The return value can be passed to the vfs layer verbatim.
106 */
107struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
108 struct kernfs_root *root, bool *new_sb_created,
109 const void *ns)
110{
111 struct super_block *sb;
112 struct kernfs_super_info *info;
113 int error;
114
115 info = kzalloc(sizeof(*info), GFP_KERNEL);
116 if (!info)
117 return ERR_PTR(-ENOMEM);
118
119 info->root = root;
120 info->ns = ns;
121
122 sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
123 if (IS_ERR(sb) || sb->s_fs_info != info)
124 kfree(info);
125 if (IS_ERR(sb))
126 return ERR_CAST(sb);
127
128 if (new_sb_created)
129 *new_sb_created = !sb->s_root;
130
131 if (!sb->s_root) {
132 error = kernfs_fill_super(sb);
133 if (error) {
134 deactivate_locked_super(sb);
135 return ERR_PTR(error);
136 }
137 sb->s_flags |= MS_ACTIVE;
138 }
139
140 return dget(sb->s_root);
141}
142
143/**
144 * kernfs_kill_sb - kill_sb for kernfs
145 * @sb: super_block being killed
146 *
147 * This can be used directly for file_system_type->kill_sb(). If a kernfs
148 * user needs extra cleanup, it can implement its own kill_sb() and call
149 * this function at the end.
150 */
151void kernfs_kill_sb(struct super_block *sb)
152{
153 struct kernfs_super_info *info = kernfs_info(sb);
154 struct kernfs_node *root_kn = sb->s_root->d_fsdata;
155
156 /*
157 * Remove the superblock from fs_supers/s_instances
158 * so we can't find it, before freeing kernfs_super_info.
159 */
160 kill_anon_super(sb);
161 kfree(info);
162 kernfs_put(root_kn);
163}
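/*
 * Editor's sketch, not part of this diff: a minimal kernfs user wiring
 * the two helpers above into a file_system_type, loosely modeled on how
 * sysfs uses them. example_root and "examplefs" are hypothetical.
 */
static struct kernfs_root *example_root;

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	return kernfs_mount_ns(fs_type, flags, example_root, NULL, NULL);
}

static struct file_system_type examplefs_fs_type = {
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kernfs_kill_sb,
};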
164
165void __init kernfs_init(void)
166{
167 kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
168 sizeof(struct kernfs_node),
169 0, SLAB_PANIC, NULL);
170 kernfs_inode_init();
171}
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
new file mode 100644
index 000000000000..4d457055acb9
--- /dev/null
+++ b/fs/kernfs/symlink.c
@@ -0,0 +1,151 @@
1/*
2 * fs/kernfs/symlink.c - kernfs symlink implementation
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/fs.h>
12#include <linux/gfp.h>
13#include <linux/namei.h>
14
15#include "kernfs-internal.h"
16
17/**
18 * kernfs_create_link - create a symlink
19 * @parent: directory to create the symlink in
20 * @name: name of the symlink
21 * @target: target node for the symlink to point to
22 *
23 * Returns the created node on success, ERR_PTR() value on error.
24 */
25struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
26 const char *name,
27 struct kernfs_node *target)
28{
29 struct kernfs_node *kn;
30 struct kernfs_addrm_cxt acxt;
31 int error;
32
33 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
34 if (!kn)
35 return ERR_PTR(-ENOMEM);
36
37 if (kernfs_ns_enabled(parent))
38 kn->ns = target->ns;
39 kn->symlink.target_kn = target;
40 kernfs_get(target); /* ref owned by symlink */
41
42 kernfs_addrm_start(&acxt);
43 error = kernfs_add_one(&acxt, kn);
44 kernfs_addrm_finish(&acxt);
45
46 if (!error)
47 return kn;
48
49 kernfs_put(kn);
50 return ERR_PTR(error);
51}
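/*
 * Editor's sketch, not part of this diff: a caller creating the symlink
 * "origin" under @dir pointing at an existing node @target. The names
 * are hypothetical.
 */
static int example_add_link(struct kernfs_node *dir,
			    struct kernfs_node *target)
{
	struct kernfs_node *kn;

	kn = kernfs_create_link(dir, "origin", target);
	return PTR_ERR_OR_ZERO(kn);
}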
52
53static int kernfs_get_target_path(struct kernfs_node *parent,
54 struct kernfs_node *target, char *path)
55{
56 struct kernfs_node *base, *kn;
57 char *s = path;
58 int len = 0;
59
60 /* go up to the root, stop at the base */
61 base = parent;
62 while (base->parent) {
63 kn = target->parent;
64 while (kn->parent && base != kn)
65 kn = kn->parent;
66
67 if (base == kn)
68 break;
69
70 strcpy(s, "../");
71 s += 3;
72 base = base->parent;
73 }
74
75 /* determine end of target string for reverse fillup */
76 kn = target;
77 while (kn->parent && kn != base) {
78 len += strlen(kn->name) + 1;
79 kn = kn->parent;
80 }
81
82 /* check limits */
83 if (len < 2)
84 return -EINVAL;
85 len--;
86 if ((s - path) + len > PATH_MAX)
87 return -ENAMETOOLONG;
88
89 /* reverse fillup of target string from target to base */
90 kn = target;
91 while (kn->parent && kn != base) {
92 int slen = strlen(kn->name);
93
94 len -= slen;
95 strncpy(s + len, kn->name, slen);
96 if (len)
97 s[--len] = '/';
98
99 kn = kn->parent;
100 }
101
102 return 0;
103}
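/*
 * Editor's note, a worked example of the computation above: with parent
 * /foo/bar and target /foo/baz/qux, the first loop ascends from bar
 * until it finds the common ancestor foo, emitting one "../"; the
 * reverse fillup then writes "baz/qux", yielding "../baz/qux".
 */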
104
105static int kernfs_getlink(struct dentry *dentry, char *path)
106{
107 struct kernfs_node *kn = dentry->d_fsdata;
108 struct kernfs_node *parent = kn->parent;
109 struct kernfs_node *target = kn->symlink.target_kn;
110 int error;
111
112 mutex_lock(&kernfs_mutex);
113 error = kernfs_get_target_path(parent, target, path);
114 mutex_unlock(&kernfs_mutex);
115
116 return error;
117}
118
119static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd)
120{
121 int error = -ENOMEM;
122 unsigned long page = get_zeroed_page(GFP_KERNEL);
123 if (page) {
124 error = kernfs_getlink(dentry, (char *) page);
125 if (error < 0)
126 free_page((unsigned long)page);
127 }
128 nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
129 return NULL;
130}
131
132static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd,
133 void *cookie)
134{
135 char *page = nd_get_link(nd);
136 if (!IS_ERR(page))
137 free_page((unsigned long)page);
138}
139
140const struct inode_operations kernfs_symlink_iops = {
141 .setxattr = kernfs_iop_setxattr,
142 .removexattr = kernfs_iop_removexattr,
143 .getxattr = kernfs_iop_getxattr,
144 .listxattr = kernfs_iop_listxattr,
145 .readlink = generic_readlink,
146 .follow_link = kernfs_iop_follow_link,
147 .put_link = kernfs_iop_put_link,
148 .setattr = kernfs_iop_setattr,
149 .getattr = kernfs_iop_getattr,
150 .permission = kernfs_iop_permission,
151};
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e066a3902973..ab798a88ec1d 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -779,6 +779,7 @@ nlmsvc_grant_blocked(struct nlm_block *block)
779 struct nlm_file *file = block->b_file; 779 struct nlm_file *file = block->b_file;
780 struct nlm_lock *lock = &block->b_call->a_args.lock; 780 struct nlm_lock *lock = &block->b_call->a_args.lock;
781 int error; 781 int error;
782 loff_t fl_start, fl_end;
782 783
783 dprintk("lockd: grant blocked lock %p\n", block); 784 dprintk("lockd: grant blocked lock %p\n", block);
784 785
@@ -796,9 +797,16 @@ nlmsvc_grant_blocked(struct nlm_block *block)
796 } 797 }
797 798
798 /* Try the lock operation again */ 799 /* Try the lock operation again */
800 /* vfs_lock_file() can mangle fl_start and fl_end, but we need
801 * them unchanged for the GRANT_MSG
802 */
799 lock->fl.fl_flags |= FL_SLEEP; 803 lock->fl.fl_flags |= FL_SLEEP;
804 fl_start = lock->fl.fl_start;
805 fl_end = lock->fl.fl_end;
800 error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); 806 error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
801 lock->fl.fl_flags &= ~FL_SLEEP; 807 lock->fl.fl_flags &= ~FL_SLEEP;
808 lock->fl.fl_start = fl_start;
809 lock->fl.fl_end = fl_end;
802 810
803 switch (error) { 811 switch (error) {
804 case 0: 812 case 0:
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 0f95f0d0b313..76279e11982d 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -26,9 +26,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
26 bio_vec.bv_len = PAGE_SIZE; 26 bio_vec.bv_len = PAGE_SIZE;
27 bio_vec.bv_offset = 0; 27 bio_vec.bv_offset = 0;
28 bio.bi_vcnt = 1; 28 bio.bi_vcnt = 1;
29 bio.bi_size = PAGE_SIZE;
30 bio.bi_bdev = bdev; 29 bio.bi_bdev = bdev;
31 bio.bi_sector = page->index * (PAGE_SIZE >> 9); 30 bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9);
31 bio.bi_iter.bi_size = PAGE_SIZE;
32 32
33 return submit_bio_wait(rw, &bio); 33 return submit_bio_wait(rw, &bio);
34} 34}
@@ -56,22 +56,18 @@ static DECLARE_WAIT_QUEUE_HEAD(wq);
56static void writeseg_end_io(struct bio *bio, int err) 56static void writeseg_end_io(struct bio *bio, int err)
57{ 57{
58 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 58 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
59 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 59 struct bio_vec *bvec;
60 int i;
60 struct super_block *sb = bio->bi_private; 61 struct super_block *sb = bio->bi_private;
61 struct logfs_super *super = logfs_super(sb); 62 struct logfs_super *super = logfs_super(sb);
62 struct page *page;
63 63
64 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ 64 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
65 BUG_ON(err); 65 BUG_ON(err);
66 BUG_ON(bio->bi_vcnt == 0); 66
67 do { 67 bio_for_each_segment_all(bvec, bio, i) {
68 page = bvec->bv_page; 68 end_page_writeback(bvec->bv_page);
69 if (--bvec >= bio->bi_io_vec) 69 page_cache_release(bvec->bv_page);
70 prefetchw(&bvec->bv_page->flags); 70 }
71
72 end_page_writeback(page);
73 page_cache_release(page);
74 } while (bvec >= bio->bi_io_vec);
75 bio_put(bio); 71 bio_put(bio);
76 if (atomic_dec_and_test(&super->s_pending_writes)) 72 if (atomic_dec_and_test(&super->s_pending_writes))
77 wake_up(&wq); 73 wake_up(&wq);
@@ -96,9 +92,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
96 if (i >= max_pages) { 92 if (i >= max_pages) {
97 /* Block layer cannot split bios :( */ 93 /* Block layer cannot split bios :( */
98 bio->bi_vcnt = i; 94 bio->bi_vcnt = i;
99 bio->bi_size = i * PAGE_SIZE; 95 bio->bi_iter.bi_size = i * PAGE_SIZE;
100 bio->bi_bdev = super->s_bdev; 96 bio->bi_bdev = super->s_bdev;
101 bio->bi_sector = ofs >> 9; 97 bio->bi_iter.bi_sector = ofs >> 9;
102 bio->bi_private = sb; 98 bio->bi_private = sb;
103 bio->bi_end_io = writeseg_end_io; 99 bio->bi_end_io = writeseg_end_io;
104 atomic_inc(&super->s_pending_writes); 100 atomic_inc(&super->s_pending_writes);
@@ -123,9 +119,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
123 unlock_page(page); 119 unlock_page(page);
124 } 120 }
125 bio->bi_vcnt = nr_pages; 121 bio->bi_vcnt = nr_pages;
126 bio->bi_size = nr_pages * PAGE_SIZE; 122 bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
127 bio->bi_bdev = super->s_bdev; 123 bio->bi_bdev = super->s_bdev;
128 bio->bi_sector = ofs >> 9; 124 bio->bi_iter.bi_sector = ofs >> 9;
129 bio->bi_private = sb; 125 bio->bi_private = sb;
130 bio->bi_end_io = writeseg_end_io; 126 bio->bi_end_io = writeseg_end_io;
131 atomic_inc(&super->s_pending_writes); 127 atomic_inc(&super->s_pending_writes);
@@ -188,9 +184,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
188 if (i >= max_pages) { 184 if (i >= max_pages) {
189 /* Block layer cannot split bios :( */ 185 /* Block layer cannot split bios :( */
190 bio->bi_vcnt = i; 186 bio->bi_vcnt = i;
191 bio->bi_size = i * PAGE_SIZE; 187 bio->bi_iter.bi_size = i * PAGE_SIZE;
192 bio->bi_bdev = super->s_bdev; 188 bio->bi_bdev = super->s_bdev;
193 bio->bi_sector = ofs >> 9; 189 bio->bi_iter.bi_sector = ofs >> 9;
194 bio->bi_private = sb; 190 bio->bi_private = sb;
195 bio->bi_end_io = erase_end_io; 191 bio->bi_end_io = erase_end_io;
196 atomic_inc(&super->s_pending_writes); 192 atomic_inc(&super->s_pending_writes);
@@ -209,9 +205,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
209 bio->bi_io_vec[i].bv_offset = 0; 205 bio->bi_io_vec[i].bv_offset = 0;
210 } 206 }
211 bio->bi_vcnt = nr_pages; 207 bio->bi_vcnt = nr_pages;
212 bio->bi_size = nr_pages * PAGE_SIZE; 208 bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
213 bio->bi_bdev = super->s_bdev; 209 bio->bi_bdev = super->s_bdev;
214 bio->bi_sector = ofs >> 9; 210 bio->bi_iter.bi_sector = ofs >> 9;
215 bio->bi_private = sb; 211 bio->bi_private = sb;
216 bio->bi_end_io = erase_end_io; 212 bio->bi_end_io = erase_end_io;
217 atomic_inc(&super->s_pending_writes); 213 atomic_inc(&super->s_pending_writes);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index d448a777166b..7f9b096d8d57 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -62,7 +62,8 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
62 page = read_cache_page(mapping, index, filler, sb); 62 page = read_cache_page(mapping, index, filler, sb);
63 else { 63 else {
64 page = find_or_create_page(mapping, index, GFP_NOFS); 64 page = find_or_create_page(mapping, index, GFP_NOFS);
65 unlock_page(page); 65 if (page)
66 unlock_page(page);
66 } 67 }
67 return page; 68 return page;
68} 69}
diff --git a/fs/mount.h b/fs/mount.h
index d64c594be6c4..b29e42f05f34 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -19,13 +19,13 @@ struct mnt_pcp {
19}; 19};
20 20
21struct mountpoint { 21struct mountpoint {
22 struct list_head m_hash; 22 struct hlist_node m_hash;
23 struct dentry *m_dentry; 23 struct dentry *m_dentry;
24 int m_count; 24 int m_count;
25}; 25};
26 26
27struct mount { 27struct mount {
28 struct list_head mnt_hash; 28 struct hlist_node mnt_hash;
29 struct mount *mnt_parent; 29 struct mount *mnt_parent;
30 struct dentry *mnt_mountpoint; 30 struct dentry *mnt_mountpoint;
31 struct vfsmount mnt; 31 struct vfsmount mnt;
@@ -74,7 +74,7 @@ static inline int mnt_has_parent(struct mount *mnt)
74static inline int is_mounted(struct vfsmount *mnt) 74static inline int is_mounted(struct vfsmount *mnt)
75{ 75{
76 /* neither detached nor internal? */ 76 /* neither detached nor internal? */
77 return !IS_ERR_OR_NULL(real_mount(mnt)); 77 return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns);
78} 78}
79 79
80extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); 80extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
diff --git a/fs/mpage.c b/fs/mpage.c
index 0face1c4d4c6..4979ffa60aaa 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -43,16 +43,14 @@
43 */ 43 */
44static void mpage_end_io(struct bio *bio, int err) 44static void mpage_end_io(struct bio *bio, int err)
45{ 45{
46 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 46 struct bio_vec *bv;
47 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 47 int i;
48 48
49 do { 49 bio_for_each_segment_all(bv, bio, i) {
50 struct page *page = bvec->bv_page; 50 struct page *page = bv->bv_page;
51 51
52 if (--bvec >= bio->bi_io_vec)
53 prefetchw(&bvec->bv_page->flags);
54 if (bio_data_dir(bio) == READ) { 52 if (bio_data_dir(bio) == READ) {
55 if (uptodate) { 53 if (!err) {
56 SetPageUptodate(page); 54 SetPageUptodate(page);
57 } else { 55 } else {
58 ClearPageUptodate(page); 56 ClearPageUptodate(page);
@@ -60,14 +58,15 @@ static void mpage_end_io(struct bio *bio, int err)
60 } 58 }
61 unlock_page(page); 59 unlock_page(page);
62 } else { /* bio_data_dir(bio) == WRITE */ 60 } else { /* bio_data_dir(bio) == WRITE */
63 if (!uptodate) { 61 if (err) {
64 SetPageError(page); 62 SetPageError(page);
65 if (page->mapping) 63 if (page->mapping)
66 set_bit(AS_EIO, &page->mapping->flags); 64 set_bit(AS_EIO, &page->mapping->flags);
67 } 65 }
68 end_page_writeback(page); 66 end_page_writeback(page);
69 } 67 }
70 } while (bvec >= bio->bi_io_vec); 68 }
69
71 bio_put(bio); 70 bio_put(bio);
72} 71}
73 72
@@ -94,7 +93,7 @@ mpage_alloc(struct block_device *bdev,
94 93
95 if (bio) { 94 if (bio) {
96 bio->bi_bdev = bdev; 95 bio->bi_bdev = bdev;
97 bio->bi_sector = first_sector; 96 bio->bi_iter.bi_sector = first_sector;
98 } 97 }
99 return bio; 98 return bio;
100} 99}
diff --git a/fs/namei.c b/fs/namei.c
index 3531deebad30..4b491b431990 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -196,6 +196,7 @@ recopy:
 		goto error;
 
 	result->uptr = filename;
+	result->aname = NULL;
 	audit_getname(result);
 	return result;
 
@@ -209,7 +210,35 @@ getname(const char __user * filename)
 {
 	return getname_flags(filename, 0, NULL);
 }
-EXPORT_SYMBOL(getname);
+
+/*
+ * The "getname_kernel()" interface doesn't do pathnames longer
+ * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user.
+ */
+struct filename *
+getname_kernel(const char * filename)
+{
+	struct filename *result;
+	char *kname;
+	int len;
+
+	len = strlen(filename);
+	if (len >= EMBEDDED_NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	result = __getname();
+	if (unlikely(!result))
+		return ERR_PTR(-ENOMEM);
+
+	kname = (char *)result + sizeof(*result);
+	result->name = kname;
+	result->uptr = NULL;
+	result->aname = NULL;
+	result->separate = false;
+
+	strlcpy(kname, filename, EMBEDDED_NAME_MAX);
+	return result;
+}
 
 #ifdef CONFIG_AUDITSYSCALL
 void putname(struct filename *name)
@@ -235,27 +264,9 @@ static int check_acl(struct inode *inode, int mask)
 		return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
 	}
 
-	acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
-
-	/*
-	 * A filesystem can force a ACL callback by just never filling the
-	 * ACL cache. But normally you'd fill the cache either at inode
-	 * instantiation time, or on the first ->get_acl call.
-	 *
-	 * If the filesystem doesn't have a get_acl() function at all, we'll
-	 * just create the negative cache entry.
-	 */
-	if (acl == ACL_NOT_CACHED) {
-		if (inode->i_op->get_acl) {
-			acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
-			if (IS_ERR(acl))
-				return PTR_ERR(acl);
-		} else {
-			set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
-			return -EAGAIN;
-		}
-	}
-
+	acl = get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
 	if (acl) {
 		int error = posix_acl_permission(inode, acl, mask);
 		posix_acl_release(acl);
@@ -1098,7 +1109,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 			return false;
 
 		if (!d_mountpoint(path->dentry))
-			break;
+			return true;
 
 		mounted = __lookup_mnt(path->mnt, path->dentry);
 		if (!mounted)
@@ -1114,20 +1125,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		 */
 		*inode = path->dentry->d_inode;
 	}
-	return true;
-}
-
-static void follow_mount_rcu(struct nameidata *nd)
-{
-	while (d_mountpoint(nd->path.dentry)) {
-		struct mount *mounted;
-		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
-		if (!mounted)
-			break;
-		nd->path.mnt = &mounted->mnt;
-		nd->path.dentry = mounted->mnt.mnt_root;
-		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-	}
+	return read_seqretry(&mount_lock, nd->m_seq);
 }
 
 static int follow_dotdot_rcu(struct nameidata *nd)
@@ -1155,7 +1153,17 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 			break;
 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
 	}
-	follow_mount_rcu(nd);
+	while (d_mountpoint(nd->path.dentry)) {
+		struct mount *mounted;
+		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
+		if (!mounted)
+			break;
+		nd->path.mnt = &mounted->mnt;
+		nd->path.dentry = mounted->mnt.mnt_root;
+		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+		if (!read_seqretry(&mount_lock, nd->m_seq))
+			goto failed;
+	}
 	nd->inode = nd->path.dentry->d_inode;
 	return 0;
 
@@ -1873,7 +1881,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 
 		nd->path = f.file->f_path;
 		if (flags & LOOKUP_RCU) {
-			if (f.need_put)
+			if (f.flags & FDPUT_FPUT)
 				*fp = f.file;
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 			rcu_read_lock();
@@ -3945,10 +3953,13 @@ out_dput:
 	done_path_create(&new_path, new_dentry);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
-		if (!error)
+		if (!error) {
+			path_put(&old_path);
 			goto retry;
+		}
 	}
 	if (retry_estale(error, how)) {
+		path_put(&old_path);
 		how |= LOOKUP_REVAL;
 		goto retry;
 	}
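
Note: getname_kernel() above gives in-kernel callers a struct filename without
a __user pointer; the name must fit the inline EMBEDDED_NAME_MAX buffer. A
hypothetical caller might look like this (sketch only; the path and the
function name are made up):

	static int example_kernel_lookup(void)
	{
		struct filename *name;

		name = getname_kernel("/etc/fstab");
		if (IS_ERR(name))
			return PTR_ERR(name);
		/* ... hand "name" to the path-walk machinery ... */
		putname(name);
		return 0;
	}
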
diff --git a/fs/namespace.c b/fs/namespace.c
index ac2ce8a766e1..2ffc5a2905d4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -23,11 +23,34 @@
 #include <linux/uaccess.h>
 #include <linux/proc_ns.h>
 #include <linux/magic.h>
+#include <linux/bootmem.h>
 #include "pnode.h"
 #include "internal.h"
 
-#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
-#define HASH_SIZE (1UL << HASH_SHIFT)
+static unsigned int m_hash_mask __read_mostly;
+static unsigned int m_hash_shift __read_mostly;
+static unsigned int mp_hash_mask __read_mostly;
+static unsigned int mp_hash_shift __read_mostly;
+
+static __initdata unsigned long mhash_entries;
+static int __init set_mhash_entries(char *str)
+{
+	if (!str)
+		return 0;
+	mhash_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("mhash_entries=", set_mhash_entries);
+
+static __initdata unsigned long mphash_entries;
+static int __init set_mphash_entries(char *str)
+{
+	if (!str)
+		return 0;
+	mphash_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("mphash_entries=", set_mphash_entries);
 
 static int event;
 static DEFINE_IDA(mnt_id_ida);
@@ -36,8 +59,8 @@ static DEFINE_SPINLOCK(mnt_id_lock);
 static int mnt_id_start = 0;
 static int mnt_group_start = 1;
 
-static struct list_head *mount_hashtable __read_mostly;
-static struct list_head *mountpoint_hashtable __read_mostly;
+static struct hlist_head *mount_hashtable __read_mostly;
+static struct hlist_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
 static DECLARE_RWSEM(namespace_sem);
 
@@ -55,12 +78,19 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
-static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
+static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
 {
 	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
 	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
-	tmp = tmp + (tmp >> HASH_SHIFT);
-	return tmp & (HASH_SIZE - 1);
+	tmp = tmp + (tmp >> m_hash_shift);
+	return &mount_hashtable[tmp & m_hash_mask];
+}
+
+static inline struct hlist_head *mp_hash(struct dentry *dentry)
+{
+	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
+	tmp = tmp + (tmp >> mp_hash_shift);
+	return &mountpoint_hashtable[tmp & mp_hash_mask];
 }
 
 /*
@@ -187,7 +217,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		mnt->mnt_writers = 0;
 #endif
 
-		INIT_LIST_HEAD(&mnt->mnt_hash);
+		INIT_HLIST_NODE(&mnt->mnt_hash);
 		INIT_LIST_HEAD(&mnt->mnt_child);
 		INIT_LIST_HEAD(&mnt->mnt_mounts);
 		INIT_LIST_HEAD(&mnt->mnt_list);
@@ -575,10 +605,10 @@ bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
 */
 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 {
-	struct list_head *head = mount_hashtable + hash(mnt, dentry);
+	struct hlist_head *head = m_hash(mnt, dentry);
 	struct mount *p;
 
-	list_for_each_entry_rcu(p, head, mnt_hash)
+	hlist_for_each_entry_rcu(p, head, mnt_hash)
 		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
 			return p;
 	return NULL;
@@ -590,13 +620,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 */
 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 {
-	struct list_head *head = mount_hashtable + hash(mnt, dentry);
-	struct mount *p;
-
-	list_for_each_entry_reverse(p, head, mnt_hash)
-		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
-			return p;
-	return NULL;
+	struct mount *p, *res;
+	res = p = __lookup_mnt(mnt, dentry);
+	if (!p)
+		goto out;
+	hlist_for_each_entry_continue(p, mnt_hash) {
+		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
+			break;
+		res = p;
+	}
+out:
+	return res;
 }
 
 /*
@@ -633,11 +667,11 @@ struct vfsmount *lookup_mnt(struct path *path)
 
 static struct mountpoint *new_mountpoint(struct dentry *dentry)
 {
-	struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
+	struct hlist_head *chain = mp_hash(dentry);
 	struct mountpoint *mp;
 	int ret;
 
-	list_for_each_entry(mp, chain, m_hash) {
+	hlist_for_each_entry(mp, chain, m_hash) {
 		if (mp->m_dentry == dentry) {
 			/* might be worth a WARN_ON() */
 			if (d_unlinked(dentry))
@@ -659,7 +693,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
 
 	mp->m_dentry = dentry;
 	mp->m_count = 1;
-	list_add(&mp->m_hash, chain);
+	hlist_add_head(&mp->m_hash, chain);
 	return mp;
 }
 
@@ -670,7 +704,7 @@ static void put_mountpoint(struct mountpoint *mp)
 		spin_lock(&dentry->d_lock);
 		dentry->d_flags &= ~DCACHE_MOUNTED;
 		spin_unlock(&dentry->d_lock);
-		list_del(&mp->m_hash);
+		hlist_del(&mp->m_hash);
 		kfree(mp);
 	}
 }
@@ -712,7 +746,7 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
 	mnt->mnt_parent = mnt;
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	list_del_init(&mnt->mnt_child);
-	list_del_init(&mnt->mnt_hash);
+	hlist_del_init_rcu(&mnt->mnt_hash);
 	put_mountpoint(mnt->mnt_mp);
 	mnt->mnt_mp = NULL;
 }
@@ -739,15 +773,14 @@ static void attach_mnt(struct mount *mnt,
 			struct mountpoint *mp)
 {
 	mnt_set_mountpoint(parent, mp, mnt);
-	list_add_tail(&mnt->mnt_hash, mount_hashtable +
-			hash(&parent->mnt, mp->m_dentry));
+	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
 	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
 }
 
 /*
  * vfsmount lock must be held for write
  */
-static void commit_tree(struct mount *mnt)
+static void commit_tree(struct mount *mnt, struct mount *shadows)
 {
 	struct mount *parent = mnt->mnt_parent;
 	struct mount *m;
@@ -762,8 +795,11 @@ static void commit_tree(struct mount *mnt)
 
 	list_splice(&head, n->list.prev);
 
-	list_add_tail(&mnt->mnt_hash, mount_hashtable +
-		hash(&parent->mnt, mnt->mnt_mountpoint));
+	if (shadows)
+		hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
+	else
+		hlist_add_head_rcu(&mnt->mnt_hash,
+				m_hash(&parent->mnt, mnt->mnt_mountpoint));
 	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
 	touch_mnt_namespace(n);
 }
@@ -1153,26 +1189,28 @@ int may_umount(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(may_umount);
 
-static LIST_HEAD(unmounted);	/* protected by namespace_sem */
+static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 
 static void namespace_unlock(void)
 {
 	struct mount *mnt;
-	LIST_HEAD(head);
+	struct hlist_head head = unmounted;
 
-	if (likely(list_empty(&unmounted))) {
+	if (likely(hlist_empty(&head))) {
 		up_write(&namespace_sem);
 		return;
 	}
 
-	list_splice_init(&unmounted, &head);
+	head.first->pprev = &head.first;
+	INIT_HLIST_HEAD(&unmounted);
+
 	up_write(&namespace_sem);
 
 	synchronize_rcu();
 
-	while (!list_empty(&head)) {
-		mnt = list_first_entry(&head, struct mount, mnt_hash);
-		list_del_init(&mnt->mnt_hash);
+	while (!hlist_empty(&head)) {
+		mnt = hlist_entry(head.first, struct mount, mnt_hash);
+		hlist_del_init(&mnt->mnt_hash);
 		if (mnt->mnt_ex_mountpoint.mnt)
 			path_put(&mnt->mnt_ex_mountpoint);
 		mntput(&mnt->mnt);
@@ -1193,16 +1231,19 @@ static inline void namespace_lock(void)
 */
 void umount_tree(struct mount *mnt, int how)
 {
-	LIST_HEAD(tmp_list);
+	HLIST_HEAD(tmp_list);
 	struct mount *p;
+	struct mount *last = NULL;
 
-	for (p = mnt; p; p = next_mnt(p, mnt))
-		list_move(&p->mnt_hash, &tmp_list);
+	for (p = mnt; p; p = next_mnt(p, mnt)) {
+		hlist_del_init_rcu(&p->mnt_hash);
+		hlist_add_head(&p->mnt_hash, &tmp_list);
+	}
 
 	if (how)
 		propagate_umount(&tmp_list);
 
-	list_for_each_entry(p, &tmp_list, mnt_hash) {
+	hlist_for_each_entry(p, &tmp_list, mnt_hash) {
 		list_del_init(&p->mnt_expire);
 		list_del_init(&p->mnt_list);
 		__touch_mnt_namespace(p->mnt_ns);
@@ -1220,8 +1261,13 @@ void umount_tree(struct mount *mnt, int how)
 			p->mnt_mp = NULL;
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
+		last = p;
+	}
+	if (last) {
+		last->mnt_hash.next = unmounted.first;
+		unmounted.first = tmp_list.first;
+		unmounted.first->pprev = &unmounted.first;
 	}
-	list_splice(&tmp_list, &unmounted);
 }
 
 static void shrink_submounts(struct mount *mnt);
@@ -1605,24 +1651,23 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 			struct mountpoint *dest_mp,
 			struct path *parent_path)
 {
-	LIST_HEAD(tree_list);
+	HLIST_HEAD(tree_list);
 	struct mount *child, *p;
+	struct hlist_node *n;
 	int err;
 
 	if (IS_MNT_SHARED(dest_mnt)) {
 		err = invent_group_ids(source_mnt, true);
 		if (err)
 			goto out;
-	}
-	err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
-	if (err)
-		goto out_cleanup_ids;
-
-	lock_mount_hash();
-
-	if (IS_MNT_SHARED(dest_mnt)) {
+		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
+		if (err)
+			goto out_cleanup_ids;
+		lock_mount_hash();
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
 			set_mnt_shared(p);
+	} else {
+		lock_mount_hash();
 	}
 	if (parent_path) {
 		detach_mnt(source_mnt, parent_path);
@@ -1630,20 +1675,22 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
-		commit_tree(source_mnt);
+		commit_tree(source_mnt, NULL);
 	}
 
-	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
-		list_del_init(&child->mnt_hash);
-		commit_tree(child);
+	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
+		struct mount *q;
+		hlist_del_init(&child->mnt_hash);
+		q = __lookup_mnt_last(&child->mnt_parent->mnt,
+				      child->mnt_mountpoint);
+		commit_tree(child, q);
 	}
 	unlock_mount_hash();
 
 	return 0;
 
 out_cleanup_ids:
-	if (IS_MNT_SHARED(dest_mnt))
-		cleanup_group_ids(source_mnt, NULL);
+	cleanup_group_ids(source_mnt, NULL);
 out:
 	return err;
 }
@@ -2777,18 +2824,26 @@ void __init mnt_init(void)
 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
 
-	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
-	mountpoint_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
+	mount_hashtable = alloc_large_system_hash("Mount-cache",
+				sizeof(struct hlist_head),
+				mhash_entries, 19,
+				0,
+				&m_hash_shift, &m_hash_mask, 0, 0);
+	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
+				sizeof(struct hlist_head),
+				mphash_entries, 19,
+				0,
+				&mp_hash_shift, &mp_hash_mask, 0, 0);
 
 	if (!mount_hashtable || !mountpoint_hashtable)
 		panic("Failed to allocate mount hash table\n");
 
-	printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);
+	for (u = 0; u <= m_hash_mask; u++)
+		INIT_HLIST_HEAD(&mount_hashtable[u]);
+	for (u = 0; u <= mp_hash_mask; u++)
+		INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
 
-	for (u = 0; u < HASH_SIZE; u++)
-		INIT_LIST_HEAD(&mount_hashtable[u]);
-	for (u = 0; u < HASH_SIZE; u++)
-		INIT_LIST_HEAD(&mountpoint_hashtable[u]);
+	kernfs_init();
 
 	err = sysfs_init();
 	if (err)
@@ -2886,7 +2941,7 @@ bool fs_fully_visible(struct file_system_type *type)
 			struct inode *inode = child->mnt_mountpoint->d_inode;
 			if (!S_ISDIR(inode->i_mode))
 				goto next;
-			if (inode->i_nlink != 2)
+			if (inode->i_nlink > 2)
 				goto next;
 		}
 		visible = true;
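
Note: the hunks above replace the fixed one-page list_head tables with hlist
tables sized at boot by alloc_large_system_hash(), which hands back the shift
and mask used for indexing; mhash_entries= and mphash_entries= on the kernel
command line override the sizing. A sketch of the bucket-selection pattern
m_hash()/mp_hash() now follow (illustrative; the example_* names are made up):

	static struct hlist_head *example_table;
	static unsigned int example_shift, example_mask;

	static inline struct hlist_head *example_bucket(unsigned long key)
	{
		/* fold the high bits down, then mask to the table size */
		key = key + (key >> example_shift);
		return &example_table[key & example_mask];
	}
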
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index e242bbf72972..56ff823ca82e 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -134,8 +134,8 @@ bl_submit_bio(int rw, struct bio *bio)
 	if (bio) {
 		get_parallel(bio->bi_private);
 		dprintk("%s submitting %s bio %u@%llu\n", __func__,
-			rw == READ ? "read" : "write",
-			bio->bi_size, (unsigned long long)bio->bi_sector);
+			rw == READ ? "read" : "write", bio->bi_iter.bi_size,
+			(unsigned long long)bio->bi_iter.bi_sector);
 		submit_bio(rw, bio);
 	}
 	return NULL;
@@ -156,7 +156,8 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
 	}
 
 	if (bio) {
-		bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+		bio->bi_iter.bi_sector = isect - be->be_f_offset +
+			be->be_v_offset;
 		bio->bi_bdev = be->be_mdev;
 		bio->bi_end_io = end_io;
 		bio->bi_private = par;
@@ -201,18 +202,14 @@ static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
 static void bl_end_io_read(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec;
+	int i;
 
-	do {
-		struct page *page = bvec->bv_page;
+	if (!err)
+		bio_for_each_segment_all(bvec, bio, i)
+			SetPageUptodate(bvec->bv_page);
 
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
-		if (uptodate)
-			SetPageUptodate(page);
-	} while (bvec >= bio->bi_io_vec);
-	if (!uptodate) {
+	if (err) {
 		struct nfs_read_data *rdata = par->data;
 		struct nfs_pgio_header *header = rdata->header;
 
@@ -383,20 +380,16 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
 static void bl_end_io_write_zero(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-
-	do {
-		struct page *page = bvec->bv_page;
+	struct bio_vec *bvec;
+	int i;
 
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
+	bio_for_each_segment_all(bvec, bio, i) {
 		/* This is the zeroing page we added */
-		end_page_writeback(page);
-		page_cache_release(page);
-	} while (bvec >= bio->bi_io_vec);
+		end_page_writeback(bvec->bv_page);
+		page_cache_release(bvec->bv_page);
+	}
 
-	if (unlikely(!uptodate)) {
+	if (unlikely(err)) {
 		struct nfs_write_data *data = par->data;
 		struct nfs_pgio_header *header = data->header;
 
@@ -519,7 +512,7 @@ bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
 	isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
 		(offset / SECTOR_SIZE);
 
-	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+	bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
 	bio->bi_bdev = be->be_mdev;
 	bio->bi_end_io = bl_read_single_end_io;
 
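
Note: the same immutable-biovec rename as in fs/mpage.c applies here: bi_sector
and bi_size now live in the bio's embedded struct bvec_iter. A small sketch of
the field mapping (illustrative only, not from this patch):

	static void example_setup_bio(struct bio *bio,
				      struct block_device *bdev,
				      sector_t sector)
	{
		bio->bi_bdev = bdev;
		bio->bi_iter.bi_sector = sector; /* was bio->bi_sector */
		/* bio->bi_iter.bi_size likewise replaces bio->bi_size */
	}
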
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index ef792f29f831..5d8ccecf5f5c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -659,16 +659,19 @@ int nfs_async_inode_return_delegation(struct inode *inode,
 
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation == NULL)
+		goto out_enoent;
 
-	if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) {
-		rcu_read_unlock();
-		return -ENOENT;
-	}
+	if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
+		goto out_enoent;
 	nfs_mark_return_delegation(server, delegation);
 	rcu_read_unlock();
 
 	nfs_delegation_run_state_manager(clp);
 	return 0;
+out_enoent:
+	rcu_read_unlock();
+	return -ENOENT;
 }
 
 static struct inode *
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 812154aff981..4a48fe4b84b6 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -274,6 +274,15 @@ out_eof:
 	return -EBADCOOKIE;
 }
 
+static bool
+nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
+{
+	if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+		return false;
+	smp_rmb();
+	return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
+}
+
 static
 int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
 {
@@ -287,8 +296,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 		struct nfs_open_dir_context *ctx = desc->file->private_data;
 
 		new_pos = desc->current_index + i;
-		if (ctx->attr_gencount != nfsi->attr_gencount
-		    || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
+		if (ctx->attr_gencount != nfsi->attr_gencount ||
+		    !nfs_readdir_inode_mapping_valid(nfsi)) {
 			ctx->duped = 0;
 			ctx->attr_gencount = nfsi->attr_gencount;
 		} else if (new_pos < desc->ctx->pos) {
@@ -1404,7 +1413,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	/* Expect a negative dentry */
 	BUG_ON(dentry->d_inode);
 
-	dfprintk(VFS, "NFS: atomic_open(%s/%ld), %pd\n",
+	dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n",
 		 dir->i_sb->s_id, dir->i_ino, dentry);
 
 	err = nfs_check_flags(open_flags);
@@ -1594,7 +1603,7 @@ int nfs_create(struct inode *dir, struct dentry *dentry,
 	int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
 	int error;
 
-	dfprintk(VFS, "NFS: create(%s/%ld), %pd\n",
+	dfprintk(VFS, "NFS: create(%s/%lu), %pd\n",
 		 dir->i_sb->s_id, dir->i_ino, dentry);
 
 	attr.ia_mode = mode;
@@ -1621,7 +1630,7 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
 	struct iattr attr;
 	int status;
 
-	dfprintk(VFS, "NFS: mknod(%s/%ld), %pd\n",
+	dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n",
 		 dir->i_sb->s_id, dir->i_ino, dentry);
 
 	if (!new_valid_dev(rdev))
@@ -1650,7 +1659,7 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	struct iattr attr;
 	int error;
 
-	dfprintk(VFS, "NFS: mkdir(%s/%ld), %pd\n",
+	dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n",
 		 dir->i_sb->s_id, dir->i_ino, dentry);
 
 	attr.ia_valid = ATTR_MODE;
@@ -1678,7 +1687,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int error;
 
-	dfprintk(VFS, "NFS: rmdir(%s/%ld), %pd\n",
+	dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n",
 		 dir->i_sb->s_id, dir->i_ino, dentry);
 
 	trace_nfs_rmdir_enter(dir, dentry);
@@ -1747,7 +1756,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
 	int error;
 	int need_rehash = 0;
 
-	dfprintk(VFS, "NFS: unlink(%s/%ld, %pd)\n", dir->i_sb->s_id,
+	dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id,
 		 dir->i_ino, dentry);
 
 	trace_nfs_unlink_enter(dir, dentry);
@@ -1798,7 +1807,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	unsigned int pathlen = strlen(symname);
 	int error;
 
-	dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s)\n", dir->i_sb->s_id,
+	dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id,
 		 dir->i_ino, dentry, symname);
 
 	if (pathlen > PAGE_SIZE)
@@ -1821,7 +1830,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
 	trace_nfs_symlink_exit(dir, dentry, error);
 	if (error != 0) {
-		dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s) error %d\n",
+		dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n",
 			 dir->i_sb->s_id, dir->i_ino,
 			 dentry, symname, error);
 		d_drop(dentry);
@@ -1837,6 +1846,11 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 							GFP_KERNEL)) {
 		SetPageUptodate(page);
 		unlock_page(page);
+		/*
+		 * add_to_page_cache_lru() grabs an extra page refcount.
+		 * Drop it here to avoid leaking this page later.
+		 */
+		page_cache_release(page);
 	} else
 		__free_page(page);
 
@@ -2304,7 +2318,7 @@ out:
 	if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
 		res = -EACCES;
 
-	dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
+	dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
 		inode->i_sb->s_id, inode->i_ino, mask, res);
 	return res;
 out_notsup:
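
Note: nfs_readdir_inode_mapping_valid() is the reader side of the new
NFS_INO_INVALIDATING protocol; its smp_rmb() pairs with the smp_wmb() in
nfs_revalidate_mapping() (fs/nfs/inode.c below), so a reader that sees
NFS_INO_INVALID_DATA already cleared also sees NFS_INO_INVALIDATING still set
while pages are being shot down. An annotated restatement of the helper
(sketch):

	static bool example_mapping_valid(struct nfs_inode *nfsi)
	{
		/* the writer clears these flags only after its smp_wmb() */
		if (nfsi->cache_validity &
		    (NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA))
			return false;
		smp_rmb();	/* pairs with the writer's smp_wmb() */
		/* ... so this bit is still visible during invalidation */
		return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
	}

The remaining hunks here fix printf formats: i_ino is an unsigned long, so %lu
is correct where %ld could print large inode numbers as negative.
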
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index d71d66c9e0a1..b8797ae6831f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -222,14 +222,31 @@ out:
  * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
  * the iocb is still valid here if this is a synchronous request.
  */
-static void nfs_direct_complete(struct nfs_direct_req *dreq)
+static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
 {
+	struct inode *inode = dreq->inode;
+
+	if (dreq->iocb && write) {
+		loff_t pos = dreq->iocb->ki_pos + dreq->count;
+
+		spin_lock(&inode->i_lock);
+		if (i_size_read(inode) < pos)
+			i_size_write(inode, pos);
+		spin_unlock(&inode->i_lock);
+	}
+
+	if (write)
+		nfs_zap_mapping(inode, inode->i_mapping);
+
+	inode_dio_done(inode);
+
 	if (dreq->iocb) {
 		long res = (long) dreq->error;
 		if (!res)
 			res = (long) dreq->count;
 		aio_complete(dreq->iocb, res, 0);
 	}
+
 	complete_all(&dreq->completion);
 
 	nfs_direct_req_release(dreq);
@@ -237,9 +254,9 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 
 static void nfs_direct_readpage_release(struct nfs_page *req)
 {
-	dprintk("NFS: direct read done (%s/%lld %d@%lld)\n",
+	dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",
 		req->wb_context->dentry->d_inode->i_sb->s_id,
-		(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+		(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
 		req->wb_bytes,
 		(long long)req_offset(req));
 	nfs_release_request(req);
@@ -272,7 +289,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 	}
 out_put:
 	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
+		nfs_direct_complete(dreq, false);
 	hdr->release(hdr);
 }
 
@@ -402,6 +419,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 					      loff_t pos, bool uio)
 {
 	struct nfs_pageio_descriptor desc;
+	struct inode *inode = dreq->inode;
 	ssize_t result = -EINVAL;
 	size_t requested_bytes = 0;
 	unsigned long seg;
@@ -410,6 +428,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 			     &nfs_direct_read_completion_ops);
 	get_dreq(dreq);
 	desc.pg_dreq = dreq;
+	atomic_inc(&inode->i_dio_count);
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
@@ -429,26 +448,69 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 	 * generic layer handle the completion.
 	 */
 	if (requested_bytes == 0) {
+		inode_dio_done(inode);
 		nfs_direct_req_release(dreq);
 		return result < 0 ? result : -EIO;
 	}
 
 	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
+		nfs_direct_complete(dreq, false);
 	return 0;
 }
 
-static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t pos, bool uio)
+/**
+ * nfs_file_direct_read - file direct read operation for NFS files
+ * @iocb: target I/O control block
+ * @iov: vector of user buffers into which to read data
+ * @nr_segs: size of iov vector
+ * @pos: byte offset in file where reading starts
+ *
+ * We use this function for direct reads instead of calling
+ * generic_file_aio_read() in order to avoid gfar's check to see if
+ * the request starts before the end of the file. For that check
+ * to work, we must generate a GETATTR before each direct read, and
+ * even then there is a window between the GETATTR and the subsequent
+ * READ where the file size could change. Our preference is simply
+ * to do all reads the application wants, and the server will take
+ * care of managing the end of file boundary.
+ *
+ * This function also eliminates unnecessarily updating the file's
+ * atime locally, as the NFS server sets the file's atime, and this
+ * client must read the updated atime from the server back into its
+ * cache.
+ */
+ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos, bool uio)
 {
-	ssize_t result = -ENOMEM;
-	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
 	struct nfs_direct_req *dreq;
 	struct nfs_lock_context *l_ctx;
+	ssize_t result = -EINVAL;
+	size_t count;
 
+	count = iov_length(iov, nr_segs);
+	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
+
+	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
+		file, count, (long long) pos);
+
+	result = 0;
+	if (!count)
+		goto out;
+
+	mutex_lock(&inode->i_mutex);
+	result = nfs_sync_mapping(mapping);
+	if (result)
+		goto out_unlock;
+
+	task_io_account_read(count);
+
+	result = -ENOMEM;
 	dreq = nfs_direct_req_alloc();
 	if (dreq == NULL)
-		goto out;
+		goto out_unlock;
 
 	dreq->inode = inode;
 	dreq->bytes_left = iov_length(iov, nr_segs);
@@ -464,20 +526,26 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 
 	NFS_I(inode)->read_io += iov_length(iov, nr_segs);
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
-	if (!result)
+
+	mutex_unlock(&inode->i_mutex);
+
+	if (!result) {
 		result = nfs_direct_wait(dreq);
+		if (result > 0)
+			iocb->ki_pos = pos + result;
+	}
+
+	nfs_direct_req_release(dreq);
+	return result;
+
 out_release:
 	nfs_direct_req_release(dreq);
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
 out:
 	return result;
 }
 
-static void nfs_inode_dio_write_done(struct inode *inode)
-{
-	nfs_zap_mapping(inode, inode->i_mapping);
-	inode_dio_done(inode);
-}
-
 #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
@@ -593,8 +661,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
 			nfs_direct_write_reschedule(dreq);
 			break;
 		default:
-			nfs_inode_dio_write_done(dreq->inode);
-			nfs_direct_complete(dreq);
+			nfs_direct_complete(dreq, true);
 	}
 }
 
@@ -610,8 +677,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
 
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
 {
-	nfs_inode_dio_write_done(inode);
-	nfs_direct_complete(dreq);
+	nfs_direct_complete(dreq, true);
 }
 #endif
 
@@ -842,93 +908,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 	return 0;
 }
 
-static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
-				unsigned long nr_segs, loff_t pos,
-				size_t count, bool uio)
-{
-	ssize_t result = -ENOMEM;
-	struct inode *inode = iocb->ki_filp->f_mapping->host;
-	struct nfs_direct_req *dreq;
-	struct nfs_lock_context *l_ctx;
-
-	dreq = nfs_direct_req_alloc();
-	if (!dreq)
-		goto out;
-
-	dreq->inode = inode;
-	dreq->bytes_left = count;
-	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
-	l_ctx = nfs_get_lock_context(dreq->ctx);
-	if (IS_ERR(l_ctx)) {
-		result = PTR_ERR(l_ctx);
-		goto out_release;
-	}
-	dreq->l_ctx = l_ctx;
-	if (!is_sync_kiocb(iocb))
-		dreq->iocb = iocb;
-
-	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
-	if (!result)
-		result = nfs_direct_wait(dreq);
-out_release:
-	nfs_direct_req_release(dreq);
-out:
-	return result;
-}
-
-/**
- * nfs_file_direct_read - file direct read operation for NFS files
- * @iocb: target I/O control block
- * @iov: vector of user buffers into which to read data
- * @nr_segs: size of iov vector
- * @pos: byte offset in file where reading starts
- *
- * We use this function for direct reads instead of calling
- * generic_file_aio_read() in order to avoid gfar's check to see if
- * the request starts before the end of the file. For that check
- * to work, we must generate a GETATTR before each direct read, and
- * even then there is a window between the GETATTR and the subsequent
- * READ where the file size could change. Our preference is simply
- * to do all reads the application wants, and the server will take
- * care of managing the end of file boundary.
- *
- * This function also eliminates unnecessarily updating the file's
- * atime locally, as the NFS server sets the file's atime, and this
- * client must read the updated atime from the server back into its
- * cache.
- */
-ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
-				unsigned long nr_segs, loff_t pos, bool uio)
-{
-	ssize_t retval = -EINVAL;
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	size_t count;
-
-	count = iov_length(iov, nr_segs);
-	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
-
-	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
-		file, count, (long long) pos);
-
-	retval = 0;
-	if (!count)
-		goto out;
-
-	retval = nfs_sync_mapping(mapping);
-	if (retval)
-		goto out;
-
-	task_io_account_read(count);
-
-	retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio);
-	if (retval > 0)
-		iocb->ki_pos = pos + retval;
-
-out:
-	return retval;
-}
-
 /**
  * nfs_file_direct_write - file direct write operation for NFS files
  * @iocb: target I/O control block
@@ -954,46 +933,96 @@ out:
 ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos, bool uio)
 {
-	ssize_t retval = -EINVAL;
+	ssize_t result = -EINVAL;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	struct nfs_direct_req *dreq;
+	struct nfs_lock_context *l_ctx;
+	loff_t end;
 	size_t count;
 
 	count = iov_length(iov, nr_segs);
+	end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+
 	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
 
 	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
 		file, count, (long long) pos);
 
-	retval = generic_write_checks(file, &pos, &count, 0);
-	if (retval)
+	result = generic_write_checks(file, &pos, &count, 0);
+	if (result)
 		goto out;
 
-	retval = -EINVAL;
+	result = -EINVAL;
 	if ((ssize_t) count < 0)
 		goto out;
-	retval = 0;
+	result = 0;
 	if (!count)
 		goto out;
 
-	retval = nfs_sync_mapping(mapping);
-	if (retval)
-		goto out;
+	mutex_lock(&inode->i_mutex);
+
+	result = nfs_sync_mapping(mapping);
+	if (result)
+		goto out_unlock;
+
+	if (mapping->nrpages) {
+		result = invalidate_inode_pages2_range(mapping,
+					pos >> PAGE_CACHE_SHIFT, end);
+		if (result)
+			goto out_unlock;
+	}
 
 	task_io_account_write(count);
 
-	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio);
-	if (retval > 0) {
-		struct inode *inode = mapping->host;
+	result = -ENOMEM;
+	dreq = nfs_direct_req_alloc();
+	if (!dreq)
+		goto out_unlock;
 
-		iocb->ki_pos = pos + retval;
-		spin_lock(&inode->i_lock);
-		if (i_size_read(inode) < iocb->ki_pos)
-			i_size_write(inode, iocb->ki_pos);
-		spin_unlock(&inode->i_lock);
+	dreq->inode = inode;
+	dreq->bytes_left = count;
+	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (IS_ERR(l_ctx)) {
+		result = PTR_ERR(l_ctx);
+		goto out_release;
+	}
+	dreq->l_ctx = l_ctx;
+	if (!is_sync_kiocb(iocb))
+		dreq->iocb = iocb;
+
+	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					      pos >> PAGE_CACHE_SHIFT, end);
 	}
+
+	mutex_unlock(&inode->i_mutex);
+
+	if (!result) {
+		result = nfs_direct_wait(dreq);
+		if (result > 0) {
+			struct inode *inode = mapping->host;
+
+			iocb->ki_pos = pos + result;
+			spin_lock(&inode->i_lock);
+			if (i_size_read(inode) < iocb->ki_pos)
+				i_size_write(inode, iocb->ki_pos);
+			spin_unlock(&inode->i_lock);
+		}
+	}
+	nfs_direct_req_release(dreq);
+	return result;
+
+out_release:
+	nfs_direct_req_release(dreq);
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
 out:
-	return retval;
+	return result;
 }
 
 /**
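
Note: this rewrite folds the old nfs_direct_read()/nfs_direct_write() helpers
into nfs_file_direct_read()/nfs_file_direct_write(), takes i_mutex across the
mapping sync, invalidation and request scheduling, and moves the post-write
i_size update and mapping zap into nfs_direct_complete(). A sketch of the
size-update idiom used on write completion (illustrative; the function name is
made up):

	static void example_extend_isize(struct inode *inode, loff_t pos)
	{
		/* i_size_write() needs exclusion; i_lock provides it here */
		spin_lock(&inode->i_lock);
		if (i_size_read(inode) < pos)
			i_size_write(inode, pos);
		spin_unlock(&inode->i_lock);
	}
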
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e2fcacf07de3..5bb790a69c71 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 	struct page *page;
 	int once_thru = 0;
 
-	dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%ld), %u@%lld)\n",
+	dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
 		file, mapping->host->i_ino, len, (long long) pos);
 
 start:
@@ -395,7 +395,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	int status;
 
-	dfprintk(PAGECACHE, "NFS: write_end(%pD2(%ld), %u@%lld)\n",
+	dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n",
 		file, mapping->host->i_ino, len, (long long) pos);
 
 	/*
@@ -585,7 +585,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	int ret = VM_FAULT_NOPAGE;
 	struct address_space *mapping;
 
-	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%ld), offset %lld)\n",
+	dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n",
 		filp, filp->f_mapping->host->i_ino,
 		(long long)page_offset(page));
 
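
Note: the three hunks above are the same %ld -> %lu format fix applied in
fs/nfs/dir.c: inode->i_ino is an unsigned long, so for example:

	printk(KERN_DEBUG "inode %lu\n", inode->i_ino);	/* %ld could go negative */
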
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 00ad1c2b217d..360114ae8b82 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -164,17 +164,16 @@ static void nfs_zap_caches_locked(struct inode *inode)
 	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
 		nfs_fscache_invalidate(inode);
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-					| NFS_INO_INVALID_LABEL
 					| NFS_INO_INVALID_DATA
 					| NFS_INO_INVALID_ACCESS
 					| NFS_INO_INVALID_ACL
 					| NFS_INO_REVAL_PAGECACHE;
 	} else
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR
-					| NFS_INO_INVALID_LABEL
 					| NFS_INO_INVALID_ACCESS
 					| NFS_INO_INVALID_ACL
 					| NFS_INO_REVAL_PAGECACHE;
+	nfs_zap_label_cache_locked(nfsi);
 }
 
 void nfs_zap_caches(struct inode *inode)
@@ -266,6 +265,13 @@ nfs_init_locked(struct inode *inode, void *opaque)
 }
 
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static void nfs_clear_label_invalid(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL;
+	spin_unlock(&inode->i_lock);
+}
+
 void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
 		struct nfs4_label *label)
 {
@@ -283,6 +289,7 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
 			__func__,
 			(char *)label->label,
 			label->len, error);
+		nfs_clear_label_invalid(inode);
 	}
 }
 
@@ -458,9 +465,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 		unlock_new_inode(inode);
 	} else
 		nfs_refresh_inode(inode, fattr);
-	dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
+	dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n",
 		inode->i_sb->s_id,
-		(long long)NFS_FILEID(inode),
+		(unsigned long long)NFS_FILEID(inode),
 		nfs_display_fhandle_hash(fh),
 		atomic_read(&inode->i_count));
 
@@ -870,8 +877,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	struct nfs_fattr *fattr = NULL;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
-		inode->i_sb->s_id, (long long)NFS_FILEID(inode));
+	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n",
+		inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode));
 
 	trace_nfs_revalidate_inode_enter(inode);
 
@@ -895,9 +902,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 
 	status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
 	if (status != 0) {
-		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
+		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n",
 			 inode->i_sb->s_id,
-			 (long long)NFS_FILEID(inode), status);
+			 (unsigned long long)NFS_FILEID(inode), status);
 		if (status == -ESTALE) {
 			nfs_zap_caches(inode);
 			if (!S_ISDIR(inode->i_mode))
@@ -908,9 +915,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 
 	status = nfs_refresh_inode(inode, fattr);
 	if (status) {
-		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
+		dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n",
 			 inode->i_sb->s_id,
-			 (long long)NFS_FILEID(inode), status);
+			 (unsigned long long)NFS_FILEID(inode), status);
 		goto err_out;
 	}
 
@@ -919,9 +926,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 
 	nfs_setsecurity(inode, fattr, label);
 
-	dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n",
+	dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n",
 		inode->i_sb->s_id,
-		(long long)NFS_FILEID(inode));
+		(unsigned long long)NFS_FILEID(inode));
 
 err_out:
 	nfs4_label_free(label);
@@ -977,16 +984,17 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 		if (ret < 0)
 			return ret;
 	}
-	spin_lock(&inode->i_lock);
-	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode)) {
+		spin_lock(&inode->i_lock);
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
-	spin_unlock(&inode->i_lock);
+		spin_unlock(&inode->i_lock);
+	}
 	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
 	nfs_fscache_wait_on_invalidate(inode);
 
-	dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
-		inode->i_sb->s_id, (long long)NFS_FILEID(inode));
+	dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n",
+		inode->i_sb->s_id,
+		(unsigned long long)NFS_FILEID(inode));
 	return 0;
 }
 
@@ -1007,6 +1015,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
 int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long *bitlock = &nfsi->flags;
 	int ret = 0;
 
 	/* swapfiles are not supposed to be shared. */
@@ -1018,12 +1027,46 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 		if (ret < 0)
 			goto out;
 	}
-	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
-		trace_nfs_invalidate_mapping_enter(inode);
-		ret = nfs_invalidate_mapping(inode, mapping);
-		trace_nfs_invalidate_mapping_exit(inode, ret);
+
+	/*
+	 * We must clear NFS_INO_INVALID_DATA first to ensure that
+	 * invalidations that come in while we're shooting down the mappings
+	 * are respected. But, that leaves a race window where one revalidator
+	 * can clear the flag, and then another checks it before the mapping
+	 * gets invalidated. Fix that by serializing access to this part of
+	 * the function.
+	 *
+	 * At the same time, we need to allow other tasks to see whether we
+	 * might be in the middle of invalidating the pages, so we only set
+	 * the bit lock here if it looks like we're going to be doing that.
+	 */
+	for (;;) {
+		ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING,
+				  nfs_wait_bit_killable, TASK_KILLABLE);
+		if (ret)
+			goto out;
+		spin_lock(&inode->i_lock);
+		if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+		if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+			break;
+		spin_unlock(&inode->i_lock);
+		goto out;
 	}
 
+	set_bit(NFS_INO_INVALIDATING, bitlock);
+	smp_wmb();
+	nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+	spin_unlock(&inode->i_lock);
+	trace_nfs_invalidate_mapping_enter(inode);
+	ret = nfs_invalidate_mapping(inode, mapping);
+	trace_nfs_invalidate_mapping_exit(inode, ret);
+
+	clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
+	smp_mb__after_clear_bit();
+	wake_up_bit(bitlock, NFS_INO_INVALIDATING);
 out:
 	return ret;
 }
@@ -1282,12 +1325,28 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
 		((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
 }
 
+/*
+ * Don't trust the change_attribute, mtime, ctime or size if
+ * a pnfs LAYOUTCOMMIT is outstanding
+ */
+static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode,
+		struct nfs_fattr *fattr)
+{
+	if (pnfs_layoutcommit_outstanding(inode))
+		fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE |
+				NFS_ATTR_FATTR_MTIME |
+				NFS_ATTR_FATTR_CTIME |
+				NFS_ATTR_FATTR_SIZE);
+}
+
 static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
 {
 	int ret;
 
 	trace_nfs_refresh_inode_enter(inode);
 
+	nfs_inode_attrs_handle_layoutcommit(inode, fattr);
+
 	if (nfs_inode_attrs_need_update(inode, fattr))
 		ret = nfs_update_inode(inode, fattr);
 	else
@@ -1434,7 +1493,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	unsigned long now = jiffies;
 	unsigned long save_cache_validity;
 
-	dfprintk(VFS, "NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n",
+	dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
 			__func__, inode->i_sb->s_id, inode->i_ino,
 			nfs_display_fhandle_hash(NFS_FH(inode)),
 			atomic_read(&inode->i_count), fattr->valid);
@@ -1455,7 +1514,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		/*
 		 * Big trouble! The inode has become a different object.
 		 */
-		printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n",
+		printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n",
 			__func__, inode->i_ino, inode->i_mode, fattr->mode);
 		goto out_err;
 	}
@@ -1517,8 +1576,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		if (new_isize != cur_isize) {
 			/* Do we perhaps have any outstanding writes, or has
 			 * the file grown beyond our last write? */
-			if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) ||
-				new_isize > cur_isize) {
+			if ((nfsi->npages == 0) || new_isize > cur_isize) {
 				i_size_write(inode, new_isize);
 				invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 			}
@@ -1597,7 +1655,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		inode->i_blocks = fattr->du.nfs2.blocks;
 
 	/* Update attrtimeo value if we're out of the unstable period */
-	if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) {
+	if (invalid & NFS_INO_INVALID_ATTR) {
 		nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = now;
@@ -1610,7 +1668,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1610 } 1668 }
1611 } 1669 }
1612 invalid &= ~NFS_INO_INVALID_ATTR; 1670 invalid &= ~NFS_INO_INVALID_ATTR;
1613 invalid &= ~NFS_INO_INVALID_LABEL;
1614 /* Don't invalidate the data if we were to blame */ 1671 /* Don't invalidate the data if we were to blame */
1615 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) 1672 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
1616 || S_ISLNK(inode->i_mode))) 1673 || S_ISLNK(inode->i_mode)))
@@ -1641,10 +1698,6 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
1641 return NULL; 1698 return NULL;
1642 nfsi->flags = 0UL; 1699 nfsi->flags = 0UL;
1643 nfsi->cache_validity = 0UL; 1700 nfsi->cache_validity = 0UL;
1644#ifdef CONFIG_NFS_V3_ACL
1645 nfsi->acl_access = ERR_PTR(-EAGAIN);
1646 nfsi->acl_default = ERR_PTR(-EAGAIN);
1647#endif
1648#if IS_ENABLED(CONFIG_NFS_V4) 1701#if IS_ENABLED(CONFIG_NFS_V4)
1649 nfsi->nfs4_acl = NULL; 1702 nfsi->nfs4_acl = NULL;
1650#endif /* CONFIG_NFS_V4 */ 1703#endif /* CONFIG_NFS_V4 */
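With nfs3acl.c moving to the generic POSIX ACL cache (see the nfs3acl.c hunks below), the per-NFS-inode acl_access/acl_default pointers removed above become redundant: the equivalent state lives in the common inode. A sketch of the relevant VFS side in this kernel era, simplified:

	struct inode {
		/* ... */
	#ifdef CONFIG_FS_POSIX_ACL
		struct posix_acl	*i_acl;		/* ACL_NOT_CACHED when unknown */
		struct posix_acl	*i_default_acl;
	#endif
		/* ... */
	};

Both fields are managed through get_cached_acl()/set_cached_acl() and dropped wholesale by forget_all_cached_acls(), which is why .clear_acl_cache in nfs_v3_clientops can later point at the generic helper.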
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 8b5cc04a8611..b46cf5a67329 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -176,7 +176,8 @@ extern struct nfs_server *nfs4_create_server(
176extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, 176extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
177 struct nfs_fh *); 177 struct nfs_fh *);
178extern int nfs4_update_server(struct nfs_server *server, const char *hostname, 178extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
179 struct sockaddr *sap, size_t salen); 179 struct sockaddr *sap, size_t salen,
180 struct net *net);
180extern void nfs_free_server(struct nfs_server *server); 181extern void nfs_free_server(struct nfs_server *server);
181extern struct nfs_server *nfs_clone_server(struct nfs_server *, 182extern struct nfs_server *nfs_clone_server(struct nfs_server *,
182 struct nfs_fh *, 183 struct nfs_fh *,
@@ -279,9 +280,18 @@ static inline void nfs4_label_free(struct nfs4_label *label)
279 } 280 }
280 return; 281 return;
281} 282}
283
284static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
285{
286 if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL))
287 nfsi->cache_validity |= NFS_INO_INVALID_LABEL;
288}
282#else 289#else
283static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } 290static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; }
284static inline void nfs4_label_free(void *label) {} 291static inline void nfs4_label_free(void *label) {}
292static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
293{
294}
285#endif /* CONFIG_NFS_V4_SECURITY_LABEL */ 295#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
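nfs_zap_label_cache_locked() is meant to run under inode->i_lock (hence the _locked suffix), and the empty stub keeps call sites unconditional when security labels are compiled out. A hedged sketch of the intended call pattern:

	spin_lock(&inode->i_lock);
	/* Sets NFS_INO_INVALID_LABEL only when the server advertises
	 * NFS_CAP_SECURITY_LABEL; a no-op otherwise. */
	nfs_zap_label_cache_locked(NFS_I(inode));
	spin_unlock(&inode->i_lock);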
286 296
287/* proc.c */ 297/* proc.c */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 4a1aafba6a20..871d6eda8dba 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -10,179 +10,7 @@
10 10
11#define NFSDBG_FACILITY NFSDBG_PROC 11#define NFSDBG_FACILITY NFSDBG_PROC
12 12
13ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) 13struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
14{
15 struct inode *inode = dentry->d_inode;
16 struct posix_acl *acl;
17 int pos=0, len=0;
18
19# define output(s) do { \
20 if (pos + sizeof(s) <= size) { \
21 memcpy(buffer + pos, s, sizeof(s)); \
22 pos += sizeof(s); \
23 } \
24 len += sizeof(s); \
25 } while(0)
26
27 acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS);
28 if (IS_ERR(acl))
29 return PTR_ERR(acl);
30 if (acl) {
31 output("system.posix_acl_access");
32 posix_acl_release(acl);
33 }
34
35 if (S_ISDIR(inode->i_mode)) {
36 acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT);
37 if (IS_ERR(acl))
38 return PTR_ERR(acl);
39 if (acl) {
40 output("system.posix_acl_default");
41 posix_acl_release(acl);
42 }
43 }
44
45# undef output
46
47 if (!buffer || len <= size)
48 return len;
49 return -ERANGE;
50}
51
52ssize_t nfs3_getxattr(struct dentry *dentry, const char *name,
53 void *buffer, size_t size)
54{
55 struct inode *inode = dentry->d_inode;
56 struct posix_acl *acl;
57 int type, error = 0;
58
59 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
60 type = ACL_TYPE_ACCESS;
61 else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
62 type = ACL_TYPE_DEFAULT;
63 else
64 return -EOPNOTSUPP;
65
66 acl = nfs3_proc_getacl(inode, type);
67 if (IS_ERR(acl))
68 return PTR_ERR(acl);
69 else if (acl) {
70 if (type == ACL_TYPE_ACCESS && acl->a_count == 0)
71 error = -ENODATA;
72 else
73 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
74 posix_acl_release(acl);
75 } else
76 error = -ENODATA;
77
78 return error;
79}
80
81int nfs3_setxattr(struct dentry *dentry, const char *name,
82 const void *value, size_t size, int flags)
83{
84 struct inode *inode = dentry->d_inode;
85 struct posix_acl *acl;
86 int type, error;
87
88 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
89 type = ACL_TYPE_ACCESS;
90 else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
91 type = ACL_TYPE_DEFAULT;
92 else
93 return -EOPNOTSUPP;
94
95 acl = posix_acl_from_xattr(&init_user_ns, value, size);
96 if (IS_ERR(acl))
97 return PTR_ERR(acl);
98 error = nfs3_proc_setacl(inode, type, acl);
99 posix_acl_release(acl);
100
101 return error;
102}
103
104int nfs3_removexattr(struct dentry *dentry, const char *name)
105{
106 struct inode *inode = dentry->d_inode;
107 int type;
108
109 if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
110 type = ACL_TYPE_ACCESS;
111 else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
112 type = ACL_TYPE_DEFAULT;
113 else
114 return -EOPNOTSUPP;
115
116 return nfs3_proc_setacl(inode, type, NULL);
117}
118
119static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi)
120{
121 if (!IS_ERR(nfsi->acl_access)) {
122 posix_acl_release(nfsi->acl_access);
123 nfsi->acl_access = ERR_PTR(-EAGAIN);
124 }
125 if (!IS_ERR(nfsi->acl_default)) {
126 posix_acl_release(nfsi->acl_default);
127 nfsi->acl_default = ERR_PTR(-EAGAIN);
128 }
129}
130
131void nfs3_forget_cached_acls(struct inode *inode)
132{
133 dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id,
134 inode->i_ino);
135 spin_lock(&inode->i_lock);
136 __nfs3_forget_cached_acls(NFS_I(inode));
137 spin_unlock(&inode->i_lock);
138}
139
140static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type)
141{
142 struct nfs_inode *nfsi = NFS_I(inode);
143 struct posix_acl *acl = ERR_PTR(-EINVAL);
144
145 spin_lock(&inode->i_lock);
146 switch(type) {
147 case ACL_TYPE_ACCESS:
148 acl = nfsi->acl_access;
149 break;
150
151 case ACL_TYPE_DEFAULT:
152 acl = nfsi->acl_default;
153 break;
154
155 default:
156 goto out;
157 }
158 if (IS_ERR(acl))
159 acl = ERR_PTR(-EAGAIN);
160 else
161 acl = posix_acl_dup(acl);
162out:
163 spin_unlock(&inode->i_lock);
164 dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id,
165 inode->i_ino, type, acl);
166 return acl;
167}
168
169static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
170 struct posix_acl *dfacl)
171{
172 struct nfs_inode *nfsi = NFS_I(inode);
173
174 dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id,
175 inode->i_ino, acl, dfacl);
176 spin_lock(&inode->i_lock);
177 __nfs3_forget_cached_acls(NFS_I(inode));
178 if (!IS_ERR(acl))
179 nfsi->acl_access = posix_acl_dup(acl);
180 if (!IS_ERR(dfacl))
181 nfsi->acl_default = posix_acl_dup(dfacl);
182 spin_unlock(&inode->i_lock);
183}
184
185struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
186{ 14{
187 struct nfs_server *server = NFS_SERVER(inode); 15 struct nfs_server *server = NFS_SERVER(inode);
188 struct page *pages[NFSACL_MAXPAGES] = { }; 16 struct page *pages[NFSACL_MAXPAGES] = { };
@@ -198,7 +26,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
198 .rpc_argp = &args, 26 .rpc_argp = &args,
199 .rpc_resp = &res, 27 .rpc_resp = &res,
200 }; 28 };
201 struct posix_acl *acl;
202 int status, count; 29 int status, count;
203 30
204 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 31 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
@@ -207,10 +34,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
207 status = nfs_revalidate_inode(server, inode); 34 status = nfs_revalidate_inode(server, inode);
208 if (status < 0) 35 if (status < 0)
209 return ERR_PTR(status); 36 return ERR_PTR(status);
210 acl = nfs3_get_cached_acl(inode, type);
211 if (acl != ERR_PTR(-EAGAIN))
212 return acl;
213 acl = NULL;
214 37
215 /* 38 /*
216 * Only get the access acl when explicitly requested: We don't 39 * Only get the access acl when explicitly requested: We don't
@@ -257,40 +80,41 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
257 } 80 }
258 81
259 if (res.acl_access != NULL) { 82 if (res.acl_access != NULL) {
260 if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { 83 if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) ||
84 res.acl_access->a_count == 0) {
261 posix_acl_release(res.acl_access); 85 posix_acl_release(res.acl_access);
262 res.acl_access = NULL; 86 res.acl_access = NULL;
263 } 87 }
264 } 88 }
265 nfs3_cache_acls(inode,
266 (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL),
267 (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL));
268 89
269 switch(type) { 90 if (res.mask & NFS_ACL)
270 case ACL_TYPE_ACCESS: 91 set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access);
271 acl = res.acl_access; 92 else
272 res.acl_access = NULL; 93 forget_cached_acl(inode, ACL_TYPE_ACCESS);
273 break;
274 94
275 case ACL_TYPE_DEFAULT: 95 if (res.mask & NFS_DFACL)
276 acl = res.acl_default; 96 set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default);
277 res.acl_default = NULL; 97 else
98 forget_cached_acl(inode, ACL_TYPE_DEFAULT);
99
100 nfs_free_fattr(res.fattr);
101 if (type == ACL_TYPE_ACCESS) {
102 posix_acl_release(res.acl_default);
103 return res.acl_access;
104 } else {
105 posix_acl_release(res.acl_access);
106 return res.acl_default;
278 } 107 }
279 108
280getout: 109getout:
281 posix_acl_release(res.acl_access); 110 posix_acl_release(res.acl_access);
282 posix_acl_release(res.acl_default); 111 posix_acl_release(res.acl_default);
283 nfs_free_fattr(res.fattr); 112 nfs_free_fattr(res.fattr);
284 113 return ERR_PTR(status);
285 if (status != 0) {
286 posix_acl_release(acl);
287 acl = ERR_PTR(status);
288 }
289 return acl;
290} 114}
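The cache lookup deleted above is not lost: callers now reach nfs3_get_acl() through the VFS get_acl() helper, which consults the generic cache first. A simplified sketch of that helper (paraphrased from fs/posix_acl.c of this era, not verbatim):

	struct posix_acl *get_acl(struct inode *inode, int type)
	{
		struct posix_acl *acl = get_cached_acl(inode, type);

		if (acl != ACL_NOT_CACHED)	/* hit; may legitimately be NULL */
			return acl;
		if (!inode->i_op->get_acl)
			return NULL;
		/* Miss: ask the filesystem (here nfs3_get_acl()), which
		 * refills the cache via set_cached_acl(). */
		return inode->i_op->get_acl(inode, type);
	}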
291 115
292static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, 116static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
293 struct posix_acl *dfacl) 117 struct posix_acl *dfacl)
294{ 118{
295 struct nfs_server *server = NFS_SERVER(inode); 119 struct nfs_server *server = NFS_SERVER(inode);
296 struct nfs_fattr *fattr; 120 struct nfs_fattr *fattr;
@@ -353,7 +177,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
353 switch (status) { 177 switch (status) {
354 case 0: 178 case 0:
355 status = nfs_refresh_inode(inode, fattr); 179 status = nfs_refresh_inode(inode, fattr);
356 nfs3_cache_acls(inode, acl, dfacl); 180 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
181 set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl);
357 break; 182 break;
358 case -EPFNOSUPPORT: 183 case -EPFNOSUPPORT:
359 case -EPROTONOSUPPORT: 184 case -EPROTONOSUPPORT:
@@ -373,40 +198,43 @@ out:
373 return status; 198 return status;
374} 199}
375 200
376int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) 201int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
202 struct posix_acl *dfacl)
203{
204 int ret;
205 ret = __nfs3_proc_setacls(inode, acl, dfacl);
206 return (ret == -EOPNOTSUPP) ? 0 : ret;
207
208}
209
210int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
377{ 211{
378 struct posix_acl *alloc = NULL, *dfacl = NULL; 212 struct posix_acl *alloc = NULL, *dfacl = NULL;
379 int status; 213 int status;
380 214
381 if (S_ISDIR(inode->i_mode)) { 215 if (S_ISDIR(inode->i_mode)) {
382 switch(type) { 216 switch(type) {
383 case ACL_TYPE_ACCESS: 217 case ACL_TYPE_ACCESS:
384 alloc = dfacl = nfs3_proc_getacl(inode, 218 alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT);
385 ACL_TYPE_DEFAULT); 219 if (IS_ERR(alloc))
386 if (IS_ERR(alloc)) 220 goto fail;
387 goto fail; 221 break;
388 break;
389
390 case ACL_TYPE_DEFAULT:
391 dfacl = acl;
392 alloc = acl = nfs3_proc_getacl(inode,
393 ACL_TYPE_ACCESS);
394 if (IS_ERR(alloc))
395 goto fail;
396 break;
397 222
398 default: 223 case ACL_TYPE_DEFAULT:
399 return -EINVAL; 224 dfacl = acl;
225 alloc = acl = get_acl(inode, ACL_TYPE_ACCESS);
226 if (IS_ERR(alloc))
227 goto fail;
228 break;
400 } 229 }
401 } else if (type != ACL_TYPE_ACCESS) 230 }
402 return -EINVAL;
403 231
404 if (acl == NULL) { 232 if (acl == NULL) {
405 alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); 233 alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
406 if (IS_ERR(alloc)) 234 if (IS_ERR(alloc))
407 goto fail; 235 goto fail;
408 } 236 }
409 status = nfs3_proc_setacls(inode, acl, dfacl); 237 status = __nfs3_proc_setacls(inode, acl, dfacl);
410 posix_acl_release(alloc); 238 posix_acl_release(alloc);
411 return status; 239 return status;
412 240
@@ -414,27 +242,8 @@ fail:
414 return PTR_ERR(alloc); 242 return PTR_ERR(alloc);
415} 243}
416 244
417int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, 245const struct xattr_handler *nfs3_xattr_handlers[] = {
418 umode_t mode) 246 &posix_acl_access_xattr_handler,
419{ 247 &posix_acl_default_xattr_handler,
420 struct posix_acl *dfacl, *acl; 248 NULL,
421 int error = 0; 249};
422
423 dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT);
424 if (IS_ERR(dfacl)) {
425 error = PTR_ERR(dfacl);
426 return (error == -EOPNOTSUPP) ? 0 : error;
427 }
428 if (!dfacl)
429 return 0;
430 acl = posix_acl_dup(dfacl);
431 error = posix_acl_create(&acl, GFP_KERNEL, &mode);
432 if (error < 0)
433 goto out_release_dfacl;
434 error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ?
435 dfacl : NULL);
436 posix_acl_release(acl);
437out_release_dfacl:
438 posix_acl_release(dfacl);
439 return error;
440}
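With nfs3_xattr_handlers registered, the generic xattr entry points take over the system.posix_acl_* name matching and encode/decode that the deleted nfs3_getxattr()/nfs3_setxattr()/nfs3_removexattr() used to do by hand. Roughly, on the read side (a simplified sketch of the generic handler, not verbatim):

	static int posix_acl_xattr_get(struct dentry *dentry, const char *name,
				       void *value, size_t size, int type)
	{
		struct posix_acl *acl;
		int error;

		acl = get_acl(dentry->d_inode, type);	/* -> nfs3_get_acl() */
		if (IS_ERR(acl))
			return PTR_ERR(acl);
		if (acl == NULL)
			return -ENODATA;
		error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
		posix_acl_release(acl);
		return error;
	}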
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 01b6f6a49d16..a462ef0fb5d6 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -18,6 +18,7 @@
18#include <linux/lockd/bind.h> 18#include <linux/lockd/bind.h>
19#include <linux/nfs_mount.h> 19#include <linux/nfs_mount.h>
20#include <linux/freezer.h> 20#include <linux/freezer.h>
21#include <linux/xattr.h>
21 22
22#include "iostat.h" 23#include "iostat.h"
23#include "internal.h" 24#include "internal.h"
@@ -317,8 +318,8 @@ static int
317nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 318nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
318 int flags) 319 int flags)
319{ 320{
321 struct posix_acl *default_acl, *acl;
320 struct nfs3_createdata *data; 322 struct nfs3_createdata *data;
321 umode_t mode = sattr->ia_mode;
322 int status = -ENOMEM; 323 int status = -ENOMEM;
323 324
324 dprintk("NFS call create %pd\n", dentry); 325 dprintk("NFS call create %pd\n", dentry);
@@ -340,7 +341,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
340 data->arg.create.verifier[1] = cpu_to_be32(current->pid); 341 data->arg.create.verifier[1] = cpu_to_be32(current->pid);
341 } 342 }
342 343
343 sattr->ia_mode &= ~current_umask(); 344 status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
345 if (status)
346 goto out;
344 347
345 for (;;) { 348 for (;;) {
346 status = nfs3_do_create(dir, dentry, data); 349 status = nfs3_do_create(dir, dentry, data);
@@ -366,7 +369,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
366 } 369 }
367 370
368 if (status != 0) 371 if (status != 0)
369 goto out; 372 goto out_release_acls;
370 373
371 /* When we created the file with exclusive semantics, make 374 /* When we created the file with exclusive semantics, make
372 * sure we set the attributes afterwards. */ 375 * sure we set the attributes afterwards. */
@@ -385,9 +388,14 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
385 nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); 388 nfs_post_op_update_inode(dentry->d_inode, data->res.fattr);
386 dprintk("NFS reply setattr (post-create): %d\n", status); 389 dprintk("NFS reply setattr (post-create): %d\n", status);
387 if (status != 0) 390 if (status != 0)
388 goto out; 391 goto out_release_acls;
389 } 392 }
390 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); 393
394 status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl);
395
396out_release_acls:
397 posix_acl_release(acl);
398 posix_acl_release(default_acl);
391out: 399out:
392 nfs3_free_createdata(data); 400 nfs3_free_createdata(data);
393 dprintk("NFS reply create: %d\n", status); 401 dprintk("NFS reply create: %d\n", status);
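The posix_acl_create() call that replaces the explicit current_umask() masking both computes the inherited ACLs and rewrites ia_mode in place; when the parent directory has no default ACL it falls back to applying the umask, so no separate masking line is needed. The calling convention, sketched against the v3.14-era helper:

	struct posix_acl *default_acl, *acl;
	int error;

	/* Applies dir's default ACL (or the umask when none) to ia_mode
	 * and hands back references the caller owns. */
	error = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
	if (error)
		return error;
	/* ... create the object with the adjusted mode, then push
	 * acl/default_acl to the server ... */
	posix_acl_release(acl);		/* NULL-safe */
	posix_acl_release(default_acl);

This is why create, mkdir and mknod below all gain the same out_release_acls unwind.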
@@ -572,18 +580,20 @@ out:
572static int 580static int
573nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) 581nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
574{ 582{
583 struct posix_acl *default_acl, *acl;
575 struct nfs3_createdata *data; 584 struct nfs3_createdata *data;
576 umode_t mode = sattr->ia_mode;
577 int status = -ENOMEM; 585 int status = -ENOMEM;
578 586
579 dprintk("NFS call mkdir %pd\n", dentry); 587 dprintk("NFS call mkdir %pd\n", dentry);
580 588
581 sattr->ia_mode &= ~current_umask();
582
583 data = nfs3_alloc_createdata(); 589 data = nfs3_alloc_createdata();
584 if (data == NULL) 590 if (data == NULL)
585 goto out; 591 goto out;
586 592
593 status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
594 if (status)
595 goto out;
596
587 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; 597 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
588 data->arg.mkdir.fh = NFS_FH(dir); 598 data->arg.mkdir.fh = NFS_FH(dir);
589 data->arg.mkdir.name = dentry->d_name.name; 599 data->arg.mkdir.name = dentry->d_name.name;
@@ -592,9 +602,13 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
592 602
593 status = nfs3_do_create(dir, dentry, data); 603 status = nfs3_do_create(dir, dentry, data);
594 if (status != 0) 604 if (status != 0)
595 goto out; 605 goto out_release_acls;
596 606
597 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); 607 status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl);
608
609out_release_acls:
610 posix_acl_release(acl);
611 posix_acl_release(default_acl);
598out: 612out:
599 nfs3_free_createdata(data); 613 nfs3_free_createdata(data);
600 dprintk("NFS reply mkdir: %d\n", status); 614 dprintk("NFS reply mkdir: %d\n", status);
@@ -691,19 +705,21 @@ static int
691nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 705nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
692 dev_t rdev) 706 dev_t rdev)
693{ 707{
708 struct posix_acl *default_acl, *acl;
694 struct nfs3_createdata *data; 709 struct nfs3_createdata *data;
695 umode_t mode = sattr->ia_mode;
696 int status = -ENOMEM; 710 int status = -ENOMEM;
697 711
698 dprintk("NFS call mknod %pd %u:%u\n", dentry, 712 dprintk("NFS call mknod %pd %u:%u\n", dentry,
699 MAJOR(rdev), MINOR(rdev)); 713 MAJOR(rdev), MINOR(rdev));
700 714
701 sattr->ia_mode &= ~current_umask();
702
703 data = nfs3_alloc_createdata(); 715 data = nfs3_alloc_createdata();
704 if (data == NULL) 716 if (data == NULL)
705 goto out; 717 goto out;
706 718
719 status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
720 if (status)
721 goto out;
722
707 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; 723 data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];
708 data->arg.mknod.fh = NFS_FH(dir); 724 data->arg.mknod.fh = NFS_FH(dir);
709 data->arg.mknod.name = dentry->d_name.name; 725 data->arg.mknod.name = dentry->d_name.name;
@@ -731,8 +747,13 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
731 747
732 status = nfs3_do_create(dir, dentry, data); 748 status = nfs3_do_create(dir, dentry, data);
733 if (status != 0) 749 if (status != 0)
734 goto out; 750 goto out_release_acls;
735 status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); 751
752 status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl);
753
754out_release_acls:
755 posix_acl_release(acl);
756 posix_acl_release(default_acl);
736out: 757out:
737 nfs3_free_createdata(data); 758 nfs3_free_createdata(data);
738 dprintk("NFS reply mknod: %d\n", status); 759 dprintk("NFS reply mknod: %d\n", status);
@@ -904,20 +925,28 @@ static const struct inode_operations nfs3_dir_inode_operations = {
904 .permission = nfs_permission, 925 .permission = nfs_permission,
905 .getattr = nfs_getattr, 926 .getattr = nfs_getattr,
906 .setattr = nfs_setattr, 927 .setattr = nfs_setattr,
907 .listxattr = nfs3_listxattr, 928#ifdef CONFIG_NFS_V3_ACL
908 .getxattr = nfs3_getxattr, 929 .listxattr = generic_listxattr,
909 .setxattr = nfs3_setxattr, 930 .getxattr = generic_getxattr,
910 .removexattr = nfs3_removexattr, 931 .setxattr = generic_setxattr,
932 .removexattr = generic_removexattr,
933 .get_acl = nfs3_get_acl,
934 .set_acl = nfs3_set_acl,
935#endif
911}; 936};
912 937
913static const struct inode_operations nfs3_file_inode_operations = { 938static const struct inode_operations nfs3_file_inode_operations = {
914 .permission = nfs_permission, 939 .permission = nfs_permission,
915 .getattr = nfs_getattr, 940 .getattr = nfs_getattr,
916 .setattr = nfs_setattr, 941 .setattr = nfs_setattr,
917 .listxattr = nfs3_listxattr, 942#ifdef CONFIG_NFS_V3_ACL
918 .getxattr = nfs3_getxattr, 943 .listxattr = generic_listxattr,
919 .setxattr = nfs3_setxattr, 944 .getxattr = generic_getxattr,
920 .removexattr = nfs3_removexattr, 945 .setxattr = generic_setxattr,
946 .removexattr = generic_removexattr,
947 .get_acl = nfs3_get_acl,
948 .set_acl = nfs3_set_acl,
949#endif
921}; 950};
922 951
923const struct nfs_rpc_ops nfs_v3_clientops = { 952const struct nfs_rpc_ops nfs_v3_clientops = {
@@ -965,7 +994,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
965 .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare, 994 .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,
966 .commit_done = nfs3_commit_done, 995 .commit_done = nfs3_commit_done,
967 .lock = nfs3_proc_lock, 996 .lock = nfs3_proc_lock,
968 .clear_acl_cache = nfs3_forget_cached_acls, 997 .clear_acl_cache = forget_all_cached_acls,
969 .close_context = nfs_close_context, 998 .close_context = nfs_close_context,
970 .have_delegation = nfs3_have_delegation, 999 .have_delegation = nfs3_have_delegation,
971 .return_delegation = nfs3_return_delegation, 1000 .return_delegation = nfs3_return_delegation,
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index cc471c725230..d6a98949af19 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -12,6 +12,9 @@ static struct nfs_subversion nfs_v3 = {
12 .rpc_vers = &nfs_version3, 12 .rpc_vers = &nfs_version3,
13 .rpc_ops = &nfs_v3_clientops, 13 .rpc_ops = &nfs_v3_clientops,
14 .sops = &nfs_sops, 14 .sops = &nfs_sops,
15#ifdef CONFIG_NFS_V3_ACL
16 .xattr = nfs3_xattr_handlers,
17#endif
15}; 18};
16 19
17static int __init init_nfs_v3(void) 20static int __init init_nfs_v3(void)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 5609edc742a0..a5b27c2d9689 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -270,6 +270,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
270extern int nfs41_setup_sequence(struct nfs4_session *session, 270extern int nfs41_setup_sequence(struct nfs4_session *session,
271 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 271 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
272 struct rpc_task *task); 272 struct rpc_task *task);
273extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *);
273extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); 274extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *);
274extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); 275extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);
275extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 276extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index b4a160a405ce..0e46d3d1b6cc 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -10,6 +10,7 @@
10#include <linux/sunrpc/auth.h> 10#include <linux/sunrpc/auth.h>
11#include <linux/sunrpc/xprt.h> 11#include <linux/sunrpc/xprt.h>
12#include <linux/sunrpc/bc_xprt.h> 12#include <linux/sunrpc/bc_xprt.h>
13#include <linux/sunrpc/rpc_pipe_fs.h>
13#include "internal.h" 14#include "internal.h"
14#include "callback.h" 15#include "callback.h"
15#include "delegation.h" 16#include "delegation.h"
@@ -169,7 +170,7 @@ void nfs41_shutdown_client(struct nfs_client *clp)
169void nfs40_shutdown_client(struct nfs_client *clp) 170void nfs40_shutdown_client(struct nfs_client *clp)
170{ 171{
171 if (clp->cl_slot_tbl) { 172 if (clp->cl_slot_tbl) {
172 nfs4_release_slot_table(clp->cl_slot_tbl); 173 nfs4_shutdown_slot_table(clp->cl_slot_tbl);
173 kfree(clp->cl_slot_tbl); 174 kfree(clp->cl_slot_tbl);
174 } 175 }
175} 176}
@@ -370,6 +371,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
370 __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); 371 __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
371 __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); 372 __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
372 __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); 373 __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
374
373 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); 375 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
374 if (error == -EINVAL) 376 if (error == -EINVAL)
375 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); 377 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
@@ -409,13 +411,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
409 error = nfs4_discover_server_trunking(clp, &old); 411 error = nfs4_discover_server_trunking(clp, &old);
410 if (error < 0) 412 if (error < 0)
411 goto error; 413 goto error;
412 nfs_put_client(clp);
413 if (clp != old) {
414 clp->cl_preserve_clid = true;
415 clp = old;
416 }
417 414
418 return clp; 415 if (clp != old)
416 clp->cl_preserve_clid = true;
417 nfs_put_client(clp);
418 return old;
419 419
420error: 420error:
421 nfs_mark_client_ready(clp, error); 421 nfs_mark_client_ready(clp, error);
@@ -493,9 +493,10 @@ int nfs40_walk_client_list(struct nfs_client *new,
493 prev = pos; 493 prev = pos;
494 494
495 status = nfs_wait_client_init_complete(pos); 495 status = nfs_wait_client_init_complete(pos);
496 spin_lock(&nn->nfs_client_lock);
497 if (status < 0) 496 if (status < 0)
498 continue; 497 goto out;
498 status = -NFS4ERR_STALE_CLIENTID;
499 spin_lock(&nn->nfs_client_lock);
499 } 500 }
500 if (pos->cl_cons_state != NFS_CS_READY) 501 if (pos->cl_cons_state != NFS_CS_READY)
501 continue; 502 continue;
@@ -633,7 +634,8 @@ int nfs41_walk_client_list(struct nfs_client *new,
633 } 634 }
634 spin_lock(&nn->nfs_client_lock); 635 spin_lock(&nn->nfs_client_lock);
635 if (status < 0) 636 if (status < 0)
636 continue; 637 break;
638 status = -NFS4ERR_STALE_CLIENTID;
637 } 639 }
638 if (pos->cl_cons_state != NFS_CS_READY) 640 if (pos->cl_cons_state != NFS_CS_READY)
639 continue; 641 continue;
@@ -1133,6 +1135,7 @@ static int nfs_probe_destination(struct nfs_server *server)
1133 * @hostname: new end-point's hostname 1135 * @hostname: new end-point's hostname
1134 * @sap: new end-point's socket address 1136 * @sap: new end-point's socket address
1135 * @salen: size of "sap" 1137 * @salen: size of "sap"
1138 * @net: net namespace
1136 * 1139 *
1137 * The nfs_server must be quiescent before this function is invoked. 1140 * The nfs_server must be quiescent before this function is invoked.
1138 * Either its session is drained (NFSv4.1+), or its transport is 1141 * Either its session is drained (NFSv4.1+), or its transport is
@@ -1141,13 +1144,13 @@ static int nfs_probe_destination(struct nfs_server *server)
1141 * Returns zero on success, or a negative errno value. 1144 * Returns zero on success, or a negative errno value.
1142 */ 1145 */
1143int nfs4_update_server(struct nfs_server *server, const char *hostname, 1146int nfs4_update_server(struct nfs_server *server, const char *hostname,
1144 struct sockaddr *sap, size_t salen) 1147 struct sockaddr *sap, size_t salen, struct net *net)
1145{ 1148{
1146 struct nfs_client *clp = server->nfs_client; 1149 struct nfs_client *clp = server->nfs_client;
1147 struct rpc_clnt *clnt = server->client; 1150 struct rpc_clnt *clnt = server->client;
1148 struct xprt_create xargs = { 1151 struct xprt_create xargs = {
1149 .ident = clp->cl_proto, 1152 .ident = clp->cl_proto,
1150 .net = &init_net, 1153 .net = net,
1151 .dstaddr = sap, 1154 .dstaddr = sap,
1152 .addrlen = salen, 1155 .addrlen = salen,
1153 .servername = hostname, 1156 .servername = hostname,
@@ -1187,7 +1190,7 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
1187 error = nfs4_set_client(server, hostname, sap, salen, buf, 1190 error = nfs4_set_client(server, hostname, sap, salen, buf,
1188 clp->cl_rpcclient->cl_auth->au_flavor, 1191 clp->cl_rpcclient->cl_auth->au_flavor,
1189 clp->cl_proto, clnt->cl_timeout, 1192 clp->cl_proto, clnt->cl_timeout,
1190 clp->cl_minorversion, clp->cl_net); 1193 clp->cl_minorversion, net);
1191 nfs_put_client(clp); 1194 nfs_put_client(clp);
1192 if (error != 0) { 1195 if (error != 0) {
1193 nfs_server_insert_lists(server); 1196 nfs_server_insert_lists(server);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index b86464ba25e1..b9a35c05b60f 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -91,10 +91,10 @@ static void filelayout_reset_write(struct nfs_write_data *data)
91 91
92 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 92 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
93 dprintk("%s Reset task %5u for i/o through MDS " 93 dprintk("%s Reset task %5u for i/o through MDS "
94 "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, 94 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
95 data->task.tk_pid, 95 data->task.tk_pid,
96 hdr->inode->i_sb->s_id, 96 hdr->inode->i_sb->s_id,
97 (long long)NFS_FILEID(hdr->inode), 97 (unsigned long long)NFS_FILEID(hdr->inode),
98 data->args.count, 98 data->args.count,
99 (unsigned long long)data->args.offset); 99 (unsigned long long)data->args.offset);
100 100
@@ -112,10 +112,10 @@ static void filelayout_reset_read(struct nfs_read_data *data)
112 112
113 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 113 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
114 dprintk("%s Reset task %5u for i/o through MDS " 114 dprintk("%s Reset task %5u for i/o through MDS "
115 "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, 115 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
116 data->task.tk_pid, 116 data->task.tk_pid,
117 hdr->inode->i_sb->s_id, 117 hdr->inode->i_sb->s_id,
118 (long long)NFS_FILEID(hdr->inode), 118 (unsigned long long)NFS_FILEID(hdr->inode),
119 data->args.count, 119 data->args.count,
120 (unsigned long long)data->args.offset); 120 (unsigned long long)data->args.offset);
121 121
@@ -324,8 +324,9 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
324 &rdata->res.seq_res, 324 &rdata->res.seq_res,
325 task)) 325 task))
326 return; 326 return;
327 nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, 327 if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
328 rdata->args.lock_context, FMODE_READ); 328 rdata->args.lock_context, FMODE_READ) == -EIO)
329 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
329} 330}
330 331
331static void filelayout_read_call_done(struct rpc_task *task, void *data) 332static void filelayout_read_call_done(struct rpc_task *task, void *data)
@@ -335,8 +336,10 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
335 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); 336 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
336 337
337 if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && 338 if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) &&
338 task->tk_status == 0) 339 task->tk_status == 0) {
340 nfs41_sequence_done(task, &rdata->res.seq_res);
339 return; 341 return;
342 }
340 343
341 /* Note this may cause RPC to be resent */ 344 /* Note this may cause RPC to be resent */
342 rdata->header->mds_ops->rpc_call_done(task, data); 345 rdata->header->mds_ops->rpc_call_done(task, data);
@@ -433,8 +436,9 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
433 &wdata->res.seq_res, 436 &wdata->res.seq_res,
434 task)) 437 task))
435 return; 438 return;
436 nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, 439 if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
437 wdata->args.lock_context, FMODE_WRITE); 440 wdata->args.lock_context, FMODE_WRITE) == -EIO)
441 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
438} 442}
439 443
440static void filelayout_write_call_done(struct rpc_task *task, void *data) 444static void filelayout_write_call_done(struct rpc_task *task, void *data)
@@ -442,8 +446,10 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data)
442 struct nfs_write_data *wdata = data; 446 struct nfs_write_data *wdata = data;
443 447
444 if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && 448 if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
445 task->tk_status == 0) 449 task->tk_status == 0) {
450 nfs41_sequence_done(task, &wdata->res.seq_res);
446 return; 451 return;
452 }
447 453
448 /* Note this may cause RPC to be resent */ 454 /* Note this may cause RPC to be resent */
449 wdata->header->mds_ops->rpc_call_done(task, data); 455 wdata->header->mds_ops->rpc_call_done(task, data);
@@ -1216,17 +1222,17 @@ static void filelayout_recover_commit_reqs(struct list_head *dst,
1216 struct pnfs_commit_bucket *b; 1222 struct pnfs_commit_bucket *b;
1217 int i; 1223 int i;
1218 1224
1219 /* NOTE cinfo->lock is NOT held, relying on fact that this is 1225 spin_lock(cinfo->lock);
1220 * only called on single thread per dreq.
1221 * Can't take the lock because need to do pnfs_put_lseg
1222 */
1223 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { 1226 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1224 if (transfer_commit_list(&b->written, dst, cinfo, 0)) { 1227 if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
1228 spin_unlock(cinfo->lock);
1225 pnfs_put_lseg(b->wlseg); 1229 pnfs_put_lseg(b->wlseg);
1226 b->wlseg = NULL; 1230 b->wlseg = NULL;
1231 spin_lock(cinfo->lock);
1227 } 1232 }
1228 } 1233 }
1229 cinfo->ds->nwritten = 0; 1234 cinfo->ds->nwritten = 0;
1235 spin_unlock(cinfo->lock);
1230} 1236}
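The deleted comment relied on single-threaded callers to justify skipping cinfo->lock entirely; the new code holds the lock while scanning the buckets and drops it only around pnfs_put_lseg(), which may release the last reference. The general shape of that idiom, sketched with illustrative names:

	spin_lock(lock);
	for (i = 0; i < nbuckets; i++) {
		if (bucket_drained(&buckets[i])) {
			spin_unlock(lock);
			put_segment(buckets[i].seg);	/* may sleep or free */
			buckets[i].seg = NULL;
			spin_lock(lock);
		}
	}
	spin_unlock(lock);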
1231 1237
1232static unsigned int 1238static unsigned int
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index c7c295e556ed..efac602edb37 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -95,7 +95,7 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
95 b6 = (struct sockaddr_in6 *)addr2; 95 b6 = (struct sockaddr_in6 *)addr2;
96 96
97 /* LINKLOCAL addresses must have matching scope_id */ 97 /* LINKLOCAL addresses must have matching scope_id */
98 if (ipv6_addr_scope(&a6->sin6_addr) == 98 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
99 IPV6_ADDR_SCOPE_LINKLOCAL && 99 IPV6_ADDR_SCOPE_LINKLOCAL &&
100 a6->sin6_scope_id != b6->sin6_scope_id) 100 a6->sin6_scope_id != b6->sin6_scope_id)
101 return false; 101 return false;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 4e7f05d3e9db..3d5dbf80d46a 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -121,9 +121,8 @@ static int nfs4_validate_fspath(struct dentry *dentry,
121} 121}
122 122
123static size_t nfs_parse_server_name(char *string, size_t len, 123static size_t nfs_parse_server_name(char *string, size_t len,
124 struct sockaddr *sa, size_t salen, struct nfs_server *server) 124 struct sockaddr *sa, size_t salen, struct net *net)
125{ 125{
126 struct net *net = rpc_net_ns(server->client);
127 ssize_t ret; 126 ssize_t ret;
128 127
129 ret = rpc_pton(net, string, len, sa, salen); 128 ret = rpc_pton(net, string, len, sa, salen);
@@ -223,6 +222,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
223 const struct nfs4_fs_location *location) 222 const struct nfs4_fs_location *location)
224{ 223{
225 const size_t addr_bufsize = sizeof(struct sockaddr_storage); 224 const size_t addr_bufsize = sizeof(struct sockaddr_storage);
225 struct net *net = rpc_net_ns(NFS_SB(mountdata->sb)->client);
226 struct vfsmount *mnt = ERR_PTR(-ENOENT); 226 struct vfsmount *mnt = ERR_PTR(-ENOENT);
227 char *mnt_path; 227 char *mnt_path;
228 unsigned int maxbuflen; 228 unsigned int maxbuflen;
@@ -248,8 +248,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
248 continue; 248 continue;
249 249
250 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, 250 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
251 mountdata->addr, addr_bufsize, 251 mountdata->addr, addr_bufsize, net);
252 NFS_SB(mountdata->sb));
253 if (mountdata->addrlen == 0) 252 if (mountdata->addrlen == 0)
254 continue; 253 continue;
255 254
@@ -419,6 +418,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server,
419 const struct nfs4_fs_location *location) 418 const struct nfs4_fs_location *location)
420{ 419{
421 const size_t addr_bufsize = sizeof(struct sockaddr_storage); 420 const size_t addr_bufsize = sizeof(struct sockaddr_storage);
421 struct net *net = rpc_net_ns(server->client);
422 struct sockaddr *sap; 422 struct sockaddr *sap;
423 unsigned int s; 423 unsigned int s;
424 size_t salen; 424 size_t salen;
@@ -440,7 +440,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server,
440 continue; 440 continue;
441 441
442 salen = nfs_parse_server_name(buf->data, buf->len, 442 salen = nfs_parse_server_name(buf->data, buf->len,
443 sap, addr_bufsize, server); 443 sap, addr_bufsize, net);
444 if (salen == 0) 444 if (salen == 0)
445 continue; 445 continue;
446 rpc_set_port(sap, NFS_PORT); 446 rpc_set_port(sap, NFS_PORT);
@@ -450,7 +450,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server,
450 if (hostname == NULL) 450 if (hostname == NULL)
451 break; 451 break;
452 452
453 error = nfs4_update_server(server, hostname, sap, salen); 453 error = nfs4_update_server(server, hostname, sap, salen, net);
454 kfree(hostname); 454 kfree(hostname);
455 if (error == 0) 455 if (error == 0)
456 break; 456 break;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 15052b81df42..450bfedbe2f4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -539,7 +539,7 @@ static int nfs40_sequence_done(struct rpc_task *task,
539 struct nfs4_slot *slot = res->sr_slot; 539 struct nfs4_slot *slot = res->sr_slot;
540 struct nfs4_slot_table *tbl; 540 struct nfs4_slot_table *tbl;
541 541
542 if (!RPC_WAS_SENT(task)) 542 if (slot == NULL)
543 goto out; 543 goto out;
544 544
545 tbl = slot->table; 545 tbl = slot->table;
@@ -559,15 +559,10 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
559{ 559{
560 struct nfs4_session *session; 560 struct nfs4_session *session;
561 struct nfs4_slot_table *tbl; 561 struct nfs4_slot_table *tbl;
562 struct nfs4_slot *slot = res->sr_slot;
562 bool send_new_highest_used_slotid = false; 563 bool send_new_highest_used_slotid = false;
563 564
564 if (!res->sr_slot) { 565 tbl = slot->table;
565 /* just wake up the next guy waiting since
566 * we may have not consumed a slot after all */
567 dprintk("%s: No slot\n", __func__);
568 return;
569 }
570 tbl = res->sr_slot->table;
571 session = tbl->session; 566 session = tbl->session;
572 567
573 spin_lock(&tbl->slot_tbl_lock); 568 spin_lock(&tbl->slot_tbl_lock);
@@ -577,11 +572,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
577 if (tbl->highest_used_slotid > tbl->target_highest_slotid) 572 if (tbl->highest_used_slotid > tbl->target_highest_slotid)
578 send_new_highest_used_slotid = true; 573 send_new_highest_used_slotid = true;
579 574
580 if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) { 575 if (nfs41_wake_and_assign_slot(tbl, slot)) {
581 send_new_highest_used_slotid = false; 576 send_new_highest_used_slotid = false;
582 goto out_unlock; 577 goto out_unlock;
583 } 578 }
584 nfs4_free_slot(tbl, res->sr_slot); 579 nfs4_free_slot(tbl, slot);
585 580
586 if (tbl->highest_used_slotid != NFS4_NO_SLOT) 581 if (tbl->highest_used_slotid != NFS4_NO_SLOT)
587 send_new_highest_used_slotid = false; 582 send_new_highest_used_slotid = false;
@@ -592,19 +587,20 @@ out_unlock:
592 nfs41_server_notify_highest_slotid_update(session->clp); 587 nfs41_server_notify_highest_slotid_update(session->clp);
593} 588}
594 589
595static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) 590int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
596{ 591{
597 struct nfs4_session *session; 592 struct nfs4_session *session;
598 struct nfs4_slot *slot; 593 struct nfs4_slot *slot = res->sr_slot;
599 struct nfs_client *clp; 594 struct nfs_client *clp;
600 bool interrupted = false; 595 bool interrupted = false;
601 int ret = 1; 596 int ret = 1;
602 597
598 if (slot == NULL)
599 goto out_noaction;
603 /* don't increment the sequence number if the task wasn't sent */ 600 /* don't increment the sequence number if the task wasn't sent */
604 if (!RPC_WAS_SENT(task)) 601 if (!RPC_WAS_SENT(task))
605 goto out; 602 goto out;
606 603
607 slot = res->sr_slot;
608 session = slot->table->session; 604 session = slot->table->session;
609 605
610 if (slot->interrupted) { 606 if (slot->interrupted) {
@@ -679,6 +675,7 @@ out:
679 /* The session may be reset by one of the error handlers. */ 675 /* The session may be reset by one of the error handlers. */
680 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); 676 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
681 nfs41_sequence_free_slot(res); 677 nfs41_sequence_free_slot(res);
678out_noaction:
682 return ret; 679 return ret;
683retry_nowait: 680retry_nowait:
684 if (rpc_restart_call_prepare(task)) { 681 if (rpc_restart_call_prepare(task)) {
@@ -692,6 +689,7 @@ out_retry:
692 rpc_delay(task, NFS4_POLL_RETRY_MAX); 689 rpc_delay(task, NFS4_POLL_RETRY_MAX);
693 return 0; 690 return 0;
694} 691}
692EXPORT_SYMBOL_GPL(nfs41_sequence_done);
695 693
696static int nfs4_sequence_done(struct rpc_task *task, 694static int nfs4_sequence_done(struct rpc_task *task,
697 struct nfs4_sequence_res *res) 695 struct nfs4_sequence_res *res)
@@ -1622,15 +1620,15 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
1622{ 1620{
1623 struct nfs4_opendata *data = calldata; 1621 struct nfs4_opendata *data = calldata;
1624 1622
1625 nfs40_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, 1623 nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args,
1626 &data->o_res.seq_res, task); 1624 &data->c_res.seq_res, task);
1627} 1625}
1628 1626
1629static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) 1627static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
1630{ 1628{
1631 struct nfs4_opendata *data = calldata; 1629 struct nfs4_opendata *data = calldata;
1632 1630
1633 nfs40_sequence_done(task, &data->o_res.seq_res); 1631 nfs40_sequence_done(task, &data->c_res.seq_res);
1634 1632
1635 data->rpc_status = task->tk_status; 1633 data->rpc_status = task->tk_status;
1636 if (data->rpc_status == 0) { 1634 if (data->rpc_status == 0) {
@@ -1688,7 +1686,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
1688 }; 1686 };
1689 int status; 1687 int status;
1690 1688
1691 nfs4_init_sequence(&data->o_arg.seq_args, &data->o_res.seq_res, 1); 1689 nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1);
1692 kref_get(&data->kref); 1690 kref_get(&data->kref);
1693 data->rpc_done = 0; 1691 data->rpc_done = 0;
1694 data->rpc_status = 0; 1692 data->rpc_status = 0;
@@ -2400,13 +2398,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
2400 2398
2401 if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { 2399 if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) {
2402 /* Use that stateid */ 2400 /* Use that stateid */
2403 } else if (truncate && state != NULL && nfs4_valid_open_stateid(state)) { 2401 } else if (truncate && state != NULL) {
2404 struct nfs_lockowner lockowner = { 2402 struct nfs_lockowner lockowner = {
2405 .l_owner = current->files, 2403 .l_owner = current->files,
2406 .l_pid = current->tgid, 2404 .l_pid = current->tgid,
2407 }; 2405 };
2408 nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, 2406 if (!nfs4_valid_open_stateid(state))
2409 &lockowner); 2407 return -EBADF;
2408 if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
2409 &lockowner) == -EIO)
2410 return -EBADF;
2410 } else 2411 } else
2411 nfs4_stateid_copy(&arg.stateid, &zero_stateid); 2412 nfs4_stateid_copy(&arg.stateid, &zero_stateid);
2412 2413
@@ -2744,7 +2745,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
2744 NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| 2745 NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
2745 NFS_CAP_CTIME|NFS_CAP_MTIME| 2746 NFS_CAP_CTIME|NFS_CAP_MTIME|
2746 NFS_CAP_SECURITY_LABEL); 2747 NFS_CAP_SECURITY_LABEL);
2747 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) 2748 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
2749 res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
2748 server->caps |= NFS_CAP_ACLS; 2750 server->caps |= NFS_CAP_ACLS;
2749 if (res.has_links != 0) 2751 if (res.has_links != 0)
2750 server->caps |= NFS_CAP_HARDLINKS; 2752 server->caps |= NFS_CAP_HARDLINKS;
@@ -4012,8 +4014,9 @@ static bool nfs4_stateid_is_current(nfs4_stateid *stateid,
4012{ 4014{
4013 nfs4_stateid current_stateid; 4015 nfs4_stateid current_stateid;
4014 4016
4015 if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode)) 4017 /* If the current stateid represents a lost lock, then exit */
4016 return false; 4018 if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode) == -EIO)
4019 return true;
4017 return nfs4_stateid_match(stateid, &current_stateid); 4020 return nfs4_stateid_match(stateid, &current_stateid);
4018} 4021}
4019 4022
@@ -4321,9 +4324,7 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
4321 4324
4322static inline int nfs4_server_supports_acls(struct nfs_server *server) 4325static inline int nfs4_server_supports_acls(struct nfs_server *server)
4323{ 4326{
4324 return (server->caps & NFS_CAP_ACLS) 4327 return server->caps & NFS_CAP_ACLS;
4325 && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
4326 && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL);
4327} 4328}
4328 4329
4329/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that 4330/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that
@@ -5831,8 +5832,7 @@ struct nfs_release_lockowner_data {
5831 struct nfs4_lock_state *lsp; 5832 struct nfs4_lock_state *lsp;
5832 struct nfs_server *server; 5833 struct nfs_server *server;
5833 struct nfs_release_lockowner_args args; 5834 struct nfs_release_lockowner_args args;
5834 struct nfs4_sequence_args seq_args; 5835 struct nfs_release_lockowner_res res;
5835 struct nfs4_sequence_res seq_res;
5836 unsigned long timestamp; 5836 unsigned long timestamp;
5837}; 5837};
5838 5838
@@ -5840,7 +5840,7 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata
5840{ 5840{
5841 struct nfs_release_lockowner_data *data = calldata; 5841 struct nfs_release_lockowner_data *data = calldata;
5842 nfs40_setup_sequence(data->server, 5842 nfs40_setup_sequence(data->server,
5843 &data->seq_args, &data->seq_res, task); 5843 &data->args.seq_args, &data->res.seq_res, task);
5844 data->timestamp = jiffies; 5844 data->timestamp = jiffies;
5845} 5845}
5846 5846
@@ -5849,7 +5849,7 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
5849 struct nfs_release_lockowner_data *data = calldata; 5849 struct nfs_release_lockowner_data *data = calldata;
5850 struct nfs_server *server = data->server; 5850 struct nfs_server *server = data->server;
5851 5851
5852 nfs40_sequence_done(task, &data->seq_res); 5852 nfs40_sequence_done(task, &data->res.seq_res);
5853 5853
5854 switch (task->tk_status) { 5854 switch (task->tk_status) {
5855 case 0: 5855 case 0:
@@ -5890,7 +5890,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
5890 data = kmalloc(sizeof(*data), GFP_NOFS); 5890 data = kmalloc(sizeof(*data), GFP_NOFS);
5891 if (!data) 5891 if (!data)
5892 return -ENOMEM; 5892 return -ENOMEM;
5893 nfs4_init_sequence(&data->seq_args, &data->seq_res, 0);
5894 data->lsp = lsp; 5893 data->lsp = lsp;
5895 data->server = server; 5894 data->server = server;
5896 data->args.lock_owner.clientid = server->nfs_client->cl_clientid; 5895 data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
@@ -5898,6 +5897,8 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
5898 data->args.lock_owner.s_dev = server->s_dev; 5897 data->args.lock_owner.s_dev = server->s_dev;
5899 5898
5900 msg.rpc_argp = &data->args; 5899 msg.rpc_argp = &data->args;
5900 msg.rpc_resp = &data->res;
5901 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
5901 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); 5902 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
5902 return 0; 5903 return 0;
5903} 5904}
@@ -7409,9 +7410,9 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7409 struct nfs_server *server = NFS_SERVER(inode); 7410 struct nfs_server *server = NFS_SERVER(inode);
7410 struct pnfs_layout_hdr *lo; 7411 struct pnfs_layout_hdr *lo;
7411 struct nfs4_state *state = NULL; 7412 struct nfs4_state *state = NULL;
7412 unsigned long timeo, giveup; 7413 unsigned long timeo, now, giveup;
7413 7414
7414 dprintk("--> %s\n", __func__); 7415 dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
7415 7416
7416 if (!nfs41_sequence_done(task, &lgp->res.seq_res)) 7417 if (!nfs41_sequence_done(task, &lgp->res.seq_res))
7417 goto out; 7418 goto out;
@@ -7419,12 +7420,38 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7419 switch (task->tk_status) { 7420 switch (task->tk_status) {
7420 case 0: 7421 case 0:
7421 goto out; 7422 goto out;
7423 /*
7424 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
7425 * (or clients) writing to the same RAID stripe
7426 */
7422 case -NFS4ERR_LAYOUTTRYLATER: 7427 case -NFS4ERR_LAYOUTTRYLATER:
7428 /*
 7429 * NFS4ERR_RECALLCONFLICT is a conflict with ourselves (we must recall
 7430 * an existing layout before getting a new one).
7431 */
7423 case -NFS4ERR_RECALLCONFLICT: 7432 case -NFS4ERR_RECALLCONFLICT:
7424 timeo = rpc_get_timeout(task->tk_client); 7433 timeo = rpc_get_timeout(task->tk_client);
7425 giveup = lgp->args.timestamp + timeo; 7434 giveup = lgp->args.timestamp + timeo;
7426 if (time_after(giveup, jiffies)) 7435 now = jiffies;
7427 task->tk_status = -NFS4ERR_DELAY; 7436 if (time_after(giveup, now)) {
7437 unsigned long delay;
7438
7439 /* Delay for:
 7440 * - not less than NFS4_POLL_RETRY_MIN,
 7441 * - at most until one jiffy before we give up,
 7442 * - otherwise exponential backoff (time_now minus start_attempt).
7443 */
7444 delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN,
7445 min((giveup - now - 1),
7446 now - lgp->args.timestamp));
7447
7448 dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
7449 __func__, delay);
7450 rpc_delay(task, delay);
7451 task->tk_status = 0;
7452 rpc_restart_call_prepare(task);
7453 goto out; /* Do not call nfs4_async_handle_error() */
7454 }
7428 break; 7455 break;
7429 case -NFS4ERR_EXPIRED: 7456 case -NFS4ERR_EXPIRED:
7430 case -NFS4ERR_BAD_STATEID: 7457 case -NFS4ERR_BAD_STATEID:
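To make the clamped backoff concrete, a worked example (assumptions: HZ = 1000, so NFS4_POLL_RETRY_MIN = HZ/10 = 100 jiffies; rpc_get_timeout() returns 90 * HZ; T = lgp->args.timestamp, so giveup = T + 90000):

	/* delay = max(100, min(giveup - now - 1, now - T)) */
	now = T +    50:  min(89949,    50) =    50  ->  delay =   100 (floor)
	now = T +  4000:  min(85999,  4000) =  4000  ->  delay =  4000
	now = T + 40000:  min(49999, 40000) = 40000  ->  delay = 40000
	now = T + 89950:  min(   49, 89950) =    49  ->  delay =   100; that
			  wakeup lands past giveup, time_after() fails, and
			  the error is handled instead of retried.

Because each retry sleeps roughly as long as the attempt has already run, the interval doubles per round until the giveup cap bites.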
@@ -7780,10 +7807,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
7780 case -NFS4ERR_BADLAYOUT: /* no layout */ 7807 case -NFS4ERR_BADLAYOUT: /* no layout */
7781 case -NFS4ERR_GRACE: /* loca_reclaim always false */ 7808 case -NFS4ERR_GRACE: /* loca_reclaim always false */
7782 task->tk_status = 0; 7809 task->tk_status = 0;
7783 break;
7784 case 0: 7810 case 0:
7785 nfs_post_op_update_inode_force_wcc(data->args.inode,
7786 data->res.fattr);
7787 break; 7811 break;
7788 default: 7812 default:
7789 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { 7813 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
@@ -7798,6 +7822,8 @@ static void nfs4_layoutcommit_release(void *calldata)
 	struct nfs4_layoutcommit_data *data = calldata;
 
 	pnfs_cleanup_layoutcommit(data);
+	nfs_post_op_update_inode_force_wcc(data->args.inode,
+					   data->res.fattr);
 	put_rpccred(data->cred);
 	kfree(data);
 }
@@ -7920,7 +7946,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
 	switch (err) {
 	case 0:
 	case -NFS4ERR_WRONGSEC:
-	case -NFS4ERR_NOTSUPP:
+	case -ENOTSUPP:
 		goto out;
 	default:
 		err = nfs4_handle_exception(server, err, &exception);
@@ -7954,7 +7980,7 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 	 * Fall back on "guess and check" method if
 	 * the server doesn't support SECINFO_NO_NAME
 	 */
-	if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) {
+	if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) {
 		err = nfs4_find_root_sec(server, fhandle, info);
 		goto out_freepage;
 	}
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index cf883c7ae053..e799dc3c3b1d 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -231,14 +231,23 @@ out:
 	return ret;
 }
 
+/*
+ * nfs4_release_slot_table - release all slot table entries
+ */
+static void nfs4_release_slot_table(struct nfs4_slot_table *tbl)
+{
+	nfs4_shrink_slot_table(tbl, 0);
+}
+
 /**
- * nfs4_release_slot_table - release resources attached to a slot table
+ * nfs4_shutdown_slot_table - release resources attached to a slot table
  * @tbl: slot table to shut down
  *
  */
-void nfs4_release_slot_table(struct nfs4_slot_table *tbl)
+void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl)
 {
-	nfs4_shrink_slot_table(tbl, 0);
+	nfs4_release_slot_table(tbl);
+	rpc_destroy_wait_queue(&tbl->slot_tbl_waitq);
 }
 
 /**
@@ -422,7 +431,7 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
 	spin_unlock(&tbl->slot_tbl_lock);
 }
 
-static void nfs4_destroy_session_slot_tables(struct nfs4_session *session)
+static void nfs4_release_session_slot_tables(struct nfs4_session *session)
 {
 	nfs4_release_slot_table(&session->fc_slot_table);
 	nfs4_release_slot_table(&session->bc_slot_table);
@@ -450,7 +459,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
 	if (status && tbl->slots == NULL)
 		/* Fore and back channel share a connection so get
 		 * both slot tables or neither */
-		nfs4_destroy_session_slot_tables(ses);
+		nfs4_release_session_slot_tables(ses);
 	return status;
 }
 
@@ -470,6 +479,12 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
 	return session;
 }
 
+static void nfs4_destroy_session_slot_tables(struct nfs4_session *session)
+{
+	nfs4_shutdown_slot_table(&session->fc_slot_table);
+	nfs4_shutdown_slot_table(&session->bc_slot_table);
+}
+
 void nfs4_destroy_session(struct nfs4_session *session)
 {
 	struct rpc_xprt *xprt;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 232306100651..b34ada9bc6a2 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -74,7 +74,7 @@ enum nfs4_session_state {
 
 extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
 		unsigned int max_reqs, const char *queue);
-extern void nfs4_release_slot_table(struct nfs4_slot_table *tbl);
+extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
 extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
 extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
 extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 059c01b67a71..0deb32105ccf 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -974,9 +974,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
 	else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
 		nfs4_stateid_copy(dst, &lsp->ls_stateid);
 		ret = 0;
-		smp_rmb();
-		if (!list_empty(&lsp->ls_seqid.list))
-			ret = -EWOULDBLOCK;
 	}
 	spin_unlock(&state->state_lock);
 	nfs4_put_lock_state(lsp);
@@ -984,10 +981,9 @@ out:
 	return ret;
 }
 
-static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
+static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
 {
 	const nfs4_stateid *src;
-	int ret;
 	int seq;
 
 	do {
@@ -996,12 +992,7 @@ static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
 		if (test_bit(NFS_OPEN_STATE, &state->flags))
 			src = &state->open_stateid;
 		nfs4_stateid_copy(dst, src);
-		ret = 0;
-		smp_rmb();
-		if (!list_empty(&state->owner->so_seqid.list))
-			ret = -EWOULDBLOCK;
 	} while (read_seqretry(&state->seqlock, seq));
-	return ret;
 }
 
 /*
@@ -1015,15 +1006,19 @@ int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
 	if (ret == -EIO)
 		/* A lost lock - don't even consider delegations */
 		goto out;
-	if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
+	/* returns true if delegation stateid found and copied */
+	if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) {
+		ret = 0;
 		goto out;
+	}
 	if (ret != -ENOENT)
 		/* nfs4_copy_delegation_stateid() didn't over-write
 		 * dst, so it still has the lock stateid which we now
 		 * choose to use.
 		 */
 		goto out;
-	ret = nfs4_copy_open_stateid(dst, state);
+	nfs4_copy_open_stateid(dst, state);
+	ret = 0;
 out:
 	if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41))
 		dst->seqid = 0;
@@ -1071,7 +1066,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
 /*
  * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or
  * failed with a seqid incrementing error -
- * see comments nfs_fs.h:seqid_mutating_error()
+ * see comments nfs4.h:seqid_mutating_error()
  */
 static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
 {
@@ -1116,7 +1111,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
 /*
  * Increment the seqid if the LOCK/LOCKU succeeded, or
  * failed with a seqid incrementing error -
- * see comments nfs_fs.h:seqid_mutating_error()
+ * see comments nfs4.h:seqid_mutating_error()
  */
 void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
 {
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 65ab0a0ca1c4..808f29574412 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -77,17 +77,9 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int ret = nfs_write_inode(inode, wbc);
 
-	if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) {
-		int status;
-		bool sync = true;
-
-		if (wbc->sync_mode == WB_SYNC_NONE)
-			sync = false;
-
-		status = pnfs_layoutcommit_inode(inode, sync);
-		if (status < 0)
-			return status;
-	}
+	if (ret == 0)
+		ret = pnfs_layoutcommit_inode(inode,
+				wbc->sync_mode == WB_SYNC_ALL);
 	return ret;
 }
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5be2868c02f1..72f3bf1754ef 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3097,7 +3097,8 @@ out_overflow:
 	return -EIO;
 }
 
-static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
+		int *nfs_retval)
 {
 	__be32 *p;
 	uint32_t opnum;
@@ -3107,19 +3108,32 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	if (unlikely(!p))
 		goto out_overflow;
 	opnum = be32_to_cpup(p++);
-	if (opnum != expected) {
-		dprintk("nfs: Server returned operation"
-			" %d but we issued a request for %d\n",
-				opnum, expected);
-		return -EIO;
-	}
+	if (unlikely(opnum != expected))
+		goto out_bad_operation;
 	nfserr = be32_to_cpup(p);
-	if (nfserr != NFS_OK)
-		return nfs4_stat_to_errno(nfserr);
-	return 0;
+	if (nfserr == NFS_OK)
+		*nfs_retval = 0;
+	else
+		*nfs_retval = nfs4_stat_to_errno(nfserr);
+	return true;
+out_bad_operation:
+	dprintk("nfs: Server returned operation"
+		" %d but we issued a request for %d\n",
+			opnum, expected);
+	*nfs_retval = -EREMOTEIO;
+	return false;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
-	return -EIO;
+	*nfs_retval = -EIO;
+	return false;
+}
+
+static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+{
+	int retval;
+
+	__decode_op_hdr(xdr, expected, &retval);
+	return retval;
 }
 
 /* Dummy routine */
@@ -3435,7 +3449,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 {
 	__be32 *p;
 
-	*res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL;
+	*res = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
 		return -EIO;
 	if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
@@ -5001,11 +5015,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	uint32_t savewords, bmlen, i;
 	int status;
 
-	status = decode_op_hdr(xdr, OP_OPEN);
-	if (status != -EIO)
-		nfs_increment_open_seqid(status, res->seqid);
-	if (!status)
-		status = decode_stateid(xdr, &res->stateid);
+	if (!__decode_op_hdr(xdr, OP_OPEN, &status))
+		return status;
+	nfs_increment_open_seqid(status, res->seqid);
+	if (status)
+		return status;
+	status = decode_stateid(xdr, &res->stateid);
 	if (unlikely(status))
 		return status;
 
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 89fe741e58b1..59f838cdc009 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -36,6 +36,7 @@
 	__print_flags(v, "|", \
 			{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
 			{ 1 << NFS_INO_STALE, "STALE" }, \
+			{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
 			{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
 			{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
 			{ 1 << NFS_INO_COMMIT, "COMMIT" }, \
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d75d938d36cb..4755858e37a0 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1790,6 +1790,15 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
 
+static void pnfs_clear_layoutcommitting(struct inode *inode)
+{
+	unsigned long *bitlock = &NFS_I(inode)->flags;
+
+	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
+	smp_mb__after_clear_bit();
+	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+}
+
 /*
  * There can be multiple RW segments.
  */
@@ -1807,7 +1816,6 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
 static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
 {
 	struct pnfs_layout_segment *lseg, *tmp;
-	unsigned long *bitlock = &NFS_I(inode)->flags;
 
 	/* Matched by references in pnfs_set_layoutcommit */
 	list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
@@ -1815,9 +1823,7 @@ static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *lis
 		pnfs_put_lseg(lseg);
 	}
 
-	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
-	smp_mb__after_clear_bit();
-	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+	pnfs_clear_layoutcommitting(inode);
 }
 
 void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
@@ -1881,43 +1887,37 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	struct nfs4_layoutcommit_data *data;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t end_pos;
-	int status = 0;
+	int status;
 
-	dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
-
-	if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+	if (!pnfs_layoutcommit_outstanding(inode))
 		return 0;
 
-	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
-	data = kzalloc(sizeof(*data), GFP_NOFS);
-	if (!data) {
-		status = -ENOMEM;
-		goto out;
-	}
-
-	if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
-		goto out_free;
+	dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
 
+	status = -EAGAIN;
 	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
-		if (!sync) {
-			status = -EAGAIN;
-			goto out_free;
-		}
-		status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
-					nfs_wait_bit_killable, TASK_KILLABLE);
+		if (!sync)
+			goto out;
+		status = wait_on_bit_lock(&nfsi->flags,
+				NFS_INO_LAYOUTCOMMITTING,
+				nfs_wait_bit_killable,
+				TASK_KILLABLE);
 		if (status)
-			goto out_free;
+			goto out;
 	}
 
-	INIT_LIST_HEAD(&data->lseg_list);
+	status = -ENOMEM;
+	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
+	data = kzalloc(sizeof(*data), GFP_NOFS);
+	if (!data)
+		goto clear_layoutcommitting;
+
+	status = 0;
 	spin_lock(&inode->i_lock);
-	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
-		clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
-		spin_unlock(&inode->i_lock);
-		wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
-		goto out_free;
-	}
+	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+		goto out_unlock;
 
+	INIT_LIST_HEAD(&data->lseg_list);
 	pnfs_list_write_lseg(inode, &data->lseg_list);
 
 	end_pos = nfsi->layout->plh_lwb;
@@ -1940,8 +1940,11 @@ out:
 	mark_inode_dirty_sync(inode);
 	dprintk("<-- %s status %d\n", __func__, status);
 	return status;
-out_free:
+out_unlock:
+	spin_unlock(&inode->i_lock);
 	kfree(data);
+clear_layoutcommitting:
+	pnfs_clear_layoutcommitting(inode);
 	goto out;
 }
 
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index a4f41810a7f4..023793909778 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -359,6 +359,15 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
 		PNFS_LAYOUTRET_ON_SETATTR;
 }
 
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 ||
+		test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0;
+}
+
 static inline int pnfs_return_layout(struct inode *ino)
 {
 	struct nfs_inode *nfsi = NFS_I(ino);
@@ -515,6 +524,13 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
 	return false;
 }
 
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+	return false;
+}
+
+
 static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
 {
 	return NULL;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 31db5c366b81..411aedda14bb 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -163,9 +163,9 @@ static void nfs_readpage_release(struct nfs_page *req)
 
 	unlock_page(req->wb_page);
 
-	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
+	dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
 		req->wb_context->dentry->d_inode->i_sb->s_id,
-		(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+		(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
 		req->wb_bytes,
 		(long long)req_offset(req));
 	nfs_release_request(req);
@@ -228,11 +228,11 @@ int nfs_initiate_read(struct rpc_clnt *clnt,
 	/* Set up the initial task struct. */
 	NFS_PROTO(inode)->read_setup(data, &msg);
 
-	dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
+	dprintk("NFS: %5u initiated read call (req %s/%llu, %u bytes @ "
 			"offset %llu)\n",
 			data->task.tk_pid,
 			inode->i_sb->s_id,
-			(long long)NFS_FILEID(inode),
+			(unsigned long long)NFS_FILEID(inode),
 			data->args.count,
 			(unsigned long long)data->args.offset);
 
@@ -630,9 +630,9 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	unsigned long npages;
 	int ret = -ESTALE;
 
-	dprintk("NFS: nfs_readpages (%s/%Ld %d)\n",
+	dprintk("NFS: nfs_readpages (%s/%Lu %d)\n",
 			inode->i_sb->s_id,
-			(long long)NFS_FILEID(inode),
+			(unsigned long long)NFS_FILEID(inode),
 			nr_pages);
 	nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c1d548211c31..9a3b6a4cd6b9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -909,9 +909,14 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
  */
 static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
 {
+	struct nfs_inode *nfsi = NFS_I(inode);
+
 	if (nfs_have_delegated_attributes(inode))
 		goto out;
-	if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
+	if (nfsi->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
+		return false;
+	smp_rmb();
+	if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
 		return false;
 out:
 	return PageUptodate(page) != 0;
@@ -922,19 +927,20 @@ out:
  * extend the write to cover the entire page in order to avoid fragmentation
  * inefficiencies.
  *
- * If the file is opened for synchronous writes or if we have a write delegation
- * from the server then we can just skip the rest of the checks.
+ * If the file is opened for synchronous writes then we can just skip the rest
+ * of the checks.
  */
 static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
 {
 	if (file->f_flags & O_DSYNC)
 		return 0;
+	if (!nfs_write_pageuptodate(page, inode))
+		return 0;
 	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
 		return 1;
-	if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL ||
-		(inode->i_flock->fl_start == 0 &&
+	if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 &&
 		inode->i_flock->fl_end == OFFSET_MAX &&
-		inode->i_flock->fl_type != F_RDLCK)))
+		inode->i_flock->fl_type != F_RDLCK))
 		return 1;
 	return 0;
 }
@@ -1013,10 +1019,10 @@ int nfs_initiate_write(struct rpc_clnt *clnt,
 	NFS_PROTO(inode)->write_setup(data, &msg);
 
 	dprintk("NFS: %5u initiated write call "
-		"(req %s/%lld, %u bytes @ offset %llu)\n",
+		"(req %s/%llu, %u bytes @ offset %llu)\n",
 		data->task.tk_pid,
 		inode->i_sb->s_id,
-		(long long)NFS_FILEID(inode),
+		(unsigned long long)NFS_FILEID(inode),
 		data->args.count,
 		(unsigned long long)data->args.offset);
 
@@ -1606,9 +1612,9 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 		nfs_list_remove_request(req);
 		nfs_clear_page_commit(req->wb_page);
 
-		dprintk("NFS: commit (%s/%lld %d@%lld)",
+		dprintk("NFS: commit (%s/%llu %d@%lld)",
 			req->wb_context->dentry->d_sb->s_id,
-			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+			(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
 			req->wb_bytes,
 			(long long)req_offset(req));
 		if (status < 0) {
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index 8b186a4955cc..a812fd1b92a4 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -35,7 +35,9 @@
 #ifndef LINUX_NFS4_ACL_H
 #define LINUX_NFS4_ACL_H
 
-#include <linux/posix_acl.h>
+struct nfs4_acl;
+struct svc_fh;
+struct svc_rqst;
 
 /* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
  * fit in a page: */
@@ -43,15 +45,11 @@
 
 struct nfs4_acl *nfs4_acl_new(int);
 int nfs4_acl_get_whotype(char *, u32);
-int nfs4_acl_write_who(int who, char *p);
+__be32 nfs4_acl_write_who(int who, __be32 **p, int *len);
 
-#define NFS4_ACL_TYPE_DEFAULT	0x01
-#define NFS4_ACL_DIR		0x02
-#define NFS4_ACL_OWNER		0x04
-
-struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
-				struct posix_acl *, unsigned int flags);
-int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
-				struct posix_acl **, unsigned int flags);
+int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
+		struct nfs4_acl **acl);
+__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct nfs4_acl *acl);
 
 #endif /* LINUX_NFS4_ACL_H */
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index d5c5b3e00266..b582f9ab6b2a 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -84,12 +84,4 @@ int nfsd_cache_lookup(struct svc_rqst *);
 void	nfsd_cache_update(struct svc_rqst *, int, __be32 *);
 int	nfsd_reply_cache_stats_open(struct inode *, struct file *);
 
-#ifdef CONFIG_NFSD_V4
-void	nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp);
-#else  /* CONFIG_NFSD_V4 */
-static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
-{
-}
-#endif /* CONFIG_NFSD_V4 */
-
 #endif /* NFSCACHE_H */
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
index bf95f6b817a4..66e58db01936 100644
--- a/fs/nfsd/idmap.h
+++ b/fs/nfsd/idmap.h
@@ -56,7 +56,7 @@ static inline void nfsd_idmap_shutdown(struct net *net)
 
 __be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *);
 __be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *);
-int nfsd_map_uid_to_name(struct svc_rqst *, kuid_t, char *);
-int nfsd_map_gid_to_name(struct svc_rqst *, kgid_t, char *);
+__be32 nfsd4_encode_user(struct svc_rqst *, kuid_t, __be32 **, int *);
+__be32 nfsd4_encode_group(struct svc_rqst *, kgid_t, __be32 **, int *);
 
 #endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 849a7c3ced22..d32b3aa6600d 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -95,6 +95,7 @@ struct nfsd_net {
 	time_t nfsd4_grace;
 
 	bool nfsd_net_up;
+	bool lockd_up;
 
 	/*
 	 * Time of server startup
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 95d76dc6c5da..11c1fba29312 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -30,8 +30,9 @@ nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
 {
-	svc_fh *fh;
 	struct posix_acl *acl;
+	struct inode *inode;
+	svc_fh *fh;
 	__be32 nfserr = 0;
 
 	dprintk("nfsd: GETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
@@ -41,6 +42,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 	if (nfserr)
 		RETURN_STATUS(nfserr);
 
+	inode = fh->fh_dentry->d_inode;
+
 	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
 		RETURN_STATUS(nfserr_inval);
 	resp->mask = argp->mask;
@@ -50,21 +53,13 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 		goto fail;
 
 	if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
-		acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS);
+		acl = get_acl(inode, ACL_TYPE_ACCESS);
 		if (IS_ERR(acl)) {
-			int err = PTR_ERR(acl);
-
-			if (err == -ENODATA || err == -EOPNOTSUPP)
-				acl = NULL;
-			else {
-				nfserr = nfserrno(err);
-				goto fail;
-			}
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
 		}
 		if (acl == NULL) {
 			/* Solaris returns the inode's minimum ACL. */
-
-			struct inode *inode = fh->fh_dentry->d_inode;
 			acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
 		}
 		resp->acl_access = acl;
@@ -72,17 +67,10 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 	if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
 		/* Check how Solaris handles requests for the Default ACL
 		   of a non-directory! */
-
-		acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT);
+		acl = get_acl(inode, ACL_TYPE_DEFAULT);
 		if (IS_ERR(acl)) {
-			int err = PTR_ERR(acl);
-
-			if (err == -ENODATA || err == -EOPNOTSUPP)
-				acl = NULL;
-			else {
-				nfserr = nfserrno(err);
-				goto fail;
-			}
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
 		}
 		resp->acl_default = acl;
 	}
@@ -103,31 +91,51 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
 		struct nfsd3_setaclargs *argp,
 		struct nfsd_attrstat *resp)
 {
+	struct inode *inode;
 	svc_fh *fh;
 	__be32 nfserr = 0;
+	int error;
 
 	dprintk("nfsd: SETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
 
 	fh = fh_copy(&resp->fh, &argp->fh);
 	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
+	if (nfserr)
+		goto out;
 
-	if (!nfserr) {
-		nfserr = nfserrno( nfsd_set_posix_acl(
-			fh, ACL_TYPE_ACCESS, argp->acl_access) );
-	}
-	if (!nfserr) {
-		nfserr = nfserrno( nfsd_set_posix_acl(
-			fh, ACL_TYPE_DEFAULT, argp->acl_default) );
-	}
-	if (!nfserr) {
-		nfserr = fh_getattr(fh, &resp->stat);
+	inode = fh->fh_dentry->d_inode;
+	if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) {
+		error = -EOPNOTSUPP;
+		goto out_errno;
 	}
 
+	error = fh_want_write(fh);
+	if (error)
+		goto out_errno;
+
+	error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS);
+	if (error)
+		goto out_drop_write;
+	error = inode->i_op->set_acl(inode, argp->acl_default,
+				     ACL_TYPE_DEFAULT);
+	if (error)
+		goto out_drop_write;
+
+	fh_drop_write(fh);
+
+	nfserr = fh_getattr(fh, &resp->stat);
+
+out:
 	/* argp->acl_{access,default} may have been allocated in
 	   nfssvc_decode_setaclargs. */
 	posix_acl_release(argp->acl_access);
 	posix_acl_release(argp->acl_default);
 	return nfserr;
+out_drop_write:
+	fh_drop_write(fh);
+out_errno:
+	nfserr = nfserrno(error);
+	goto out;
 }
 
 /*
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 9cbc1a841f87..adc5f1b1dc26 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -29,8 +29,9 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
 		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
 {
-	svc_fh *fh;
 	struct posix_acl *acl;
+	struct inode *inode;
+	svc_fh *fh;
 	__be32 nfserr = 0;
 
 	fh = fh_copy(&resp->fh, &argp->fh);
@@ -38,26 +39,20 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
 	if (nfserr)
 		RETURN_STATUS(nfserr);
 
+	inode = fh->fh_dentry->d_inode;
+
 	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
 		RETURN_STATUS(nfserr_inval);
 	resp->mask = argp->mask;
 
 	if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
-		acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS);
+		acl = get_acl(inode, ACL_TYPE_ACCESS);
 		if (IS_ERR(acl)) {
-			int err = PTR_ERR(acl);
-
-			if (err == -ENODATA || err == -EOPNOTSUPP)
-				acl = NULL;
-			else {
-				nfserr = nfserrno(err);
-				goto fail;
-			}
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
 		}
 		if (acl == NULL) {
 			/* Solaris returns the inode's minimum ACL. */
-
-			struct inode *inode = fh->fh_dentry->d_inode;
 			acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
 		}
 		resp->acl_access = acl;
@@ -65,17 +60,10 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
 	if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
 		/* Check how Solaris handles requests for the Default ACL
 		   of a non-directory! */
-
-		acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT);
+		acl = get_acl(inode, ACL_TYPE_DEFAULT);
 		if (IS_ERR(acl)) {
-			int err = PTR_ERR(acl);
-
-			if (err == -ENODATA || err == -EOPNOTSUPP)
-				acl = NULL;
-			else {
-				nfserr = nfserrno(err);
-				goto fail;
-			}
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
 		}
 		resp->acl_default = acl;
 	}
@@ -96,21 +84,37 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
 		struct nfsd3_setaclargs *argp,
 		struct nfsd3_attrstat *resp)
 {
+	struct inode *inode;
 	svc_fh *fh;
 	__be32 nfserr = 0;
+	int error;
 
 	fh = fh_copy(&resp->fh, &argp->fh);
 	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
+	if (nfserr)
+		goto out;
 
-	if (!nfserr) {
-		nfserr = nfserrno( nfsd_set_posix_acl(
-			fh, ACL_TYPE_ACCESS, argp->acl_access) );
-	}
-	if (!nfserr) {
-		nfserr = nfserrno( nfsd_set_posix_acl(
-			fh, ACL_TYPE_DEFAULT, argp->acl_default) );
+	inode = fh->fh_dentry->d_inode;
+	if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) {
+		error = -EOPNOTSUPP;
+		goto out_errno;
 	}
 
+	error = fh_want_write(fh);
+	if (error)
+		goto out_errno;
+
+	error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS);
+	if (error)
+		goto out_drop_write;
+	error = inode->i_op->set_acl(inode, argp->acl_default,
+				     ACL_TYPE_DEFAULT);
+
+out_drop_write:
+	fh_drop_write(fh);
+out_errno:
+	nfserr = nfserrno(error);
+out:
 	/* argp->acl_{access,default} may have been allocated in
 	   nfs3svc_decode_setaclargs. */
 	posix_acl_release(argp->acl_access);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 14d9ecb96cff..de6e39e12cb3 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -168,7 +168,7 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 		 struct kstat *stat)
 {
 	*p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
-	*p++ = htonl((u32) stat->mode);
+	*p++ = htonl((u32) (stat->mode & S_IALLUGO));
 	*p++ = htonl((u32) stat->nlink);
 	*p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
 	*p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
@@ -842,21 +842,21 @@ out:
 
 static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
 {
-	struct svc_fh	fh;
+	struct svc_fh	*fh = &cd->scratch;
 	__be32 err;
 
-	fh_init(&fh, NFS3_FHSIZE);
-	err = compose_entry_fh(cd, &fh, name, namlen);
+	fh_init(fh, NFS3_FHSIZE);
+	err = compose_entry_fh(cd, fh, name, namlen);
 	if (err) {
 		*p++ = 0;
 		*p++ = 0;
 		goto out;
 	}
-	p = encode_post_op_attr(cd->rqstp, p, &fh);
+	p = encode_post_op_attr(cd->rqstp, p, fh);
 	*p++ = xdr_one;			/* yes, a file handle follows */
-	p = encode_fh(p, &fh);
+	p = encode_fh(p, fh);
 out:
-	fh_put(&fh);
+	fh_put(fh);
 	return p;
 }
 
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 8a50b3c18093..d190e33d0ec2 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -37,8 +37,14 @@
 #include <linux/slab.h>
 #include <linux/nfs_fs.h>
 #include <linux/export.h>
+#include "nfsfh.h"
+#include "nfsd.h"
 #include "acl.h"
+#include "vfs.h"
 
+#define NFS4_ACL_TYPE_DEFAULT	0x01
+#define NFS4_ACL_DIR		0x02
+#define NFS4_ACL_OWNER		0x04
 
 /* mode bit translations: */
 #define NFS4_READ_MODE (NFS4_ACE_READ_DATA)
@@ -130,36 +136,47 @@ static short ace2type(struct nfs4_ace *);
 static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *,
 				unsigned int);
 
-struct nfs4_acl *
-nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl,
-			unsigned int flags)
+int
+nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
+		struct nfs4_acl **acl)
 {
-	struct nfs4_acl *acl;
+	struct inode *inode = dentry->d_inode;
+	int error = 0;
+	struct posix_acl *pacl = NULL, *dpacl = NULL;
+	unsigned int flags = 0;
 	int size = 0;
 
-	if (pacl) {
-		if (posix_acl_valid(pacl) < 0)
-			return ERR_PTR(-EINVAL);
-		size += 2*pacl->a_count;
+	pacl = get_acl(inode, ACL_TYPE_ACCESS);
+	if (!pacl) {
+		pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+		if (IS_ERR(pacl))
+			return PTR_ERR(pacl);
 	}
-	if (dpacl) {
-		if (posix_acl_valid(dpacl) < 0)
-			return ERR_PTR(-EINVAL);
-		size += 2*dpacl->a_count;
+	/* allocate for worst case: one (deny, allow) pair each: */
+	size += 2 * pacl->a_count;
+
+	if (S_ISDIR(inode->i_mode)) {
+		flags = NFS4_ACL_DIR;
+		dpacl = get_acl(inode, ACL_TYPE_DEFAULT);
+		if (dpacl)
+			size += 2 * dpacl->a_count;
 	}
 
-	/* Allocate for worst case: one (deny, allow) pair each: */
-	acl = nfs4_acl_new(size);
-	if (acl == NULL)
-		return ERR_PTR(-ENOMEM);
+	*acl = nfs4_acl_new(size);
+	if (*acl == NULL) {
+		error = -ENOMEM;
+		goto out;
+	}
 
-	if (pacl)
-		_posix_to_nfsv4_one(pacl, acl, flags & ~NFS4_ACL_TYPE_DEFAULT);
+	_posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT);
 
 	if (dpacl)
-		_posix_to_nfsv4_one(dpacl, acl, flags | NFS4_ACL_TYPE_DEFAULT);
+		_posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT);
 
-	return acl;
+ out:
+	posix_acl_release(pacl);
+	posix_acl_release(dpacl);
+	return error;
 }
 
 struct posix_acl_summary {
@@ -719,8 +736,9 @@ static void process_one_v4_ace(struct posix_acl_state *state,
 	}
 }
 
-int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
-		struct posix_acl **dpacl, unsigned int flags)
+static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl,
+		struct posix_acl **pacl, struct posix_acl **dpacl,
+		unsigned int flags)
 {
 	struct posix_acl_state effective_acl_state, default_acl_state;
 	struct nfs4_ace *ace;
@@ -780,6 +798,57 @@ out_estate:
 	return ret;
 }
 
+__be32
+nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct nfs4_acl *acl)
+{
+	__be32 error;
+	int host_error;
+	struct dentry *dentry;
+	struct inode *inode;
+	struct posix_acl *pacl = NULL, *dpacl = NULL;
+	unsigned int flags = 0;
+
+	/* Get inode */
+	error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
+	if (error)
+		return error;
+
+	dentry = fhp->fh_dentry;
+	inode = dentry->d_inode;
+
+	if (!inode->i_op->set_acl || !IS_POSIXACL(inode))
+		return nfserr_attrnotsupp;
+
+	if (S_ISDIR(inode->i_mode))
+		flags = NFS4_ACL_DIR;
+
+	host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
+	if (host_error == -EINVAL)
+		return nfserr_attrnotsupp;
+	if (host_error < 0)
+		goto out_nfserr;
+
+	host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS);
+	if (host_error < 0)
+		goto out_release;
+
+	if (S_ISDIR(inode->i_mode)) {
+		host_error = inode->i_op->set_acl(inode, dpacl,
+						  ACL_TYPE_DEFAULT);
+	}
+
+out_release:
+	posix_acl_release(pacl);
+	posix_acl_release(dpacl);
+out_nfserr:
+	if (host_error == -EOPNOTSUPP)
+		return nfserr_attrnotsupp;
+	else
+		return nfserrno(host_error);
+}
+
+
 static short
 ace2type(struct nfs4_ace *ace)
 {
@@ -798,9 +867,6 @@ ace2type(struct nfs4_ace *ace)
 	return -1;
 }
 
-EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4);
-EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix);
-
 struct nfs4_acl *
 nfs4_acl_new(int n)
 {
@@ -848,21 +914,22 @@ nfs4_acl_get_whotype(char *p, u32 len)
 	return NFS4_ACL_WHO_NAMED;
 }
 
-int
-nfs4_acl_write_who(int who, char *p)
+__be32 nfs4_acl_write_who(int who, __be32 **p, int *len)
 {
 	int i;
+	int bytes;
 
 	for (i = 0; i < ARRAY_SIZE(s2t_map); i++) {
-		if (s2t_map[i].type == who) {
-			memcpy(p, s2t_map[i].string, s2t_map[i].stringlen);
-			return s2t_map[i].stringlen;
-		}
+		if (s2t_map[i].type != who)
+			continue;
+		bytes = 4 + (XDR_QUADLEN(s2t_map[i].stringlen) << 2);
+		if (bytes > *len)
+			return nfserr_resource;
+		*p = xdr_encode_opaque(*p, s2t_map[i].string,
+					s2t_map[i].stringlen);
+		*len -= bytes;
+		return 0;
 	}
-	BUG();
+	WARN_ON_ONCE(1);
 	return -1;
 }
-
-EXPORT_SYMBOL(nfs4_acl_new);
-EXPORT_SYMBOL(nfs4_acl_get_whotype);
-EXPORT_SYMBOL(nfs4_acl_write_who);
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 4832fd819f88..c0dfde68742e 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -551,27 +551,46 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
 	return 0;
 }
 
-static int
-idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name)
+static __be32 encode_ascii_id(u32 id, __be32 **p, int *buflen)
+{
+	char buf[11];
+	int len;
+	int bytes;
+
+	len = sprintf(buf, "%u", id);
+	bytes = 4 + (XDR_QUADLEN(len) << 2);
+	if (bytes > *buflen)
+		return nfserr_resource;
+	*p = xdr_encode_opaque(*p, buf, len);
+	*buflen -= bytes;
+	return 0;
+}
+
+static __be32 idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen)
 {
 	struct ent *item, key = {
 		.id = id,
 		.type = type,
 	};
 	int ret;
+	int bytes;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
 	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
 	ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item);
 	if (ret == -ENOENT)
-		return sprintf(name, "%u", id);
+		return encode_ascii_id(id, p, buflen);
 	if (ret)
-		return ret;
+		return nfserrno(ret);
 	ret = strlen(item->name);
-	BUG_ON(ret > IDMAP_NAMESZ);
-	memcpy(name, item->name, ret);
+	WARN_ON_ONCE(ret > IDMAP_NAMESZ);
+	bytes = 4 + (XDR_QUADLEN(ret) << 2);
+	if (bytes > *buflen)
+		return nfserr_resource;
+	*p = xdr_encode_opaque(*p, item->name, ret);
+	*buflen -= bytes;
 	cache_put(&item->h, nn->idtoname_cache);
-	return ret;
+	return 0;
 }
 
 static bool
@@ -603,12 +622,11 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u
 	return idmap_name_to_id(rqstp, type, name, namelen, id);
 }
 
-static int
-do_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name)
+static __be32 encode_name_from_id(struct svc_rqst *rqstp, int type, u32 id, __be32 **p, int *buflen)
 {
 	if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
-		return sprintf(name, "%u", id);
-	return idmap_id_to_name(rqstp, type, id, name);
+		return encode_ascii_id(id, p, buflen);
+	return idmap_id_to_name(rqstp, type, id, p, buflen);
 }
 
 __be32
@@ -637,16 +655,14 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
 	return status;
 }
 
-int
-nfsd_map_uid_to_name(struct svc_rqst *rqstp, kuid_t uid, char *name)
+__be32 nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t uid, __be32 **p, int *buflen)
 {
 	u32 id = from_kuid(&init_user_ns, uid);
-	return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name);
+	return encode_name_from_id(rqstp, IDMAP_TYPE_USER, id, p, buflen);
 }
 
-int
-nfsd_map_gid_to_name(struct svc_rqst *rqstp, kgid_t gid, char *name)
+__be32 nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t gid, __be32 **p, int *buflen)
 {
 	u32 id = from_kgid(&init_user_ns, gid);
-	return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name);
+	return encode_name_from_id(rqstp, IDMAP_TYPE_GROUP, id, p, buflen);
 }
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 419572f33b72..82189b208af3 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -41,6 +41,7 @@
 #include "vfs.h"
 #include "current_stateid.h"
 #include "netns.h"
+#include "acl.h"
 
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #include <linux/security.h>
@@ -230,17 +231,16 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate
 }
 
 static __be32
-do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
 {
 	struct svc_fh *current_fh = &cstate->current_fh;
-	struct svc_fh *resfh;
 	int accmode;
 	__be32 status;
 
-	resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
-	if (!resfh)
+	*resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
+	if (!*resfh)
 		return nfserr_jukebox;
-	fh_init(resfh, NFS4_FHSIZE);
+	fh_init(*resfh, NFS4_FHSIZE);
 	open->op_truncate = 0;
 
 	if (open->op_create) {
@@ -265,12 +265,12 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
 		 */
 		status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
 					open->op_fname.len, &open->op_iattr,
-					resfh, open->op_createmode,
+					*resfh, open->op_createmode,
 					(u32 *)open->op_verf.data,
 					&open->op_truncate, &open->op_created);
 
 		if (!status && open->op_label.len)
-			nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval);
+			nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
 
 		/*
 		 * Following rfc 3530 14.2.16, use the returned bitmask
@@ -280,31 +280,32 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
 		if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
 			open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
 						FATTR4_WORD1_TIME_MODIFY);
-	} else {
+	} else
+		/*
+		 * Note this may exit with the parent still locked.
+		 * We will hold the lock until nfsd4_open's final
+		 * lookup, to prevent renames or unlinks until we've had
+		 * a chance to an acquire a delegation if appropriate.
+		 */
 		status = nfsd_lookup(rqstp, current_fh,
-				     open->op_fname.data, open->op_fname.len, resfh);
-		fh_unlock(current_fh);
-	}
+				     open->op_fname.data, open->op_fname.len, *resfh);
 	if (status)
 		goto out;
-	status = nfsd_check_obj_isreg(resfh);
+	status = nfsd_check_obj_isreg(*resfh);
 	if (status)
 		goto out;
 
 	if (is_create_with_attrs(open) && open->op_acl != NULL)
-		do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval);
+		do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval);
 
-	nfsd4_set_open_owner_reply_cache(cstate, open, resfh);
+	nfsd4_set_open_owner_reply_cache(cstate, open, *resfh);
 	accmode = NFSD_MAY_NOP;
 	if (open->op_created ||
 			open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
 		accmode |= NFSD_MAY_OWNER_OVERRIDE;
-	status = do_open_permission(rqstp, resfh, open, accmode);
+	status = do_open_permission(rqstp, *resfh, open, accmode);
 	set_change_info(&open->op_cinfo, current_fh);
-	fh_dup2(current_fh, resfh);
 out:
-	fh_put(resfh);
-	kfree(resfh);
 	return status;
 }
 
@@ -357,6 +358,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    struct nfsd4_open *open)
 {
 	__be32 status;
+	struct svc_fh *resfh = NULL;
 	struct nfsd4_compoundres *resp;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -423,7 +425,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (open->op_claim_type) {
 	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
 	case NFS4_OPEN_CLAIM_NULL:
-		status = do_open_lookup(rqstp, cstate, open);
+		status = do_open_lookup(rqstp, cstate, open, &resfh);
 		if (status)
 			goto out;
 		break;
@@ -439,6 +441,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		status = do_open_fhandle(rqstp, cstate, open);
 		if (status)
 			goto out;
+		resfh = &cstate->current_fh;
 		break;
 	case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
 	case NFS4_OPEN_CLAIM_DELEGATE_PREV:
@@ -458,9 +461,14 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * successful, it (1) truncates the file if open->op_truncate was
 	 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
 	 */
-	status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
+	status = nfsd4_process_open2(rqstp, resfh, open);
 	WARN_ON(status && open->op_created);
 out:
+	if (resfh && resfh != &cstate->current_fh) {
+		fh_dup2(&cstate->current_fh, resfh);
+		fh_put(resfh);
+		kfree(resfh);
+	}
 	nfsd4_cleanup_open_state(open, status);
 	if (open->op_openowner && !nfsd4_has_session(cstate))
 		cstate->replay_owner = &open->op_openowner->oo_owner;
@@ -1069,8 +1077,10 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 				    cstate->current_fh.fh_dentry, &p,
 				    count, verify->ve_bmval,
 				    rqstp, 0);
-
-	/* this means that nfsd4_encode_fattr() ran out of space */
+	/*
+	 * If nfsd4_encode_fattr() ran out of space, assume that's because
+	 * the attributes are longer (hence different) than those given:
+	 */
 	if (status == nfserr_resource)
 		status = nfserr_not_same;
 	if (status)
@@ -1524,7 +1534,8 @@ static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1524static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1534static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1525{ 1535{
1526 return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\ 1536 return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
1527 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\ 1537 1 + 1 + /* eir_flags, spr_how */\
1538 4 + /* spo_must_enforce & _allow with bitmap */\
1528 2 + /*eir_server_owner.so_minor_id */\ 1539 2 + /*eir_server_owner.so_minor_id */\
1529 /* eir_server_owner.so_major_id<> */\ 1540 /* eir_server_owner.so_major_id<> */\
1530 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\ 1541 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
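
The reply-size estimate changes because the SSV-protection fields are bitmaps, each encoded as a length word followed by that many data words: spo_must_enforce carries two words and spo_must_allow is empty, giving (1 + 2) + (1 + 0) = 4 words rather than the flat 2 the old line budgeted. A throwaway check of that arithmetic:

#include <stdio.h>

int main(void)
{
	int enforce_words = 2;	/* nfs4_minimal_spo_must_enforce[] */
	int allow_words = 0;	/* empty spo_must_allow bitmap */
	int total = (1 + enforce_words) + (1 + allow_words);

	printf("spo bitmap words: %d\n", total);	/* prints 4 */
	return 0;
}
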
@@ -1881,6 +1892,7 @@ struct svc_version nfsd_version4 = {
1881 .vs_proc = nfsd_procedures4, 1892 .vs_proc = nfsd_procedures4,
1882 .vs_dispatch = nfsd_dispatch, 1893 .vs_dispatch = nfsd_dispatch,
1883 .vs_xdrsize = NFS4_SVC_XDRSIZE, 1894 .vs_xdrsize = NFS4_SVC_XDRSIZE,
1895 .vs_rpcb_optnl = 1,
1884}; 1896};
1885 1897
1886/* 1898/*
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 105d6fa7c514..d5d070fbeb35 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -832,10 +832,11 @@ static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca)
832 spin_unlock(&nfsd_drc_lock); 832 spin_unlock(&nfsd_drc_lock);
833} 833}
834 834
835static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs) 835static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
836 struct nfsd4_channel_attrs *battrs)
836{ 837{
837 int numslots = attrs->maxreqs; 838 int numslots = fattrs->maxreqs;
838 int slotsize = slot_bytes(attrs); 839 int slotsize = slot_bytes(fattrs);
839 struct nfsd4_session *new; 840 struct nfsd4_session *new;
840 int mem, i; 841 int mem, i;
841 842
@@ -852,6 +853,10 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs)
852 if (!new->se_slots[i]) 853 if (!new->se_slots[i])
853 goto out_free; 854 goto out_free;
854 } 855 }
856
857 memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
858 memcpy(&new->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs));
859
855 return new; 860 return new;
856out_free: 861out_free:
857 while (i--) 862 while (i--)
@@ -997,8 +1002,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
997 list_add(&new->se_perclnt, &clp->cl_sessions); 1002 list_add(&new->se_perclnt, &clp->cl_sessions);
998 spin_unlock(&clp->cl_lock); 1003 spin_unlock(&clp->cl_lock);
999 spin_unlock(&nn->client_lock); 1004 spin_unlock(&nn->client_lock);
1000 memcpy(&new->se_fchannel, &cses->fore_channel, 1005
1001 sizeof(struct nfsd4_channel_attrs));
1002 if (cses->flags & SESSION4_BACK_CHAN) { 1006 if (cses->flags & SESSION4_BACK_CHAN) {
1003 struct sockaddr *sa = svc_addr(rqstp); 1007 struct sockaddr *sa = svc_addr(rqstp);
1004 /* 1008 /*
@@ -1851,6 +1855,11 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
1851 return nfs_ok; 1855 return nfs_ok;
1852} 1856}
1853 1857
1858#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \
1859 RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32))
1860#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \
1861 RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32))
1862
1854static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca) 1863static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
1855{ 1864{
1856 ca->headerpadsz = 0; 1865 ca->headerpadsz = 0;
@@ -1861,9 +1870,9 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
1861 * less than 1k. Tighten up this estimate in the unlikely event 1870 * less than 1k. Tighten up this estimate in the unlikely event
1862 * it turns out to be a problem for some client: 1871 * it turns out to be a problem for some client:
1863 */ 1872 */
1864 if (ca->maxreq_sz < NFS4_enc_cb_recall_sz + RPC_MAX_HEADER_WITH_AUTH) 1873 if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ)
1865 return nfserr_toosmall; 1874 return nfserr_toosmall;
1866 if (ca->maxresp_sz < NFS4_dec_cb_recall_sz + RPC_MAX_REPHEADER_WITH_AUTH) 1875 if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ)
1867 return nfserr_toosmall; 1876 return nfserr_toosmall;
1868 ca->maxresp_cached = 0; 1877 ca->maxresp_cached = 0;
1869 if (ca->maxops < 2) 1878 if (ca->maxops < 2)
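
The point of the new macros is a units fix: NFS4_enc_cb_recall_sz and the RPC header constants count 32-bit XDR words, while ca->maxreq_sz and ca->maxresp_sz are byte counts, so the old comparisons under-required by a factor of four. A standalone sketch of the corrected bound (the word counts below are made up, not the real constants):

#include <stdint.h>
#include <stdio.h>

#define CB_RECALL_WORDS	56	/* hypothetical XDR word count */
#define RPC_HDR_WORDS	25	/* hypothetical header word count */
#define CB_MAX_REQ_SZ	((CB_RECALL_WORDS + RPC_HDR_WORDS) * sizeof(uint32_t))

int main(void)
{
	uint32_t maxreq_sz = 300;	/* bytes offered by the client */

	printf("old word bound: %d\n", CB_RECALL_WORDS + RPC_HDR_WORDS);
	printf("byte bound: %zu, ok: %d\n", (size_t)CB_MAX_REQ_SZ,
	       maxreq_sz >= CB_MAX_REQ_SZ);
	return 0;
}
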
@@ -1913,9 +1922,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1913 return status; 1922 return status;
1914 status = check_backchannel_attrs(&cr_ses->back_channel); 1923 status = check_backchannel_attrs(&cr_ses->back_channel);
1915 if (status) 1924 if (status)
1916 return status; 1925 goto out_release_drc_mem;
1917 status = nfserr_jukebox; 1926 status = nfserr_jukebox;
1918 new = alloc_session(&cr_ses->fore_channel); 1927 new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel);
1919 if (!new) 1928 if (!new)
1920 goto out_release_drc_mem; 1929 goto out_release_drc_mem;
1921 conn = alloc_conn_from_crses(rqstp, cr_ses); 1930 conn = alloc_conn_from_crses(rqstp, cr_ses);
@@ -3034,18 +3043,18 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
3034 if (!fl) 3043 if (!fl)
3035 return -ENOMEM; 3044 return -ENOMEM;
3036 fl->fl_file = find_readable_file(fp); 3045 fl->fl_file = find_readable_file(fp);
3037 list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
3038 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); 3046 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
3039 if (status) { 3047 if (status)
3040 list_del_init(&dp->dl_perclnt); 3048 goto out_free;
3041 locks_free_lock(fl); 3049 list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
3042 return status;
3043 }
3044 fp->fi_lease = fl; 3050 fp->fi_lease = fl;
3045 fp->fi_deleg_file = get_file(fl->fl_file); 3051 fp->fi_deleg_file = get_file(fl->fl_file);
3046 atomic_set(&fp->fi_delegees, 1); 3052 atomic_set(&fp->fi_delegees, 1);
3047 list_add(&dp->dl_perfile, &fp->fi_delegations); 3053 list_add(&dp->dl_perfile, &fp->fi_delegations);
3048 return 0; 3054 return 0;
3055out_free:
3056 locks_free_lock(fl);
3057 return status;
3049} 3058}
3050 3059
3051static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp) 3060static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp)
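
The reordering in nfs4_setlease() means the delegation is published on the client's cl_delegations list only after vfs_setlease() succeeds, so the failure path is a plain free with no unlinking. A minimal sketch of that publish-after-success shape, with illustrative types in place of the nfsd structures:

struct node { struct node *next; };

static void list_add_head(struct node *n, struct node **head)
{
	n->next = *head;
	*head = n;
}

static int set_lease_sketch(struct node *dp, struct node **cl_delegations,
			    int (*try_lease)(void))
{
	int status = try_lease();

	if (status)
		return status;	/* nothing published, nothing to undo */
	list_add_head(dp, cl_delegations);
	return 0;
}
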
@@ -3125,6 +3134,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3125 goto out_no_deleg; 3134 goto out_no_deleg;
3126 break; 3135 break;
3127 case NFS4_OPEN_CLAIM_NULL: 3136 case NFS4_OPEN_CLAIM_NULL:
3137 case NFS4_OPEN_CLAIM_FH:
3128 /* 3138 /*
3129 * Let's not give out any delegations till everyone's 3139 * Let's not give out any delegations till everyone's
3130 * had the chance to reclaim theirs.... 3140 * had the chance to reclaim theirs....
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index ee7237f99f54..63f2395c57ed 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -103,11 +103,6 @@ xdr_error: \
103 (x) = (u64)ntohl(*p++) << 32; \ 103 (x) = (u64)ntohl(*p++) << 32; \
104 (x) |= ntohl(*p++); \ 104 (x) |= ntohl(*p++); \
105} while (0) 105} while (0)
106#define READTIME(x) do { \
107 p++; \
108 (x) = ntohl(*p++); \
109 p++; \
110} while (0)
111#define READMEM(x,nbytes) do { \ 106#define READMEM(x,nbytes) do { \
112 x = (char *)p; \ 107 x = (char *)p; \
113 p += XDR_QUADLEN(nbytes); \ 108 p += XDR_QUADLEN(nbytes); \
@@ -190,6 +185,15 @@ static int zero_clientid(clientid_t *clid)
190 return (clid->cl_boot == 0) && (clid->cl_id == 0); 185 return (clid->cl_boot == 0) && (clid->cl_id == 0);
191} 186}
192 187
188/**
189 * defer_free - mark an allocation as deferred freed
190 * @argp: NFSv4 compound argument structure that @p is freed with
191 * @release: release callback to free @p, typically kfree()
192 * @p: pointer to be freed
193 *
194 * Marks @p to be freed when processing the compound operation
195 * described in @argp finishes.
196 */
193static int 197static int
194defer_free(struct nfsd4_compoundargs *argp, 198defer_free(struct nfsd4_compoundargs *argp,
195 void (*release)(const void *), void *p) 199 void (*release)(const void *), void *p)
@@ -206,6 +210,16 @@ defer_free(struct nfsd4_compoundargs *argp,
206 return 0; 210 return 0;
207} 211}
208 212
213/**
214 * savemem - duplicate a chunk of memory for later processing
215 * @argp: NFSv4 compound argument structure that the copy is freed with
216 * @p: pointer to be duplicated
217 * @nbytes: length to be duplicated
218 *
219 * Returns a pointer to a copy of @nbytes bytes of memory at @p
220 * that are preserved until processing of the NFSv4 compound
221 * operation described by @argp finishes.
222 */
209static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) 223static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
210{ 224{
211 if (p == argp->tmp) { 225 if (p == argp->tmp) {
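
For reference, the mechanism the two new kernel-doc blocks describe boils down to a per-compound list of (release, pointer) pairs that is drained when the request completes. A self-contained sketch of that idea (the tmpbuf layout here is illustrative, not the exact nfsd4_compoundargs fields):

#include <stdlib.h>

struct tmpbuf {
	struct tmpbuf *next;
	void (*release)(const void *);
	void *buf;
};

static int defer_free_sketch(struct tmpbuf **to_free,
			     void (*release)(const void *), void *p)
{
	struct tmpbuf *tb = malloc(sizeof(*tb));

	if (!tb)
		return -1;
	tb->buf = p;
	tb->release = release;
	tb->next = *to_free;
	*to_free = tb;
	return 0;
}

static void free_deferred(struct tmpbuf **to_free)
{
	while (*to_free) {
		struct tmpbuf *tb = *to_free;

		*to_free = tb->next;
		tb->release(tb->buf);
		free(tb);
	}
}
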
@@ -257,7 +271,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
257 int expected_len, len = 0; 271 int expected_len, len = 0;
258 u32 dummy32; 272 u32 dummy32;
259 char *buf; 273 char *buf;
260 int host_err;
261 274
262 DECODE_HEAD; 275 DECODE_HEAD;
263 iattr->ia_valid = 0; 276 iattr->ia_valid = 0;
@@ -284,10 +297,9 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
284 return nfserr_resource; 297 return nfserr_resource;
285 298
286 *acl = nfs4_acl_new(nace); 299 *acl = nfs4_acl_new(nace);
287 if (*acl == NULL) { 300 if (*acl == NULL)
288 host_err = -ENOMEM; 301 return nfserr_jukebox;
289 goto out_nfserr; 302
290 }
291 defer_free(argp, kfree, *acl); 303 defer_free(argp, kfree, *acl);
292 304
293 (*acl)->naces = nace; 305 (*acl)->naces = nace;
@@ -425,10 +437,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
425 goto xdr_error; 437 goto xdr_error;
426 438
427 DECODE_TAIL; 439 DECODE_TAIL;
428
429out_nfserr:
430 status = nfserrno(host_err);
431 goto out;
432} 440}
433 441
434static __be32 442static __be32
@@ -1957,56 +1965,16 @@ static u32 nfs4_file_type(umode_t mode)
1957 }; 1965 };
1958} 1966}
1959 1967
1960static __be32
1961nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, kuid_t uid, kgid_t gid,
1962 __be32 **p, int *buflen)
1963{
1964 int status;
1965
1966 if (*buflen < (XDR_QUADLEN(IDMAP_NAMESZ) << 2) + 4)
1967 return nfserr_resource;
1968 if (whotype != NFS4_ACL_WHO_NAMED)
1969 status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1));
1970 else if (gid_valid(gid))
1971 status = nfsd_map_gid_to_name(rqstp, gid, (u8 *)(*p + 1));
1972 else
1973 status = nfsd_map_uid_to_name(rqstp, uid, (u8 *)(*p + 1));
1974 if (status < 0)
1975 return nfserrno(status);
1976 *p = xdr_encode_opaque(*p, NULL, status);
1977 *buflen -= (XDR_QUADLEN(status) << 2) + 4;
1978 BUG_ON(*buflen < 0);
1979 return 0;
1980}
1981
1982static inline __be32
1983nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t user, __be32 **p, int *buflen)
1984{
1985 return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, user, INVALID_GID,
1986 p, buflen);
1987}
1988
1989static inline __be32
1990nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t group, __be32 **p, int *buflen)
1991{
1992 return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, INVALID_UID, group,
1993 p, buflen);
1994}
1995
1996static inline __be32 1968static inline __be32
1997nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace, 1969nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
1998 __be32 **p, int *buflen) 1970 __be32 **p, int *buflen)
1999{ 1971{
2000 kuid_t uid = INVALID_UID; 1972 if (ace->whotype != NFS4_ACL_WHO_NAMED)
2001 kgid_t gid = INVALID_GID; 1973 return nfs4_acl_write_who(ace->whotype, p, buflen);
2002 1974 else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
2003 if (ace->whotype == NFS4_ACL_WHO_NAMED) { 1975 return nfsd4_encode_group(rqstp, ace->who_gid, p, buflen);
2004 if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) 1976 else
2005 gid = ace->who_gid; 1977 return nfsd4_encode_user(rqstp, ace->who_uid, p, buflen);
2006 else
2007 uid = ace->who_uid;
2008 }
2009 return nfsd4_encode_name(rqstp, ace->whotype, uid, gid, p, buflen);
2010} 1978}
2011 1979
2012#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ 1980#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
@@ -2090,7 +2058,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2090 u32 bmval1 = bmval[1]; 2058 u32 bmval1 = bmval[1];
2091 u32 bmval2 = bmval[2]; 2059 u32 bmval2 = bmval[2];
2092 struct kstat stat; 2060 struct kstat stat;
2093 struct svc_fh tempfh; 2061 struct svc_fh *tempfh = NULL;
2094 struct kstatfs statfs; 2062 struct kstatfs statfs;
2095 int buflen = count << 2; 2063 int buflen = count << 2;
2096 __be32 *attrlenp; 2064 __be32 *attrlenp;
@@ -2137,11 +2105,15 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2137 goto out_nfserr; 2105 goto out_nfserr;
2138 } 2106 }
2139 if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { 2107 if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
2140 fh_init(&tempfh, NFS4_FHSIZE); 2108 tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
2141 status = fh_compose(&tempfh, exp, dentry, NULL); 2109 status = nfserr_jukebox;
2110 if (!tempfh)
2111 goto out;
2112 fh_init(tempfh, NFS4_FHSIZE);
2113 status = fh_compose(tempfh, exp, dentry, NULL);
2142 if (status) 2114 if (status)
2143 goto out; 2115 goto out;
2144 fhp = &tempfh; 2116 fhp = tempfh;
2145 } 2117 }
2146 if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT 2118 if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT
2147 | FATTR4_WORD0_SUPPORTED_ATTRS)) { 2119 | FATTR4_WORD0_SUPPORTED_ATTRS)) {
@@ -2222,8 +2194,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2222 if ((buflen -= 4) < 0) 2194 if ((buflen -= 4) < 0)
2223 goto out_resource; 2195 goto out_resource;
2224 dummy = nfs4_file_type(stat.mode); 2196 dummy = nfs4_file_type(stat.mode);
2225 if (dummy == NF4BAD) 2197 if (dummy == NF4BAD) {
2226 goto out_serverfault; 2198 status = nfserr_serverfault;
2199 goto out;
2200 }
2227 WRITE32(dummy); 2201 WRITE32(dummy);
2228 } 2202 }
2229 if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { 2203 if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) {
@@ -2317,8 +2291,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2317 WRITE32(ace->flag); 2291 WRITE32(ace->flag);
2318 WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL); 2292 WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL);
2319 status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen); 2293 status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen);
2320 if (status == nfserr_resource)
2321 goto out_resource;
2322 if (status) 2294 if (status)
2323 goto out; 2295 goto out;
2324 } 2296 }
@@ -2379,8 +2351,6 @@ out_acl:
2379 } 2351 }
2380 if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { 2352 if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
2381 status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen); 2353 status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen);
2382 if (status == nfserr_resource)
2383 goto out_resource;
2384 if (status) 2354 if (status)
2385 goto out; 2355 goto out;
2386 } 2356 }
@@ -2431,15 +2401,11 @@ out_acl:
2431 } 2401 }
2432 if (bmval1 & FATTR4_WORD1_OWNER) { 2402 if (bmval1 & FATTR4_WORD1_OWNER) {
2433 status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen); 2403 status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen);
2434 if (status == nfserr_resource)
2435 goto out_resource;
2436 if (status) 2404 if (status)
2437 goto out; 2405 goto out;
2438 } 2406 }
2439 if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { 2407 if (bmval1 & FATTR4_WORD1_OWNER_GROUP) {
2440 status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen); 2408 status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen);
2441 if (status == nfserr_resource)
2442 goto out_resource;
2443 if (status) 2409 if (status)
2444 goto out; 2410 goto out;
2445 } 2411 }
@@ -2533,8 +2499,8 @@ out:
2533 security_release_secctx(context, contextlen); 2499 security_release_secctx(context, contextlen);
2534#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ 2500#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
2535 kfree(acl); 2501 kfree(acl);
2536 if (fhp == &tempfh) 2502 if (tempfh)
2537 fh_put(&tempfh); 2503 fh_put(tempfh);
2538 return status; 2504 return status;
2539out_nfserr: 2505out_nfserr:
2540 status = nfserrno(err); 2506 status = nfserrno(err);
@@ -2542,9 +2508,6 @@ out_nfserr:
2542out_resource: 2508out_resource:
2543 status = nfserr_resource; 2509 status = nfserr_resource;
2544 goto out; 2510 goto out;
2545out_serverfault:
2546 status = nfserr_serverfault;
2547 goto out;
2548} 2511}
2549 2512
2550static inline int attributes_need_mount(u32 *bmval) 2513static inline int attributes_need_mount(u32 *bmval)
@@ -2621,17 +2584,14 @@ out_put:
2621static __be32 * 2584static __be32 *
2622nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr) 2585nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr)
2623{ 2586{
2624 __be32 *attrlenp;
2625
2626 if (buflen < 6) 2587 if (buflen < 6)
2627 return NULL; 2588 return NULL;
2628 *p++ = htonl(2); 2589 *p++ = htonl(2);
2629 *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ 2590 *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */
2630 *p++ = htonl(0); /* bmval1 */ 2591 *p++ = htonl(0); /* bmval1 */
2631 2592
2632 attrlenp = p++; 2593 *p++ = htonl(4); /* attribute length */
2633 *p++ = nfserr; /* no htonl */ 2594 *p++ = nfserr; /* no htonl */
2634 *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
2635 return p; 2595 return p;
2636} 2596}
2637 2597
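
The simplification above works because the RDATTR_ERROR attribute body is always exactly one 4-byte status word, so the length can be written as a constant instead of being back-patched through attrlenp. A standalone sketch of the resulting layout:

#include <arpa/inet.h>
#include <stdint.h>

static uint32_t *encode_rdattr_error_sketch(uint32_t *p, uint32_t nfserr_be)
{
	*p++ = htonl(2);	/* bitmap length: two words */
	*p++ = htonl(1 << 11);	/* bmval0: FATTR4_WORD0_RDATTR_ERROR (attr 11) */
	*p++ = htonl(0);	/* bmval1 */
	*p++ = htonl(4);	/* attribute length: one 4-byte word */
	*p++ = nfserr_be;	/* already big-endian; no htonl */
	return p;
}
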
@@ -3244,7 +3204,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
3244 3204
3245 if (rpcauth_get_gssinfo(pf, &info) == 0) { 3205 if (rpcauth_get_gssinfo(pf, &info) == 0) {
3246 supported++; 3206 supported++;
3247 RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4); 3207 RESERVE_SPACE(4 + 4 + XDR_LEN(info.oid.len) + 4 + 4);
3248 WRITE32(RPC_AUTH_GSS); 3208 WRITE32(RPC_AUTH_GSS);
3249 WRITE32(info.oid.len); 3209 WRITE32(info.oid.len);
3250 WRITEMEM(info.oid.data, info.oid.len); 3210 WRITEMEM(info.oid.data, info.oid.len);
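
The XDR_LEN() change matters because XDR opaques are padded to a 4-byte boundary on the wire; reserving the raw oid.len could under-reserve by up to three bytes. A quick standalone check of the rounding (XDR_LEN here mirrors the kernel macro's round-up-to-quad behavior):

#include <stdio.h>

#define XDR_LEN(n) (((n) + 3) & ~3u)

int main(void)
{
	unsigned int oid_len = 9;	/* e.g. a 9-byte GSS OID */

	/* flavor + length + padded body + qop + service */
	printf("reserve %u bytes (body padded %u -> %u)\n",
	       4 + 4 + XDR_LEN(oid_len) + 4 + 4, oid_len, XDR_LEN(oid_len));
	return 0;
}
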
@@ -3379,35 +3339,43 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
3379 8 /* eir_clientid */ + 3339 8 /* eir_clientid */ +
3380 4 /* eir_sequenceid */ + 3340 4 /* eir_sequenceid */ +
3381 4 /* eir_flags */ + 3341 4 /* eir_flags */ +
3382 4 /* spr_how */ + 3342 4 /* spr_how */);
3383 8 /* spo_must_enforce, spo_must_allow */ +
3384 8 /* so_minor_id */ +
3385 4 /* so_major_id.len */ +
3386 (XDR_QUADLEN(major_id_sz) * 4) +
3387 4 /* eir_server_scope.len */ +
3388 (XDR_QUADLEN(server_scope_sz) * 4) +
3389 4 /* eir_server_impl_id.count (0) */);
3390 3343
3391 WRITEMEM(&exid->clientid, 8); 3344 WRITEMEM(&exid->clientid, 8);
3392 WRITE32(exid->seqid); 3345 WRITE32(exid->seqid);
3393 WRITE32(exid->flags); 3346 WRITE32(exid->flags);
3394 3347
3395 WRITE32(exid->spa_how); 3348 WRITE32(exid->spa_how);
3349 ADJUST_ARGS();
3350
3396 switch (exid->spa_how) { 3351 switch (exid->spa_how) {
3397 case SP4_NONE: 3352 case SP4_NONE:
3398 break; 3353 break;
3399 case SP4_MACH_CRED: 3354 case SP4_MACH_CRED:
3355 /* spo_must_enforce, spo_must_allow */
3356 RESERVE_SPACE(16);
3357
3400 /* spo_must_enforce bitmap: */ 3358 /* spo_must_enforce bitmap: */
3401 WRITE32(2); 3359 WRITE32(2);
3402 WRITE32(nfs4_minimal_spo_must_enforce[0]); 3360 WRITE32(nfs4_minimal_spo_must_enforce[0]);
3403 WRITE32(nfs4_minimal_spo_must_enforce[1]); 3361 WRITE32(nfs4_minimal_spo_must_enforce[1]);
3404 /* empty spo_must_allow bitmap: */ 3362 /* empty spo_must_allow bitmap: */
3405 WRITE32(0); 3363 WRITE32(0);
3364
3365 ADJUST_ARGS();
3406 break; 3366 break;
3407 default: 3367 default:
3408 WARN_ON_ONCE(1); 3368 WARN_ON_ONCE(1);
3409 } 3369 }
3410 3370
3371 RESERVE_SPACE(
3372 8 /* so_minor_id */ +
3373 4 /* so_major_id.len */ +
3374 (XDR_QUADLEN(major_id_sz) * 4) +
3375 4 /* eir_server_scope.len */ +
3376 (XDR_QUADLEN(server_scope_sz) * 4) +
3377 4 /* eir_server_impl_id.count (0) */);
3378
3411 /* The server_owner struct */ 3379 /* The server_owner struct */
3412 WRITE64(minor_id); /* Minor id */ 3380 WRITE64(minor_id); /* Minor id */
3413 /* major id */ 3381 /* major id */
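
The EXCHANGE_ID encoder now reserves buffer space in smaller pieces, committing each with ADJUST_ARGS() before the next reservation, so the SP4_MACH_CRED branch only pays for the bitmap words it actually writes. A minimal sketch of that reserve/write/commit discipline, with buf_t and the helpers as illustrative stand-ins for the RESERVE_SPACE/WRITE32/ADJUST_ARGS macros:

#include <assert.h>
#include <stdint.h>

typedef struct {
	uint32_t *p;	/* committed write cursor */
	uint32_t *end;
} buf_t;

static uint32_t *reserve(buf_t *b, unsigned int nbytes)
{
	assert(b->p + nbytes / 4 <= b->end);	/* the kernel errors out here */
	return b->p;
}

static void commit(buf_t *b, uint32_t *p)
{
	b->p = p;	/* like ADJUST_ARGS(): advance past the writes */
}

static void encode_sketch(buf_t *b, int mach_cred)
{
	uint32_t *p = reserve(b, 8);

	*p++ = 1; *p++ = 2;	/* fixed-size leading fields */
	commit(b, p);
	if (mach_cred) {	/* optional part gets its own reservation */
		p = reserve(b, 16);
		*p++ = 2; *p++ = 0; *p++ = 0; *p++ = 0;
		commit(b, p);
	}
}
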
@@ -3474,28 +3442,6 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
3474} 3442}
3475 3443
3476static __be32 3444static __be32
3477nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr,
3478 struct nfsd4_destroy_session *destroy_session)
3479{
3480 return nfserr;
3481}
3482
3483static __be32
3484nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3485 struct nfsd4_free_stateid *free_stateid)
3486{
3487 __be32 *p;
3488
3489 if (nfserr)
3490 return nfserr;
3491
3492 RESERVE_SPACE(4);
3493 *p++ = nfserr;
3494 ADJUST_ARGS();
3495 return nfserr;
3496}
3497
3498static __be32
3499nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, 3445nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
3500 struct nfsd4_sequence *seq) 3446 struct nfsd4_sequence *seq)
3501{ 3447{
@@ -3593,8 +3539,8 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3593 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session, 3539 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
3594 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, 3540 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3595 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, 3541 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3596 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, 3542 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3597 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_free_stateid, 3543 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3598 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, 3544 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3599 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, 3545 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3600 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, 3546 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index b6af150c96b8..f8f060ffbf4f 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -132,13 +132,6 @@ nfsd_reply_cache_alloc(void)
132} 132}
133 133
134static void 134static void
135nfsd_reply_cache_unhash(struct svc_cacherep *rp)
136{
137 hlist_del_init(&rp->c_hash);
138 list_del_init(&rp->c_lru);
139}
140
141static void
142nfsd_reply_cache_free_locked(struct svc_cacherep *rp) 135nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
143{ 136{
144 if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { 137 if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
@@ -416,22 +409,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
416 409
417 /* 410 /*
418 * Since the common case is a cache miss followed by an insert, 411 * Since the common case is a cache miss followed by an insert,
419 * preallocate an entry. First, try to reuse the first entry on the LRU 412 * preallocate an entry.
420 * if it works, then go ahead and prune the LRU list.
421 */ 413 */
422 spin_lock(&cache_lock);
423 if (!list_empty(&lru_head)) {
424 rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru);
425 if (nfsd_cache_entry_expired(rp) ||
426 num_drc_entries >= max_drc_entries) {
427 nfsd_reply_cache_unhash(rp);
428 prune_cache_entries();
429 goto search_cache;
430 }
431 }
432
433 /* No expired ones available, allocate a new one. */
434 spin_unlock(&cache_lock);
435 rp = nfsd_reply_cache_alloc(); 414 rp = nfsd_reply_cache_alloc();
436 spin_lock(&cache_lock); 415 spin_lock(&cache_lock);
437 if (likely(rp)) { 416 if (likely(rp)) {
@@ -439,7 +418,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
439 drc_mem_usage += sizeof(*rp); 418 drc_mem_usage += sizeof(*rp);
440 } 419 }
441 420
442search_cache: 421 /* go ahead and prune the cache */
422 prune_cache_entries();
423
443 found = nfsd_cache_search(rqstp, csum); 424 found = nfsd_cache_search(rqstp, csum);
444 if (found) { 425 if (found) {
445 if (likely(rp)) 426 if (likely(rp))
@@ -453,15 +434,6 @@ search_cache:
453 goto out; 434 goto out;
454 } 435 }
455 436
456 /*
457 * We're keeping the one we just allocated. Are we now over the
458 * limit? Prune one off the tip of the LRU in trade for the one we
459 * just allocated if so.
460 */
461 if (num_drc_entries >= max_drc_entries)
462 nfsd_reply_cache_free_locked(list_first_entry(&lru_head,
463 struct svc_cacherep, c_lru));
464
465 nfsdstats.rcmisses++; 437 nfsdstats.rcmisses++;
466 rqstp->rq_cacherep = rp; 438 rqstp->rq_cacherep = rp;
467 rp->c_state = RC_INPROG; 439 rp->c_state = RC_INPROG;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 760c85a6f534..9a4a5f9e7468 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -241,6 +241,15 @@ static void nfsd_shutdown_generic(void)
241 nfsd_racache_shutdown(); 241 nfsd_racache_shutdown();
242} 242}
243 243
244static bool nfsd_needs_lockd(void)
245{
246#if defined(CONFIG_NFSD_V3)
247 return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL);
248#else
249 return (nfsd_versions[2] != NULL);
250#endif
251}
252
244static int nfsd_startup_net(int nrservs, struct net *net) 253static int nfsd_startup_net(int nrservs, struct net *net)
245{ 254{
246 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 255 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -255,9 +264,14 @@ static int nfsd_startup_net(int nrservs, struct net *net)
255 ret = nfsd_init_socks(net); 264 ret = nfsd_init_socks(net);
256 if (ret) 265 if (ret)
257 goto out_socks; 266 goto out_socks;
258 ret = lockd_up(net); 267
259 if (ret) 268 if (nfsd_needs_lockd() && !nn->lockd_up) {
260 goto out_socks; 269 ret = lockd_up(net);
270 if (ret)
271 goto out_socks;
272 nn->lockd_up = 1;
273 }
274
261 ret = nfs4_state_start_net(net); 275 ret = nfs4_state_start_net(net);
262 if (ret) 276 if (ret)
263 goto out_lockd; 277 goto out_lockd;
@@ -266,7 +280,10 @@ static int nfsd_startup_net(int nrservs, struct net *net)
266 return 0; 280 return 0;
267 281
268out_lockd: 282out_lockd:
269 lockd_down(net); 283 if (nn->lockd_up) {
284 lockd_down(net);
285 nn->lockd_up = 0;
286 }
270out_socks: 287out_socks:
271 nfsd_shutdown_generic(); 288 nfsd_shutdown_generic();
272 return ret; 289 return ret;
@@ -277,7 +294,10 @@ static void nfsd_shutdown_net(struct net *net)
277 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 294 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
278 295
279 nfs4_state_shutdown_net(net); 296 nfs4_state_shutdown_net(net);
280 lockd_down(net); 297 if (nn->lockd_up) {
298 lockd_down(net);
299 nn->lockd_up = 0;
300 }
281 nn->nfsd_net_up = false; 301 nn->nfsd_net_up = false;
282 nfsd_shutdown_generic(); 302 nfsd_shutdown_generic();
283} 303}
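
The nfssvc.c hunks make lockd startup conditional and tracked per net namespace: lockd is only needed when v2/v3 are served, and the nn->lockd_up flag keeps lockd_up()/lockd_down() calls balanced across startup failure and shutdown. A compilable sketch of the guard, with nn_t and the stubs standing in for struct nfsd_net and the real lockd calls:

typedef struct { int lockd_up; } nn_t;	/* stand-in for struct nfsd_net */

static int lockd_up_stub(void) { return 0; }
static void lockd_down_stub(void) { }

static int start_lockd_if_needed(nn_t *nn, int needs_lockd)
{
	int ret;

	if (!needs_lockd || nn->lockd_up)
		return 0;	/* NFSv4-only server, or already running */
	ret = lockd_up_stub();
	if (ret)
		return ret;
	nn->lockd_up = 1;
	return 0;
}

static void stop_lockd_if_started(nn_t *nn)
{
	if (nn->lockd_up) {
		lockd_down_stub();
		nn->lockd_up = 0;
	}
}
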
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 9c769a47ac5a..b17d93214d01 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -152,7 +152,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
152 type = (stat->mode & S_IFMT); 152 type = (stat->mode & S_IFMT);
153 153
154 *p++ = htonl(nfs_ftypes[type >> 12]); 154 *p++ = htonl(nfs_ftypes[type >> 12]);
155 *p++ = htonl((u32) stat->mode); 155 *p++ = htonl((u32) (stat->mode & S_IALLUGO));
156 *p++ = htonl((u32) stat->nlink); 156 *p++ = htonl((u32) stat->nlink);
157 *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); 157 *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
158 *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); 158 *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
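
The masking is safe because the NFSv2 fattr already carries the file type in the preceding ftype word; the mode word now reports only the permission and set-id/sticky bits (S_IALLUGO, i.e. 07777). A small demonstration of the split:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	mode_t mode = S_IFREG | S_ISUID | 0644;	/* 0104644 */

	printf("ftype bits: %o\n", (unsigned)(mode & S_IFMT));	/* 100000 */
	printf("mode word:  %o\n", (unsigned)(mode & 07777));	/* 4644 */
	return 0;
}
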
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7eea63cada1d..6d7be3f80356 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -207,7 +207,12 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
207 goto out_nfserr; 207 goto out_nfserr;
208 } 208 }
209 } else { 209 } else {
210 fh_lock(fhp); 210 /*
211 * In the nfsd4_open() case, this may be held across
212 * subsequent open and delegation acquisition which may
213 * need to take the child's i_mutex:
214 */
215 fh_lock_nested(fhp, I_MUTEX_PARENT);
211 dentry = lookup_one_len(name, dparent, len); 216 dentry = lookup_one_len(name, dparent, len);
212 host_err = PTR_ERR(dentry); 217 host_err = PTR_ERR(dentry);
213 if (IS_ERR(dentry)) 218 if (IS_ERR(dentry))
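
fh_lock_nested() with I_MUTEX_PARENT is a lockdep annotation rather than a behavioral change: it tells the lock validator that the parent directory's i_mutex is taken in the parent subclass, so a child inode's mutex may legitimately nest inside it later in the open path. The underlying primitive is roughly the following (kernel context assumed, not a standalone program):

#include <linux/fs.h>
#include <linux/mutex.h>

static void lock_dir_as_parent(struct inode *dir)
{
	/* I_MUTEX_PARENT marks this acquisition as the outer level of a
	 * parent->child i_mutex nesting, silencing false lockdep reports. */
	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
}
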
@@ -273,13 +278,6 @@ out:
273 return err; 278 return err;
274} 279}
275 280
276static int nfsd_break_lease(struct inode *inode)
277{
278 if (!S_ISREG(inode->i_mode))
279 return 0;
280 return break_lease(inode, O_WRONLY | O_NONBLOCK);
281}
282
283/* 281/*
284 * Commit metadata changes to stable storage. 282 * Commit metadata changes to stable storage.
285 */ 283 */
@@ -348,8 +346,7 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
348 346
349 /* Revoke setuid/setgid on chown */ 347 /* Revoke setuid/setgid on chown */
350 if (!S_ISDIR(inode->i_mode) && 348 if (!S_ISDIR(inode->i_mode) &&
351 (((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) || 349 ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) {
352 ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid)))) {
353 iap->ia_valid |= ATTR_KILL_PRIV; 350 iap->ia_valid |= ATTR_KILL_PRIV;
354 if (iap->ia_valid & ATTR_MODE) { 351 if (iap->ia_valid & ATTR_MODE) {
355 /* we're setting mode too, just clear the s*id bits */ 352 /* we're setting mode too, just clear the s*id bits */
@@ -449,16 +446,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
449 goto out_put_write_access; 446 goto out_put_write_access;
450 } 447 }
451 448
452 host_err = nfsd_break_lease(inode);
453 if (host_err)
454 goto out_put_write_access_nfserror;
455
456 fh_lock(fhp); 449 fh_lock(fhp);
457 host_err = notify_change(dentry, iap, NULL); 450 host_err = notify_change(dentry, iap, NULL);
458 fh_unlock(fhp); 451 fh_unlock(fhp);
459
460out_put_write_access_nfserror:
461 err = nfserrno(host_err); 452 err = nfserrno(host_err);
453
462out_put_write_access: 454out_put_write_access:
463 if (size_change) 455 if (size_change)
464 put_write_access(inode); 456 put_write_access(inode);
@@ -468,158 +460,7 @@ out:
468 return err; 460 return err;
469} 461}
470 462
471#if defined(CONFIG_NFSD_V2_ACL) || \
472 defined(CONFIG_NFSD_V3_ACL) || \
473 defined(CONFIG_NFSD_V4)
474static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
475{
476 ssize_t buflen;
477 ssize_t ret;
478
479 buflen = vfs_getxattr(dentry, key, NULL, 0);
480 if (buflen <= 0)
481 return buflen;
482
483 *buf = kmalloc(buflen, GFP_KERNEL);
484 if (!*buf)
485 return -ENOMEM;
486
487 ret = vfs_getxattr(dentry, key, *buf, buflen);
488 if (ret < 0)
489 kfree(*buf);
490 return ret;
491}
492#endif
493
494#if defined(CONFIG_NFSD_V4) 463#if defined(CONFIG_NFSD_V4)
495static int
496set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
497{
498 int len;
499 size_t buflen;
500 char *buf = NULL;
501 int error = 0;
502
503 buflen = posix_acl_xattr_size(pacl->a_count);
504 buf = kmalloc(buflen, GFP_KERNEL);
505 error = -ENOMEM;
506 if (buf == NULL)
507 goto out;
508
509 len = posix_acl_to_xattr(&init_user_ns, pacl, buf, buflen);
510 if (len < 0) {
511 error = len;
512 goto out;
513 }
514
515 error = vfs_setxattr(dentry, key, buf, len, 0);
516out:
517 kfree(buf);
518 return error;
519}
520
521__be32
522nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
523 struct nfs4_acl *acl)
524{
525 __be32 error;
526 int host_error;
527 struct dentry *dentry;
528 struct inode *inode;
529 struct posix_acl *pacl = NULL, *dpacl = NULL;
530 unsigned int flags = 0;
531
532 /* Get inode */
533 error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
534 if (error)
535 return error;
536
537 dentry = fhp->fh_dentry;
538 inode = dentry->d_inode;
539 if (S_ISDIR(inode->i_mode))
540 flags = NFS4_ACL_DIR;
541
542 host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
543 if (host_error == -EINVAL) {
544 return nfserr_attrnotsupp;
545 } else if (host_error < 0)
546 goto out_nfserr;
547
548 host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
549 if (host_error < 0)
550 goto out_release;
551
552 if (S_ISDIR(inode->i_mode))
553 host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
554
555out_release:
556 posix_acl_release(pacl);
557 posix_acl_release(dpacl);
558out_nfserr:
559 if (host_error == -EOPNOTSUPP)
560 return nfserr_attrnotsupp;
561 else
562 return nfserrno(host_error);
563}
564
565static struct posix_acl *
566_get_posix_acl(struct dentry *dentry, char *key)
567{
568 void *buf = NULL;
569 struct posix_acl *pacl = NULL;
570 int buflen;
571
572 buflen = nfsd_getxattr(dentry, key, &buf);
573 if (!buflen)
574 buflen = -ENODATA;
575 if (buflen <= 0)
576 return ERR_PTR(buflen);
577
578 pacl = posix_acl_from_xattr(&init_user_ns, buf, buflen);
579 kfree(buf);
580 return pacl;
581}
582
583int
584nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl)
585{
586 struct inode *inode = dentry->d_inode;
587 int error = 0;
588 struct posix_acl *pacl = NULL, *dpacl = NULL;
589 unsigned int flags = 0;
590
591 pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS);
592 if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA)
593 pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
594 if (IS_ERR(pacl)) {
595 error = PTR_ERR(pacl);
596 pacl = NULL;
597 goto out;
598 }
599
600 if (S_ISDIR(inode->i_mode)) {
601 dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT);
602 if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA)
603 dpacl = NULL;
604 else if (IS_ERR(dpacl)) {
605 error = PTR_ERR(dpacl);
606 dpacl = NULL;
607 goto out;
608 }
609 flags = NFS4_ACL_DIR;
610 }
611
612 *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags);
613 if (IS_ERR(*acl)) {
614 error = PTR_ERR(*acl);
615 *acl = NULL;
616 }
617 out:
618 posix_acl_release(pacl);
619 posix_acl_release(dpacl);
620 return error;
621}
622
623/* 464/*
624 * NFS junction information is stored in an extended attribute. 465 * NFS junction information is stored in an extended attribute.
625 */ 466 */
@@ -1760,11 +1601,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1760 err = nfserr_noent; 1601 err = nfserr_noent;
1761 if (!dold->d_inode) 1602 if (!dold->d_inode)
1762 goto out_dput; 1603 goto out_dput;
1763 host_err = nfsd_break_lease(dold->d_inode);
1764 if (host_err) {
1765 err = nfserrno(host_err);
1766 goto out_dput;
1767 }
1768 host_err = vfs_link(dold, dirp, dnew, NULL); 1604 host_err = vfs_link(dold, dirp, dnew, NULL);
1769 if (!host_err) { 1605 if (!host_err) {
1770 err = nfserrno(commit_metadata(ffhp)); 1606 err = nfserrno(commit_metadata(ffhp));
@@ -1858,14 +1694,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1858 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) 1694 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
1859 goto out_dput_new; 1695 goto out_dput_new;
1860 1696
1861 host_err = nfsd_break_lease(odentry->d_inode);
1862 if (host_err)
1863 goto out_dput_new;
1864 if (ndentry->d_inode) {
1865 host_err = nfsd_break_lease(ndentry->d_inode);
1866 if (host_err)
1867 goto out_dput_new;
1868 }
1869 host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); 1697 host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL);
1870 if (!host_err) { 1698 if (!host_err) {
1871 host_err = commit_metadata(tfhp); 1699 host_err = commit_metadata(tfhp);
@@ -1935,16 +1763,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1935 if (!type) 1763 if (!type)
1936 type = rdentry->d_inode->i_mode & S_IFMT; 1764 type = rdentry->d_inode->i_mode & S_IFMT;
1937 1765
1938 host_err = nfsd_break_lease(rdentry->d_inode);
1939 if (host_err)
1940 goto out_put;
1941 if (type != S_IFDIR) 1766 if (type != S_IFDIR)
1942 host_err = vfs_unlink(dirp, rdentry, NULL); 1767 host_err = vfs_unlink(dirp, rdentry, NULL);
1943 else 1768 else
1944 host_err = vfs_rmdir(dirp, rdentry); 1769 host_err = vfs_rmdir(dirp, rdentry);
1945 if (!host_err) 1770 if (!host_err)
1946 host_err = commit_metadata(fhp); 1771 host_err = commit_metadata(fhp);
1947out_put:
1948 dput(rdentry); 1772 dput(rdentry);
1949 1773
1950out_nfserr: 1774out_nfserr:
@@ -2284,93 +2108,3 @@ out_nomem:
2284 nfsd_racache_shutdown(); 2108 nfsd_racache_shutdown();
2285 return -ENOMEM; 2109 return -ENOMEM;
2286} 2110}
2287
2288#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
2289struct posix_acl *
2290nfsd_get_posix_acl(struct svc_fh *fhp, int type)
2291{
2292 struct inode *inode = fhp->fh_dentry->d_inode;
2293 char *name;
2294 void *value = NULL;
2295 ssize_t size;
2296 struct posix_acl *acl;
2297
2298 if (!IS_POSIXACL(inode))
2299 return ERR_PTR(-EOPNOTSUPP);
2300
2301 switch (type) {
2302 case ACL_TYPE_ACCESS:
2303 name = POSIX_ACL_XATTR_ACCESS;
2304 break;
2305 case ACL_TYPE_DEFAULT:
2306 name = POSIX_ACL_XATTR_DEFAULT;
2307 break;
2308 default:
2309 return ERR_PTR(-EOPNOTSUPP);
2310 }
2311
2312 size = nfsd_getxattr(fhp->fh_dentry, name, &value);
2313 if (size < 0)
2314 return ERR_PTR(size);
2315
2316 acl = posix_acl_from_xattr(&init_user_ns, value, size);
2317 kfree(value);
2318 return acl;
2319}
2320
2321int
2322nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
2323{
2324 struct inode *inode = fhp->fh_dentry->d_inode;
2325 char *name;
2326 void *value = NULL;
2327 size_t size;
2328 int error;
2329
2330 if (!IS_POSIXACL(inode) ||
2331 !inode->i_op->setxattr || !inode->i_op->removexattr)
2332 return -EOPNOTSUPP;
2333 switch(type) {
2334 case ACL_TYPE_ACCESS:
2335 name = POSIX_ACL_XATTR_ACCESS;
2336 break;
2337 case ACL_TYPE_DEFAULT:
2338 name = POSIX_ACL_XATTR_DEFAULT;
2339 break;
2340 default:
2341 return -EOPNOTSUPP;
2342 }
2343
2344 if (acl && acl->a_count) {
2345 size = posix_acl_xattr_size(acl->a_count);
2346 value = kmalloc(size, GFP_KERNEL);
2347 if (!value)
2348 return -ENOMEM;
2349 error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
2350 if (error < 0)
2351 goto getout;
2352 size = error;
2353 } else
2354 size = 0;
2355
2356 error = fh_want_write(fhp);
2357 if (error)
2358 goto getout;
2359 if (size)
2360 error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
2361 else {
2362 if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT)
2363 error = 0;
2364 else {
2365 error = vfs_removexattr(fhp->fh_dentry, name);
2366 if (error == -ENODATA)
2367 error = 0;
2368 }
2369 }
2370 fh_drop_write(fhp);
2371
2372getout:
2373 kfree(value);
2374 return error;
2375}
2376#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index a4be2e389670..fbe90bdb2214 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -52,9 +52,6 @@ __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
52 struct iattr *, int, time_t); 52 struct iattr *, int, time_t);
53int nfsd_mountpoint(struct dentry *, struct svc_export *); 53int nfsd_mountpoint(struct dentry *, struct svc_export *);
54#ifdef CONFIG_NFSD_V4 54#ifdef CONFIG_NFSD_V4
55__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
56 struct nfs4_acl *);
57int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
58__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, 55__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
59 struct xdr_netobj *); 56 struct xdr_netobj *);
60#endif /* CONFIG_NFSD_V4 */ 57#endif /* CONFIG_NFSD_V4 */
@@ -89,8 +86,6 @@ __be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
89__be32 nfsd_rename(struct svc_rqst *, 86__be32 nfsd_rename(struct svc_rqst *,
90 struct svc_fh *, char *, int, 87 struct svc_fh *, char *, int,
91 struct svc_fh *, char *, int); 88 struct svc_fh *, char *, int);
92__be32 nfsd_remove(struct svc_rqst *,
93 struct svc_fh *, char *, int);
94__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, 89__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
95 char *name, int len); 90 char *name, int len);
96__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, 91__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
@@ -101,11 +96,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
101__be32 nfsd_permission(struct svc_rqst *, struct svc_export *, 96__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
102 struct dentry *, int); 97 struct dentry *, int);
103 98
104#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
105struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
106int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
107#endif
108
109static inline int fh_want_write(struct svc_fh *fh) 99static inline int fh_want_write(struct svc_fh *fh)
110{ 100{
111 int ret = mnt_want_write(fh->fh_export->ex_path.mnt); 101 int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index b6d5542a4ac8..335e04aaf7db 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -174,6 +174,9 @@ struct nfsd3_linkres {
174struct nfsd3_readdirres { 174struct nfsd3_readdirres {
175 __be32 status; 175 __be32 status;
176 struct svc_fh fh; 176 struct svc_fh fh;
177 /* Just to save kmalloc on every readdirplus entry (svc_fh is a
178 * little large for the stack): */
179 struct svc_fh scratch;
177 int count; 180 int count;
178 __be32 verf[2]; 181 __be32 verf[2];
179 182
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index b3ed6446ed8e..d278a0d03496 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -228,7 +228,7 @@ struct nfsd4_open {
228 u32 op_create; /* request */ 228 u32 op_create; /* request */
229 u32 op_createmode; /* request */ 229 u32 op_createmode; /* request */
230 u32 op_bmval[3]; /* request */ 230 u32 op_bmval[3]; /* request */
231 struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */ 231 struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */
232 nfs4_verifier op_verf __attribute__((aligned(32))); 232 nfs4_verifier op_verf __attribute__((aligned(32)));
233 /* EXCLUSIVE4 */ 233 /* EXCLUSIVE4 */
234 clientid_t op_clientid; /* request */ 234 clientid_t op_clientid; /* request */
@@ -250,7 +250,6 @@ struct nfsd4_open {
250 struct nfs4_acl *op_acl; 250 struct nfs4_acl *op_acl;
251 struct xdr_netobj op_label; 251 struct xdr_netobj op_label;
252}; 252};
253#define op_iattr iattr
254 253
255struct nfsd4_open_confirm { 254struct nfsd4_open_confirm {
256 stateid_t oc_req_stateid /* request */; 255 stateid_t oc_req_stateid /* request */;
@@ -374,7 +373,6 @@ struct nfsd4_test_stateid {
374 373
375struct nfsd4_free_stateid { 374struct nfsd4_free_stateid {
376 stateid_t fr_stateid; /* request */ 375 stateid_t fr_stateid; /* request */
377 __be32 fr_status; /* response */
378}; 376};
379 377
380/* also used for NVERIFY */ 378/* also used for NVERIFY */
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index b44bdb291b84..2b34021948e4 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -37,7 +37,26 @@
37#include "sufile.h" 37#include "sufile.h"
38#include "dat.h" 38#include "dat.h"
39 39
40 40/**
41 * nilfs_ioctl_wrap_copy - wrapper function for getting/setting metadata info
42 * @nilfs: nilfs object
43 * @argv: vector of arguments from userspace
44 * @dir: set of direction flags
45 * @dofunc: concrete function of get/set metadata info
46 *
47 * Description: nilfs_ioctl_wrap_copy() gets/sets metadata info by
48 * calling dofunc() according to the @argv argument.
49 *
50 * Return Value: On success, 0 is returned and requested metadata info
51 * is copied into userspace. On error, one of the following
52 * negative error codes is returned.
53 *
54 * %-EINVAL - Invalid arguments from userspace.
55 *
56 * %-ENOMEM - Insufficient amount of memory available.
57 *
58 * %-EFAULT - Failure during execution of requested operation.
59 */
41static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, 60static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
42 struct nilfs_argv *argv, int dir, 61 struct nilfs_argv *argv, int dir,
43 ssize_t (*dofunc)(struct the_nilfs *, 62 ssize_t (*dofunc)(struct the_nilfs *,
@@ -57,6 +76,14 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
57 if (argv->v_size > PAGE_SIZE) 76 if (argv->v_size > PAGE_SIZE)
58 return -EINVAL; 77 return -EINVAL;
59 78
79 /*
80 * Reject pairs of a start item position (argv->v_index) and a
81 * total count (argv->v_nmembs) which leads position 'pos' to
82 * overflow by the increment at the end of the loop.
83 */
84 if (argv->v_index > ~(__u64)0 - argv->v_nmembs)
85 return -EINVAL;
86
60 buf = (void *)__get_free_pages(GFP_NOFS, 0); 87 buf = (void *)__get_free_pages(GFP_NOFS, 0);
61 if (unlikely(!buf)) 88 if (unlikely(!buf))
62 return -ENOMEM; 89 return -ENOMEM;
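
The new check rejects (v_index, v_nmembs) pairs whose sum would wrap a 64-bit position: pos starts at v_index and is incremented once per member, and v_index > ~(__u64)0 - v_nmembs is exactly the condition under which v_index + v_nmembs overflows. A standalone illustration:

#include <stdint.h>
#include <stdio.h>

static int would_wrap(uint64_t v_index, uint64_t v_nmembs)
{
	return v_index > UINT64_MAX - v_nmembs;
}

int main(void)
{
	printf("%d\n", would_wrap(UINT64_MAX - 1, 1));	/* 0: fits exactly */
	printf("%d\n", would_wrap(UINT64_MAX - 1, 2));	/* 1: rejected */
	return 0;
}
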
@@ -99,6 +126,9 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
99 return ret; 126 return ret;
100} 127}
101 128
129/**
130 * nilfs_ioctl_getflags - ioctl to support lsattr
131 */
102static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) 132static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
103{ 133{
104 unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE; 134 unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE;
@@ -106,6 +136,9 @@ static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
106 return put_user(flags, (int __user *)argp); 136 return put_user(flags, (int __user *)argp);
107} 137}
108 138
139/**
140 * nilfs_ioctl_setflags - ioctl to support chattr
141 */
109static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, 142static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
110 void __user *argp) 143 void __user *argp)
111{ 144{
@@ -158,11 +191,33 @@ out:
158 return ret; 191 return ret;
159} 192}
160 193
194/**
195 * nilfs_ioctl_getversion - get info about a file's version (generation number)
196 */
161static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp) 197static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp)
162{ 198{
163 return put_user(inode->i_generation, (int __user *)argp); 199 return put_user(inode->i_generation, (int __user *)argp);
164} 200}
165 201
202/**
203 * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot)
204 * @inode: inode object
205 * @filp: file object
206 * @cmd: ioctl's request code
207 * @argp: pointer to argument from userspace
208 *
209 * Description: nilfs_ioctl_change_cpmode() changes the mode of the
210 * given checkpoint between the checkpoint and snapshot states. This
211 * ioctl is used by the chcp and mkcp utilities.
212 *
213 * Return Value: On success, 0 is returned and mode of a checkpoint is
214 * changed. On error, one of the following negative error codes
215 * is returned.
216 *
217 * %-EPERM - Operation not permitted.
218 *
219 * %-EFAULT - Failure while changing the checkpoint mode.
220 */
166static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, 221static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
167 unsigned int cmd, void __user *argp) 222 unsigned int cmd, void __user *argp)
168{ 223{
@@ -198,6 +253,25 @@ out:
198 return ret; 253 return ret;
199} 254}
200 255
256/**
257 * nilfs_ioctl_delete_checkpoint - remove checkpoint
258 * @inode: inode object
259 * @filp: file object
260 * @cmd: ioctl's request code
261 * @argp: pointer to argument from userspace
262 *
263 * Description: nilfs_ioctl_delete_checkpoint() removes a checkpoint
264 * from the NILFS2 file system. This ioctl is used by the rmcp
265 * utility.
266 *
267 * Return Value: On success, 0 is returned and a checkpoint is
268 * removed. On error, one of the following negative error codes
269 * is returned.
270 *
271 * %-EPERM - Operation not permitted.
272 *
273 * %-EFAULT - Failure while removing the checkpoint.
274 */
201static int 275static int
202nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, 276nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
203 unsigned int cmd, void __user *argp) 277 unsigned int cmd, void __user *argp)
@@ -229,6 +303,21 @@ out:
229 return ret; 303 return ret;
230} 304}
231 305
306/**
307 * nilfs_ioctl_do_get_cpinfo - callback method getting info about checkpoints
308 * @nilfs: nilfs object
309 * @posp: pointer to array of checkpoint numbers
310 * @flags: checkpoint mode (checkpoint or snapshot)
311 * @buf: buffer for storing checkpoints' info
312 * @size: size in bytes of one checkpoint info item in array
313 * @nmembs: number of checkpoints in array (numbers and infos)
314 *
315 * Description: nilfs_ioctl_do_get_cpinfo() function returns info about
316 * requested checkpoints. The NILFS_IOCTL_GET_CPINFO ioctl is used in
317 * lscp utility and by nilfs_cleanerd daemon.
318 *
319 * Return value: count of nilfs_cpinfo structures in output buffer.
320 */
232static ssize_t 321static ssize_t
233nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, 322nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
234 void *buf, size_t size, size_t nmembs) 323 void *buf, size_t size, size_t nmembs)
@@ -242,6 +331,27 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
242 return ret; 331 return ret;
243} 332}
244 333
334/**
335 * nilfs_ioctl_get_cpstat - get checkpoints statistics
336 * @inode: inode object
337 * @filp: file object
338 * @cmd: ioctl's request code
339 * @argp: pointer to argument from userspace
340 *
341 * Description: nilfs_ioctl_get_cpstat() returns information about checkpoints.
342 * The NILFS_IOCTL_GET_CPSTAT ioctl is used by lscp, rmcp utilities
343 * and by nilfs_cleanerd daemon.
344 *
345 * Return Value: On success, 0 is returned, and checkpoints information is
346 * copied into userspace pointer @argp. On error, one of the following
347 * negative error codes is returned.
348 *
349 * %-EIO - I/O error.
350 *
351 * %-ENOMEM - Insufficient amount of memory available.
352 *
353 * %-EFAULT - Failure while getting checkpoint statistics.
354 */
245static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, 355static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
246 unsigned int cmd, void __user *argp) 356 unsigned int cmd, void __user *argp)
247{ 357{
@@ -260,6 +370,21 @@ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
260 return ret; 370 return ret;
261} 371}
262 372
373/**
374 * nilfs_ioctl_do_get_suinfo - callback method getting segment usage info
375 * @nilfs: nilfs object
376 * @posp: pointer to array of segment numbers
377 * @flags: *not used*
378 * @buf: buffer for storing suinfo array
379 * @size: size in bytes of one suinfo item in array
380 * @nmembs: count of segment numbers and suinfos in array
381 *
382 * Description: nilfs_ioctl_do_get_suinfo() function returns segment usage
383 * info about requested segments. The NILFS_IOCTL_GET_SUINFO ioctl is used
384 * in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon.
385 *
386 * Return value: count of nilfs_suinfo structures in output buffer.
387 */
263static ssize_t 388static ssize_t
264nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, 389nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
265 void *buf, size_t size, size_t nmembs) 390 void *buf, size_t size, size_t nmembs)
@@ -273,6 +398,27 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
273 return ret; 398 return ret;
274} 399}
275 400
401/**
402 * nilfs_ioctl_get_sustat - get segment usage statistics
403 * @inode: inode object
404 * @filp: file object
405 * @cmd: ioctl's request code
405 * @argp: pointer to argument from userspace
407 *
408 * Description: nilfs_ioctl_get_sustat() returns segment usage statistics.
409 * The NILFS_IOCTL_GET_SUSTAT ioctl is used in lssu, nilfs_resize utilities
410 * and by nilfs_cleanerd daemon.
411 *
412 * Return Value: On success, 0 is returned, and segment usage information is
413 * copied into userspace pointer @argp. On error, one of the following
414 * negative error codes is returned.
415 *
416 * %-EIO - I/O error.
417 *
418 * %-ENOMEM - Insufficient amount of memory available.
419 *
420 * %-EFAULT - Failure while getting segment usage statistics.
421 */
276static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, 422static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
277 unsigned int cmd, void __user *argp) 423 unsigned int cmd, void __user *argp)
278{ 424{
@@ -291,6 +437,21 @@ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
291 return ret; 437 return ret;
292} 438}
293 439
440/**
441 * nilfs_ioctl_do_get_vinfo - callback method getting virtual blocks info
442 * @nilfs: nilfs object
443 * @posp: *not used*
444 * @flags: *not used*
445 * @buf: buffer for storing array of nilfs_vinfo structures
446 * @size: size in bytes of one vinfo item in array
447 * @nmembs: count of vinfos in array
448 *
449 * Description: nilfs_ioctl_do_get_vinfo() function returns information
450 * on virtual block addresses. The NILFS_IOCTL_GET_VINFO ioctl is used
451 * by nilfs_cleanerd daemon.
452 *
453 * Return value: count of nilfs_vinfo structures in output buffer.
454 */
294static ssize_t 455static ssize_t
295nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, 456nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
296 void *buf, size_t size, size_t nmembs) 457 void *buf, size_t size, size_t nmembs)
@@ -303,6 +464,21 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
303 return ret; 464 return ret;
304} 465}
305 466
467/**
468 * nilfs_ioctl_do_get_bdescs - callback method getting disk block descriptors
469 * @nilfs: nilfs object
470 * @posp: *not used*
471 * @flags: *not used*
472 * @buf: buffer for storing array of nilfs_bdesc structures
473 * @size: size in bytes of one bdesc item in array
474 * @nmembs: count of bdescs in array
475 *
476 * Description: nilfs_ioctl_do_get_bdescs() function returns information
477 * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl
478 * is used by nilfs_cleanerd daemon.
479 *
480 * Return value: count of nilfs_bdescs structures in output buffer.
481 */
306static ssize_t 482static ssize_t
307nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, 483nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
308 void *buf, size_t size, size_t nmembs) 484 void *buf, size_t size, size_t nmembs)
@@ -329,6 +505,29 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
329 return nmembs; 505 return nmembs;
330} 506}
331 507
508/**
509 * nilfs_ioctl_get_bdescs - get disk block descriptors
510 * @inode: inode object
511 * @filp: file object
512 * @cmd: ioctl's request code
513 * @argp: pointer to argument from userspace
514 *
515 * Description: nilfs_ioctl_get_bdescs() returns information
516 * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl
517 * is used by nilfs_cleanerd daemon.
518 *
519 * Return Value: On success, 0 is returned, and disk block descriptors are
520 * copied into userspace pointer @argp. On error, one of the following
521 * negative error codes is returned.
522 *
523 * %-EINVAL - Invalid arguments from userspace.
524 *
525 * %-EIO - I/O error.
526 *
527 * %-ENOMEM - Insufficient amount of memory available.
528 *
529 * %-EFAULT - Failure while getting disk block descriptors.
530 */
332static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, 531static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
333 unsigned int cmd, void __user *argp) 532 unsigned int cmd, void __user *argp)
334{ 533{
@@ -352,6 +551,26 @@ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
352 return ret; 551 return ret;
353} 552}
354 553
554/**
555 * nilfs_ioctl_move_inode_block - prepare data/node block for moving by GC
556 * @inode: inode object
557 * @vdesc: descriptor of virtual block number
558 * @buffers: list of moving buffers
559 *
560 * Description: nilfs_ioctl_move_inode_block() registers a data/node
561 * buffer in the GC pagecache and submits a read request.
562 *
563 * Return Value: On success, 0 is returned. On error, one of the following
564 * negative error codes is returned.
565 *
566 * %-EIO - I/O error.
567 *
568 * %-ENOMEM - Insufficient amount of memory available.
569 *
570 * %-ENOENT - Requested block doesn't exist.
571 *
572 * %-EEXIST - Blocks conflict is detected.
573 */
355static int nilfs_ioctl_move_inode_block(struct inode *inode, 574static int nilfs_ioctl_move_inode_block(struct inode *inode,
356 struct nilfs_vdesc *vdesc, 575 struct nilfs_vdesc *vdesc,
357 struct list_head *buffers) 576 struct list_head *buffers)
@@ -397,6 +616,19 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
397 return 0; 616 return 0;
398} 617}
399 618
619/**
620 * nilfs_ioctl_move_blocks - move valid inode's blocks during garbage collection
621 * @sb: superblock object
622 * @argv: vector of arguments from userspace
623 * @buf: array of nilfs_vdesc structures
624 *
625 * Description: nilfs_ioctl_move_blocks() function reads valid data/node
626 * blocks that the garbage collector specified with the array of nilfs_vdesc
627 * structures and stores them in the page caches of GC inodes.
628 *
629 * Return Value: Number of processed nilfs_vdesc structures on success,
630 * or a negative error code on failure.
631 */
400static int nilfs_ioctl_move_blocks(struct super_block *sb, 632static int nilfs_ioctl_move_blocks(struct super_block *sb,
401 struct nilfs_argv *argv, void *buf) 633 struct nilfs_argv *argv, void *buf)
402{ 634{
@@ -462,6 +694,25 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
462 return ret; 694 return ret;
463} 695}
464 696
697/**
698 * nilfs_ioctl_delete_checkpoints - delete checkpoints
699 * @nilfs: nilfs object
700 * @argv: vector of arguments from userspace
701 * @buf: array of periods of checkpoints numbers
702 *
703 * Description: nilfs_ioctl_delete_checkpoints() function deletes checkpoints
704 * in the period from p_start to p_end, excluding p_end itself. Checkpoints
705 * which have already been deleted are ignored.
706 *
707 * Return Value: Number of processed nilfs_period structures on success, or
708 * one of the following negative error codes on failure.
709 *
710 * %-EIO - I/O error.
711 *
712 * %-ENOMEM - Insufficient amount of memory available.
713 *
714 * %-EINVAL - Invalid checkpoints.
715 */
465static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, 716static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
466 struct nilfs_argv *argv, void *buf) 717 struct nilfs_argv *argv, void *buf)
467{ 718{
@@ -479,6 +730,24 @@ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
479 return nmembs; 730 return nmembs;
480} 731}
481 732
733/**
734 * nilfs_ioctl_free_vblocknrs - free virtual block numbers
735 * @nilfs: nilfs object
736 * @argv: vector of arguments from userspace
737 * @buf: array of virtual block numbers
738 *
739 * Description: nilfs_ioctl_free_vblocknrs() function frees
740 * the virtual block numbers specified by @buf and @argv->v_nmembs.
741 *
742 * Return Value: Number of processed virtual block numbers on success, or
743 * one of the following negative error codes on failure.
744 *
745 * %-EIO - I/O error.
746 *
747 * %-ENOMEM - Insufficient amount of memory available.
748 *
749 * %-ENOENT - The virtual block numbers have not been allocated.
750 */
482static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, 751static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
483 struct nilfs_argv *argv, void *buf) 752 struct nilfs_argv *argv, void *buf)
484{ 753{
@@ -490,6 +759,24 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
490 return (ret < 0) ? ret : nmembs; 759 return (ret < 0) ? ret : nmembs;
491} 760}
492 761
762/**
763 * nilfs_ioctl_mark_blocks_dirty - mark blocks dirty
764 * @nilfs: nilfs object
765 * @argv: vector of arguments from userspace
766 * @buf: array of block descriptors
767 *
768 * Description: nilfs_ioctl_mark_blocks_dirty() function marks
769 * metadata file or data blocks as dirty.
770 *
771 * Return Value: Number of processed block descriptors on success, or
772 * one of the following negative error codes on failure.
773 *
774 * %-ENOMEM - Insufficient memory available.
775 *
776 * %-EIO - I/O error.
777 *
778 * %-ENOENT - The specified block does not exist (hole block).
779 */
493static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, 780static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
494 struct nilfs_argv *argv, void *buf) 781 struct nilfs_argv *argv, void *buf)
495{ 782{
@@ -571,6 +858,20 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
571 return ret; 858 return ret;
572} 859}
573 860
861/**
862 * nilfs_ioctl_clean_segments - clean segments
863 * @inode: inode object
864 * @filp: file object
865 * @cmd: ioctl's request code
866 * @argp: pointer to argument from userspace
867 *
868 * Description: nilfs_ioctl_clean_segments() function performs garbage
869 * collection with the parameters requested from userspace. The
870 * NILFS_IOCTL_CLEAN_SEGMENTS ioctl is used by the nilfs_cleanerd
871 * daemon.
872 *
873 * Return Value: 0 on success, or a negative error code on failure.
874 */
574static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, 875static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
575 unsigned int cmd, void __user *argp) 876 unsigned int cmd, void __user *argp)
576{ 877{
@@ -682,6 +983,33 @@ out:
682 return ret; 983 return ret;
683} 984}
684 985
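Since the hunk bodies are elided here, a heavily hedged sketch of what userspace hands this ioctl: the five-slot nilfs_argv array and its ordering (vdescs, periods, vblocknrs, bdescs, segment numbers) are inferred from nilfs_ioctl_prepare_clean_segments() and cleanerd usage rather than shown in this diff, and the payload arrays (vdescs, periods, and so on) are caller-supplied names.

	/* Sketch only: one nilfs_argv descriptor per payload array. */
	struct nilfs_argv argv[5] = {
		{ .v_base = (unsigned long)vdescs,    .v_nmembs = nvdesc,
		  .v_size = sizeof(struct nilfs_vdesc) },  /* blocks to move */
		{ .v_base = (unsigned long)periods,   .v_nmembs = nperiod,
		  .v_size = sizeof(struct nilfs_period) }, /* cps to delete */
		{ .v_base = (unsigned long)vblocknrs, .v_nmembs = nvblk,
		  .v_size = sizeof(__u64) },               /* vblocks to free */
		{ .v_base = (unsigned long)bdescs,    .v_nmembs = nbdesc,
		  .v_size = sizeof(struct nilfs_bdesc) },  /* blocks to dirty */
		{ .v_base = (unsigned long)segnums,   .v_nmembs = nseg,
		  .v_size = sizeof(__u64) },               /* segments to free */
	};

	if (ioctl(devfd, NILFS_IOCTL_CLEAN_SEGMENTS, argv) < 0)
		err(1, "NILFS_IOCTL_CLEAN_SEGMENTS");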
986/**
987 * nilfs_ioctl_sync - make a checkpoint
988 * @inode: inode object
989 * @filp: file object
990 * @cmd: ioctl's request code
991 * @argp: pointer to argument from userspace
992 *
993 * Description: nilfs_ioctl_sync() function constructs a logical segment
994 * for checkpointing. This function guarantees that all modified data
995 * and metadata have been written out to the device when it returns
996 * successfully.
997 *
998 * Return Value: On success, 0 is returned. On error, one of the following
999 * negative error codes is returned.
1000 *
1001 * %-EROFS - Read only filesystem.
1002 *
1003 * %-EIO - I/O error.
1004 *
1005 * %-ENOSPC - No space left on device (only in a panic state).
1006 *
1007 * %-ERESTARTSYS - Interrupted.
1008 *
1009 * %-ENOMEM - Insufficient memory available.
1010 *
1011 * %-EFAULT - Failure during execution of requested operation.
1012 */
685static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, 1013static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
686 unsigned int cmd, void __user *argp) 1014 unsigned int cmd, void __user *argp)
687{ 1015{
@@ -710,6 +1038,14 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
710 return 0; 1038 return 0;
711} 1039}
712 1040
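A minimal userspace sketch of the checkpointing call, assuming devfd is an open descriptor for the device; that @argp, when non-NULL, receives the number of the checkpoint just written is an assumption based on how the nilfs utilities use this ioctl, since the hunk body is elided above.

	__u64 cno;

	if (ioctl(devfd, NILFS_IOCTL_SYNC, &cno) < 0)
		err(1, "NILFS_IOCTL_SYNC");
	printf("made checkpoint %llu\n", (unsigned long long)cno);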
1041/**
1042 * nilfs_ioctl_resize - resize NILFS2 volume
1043 * @inode: inode object
1044 * @filp: file object
1045 * @argp: pointer to argument from userspace
1046 *
1047 * Return Value: 0 on success, or a negative error code on failure.
1048 */
713static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, 1049static int nilfs_ioctl_resize(struct inode *inode, struct file *filp,
714 void __user *argp) 1050 void __user *argp)
715{ 1051{
@@ -735,6 +1071,17 @@ out:
735 return ret; 1071 return ret;
736} 1072}
737 1073
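No description block was added for this one; for orientation, a sketch of the call, assuming (from the nilfs-resize utility, not from this hunk) that the argument is the new volume size in bytes and that devfd is an open device descriptor.

	__u64 newsize = 8ULL << 30;	/* e.g. resize to 8 GiB; sketch value */

	if (ioctl(devfd, NILFS_IOCTL_RESIZE, &newsize) < 0)
		err(1, "NILFS_IOCTL_RESIZE");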
1074/**
1075 * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated
1076 * @inode: inode object
1077 * @argp: pointer to argument from userspace
1078 *
1079 * Description: nilfs_ioctl_set_alloc_range() function defines the lower and
1080 * upper limits, in bytes, of the range of segments to be allocated.
1081 * The NILFS_IOCTL_SET_ALLOC_RANGE ioctl is used by the nilfs_resize utility.
1082 *
1083 * Return Value: 0 on success, or a negative error code on failure.
1084 */
738static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) 1085static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
739{ 1086{
740 struct the_nilfs *nilfs = inode->i_sb->s_fs_info; 1087 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
@@ -767,6 +1114,28 @@ out:
767 return ret; 1114 return ret;
768} 1115}
769 1116
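Matching the kernel-doc above, a sketch that assumes the argument is a two-element __u64 array holding the [start, end] byte offsets of the allowed allocation range; start_bytes and end_bytes are illustrative names, not values from this diff.

	__u64 range[2] = { start_bytes, end_bytes };

	if (ioctl(devfd, NILFS_IOCTL_SET_ALLOC_RANGE, range) < 0)
		err(1, "NILFS_IOCTL_SET_ALLOC_RANGE");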
1117/**
1118 * nilfs_ioctl_get_info - wrapper for getting metadata info
1119 * @inode: inode object
1120 * @filp: file object
1121 * @cmd: ioctl's request code
1122 * @argp: pointer to argument from userspace
1123 * @membsz: size of an item in bytes
1124 * @dofunc: concrete function of getting metadata info
1125 *
1126 * Description: nilfs_ioctl_get_info() gets metadata info by calling
1127 * the dofunc() callback.
1128 *
1129 * Return Value: On success, 0 is returned and requested metadata info
1130 * is copied into userspace. On error, one of the following
1131 * negative error codes is returned.
1132 *
1133 * %-EINVAL - Invalid arguments from userspace.
1134 *
1135 * %-ENOMEM - Insufficient amount of memory available.
1136 *
1137 * %-EFAULT - Failure during execution of requested operation.
1138 */
770static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, 1139static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
771 unsigned int cmd, void __user *argp, 1140 unsigned int cmd, void __user *argp,
772 size_t membsz, 1141 size_t membsz,
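The dofunc parameter is the moving part of this wrapper: all three getter callbacks earlier in the file share one shape, which nilfs_ioctl_get_info() invokes on a kernel bounce buffer before copying the results back through @argp. The signature below is read off those callbacks in this diff, not off a header.

	/* Common callback shape for the NILFS metadata getters. */
	ssize_t (*dofunc)(struct the_nilfs *nilfs, __u64 *posp, int flags,
			  void *buf, size_t size, size_t nmembs);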
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 2d8be51f90dc..dc3a9efdaab8 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -416,7 +416,8 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
416 } 416 }
417 if (likely(bio)) { 417 if (likely(bio)) {
418 bio->bi_bdev = nilfs->ns_bdev; 418 bio->bi_bdev = nilfs->ns_bdev;
419 bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9); 419 bio->bi_iter.bi_sector =
420 start << (nilfs->ns_blocksize_bits - 9);
420 } 421 }
421 return bio; 422 return bio;
422} 423}
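This hunk is fallout from the v3.14 immutable-biovec work: the starting sector moved into the bio's embedded iterator. The shift itself just converts filesystem blocks into 512-byte sectors, for example:

	/* With a 4 KiB block size, ns_blocksize_bits == 12, so the shift
	 * is 12 - 9 == 3: block 100 begins at sector 800. */
	sector_t sector = (sector_t)start << (nilfs->ns_blocksize_bits - 9);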
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9f6b486b6c01..a1a191634abc 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1440,17 +1440,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1440 1440
1441 nilfs_clear_logs(&sci->sc_segbufs); 1441 nilfs_clear_logs(&sci->sc_segbufs);
1442 1442
1443 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1444 if (unlikely(err))
1445 return err;
1446
1447 if (sci->sc_stage.flags & NILFS_CF_SUFREED) { 1443 if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
1448 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, 1444 err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
1449 sci->sc_freesegs, 1445 sci->sc_freesegs,
1450 sci->sc_nfreesegs, 1446 sci->sc_nfreesegs,
1451 NULL); 1447 NULL);
1452 WARN_ON(err); /* do not happen */ 1448 WARN_ON(err); /* do not happen */
1449 sci->sc_stage.flags &= ~NILFS_CF_SUFREED;
1453 } 1450 }
1451
1452 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1453 if (unlikely(err))
1454 return err;
1455
1454 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); 1456 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1455 sci->sc_stage = prev_stage; 1457 sci->sc_stage = prev_stage;
1456 } 1458 }
diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c
index 634a8b717b02..266c2d7d50bd 100644
--- a/fs/nls/mac-celtic.c
+++ b/fs/nls/mac-celtic.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
583 .char2uni = char2uni, 583 .char2uni = char2uni,
584 .charset2lower = charset2lower, 584 .charset2lower = charset2lower,
585 .charset2upper = charset2upper, 585 .charset2upper = charset2upper,
586 .owner = THIS_MODULE,
587}; 586};
588 587
589static int __init init_nls_macceltic(void) 588static int __init init_nls_macceltic(void)
diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c
index 979e6265ac5e..9789c6057551 100644
--- a/fs/nls/mac-centeuro.c
+++ b/fs/nls/mac-centeuro.c
@@ -513,7 +513,6 @@ static struct nls_table table = {
513 .char2uni = char2uni, 513 .char2uni = char2uni,
514 .charset2lower = charset2lower, 514 .charset2lower = charset2lower,
515 .charset2upper = charset2upper, 515 .charset2upper = charset2upper,
516 .owner = THIS_MODULE,
517}; 516};
518 517
519static int __init init_nls_maccenteuro(void) 518static int __init init_nls_maccenteuro(void)
diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c
index dd3f675911ee..bb19e7a07d43 100644
--- a/fs/nls/mac-croatian.c
+++ b/fs/nls/mac-croatian.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
583 .char2uni = char2uni, 583 .char2uni = char2uni,
584 .charset2lower = charset2lower, 584 .charset2lower = charset2lower,
585 .charset2upper = charset2upper, 585 .charset2upper = charset2upper,
586 .owner = THIS_MODULE,
587}; 586};
588 587
589static int __init init_nls_maccroatian(void) 588static int __init init_nls_maccroatian(void)
diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c
index 1112c84dd8bb..2a7dea36acba 100644
--- a/fs/nls/mac-cyrillic.c
+++ b/fs/nls/mac-cyrillic.c
@@ -478,7 +478,6 @@ static struct nls_table table = {
478 .char2uni = char2uni, 478 .char2uni = char2uni,
479 .charset2lower = charset2lower, 479 .charset2lower = charset2lower,
480 .charset2upper = charset2upper, 480 .charset2upper = charset2upper,
481 .owner = THIS_MODULE,
482}; 481};
483 482
484static int __init init_nls_maccyrillic(void) 483static int __init init_nls_maccyrillic(void)
diff --git a/fs/nls/mac-gaelic.c b/fs/nls/mac-gaelic.c
index 2de9158409c8..77b001653588 100644
--- a/fs/nls/mac-gaelic.c
+++ b/fs/nls/mac-gaelic.c
@@ -548,7 +548,6 @@ static struct nls_table table = {
548 .char2uni = char2uni, 548 .char2uni = char2uni,
549 .charset2lower = charset2lower, 549 .charset2lower = charset2lower,
550 .charset2upper = charset2upper, 550 .charset2upper = charset2upper,
551 .owner = THIS_MODULE,
552}; 551};
553 552
554static int __init init_nls_macgaelic(void) 553static int __init init_nls_macgaelic(void)
diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c
index a86310082802..1eccf499e2eb 100644
--- a/fs/nls/mac-greek.c
+++ b/fs/nls/mac-greek.c
@@ -478,7 +478,6 @@ static struct nls_table table = {
478 .char2uni = char2uni, 478 .char2uni = char2uni,
479 .charset2lower = charset2lower, 479 .charset2lower = charset2lower,
480 .charset2upper = charset2upper, 480 .charset2upper = charset2upper,
481 .owner = THIS_MODULE,
482}; 481};
483 482
484static int __init init_nls_macgreek(void) 483static int __init init_nls_macgreek(void)
diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c
index babe2998d5ce..cbd0875c6d69 100644
--- a/fs/nls/mac-iceland.c
+++ b/fs/nls/mac-iceland.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
583 .char2uni = char2uni, 583 .char2uni = char2uni,
584 .charset2lower = charset2lower, 584 .charset2lower = charset2lower,
585 .charset2upper = charset2upper, 585 .charset2upper = charset2upper,
586 .owner = THIS_MODULE,
587}; 586};
588 587
589static int __init init_nls_maciceland(void) 588static int __init init_nls_maciceland(void)
diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c
index 312364f010dc..fba8357aaf03 100644
--- a/fs/nls/mac-inuit.c
+++ b/fs/nls/mac-inuit.c
@@ -513,7 +513,6 @@ static struct nls_table table = {
513 .char2uni = char2uni, 513 .char2uni = char2uni,
514 .charset2lower = charset2lower, 514 .charset2lower = charset2lower,
515 .charset2upper = charset2upper, 515 .charset2upper = charset2upper,
516 .owner = THIS_MODULE,
517}; 516};
518 517
519static int __init init_nls_macinuit(void) 518static int __init init_nls_macinuit(void)
diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c
index 53ce0809cbd2..b6a98a5208cd 100644
--- a/fs/nls/mac-roman.c
+++ b/fs/nls/mac-roman.c
@@ -618,7 +618,6 @@ static struct nls_table table = {
618 .char2uni = char2uni, 618 .char2uni = char2uni,
619 .charset2lower = charset2lower, 619 .charset2lower = charset2lower,
620 .charset2upper = charset2upper, 620 .charset2upper = charset2upper,
621 .owner = THIS_MODULE,
622}; 621};
623 622
624static int __init init_nls_macroman(void) 623static int __init init_nls_macroman(void)
diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c
index add6f7a0c666..25547f023638 100644
--- a/fs/nls/mac-romanian.c
+++ b/fs/nls/mac-romanian.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
583 .char2uni = char2uni, 583 .char2uni = char2uni,
584 .charset2lower = charset2lower, 584 .charset2lower = charset2lower,
585 .charset2upper = charset2upper, 585 .charset2upper = charset2upper,
586 .owner = THIS_MODULE,
587}; 586};
588 587
589static int __init init_nls_macromanian(void) 588static int __init init_nls_macromanian(void)
diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c
index dffa96d5de00..b5454bc7b7fa 100644
--- a/fs/nls/mac-turkish.c
+++ b/fs/nls/mac-turkish.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
583 .char2uni = char2uni, 583 .char2uni = char2uni,
584 .charset2lower = charset2lower, 584 .charset2lower = charset2lower,
585 .charset2upper = charset2upper, 585 .charset2upper = charset2upper,
586 .owner = THIS_MODULE,
587}; 586};
588 587
589static int __init init_nls_macturkish(void) 588static int __init init_nls_macturkish(void)
diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c
index 7020e940f74e..a2620650d5e4 100644
--- a/fs/nls/nls_ascii.c
+++ b/fs/nls/nls_ascii.c
@@ -148,7 +148,6 @@ static struct nls_table table = {
148 .char2uni = char2uni, 148 .char2uni = char2uni,
149 .charset2lower = charset2lower, 149 .charset2lower = charset2lower,
150 .charset2upper = charset2upper, 150 .charset2upper = charset2upper,
151 .owner = THIS_MODULE,
152}; 151};
153 152
154static int __init init_nls_ascii(void) 153static int __init init_nls_ascii(void)
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index fea6bd5831dc..52ccd34b1e79 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -232,13 +232,14 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian,
232} 232}
233EXPORT_SYMBOL(utf16s_to_utf8s); 233EXPORT_SYMBOL(utf16s_to_utf8s);
234 234
235int register_nls(struct nls_table * nls) 235int __register_nls(struct nls_table *nls, struct module *owner)
236{ 236{
237 struct nls_table ** tmp = &tables; 237 struct nls_table ** tmp = &tables;
238 238
239 if (nls->next) 239 if (nls->next)
240 return -EBUSY; 240 return -EBUSY;
241 241
242 nls->owner = owner;
242 spin_lock(&nls_lock); 243 spin_lock(&nls_lock);
243 while (*tmp) { 244 while (*tmp) {
244 if (nls == *tmp) { 245 if (nls == *tmp) {
@@ -252,6 +253,7 @@ int register_nls(struct nls_table * nls)
252 spin_unlock(&nls_lock); 253 spin_unlock(&nls_lock);
253 return 0; 254 return 0;
254} 255}
256EXPORT_SYMBOL(__register_nls);
255 257
256int unregister_nls(struct nls_table * nls) 258int unregister_nls(struct nls_table * nls)
257{ 259{
@@ -538,7 +540,6 @@ struct nls_table *load_nls_default(void)
538 return &default_table; 540 return &default_table;
539} 541}
540 542
541EXPORT_SYMBOL(register_nls);
542EXPORT_SYMBOL(unregister_nls); 543EXPORT_SYMBOL(unregister_nls);
543EXPORT_SYMBOL(unload_nls); 544EXPORT_SYMBOL(unload_nls);
544EXPORT_SYMBOL(load_nls); 545EXPORT_SYMBOL(load_nls);
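Every .owner removal in the NLS tables in this series hangs off this change: ownership is now recorded when the table is registered. Callers keep writing register_nls(); the matching include/linux/nls.h change is outside this section, but it presumably becomes a macro along these lines:

	/* Sketch of the expected wrapper: THIS_MODULE expands at each call
	 * site, so every NLS module still ends up owning its own table. */
	#define register_nls(nls) __register_nls((nls), THIS_MODULE)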
diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c
index c8471fe78e4e..ace3e19d3407 100644
--- a/fs/nls/nls_cp1250.c
+++ b/fs/nls/nls_cp1250.c
@@ -329,7 +329,6 @@ static struct nls_table table = {
329 .char2uni = char2uni, 329 .char2uni = char2uni,
330 .charset2lower = charset2lower, 330 .charset2lower = charset2lower,
331 .charset2upper = charset2upper, 331 .charset2upper = charset2upper,
332 .owner = THIS_MODULE,
333}; 332};
334 333
335static int __init init_nls_cp1250(void) 334static int __init init_nls_cp1250(void)
diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c
index 1939b46e772f..9273ddfd08a1 100644
--- a/fs/nls/nls_cp1251.c
+++ b/fs/nls/nls_cp1251.c
@@ -283,7 +283,6 @@ static struct nls_table table = {
283 .char2uni = char2uni, 283 .char2uni = char2uni,
284 .charset2lower = charset2lower, 284 .charset2lower = charset2lower,
285 .charset2upper = charset2upper, 285 .charset2upper = charset2upper,
286 .owner = THIS_MODULE,
287}; 286};
288 287
289static int __init init_nls_cp1251(void) 288static int __init init_nls_cp1251(void)
diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c
index 8120ae2e091a..1caf5dfed85b 100644
--- a/fs/nls/nls_cp1255.c
+++ b/fs/nls/nls_cp1255.c
@@ -365,7 +365,6 @@ static struct nls_table table = {
365 .char2uni = char2uni, 365 .char2uni = char2uni,
366 .charset2lower = charset2lower, 366 .charset2lower = charset2lower,
367 .charset2upper = charset2upper, 367 .charset2upper = charset2upper,
368 .owner = THIS_MODULE,
369}; 368};
370 369
371static int __init init_nls_cp1255(void) 370static int __init init_nls_cp1255(void)
diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c
index ff37a4628ce4..7ddb830da3fd 100644
--- a/fs/nls/nls_cp437.c
+++ b/fs/nls/nls_cp437.c
@@ -369,7 +369,6 @@ static struct nls_table table = {
369 .char2uni = char2uni, 369 .char2uni = char2uni,
370 .charset2lower = charset2lower, 370 .charset2lower = charset2lower,
371 .charset2upper = charset2upper, 371 .charset2upper = charset2upper,
372 .owner = THIS_MODULE,
373}; 372};
374 373
375static int __init init_nls_cp437(void) 374static int __init init_nls_cp437(void)
diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c
index f5576b8be1b9..c593f683a0cd 100644
--- a/fs/nls/nls_cp737.c
+++ b/fs/nls/nls_cp737.c
@@ -332,7 +332,6 @@ static struct nls_table table = {
332 .char2uni = char2uni, 332 .char2uni = char2uni,
333 .charset2lower = charset2lower, 333 .charset2lower = charset2lower,
334 .charset2upper = charset2upper, 334 .charset2upper = charset2upper,
335 .owner = THIS_MODULE,
336}; 335};
337 336
338static int __init init_nls_cp737(void) 337static int __init init_nls_cp737(void)
diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c
index 4905635d1c00..554c863745f2 100644
--- a/fs/nls/nls_cp775.c
+++ b/fs/nls/nls_cp775.c
@@ -301,7 +301,6 @@ static struct nls_table table = {
301 .char2uni = char2uni, 301 .char2uni = char2uni,
302 .charset2lower = charset2lower, 302 .charset2lower = charset2lower,
303 .charset2upper = charset2upper, 303 .charset2upper = charset2upper,
304 .owner = THIS_MODULE,
305}; 304};
306 305
307static int __init init_nls_cp775(void) 306static int __init init_nls_cp775(void)
diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c
index fe5bdad50e2b..56cccd14b40b 100644
--- a/fs/nls/nls_cp850.c
+++ b/fs/nls/nls_cp850.c
@@ -297,7 +297,6 @@ static struct nls_table table = {
297 .char2uni = char2uni, 297 .char2uni = char2uni,
298 .charset2lower = charset2lower, 298 .charset2lower = charset2lower,
299 .charset2upper = charset2upper, 299 .charset2upper = charset2upper,
300 .owner = THIS_MODULE,
301}; 300};
302 301
303static int __init init_nls_cp850(void) 302static int __init init_nls_cp850(void)
diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c
index ceb1c0166dd8..7cdc05ac1d40 100644
--- a/fs/nls/nls_cp852.c
+++ b/fs/nls/nls_cp852.c
@@ -319,7 +319,6 @@ static struct nls_table table = {
319 .char2uni = char2uni, 319 .char2uni = char2uni,
320 .charset2lower = charset2lower, 320 .charset2lower = charset2lower,
321 .charset2upper = charset2upper, 321 .charset2upper = charset2upper,
322 .owner = THIS_MODULE,
323}; 322};
324 323
325static int __init init_nls_cp852(void) 324static int __init init_nls_cp852(void)
diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c
index cc7f5fb2e0c2..7426eea05663 100644
--- a/fs/nls/nls_cp855.c
+++ b/fs/nls/nls_cp855.c
@@ -281,7 +281,6 @@ static struct nls_table table = {
281 .char2uni = char2uni, 281 .char2uni = char2uni,
282 .charset2lower = charset2lower, 282 .charset2lower = charset2lower,
283 .charset2upper = charset2upper, 283 .charset2upper = charset2upper,
284 .owner = THIS_MODULE,
285}; 284};
286 285
287static int __init init_nls_cp855(void) 286static int __init init_nls_cp855(void)
diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c
index e418e198e8d8..098309733ebd 100644
--- a/fs/nls/nls_cp857.c
+++ b/fs/nls/nls_cp857.c
@@ -283,7 +283,6 @@ static struct nls_table table = {
283 .char2uni = char2uni, 283 .char2uni = char2uni,
284 .charset2lower = charset2lower, 284 .charset2lower = charset2lower,
285 .charset2upper = charset2upper, 285 .charset2upper = charset2upper,
286 .owner = THIS_MODULE,
287}; 286};
288 287
289static int __init init_nls_cp857(void) 288static int __init init_nls_cp857(void)
diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c
index a86c97d1aa34..84224478e731 100644
--- a/fs/nls/nls_cp860.c
+++ b/fs/nls/nls_cp860.c
@@ -346,7 +346,6 @@ static struct nls_table table = {
346 .char2uni = char2uni, 346 .char2uni = char2uni,
347 .charset2lower = charset2lower, 347 .charset2lower = charset2lower,
348 .charset2upper = charset2upper, 348 .charset2upper = charset2upper,
349 .owner = THIS_MODULE,
350}; 349};
351 350
352static int __init init_nls_cp860(void) 351static int __init init_nls_cp860(void)
diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c
index bd920227acdf..dc873e4be092 100644
--- a/fs/nls/nls_cp861.c
+++ b/fs/nls/nls_cp861.c
@@ -369,7 +369,6 @@ static struct nls_table table = {
369 .char2uni = char2uni, 369 .char2uni = char2uni,
370 .charset2lower = charset2lower, 370 .charset2lower = charset2lower,
371 .charset2upper = charset2upper, 371 .charset2upper = charset2upper,
372 .owner = THIS_MODULE,
373}; 372};
374 373
375static int __init init_nls_cp861(void) 374static int __init init_nls_cp861(void)
diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c
index e9b68eb3daf0..d5263e3c5566 100644
--- a/fs/nls/nls_cp862.c
+++ b/fs/nls/nls_cp862.c
@@ -403,7 +403,6 @@ static struct nls_table table = {
403 .char2uni = char2uni, 403 .char2uni = char2uni,
404 .charset2lower = charset2lower, 404 .charset2lower = charset2lower,
405 .charset2upper = charset2upper, 405 .charset2upper = charset2upper,
406 .owner = THIS_MODULE,
407}; 406};
408 407
409static int __init init_nls_cp862(void) 408static int __init init_nls_cp862(void)
diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c
index f8a9b07ab4e2..051c9832e36a 100644
--- a/fs/nls/nls_cp863.c
+++ b/fs/nls/nls_cp863.c
@@ -363,7 +363,6 @@ static struct nls_table table = {
363 .char2uni = char2uni, 363 .char2uni = char2uni,
364 .charset2lower = charset2lower, 364 .charset2lower = charset2lower,
365 .charset2upper = charset2upper, 365 .charset2upper = charset2upper,
366 .owner = THIS_MODULE,
367}; 366};
368 367
369static int __init init_nls_cp863(void) 368static int __init init_nls_cp863(void)
diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c
index 8d31f435fc6f..97eb1273b2f7 100644
--- a/fs/nls/nls_cp864.c
+++ b/fs/nls/nls_cp864.c
@@ -389,7 +389,6 @@ static struct nls_table table = {
389 .char2uni = char2uni, 389 .char2uni = char2uni,
390 .charset2lower = charset2lower, 390 .charset2lower = charset2lower,
391 .charset2upper = charset2upper, 391 .charset2upper = charset2upper,
392 .owner = THIS_MODULE,
393}; 392};
394 393
395static int __init init_nls_cp864(void) 394static int __init init_nls_cp864(void)
diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c
index 4bd902fe3ec9..111214228525 100644
--- a/fs/nls/nls_cp865.c
+++ b/fs/nls/nls_cp865.c
@@ -369,7 +369,6 @@ static struct nls_table table = {
369 .char2uni = char2uni, 369 .char2uni = char2uni,
370 .charset2lower = charset2lower, 370 .charset2lower = charset2lower,
371 .charset2upper = charset2upper, 371 .charset2upper = charset2upper,
372 .owner = THIS_MODULE,
373}; 372};
374 373
375static int __init init_nls_cp865(void) 374static int __init init_nls_cp865(void)
diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c
index bdc7cb391398..ffdcbc3fc38d 100644
--- a/fs/nls/nls_cp866.c
+++ b/fs/nls/nls_cp866.c
@@ -287,7 +287,6 @@ static struct nls_table table = {
287 .char2uni = char2uni, 287 .char2uni = char2uni,
288 .charset2lower = charset2lower, 288 .charset2lower = charset2lower,
289 .charset2upper = charset2upper, 289 .charset2upper = charset2upper,
290 .owner = THIS_MODULE,
291}; 290};
292 291
293static int __init init_nls_cp866(void) 292static int __init init_nls_cp866(void)
diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c
index 9f283a2b151a..3b5a34589354 100644
--- a/fs/nls/nls_cp869.c
+++ b/fs/nls/nls_cp869.c
@@ -297,7 +297,6 @@ static struct nls_table table = {
297 .char2uni = char2uni, 297 .char2uni = char2uni,
298 .charset2lower = charset2lower, 298 .charset2lower = charset2lower,
299 .charset2upper = charset2upper, 299 .charset2upper = charset2upper,
300 .owner = THIS_MODULE,
301}; 300};
302 301
303static int __init init_nls_cp869(void) 302static int __init init_nls_cp869(void)
diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c
index 0b3c4886f8c0..8dfaa10710fa 100644
--- a/fs/nls/nls_cp874.c
+++ b/fs/nls/nls_cp874.c
@@ -256,7 +256,6 @@ static struct nls_table table = {
256 .char2uni = char2uni, 256 .char2uni = char2uni,
257 .charset2lower = charset2lower, 257 .charset2lower = charset2lower,
258 .charset2upper = charset2upper, 258 .charset2upper = charset2upper,
259 .owner = THIS_MODULE,
260}; 259};
261 260
262static int __init init_nls_cp874(void) 261static int __init init_nls_cp874(void)
diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c
index 0ffed6f1cebb..67b7398e8483 100644
--- a/fs/nls/nls_cp932.c
+++ b/fs/nls/nls_cp932.c
@@ -7914,7 +7914,6 @@ static struct nls_table table = {
7914 .char2uni = char2uni, 7914 .char2uni = char2uni,
7915 .charset2lower = charset2lower, 7915 .charset2lower = charset2lower,
7916 .charset2upper = charset2upper, 7916 .charset2upper = charset2upper,
7917 .owner = THIS_MODULE,
7918}; 7917};
7919 7918
7920static int __init init_nls_cp932(void) 7919static int __init init_nls_cp932(void)
diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c
index 82770301bc3d..c96546cfec9f 100644
--- a/fs/nls/nls_cp936.c
+++ b/fs/nls/nls_cp936.c
@@ -11092,7 +11092,6 @@ static struct nls_table table = {
11092 .char2uni = char2uni, 11092 .char2uni = char2uni,
11093 .charset2lower = charset2lower, 11093 .charset2lower = charset2lower,
11094 .charset2upper = charset2upper, 11094 .charset2upper = charset2upper,
11095 .owner = THIS_MODULE,
11096}; 11095};
11097 11096
11098static int __init init_nls_cp936(void) 11097static int __init init_nls_cp936(void)
diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c
index 8a7a2fe85c65..199171e97aa4 100644
--- a/fs/nls/nls_cp949.c
+++ b/fs/nls/nls_cp949.c
@@ -13927,7 +13927,6 @@ static struct nls_table table = {
13927 .char2uni = char2uni, 13927 .char2uni = char2uni,
13928 .charset2lower = charset2lower, 13928 .charset2lower = charset2lower,
13929 .charset2upper = charset2upper, 13929 .charset2upper = charset2upper,
13930 .owner = THIS_MODULE,
13931}; 13930};
13932 13931
13933static int __init init_nls_cp949(void) 13932static int __init init_nls_cp949(void)
diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c
index ef2536829aa5..8e1418708209 100644
--- a/fs/nls/nls_cp950.c
+++ b/fs/nls/nls_cp950.c
@@ -9463,7 +9463,6 @@ static struct nls_table table = {
9463 .char2uni = char2uni, 9463 .char2uni = char2uni,
9464 .charset2lower = charset2lower, 9464 .charset2lower = charset2lower,
9465 .charset2upper = charset2upper, 9465 .charset2upper = charset2upper,
9466 .owner = THIS_MODULE,
9467}; 9466};
9468 9467
9469static int __init init_nls_cp950(void) 9468static int __init init_nls_cp950(void)
diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c
index 7424929a278b..162b3f160353 100644
--- a/fs/nls/nls_euc-jp.c
+++ b/fs/nls/nls_euc-jp.c
@@ -553,7 +553,6 @@ static struct nls_table table = {
553 .charset = "euc-jp", 553 .charset = "euc-jp",
554 .uni2char = uni2char, 554 .uni2char = uni2char,
555 .char2uni = char2uni, 555 .char2uni = char2uni,
556 .owner = THIS_MODULE,
557}; 556};
558 557
559static int __init init_nls_euc_jp(void) 558static int __init init_nls_euc_jp(void)
diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c
index 7b951bb5849c..69ac020d43b1 100644
--- a/fs/nls/nls_iso8859-1.c
+++ b/fs/nls/nls_iso8859-1.c
@@ -239,7 +239,6 @@ static struct nls_table table = {
239 .char2uni = char2uni, 239 .char2uni = char2uni,
240 .charset2lower = charset2lower, 240 .charset2lower = charset2lower,
241 .charset2upper = charset2upper, 241 .charset2upper = charset2upper,
242 .owner = THIS_MODULE,
243}; 242};
244 243
245static int __init init_nls_iso8859_1(void) 244static int __init init_nls_iso8859_1(void)
diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c
index c4d52ea9f092..afb3f8f275f0 100644
--- a/fs/nls/nls_iso8859-13.c
+++ b/fs/nls/nls_iso8859-13.c
@@ -267,7 +267,6 @@ static struct nls_table table = {
267 .char2uni = char2uni, 267 .char2uni = char2uni,
268 .charset2lower = charset2lower, 268 .charset2lower = charset2lower,
269 .charset2upper = charset2upper, 269 .charset2upper = charset2upper,
270 .owner = THIS_MODULE,
271}; 270};
272 271
273static int __init init_nls_iso8859_13(void) 272static int __init init_nls_iso8859_13(void)
diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c
index dc02600c7fe1..046370f0b6f0 100644
--- a/fs/nls/nls_iso8859-14.c
+++ b/fs/nls/nls_iso8859-14.c
@@ -323,7 +323,6 @@ static struct nls_table table = {
323 .char2uni = char2uni, 323 .char2uni = char2uni,
324 .charset2lower = charset2lower, 324 .charset2lower = charset2lower,
325 .charset2upper = charset2upper, 325 .charset2upper = charset2upper,
326 .owner = THIS_MODULE,
327}; 326};
328 327
329static int __init init_nls_iso8859_14(void) 328static int __init init_nls_iso8859_14(void)
diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c
index 3c7dfc832ef1..7e34a841a056 100644
--- a/fs/nls/nls_iso8859-15.c
+++ b/fs/nls/nls_iso8859-15.c
@@ -289,7 +289,6 @@ static struct nls_table table = {
289 .char2uni = char2uni, 289 .char2uni = char2uni,
290 .charset2lower = charset2lower, 290 .charset2lower = charset2lower,
291 .charset2upper = charset2upper, 291 .charset2upper = charset2upper,
292 .owner = THIS_MODULE,
293}; 292};
294 293
295static int __init init_nls_iso8859_15(void) 294static int __init init_nls_iso8859_15(void)
diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c
index a2d2197e4c77..7dd571181741 100644
--- a/fs/nls/nls_iso8859-2.c
+++ b/fs/nls/nls_iso8859-2.c
@@ -290,7 +290,6 @@ static struct nls_table table = {
290 .char2uni = char2uni, 290 .char2uni = char2uni,
291 .charset2lower = charset2lower, 291 .charset2lower = charset2lower,
292 .charset2upper = charset2upper, 292 .charset2upper = charset2upper,
293 .owner = THIS_MODULE,
294}; 293};
295 294
296static int __init init_nls_iso8859_2(void) 295static int __init init_nls_iso8859_2(void)
diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c
index a61e0daa3a86..740b75ec4493 100644
--- a/fs/nls/nls_iso8859-3.c
+++ b/fs/nls/nls_iso8859-3.c
@@ -290,7 +290,6 @@ static struct nls_table table = {
290 .char2uni = char2uni, 290 .char2uni = char2uni,
291 .charset2lower = charset2lower, 291 .charset2lower = charset2lower,
292 .charset2upper = charset2upper, 292 .charset2upper = charset2upper,
293 .owner = THIS_MODULE,
294}; 293};
295 294
296static int __init init_nls_iso8859_3(void) 295static int __init init_nls_iso8859_3(void)
diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c
index e8ff555483b6..8826021e32f5 100644
--- a/fs/nls/nls_iso8859-4.c
+++ b/fs/nls/nls_iso8859-4.c
@@ -290,7 +290,6 @@ static struct nls_table table = {
290 .char2uni = char2uni, 290 .char2uni = char2uni,
291 .charset2lower = charset2lower, 291 .charset2lower = charset2lower,
292 .charset2upper = charset2upper, 292 .charset2upper = charset2upper,
293 .owner = THIS_MODULE,
294}; 293};
295 294
296static int __init init_nls_iso8859_4(void) 295static int __init init_nls_iso8859_4(void)
diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c
index 4721e8930124..7c04057a1ad8 100644
--- a/fs/nls/nls_iso8859-5.c
+++ b/fs/nls/nls_iso8859-5.c
@@ -254,7 +254,6 @@ static struct nls_table table = {
254 .char2uni = char2uni, 254 .char2uni = char2uni,
255 .charset2lower = charset2lower, 255 .charset2lower = charset2lower,
256 .charset2upper = charset2upper, 256 .charset2upper = charset2upper,
257 .owner = THIS_MODULE,
258}; 257};
259 258
260static int __init init_nls_iso8859_5(void) 259static int __init init_nls_iso8859_5(void)
diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c
index 01a517d6d306..d4a881400d74 100644
--- a/fs/nls/nls_iso8859-6.c
+++ b/fs/nls/nls_iso8859-6.c
@@ -245,7 +245,6 @@ static struct nls_table table = {
245 .char2uni = char2uni, 245 .char2uni = char2uni,
246 .charset2lower = charset2lower, 246 .charset2lower = charset2lower,
247 .charset2upper = charset2upper, 247 .charset2upper = charset2upper,
248 .owner = THIS_MODULE,
249}; 248};
250 249
251static int __init init_nls_iso8859_6(void) 250static int __init init_nls_iso8859_6(void)
diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c
index 2d27b93ef19e..37b75d825a75 100644
--- a/fs/nls/nls_iso8859-7.c
+++ b/fs/nls/nls_iso8859-7.c
@@ -299,7 +299,6 @@ static struct nls_table table = {
299 .char2uni = char2uni, 299 .char2uni = char2uni,
300 .charset2lower = charset2lower, 300 .charset2lower = charset2lower,
301 .charset2upper = charset2upper, 301 .charset2upper = charset2upper,
302 .owner = THIS_MODULE,
303}; 302};
304 303
305static int __init init_nls_iso8859_7(void) 304static int __init init_nls_iso8859_7(void)
diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c
index 694bf070c721..557b98250d37 100644
--- a/fs/nls/nls_iso8859-9.c
+++ b/fs/nls/nls_iso8859-9.c
@@ -254,7 +254,6 @@ static struct nls_table table = {
254 .char2uni = char2uni, 254 .char2uni = char2uni,
255 .charset2lower = charset2lower, 255 .charset2lower = charset2lower,
256 .charset2upper = charset2upper, 256 .charset2upper = charset2upper,
257 .owner = THIS_MODULE,
258}; 257};
259 258
260static int __init init_nls_iso8859_9(void) 259static int __init init_nls_iso8859_9(void)
diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c
index 43875310540d..811f232fccfb 100644
--- a/fs/nls/nls_koi8-r.c
+++ b/fs/nls/nls_koi8-r.c
@@ -305,7 +305,6 @@ static struct nls_table table = {
305 .char2uni = char2uni, 305 .char2uni = char2uni,
306 .charset2lower = charset2lower, 306 .charset2lower = charset2lower,
307 .charset2upper = charset2upper, 307 .charset2upper = charset2upper,
308 .owner = THIS_MODULE,
309}; 308};
310 309
311static int __init init_nls_koi8_r(void) 310static int __init init_nls_koi8_r(void)
diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c
index e7bc1d75c78c..a80a741a8676 100644
--- a/fs/nls/nls_koi8-ru.c
+++ b/fs/nls/nls_koi8-ru.c
@@ -55,7 +55,6 @@ static struct nls_table table = {
55 .charset = "koi8-ru", 55 .charset = "koi8-ru",
56 .uni2char = uni2char, 56 .uni2char = uni2char,
57 .char2uni = char2uni, 57 .char2uni = char2uni,
58 .owner = THIS_MODULE,
59}; 58};
60 59
61static int __init init_nls_koi8_ru(void) 60static int __init init_nls_koi8_ru(void)
diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c
index 8c9f0292b5ae..7e029e4c188a 100644
--- a/fs/nls/nls_koi8-u.c
+++ b/fs/nls/nls_koi8-u.c
@@ -312,7 +312,6 @@ static struct nls_table table = {
312 .char2uni = char2uni, 312 .char2uni = char2uni,
313 .charset2lower = charset2lower, 313 .charset2lower = charset2lower,
314 .charset2upper = charset2upper, 314 .charset2upper = charset2upper,
315 .owner = THIS_MODULE,
316}; 315};
317 316
318static int __init init_nls_koi8_u(void) 317static int __init init_nls_koi8_u(void)
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
index 0d60a44acacd..afcfbc4a14db 100644
--- a/fs/nls/nls_utf8.c
+++ b/fs/nls/nls_utf8.c
@@ -46,7 +46,6 @@ static struct nls_table table = {
46 .char2uni = char2uni, 46 .char2uni = char2uni,
47 .charset2lower = identity, /* no conversion */ 47 .charset2lower = identity, /* no conversion */
48 .charset2upper = identity, 48 .charset2upper = identity,
49 .owner = THIS_MODULE,
50}; 49};
51 50
52static int __init init_nls_utf8(void) 51static int __init init_nls_utf8(void)
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 1fedd5f7ccc4..abc8cbcfe90e 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -82,20 +82,23 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
82 * events. 82 * events.
83 */ 83 */
84static int dnotify_handle_event(struct fsnotify_group *group, 84static int dnotify_handle_event(struct fsnotify_group *group,
85 struct inode *inode,
85 struct fsnotify_mark *inode_mark, 86 struct fsnotify_mark *inode_mark,
86 struct fsnotify_mark *vfsmount_mark, 87 struct fsnotify_mark *vfsmount_mark,
87 struct fsnotify_event *event) 88 u32 mask, void *data, int data_type,
89 const unsigned char *file_name, u32 cookie)
88{ 90{
89 struct dnotify_mark *dn_mark; 91 struct dnotify_mark *dn_mark;
90 struct inode *to_tell;
91 struct dnotify_struct *dn; 92 struct dnotify_struct *dn;
92 struct dnotify_struct **prev; 93 struct dnotify_struct **prev;
93 struct fown_struct *fown; 94 struct fown_struct *fown;
94 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; 95 __u32 test_mask = mask & ~FS_EVENT_ON_CHILD;
95 96
96 BUG_ON(vfsmount_mark); 97 /* not a dir, dnotify doesn't care */
98 if (!S_ISDIR(inode->i_mode))
99 return 0;
97 100
98 to_tell = event->to_tell; 101 BUG_ON(vfsmount_mark);
99 102
100 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); 103 dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
101 104
@@ -122,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group,
122 return 0; 125 return 0;
123} 126}
124 127
125/*
126 * Given an inode and mask determine if dnotify would be interested in sending
127 * userspace notification for that pair.
128 */
129static bool dnotify_should_send_event(struct fsnotify_group *group,
130 struct inode *inode,
131 struct fsnotify_mark *inode_mark,
132 struct fsnotify_mark *vfsmount_mark,
133 __u32 mask, void *data, int data_type)
134{
135 /* not a dir, dnotify doesn't care */
136 if (!S_ISDIR(inode->i_mode))
137 return false;
138
139 return true;
140}
141
142static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) 128static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
143{ 129{
144 struct dnotify_mark *dn_mark = container_of(fsn_mark, 130 struct dnotify_mark *dn_mark = container_of(fsn_mark,
@@ -152,10 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
152 138
153static struct fsnotify_ops dnotify_fsnotify_ops = { 139static struct fsnotify_ops dnotify_fsnotify_ops = {
154 .handle_event = dnotify_handle_event, 140 .handle_event = dnotify_handle_event,
155 .should_send_event = dnotify_should_send_event,
156 .free_group_priv = NULL,
157 .freeing_mark = NULL,
158 .free_event_priv = NULL,
159}; 141};
160 142
161/* 143/*
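The same consolidation runs through every fsnotify backend in this series: should_send_event() folds into handle_event(), which now receives the raw event data instead of a pre-built fsnotify_event. The callback shape, as read off the dnotify and fanotify handlers in this diff:

	int (*handle_event)(struct fsnotify_group *group, struct inode *inode,
			    struct fsnotify_mark *inode_mark,
			    struct fsnotify_mark *vfsmount_mark,
			    u32 mask, void *data, int data_type,
			    const unsigned char *file_name, u32 cookie);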
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 0c2f9122b262..dc638f786d5c 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -9,91 +9,59 @@
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
11 11
12static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) 12#include "fanotify.h"
13
14static bool should_merge(struct fsnotify_event *old_fsn,
15 struct fsnotify_event *new_fsn)
13{ 16{
14 pr_debug("%s: old=%p new=%p\n", __func__, old, new); 17 struct fanotify_event_info *old, *new;
15 18
16 if (old->to_tell == new->to_tell && 19 pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
17 old->data_type == new->data_type && 20 old = FANOTIFY_E(old_fsn);
18 old->tgid == new->tgid) { 21 new = FANOTIFY_E(new_fsn);
19 switch (old->data_type) { 22
20 case (FSNOTIFY_EVENT_PATH): 23 if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid &&
21#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 24 old->path.mnt == new->path.mnt &&
22 /* dont merge two permission events */ 25 old->path.dentry == new->path.dentry)
23 if ((old->mask & FAN_ALL_PERM_EVENTS) && 26 return true;
24 (new->mask & FAN_ALL_PERM_EVENTS))
25 return false;
26#endif
27 if ((old->path.mnt == new->path.mnt) &&
28 (old->path.dentry == new->path.dentry))
29 return true;
30 break;
31 case (FSNOTIFY_EVENT_NONE):
32 return true;
33 default:
34 BUG();
35 };
36 }
37 return false; 27 return false;
38} 28}
39 29
40/* and the list better be locked by something too! */ 30/* and the list better be locked by something too! */
41static struct fsnotify_event *fanotify_merge(struct list_head *list, 31static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
42 struct fsnotify_event *event)
43{ 32{
44 struct fsnotify_event_holder *test_holder; 33 struct fsnotify_event *test_event;
45 struct fsnotify_event *test_event = NULL; 34 bool do_merge = false;
46 struct fsnotify_event *new_event;
47 35
48 pr_debug("%s: list=%p event=%p\n", __func__, list, event); 36 pr_debug("%s: list=%p event=%p\n", __func__, list, event);
49 37
38#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
39 /*
40 * Don't merge a permission event with any other event so that we know
41 * the event structure we have created in fanotify_handle_event() is the
42 * one we should check for permission response.
43 */
44 if (event->mask & FAN_ALL_PERM_EVENTS)
45 return 0;
46#endif
50 47
51 list_for_each_entry_reverse(test_holder, list, event_list) { 48 list_for_each_entry_reverse(test_event, list, list) {
52 if (should_merge(test_holder->event, event)) { 49 if (should_merge(test_event, event)) {
53 test_event = test_holder->event; 50 do_merge = true;
54 break; 51 break;
55 } 52 }
56 } 53 }
57 54
58 if (!test_event) 55 if (!do_merge)
59 return NULL; 56 return 0;
60
61 fsnotify_get_event(test_event);
62
63 /* if they are exactly the same we are done */
64 if (test_event->mask == event->mask)
65 return test_event;
66
67 /*
68 * if the refcnt == 2 this is the only queue
69 * for this event and so we can update the mask
70 * in place.
71 */
72 if (atomic_read(&test_event->refcnt) == 2) {
73 test_event->mask |= event->mask;
74 return test_event;
75 }
76
77 new_event = fsnotify_clone_event(test_event);
78
79 /* done with test_event */
80 fsnotify_put_event(test_event);
81
82 /* couldn't allocate memory, merge was not possible */
83 if (unlikely(!new_event))
84 return ERR_PTR(-ENOMEM);
85
86 /* build new event and replace it on the list */
87 new_event->mask = (test_event->mask | event->mask);
88 fsnotify_replace_event(test_holder, new_event);
89 57
90 /* we hold a reference on new_event from clone_event */ 58 test_event->mask |= event->mask;
91 return new_event; 59 return 1;
92} 60}
93 61
94#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 62#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
95static int fanotify_get_response_from_access(struct fsnotify_group *group, 63static int fanotify_get_response_from_access(struct fsnotify_group *group,
96 struct fsnotify_event *event) 64 struct fanotify_event_info *event)
97{ 65{
98 int ret; 66 int ret;
99 67
@@ -106,7 +74,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
106 return 0; 74 return 0;
107 75
108 /* userspace responded, convert to something usable */ 76 /* userspace responded, convert to something usable */
109 spin_lock(&event->lock);
110 switch (event->response) { 77 switch (event->response) {
111 case FAN_ALLOW: 78 case FAN_ALLOW:
112 ret = 0; 79 ret = 0;
@@ -116,7 +83,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
116 ret = -EPERM; 83 ret = -EPERM;
117 } 84 }
118 event->response = 0; 85 event->response = 0;
119 spin_unlock(&event->lock);
120 86
121 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, 87 pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
122 group, event, ret); 88 group, event, ret);
@@ -125,58 +91,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
125} 91}
126#endif 92#endif
127 93
128static int fanotify_handle_event(struct fsnotify_group *group, 94static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
129 struct fsnotify_mark *inode_mark,
130 struct fsnotify_mark *fanotify_mark,
131 struct fsnotify_event *event)
132{
133 int ret = 0;
134 struct fsnotify_event *notify_event = NULL;
135
136 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
137 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
138 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
139 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
140 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
141 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
142 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
143 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
144 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
145 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
146
147 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
148
149 notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
150 if (IS_ERR(notify_event))
151 return PTR_ERR(notify_event);
152
153#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
154 if (event->mask & FAN_ALL_PERM_EVENTS) {
155 /* if we merged we need to wait on the new event */
156 if (notify_event)
157 event = notify_event;
158 ret = fanotify_get_response_from_access(group, event);
159 }
160#endif
161
162 if (notify_event)
163 fsnotify_put_event(notify_event);
164
165 return ret;
166}
167
168static bool fanotify_should_send_event(struct fsnotify_group *group,
169 struct inode *to_tell,
170 struct fsnotify_mark *inode_mark,
171 struct fsnotify_mark *vfsmnt_mark, 95 struct fsnotify_mark *vfsmnt_mark,
172 __u32 event_mask, void *data, int data_type) 96 u32 event_mask,
97 void *data, int data_type)
173{ 98{
174 __u32 marks_mask, marks_ignored_mask; 99 __u32 marks_mask, marks_ignored_mask;
175 struct path *path = data; 100 struct path *path = data;
176 101
177 pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " 102 pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p"
178 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, 103 " data_type=%d\n", __func__, inode_mark, vfsmnt_mark,
179 inode_mark, vfsmnt_mark, event_mask, data, data_type); 104 event_mask, data, data_type);
180 105
181 /* if we don't have enough info to send an event to userspace say no */ 106 /* if we don't have enough info to send an event to userspace say no */
182 if (data_type != FSNOTIFY_EVENT_PATH) 107 if (data_type != FSNOTIFY_EVENT_PATH)
@@ -217,6 +142,73 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
217 return false; 142 return false;
218} 143}
219 144
145static int fanotify_handle_event(struct fsnotify_group *group,
146 struct inode *inode,
147 struct fsnotify_mark *inode_mark,
148 struct fsnotify_mark *fanotify_mark,
149 u32 mask, void *data, int data_type,
150 const unsigned char *file_name, u32 cookie)
151{
152 int ret = 0;
153 struct fanotify_event_info *event;
154 struct fsnotify_event *fsn_event;
155
156 BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
157 BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
158 BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
159 BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
160 BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
161 BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
162 BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
163 BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
164 BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
165 BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
166
167 if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data,
168 data_type))
169 return 0;
170
171 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
172 mask);
173
174 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
175 if (unlikely(!event))
176 return -ENOMEM;
177
178 fsn_event = &event->fse;
179 fsnotify_init_event(fsn_event, inode, mask);
180 event->tgid = get_pid(task_tgid(current));
181 if (data_type == FSNOTIFY_EVENT_PATH) {
182 struct path *path = data;
183 event->path = *path;
184 path_get(&event->path);
185 } else {
186 event->path.mnt = NULL;
187 event->path.dentry = NULL;
188 }
189#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
190 event->response = 0;
191#endif
192
193 ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge);
194 if (ret) {
195 /* Permission events shouldn't be merged */
196 BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS);
197 /* Our event wasn't used in the end. Free it. */
198 fsnotify_destroy_event(group, fsn_event);
199
200 return 0;
201 }
202
203#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
204 if (mask & FAN_ALL_PERM_EVENTS) {
205 ret = fanotify_get_response_from_access(group, event);
206 fsnotify_destroy_event(group, fsn_event);
207 }
208#endif
209 return ret;
210}
211
220static void fanotify_free_group_priv(struct fsnotify_group *group) 212static void fanotify_free_group_priv(struct fsnotify_group *group)
221{ 213{
222 struct user_struct *user; 214 struct user_struct *user;
@@ -226,10 +218,18 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
226 free_uid(user); 218 free_uid(user);
227} 219}
228 220
221static void fanotify_free_event(struct fsnotify_event *fsn_event)
222{
223 struct fanotify_event_info *event;
224
225 event = FANOTIFY_E(fsn_event);
226 path_put(&event->path);
227 put_pid(event->tgid);
228 kmem_cache_free(fanotify_event_cachep, event);
229}
230
229const struct fsnotify_ops fanotify_fsnotify_ops = { 231const struct fsnotify_ops fanotify_fsnotify_ops = {
230 .handle_event = fanotify_handle_event, 232 .handle_event = fanotify_handle_event,
231 .should_send_event = fanotify_should_send_event,
232 .free_group_priv = fanotify_free_group_priv, 233 .free_group_priv = fanotify_free_group_priv,
233 .free_event_priv = NULL, 234 .free_event = fanotify_free_event,
234 .freeing_mark = NULL,
235}; 235};
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
new file mode 100644
index 000000000000..32a2f034fb94
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.h
@@ -0,0 +1,30 @@
1#include <linux/fsnotify_backend.h>
2#include <linux/path.h>
3#include <linux/slab.h>
4
5extern struct kmem_cache *fanotify_event_cachep;
6
7/*
8 * Lifetime of the structure differs for normal and permission events. In both
9 * cases the structure is allocated in fanotify_handle_event(). For normal
10 * events the structure is freed immediately after reporting it to userspace.
11 * For permission events we free it only after we receive a response from
12 * userspace.
13 */
14struct fanotify_event_info {
15 struct fsnotify_event fse;
16 /*
17 * We hold ref to this path so it may be dereferenced at any point
18 * during this object's lifetime
19 */
20 struct path path;
21 struct pid *tgid;
22#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
23 u32 response; /* userspace answer to question */
24#endif
25};
26
27static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
28{
29 return container_of(fse, struct fanotify_event_info, fse);
30}
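This embedding is the heart of the refactor: generic fsnotify code only ever queues the plain fsnotify_event, and fanotify recovers its wrapper with the container_of() cast. A condensed round trip, using only calls that appear in this diff:

	/* Allocate the wrapper and initialise the embedded generic event. */
	struct fanotify_event_info *event =
		kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
	fsnotify_init_event(&event->fse, inode, mask);

	/* The generic layer sees only &event->fse; the backend gets its
	 * wrapper back via the container_of() helper. */
	struct fanotify_event_info *same = FANOTIFY_E(&event->fse);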
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index e44cb6427df3..287a22c04149 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -19,6 +19,7 @@
19 19
20#include "../../mount.h" 20#include "../../mount.h"
21#include "../fdinfo.h" 21#include "../fdinfo.h"
22#include "fanotify.h"
22 23
23#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 24#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
24#define FANOTIFY_DEFAULT_MAX_MARKS 8192 25#define FANOTIFY_DEFAULT_MAX_MARKS 8192
@@ -28,11 +29,12 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops;
28 29
29static struct kmem_cache *fanotify_mark_cache __read_mostly; 30static struct kmem_cache *fanotify_mark_cache __read_mostly;
30static struct kmem_cache *fanotify_response_event_cache __read_mostly; 31static struct kmem_cache *fanotify_response_event_cache __read_mostly;
32struct kmem_cache *fanotify_event_cachep __read_mostly;
31 33
32struct fanotify_response_event { 34struct fanotify_response_event {
33 struct list_head list; 35 struct list_head list;
34 __s32 fd; 36 __s32 fd;
35 struct fsnotify_event *event; 37 struct fanotify_event_info *event;
36}; 38};
37 39
38/* 40/*
@@ -61,8 +63,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
61} 63}
62 64
63static int create_fd(struct fsnotify_group *group, 65static int create_fd(struct fsnotify_group *group,
64 struct fsnotify_event *event, 66 struct fanotify_event_info *event,
65 struct file **file) 67 struct file **file)
66{ 68{
67 int client_fd; 69 int client_fd;
68 struct file *new_file; 70 struct file *new_file;
@@ -73,12 +75,6 @@ static int create_fd(struct fsnotify_group *group,
73 if (client_fd < 0) 75 if (client_fd < 0)
74 return client_fd; 76 return client_fd;
75 77
76 if (event->data_type != FSNOTIFY_EVENT_PATH) {
77 WARN_ON(1);
78 put_unused_fd(client_fd);
79 return -EINVAL;
80 }
81
82 /* 78 /*
83 * we need a new file handle for the userspace program so it can read even if it was 79 * we need a new file handle for the userspace program so it can read even if it was
84 * originally opened O_WRONLY. 80 * originally opened O_WRONLY.
@@ -109,23 +105,25 @@ static int create_fd(struct fsnotify_group *group,
 }
 
 static int fill_event_metadata(struct fsnotify_group *group,
 			       struct fanotify_event_metadata *metadata,
-			       struct fsnotify_event *event,
+			       struct fsnotify_event *fsn_event,
 			       struct file **file)
 {
 	int ret = 0;
+	struct fanotify_event_info *event;
 
 	pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
-		 group, metadata, event);
+		 group, metadata, fsn_event);
 
 	*file = NULL;
+	event = container_of(fsn_event, struct fanotify_event_info, fse);
 	metadata->event_len = FAN_EVENT_METADATA_LEN;
 	metadata->metadata_len = FAN_EVENT_METADATA_LEN;
 	metadata->vers = FANOTIFY_METADATA_VERSION;
 	metadata->reserved = 0;
-	metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
+	metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS;
 	metadata->pid = pid_vnr(event->tgid);
-	if (unlikely(event->mask & FAN_Q_OVERFLOW))
+	if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
 		metadata->fd = FAN_NOFD;
 	else {
 		metadata->fd = create_fd(group, event, file);
@@ -209,7 +207,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
 	if (!re)
 		return -ENOMEM;
 
-	re->event = event;
+	re->event = FANOTIFY_E(event);
 	re->fd = fd;
 
 	mutex_lock(&group->fanotify_data.access_mutex);
@@ -217,7 +215,7 @@ static int prepare_for_access_response(struct fsnotify_group *group,
 	if (atomic_read(&group->fanotify_data.bypass_perm)) {
 		mutex_unlock(&group->fanotify_data.access_mutex);
 		kmem_cache_free(fanotify_response_event_cache, re);
-		event->response = FAN_ALLOW;
+		FANOTIFY_E(event)->response = FAN_ALLOW;
 		return 0;
 	}
 
@@ -273,7 +271,7 @@ out_close_fd:
 out:
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	if (event->mask & FAN_ALL_PERM_EVENTS) {
-		event->response = FAN_DENY;
+		FANOTIFY_E(event)->response = FAN_DENY;
 		wake_up(&group->fanotify_data.access_waitq);
 	}
 #endif
@@ -321,7 +319,12 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 		if (IS_ERR(kevent))
 			break;
 		ret = copy_event_to_user(group, kevent, buf);
-		fsnotify_put_event(kevent);
+		/*
+		 * Permission events get destroyed after we
+		 * receive response
+		 */
+		if (!(kevent->mask & FAN_ALL_PERM_EVENTS))
+			fsnotify_destroy_event(group, kevent);
 		if (ret < 0)
 			break;
 		buf += ret;
@@ -409,7 +412,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
 static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct fsnotify_group *group;
-	struct fsnotify_event_holder *holder;
+	struct fsnotify_event *fsn_event;
 	void __user *p;
 	int ret = -ENOTTY;
 	size_t send_len = 0;
@@ -421,7 +424,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
 	switch (cmd) {
 	case FIONREAD:
 		mutex_lock(&group->notification_mutex);
-		list_for_each_entry(holder, &group->notification_list, event_list)
+		list_for_each_entry(fsn_event, &group->notification_list, list)
 			send_len += FAN_EVENT_METADATA_LEN;
 		mutex_unlock(&group->notification_mutex);
 		ret = put_user(send_len, (int __user *) p);
@@ -695,6 +698,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	struct fsnotify_group *group;
 	int f_flags, fd;
 	struct user_struct *user;
+	struct fanotify_event_info *oevent;
 
 	pr_debug("%s: flags=%d event_f_flags=%d\n",
 		 __func__, flags, event_f_flags);
@@ -727,8 +731,20 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	group->fanotify_data.user = user;
 	atomic_inc(&user->fanotify_listeners);
 
+	oevent = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
+	if (unlikely(!oevent)) {
+		fd = -ENOMEM;
+		goto out_destroy_group;
+	}
+	group->overflow_event = &oevent->fse;
+	fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
+	oevent->tgid = get_pid(task_tgid(current));
+	oevent->path.mnt = NULL;
+	oevent->path.dentry = NULL;
+
 	group->fanotify_data.f_flags = event_f_flags;
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	oevent->response = 0;
 	mutex_init(&group->fanotify_data.access_mutex);
 	init_waitqueue_head(&group->fanotify_data.access_waitq);
 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
@@ -888,9 +904,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
 {
 	return sys_fanotify_mark(fanotify_fd, flags,
 #ifdef __BIG_ENDIAN
-				((__u64)mask1 << 32) | mask0,
-#else
 				((__u64)mask0 << 32) | mask1,
+#else
+				((__u64)mask1 << 32) | mask0,
 #endif
 				 dfd, pathname);
 }
@@ -906,6 +922,7 @@ static int __init fanotify_user_setup(void)
 	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
 	fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
 						   SLAB_PANIC);
+	fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
 
 	return 0;
 }
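
[Annotation: the COMPAT_SYSCALL_DEFINE6 hunk above swaps which endianness branch recombines the 64-bit mask. On 32-bit compat, a 64-bit argument arrives as two 32-bit halves whose order depends on the calling convention's endianness. The following standalone sketch shows the arithmetic; join_mask, arg0 and arg1 are hypothetical names for illustration only.]

#include <stdint.h>
#include <stdio.h>

/*
 * Recombine a 64-bit mask that was passed as two 32-bit halves.
 * Which half carries the high word depends on endianness; the hunk
 * above fixes which combination belongs under __BIG_ENDIAN.
 */
static uint64_t join_mask(uint32_t arg0, uint32_t arg1, int big_endian)
{
	if (big_endian)
		return ((uint64_t)arg0 << 32) | arg1;	/* first half is high */
	return ((uint64_t)arg1 << 32) | arg0;		/* second half is high */
}

int main(void)
{
	uint64_t mask = 0x123456789abcdef0ULL;
	uint32_t lo = (uint32_t)mask, hi = (uint32_t)(mask >> 32);

	/* little-endian convention passes (lo, hi) */
	printf("%d\n", join_mask(lo, hi, 0) == mask);	/* 1 */
	/* big-endian convention passes (hi, lo) */
	printf("%d\n", join_mask(hi, lo, 1) == mask);	/* 1 */
	return 0;
}
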
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4bb21d67d9b1..9d3e9c50066a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell,
 			 struct fsnotify_mark *vfsmount_mark,
 			 __u32 mask, void *data,
 			 int data_is, u32 cookie,
-			 const unsigned char *file_name,
-			 struct fsnotify_event **event)
+			 const unsigned char *file_name)
 {
 	struct fsnotify_group *group = NULL;
 	__u32 inode_test_mask = 0;
@@ -170,27 +169,17 @@ static int send_to_group(struct inode *to_tell,
 
 	pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
 		 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
-		 " data=%p data_is=%d cookie=%d event=%p\n",
+		 " data=%p data_is=%d cookie=%d\n",
 		 __func__, group, to_tell, mask, inode_mark,
 		 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
-		 data_is, cookie, *event);
+		 data_is, cookie);
 
 	if (!inode_test_mask && !vfsmount_test_mask)
 		return 0;
 
-	if (group->ops->should_send_event(group, to_tell, inode_mark,
-					  vfsmount_mark, mask, data,
-					  data_is) == false)
-		return 0;
-
-	if (!*event) {
-		*event = fsnotify_create_event(to_tell, mask, data,
-					       data_is, file_name,
-					       cookie, GFP_KERNEL);
-		if (!*event)
-			return -ENOMEM;
-	}
-	return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
+	return group->ops->handle_event(group, to_tell, inode_mark,
+					vfsmount_mark, mask, data, data_is,
+					file_name, cookie);
 }
 
 /*
@@ -205,7 +194,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
 	struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
 	struct fsnotify_group *inode_group, *vfsmount_group;
-	struct fsnotify_event *event = NULL;
 	struct mount *mnt;
 	int idx, ret = 0;
 	/* global tests shouldn't care about events on child only the specific event */
@@ -258,18 +246,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 
 		if (inode_group > vfsmount_group) {
 			/* handle inode */
-			ret = send_to_group(to_tell, inode_mark, NULL, mask, data,
-					    data_is, cookie, file_name, &event);
+			ret = send_to_group(to_tell, inode_mark, NULL, mask,
+					    data, data_is, cookie, file_name);
 			/* we didn't use the vfsmount_mark */
 			vfsmount_group = NULL;
 		} else if (vfsmount_group > inode_group) {
-			ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data,
-					    data_is, cookie, file_name, &event);
+			ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
+					    data, data_is, cookie, file_name);
 			inode_group = NULL;
 		} else {
 			ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
-					    mask, data, data_is, cookie, file_name,
-					    &event);
+					    mask, data, data_is, cookie,
+					    file_name);
 		}
 
 		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
@@ -285,12 +273,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 		ret = 0;
 out:
 	srcu_read_unlock(&fsnotify_mark_srcu, idx);
-	/*
-	 * fsnotify_create_event() took a reference so the event can't be cleaned
-	 * up while we are still trying to add it to lists, drop that one.
-	 */
-	if (event)
-		fsnotify_put_event(event);
 
 	return ret;
 }
diff --git a/fs/notify/group.c b/fs/notify/group.c
index bd2625bd88b4..ad1995980456 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -55,6 +55,13 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
 	/* clear the notification queue of all events */
 	fsnotify_flush_notify(group);
 
+	/*
+	 * Destroy overflow event (we cannot use fsnotify_destroy_event() as
+	 * that deliberately ignores overflow events.
+	 */
+	if (group->overflow_event)
+		group->ops->free_event(group->overflow_event);
+
 	fsnotify_put_group(group);
 }
 
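
[Annotation: the group.c hunk above establishes the ownership rule for the preallocated per-group overflow event: the normal event destructor deliberately skips it, and it is released exactly once when the group itself is torn down. A toy standalone model of that rule, with hypothetical names, not kernel code:]

#include <stdio.h>
#include <stdlib.h>

struct event { int is_overflow; };
struct group { struct event *overflow_event; };

static void destroy_event(struct event *ev)
{
	if (!ev || ev->is_overflow)	/* normal path ignores overflow */
		return;
	free(ev);
}

static void destroy_group(struct group *g)
{
	free(g->overflow_event);	/* freed exactly once, here */
	g->overflow_event = NULL;
}

int main(void)
{
	struct group g;

	g.overflow_event = malloc(sizeof(*g.overflow_event));
	if (!g.overflow_event)
		return 1;
	g.overflow_event->is_overflow = 1;

	destroy_event(g.overflow_event);	/* no-op by design */
	destroy_group(&g);			/* actual release */
	puts("overflow event released once");
	return 0;
}
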
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index b6642e4de4bf..ed855ef6f077 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -2,11 +2,12 @@
 #include <linux/inotify.h>
 #include <linux/slab.h> /* struct kmem_cache */
 
-extern struct kmem_cache *event_priv_cachep;
-
-struct inotify_event_private_data {
-	struct fsnotify_event_private_data fsnotify_event_priv_data;
+struct inotify_event_info {
+	struct fsnotify_event fse;
 	int wd;
+	u32 sync_cookie;
+	int name_len;
+	char name[];
 };
 
 struct inotify_inode_mark {
@@ -14,8 +15,18 @@ struct inotify_inode_mark {
 	int wd;
 };
 
+static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
+{
+	return container_of(fse, struct inotify_event_info, fse);
+}
+
 extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 					   struct fsnotify_group *group);
-extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
+extern int inotify_handle_event(struct fsnotify_group *group,
+				struct inode *inode,
+				struct fsnotify_mark *inode_mark,
+				struct fsnotify_mark *vfsmount_mark,
+				u32 mask, void *data, int data_type,
+				const unsigned char *file_name, u32 cookie);
 
 extern const struct fsnotify_ops inotify_fsnotify_ops;
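
[Annotation: inotify_event_info above ends with the flexible array member `char name[]`, so one allocation covers both the fixed header and the variable-length file name, exactly as inotify_handle_event() later computes `alloc_len`. A userspace sketch of that allocation pattern follows; the names are hypothetical and only illustrate the technique.]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct event_info {
	int wd;
	int name_len;
	char name[];	/* flexible array member, sized at alloc time */
};

static struct event_info *alloc_event(int wd, const char *file_name)
{
	size_t len = file_name ? strlen(file_name) : 0;
	size_t alloc_len = sizeof(struct event_info) + (len ? len + 1 : 0);
	struct event_info *ev = malloc(alloc_len);

	if (!ev)
		return NULL;
	ev->wd = wd;
	ev->name_len = (int)len;
	if (len)
		strcpy(ev->name, file_name);	/* fits: we allocated len + 1 */
	return ev;
}

int main(void)
{
	struct event_info *ev = alloc_event(3, "foo.txt");

	if (ev) {
		printf("wd=%d name=%s\n", ev->wd, ev->name);
		free(ev);	/* one free releases header and name together */
	}
	return 0;
}
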
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 4216308b81b4..43ab1e1a07a2 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -34,107 +34,90 @@
 #include "inotify.h"
 
 /*
- * Check if 2 events contain the same information. We do not compare private data
- * but at this moment that isn't a problem for any know fsnotify listeners.
+ * Check if 2 events contain the same information.
  */
-static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
+static bool event_compare(struct fsnotify_event *old_fsn,
+			  struct fsnotify_event *new_fsn)
 {
-	if ((old->mask == new->mask) &&
-	    (old->to_tell == new->to_tell) &&
-	    (old->data_type == new->data_type) &&
-	    (old->name_len == new->name_len)) {
-		switch (old->data_type) {
-		case (FSNOTIFY_EVENT_INODE):
-			/* remember, after old was put on the wait_q we aren't
-			 * allowed to look at the inode any more, only thing
-			 * left to check was if the file_name is the same */
-			if (!old->name_len ||
-			    !strcmp(old->file_name, new->file_name))
-				return true;
-			break;
-		case (FSNOTIFY_EVENT_PATH):
-			if ((old->path.mnt == new->path.mnt) &&
-			    (old->path.dentry == new->path.dentry))
-				return true;
-			break;
-		case (FSNOTIFY_EVENT_NONE):
-			if (old->mask & FS_Q_OVERFLOW)
-				return true;
-			else if (old->mask & FS_IN_IGNORED)
-				return false;
-			return true;
-		};
-	}
+	struct inotify_event_info *old, *new;
+
+	if (old_fsn->mask & FS_IN_IGNORED)
+		return false;
+	old = INOTIFY_E(old_fsn);
+	new = INOTIFY_E(new_fsn);
+	if ((old_fsn->mask == new_fsn->mask) &&
+	    (old_fsn->inode == new_fsn->inode) &&
+	    (old->name_len == new->name_len) &&
+	    (!old->name_len || !strcmp(old->name, new->name)))
+		return true;
 	return false;
 }
 
-static struct fsnotify_event *inotify_merge(struct list_head *list,
-					    struct fsnotify_event *event)
+static int inotify_merge(struct list_head *list,
+			  struct fsnotify_event *event)
 {
-	struct fsnotify_event_holder *last_holder;
 	struct fsnotify_event *last_event;
 
-	/* and the list better be locked by something too */
-	spin_lock(&event->lock);
-
-	last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
-	last_event = last_holder->event;
-	if (event_compare(last_event, event))
-		fsnotify_get_event(last_event);
-	else
-		last_event = NULL;
-
-	spin_unlock(&event->lock);
-
-	return last_event;
+	last_event = list_entry(list->prev, struct fsnotify_event, list);
+	return event_compare(last_event, event);
 }
 
-static int inotify_handle_event(struct fsnotify_group *group,
-				struct fsnotify_mark *inode_mark,
-				struct fsnotify_mark *vfsmount_mark,
-				struct fsnotify_event *event)
+int inotify_handle_event(struct fsnotify_group *group,
+			 struct inode *inode,
+			 struct fsnotify_mark *inode_mark,
+			 struct fsnotify_mark *vfsmount_mark,
+			 u32 mask, void *data, int data_type,
+			 const unsigned char *file_name, u32 cookie)
 {
 	struct inotify_inode_mark *i_mark;
-	struct inode *to_tell;
-	struct inotify_event_private_data *event_priv;
-	struct fsnotify_event_private_data *fsn_event_priv;
-	struct fsnotify_event *added_event;
-	int wd, ret = 0;
+	struct inotify_event_info *event;
+	struct fsnotify_event *fsn_event;
+	int ret;
+	int len = 0;
+	int alloc_len = sizeof(struct inotify_event_info);
 
 	BUG_ON(vfsmount_mark);
 
-	pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group,
-		 event, event->to_tell, event->mask);
+	if ((inode_mark->mask & FS_EXCL_UNLINK) &&
+	    (data_type == FSNOTIFY_EVENT_PATH)) {
+		struct path *path = data;
+
+		if (d_unlinked(path->dentry))
+			return 0;
+	}
+	if (file_name) {
+		len = strlen(file_name);
+		alloc_len += len + 1;
+	}
 
-	to_tell = event->to_tell;
+	pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
+		 mask);
 
 	i_mark = container_of(inode_mark, struct inotify_inode_mark,
 			      fsn_mark);
-	wd = i_mark->wd;
 
-	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
-	if (unlikely(!event_priv))
+	event = kmalloc(alloc_len, GFP_KERNEL);
+	if (unlikely(!event))
 		return -ENOMEM;
 
-	fsn_event_priv = &event_priv->fsnotify_event_priv_data;
-
-	fsnotify_get_group(group);
-	fsn_event_priv->group = group;
-	event_priv->wd = wd;
-
-	added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
-	if (added_event) {
-		inotify_free_event_priv(fsn_event_priv);
-		if (!IS_ERR(added_event))
-			fsnotify_put_event(added_event);
-		else
-			ret = PTR_ERR(added_event);
+	fsn_event = &event->fse;
+	fsnotify_init_event(fsn_event, inode, mask);
+	event->wd = i_mark->wd;
+	event->sync_cookie = cookie;
+	event->name_len = len;
+	if (len)
+		strcpy(event->name, file_name);
+
+	ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge);
+	if (ret) {
+		/* Our event wasn't used in the end. Free it. */
+		fsnotify_destroy_event(group, fsn_event);
 	}
 
 	if (inode_mark->mask & IN_ONESHOT)
 		fsnotify_destroy_mark(inode_mark, group);
 
-	return ret;
+	return 0;
 }
 
 static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
@@ -142,22 +125,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify
 	inotify_ignored_and_remove_idr(fsn_mark, group);
 }
 
-static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
-				      struct fsnotify_mark *inode_mark,
-				      struct fsnotify_mark *vfsmount_mark,
-				      __u32 mask, void *data, int data_type)
-{
-	if ((inode_mark->mask & FS_EXCL_UNLINK) &&
-	    (data_type == FSNOTIFY_EVENT_PATH)) {
-		struct path *path = data;
-
-		if (d_unlinked(path->dentry))
-			return false;
-	}
-
-	return true;
-}
-
 /*
  * This is NEVER supposed to be called.  Inotify marks should either have been
  * removed from the idr when the watch was removed or in the
@@ -202,22 +169,14 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
 	free_uid(group->inotify_data.user);
 }
 
-void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
+static void inotify_free_event(struct fsnotify_event *fsn_event)
 {
-	struct inotify_event_private_data *event_priv;
-
-
-	event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
-				  fsnotify_event_priv_data);
-
-	fsnotify_put_group(fsn_event_priv->group);
-	kmem_cache_free(event_priv_cachep, event_priv);
+	kfree(INOTIFY_E(fsn_event));
 }
 
 const struct fsnotify_ops inotify_fsnotify_ops = {
 	.handle_event = inotify_handle_event,
-	.should_send_event = inotify_should_send_event,
 	.free_group_priv = inotify_free_group_priv,
-	.free_event_priv = inotify_free_event_priv,
+	.free_event = inotify_free_event,
 	.freeing_mark = inotify_freeing_mark,
 };
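
[Annotation: with per-group events, the merge policy above simplifies to comparing the new event against only the most recently queued one. The standalone sketch below models that tail-merge decision; struct ev, same() and queue_or_merge() are hypothetical names used for illustration only.]

#include <stdio.h>
#include <string.h>

struct ev { unsigned mask; const char *name; };

/* Duplicate check against the queue tail, like event_compare() above. */
static int same(const struct ev *old, const struct ev *new)
{
	return old->mask == new->mask && !strcmp(old->name, new->name);
}

/* Returns 1 if merged (caller keeps nothing), 0 if appended. */
static int queue_or_merge(struct ev *q, int *qlen, struct ev e)
{
	if (*qlen && same(&q[*qlen - 1], &e))
		return 1;	/* merged: the new event is dropped */
	q[(*qlen)++] = e;
	return 0;	/* queued */
}

int main(void)
{
	struct ev q[8];
	int qlen = 0;

	queue_or_merge(q, &qlen, (struct ev){ 0x2, "a" });
	queue_or_merge(q, &qlen, (struct ev){ 0x2, "a" });	/* merged */
	queue_or_merge(q, &qlen, (struct ev){ 0x2, "b" });
	printf("queued %d of 3 events\n", qlen);	/* prints 2 */
	return 0;
}
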
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 60f954a891ab..78a2ca3966c3 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly;
 static int inotify_max_user_watches __read_mostly;
 
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
-struct kmem_cache *event_priv_cachep __read_mostly;
 
 #ifdef CONFIG_SYSCTL
 
@@ -124,6 +123,16 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
 	return ret;
 }
 
+static int round_event_name_len(struct fsnotify_event *fsn_event)
+{
+	struct inotify_event_info *event;
+
+	event = INOTIFY_E(fsn_event);
+	if (!event->name_len)
+		return 0;
+	return roundup(event->name_len + 1, sizeof(struct inotify_event));
+}
+
 /*
  * Get an inotify_kernel_event if one exists and is small
  * enough to fit in "count". Return an error pointer if
@@ -144,9 +153,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	if (event->name_len)
-		event_size += roundup(event->name_len + 1, event_size);
-
+	event_size += round_event_name_len(event);
 	if (event_size > count)
 		return ERR_PTR(-EINVAL);
 
@@ -164,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
  * buffer we had in "get_one_event()" above.
  */
 static ssize_t copy_event_to_user(struct fsnotify_group *group,
-				  struct fsnotify_event *event,
+				  struct fsnotify_event *fsn_event,
 				  char __user *buf)
 {
 	struct inotify_event inotify_event;
-	struct fsnotify_event_private_data *fsn_priv;
-	struct inotify_event_private_data *priv;
+	struct inotify_event_info *event;
 	size_t event_size = sizeof(struct inotify_event);
-	size_t name_len = 0;
+	size_t name_len;
+	size_t pad_name_len;
 
-	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-
-	/* we get the inotify watch descriptor from the event private data */
-	spin_lock(&event->lock);
-	fsn_priv = fsnotify_remove_priv_from_event(group, event);
-	spin_unlock(&event->lock);
-
-	if (!fsn_priv)
-		inotify_event.wd = -1;
-	else {
-		priv = container_of(fsn_priv, struct inotify_event_private_data,
-				    fsnotify_event_priv_data);
-		inotify_event.wd = priv->wd;
-		inotify_free_event_priv(fsn_priv);
-	}
+	pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
 
+	event = INOTIFY_E(fsn_event);
+	name_len = event->name_len;
 	/*
-	 * round up event->name_len so it is a multiple of event_size
+	 * round up name length so it is a multiple of event_size
 	 * plus an extra byte for the terminating '\0'.
 	 */
-	if (event->name_len)
-		name_len = roundup(event->name_len + 1, event_size);
-	inotify_event.len = name_len;
-
-	inotify_event.mask = inotify_mask_to_arg(event->mask);
+	pad_name_len = round_event_name_len(fsn_event);
+	inotify_event.len = pad_name_len;
+	inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
+	inotify_event.wd = event->wd;
 	inotify_event.cookie = event->sync_cookie;
 
 	/* send the main event */
@@ -209,20 +203,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	/*
 	 * fsnotify only stores the pathname, so here we have to send the pathname
 	 * and then pad that pathname out to a multiple of sizeof(inotify_event)
-	 * with zeros.  I get my zeros from the nul_inotify_event.
+	 * with zeros.
 	 */
-	if (name_len) {
-		unsigned int len_to_zero = name_len - event->name_len;
+	if (pad_name_len) {
 		/* copy the path name */
-		if (copy_to_user(buf, event->file_name, event->name_len))
+		if (copy_to_user(buf, event->name, name_len))
 			return -EFAULT;
-		buf += event->name_len;
+		buf += name_len;
 
 		/* fill userspace with 0's */
-		if (clear_user(buf, len_to_zero))
+		if (clear_user(buf, pad_name_len - name_len))
 			return -EFAULT;
-		buf += len_to_zero;
-		event_size += name_len;
+		event_size += pad_name_len;
 	}
 
 	return event_size;
@@ -254,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 		if (IS_ERR(kevent))
 			break;
 		ret = copy_event_to_user(group, kevent, buf);
-		fsnotify_put_event(kevent);
+		fsnotify_destroy_event(group, kevent);
 		if (ret < 0)
 			break;
 		buf += ret;
@@ -297,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 			 unsigned long arg)
 {
 	struct fsnotify_group *group;
-	struct fsnotify_event_holder *holder;
-	struct fsnotify_event *event;
+	struct fsnotify_event *fsn_event;
 	void __user *p;
 	int ret = -ENOTTY;
 	size_t send_len = 0;
@@ -311,12 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 	switch (cmd) {
 	case FIONREAD:
 		mutex_lock(&group->notification_mutex);
-		list_for_each_entry(holder, &group->notification_list, event_list) {
-			event = holder->event;
+		list_for_each_entry(fsn_event, &group->notification_list,
+				    list) {
 			send_len += sizeof(struct inotify_event);
-			if (event->name_len)
-				send_len += roundup(event->name_len + 1,
-						sizeof(struct inotify_event));
+			send_len += round_event_name_len(fsn_event);
 		}
 		mutex_unlock(&group->notification_mutex);
 		ret = put_user(send_len, (int __user *) p);
@@ -503,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 				    struct fsnotify_group *group)
 {
 	struct inotify_inode_mark *i_mark;
-	struct fsnotify_event *ignored_event, *notify_event;
-	struct inotify_event_private_data *event_priv;
-	struct fsnotify_event_private_data *fsn_event_priv;
-	int ret;
-
-	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
-
-	ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
-					      FSNOTIFY_EVENT_NONE, NULL, 0,
-					      GFP_NOFS);
-	if (!ignored_event)
-		goto skip_send_ignore;
-
-	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
-	if (unlikely(!event_priv))
-		goto skip_send_ignore;
-
-	fsn_event_priv = &event_priv->fsnotify_event_priv_data;
-
-	fsnotify_get_group(group);
-	fsn_event_priv->group = group;
-	event_priv->wd = i_mark->wd;
-
-	notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
-	if (notify_event) {
-		if (IS_ERR(notify_event))
-			ret = PTR_ERR(notify_event);
-		else
-			fsnotify_put_event(notify_event);
-		inotify_free_event_priv(fsn_event_priv);
-	}
 
-skip_send_ignore:
-	/* matches the reference taken when the event was created */
-	if (ignored_event)
-		fsnotify_put_event(ignored_event);
+	/* Queue ignore event for the watch */
+	inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED,
+			     NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
 
+	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
 	/* remove this mark from the idr */
 	inotify_remove_from_idr(group, i_mark);
 
@@ -675,11 +633,23 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
 static struct fsnotify_group *inotify_new_group(unsigned int max_events)
 {
 	struct fsnotify_group *group;
+	struct inotify_event_info *oevent;
 
 	group = fsnotify_alloc_group(&inotify_fsnotify_ops);
 	if (IS_ERR(group))
 		return group;
 
+	oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
+	if (unlikely(!oevent)) {
+		fsnotify_destroy_group(group);
+		return ERR_PTR(-ENOMEM);
+	}
+	group->overflow_event = &oevent->fse;
+	fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
+	oevent->wd = -1;
+	oevent->sync_cookie = 0;
+	oevent->name_len = 0;
+
 	group->max_events = max_events;
 
 	spin_lock_init(&group->inotify_data.idr_lock);
@@ -836,7 +806,6 @@ static int __init inotify_user_setup(void)
 	BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
 
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
-	event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
 
 	inotify_max_queued_events = 16384;
 	inotify_max_user_instances = 128;
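
[Annotation: round_event_name_len() above pads name length plus a NUL up to a multiple of sizeof(struct inotify_event). The sketch below works the arithmetic with sizeof(struct inotify_event) assumed to be 16 bytes, which is typical on common ABIs but is an assumption here, not something the patch states.]

#include <stdio.h>

#define EVENT_SIZE 16	/* assumed sizeof(struct inotify_event) */
#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

static int padded_name_len(int name_len)
{
	if (!name_len)
		return 0;
	return ROUNDUP(name_len + 1, EVENT_SIZE);	/* +1 for the NUL */
}

int main(void)
{
	/* "foo.txt" is 7 bytes: 7 + 1 rounds up to 16 */
	printf("%d\n", padded_name_len(7));	/* 16 */
	/* a 16-byte name needs 17 with the NUL, rounding to 32 */
	printf("%d\n", padded_name_len(16));	/* 32 */
	return 0;
}
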
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 7b51b05f160c..1e58402171a5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -48,15 +48,6 @@
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
-static struct kmem_cache *fsnotify_event_cachep;
-static struct kmem_cache *fsnotify_event_holder_cachep;
-/*
- * This is a magic event we send when the q is too full.  Since it doesn't
- * hold real event information we just keep one system wide and use it any time
- * it is needed.  It's refcnt is set 1 at kernel init time and will never
- * get set to 0 so it will never get 'freed'
- */
-static struct fsnotify_event *q_overflow_event;
 static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
 
 /**
@@ -76,186 +67,82 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
 	return list_empty(&group->notification_list) ? true : false;
 }
 
-void fsnotify_get_event(struct fsnotify_event *event)
+void fsnotify_destroy_event(struct fsnotify_group *group,
+			    struct fsnotify_event *event)
 {
-	atomic_inc(&event->refcnt);
-}
-
-void fsnotify_put_event(struct fsnotify_event *event)
-{
-	if (!event)
+	/* Overflow events are per-group and we don't want to free them */
+	if (!event || event->mask == FS_Q_OVERFLOW)
 		return;
 
-	if (atomic_dec_and_test(&event->refcnt)) {
-		pr_debug("%s: event=%p\n", __func__, event);
-
-		if (event->data_type == FSNOTIFY_EVENT_PATH)
-			path_put(&event->path);
-
-		BUG_ON(!list_empty(&event->private_data_list));
-
-		kfree(event->file_name);
-		put_pid(event->tgid);
-		kmem_cache_free(fsnotify_event_cachep, event);
-	}
-}
-
-struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
-{
-	return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
-}
-
-void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
-{
-	if (holder)
-		kmem_cache_free(fsnotify_event_holder_cachep, holder);
-}
-
-/*
- * Find the private data that the group previously attached to this event when
- * the group added the event to the notification queue (fsnotify_add_notify_event)
- */
-struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
-{
-	struct fsnotify_event_private_data *lpriv;
-	struct fsnotify_event_private_data *priv = NULL;
-
-	assert_spin_locked(&event->lock);
-
-	list_for_each_entry(lpriv, &event->private_data_list, event_list) {
-		if (lpriv->group == group) {
-			priv = lpriv;
-			list_del(&priv->event_list);
-			break;
-		}
-	}
-	return priv;
+	group->ops->free_event(event);
 }
 
 /*
  * Add an event to the group notification queue.  The group can later pull this
- * event off the queue to deal with.  If the event is successfully added to the
- * group's notification queue, a reference is taken on event.
+ * event off the queue to deal with.  The function returns 0 if the event was
+ * added to the queue, 1 if the event was merged with some other queued event,
+ * 2 if the queue of events has overflown.
  */
-struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
-						 struct fsnotify_event_private_data *priv,
-						 struct fsnotify_event *(*merge)(struct list_head *,
-										 struct fsnotify_event *))
+int fsnotify_add_notify_event(struct fsnotify_group *group,
+			      struct fsnotify_event *event,
+			      int (*merge)(struct list_head *,
+					   struct fsnotify_event *))
 {
-	struct fsnotify_event *return_event = NULL;
-	struct fsnotify_event_holder *holder = NULL;
+	int ret = 0;
 	struct list_head *list = &group->notification_list;
 
-	pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv);
-
-	/*
-	 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
-	 * Check if we expect to be able to use that holder.  If not alloc a new
-	 * holder.
-	 * For the overflow event it's possible that something will use the in
-	 * event holder before we get the lock so we may need to jump back and
-	 * alloc a new holder, this can't happen for most events...
-	 */
-	if (!list_empty(&event->holder.event_list)) {
-alloc_holder:
-		holder = fsnotify_alloc_event_holder();
-		if (!holder)
-			return ERR_PTR(-ENOMEM);
-	}
+	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
 	mutex_lock(&group->notification_mutex);
 
 	if (group->q_len >= group->max_events) {
-		event = q_overflow_event;
-
-		/*
-		 * we need to return the overflow event
-		 * which means we need a ref
-		 */
-		fsnotify_get_event(event);
-		return_event = event;
-
-		/* sorry, no private data on the overflow event */
-		priv = NULL;
-	}
-
-	if (!list_empty(list) && merge) {
-		struct fsnotify_event *tmp;
-
-		tmp = merge(list, event);
-		if (tmp) {
+		ret = 2;
+		/* Queue overflow event only if it isn't already queued */
+		if (!list_empty(&group->overflow_event->list)) {
 			mutex_unlock(&group->notification_mutex);
-
-			if (return_event)
-				fsnotify_put_event(return_event);
-			if (holder != &event->holder)
-				fsnotify_destroy_event_holder(holder);
-			return tmp;
+			return ret;
 		}
+		event = group->overflow_event;
+		goto queue;
 	}
 
-	spin_lock(&event->lock);
-
-	if (list_empty(&event->holder.event_list)) {
-		if (unlikely(holder))
-			fsnotify_destroy_event_holder(holder);
-		holder = &event->holder;
-	} else if (unlikely(!holder)) {
-		/* between the time we checked above and got the lock the in
-		 * event holder was used, go back and get a new one */
-		spin_unlock(&event->lock);
-		mutex_unlock(&group->notification_mutex);
-
-		if (return_event) {
-			fsnotify_put_event(return_event);
-			return_event = NULL;
+	if (!list_empty(list) && merge) {
+		ret = merge(list, event);
+		if (ret) {
+			mutex_unlock(&group->notification_mutex);
+			return ret;
 		}
-
-		goto alloc_holder;
 	}
 
+queue:
 	group->q_len++;
-	holder->event = event;
-
-	fsnotify_get_event(event);
-	list_add_tail(&holder->event_list, list);
-	if (priv)
-		list_add_tail(&priv->event_list, &event->private_data_list);
-	spin_unlock(&event->lock);
+	list_add_tail(&event->list, list);
 	mutex_unlock(&group->notification_mutex);
 
 	wake_up(&group->notification_waitq);
 	kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);
-	return return_event;
+	return ret;
 }
 
 /*
- * Remove and return the first event from the notification list.  There is a
- * reference held on this event since it was on the list.  It is the responsibility
- * of the caller to drop this reference.
+ * Remove and return the first event from the notification list.  It is the
+ * responsibility of the caller to destroy the obtained event
  */
 struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
 {
 	struct fsnotify_event *event;
-	struct fsnotify_event_holder *holder;
 
 	BUG_ON(!mutex_is_locked(&group->notification_mutex));
 
 	pr_debug("%s: group=%p\n", __func__, group);
 
-	holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
-
-	event = holder->event;
-
-	spin_lock(&event->lock);
-	holder->event = NULL;
-	list_del_init(&holder->event_list);
-	spin_unlock(&event->lock);
-
-	/* event == holder means we are referenced through the in event holder */
-	if (holder != &event->holder)
-		fsnotify_destroy_event_holder(holder);
-
+	event = list_first_entry(&group->notification_list,
+				 struct fsnotify_event, list);
+	/*
+	 * We need to init list head for the case of overflow event so that
+	 * check in fsnotify_add_notify_events() works
+	 */
+	list_del_init(&event->list);
 	group->q_len--;
 
 	return event;
@@ -266,15 +153,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
  */
 struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
 {
-	struct fsnotify_event *event;
-	struct fsnotify_event_holder *holder;
-
 	BUG_ON(!mutex_is_locked(&group->notification_mutex));
 
-	holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
-	event = holder->event;
-
-	return event;
+	return list_first_entry(&group->notification_list,
+				struct fsnotify_event, list);
 }
 
 /*
@@ -284,181 +166,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
 void fsnotify_flush_notify(struct fsnotify_group *group)
 {
 	struct fsnotify_event *event;
-	struct fsnotify_event_private_data *priv;
 
 	mutex_lock(&group->notification_mutex);
 	while (!fsnotify_notify_queue_is_empty(group)) {
 		event = fsnotify_remove_notify_event(group);
-		/* if they don't implement free_event_priv they better not have attached any */
-		if (group->ops->free_event_priv) {
-			spin_lock(&event->lock);
-			priv = fsnotify_remove_priv_from_event(group, event);
-			spin_unlock(&event->lock);
-			if (priv)
-				group->ops->free_event_priv(priv);
-		}
-		fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
+		fsnotify_destroy_event(group, event);
 	}
 	mutex_unlock(&group->notification_mutex);
 }
 
-static void initialize_event(struct fsnotify_event *event)
-{
-	INIT_LIST_HEAD(&event->holder.event_list);
-	atomic_set(&event->refcnt, 1);
-
-	spin_lock_init(&event->lock);
-
-	INIT_LIST_HEAD(&event->private_data_list);
-}
-
-/*
- * Caller damn well better be holding whatever mutex is protecting the
- * old_holder->event_list and the new_event must be a clean event which
- * cannot be found anywhere else in the kernel.
- */
-int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
-			   struct fsnotify_event *new_event)
-{
-	struct fsnotify_event *old_event = old_holder->event;
-	struct fsnotify_event_holder *new_holder = &new_event->holder;
-
-	enum event_spinlock_class {
-		SPINLOCK_OLD,
-		SPINLOCK_NEW,
-	};
-
-	pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
-
-	/*
-	 * if the new_event's embedded holder is in use someone
-	 * screwed up and didn't give us a clean new event.
-	 */
-	BUG_ON(!list_empty(&new_holder->event_list));
-
-	spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
-	spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
-
-	new_holder->event = new_event;
-	list_replace_init(&old_holder->event_list, &new_holder->event_list);
-
-	spin_unlock(&new_event->lock);
-	spin_unlock(&old_event->lock);
-
-	/* event == holder means we are referenced through the in event holder */
-	if (old_holder != &old_event->holder)
-		fsnotify_destroy_event_holder(old_holder);
-
-	fsnotify_get_event(new_event); /* on the list take reference */
-	fsnotify_put_event(old_event); /* off the list, drop reference */
-
-	return 0;
-}
-
-struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
-{
-	struct fsnotify_event *event;
-
-	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
-	if (!event)
-		return NULL;
-
-	pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
-
-	memcpy(event, old_event, sizeof(*event));
-	initialize_event(event);
-
-	if (event->name_len) {
-		event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
-		if (!event->file_name) {
-			kmem_cache_free(fsnotify_event_cachep, event);
-			return NULL;
-		}
-	}
-	event->tgid = get_pid(old_event->tgid);
-	if (event->data_type == FSNOTIFY_EVENT_PATH)
-		path_get(&event->path);
-
-	return event;
-}
-
 /*
  * fsnotify_create_event - Allocate a new event which will be sent to each
  * group's handle_event function if the group was interested in this
  * particular event.
  *
- * @to_tell the inode which is supposed to receive the event (sometimes a
+ * @inode the inode which is supposed to receive the event (sometimes a
  *	parent of the inode to which the event happened.
  * @mask what actually happened.
  * @data pointer to the object which was actually affected
  * @data_type flag indication if the data is a file, path, inode, nothing...
  * @name the filename, if available
  */
-struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
-					     int data_type, const unsigned char *name,
-					     u32 cookie, gfp_t gfp)
+void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
+			 u32 mask)
 {
-	struct fsnotify_event *event;
-
-	event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
-	if (!event)
-		return NULL;
-
-	pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
-		 __func__, event, to_tell, mask, data, data_type);
-
-	initialize_event(event);
-
-	if (name) {
-		event->file_name = kstrdup(name, gfp);
-		if (!event->file_name) {
-			kmem_cache_free(fsnotify_event_cachep, event);
-			return NULL;
-		}
-		event->name_len = strlen(event->file_name);
-	}
-
-	event->tgid = get_pid(task_tgid(current));
-	event->sync_cookie = cookie;
-	event->to_tell = to_tell;
-	event->data_type = data_type;
-
-	switch (data_type) {
-	case FSNOTIFY_EVENT_PATH: {
-		struct path *path = data;
-		event->path.dentry = path->dentry;
-		event->path.mnt = path->mnt;
-		path_get(&event->path);
-		break;
-	}
-	case FSNOTIFY_EVENT_INODE:
-		event->inode = data;
-		break;
-	case FSNOTIFY_EVENT_NONE:
-		event->inode = NULL;
-		event->path.dentry = NULL;
-		event->path.mnt = NULL;
-		break;
-	default:
-		BUG();
-	}
-
+	INIT_LIST_HEAD(&event->list);
+	event->inode = inode;
 	event->mask = mask;
-
-	return event;
 }
-
-static __init int fsnotify_notification_init(void)
-{
-	fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
-	fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
-
-	q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
-						 FSNOTIFY_EVENT_NONE, NULL, 0,
-						 GFP_KERNEL);
-	if (!q_overflow_event)
-		panic("unable to allocate fsnotify q_overflow_event\n");
-
-	return 0;
-}
-subsys_initcall(fsnotify_notification_init);
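
[Annotation: the reworked fsnotify_add_notify_event() above replaces the old pointer-or-error return with a small integer convention: 0 means queued, 1 means merged with an already-queued event, 2 means the queue overflowed. Any nonzero result means the caller's own event was not consumed and must be destroyed, which is exactly what inotify_handle_event() does. A toy standalone model of that convention follows; the names and constants are illustrative, not the kernel's.]

#include <stdio.h>

enum { Q_ADDED = 0, Q_MERGED = 1, Q_OVERFLOW = 2 };

static int add_event(int q_len, int max_events, int dup_of_tail)
{
	if (q_len >= max_events)
		return Q_OVERFLOW;	/* queue the shared overflow event */
	if (dup_of_tail)
		return Q_MERGED;	/* caller must free its own event */
	return Q_ADDED;
}

int main(void)
{
	printf("%d\n", add_event(3, 16, 0));	/* 0: queued */
	printf("%d\n", add_event(3, 16, 1));	/* 1: merged */
	printf("%d\n", add_event(16, 16, 0));	/* 2: overflow */
	return 0;
}
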
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ea4ba9daeb47..db9bd8a31725 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2134,7 +2134,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 	if (ret > 0) {
-		int err = generic_write_sync(file, pos, ret);
+		int err = generic_write_sync(file, iocb->ki_pos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
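
[Annotation: the one-line ntfs fix above changes the start offset passed to generic_write_sync(). After a successful write the file position has already advanced by the byte count, so the range that needs syncing starts at the updated position minus the bytes written; the stale pre-write `pos` can point at the wrong range (for example with O_APPEND). The arithmetic, as a trivial standalone sketch with hypothetical values:]

#include <stdio.h>

int main(void)
{
	long long pos_before = 100;	/* hypothetical starting offset */
	long long bytes_written = 40;
	long long pos_after = pos_before + bytes_written;

	/* the range to sync is [start, start + bytes_written) */
	long long start = pos_after - bytes_written;	/* 100 */

	printf("sync from %lld for %lld bytes\n", start, bytes_written);
	return 0;
}
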
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f17e58b32989..ce210d4951a1 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -38,7 +38,6 @@ ocfs2-objs := \
 	symlink.o \
 	sysfile.o \
 	uptodate.o \
-	ver.o \
 	quota_local.o \
 	quota_global.o \
 	xattr.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index b4f788e0ca31..555f4cddefe3 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -160,36 +160,6 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
 	return acl;
 }
 
-
-/*
- * Get posix acl.
- */
-static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct buffer_head *di_bh = NULL;
-	struct posix_acl *acl;
-	int ret;
-
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-		return NULL;
-
-	ret = ocfs2_inode_lock(inode, &di_bh, 0);
-	if (ret < 0) {
-		mlog_errno(ret);
-		acl = ERR_PTR(ret);
-		return acl;
-	}
-
-	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
-
-	ocfs2_inode_unlock(inode, 0);
-
-	brelse(di_bh);
-
-	return acl;
-}
-
 /*
  * Helper function to set i_mode in memory and disk. Some call paths
  * will not have di_bh or a journal handle to pass, in which case it
@@ -250,7 +220,7 @@ out:
 /*
  * Set the access or default ACL of an inode.
  */
-static int ocfs2_set_acl(handle_t *handle,
+int ocfs2_set_acl(handle_t *handle,
 			 struct inode *inode,
 			 struct buffer_head *di_bh,
 			 int type,
@@ -313,6 +283,11 @@ static int ocfs2_set_acl(handle_t *handle,
 	return ret;
 }
 
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+}
+
 struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
 {
 	struct ocfs2_super *osb;
@@ -334,200 +309,3 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
 
 	return acl;
 }
-
-int ocfs2_acl_chmod(struct inode *inode)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct posix_acl *acl;
-	int ret;
-
-	if (S_ISLNK(inode->i_mode))
-		return -EOPNOTSUPP;
-
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-		return 0;
-
-	acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
-	if (IS_ERR(acl) || !acl)
-		return PTR_ERR(acl);
-	ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
-	if (ret)
-		return ret;
-	ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
-			    acl, NULL, NULL);
-	posix_acl_release(acl);
-	return ret;
-}
-
-/*
- * Initialize the ACLs of a new inode. If parent directory has default ACL,
- * then clone to new inode. Called from ocfs2_mknod.
- */
-int ocfs2_init_acl(handle_t *handle,
-		   struct inode *inode,
-		   struct inode *dir,
-		   struct buffer_head *di_bh,
-		   struct buffer_head *dir_bh,
-		   struct ocfs2_alloc_context *meta_ac,
-		   struct ocfs2_alloc_context *data_ac)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct posix_acl *acl = NULL;
-	int ret = 0, ret2;
-	umode_t mode;
-
-	if (!S_ISLNK(inode->i_mode)) {
-		if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
-			acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
-						   dir_bh);
-			if (IS_ERR(acl))
-				return PTR_ERR(acl);
-		}
-		if (!acl) {
-			mode = inode->i_mode & ~current_umask();
-			ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
-			if (ret) {
-				mlog_errno(ret);
-				goto cleanup;
-			}
-		}
-	}
-	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
-		if (S_ISDIR(inode->i_mode)) {
-			ret = ocfs2_set_acl(handle, inode, di_bh,
-					    ACL_TYPE_DEFAULT, acl,
-					    meta_ac, data_ac);
-			if (ret)
-				goto cleanup;
-		}
-		mode = inode->i_mode;
-		ret = posix_acl_create(&acl, GFP_NOFS, &mode);
-		if (ret < 0)
-			return ret;
-
-		ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
-		if (ret2) {
-			mlog_errno(ret2);
-			ret = ret2;
-			goto cleanup;
-		}
-		if (ret > 0) {
-			ret = ocfs2_set_acl(handle, inode,
-					    di_bh, ACL_TYPE_ACCESS,
-					    acl, meta_ac, data_ac);
-		}
-	}
-cleanup:
-	posix_acl_release(acl);
-	return ret;
-}
-
-static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry,
-					  char *list,
-					  size_t list_len,
-					  const char *name,
-					  size_t name_len,
-					  int type)
-{
-	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
-	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-		return 0;
-
-	if (list && size <= list_len)
-		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
-	return size;
-}
-
-static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry,
-					   char *list,
-					   size_t list_len,
-					   const char *name,
-					   size_t name_len,
-					   int type)
-{
-	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
-	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-		return 0;
-
-	if (list && size <= list_len)
-		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
-	return size;
-}
-
-static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name,
-			       void *buffer, size_t size, int type)
-{
-	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
-	struct posix_acl *acl;
-	int ret;
-
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-		return -EOPNOTSUPP;
-
-	acl = ocfs2_get_acl(dentry->d_inode, type);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (acl == NULL)
-		return -ENODATA;
-	ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
-	posix_acl_release(acl);
-
-	return ret;
-}
-
-static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
-			       const void *value, size_t size, int flags, int type)
-{
-	struct inode *inode = dentry->d_inode;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct posix_acl *acl;
-	int ret = 0;
-
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-		return -EOPNOTSUPP;
-
-	if (!inode_owner_or_capable(inode))
-		return -EPERM;
-
-	if (value) {
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-		else if (acl) {
-			ret = posix_acl_valid(acl);
-			if (ret)
-				goto cleanup;
-		}
-	} else
-		acl = NULL;
-
-	ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
-
-cleanup:
-	posix_acl_release(acl);
-	return ret;
-}
-
-const struct xattr_handler ocfs2_xattr_acl_access_handler = {
-	.prefix	= POSIX_ACL_XATTR_ACCESS,
-	.flags	= ACL_TYPE_ACCESS,
-	.list	= ocfs2_xattr_list_acl_access,
-	.get	= ocfs2_xattr_get_acl,
-	.set	= ocfs2_xattr_set_acl,
-};
-
-const struct xattr_handler ocfs2_xattr_acl_default_handler = {
-	.prefix	= POSIX_ACL_XATTR_DEFAULT,
-	.flags	= ACL_TYPE_DEFAULT,
-	.list	= ocfs2_xattr_list_acl_default,
-	.get	= ocfs2_xattr_get_acl,
-	.set	= ocfs2_xattr_set_acl,
-};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 071fbd380f2f..3fce68d08625 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -27,10 +27,13 @@ struct ocfs2_acl_entry {
 };
 
 struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
-extern int ocfs2_acl_chmod(struct inode *);
-extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
-			  struct buffer_head *, struct buffer_head *,
-			  struct ocfs2_alloc_context *,
-			  struct ocfs2_alloc_context *);
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ocfs2_set_acl(handle_t *handle,
+		  struct inode *inode,
+		  struct buffer_head *di_bh,
+		  int type,
+		  struct posix_acl *acl,
+		  struct ocfs2_alloc_context *meta_ac,
+		  struct ocfs2_alloc_context *data_ac);
 
 #endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index dc7411fe185d..e2edff38be52 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4742,6 +4742,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 			   enum ocfs2_alloc_restarted *reason_ret)
 {
 	int status = 0, err = 0;
+	int need_free = 0;
 	int free_extents;
 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
 	u32 bit_off, num_bits;
@@ -4796,7 +4797,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
-			goto leave;
+			need_free = 1;
+			goto bail;
 		}
 
 		block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
@@ -4807,7 +4809,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 					  num_bits, flags, meta_ac);
 		if (status < 0) {
 			mlog_errno(status);
-			goto leave;
+			need_free = 1;
+			goto bail;
 		}
 
 		ocfs2_journal_dirty(handle, et->et_root_bh);
@@ -4821,6 +4824,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 			reason = RESTART_TRANS;
 	}
 
+bail:
+	if (need_free) {
+		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+					bit_off, num_bits);
+		else
+			ocfs2_free_clusters(handle,
+					data_ac->ac_inode,
+					data_ac->ac_bh,
+					ocfs2_clusters_to_blocks(osb->sb, bit_off),
+					num_bits);
+	}
+
 leave:
 	if (reason_ret)
 		*reason_ret = reason;
@@ -6805,6 +6821,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 					 struct buffer_head *di_bh)
 {
 	int ret, i, has_data, num_pages = 0;
+	int need_free = 0;
+	u32 bit_off, num;
 	handle_t *handle;
 	u64 uninitialized_var(block);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
@@ -6850,7 +6868,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	}
 
 	if (has_data) {
-		u32 bit_off, num;
 		unsigned int page_end;
 		u64 phys;
 
@@ -6886,6 +6903,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
 		if (ret) {
 			mlog_errno(ret);
+			need_free = 1;
 			goto out_commit;
 		}
 
@@ -6896,6 +6914,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
 		if (ret) {
 			mlog_errno(ret);
+			need_free = 1;
 			goto out_commit;
 		}
 
@@ -6927,6 +6946,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
 		if (ret) {
 			mlog_errno(ret);
+			need_free = 1;
 			goto out_commit;
 		}
 
@@ -6938,6 +6958,18 @@ out_commit:
 		dquot_free_space_nodirty(inode,
 					 ocfs2_clusters_to_bytes(osb->sb, 1));
 
+	if (need_free) {
+		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+					bit_off, num);
+		else
+			ocfs2_free_clusters(handle,
+					data_ac->ac_inode,
+					data_ac->ac_bh,
+					ocfs2_clusters_to_blocks(osb->sb, bit_off),
+					num);
+	}
+
 	ocfs2_commit_trans(osb, handle);
 
 out_unlock:
@@ -7126,7 +7158,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
 	if (end > i_size_read(inode))
 		end = i_size_read(inode);
 
-	BUG_ON(start >= end);
+	BUG_ON(start > end);
 
 	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
 	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
@@ -7260,14 +7292,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	start = range->start >> osb->s_clustersize_bits;
 	len = range->len >> osb->s_clustersize_bits;
 	minlen = range->minlen >> osb->s_clustersize_bits;
-	trimmed = 0;
 
-	if (!len) {
-		range->len = 0;
-		return 0;
-	}
-
-	if (minlen >= osb->bitmap_cpg)
+	if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
 		return -EINVAL;
 
 	main_bm_inode = ocfs2_get_system_file_inode(osb,
@@ -7293,6 +7319,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 		goto out_unlock;
 	}
 
+	len = range->len >> osb->s_clustersize_bits;
 	if (start + len > le32_to_cpu(main_bm->i_clusters))
 		len = le32_to_cpu(main_bm->i_clusters) - start;
 
@@ -7307,6 +7334,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
 	last_bit = osb->bitmap_cpg;
 
+	trimmed = 0;
 	for (group = first_group; group <= last_group;) {
 		if (first_bit + len >= osb->bitmap_cpg)
 			last_bit = osb->bitmap_cpg;
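A note on the ocfs2_trim_fs() hunks above: fstrim_range fields arrive in bytes while ocfs2 trims whole clusters, so start/len/minlen are converted with s_clustersize_bits, len is re-derived after the cluster lock is taken, and sub-block requests now fail with -EINVAL instead of silently succeeding. Worked conversion, assuming a hypothetical 1 MiB cluster size (s_clustersize_bits == 20):

	/* Illustrative values only; cluster size is per-filesystem. */
	u64 start_bytes = 3ULL << 20;		/* 3 MiB into the volume */
	u64 len_bytes = 8ULL << 20;		/* trim 8 MiB */
	u64 minlen_bytes = 4096;		/* user-requested minimum */

	u64 start = start_bytes >> 20;		/* = cluster 3 */
	u64 len = len_bytes >> 20;		/* = 8 clusters */
	u64 minlen = minlen_bytes >> 20;	/* = 0: every free extent qualifies */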
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index bc8c5e7d8608..1aefc0350ec3 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
 
 ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
-	quorum.o tcp.o netdebug.o ver.o
+	quorum.o tcp.o netdebug.o
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 73920ffda05b..bf482dfed14f 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -413,7 +413,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
 	}
 
 	/* Must put everything in 512 byte sectors for the bio... */
-	bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
+	bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
 	bio->bi_bdev = reg->hr_bdev;
 	bio->bi_private = wc;
 	bio->bi_end_io = o2hb_bio_end_io;
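The shift in o2hb_setup_one_bio() converts a heartbeat block number into the 512-byte sector units a bio expects: a block of 2^bits bytes covers 2^(bits-9) sectors. Quick arithmetic with an assumed 4 KiB heartbeat block size (bits == 12):

	/* Illustrative only; bits comes from the region's slot size. */
	u64 block = 10;				/* hypothetical block number */
	sector_t sect = block << (12 - 9);	/* = 80: each 4 KiB block is 8 sectors */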
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index bb240647ca5f..441c84e169e6 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -29,7 +29,6 @@
 #include "heartbeat.h"
 #include "masklog.h"
 #include "sys.h"
-#include "ver.h"
 
 /* for now we operate under the assertion that there can be only one
  * cluster active at a time.  Changing this will require trickling
@@ -945,8 +944,6 @@ static int __init init_o2nm(void)
 {
 	int ret = -1;
 
-	cluster_print_version();
-
 	ret = o2hb_init();
 	if (ret)
 		goto out;
@@ -984,6 +981,7 @@ out:
 
 MODULE_AUTHOR("Oracle");
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster management");
 
 module_init(init_o2nm)
 module_exit(exit_o2nm)
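The pattern above (repeated in the dlm and dlmfs hunks below) swaps a printk() boot banner for static module metadata, which userspace can read with modinfo without loading anything. A minimal stand-alone sketch of the same idiom (hypothetical module, not from the patch):

	#include <linux/module.h>

	static int __init demo_init(void) { return 0; }
	static void __exit demo_exit(void) { }

	module_init(demo_init);
	module_exit(demo_exit);

	MODULE_AUTHOR("Example");
	MODULE_LICENSE("GPL");
	MODULE_DESCRIPTION("Demo: metadata reported by 'modinfo demo'");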
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
deleted file mode 100644
index a56eee6abad3..000000000000
--- a/fs/ocfs2/cluster/ver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define CLUSTER_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
-
-void cluster_print_version(void)
-{
-	printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
deleted file mode 100644
index 32554c3382c2..000000000000
--- a/fs/ocfs2/cluster/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef O2CLUSTER_VER_H
-#define O2CLUSTER_VER_H
-
-void cluster_print_version(void);
-
-#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index c8a044efbb15..bd1aab1f49a4 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2
 obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
 
 ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
-	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
+	dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
 
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8b3382abf840..33660a4a52fa 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -43,8 +43,6 @@
 #include "dlmdomain.h"
 #include "dlmdebug.h"
 
-#include "dlmver.h"
-
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
 #include "cluster/masklog.h"
 
@@ -2328,8 +2326,6 @@ static int __init dlm_init(void)
 {
 	int status;
 
-	dlm_print_version();
-
 	status = dlm_init_mle_cache();
 	if (status) {
 		mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
@@ -2379,6 +2375,7 @@ static void __exit dlm_exit (void)
 
 MODULE_AUTHOR("Oracle");
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");
 
 module_init(dlm_init);
 module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
deleted file mode 100644
index dfc0da4d158d..000000000000
--- a/fs/ocfs2/dlm/dlmver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
-
-void dlm_print_version(void)
-{
-	printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
deleted file mode 100644
index f674aee77a16..000000000000
--- a/fs/ocfs2/dlm/dlmver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLM_VER_H
-#define DLM_VER_H
-
-void dlm_print_version(void);
-
-#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index f14be89a6701..eed3db8c5b49 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2
 
 obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
 
-ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index efa2b3d339e3..09b7d9dac71d 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -49,7 +49,6 @@
 
 #include "stackglue.h"
 #include "userdlm.h"
-#include "dlmfsver.h"
 
 #define MLOG_MASK_PREFIX ML_DLMFS
 #include "cluster/masklog.h"
@@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void)
 	int status;
 	int cleanup_inode = 0, cleanup_worker = 0;
 
-	dlmfs_print_version();
-
 	status = bdi_init(&dlmfs_backing_dev_info);
 	if (status)
 		return status;
@@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void)
 
 MODULE_AUTHOR("Oracle");
 MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
 
 module_init(init_dlmfs_fs)
 module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
deleted file mode 100644
index a733b3321f83..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmfsver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
-
-void dlmfs_print_version(void)
-{
-	printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
deleted file mode 100644
index f35eadbed25c..000000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLMFS_VER_H
-#define DLMFS_VER_H
-
-void dlmfs_print_version(void);
-
-#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3407b2c62b21..19986959d149 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2996,6 +2996,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 
 	/* for now, uuid == domain */
 	status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+				       osb->osb_cluster_name,
+				       strlen(osb->osb_cluster_name),
 				       osb->uuid_str,
 				       strlen(osb->uuid_str),
 				       &lproto, ocfs2_do_node_down, osb,
@@ -3005,7 +3007,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	status = ocfs2_cluster_this_node(&osb->node_num);
+	status = ocfs2_cluster_this_node(conn, &osb->node_num);
 	if (status < 0) {
 		mlog_errno(status);
 		mlog(ML_ERROR,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6fff128cad16..51632c40e896 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -185,6 +185,9 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
 			      file->f_path.dentry->d_name.name,
 			      (unsigned long long)datasync);
 
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
 	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (err)
 		return err;
@@ -474,11 +477,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 		goto bail;
 	}
 
-	/* lets handle the simple truncate cases before doing any more
-	 * cluster locking. */
-	if (new_i_size == le64_to_cpu(fe->i_size))
-		goto bail;
-
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
 	ocfs2_resv_discard(&osb->osb_la_resmap,
@@ -718,7 +716,8 @@ leave:
  * While a write will already be ordering the data, a truncate will not.
  * Thus, we need to explicitly order the zeroed pages.
  */
-static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
+						      struct buffer_head *di_bh)
 {
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	handle_t *handle = NULL;
@@ -735,7 +734,14 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
 	}
 
 	ret = ocfs2_jbd2_file_inode(handle, inode);
-	if (ret < 0)
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret)
 		mlog_errno(ret);
 
 out:
@@ -751,7 +757,7 @@ out:
  * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->write_begin() and ->write_end(). */
 static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
-				 u64 abs_to)
+				 u64 abs_to, struct buffer_head *di_bh)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
@@ -759,6 +765,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	handle_t *handle = NULL;
 	int ret = 0;
 	unsigned zero_from, zero_to, block_start, block_end;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
 	BUG_ON(abs_from >= abs_to);
 	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
@@ -801,7 +808,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	}
 
 	if (!handle) {
-		handle = ocfs2_zero_start_ordered_transaction(inode);
+		handle = ocfs2_zero_start_ordered_transaction(inode,
+							      di_bh);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			handle = NULL;
@@ -818,8 +826,22 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 		ret = 0;
 	}
 
-	if (handle)
+	if (handle) {
+		/*
+		 * fs-writeback will release the dirty pages without page lock
+		 * whose offset are over inode size, the release happens at
+		 * block_write_full_page_endio().
+		 */
+		i_size_write(inode, abs_to);
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
+		di->i_size = cpu_to_le64((u64)i_size_read(inode));
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+		di->i_mtime_nsec = di->i_ctime_nsec;
+		ocfs2_journal_dirty(handle, di_bh);
 		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+	}
 
 out_unlock:
 	unlock_page(page);
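The hunk above makes ocfs2_write_zero_page() push i_size forward inside the same journal transaction that zeroes the page; otherwise fs-writeback may drop dirty pages sitting beyond the old on-disk size before they reach disk. The update follows the usual jbd2 discipline; a condensed sketch using the names from the hunk (error handling elided, illustrative only):

	/* Declare intent to modify the dinode buffer under this handle. */
	ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				OCFS2_JOURNAL_ACCESS_WRITE);
	i_size_write(inode, abs_to);			/* in-core size */
	di->i_size = cpu_to_le64(i_size_read(inode));	/* on-disk dinode */
	ocfs2_journal_dirty(handle, di_bh);		/* log the buffer */
	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);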
@@ -915,7 +937,7 @@ out:
  * has made sure that the entire range needs zeroing.
  */
 static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
-				   u64 range_end)
+				   u64 range_end, struct buffer_head *di_bh)
 {
 	int rc = 0;
 	u64 next_pos;
@@ -931,7 +953,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
 		next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
 		if (next_pos > range_end)
 			next_pos = range_end;
-		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
 		if (rc < 0) {
 			mlog_errno(rc);
 			break;
@@ -977,7 +999,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 			range_end = zero_to_size;
 
 		ret = ocfs2_zero_extend_range(inode, range_start,
-					      range_end);
+					      range_end, di_bh);
 		if (ret) {
 			mlog_errno(ret);
 			break;
@@ -1145,14 +1167,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		goto bail_unlock_rw;
 	}
 
-	if (size_change && attr->ia_size != i_size_read(inode)) {
+	if (size_change) {
 		status = inode_newsize_ok(inode, attr->ia_size);
 		if (status)
 			goto bail_unlock;
 
 		inode_dio_wait(inode);
 
-		if (i_size_read(inode) > attr->ia_size) {
+		if (i_size_read(inode) >= attr->ia_size) {
 			if (ocfs2_should_order_data(inode)) {
 				status = ocfs2_begin_ordered_truncate(inode,
 								      attr->ia_size);
@@ -1236,7 +1258,7 @@ bail:
 		dqput(transfer_to[qtype]);
 
 	if (!status && attr->ia_valid & ATTR_MODE) {
-		status = ocfs2_acl_chmod(inode);
+		status = posix_acl_chmod(inode, inode->i_mode);
 		if (status < 0)
 			mlog_errno(status);
 	}
@@ -1869,7 +1891,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 	}
 	size = sr->l_start + sr->l_len;
 
-	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
+	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
+	    cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
 		if (sr->l_len <= 0) {
 			ret = -EINVAL;
 			goto out_inode_unlock;
@@ -2370,8 +2393,8 @@ out_dio:
 
 	if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
 	    ((file->f_flags & O_DIRECT) && !direct_io)) {
-		ret = filemap_fdatawrite_range(file->f_mapping, pos,
-					       pos + count - 1);
+		ret = filemap_fdatawrite_range(file->f_mapping, *ppos,
+					       *ppos + count - 1);
 		if (ret < 0)
 			written = ret;
 
@@ -2384,8 +2407,8 @@
 		}
 
 		if (!ret)
-			ret = filemap_fdatawait_range(file->f_mapping, pos,
-						      pos + count - 1);
+			ret = filemap_fdatawait_range(file->f_mapping, *ppos,
+						      *ppos + count - 1);
 	}
 
 	/*
@@ -2661,6 +2684,7 @@ const struct inode_operations ocfs2_file_iops = {
 	.removexattr	= generic_removexattr,
 	.fiemap		= ocfs2_fiemap,
 	.get_acl	= ocfs2_iop_get_acl,
+	.set_acl	= ocfs2_iop_set_acl,
 };
 
 const struct inode_operations ocfs2_special_file_iops = {
@@ -2668,6 +2692,7 @@ const struct inode_operations ocfs2_special_file_iops = {
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
 	.get_acl	= ocfs2_iop_get_acl,
+	.set_acl	= ocfs2_iop_set_acl,
 };
 
 /*
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index fa32ce9b455d..8ca3c29accbf 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/blkdev.h>
 #include <linux/compat.h>
 
 #include <cluster/masklog.h>
@@ -966,15 +967,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	case FITRIM:
 	{
 		struct super_block *sb = inode->i_sb;
+		struct request_queue *q = bdev_get_queue(sb->s_bdev);
 		struct fstrim_range range;
 		int ret = 0;
 
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
+		if (!blk_queue_discard(q))
+			return -EOPNOTSUPP;
+
		if (copy_from_user(&range, argp, sizeof(range)))
 			return -EFAULT;
 
+		range.minlen = max_t(u64, q->limits.discard_granularity,
+				     range.minlen);
 		ret = ocfs2_trim_fs(sb, &range);
 		if (ret < 0)
 			return ret;
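Clamping range.minlen to the queue's discard granularity keeps ocfs2_trim_fs() from hunting for extents the device could never discard. Worked example, assuming a device granularity of 1 MiB and a user-supplied minlen of 64 KiB (both values hypothetical):

	u64 granularity = 1ULL << 20;	/* q->limits.discard_granularity */
	u64 user_minlen = 64ULL << 10;	/* from the fstrim_range */
	u64 minlen = max_t(u64, granularity, user_minlen);	/* = 1 MiB */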
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index cd5496b7a0a3..044013455621 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -781,6 +781,48 @@ bail:
 	return status;
 }
 
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+				handle_t *handle,
+				struct ocfs2_alloc_context *ac,
+				u32 bit_off,
+				u32 num_bits)
+{
+	int status, start;
+	u32 clear_bits;
+	struct inode *local_alloc_inode;
+	void *bitmap;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_local_alloc *la;
+
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+	local_alloc_inode = ac->ac_inode;
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	bitmap = la->la_bitmap;
+	start = bit_off - le32_to_cpu(la->la_bm_off);
+	clear_bits = num_bits;
+
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(local_alloc_inode),
+					 osb->local_alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while (clear_bits--)
+		ocfs2_clear_bit(start++, bitmap);
+
+	le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits);
+	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
+bail:
+	return status;
+}
+
 static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
 {
 	u32 count;
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 1be9b5864460..44a7d1fb2dec 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -55,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
55 u32 *bit_off, 55 u32 *bit_off,
56 u32 *num_bits); 56 u32 *num_bits);
57 57
58int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
59 handle_t *handle,
60 struct ocfs2_alloc_context *ac,
61 u32 bit_off,
62 u32 num_bits);
63
58void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, 64void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
59 unsigned int num_clusters); 65 unsigned int num_clusters);
60void ocfs2_la_enable_worker(struct work_struct *work); 66void ocfs2_la_enable_worker(struct work_struct *work);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 631a98213474..64c304d668f0 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -561,83 +561,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
 	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
 }
 
-static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
-				      handle_t *handle,
-				      struct buffer_head *di_bh,
-				      u32 num_bits,
-				      u16 chain)
-{
-	int ret;
-	u32 tmp_used;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
-	struct ocfs2_chain_list *cl =
-				(struct ocfs2_chain_list *) &di->id2.i_chain;
-
-	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
-	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
-	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
-	ocfs2_journal_dirty(handle, di_bh);
-
-out:
-	return ret;
-}
-
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
-					     struct inode *alloc_inode,
-					     struct ocfs2_group_desc *bg,
-					     struct buffer_head *group_bh,
-					     unsigned int bit_off,
-					     unsigned int num_bits)
-{
-	int status;
-	void *bitmap = bg->bg_bitmap;
-	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
-
-	/* All callers get the descriptor via
-	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
-	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
-	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
-
-	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
-	     num_bits);
-
-	if (ocfs2_is_cluster_bitmap(alloc_inode))
-		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-
-	status = ocfs2_journal_access_gd(handle,
-					 INODE_CACHE(alloc_inode),
-					 group_bh,
-					 journal_type);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
-	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
-		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
-			    " count %u but claims %u are freed. num_bits %d",
-			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
-			    le16_to_cpu(bg->bg_bits),
-			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
-		return -EROFS;
-	}
-	while (num_bits--)
-		ocfs2_set_bit(bit_off++, bitmap);
-
-	ocfs2_journal_dirty(handle, group_bh);
-
-bail:
-	return status;
-}
-
 static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
 			     u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
 			     u32 len, int ext_flags)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 4f791f6d27d0..3683643f3f0e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -230,6 +230,7 @@ static int ocfs2_mknod(struct inode *dir,
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 	sigset_t oldset;
 	int did_block_signals = 0;
+	struct posix_acl *default_acl = NULL, *acl = NULL;
 
 	trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
 			  (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -331,6 +332,12 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
+	status = posix_acl_create(dir, &mode, &default_acl, &acl);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
 							    S_ISDIR(mode),
 							    xattr_credits));
@@ -379,8 +386,17 @@ static int ocfs2_mknod(struct inode *dir,
 		inc_nlink(dir);
 	}
 
-	status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
-				meta_ac, data_ac);
+	if (default_acl) {
+		status = ocfs2_set_acl(handle, inode, new_fe_bh,
+				       ACL_TYPE_DEFAULT, default_acl,
+				       meta_ac, data_ac);
+	}
+	if (!status && acl) {
+		status = ocfs2_set_acl(handle, inode, new_fe_bh,
+				       ACL_TYPE_ACCESS, acl,
+				       meta_ac, data_ac);
+	}
+
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -419,6 +435,10 @@ static int ocfs2_mknod(struct inode *dir,
 	d_instantiate(dentry, inode);
 	status = 0;
 leave:
+	if (default_acl)
+		posix_acl_release(default_acl);
+	if (acl)
+		posix_acl_release(acl);
 	if (status < 0 && did_quota_inode)
 		dquot_free_inode(inode);
 	if (handle)
@@ -644,6 +664,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 	sigset_t oldset;
+	u64 old_de_ino;
 
 	trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
 			 old_dentry->d_name.len, old_dentry->d_name.name,
@@ -666,6 +687,22 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto out;
 	}
 
+	err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+			old_dentry->d_name.len, &old_de_ino);
+	if (err) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	/*
+	 * Check whether another node removed the source inode while we
+	 * were in the vfs.
+	 */
+	if (old_de_ino != OCFS2_I(inode)->ip_blkno) {
+		err = -ENOENT;
+		goto out;
+	}
+
 	err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
 					dentry->d_name.len);
 	if (err)
@@ -948,7 +985,7 @@ leave:
 	ocfs2_free_dir_lookup_result(&orphan_insert);
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	if (status && (status != -ENOTEMPTY))
+	if (status && (status != -ENOTEMPTY) && (status != -ENOENT))
 		mlog_errno(status);
 
 	return status;
@@ -2504,4 +2541,5 @@ const struct inode_operations ocfs2_dir_iops = {
 	.removexattr	= generic_removexattr,
 	.fiemap		= ocfs2_fiemap,
 	.get_acl	= ocfs2_iop_get_acl,
+	.set_acl	= ocfs2_iop_set_acl,
 };
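The new ocfs2_link() check closes a cluster race: between the VFS resolving old_dentry and ocfs2 taking its cluster locks, another node may unlink the source name or reuse it for a different inode, so the name is re-resolved under the lock and its inode number compared against the dentry's inode. A generic sketch of that revalidate-under-lock shape (lookup_ino() and cached_ino are hypothetical stand-ins, not ocfs2 symbols):

	u64 ino_now;

	if (lookup_ino(dir, name, namelen, &ino_now))	/* name vanished */
		return -ENOENT;
	if (ino_now != cached_ino)			/* name now points elsewhere */
		return -ENOENT;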
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3a903470c794..553f53cc73ae 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -387,6 +387,7 @@ struct ocfs2_super
 	u8 osb_stackflags;
 
 	char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+	char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
 	struct ocfs2_cluster_connection *cconn;
 	struct ocfs2_lock_res osb_super_lockres;
 	struct ocfs2_lock_res osb_rename_lockres;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index aaa50611ec66..d7b5108789e2 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -717,6 +717,12 @@ static int ocfs2_release_dquot(struct dquot *dquot)
 	 */
 	if (status < 0)
 		mlog_errno(status);
+	/*
+	 * Clear dq_off so that we search for the structure in quota file next
+	 * time we acquire it. The structure might be deleted and reallocated
+	 * elsewhere by another node while our dquot structure is on freelist.
+	 */
+	dquot->dq_off = 0;
 	clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
 out_trans:
 	ocfs2_commit_trans(osb, handle);
@@ -756,16 +762,17 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
 	status = ocfs2_lock_global_qf(info, 1);
 	if (status < 0)
 		goto out;
-	if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
-		status = ocfs2_qinfo_lock(info, 0);
-		if (status < 0)
-			goto out_dq;
-		status = qtree_read_dquot(&info->dqi_gi, dquot);
-		ocfs2_qinfo_unlock(info, 0);
-		if (status < 0)
-			goto out_dq;
-	}
-	set_bit(DQ_READ_B, &dquot->dq_flags);
+	status = ocfs2_qinfo_lock(info, 0);
+	if (status < 0)
+		goto out_dq;
+	/*
+	 * We always want to read dquot structure from disk because we don't
+	 * know what happened with it while it was on freelist.
+	 */
+	status = qtree_read_dquot(&info->dqi_gi, dquot);
+	ocfs2_qinfo_unlock(info, 0);
+	if (status < 0)
+		goto out_dq;
 
 	OCFS2_DQUOT(dquot)->dq_use_count++;
 	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 2e4344be3b96..2001862bf2b1 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -1303,10 +1303,6 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
 	ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
 
 out:
-	/* Clear the read bit so that next time someone uses this
-	 * dquot he reads fresh info from disk and allocates local
-	 * dquot structure */
-	clear_bit(DQ_READ_B, &dquot->dq_flags);
 	return status;
 }
 
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 55767e1ba724..6ba4bcbc4796 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -46,6 +46,7 @@
 #include <linux/quotaops.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/posix_acl.h>
 
 struct ocfs2_cow_context {
 	struct inode *inode;
@@ -4268,11 +4269,20 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 	struct inode *inode = old_dentry->d_inode;
 	struct buffer_head *old_bh = NULL;
 	struct inode *new_orphan_inode = NULL;
+	struct posix_acl *default_acl, *acl;
+	umode_t mode;
 
 	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
 		return -EOPNOTSUPP;
 
-	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+	mode = inode->i_mode;
+	error = posix_acl_create(dir, &mode, &default_acl, &acl);
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	error = ocfs2_create_inode_in_orphan(dir, mode,
 					     &new_orphan_inode);
 	if (error) {
 		mlog_errno(error);
@@ -4303,11 +4313,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 	/* If the security isn't preserved, we need to re-initialize them. */
 	if (!preserve) {
 		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
-						    &new_dentry->d_name);
+						    &new_dentry->d_name,
+						    default_acl, acl);
 		if (error)
 			mlog_errno(error);
 	}
 out:
+	if (default_acl)
+		posix_acl_release(default_acl);
+	if (acl)
+		posix_acl_release(acl);
 	if (!error) {
 		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
 						       new_dentry);
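ocfs2_reflink() now uses the directory-based posix_acl_create() (the four-argument form that comes with the generic ACL infrastructure), which hands back both the default and access ACLs the new inode should inherit and masks the requested mode in one step. A hedged sketch of the calling convention:

	struct posix_acl *default_acl, *acl;
	umode_t mode = S_IFREG | 0666;		/* illustrative requested mode */
	int err = posix_acl_create(dir, &mode, &default_acl, &acl);

	if (err)
		return err;			/* no references held on failure */
	/* ... create the inode with the masked mode, store both ACLs ... */
	posix_acl_release(default_acl);		/* either pointer may be NULL */
	posix_acl_release(acl);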
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index bf1f8930456f..1724d43d3da1 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
 	return 0;
 }
 
-static int o2cb_cluster_this_node(unsigned int *node)
+static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
+				  unsigned int *node)
 {
 	int node_num;
 
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 286edf1e231f..13a8537d8e8b 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -23,6 +23,7 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/reboot.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 
 #include "stackglue.h"
@@ -102,6 +103,12 @@
 #define OCFS2_TEXT_UUID_LEN			32
 #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN	2
 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN	8
+#define VERSION_LOCK				"version_lock"
+
+enum ocfs2_connection_type {
+	WITH_CONTROLD,
+	NO_CONTROLD
+};
 
 /*
  * ocfs2_live_connection is refcounted because the filesystem and
@@ -110,6 +117,13 @@
 struct ocfs2_live_connection {
 	struct list_head		oc_list;
 	struct ocfs2_cluster_connection	*oc_conn;
+	enum ocfs2_connection_type	oc_type;
+	atomic_t			oc_this_node;
+	int				oc_our_slot;
+	struct dlm_lksb			oc_version_lksb;
+	char				oc_lvb[DLM_LVB_LEN];
+	struct completion		oc_sync_wait;
+	wait_queue_head_t		oc_wait;
 };
 
 struct ocfs2_control_private {
@@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
  * mount path.  Since the VFS prevents multiple calls to
 * fill_super(), we can't get dupes here.
  */
-static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
-				     struct ocfs2_live_connection **c_ret)
+static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
+				     struct ocfs2_live_connection *c)
 {
 	int rc = 0;
-	struct ocfs2_live_connection *c;
-
-	c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
-	if (!c)
-		return -ENOMEM;
 
 	mutex_lock(&ocfs2_control_lock);
 	c->oc_conn = conn;
 
-	if (atomic_read(&ocfs2_control_opened))
+	if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
 		list_add(&c->oc_list, &ocfs2_live_connection_list);
 	else {
 		printk(KERN_ERR
@@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
 	}
 
 	mutex_unlock(&ocfs2_control_lock);
-
-	if (!rc)
-		*c_ret = c;
-	else
-		kfree(c);
-
 	return rc;
 }
 
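The large hunk below lets the userspace stack run without ocfs2_controld: the fs/dlm lockspace is created directly and the locking protocol version is negotiated through the LVB of a dedicated "version_lock" resource (see get_protocol_version() and its comment below). The first mounter takes the lock EX non-blocking and publishes its version; later mounters get -EAGAIN, take PR, and read the LVB. A condensed sketch of that handshake, reusing the helpers introduced below (error handling elided):

	ret = version_lock(conn, DLM_LOCK_EX, DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
	if (!ret) {				/* first mount: publish version */
		version_to_lvb(&running_proto, lc->oc_lvb);
		version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
	} else if (ret == -EAGAIN) {		/* someone already holds it */
		version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
		lvb_to_version(lc->oc_lvb, &pv);	/* then reject major mismatch */
	}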
@@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
 	return 0;
 }
 
+static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
+{
+	struct ocfs2_protocol_version *pv =
+		(struct ocfs2_protocol_version *)lvb;
+	/*
+	 * ocfs2_protocol_version has two u8 variables, so we don't
+	 * need any endian conversion.
+	 */
+	ver->pv_major = pv->pv_major;
+	ver->pv_minor = pv->pv_minor;
+}
+
+static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
+{
+	struct ocfs2_protocol_version *pv =
+		(struct ocfs2_protocol_version *)lvb;
+	/*
+	 * ocfs2_protocol_version has two u8 variables, so we don't
+	 * need any endian conversion.
+	 */
+	pv->pv_major = ver->pv_major;
+	pv->pv_minor = ver->pv_minor;
+}
+
+static void sync_wait_cb(void *arg)
+{
+	struct ocfs2_cluster_connection *conn = arg;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	complete(&lc->oc_sync_wait);
+}
+
+static int sync_unlock(struct ocfs2_cluster_connection *conn,
+		struct dlm_lksb *lksb, char *name)
+{
+	int error;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+
+	error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
+	if (error) {
+		printk(KERN_ERR "%s lkid %x error %d\n",
+				name, lksb->sb_lkid, error);
+		return error;
+	}
+
+	wait_for_completion(&lc->oc_sync_wait);
+
+	if (lksb->sb_status != -DLM_EUNLOCK) {
+		printk(KERN_ERR "%s lkid %x status %d\n",
+				name, lksb->sb_lkid, lksb->sb_status);
+		return -1;
+	}
+	return 0;
+}
+
+static int sync_lock(struct ocfs2_cluster_connection *conn,
+		int mode, uint32_t flags,
+		struct dlm_lksb *lksb, char *name)
+{
+	int error, status;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+
+	error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
+			name, strlen(name),
+			0, sync_wait_cb, conn, NULL);
+	if (error) {
+		printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
+				name, lksb->sb_lkid, flags, mode, error);
+		return error;
+	}
+
+	wait_for_completion(&lc->oc_sync_wait);
+
+	status = lksb->sb_status;
+
+	if (status && status != -EAGAIN) {
+		printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
+				name, lksb->sb_lkid, flags, mode, status);
+	}
+
+	return status;
+}
+
+
+static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
+		int flags)
+{
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	return sync_lock(conn, mode, flags,
+			&lc->oc_version_lksb, VERSION_LOCK);
+}
+
+static int version_unlock(struct ocfs2_cluster_connection *conn)
+{
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+/* get_protocol_version()
+ *
+ * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
+ * The algorithm is:
+ * 1. Attempt to take the lock in EX mode (non-blocking).
+ * 2. If successful (which means it is the first mount), write the
+ *    version number and downconvert to PR lock.
+ * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
+ *    taking the PR lock.
+ */
+
+static int get_protocol_version(struct ocfs2_cluster_connection *conn)
+{
+	int ret;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	struct ocfs2_protocol_version pv;
+
+	running_proto.pv_major =
+		ocfs2_user_plugin.sp_max_proto.pv_major;
+	running_proto.pv_minor =
+		ocfs2_user_plugin.sp_max_proto.pv_minor;
+
+	lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
+	ret = version_lock(conn, DLM_LOCK_EX,
+			DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
+	if (!ret) {
+		conn->cc_version.pv_major = running_proto.pv_major;
+		conn->cc_version.pv_minor = running_proto.pv_minor;
+		version_to_lvb(&running_proto, lc->oc_lvb);
+		version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+	} else if (ret == -EAGAIN) {
+		ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
+		if (ret)
+			goto out;
+		lvb_to_version(lc->oc_lvb, &pv);
+
+		if ((pv.pv_major != running_proto.pv_major) ||
+				(pv.pv_minor > running_proto.pv_minor)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		conn->cc_version.pv_major = pv.pv_major;
+		conn->cc_version.pv_minor = pv.pv_minor;
+	}
+out:
+	return ret;
+}
+
+static void user_recover_prep(void *arg)
+{
+}
+
+static void user_recover_slot(void *arg, struct dlm_slot *slot)
+{
+	struct ocfs2_cluster_connection *conn = arg;
+	printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
+			slot->nodeid, slot->slot);
+	conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
+
+}
+
+static void user_recover_done(void *arg, struct dlm_slot *slots,
+		int num_slots, int our_slot,
+		uint32_t generation)
+{
+	struct ocfs2_cluster_connection *conn = arg;
+	struct ocfs2_live_connection *lc = conn->cc_private;
+	int i;
+
+	for (i = 0; i < num_slots; i++)
+		if (slots[i].slot == our_slot) {
+			atomic_set(&lc->oc_this_node, slots[i].nodeid);
+			break;
+		}
+
+	lc->oc_our_slot = our_slot;
+	wake_up(&lc->oc_wait);
+}
+
+static const struct dlm_lockspace_ops ocfs2_ls_ops = {
+	.recover_prep = user_recover_prep,
+	.recover_slot = user_recover_slot,
+	.recover_done = user_recover_done,
+};
+
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+	version_unlock(conn);
+	dlm_release_lockspace(conn->cc_lockspace, 2);
+	conn->cc_lockspace = NULL;
+	ocfs2_live_connection_drop(conn->cc_private);
+	conn->cc_private = NULL;
+	return 0;
+}
+
 static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
 {
 	dlm_lockspace_t *fsdlm;
-	struct ocfs2_live_connection *uninitialized_var(control);
-	int rc = 0;
+	struct ocfs2_live_connection *lc;
+	int rc, ops_rv;
 
 	BUG_ON(conn == NULL);
 
-	rc = ocfs2_live_connection_new(conn, &control);
+	lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+	if (!lc) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	init_waitqueue_head(&lc->oc_wait);
+	init_completion(&lc->oc_sync_wait);
+	atomic_set(&lc->oc_this_node, 0);
+	conn->cc_private = lc;
+	lc->oc_type = NO_CONTROLD;
+
+	rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
+			       DLM_LSFL_FS, DLM_LVB_LEN,
+			       &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
+	if (rc)
+		goto out;
+
+	if (ops_rv == -EOPNOTSUPP) {
+		lc->oc_type = WITH_CONTROLD;
+		printk(KERN_NOTICE "ocfs2: You seem to be using an older "
+				"version of dlm_controld and/or ocfs2-tools."
+				" Please consider upgrading.\n");
+	} else if (ops_rv) {
+		rc = ops_rv;
+		goto out;
+	}
+	conn->cc_lockspace = fsdlm;
+
+	rc = ocfs2_live_connection_attach(conn, lc);
 	if (rc)
 		goto out;
 
+	if (lc->oc_type == NO_CONTROLD) {
+		rc = get_protocol_version(conn);
+		if (rc) {
+			printk(KERN_ERR "ocfs2: Could not determine"
+					" locking version\n");
+			user_cluster_disconnect(conn);
+			goto out;
+		}
+		wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
+	}
+
 	/*
 	 * running_proto must have been set before we allowed any mounts
 	 * to proceed.
@@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
 	if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
 		printk(KERN_ERR
 		       "Unable to mount with fs locking protocol version "
-		       "%u.%u because the userspace control daemon has "
+		       "%u.%u because negotiated protocol is %u.%u\n",
822 "negotiated %u.%u\n",
823 conn->cc_version.pv_major, conn->cc_version.pv_minor, 1058 conn->cc_version.pv_major, conn->cc_version.pv_minor,
824 running_proto.pv_major, running_proto.pv_minor); 1059 running_proto.pv_major, running_proto.pv_minor);
825 rc = -EPROTO; 1060 rc = -EPROTO;
826 ocfs2_live_connection_drop(control); 1061 ocfs2_live_connection_drop(lc);
827 goto out; 1062 lc = NULL;
828 }
829
830 rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
831 NULL, NULL, NULL, &fsdlm);
832 if (rc) {
833 ocfs2_live_connection_drop(control);
834 goto out;
835 } 1063 }
836 1064
837 conn->cc_private = control;
838 conn->cc_lockspace = fsdlm;
839out: 1065out:
1066 if (rc && lc)
1067 kfree(lc);
840 return rc; 1068 return rc;
841} 1069}
842 1070
843static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
844{
845 dlm_release_lockspace(conn->cc_lockspace, 2);
846 conn->cc_lockspace = NULL;
847 ocfs2_live_connection_drop(conn->cc_private);
848 conn->cc_private = NULL;
849 return 0;
850}
851 1071
852static int user_cluster_this_node(unsigned int *this_node) 1072static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
1073 unsigned int *this_node)
853{ 1074{
854 int rc; 1075 int rc;
1076 struct ocfs2_live_connection *lc = conn->cc_private;
1077
1078 if (lc->oc_type == WITH_CONTROLD)
1079 rc = ocfs2_control_get_this_node();
1080 else if (lc->oc_type == NO_CONTROLD)
1081 rc = atomic_read(&lc->oc_this_node);
1082 else
1083 rc = -EINVAL;
855 1084
856 rc = ocfs2_control_get_this_node();
857 if (rc < 0) 1085 if (rc < 0)
858 return rc; 1086 return rc;
859 1087
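In the NO_CONTROLD case the local node number is only learned once the DLM finishes its first recovery cycle and calls .recover_done(), so the connect path parks on a waitqueue until user_recover_done() publishes it. The producer/consumer pattern, condensed from the code above:

    /* Producer: .recover_done() callback, DLM recovery context. */
    atomic_set(&lc->oc_this_node, slots[i].nodeid);
    wake_up(&lc->oc_wait);

    /* Consumer: user_cluster_connect(), mount context. */
    wait_event(lc->oc_wait, atomic_read(&lc->oc_this_node) > 0);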
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index cb7ec0b63ddc..ca5ce14cbddc 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
309EXPORT_SYMBOL_GPL(ocfs2_plock); 309EXPORT_SYMBOL_GPL(ocfs2_plock);
310 310
311int ocfs2_cluster_connect(const char *stack_name, 311int ocfs2_cluster_connect(const char *stack_name,
312 const char *cluster_name,
313 int cluster_name_len,
312 const char *group, 314 const char *group,
313 int grouplen, 315 int grouplen,
314 struct ocfs2_locking_protocol *lproto, 316 struct ocfs2_locking_protocol *lproto,
@@ -342,8 +344,12 @@ int ocfs2_cluster_connect(const char *stack_name,
342 goto out; 344 goto out;
343 } 345 }
344 346
345 memcpy(new_conn->cc_name, group, grouplen); 347 strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
346 new_conn->cc_namelen = grouplen; 348 new_conn->cc_namelen = grouplen;
349 if (cluster_name_len)
350 strlcpy(new_conn->cc_cluster_name, cluster_name,
351 CLUSTER_NAME_MAX + 1);
352 new_conn->cc_cluster_name_len = cluster_name_len;
347 new_conn->cc_recovery_handler = recovery_handler; 353 new_conn->cc_recovery_handler = recovery_handler;
348 new_conn->cc_recovery_data = recovery_data; 354 new_conn->cc_recovery_data = recovery_data;
349 355
@@ -386,8 +392,9 @@ int ocfs2_cluster_connect_agnostic(const char *group,
386 392
387 if (cluster_stack_name[0]) 393 if (cluster_stack_name[0])
388 stack_name = cluster_stack_name; 394 stack_name = cluster_stack_name;
389 return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, 395 return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
390 recovery_handler, recovery_data, conn); 396 lproto, recovery_handler, recovery_data,
397 conn);
391} 398}
392EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); 399EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
393 400
@@ -460,9 +467,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen)
460} 467}
461EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); 468EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
462 469
463int ocfs2_cluster_this_node(unsigned int *node) 470int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
471 unsigned int *node)
464{ 472{
465 return active_stack->sp_ops->this_node(node); 473 return active_stack->sp_ops->this_node(conn, node);
466} 474}
467EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); 475EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
468 476
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 1ec56fdb8d0d..66334a30cea8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -45,6 +45,9 @@ struct file_lock;
45 */ 45 */
46#define GROUP_NAME_MAX 64 46#define GROUP_NAME_MAX 64
47 47
48/* This shadows OCFS2_CLUSTER_NAME_LEN */
49#define CLUSTER_NAME_MAX 16
50
48 51
49/* 52/*
50 * ocfs2_protocol_version changes when ocfs2 does something different in 53 * ocfs2_protocol_version changes when ocfs2 does something different in
@@ -97,8 +100,10 @@ struct ocfs2_locking_protocol {
97 * locking compatibility. 100 * locking compatibility.
98 */ 101 */
99struct ocfs2_cluster_connection { 102struct ocfs2_cluster_connection {
100 char cc_name[GROUP_NAME_MAX]; 103 char cc_name[GROUP_NAME_MAX + 1];
101 int cc_namelen; 104 int cc_namelen;
105 char cc_cluster_name[CLUSTER_NAME_MAX + 1];
106 int cc_cluster_name_len;
102 struct ocfs2_protocol_version cc_version; 107 struct ocfs2_protocol_version cc_version;
103 struct ocfs2_locking_protocol *cc_proto; 108 struct ocfs2_locking_protocol *cc_proto;
104 void (*cc_recovery_handler)(int node_num, void *recovery_data); 109 void (*cc_recovery_handler)(int node_num, void *recovery_data);
@@ -152,7 +157,8 @@ struct ocfs2_stack_operations {
152 * ->this_node() returns the cluster's unique identifier for the 157 * ->this_node() returns the cluster's unique identifier for the
153 * local node. 158 * local node.
154 */ 159 */
155 int (*this_node)(unsigned int *node); 160 int (*this_node)(struct ocfs2_cluster_connection *conn,
161 unsigned int *node);
156 162
157 /* 163 /*
158 * Call the underlying dlm lock function. The ->dlm_lock() 164 * Call the underlying dlm lock function. The ->dlm_lock()
@@ -239,6 +245,8 @@ struct ocfs2_stack_plugin {
239 245
240/* Used by the filesystem */ 246/* Used by the filesystem */
241int ocfs2_cluster_connect(const char *stack_name, 247int ocfs2_cluster_connect(const char *stack_name,
248 const char *cluster_name,
249 int cluster_name_len,
242 const char *group, 250 const char *group,
243 int grouplen, 251 int grouplen,
244 struct ocfs2_locking_protocol *lproto, 252 struct ocfs2_locking_protocol *lproto,
@@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group,
260int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, 268int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
261 int hangup_pending); 269 int hangup_pending);
262void ocfs2_cluster_hangup(const char *group, int grouplen); 270void ocfs2_cluster_hangup(const char *group, int grouplen);
263int ocfs2_cluster_this_node(unsigned int *node); 271int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
272 unsigned int *node);
264 273
265struct ocfs2_lock_res; 274struct ocfs2_lock_res;
266int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, 275int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
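Threading the connection through ->this_node() is what lets the user-space stack pick, per mount, between asking dlm_controld and returning the value cached from DLM recovery. A caller in the filesystem now looks like this (sketch, error handling trimmed):

    unsigned int node;
    int rc;

    rc = ocfs2_cluster_this_node(conn, &node);  /* conn is now required */
    if (rc)
            return rc;
    /* node: the cluster-unique identifier of the local node */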
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 2c91452c4047..47ae2663a6f5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
113 struct ocfs2_suballoc_result *res); 113 struct ocfs2_suballoc_result *res);
114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, 114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
115 int nr); 115 int nr);
116static inline int ocfs2_block_group_set_bits(handle_t *handle,
117 struct inode *alloc_inode,
118 struct ocfs2_group_desc *bg,
119 struct buffer_head *group_bh,
120 unsigned int bit_off,
121 unsigned int num_bits);
122static int ocfs2_relink_block_group(handle_t *handle, 116static int ocfs2_relink_block_group(handle_t *handle,
123 struct inode *alloc_inode, 117 struct inode *alloc_inode,
124 struct buffer_head *fe_bh, 118 struct buffer_head *fe_bh,
@@ -1343,7 +1337,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1343 return status; 1337 return status;
1344} 1338}
1345 1339
1346static inline int ocfs2_block_group_set_bits(handle_t *handle, 1340int ocfs2_block_group_set_bits(handle_t *handle,
1347 struct inode *alloc_inode, 1341 struct inode *alloc_inode,
1348 struct ocfs2_group_desc *bg, 1342 struct ocfs2_group_desc *bg,
1349 struct buffer_head *group_bh, 1343 struct buffer_head *group_bh,
@@ -1388,8 +1382,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1388 ocfs2_journal_dirty(handle, group_bh); 1382 ocfs2_journal_dirty(handle, group_bh);
1389 1383
1390bail: 1384bail:
1391 if (status)
1392 mlog_errno(status);
1393 return status; 1385 return status;
1394} 1386}
1395 1387
@@ -1588,7 +1580,7 @@ static int ocfs2_block_group_search(struct inode *inode,
1588 return ret; 1580 return ret;
1589} 1581}
1590 1582
1591static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 1583int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1592 handle_t *handle, 1584 handle_t *handle,
1593 struct buffer_head *di_bh, 1585 struct buffer_head *di_bh,
1594 u32 num_bits, 1586 u32 num_bits,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a36d0aa50911..218d8036b3e7 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,6 +86,18 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
86 u32 bits_wanted, 86 u32 bits_wanted,
87 struct ocfs2_alloc_context **ac); 87 struct ocfs2_alloc_context **ac);
88 88
89int ocfs2_alloc_dinode_update_counts(struct inode *inode,
90 handle_t *handle,
91 struct buffer_head *di_bh,
92 u32 num_bits,
93 u16 chain);
94int ocfs2_block_group_set_bits(handle_t *handle,
95 struct inode *alloc_inode,
96 struct ocfs2_group_desc *bg,
97 struct buffer_head *group_bh,
98 unsigned int bit_off,
99 unsigned int num_bits);
100
89int ocfs2_claim_metadata(handle_t *handle, 101int ocfs2_claim_metadata(handle_t *handle,
90 struct ocfs2_alloc_context *ac, 102 struct ocfs2_alloc_context *ac,
91 u32 bits_wanted, 103 u32 bits_wanted,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c41492957aa5..49d84f80f36c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,7 +68,6 @@
68#include "super.h" 68#include "super.h"
69#include "sysfile.h" 69#include "sysfile.h"
70#include "uptodate.h" 70#include "uptodate.h"
71#include "ver.h"
72#include "xattr.h" 71#include "xattr.h"
73#include "quota.h" 72#include "quota.h"
74#include "refcounttree.h" 73#include "refcounttree.h"
@@ -90,6 +89,7 @@ static struct dentry *ocfs2_debugfs_root = NULL;
90 89
91MODULE_AUTHOR("Oracle"); 90MODULE_AUTHOR("Oracle");
92MODULE_LICENSE("GPL"); 91MODULE_LICENSE("GPL");
92MODULE_DESCRIPTION("OCFS2 cluster file system");
93 93
94struct mount_options 94struct mount_options
95{ 95{
@@ -1618,8 +1618,6 @@ static int __init ocfs2_init(void)
1618{ 1618{
1619 int status, i; 1619 int status, i;
1620 1620
1621 ocfs2_print_version();
1622
1623 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) 1621 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1624 init_waitqueue_head(&ocfs2__ioend_wq[i]); 1622 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1625 1623
@@ -1947,11 +1945,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1947 1945
1948 ocfs2_shutdown_local_alloc(osb); 1946 ocfs2_shutdown_local_alloc(osb);
1949 1947
1950 ocfs2_truncate_log_shutdown(osb);
1951
1952 /* This will disable recovery and flush any recovery work. */ 1948 /* This will disable recovery and flush any recovery work. */
1953 ocfs2_recovery_exit(osb); 1949 ocfs2_recovery_exit(osb);
1954 1950
1951 /*
1952 * During dismount, when it recovers another node it will call
1953 * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq.
1954 */
1955 ocfs2_truncate_log_shutdown(osb);
1956
1955 ocfs2_journal_shutdown(osb); 1957 ocfs2_journal_shutdown(osb);
1956 1958
1957 ocfs2_sync_blockdev(sb); 1959 ocfs2_sync_blockdev(sb);
@@ -2225,10 +2227,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2225 if (ocfs2_clusterinfo_valid(osb)) { 2227 if (ocfs2_clusterinfo_valid(osb)) {
2226 osb->osb_stackflags = 2228 osb->osb_stackflags =
2227 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; 2229 OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
2228 memcpy(osb->osb_cluster_stack, 2230 strlcpy(osb->osb_cluster_stack,
2229 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, 2231 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
2230 OCFS2_STACK_LABEL_LEN); 2232 OCFS2_STACK_LABEL_LEN + 1);
2231 osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
2232 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { 2233 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
2233 mlog(ML_ERROR, 2234 mlog(ML_ERROR,
2234 "couldn't mount because of an invalid " 2235 "couldn't mount because of an invalid "
@@ -2237,6 +2238,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
2237 status = -EINVAL; 2238 status = -EINVAL;
2238 goto bail; 2239 goto bail;
2239 } 2240 }
2241 strlcpy(osb->osb_cluster_name,
2242 OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
2243 OCFS2_CLUSTER_NAME_LEN + 1);
2240 } else { 2244 } else {
2241 /* The empty string is identical with classic tools that 2245 /* The empty string is identical with classic tools that
2242 * don't know about s_cluster_info. */ 2246 * don't know about s_cluster_info. */
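The super.c hunk also swaps the memcpy()-plus-manual-NUL sequence for strlcpy(), which terminates the destination itself. One behavioral difference worth noting: strlcpy() stops at the first NUL in the source, while memcpy() copied the raw bytes regardless. Side by side:

    /* Before: copy a fixed-width label, then terminate by hand. */
    memcpy(dst, src, OCFS2_STACK_LABEL_LEN);
    dst[OCFS2_STACK_LABEL_LEN] = '\0';

    /* After: one call; dst must be OCFS2_STACK_LABEL_LEN + 1 bytes. */
    strlcpy(dst, src, OCFS2_STACK_LABEL_LEN + 1);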
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
deleted file mode 100644
index e2488f4128a2..000000000000
--- a/fs/ocfs2/ver.c
+++ /dev/null
@@ -1,43 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/string.h>
28#include <linux/kernel.h>
29
30#include "ver.h"
31
32#define OCFS2_BUILD_VERSION "1.5.0"
33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35
36void ocfs2_print_version(void)
37{
38 printk(KERN_INFO "%s\n", VERSION_STR);
39}
40
41MODULE_DESCRIPTION(VERSION_STR);
42
43MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
deleted file mode 100644
index d7395cb91d2f..000000000000
--- a/fs/ocfs2/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_VER_H
27#define OCFS2_VER_H
28
29void ocfs2_print_version(void);
30
31#endif /* OCFS2_VER_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index f0a1326d9bba..185fa3b7f962 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -99,8 +99,8 @@ static struct ocfs2_xattr_def_value_root def_xv = {
99 99
100const struct xattr_handler *ocfs2_xattr_handlers[] = { 100const struct xattr_handler *ocfs2_xattr_handlers[] = {
101 &ocfs2_xattr_user_handler, 101 &ocfs2_xattr_user_handler,
102 &ocfs2_xattr_acl_access_handler, 102 &posix_acl_access_xattr_handler,
103 &ocfs2_xattr_acl_default_handler, 103 &posix_acl_default_xattr_handler,
104 &ocfs2_xattr_trusted_handler, 104 &ocfs2_xattr_trusted_handler,
105 &ocfs2_xattr_security_handler, 105 &ocfs2_xattr_security_handler,
106 NULL 106 NULL
@@ -109,9 +109,9 @@ const struct xattr_handler *ocfs2_xattr_handlers[] = {
109static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 109static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
110 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 110 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] 111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
112 = &ocfs2_xattr_acl_access_handler, 112 = &posix_acl_access_xattr_handler,
113 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] 113 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
114 = &ocfs2_xattr_acl_default_handler, 114 = &posix_acl_default_xattr_handler,
115 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, 115 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
116 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, 116 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
117}; 117};
@@ -7190,10 +7190,12 @@ out:
7190 */ 7190 */
7191int ocfs2_init_security_and_acl(struct inode *dir, 7191int ocfs2_init_security_and_acl(struct inode *dir,
7192 struct inode *inode, 7192 struct inode *inode,
7193 const struct qstr *qstr) 7193 const struct qstr *qstr,
7194 struct posix_acl *default_acl,
7195 struct posix_acl *acl)
7194{ 7196{
7195 int ret = 0;
7196 struct buffer_head *dir_bh = NULL; 7197 struct buffer_head *dir_bh = NULL;
7198 int ret = 0;
7197 7199
7198 ret = ocfs2_init_security_get(inode, dir, qstr, NULL); 7200 ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
7199 if (ret) { 7201 if (ret) {
@@ -7207,9 +7209,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
7207 goto leave; 7209 goto leave;
7208 } 7210 }
7209 7211
7210 ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); 7212 if (!ret && default_acl)
7211 if (ret) 7213 ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
7212 mlog_errno(ret); 7214 if (!ret && acl)
7215 ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS);
7213 7216
7214 ocfs2_inode_unlock(dir, 0); 7217 ocfs2_inode_unlock(dir, 0);
7215 brelse(dir_bh); 7218 brelse(dir_bh);
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 19f134e896a9..f10d5b93c366 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -40,8 +40,6 @@ struct ocfs2_security_xattr_info {
40extern const struct xattr_handler ocfs2_xattr_user_handler; 40extern const struct xattr_handler ocfs2_xattr_user_handler;
41extern const struct xattr_handler ocfs2_xattr_trusted_handler; 41extern const struct xattr_handler ocfs2_xattr_trusted_handler;
42extern const struct xattr_handler ocfs2_xattr_security_handler; 42extern const struct xattr_handler ocfs2_xattr_security_handler;
43extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
44extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
45extern const struct xattr_handler *ocfs2_xattr_handlers[]; 43extern const struct xattr_handler *ocfs2_xattr_handlers[];
46 44
47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 45ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
@@ -96,5 +94,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
96 bool preserve_security); 94 bool preserve_security);
97int ocfs2_init_security_and_acl(struct inode *dir, 95int ocfs2_init_security_and_acl(struct inode *dir,
98 struct inode *inode, 96 struct inode *inode,
99 const struct qstr *qstr); 97 const struct qstr *qstr,
98 struct posix_acl *default_acl,
99 struct posix_acl *acl);
100#endif /* OCFS2_XATTR_H */ 100#endif /* OCFS2_XATTR_H */
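With ocfs2's private ACL xattr handlers gone, the shared posix_acl_access_xattr_handler / posix_acl_default_xattr_handler (added to fs/posix_acl.c later in this diff) do the xattr marshalling, and a filesystem only supplies ->get_acl()/->set_acl(). The wiring, sketched for a hypothetical filesystem foo:

    /* Sketch: foo_get_acl/foo_set_acl are hypothetical. */
    const struct xattr_handler *foo_xattr_handlers[] = {
            &posix_acl_access_xattr_handler,   /* system.posix_acl_access */
            &posix_acl_default_xattr_handler,  /* system.posix_acl_default */
            NULL
    };

    static const struct inode_operations foo_file_iops = {
            .get_acl        = foo_get_acl,
            .set_acl        = foo_set_acl,
    };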
diff --git a/fs/open.c b/fs/open.c
index 4b3e1edf2fe4..b9ed8b25c108 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f,
705 return 0; 705 return 0;
706 } 706 }
707 707
708 /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
709 if (S_ISREG(inode->i_mode))
710 f->f_mode |= FMODE_ATOMIC_POS;
711
708 f->f_op = fops_get(inode->i_fop); 712 f->f_op = fops_get(inode->i_fop);
709 if (unlikely(WARN_ON(!f->f_op))) { 713 if (unlikely(WARN_ON(!f->f_op))) {
710 error = -ENODEV; 714 error = -ENODEV;
diff --git a/fs/pipe.c b/fs/pipe.c
index 0e0752ef2715..78fd0d0788db 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -663,10 +663,11 @@ out:
663 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); 663 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
664 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 664 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
665 } 665 }
666 if (ret > 0) { 666 if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
667 int err = file_update_time(filp); 667 int err = file_update_time(filp);
668 if (err) 668 if (err)
669 ret = err; 669 ret = err;
670 sb_end_write(file_inode(filp)->i_sb);
670 } 671 }
671 return ret; 672 return ret;
672} 673}
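The pipe change closes a race with filesystem freezing: file_update_time() dirties the inode, so it must not run against a frozen superblock. Rather than block the writer, the trylock form simply skips the timestamp update while a freeze is in progress:

    if (ret > 0 && sb_start_write_trylock(inode->i_sb)) {
            /* Not frozen: safe to dirty the inode with a new mtime. */
            int err = file_update_time(filp);
            if (err)
                    ret = err;
            sb_end_write(inode->i_sb);
    }
    /* else: superblock freezing/frozen, silently skip the update */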
diff --git a/fs/pnode.c b/fs/pnode.c
index c7221bb19801..88396df725b4 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -220,14 +220,14 @@ static struct mount *get_source(struct mount *dest,
220 * @tree_list : list of heads of trees to be attached. 220 * @tree_list : list of heads of trees to be attached.
221 */ 221 */
222int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, 222int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
223 struct mount *source_mnt, struct list_head *tree_list) 223 struct mount *source_mnt, struct hlist_head *tree_list)
224{ 224{
225 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; 225 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
226 struct mount *m, *child; 226 struct mount *m, *child;
227 int ret = 0; 227 int ret = 0;
228 struct mount *prev_dest_mnt = dest_mnt; 228 struct mount *prev_dest_mnt = dest_mnt;
229 struct mount *prev_src_mnt = source_mnt; 229 struct mount *prev_src_mnt = source_mnt;
230 LIST_HEAD(tmp_list); 230 HLIST_HEAD(tmp_list);
231 231
232 for (m = propagation_next(dest_mnt, dest_mnt); m; 232 for (m = propagation_next(dest_mnt, dest_mnt); m;
233 m = propagation_next(m, dest_mnt)) { 233 m = propagation_next(m, dest_mnt)) {
@@ -246,27 +246,29 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
246 child = copy_tree(source, source->mnt.mnt_root, type); 246 child = copy_tree(source, source->mnt.mnt_root, type);
247 if (IS_ERR(child)) { 247 if (IS_ERR(child)) {
248 ret = PTR_ERR(child); 248 ret = PTR_ERR(child);
249 list_splice(tree_list, tmp_list.prev); 249 tmp_list = *tree_list;
250 tmp_list.first->pprev = &tmp_list.first;
251 INIT_HLIST_HEAD(tree_list);
250 goto out; 252 goto out;
251 } 253 }
252 254
253 if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { 255 if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) {
254 mnt_set_mountpoint(m, dest_mp, child); 256 mnt_set_mountpoint(m, dest_mp, child);
255 list_add_tail(&child->mnt_hash, tree_list); 257 hlist_add_head(&child->mnt_hash, tree_list);
256 } else { 258 } else {
257 /* 259 /*
258 * This can happen if the parent mount was bind mounted 260 * This can happen if the parent mount was bind mounted
259 * on some subdirectory of a shared/slave mount. 261 * on some subdirectory of a shared/slave mount.
260 */ 262 */
261 list_add_tail(&child->mnt_hash, &tmp_list); 263 hlist_add_head(&child->mnt_hash, &tmp_list);
262 } 264 }
263 prev_dest_mnt = m; 265 prev_dest_mnt = m;
264 prev_src_mnt = child; 266 prev_src_mnt = child;
265 } 267 }
266out: 268out:
267 lock_mount_hash(); 269 lock_mount_hash();
268 while (!list_empty(&tmp_list)) { 270 while (!hlist_empty(&tmp_list)) {
269 child = list_first_entry(&tmp_list, struct mount, mnt_hash); 271 child = hlist_entry(tmp_list.first, struct mount, mnt_hash);
270 umount_tree(child, 0); 272 umount_tree(child, 0);
271 } 273 }
272 unlock_mount_hash(); 274 unlock_mount_hash();
@@ -338,8 +340,10 @@ static void __propagate_umount(struct mount *mnt)
338 * umount the child only if the child has no 340 * umount the child only if the child has no
339 * other children 341 * other children
340 */ 342 */
341 if (child && list_empty(&child->mnt_mounts)) 343 if (child && list_empty(&child->mnt_mounts)) {
342 list_move_tail(&child->mnt_hash, &mnt->mnt_hash); 344 hlist_del_init_rcu(&child->mnt_hash);
345 hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
346 }
343 } 347 }
344} 348}
345 349
@@ -350,11 +354,11 @@ static void __propagate_umount(struct mount *mnt)
350 * 354 *
351 * vfsmount lock must be held for write 355 * vfsmount lock must be held for write
352 */ 356 */
353int propagate_umount(struct list_head *list) 357int propagate_umount(struct hlist_head *list)
354{ 358{
355 struct mount *mnt; 359 struct mount *mnt;
356 360
357 list_for_each_entry(mnt, list, mnt_hash) 361 hlist_for_each_entry(mnt, list, mnt_hash)
358 __propagate_umount(mnt); 362 __propagate_umount(mnt);
359 return 0; 363 return 0;
360} 364}
diff --git a/fs/pnode.h b/fs/pnode.h
index 59e7eda1851e..fc28a27fa892 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -36,8 +36,8 @@ static inline void set_mnt_shared(struct mount *mnt)
36 36
37void change_mnt_propagation(struct mount *, int); 37void change_mnt_propagation(struct mount *, int);
38int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, 38int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
39 struct list_head *); 39 struct hlist_head *);
40int propagate_umount(struct list_head *); 40int propagate_umount(struct hlist_head *);
41int propagate_mount_busy(struct mount *, int); 41int propagate_mount_busy(struct mount *, int);
42void mnt_release_group_id(struct mount *); 42void mnt_release_group_id(struct mount *);
43int get_dominating_id(struct mount *mnt, const struct path *root); 43int get_dominating_id(struct mount *mnt, const struct path *root);
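Moving mnt_hash from list_head to hlist_head halves each hash bucket to a single pointer, at the cost of losing O(1) tail insertion and list_move_tail(); hence the hlist_add_head()/hlist_entry() idioms in propagate_mnt() above. The temporary-list pattern in isolation:

    HLIST_HEAD(tmp_list);                   /* empty head: .first == NULL */

    hlist_add_head(&child->mnt_hash, &tmp_list);

    while (!hlist_empty(&tmp_list)) {
            child = hlist_entry(tmp_list.first, struct mount, mnt_hash);
            umount_tree(child, 0);          /* must unlink child->mnt_hash */
    }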
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 8bd2135b7f82..11c54fd51e16 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -1,10 +1,8 @@
1/* 1/*
2 * linux/fs/posix_acl.c 2 * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org>
3 * 3 *
4 * Copyright (C) 2002 by Andreas Gruenbacher <a.gruenbacher@computer.org> 4 * Fixes from William Schumacher incorporated on 15 March 2001.
5 * 5 * (Reported by Charles Bertsch, <CBertsch@microtest.com>).
6 * Fixes from William Schumacher incorporated on 15 March 2001.
7 * (Reported by Charles Bertsch, <CBertsch@microtest.com>).
8 */ 6 */
9 7
10/* 8/*
@@ -18,15 +16,112 @@
18#include <linux/fs.h> 16#include <linux/fs.h>
19#include <linux/sched.h> 17#include <linux/sched.h>
20#include <linux/posix_acl.h> 18#include <linux/posix_acl.h>
19#include <linux/posix_acl_xattr.h>
20#include <linux/xattr.h>
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/user_namespace.h>
22 23
23#include <linux/errno.h> 24struct posix_acl **acl_by_type(struct inode *inode, int type)
25{
26 switch (type) {
27 case ACL_TYPE_ACCESS:
28 return &inode->i_acl;
29 case ACL_TYPE_DEFAULT:
30 return &inode->i_default_acl;
31 default:
32 BUG();
33 }
34}
35EXPORT_SYMBOL(acl_by_type);
24 36
25EXPORT_SYMBOL(posix_acl_init); 37struct posix_acl *get_cached_acl(struct inode *inode, int type)
26EXPORT_SYMBOL(posix_acl_alloc); 38{
27EXPORT_SYMBOL(posix_acl_valid); 39 struct posix_acl **p = acl_by_type(inode, type);
28EXPORT_SYMBOL(posix_acl_equiv_mode); 40 struct posix_acl *acl = ACCESS_ONCE(*p);
29EXPORT_SYMBOL(posix_acl_from_mode); 41 if (acl) {
42 spin_lock(&inode->i_lock);
43 acl = *p;
44 if (acl != ACL_NOT_CACHED)
45 acl = posix_acl_dup(acl);
46 spin_unlock(&inode->i_lock);
47 }
48 return acl;
49}
50EXPORT_SYMBOL(get_cached_acl);
51
52struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
53{
54 return rcu_dereference(*acl_by_type(inode, type));
55}
56EXPORT_SYMBOL(get_cached_acl_rcu);
57
58void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
59{
60 struct posix_acl **p = acl_by_type(inode, type);
61 struct posix_acl *old;
62 spin_lock(&inode->i_lock);
63 old = *p;
64 rcu_assign_pointer(*p, posix_acl_dup(acl));
65 spin_unlock(&inode->i_lock);
66 if (old != ACL_NOT_CACHED)
67 posix_acl_release(old);
68}
69EXPORT_SYMBOL(set_cached_acl);
70
71void forget_cached_acl(struct inode *inode, int type)
72{
73 struct posix_acl **p = acl_by_type(inode, type);
74 struct posix_acl *old;
75 spin_lock(&inode->i_lock);
76 old = *p;
77 *p = ACL_NOT_CACHED;
78 spin_unlock(&inode->i_lock);
79 if (old != ACL_NOT_CACHED)
80 posix_acl_release(old);
81}
82EXPORT_SYMBOL(forget_cached_acl);
83
84void forget_all_cached_acls(struct inode *inode)
85{
86 struct posix_acl *old_access, *old_default;
87 spin_lock(&inode->i_lock);
88 old_access = inode->i_acl;
89 old_default = inode->i_default_acl;
90 inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
91 spin_unlock(&inode->i_lock);
92 if (old_access != ACL_NOT_CACHED)
93 posix_acl_release(old_access);
94 if (old_default != ACL_NOT_CACHED)
95 posix_acl_release(old_default);
96}
97EXPORT_SYMBOL(forget_all_cached_acls);
98
99struct posix_acl *get_acl(struct inode *inode, int type)
100{
101 struct posix_acl *acl;
102
103 acl = get_cached_acl(inode, type);
104 if (acl != ACL_NOT_CACHED)
105 return acl;
106
107 if (!IS_POSIXACL(inode))
108 return NULL;
109
110 /*
111 * A filesystem can force a ACL callback by just never filling the
112 * ACL cache. But normally you'd fill the cache either at inode
113 * instantiation time, or on the first ->get_acl call.
114 *
115 * If the filesystem doesn't have a get_acl() function at all, we'll
116 * just create the negative cache entry.
117 */
118 if (!inode->i_op->get_acl) {
119 set_cached_acl(inode, type, NULL);
120 return NULL;
121 }
122 return inode->i_op->get_acl(inode, type);
123}
124EXPORT_SYMBOL(get_acl);
30 125
31/* 126/*
32 * Init a fresh posix_acl 127 * Init a fresh posix_acl
@@ -37,6 +132,7 @@ posix_acl_init(struct posix_acl *acl, int count)
37 atomic_set(&acl->a_refcount, 1); 132 atomic_set(&acl->a_refcount, 1);
38 acl->a_count = count; 133 acl->a_count = count;
39} 134}
135EXPORT_SYMBOL(posix_acl_init);
40 136
41/* 137/*
42 * Allocate a new ACL with the specified number of entries. 138 * Allocate a new ACL with the specified number of entries.
@@ -51,6 +147,7 @@ posix_acl_alloc(int count, gfp_t flags)
51 posix_acl_init(acl, count); 147 posix_acl_init(acl, count);
52 return acl; 148 return acl;
53} 149}
150EXPORT_SYMBOL(posix_acl_alloc);
54 151
55/* 152/*
56 * Clone an ACL. 153 * Clone an ACL.
@@ -78,8 +175,6 @@ posix_acl_valid(const struct posix_acl *acl)
78{ 175{
79 const struct posix_acl_entry *pa, *pe; 176 const struct posix_acl_entry *pa, *pe;
80 int state = ACL_USER_OBJ; 177 int state = ACL_USER_OBJ;
81 kuid_t prev_uid = INVALID_UID;
82 kgid_t prev_gid = INVALID_GID;
83 int needs_mask = 0; 178 int needs_mask = 0;
84 179
85 FOREACH_ACL_ENTRY(pa, acl, pe) { 180 FOREACH_ACL_ENTRY(pa, acl, pe) {
@@ -98,10 +193,6 @@ posix_acl_valid(const struct posix_acl *acl)
98 return -EINVAL; 193 return -EINVAL;
99 if (!uid_valid(pa->e_uid)) 194 if (!uid_valid(pa->e_uid))
100 return -EINVAL; 195 return -EINVAL;
101 if (uid_valid(prev_uid) &&
102 uid_lte(pa->e_uid, prev_uid))
103 return -EINVAL;
104 prev_uid = pa->e_uid;
105 needs_mask = 1; 196 needs_mask = 1;
106 break; 197 break;
107 198
@@ -117,10 +208,6 @@ posix_acl_valid(const struct posix_acl *acl)
117 return -EINVAL; 208 return -EINVAL;
118 if (!gid_valid(pa->e_gid)) 209 if (!gid_valid(pa->e_gid))
119 return -EINVAL; 210 return -EINVAL;
120 if (gid_valid(prev_gid) &&
121 gid_lte(pa->e_gid, prev_gid))
122 return -EINVAL;
123 prev_gid = pa->e_gid;
124 needs_mask = 1; 211 needs_mask = 1;
125 break; 212 break;
126 213
@@ -146,6 +233,7 @@ posix_acl_valid(const struct posix_acl *acl)
146 return 0; 233 return 0;
147 return -EINVAL; 234 return -EINVAL;
148} 235}
236EXPORT_SYMBOL(posix_acl_valid);
149 237
150/* 238/*
151 * Returns 0 if the acl can be exactly represented in the traditional 239 * Returns 0 if the acl can be exactly represented in the traditional
@@ -186,6 +274,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
186 *mode_p = (*mode_p & ~S_IRWXUGO) | mode; 274 *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
187 return not_equiv; 275 return not_equiv;
188} 276}
277EXPORT_SYMBOL(posix_acl_equiv_mode);
189 278
190/* 279/*
191 * Create an ACL representing the file mode permission bits of an inode. 280 * Create an ACL representing the file mode permission bits of an inode.
@@ -207,6 +296,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags)
207 acl->a_entries[2].e_perm = (mode & S_IRWXO); 296 acl->a_entries[2].e_perm = (mode & S_IRWXO);
208 return acl; 297 return acl;
209} 298}
299EXPORT_SYMBOL(posix_acl_from_mode);
210 300
211/* 301/*
212 * Return 0 if current is granted want access to the inode 302 * Return 0 if current is granted want access to the inode
@@ -338,7 +428,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
338/* 428/*
339 * Modify the ACL for the chmod syscall. 429 * Modify the ACL for the chmod syscall.
340 */ 430 */
341static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) 431static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
342{ 432{
343 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; 433 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
344 struct posix_acl_entry *pa, *pe; 434 struct posix_acl_entry *pa, *pe;
@@ -384,7 +474,7 @@ static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
384} 474}
385 475
386int 476int
387posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) 477__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
388{ 478{
389 struct posix_acl *clone = posix_acl_clone(*acl, gfp); 479 struct posix_acl *clone = posix_acl_clone(*acl, gfp);
390 int err = -ENOMEM; 480 int err = -ENOMEM;
@@ -399,15 +489,15 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
399 *acl = clone; 489 *acl = clone;
400 return err; 490 return err;
401} 491}
402EXPORT_SYMBOL(posix_acl_create); 492EXPORT_SYMBOL(__posix_acl_create);
403 493
404int 494int
405posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) 495__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
406{ 496{
407 struct posix_acl *clone = posix_acl_clone(*acl, gfp); 497 struct posix_acl *clone = posix_acl_clone(*acl, gfp);
408 int err = -ENOMEM; 498 int err = -ENOMEM;
409 if (clone) { 499 if (clone) {
410 err = posix_acl_chmod_masq(clone, mode); 500 err = __posix_acl_chmod_masq(clone, mode);
411 if (err) { 501 if (err) {
412 posix_acl_release(clone); 502 posix_acl_release(clone);
413 clone = NULL; 503 clone = NULL;
@@ -417,4 +507,388 @@ posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
417 *acl = clone; 507 *acl = clone;
418 return err; 508 return err;
419} 509}
510EXPORT_SYMBOL(__posix_acl_chmod);
511
512int
513posix_acl_chmod(struct inode *inode, umode_t mode)
514{
515 struct posix_acl *acl;
516 int ret = 0;
517
518 if (!IS_POSIXACL(inode))
519 return 0;
520 if (!inode->i_op->set_acl)
521 return -EOPNOTSUPP;
522
523 acl = get_acl(inode, ACL_TYPE_ACCESS);
524 if (IS_ERR_OR_NULL(acl)) {
525 if (acl == ERR_PTR(-EOPNOTSUPP))
526 return 0;
527 return PTR_ERR(acl);
528 }
529
530 ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
531 if (ret)
532 return ret;
533 ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS);
534 posix_acl_release(acl);
535 return ret;
536}
420EXPORT_SYMBOL(posix_acl_chmod); 537EXPORT_SYMBOL(posix_acl_chmod);
538
539int
540posix_acl_create(struct inode *dir, umode_t *mode,
541 struct posix_acl **default_acl, struct posix_acl **acl)
542{
543 struct posix_acl *p;
544 int ret;
545
546 if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
547 goto no_acl;
548
549 p = get_acl(dir, ACL_TYPE_DEFAULT);
550 if (IS_ERR(p)) {
551 if (p == ERR_PTR(-EOPNOTSUPP))
552 goto apply_umask;
553 return PTR_ERR(p);
554 }
555
556 if (!p)
557 goto apply_umask;
558
559 *acl = posix_acl_clone(p, GFP_NOFS);
560 if (!*acl)
561 return -ENOMEM;
562
563 ret = posix_acl_create_masq(*acl, mode);
564 if (ret < 0) {
565 posix_acl_release(*acl);
566 return -ENOMEM;
567 }
568
569 if (ret == 0) {
570 posix_acl_release(*acl);
571 *acl = NULL;
572 }
573
574 if (!S_ISDIR(*mode)) {
575 posix_acl_release(p);
576 *default_acl = NULL;
577 } else {
578 *default_acl = p;
579 }
580 return 0;
581
582apply_umask:
583 *mode &= ~current_umask();
584no_acl:
585 *default_acl = NULL;
586 *acl = NULL;
587 return 0;
588}
589EXPORT_SYMBOL_GPL(posix_acl_create);
590
591/*
592 * Fix up the uids and gids in posix acl extended attributes in place.
593 */
594static void posix_acl_fix_xattr_userns(
595 struct user_namespace *to, struct user_namespace *from,
596 void *value, size_t size)
597{
598 posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
599 posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
600 int count;
601 kuid_t uid;
602 kgid_t gid;
603
604 if (!value)
605 return;
606 if (size < sizeof(posix_acl_xattr_header))
607 return;
608 if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
609 return;
610
611 count = posix_acl_xattr_count(size);
612 if (count < 0)
613 return;
614 if (count == 0)
615 return;
616
617 for (end = entry + count; entry != end; entry++) {
618 switch(le16_to_cpu(entry->e_tag)) {
619 case ACL_USER:
620 uid = make_kuid(from, le32_to_cpu(entry->e_id));
621 entry->e_id = cpu_to_le32(from_kuid(to, uid));
622 break;
623 case ACL_GROUP:
624 gid = make_kgid(from, le32_to_cpu(entry->e_id));
625 entry->e_id = cpu_to_le32(from_kgid(to, gid));
626 break;
627 default:
628 break;
629 }
630 }
631}
632
633void posix_acl_fix_xattr_from_user(void *value, size_t size)
634{
635 struct user_namespace *user_ns = current_user_ns();
636 if (user_ns == &init_user_ns)
637 return;
638 posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
639}
640
641void posix_acl_fix_xattr_to_user(void *value, size_t size)
642{
643 struct user_namespace *user_ns = current_user_ns();
644 if (user_ns == &init_user_ns)
645 return;
646 posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
647}
648
649/*
650 * Convert from extended attribute to in-memory representation.
651 */
652struct posix_acl *
653posix_acl_from_xattr(struct user_namespace *user_ns,
654 const void *value, size_t size)
655{
656 posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
657 posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
658 int count;
659 struct posix_acl *acl;
660 struct posix_acl_entry *acl_e;
661
662 if (!value)
663 return NULL;
664 if (size < sizeof(posix_acl_xattr_header))
665 return ERR_PTR(-EINVAL);
666 if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
667 return ERR_PTR(-EOPNOTSUPP);
668
669 count = posix_acl_xattr_count(size);
670 if (count < 0)
671 return ERR_PTR(-EINVAL);
672 if (count == 0)
673 return NULL;
674
675 acl = posix_acl_alloc(count, GFP_NOFS);
676 if (!acl)
677 return ERR_PTR(-ENOMEM);
678 acl_e = acl->a_entries;
679
680 for (end = entry + count; entry != end; acl_e++, entry++) {
681 acl_e->e_tag = le16_to_cpu(entry->e_tag);
682 acl_e->e_perm = le16_to_cpu(entry->e_perm);
683
684 switch(acl_e->e_tag) {
685 case ACL_USER_OBJ:
686 case ACL_GROUP_OBJ:
687 case ACL_MASK:
688 case ACL_OTHER:
689 break;
690
691 case ACL_USER:
692 acl_e->e_uid =
693 make_kuid(user_ns,
694 le32_to_cpu(entry->e_id));
695 if (!uid_valid(acl_e->e_uid))
696 goto fail;
697 break;
698 case ACL_GROUP:
699 acl_e->e_gid =
700 make_kgid(user_ns,
701 le32_to_cpu(entry->e_id));
702 if (!gid_valid(acl_e->e_gid))
703 goto fail;
704 break;
705
706 default:
707 goto fail;
708 }
709 }
710 return acl;
711
712fail:
713 posix_acl_release(acl);
714 return ERR_PTR(-EINVAL);
715}
716EXPORT_SYMBOL (posix_acl_from_xattr);
717
718/*
719 * Convert from in-memory to extended attribute representation.
720 */
721int
722posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
723 void *buffer, size_t size)
724{
725 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
726 posix_acl_xattr_entry *ext_entry = ext_acl->a_entries;
727 int real_size, n;
728
729 real_size = posix_acl_xattr_size(acl->a_count);
730 if (!buffer)
731 return real_size;
732 if (real_size > size)
733 return -ERANGE;
734
735 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
736
737 for (n=0; n < acl->a_count; n++, ext_entry++) {
738 const struct posix_acl_entry *acl_e = &acl->a_entries[n];
739 ext_entry->e_tag = cpu_to_le16(acl_e->e_tag);
740 ext_entry->e_perm = cpu_to_le16(acl_e->e_perm);
741 switch(acl_e->e_tag) {
742 case ACL_USER:
743 ext_entry->e_id =
744 cpu_to_le32(from_kuid(user_ns, acl_e->e_uid));
745 break;
746 case ACL_GROUP:
747 ext_entry->e_id =
748 cpu_to_le32(from_kgid(user_ns, acl_e->e_gid));
749 break;
750 default:
751 ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
752 break;
753 }
754 }
755 return real_size;
756}
757EXPORT_SYMBOL (posix_acl_to_xattr);
758
759static int
760posix_acl_xattr_get(struct dentry *dentry, const char *name,
761 void *value, size_t size, int type)
762{
763 struct posix_acl *acl;
764 int error;
765
766 if (!IS_POSIXACL(dentry->d_inode))
767 return -EOPNOTSUPP;
768 if (S_ISLNK(dentry->d_inode->i_mode))
769 return -EOPNOTSUPP;
770
771 acl = get_acl(dentry->d_inode, type);
772 if (IS_ERR(acl))
773 return PTR_ERR(acl);
774 if (acl == NULL)
775 return -ENODATA;
776
777 error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
778 posix_acl_release(acl);
779
780 return error;
781}
782
783static int
784posix_acl_xattr_set(struct dentry *dentry, const char *name,
785 const void *value, size_t size, int flags, int type)
786{
787 struct inode *inode = dentry->d_inode;
788 struct posix_acl *acl = NULL;
789 int ret;
790
791 if (!IS_POSIXACL(inode))
792 return -EOPNOTSUPP;
793 if (!inode->i_op->set_acl)
794 return -EOPNOTSUPP;
795
796 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
797 return value ? -EACCES : 0;
798 if (!inode_owner_or_capable(inode))
799 return -EPERM;
800
801 if (value) {
802 acl = posix_acl_from_xattr(&init_user_ns, value, size);
803 if (IS_ERR(acl))
804 return PTR_ERR(acl);
805
806 if (acl) {
807 ret = posix_acl_valid(acl);
808 if (ret)
809 goto out;
810 }
811 }
812
813 ret = inode->i_op->set_acl(inode, acl, type);
814out:
815 posix_acl_release(acl);
816 return ret;
817}
818
819static size_t
820posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
821 const char *name, size_t name_len, int type)
822{
823 const char *xname;
824 size_t size;
825
826 if (!IS_POSIXACL(dentry->d_inode))
827 return -EOPNOTSUPP;
828 if (S_ISLNK(dentry->d_inode->i_mode))
829 return -EOPNOTSUPP;
830
831 if (type == ACL_TYPE_ACCESS)
832 xname = POSIX_ACL_XATTR_ACCESS;
833 else
834 xname = POSIX_ACL_XATTR_DEFAULT;
835
836 size = strlen(xname) + 1;
837 if (list && size <= list_size)
838 memcpy(list, xname, size);
839 return size;
840}
841
842const struct xattr_handler posix_acl_access_xattr_handler = {
843 .prefix = POSIX_ACL_XATTR_ACCESS,
844 .flags = ACL_TYPE_ACCESS,
845 .list = posix_acl_xattr_list,
846 .get = posix_acl_xattr_get,
847 .set = posix_acl_xattr_set,
848};
849EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
850
851const struct xattr_handler posix_acl_default_xattr_handler = {
852 .prefix = POSIX_ACL_XATTR_DEFAULT,
853 .flags = ACL_TYPE_DEFAULT,
854 .list = posix_acl_xattr_list,
855 .get = posix_acl_xattr_get,
856 .set = posix_acl_xattr_set,
857};
858EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler);
859
860int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
861{
862 int error;
863
864 if (type == ACL_TYPE_ACCESS) {
865 error = posix_acl_equiv_mode(acl, &inode->i_mode);
866 if (error < 0)
867 return 0;
868 if (error == 0)
869 acl = NULL;
870 }
871
872 inode->i_ctime = CURRENT_TIME;
873 set_cached_acl(inode, type, acl);
874 return 0;
875}
876
877int simple_acl_create(struct inode *dir, struct inode *inode)
878{
879 struct posix_acl *default_acl, *acl;
880 int error;
881
882 error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
883 if (error)
884 return error;
885
886 set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl);
887 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
888
889 if (default_acl)
890 posix_acl_release(default_acl);
891 if (acl)
892 posix_acl_release(acl);
893 return 0;
894}
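simple_set_acl()/simple_acl_create() above show the cache-only consumers; a disk filesystem's ->create() path is expected to use the new posix_acl_create() along these lines (sketch; foo_set_acl is hypothetical):

    struct posix_acl *default_acl, *acl;
    int error;

    error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
    if (error)
            return error;

    if (default_acl) {
            error = foo_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
            posix_acl_release(default_acl);
    }
    if (acl) {
            if (!error)
                    error = foo_set_acl(inode, acl, ACL_TYPE_ACCESS);
            posix_acl_release(acl);
    }
    return error;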
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 1bd2077187fd..656e401794de 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -140,24 +140,15 @@ static const char * const task_state_array[] = {
140 "t (tracing stop)", /* 8 */ 140 "t (tracing stop)", /* 8 */
141 "Z (zombie)", /* 16 */ 141 "Z (zombie)", /* 16 */
142 "X (dead)", /* 32 */ 142 "X (dead)", /* 32 */
143 "x (dead)", /* 64 */
144 "K (wakekill)", /* 128 */
145 "W (waking)", /* 256 */
146 "P (parked)", /* 512 */
147}; 143};
148 144
149static inline const char *get_task_state(struct task_struct *tsk) 145static inline const char *get_task_state(struct task_struct *tsk)
150{ 146{
151 unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; 147 unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
152 const char * const *p = &task_state_array[0];
153 148
154 BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); 149 BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
155 150
156 while (state) { 151 return task_state_array[fls(state)];
157 p++;
158 state >>= 1;
159 }
160 return *p;
161} 152}
162 153
163static inline void task_state(struct seq_file *m, struct pid_namespace *ns, 154static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
@@ -453,8 +444,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
453 min_flt += t->min_flt; 444 min_flt += t->min_flt;
454 maj_flt += t->maj_flt; 445 maj_flt += t->maj_flt;
455 gtime += task_gtime(t); 446 gtime += task_gtime(t);
456 t = next_thread(t); 447 } while_each_thread(task, t);
457 } while (t != task);
458 448
459 min_flt += sig->min_flt; 449 min_flt += sig->min_flt;
460 maj_flt += sig->maj_flt; 450 maj_flt += sig->maj_flt;
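The rewritten get_task_state() replaces the shift-and-count loop with fls(): state is masked to TASK_REPORT, so fls() yields 0 for a running task and the 1-based index of the highest set state bit otherwise, which is exactly the slot in task_state_array. A userspace check of the arithmetic (fls() itself is kernel-only, so a stand-in is used):

    #include <stdio.h>

    /* Stand-in for the kernel's fls(): 1-based index of the highest
     * set bit, 0 if no bit is set. */
    static int fls_demo(unsigned int x)
    {
            int r = 0;

            while (x) {
                    x >>= 1;
                    r++;
            }
            return r;
    }

    int main(void)
    {
            /* 0 -> slot 0 "R (running)", 0x01 -> slot 1 "S (sleeping)",
             * 0x04 -> slot 3 "T (stopped)", 0x10 -> slot 5 "Z (zombie)",
             * matching task_state_array above. */
            printf("%d %d %d %d\n",
                   fls_demo(0), fls_demo(0x01), fls_demo(0x04), fls_demo(0x10));
            return 0;
    }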
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 03c8d747be48..b9760628e1fd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1658,13 +1658,18 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
1658 return 0; 1658 return 0;
1659} 1659}
1660 1660
1661static inline bool proc_inode_is_dead(struct inode *inode)
1662{
1663 return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
1664}
1665
1661int pid_delete_dentry(const struct dentry *dentry) 1666int pid_delete_dentry(const struct dentry *dentry)
1662{ 1667{
1663 /* Is the task we represent dead? 1668 /* Is the task we represent dead?
1664 * If so, then don't put the dentry on the lru list, 1669 * If so, then don't put the dentry on the lru list,
1665 * kill it immediately. 1670 * kill it immediately.
1666 */ 1671 */
1667 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; 1672 return proc_inode_is_dead(dentry->d_inode);
1668} 1673}
1669 1674
1670const struct dentry_operations pid_dentry_operations = 1675const struct dentry_operations pid_dentry_operations =
@@ -1819,6 +1824,7 @@ static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
1819 if (rc) 1824 if (rc)
1820 goto out_mmput; 1825 goto out_mmput;
1821 1826
1827 rc = -ENOENT;
1822 down_read(&mm->mmap_sem); 1828 down_read(&mm->mmap_sem);
1823 vma = find_exact_vma(mm, vm_start, vm_end); 1829 vma = find_exact_vma(mm, vm_start, vm_end);
1824 if (vma && vma->vm_file) { 1830 if (vma && vma->vm_file) {
@@ -3092,34 +3098,42 @@ out_no_task:
3092 * In the case of a seek we start with the leader and walk nr 3098 * In the case of a seek we start with the leader and walk nr
3093 * threads past it. 3099 * threads past it.
3094 */ 3100 */
3095static struct task_struct *first_tid(struct task_struct *leader, 3101static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
3096 int tid, int nr, struct pid_namespace *ns) 3102 struct pid_namespace *ns)
3097{ 3103{
3098 struct task_struct *pos; 3104 struct task_struct *pos, *task;
3105 unsigned long nr = f_pos;
3106
3107 if (nr != f_pos) /* 32bit overflow? */
3108 return NULL;
3099 3109
3100 rcu_read_lock(); 3110 rcu_read_lock();
3101 /* Attempt to start with the pid of a thread */ 3111 task = pid_task(pid, PIDTYPE_PID);
3102 if (tid && (nr > 0)) { 3112 if (!task)
3113 goto fail;
3114
3115 /* Attempt to start with the tid of a thread */
3116 if (tid && nr) {
3103 pos = find_task_by_pid_ns(tid, ns); 3117 pos = find_task_by_pid_ns(tid, ns);
3104 if (pos && (pos->group_leader == leader)) 3118 if (pos && same_thread_group(pos, task))
3105 goto found; 3119 goto found;
3106 } 3120 }
3107 3121
3108 /* If nr exceeds the number of threads there is nothing todo */ 3122 /* If nr exceeds the number of threads there is nothing todo */
3109 pos = NULL; 3123 if (nr >= get_nr_threads(task))
3110 if (nr && nr >= get_nr_threads(leader)) 3124 goto fail;
3111 goto out;
3112 3125
3113 /* If we haven't found our starting place yet start 3126 /* If we haven't found our starting place yet start
3114 * with the leader and walk nr threads forward. 3127 * with the leader and walk nr threads forward.
3115 */ 3128 */
3116 for (pos = leader; nr > 0; --nr) { 3129 pos = task = task->group_leader;
3117 pos = next_thread(pos); 3130 do {
3118 if (pos == leader) { 3131 if (!nr--)
3119 pos = NULL; 3132 goto found;
3120 goto out; 3133 } while_each_thread(task, pos);
3121 } 3134fail:
3122 } 3135 pos = NULL;
3136 goto out;
3123found: 3137found:
3124 get_task_struct(pos); 3138 get_task_struct(pos);
3125out: 3139out:
@@ -3152,25 +3166,16 @@ static struct task_struct *next_tid(struct task_struct *start)
3152/* for the /proc/TGID/task/ directories */ 3166/* for the /proc/TGID/task/ directories */
3153static int proc_task_readdir(struct file *file, struct dir_context *ctx) 3167static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3154{ 3168{
3155 struct task_struct *leader = NULL; 3169 struct inode *inode = file_inode(file);
3156 struct task_struct *task = get_proc_task(file_inode(file)); 3170 struct task_struct *task;
3157 struct pid_namespace *ns; 3171 struct pid_namespace *ns;
3158 int tid; 3172 int tid;
3159 3173
3160 if (!task) 3174 if (proc_inode_is_dead(inode))
3161 return -ENOENT;
3162 rcu_read_lock();
3163 if (pid_alive(task)) {
3164 leader = task->group_leader;
3165 get_task_struct(leader);
3166 }
3167 rcu_read_unlock();
3168 put_task_struct(task);
3169 if (!leader)
3170 return -ENOENT; 3175 return -ENOENT;
3171 3176
3172 if (!dir_emit_dots(file, ctx)) 3177 if (!dir_emit_dots(file, ctx))
3173 goto out; 3178 return 0;
3174 3179
3175 /* f_version caches the tgid value that the last readdir call couldn't 3180 /* f_version caches the tgid value that the last readdir call couldn't
3176 * return. lseek aka telldir automagically resets f_version to 0. 3181 * return. lseek aka telldir automagically resets f_version to 0.
@@ -3178,7 +3183,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3178 ns = file->f_dentry->d_sb->s_fs_info; 3183 ns = file->f_dentry->d_sb->s_fs_info;
3179 tid = (int)file->f_version; 3184 tid = (int)file->f_version;
3180 file->f_version = 0; 3185 file->f_version = 0;
3181 for (task = first_tid(leader, tid, ctx->pos - 2, ns); 3186 for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
3182 task; 3187 task;
3183 task = next_tid(task), ctx->pos++) { 3188 task = next_tid(task), ctx->pos++) {
3184 char name[PROC_NUMBUF]; 3189 char name[PROC_NUMBUF];
@@ -3194,8 +3199,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3194 break; 3199 break;
3195 } 3200 }
3196 } 3201 }
3197out: 3202
3198 put_task_struct(leader);
3199 return 0; 3203 return 0;
3200} 3204}
3201 3205
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index 82676e3fcd1d..cbd82dff7e81 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -26,4 +26,4 @@ static int __init proc_cmdline_init(void)
26 proc_create("cmdline", 0, NULL, &cmdline_proc_fops); 26 proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
27 return 0; 27 return 0;
28} 28}
29module_init(proc_cmdline_init); 29fs_initcall(proc_cmdline_init);
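The module_init() -> fs_initcall() conversions across fs/proc below are not cosmetic: procfs is bool-only in Kconfig, and module_init() on built-in code silently maps to device_initcall() (level 6). fs_initcall() registers one level earlier and documents that the code can never be modular. The relevant ordering (see include/linux/init.h):

    /* Initcall levels for built-in code. */
    core_initcall(fn)       /* level 1 */
    fs_initcall(fn)         /* level 5: these proc entries now here */
    device_initcall(fn)     /* level 6: where module_init() put them */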
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index 51942d5abcec..290ba85cb900 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -109,4 +109,4 @@ static int __init proc_consoles_init(void)
109 proc_create("consoles", 0, NULL, &proc_consoles_operations); 109 proc_create("consoles", 0, NULL, &proc_consoles_operations);
110 return 0; 110 return 0;
111} 111}
112module_init(proc_consoles_init); 112fs_initcall(proc_consoles_init);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 5a1e539a234b..06f4d31e0396 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -21,4 +21,4 @@ static int __init proc_cpuinfo_init(void)
 	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
 	return 0;
 }
-module_init(proc_cpuinfo_init);
+fs_initcall(proc_cpuinfo_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index b14347167c35..50493edc30e5 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -67,4 +67,4 @@ static int __init proc_devices_init(void)
 	proc_create("devices", 0, NULL, &proc_devinfo_operations);
 	return 0;
 }
-module_init(proc_devices_init);
+fs_initcall(proc_devices_init);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index cca93b6fb9a9..b7f268eb5f45 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -49,8 +49,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 	setattr_copy(inode, iattr);
 	mark_inode_dirty(inode);
 
-	de->uid = inode->i_uid;
-	de->gid = inode->i_gid;
+	proc_set_user(de, inode->i_uid, inode->i_gid);
 	de->mode = inode->i_mode;
 	return 0;
 }
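proc_set_user() keeps callers from poking struct proc_dir_entry fields directly. The helper itself is a thin setter, essentially (as defined in fs/proc/generic.c):

    void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid)
    {
    	de->uid = uid;
    	de->gid = gid;
    }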
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
index 05029c0e2f24..a352d5703b41 100644
--- a/fs/proc/interrupts.c
+++ b/fs/proc/interrupts.c
@@ -50,4 +50,4 @@ static int __init proc_interrupts_init(void)
 	proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
 	return 0;
 }
-module_init(proc_interrupts_init);
+fs_initcall(proc_interrupts_init);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 5ed0e52d6aa0..39e6ef32f0bd 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -639,4 +639,4 @@ static int __init proc_kcore_init(void)
 
 	return 0;
 }
-module_init(proc_kcore_init);
+fs_initcall(proc_kcore_init);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index bdfabdaefdce..05f8dcdb086e 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -61,4 +61,4 @@ static int __init proc_kmsg_init(void)
 	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
 	return 0;
 }
-module_init(proc_kmsg_init);
+fs_initcall(proc_kmsg_init);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 1afa4dd4cae2..aec66e6c2060 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -42,4 +42,4 @@ static int __init proc_loadavg_init(void)
 	proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
 	return 0;
 }
-module_init(proc_loadavg_init);
+fs_initcall(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a77d2b299199..136e548d9567 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -26,7 +26,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	unsigned long committed;
 	struct vmalloc_info vmi;
 	long cached;
+	long available;
+	unsigned long pagecache;
+	unsigned long wmark_low = 0;
 	unsigned long pages[NR_LRU_LISTS];
+	struct zone *zone;
 	int lru;
 
 /*
@@ -47,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
 		pages[lru] = global_page_state(NR_LRU_BASE + lru);
 
+	for_each_zone(zone)
+		wmark_low += zone->watermark[WMARK_LOW];
+
+	/*
+	 * Estimate the amount of memory available for userspace allocations,
+	 * without causing swapping.
+	 *
+	 * Free memory cannot be taken below the low watermark, before the
+	 * system starts swapping.
+	 */
+	available = i.freeram - wmark_low;
+
+	/*
+	 * Not all the page cache can be freed, otherwise the system will
+	 * start swapping. Assume at least half of the page cache, or the
+	 * low watermark worth of cache, needs to stay.
+	 */
+	pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+	pagecache -= min(pagecache / 2, wmark_low);
+	available += pagecache;
+
+	/*
+	 * Part of the reclaimable slab consists of items that are in use,
+	 * and cannot be freed. Cap this estimate at the low watermark.
+	 */
+	available += global_page_state(NR_SLAB_RECLAIMABLE) -
+		     min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+	if (available < 0)
+		available = 0;
+
 	/*
 	 * Tagged format, for easy grepping and expansion.
 	 */
 	seq_printf(m,
 		"MemTotal:       %8lu kB\n"
 		"MemFree:        %8lu kB\n"
+		"MemAvailable:   %8lu kB\n"
 		"Buffers:        %8lu kB\n"
 		"Cached:         %8lu kB\n"
 		"SwapCached:     %8lu kB\n"
@@ -105,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		,
 		K(i.totalram),
 		K(i.freeram),
+		K(available),
 		K(i.bufferram),
 		K(cached),
 		K(total_swapcache_pages()),
@@ -183,4 +220,4 @@ static int __init proc_meminfo_init(void)
 	proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
 	return 0;
 }
-module_init(proc_meminfo_init);
+fs_initcall(proc_meminfo_init);
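The new MemAvailable estimate can be roughly reproduced from the other exported fields. A runnable userspace sketch follows; the zone low-watermark term is not visible from userspace, so it is assumed to be zero here, which gives a slight overestimate:

    #include <stdio.h>
    #include <string.h>

    static long field(const char *name)
    {
    	char line[256];
    	long val = 0;
    	FILE *f = fopen("/proc/meminfo", "r");

    	if (!f)
    		return 0;
    	while (fgets(line, sizeof(line), f))
    		if (!strncmp(line, name, strlen(name))) {
    			sscanf(line + strlen(name), " %ld", &val);
    			break;
    		}
    	fclose(f);
    	return val;	/* value in kB */
    }

    static long min2(long a, long b) { return a < b ? a : b; }

    int main(void)
    {
    	long wmark_low = 0;	/* assumption: watermarks are not exported */
    	long pagecache = field("Active(file):") + field("Inactive(file):");
    	long slab = field("SReclaimable:");
    	long avail = field("MemFree:") - wmark_low;

    	pagecache -= min2(pagecache / 2, wmark_low);
    	avail += pagecache + slab - min2(slab / 2, wmark_low);
    	if (avail < 0)
    		avail = 0;
    	printf("estimated: %ld kB, kernel: %ld kB\n",
    	       avail, field("MemAvailable:"));
    	return 0;
    }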
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 5f9bc8a746c9..d4a35746cab9 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -131,4 +131,4 @@ static int __init proc_nommu_init(void)
 	return 0;
 }
 
-module_init(proc_nommu_init);
+fs_initcall(proc_nommu_init);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b8730d9ebaee..e647c55275d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -118,10 +118,11 @@ u64 stable_page_flags(struct page *page)
 	/*
 	 * PageTransCompound can be true for non-huge compound pages (slab
 	 * pages or pages allocated by drivers with __GFP_COMP) because it
-	 * just checks PG_head/PG_tail, so we need to check PageLRU to make
-	 * sure a given page is a thp, not a non-huge compound page.
+	 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
+	 * to make sure a given page is a thp, not a non-huge compound page.
 	 */
-	else if (PageTransCompound(page) && PageLRU(compound_trans_head(page)))
+	else if (PageTransCompound(page) && (PageLRU(compound_head(page)) ||
+					     PageAnon(compound_head(page))))
 		u |= 1 << KPF_THP;
 
 	/*
@@ -217,4 +218,4 @@ static int __init proc_page_init(void)
 	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
 	return 0;
 }
-module_init(proc_page_init);
+fs_initcall(proc_page_init);
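The widened THP test can be read as a predicate: a compound page only counts as a transparent huge page for kpageflags if its head page is on the LRU or is anonymous, which still filters out slab and driver __GFP_COMP allocations but, unlike the old PageLRU-only check, also accepts anonymous THPs that are temporarily off the LRU. A hedged restatement as a helper (the name page_is_thp() is ours, not the kernel's):

    /* sketch only; mirrors the condition used in stable_page_flags() above */
    static inline bool page_is_thp(struct page *page)
    {
    	struct page *head = compound_head(page);

    	return PageTransCompound(page) &&
    	       (PageLRU(head) || PageAnon(head));
    }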
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index 70779b2fc209..c82dd5147845 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -74,9 +74,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
 		return NULL;
 
 	if (!strncmp(name, "security-", 9))
-		ent->size = 0; /* don't leak number of password chars */
+		proc_set_size(ent, 0); /* don't leak number of password chars */
 	else
-		ent->size = pp->length;
+		proc_set_size(ent, pp->length);
 
 	return ent;
 }
@@ -232,6 +232,7 @@ void __init proc_device_tree_init(void)
 		return;
 	root = of_find_node_by_path("/");
 	if (root == NULL) {
+		remove_proc_entry("device-tree", NULL);
 		pr_debug("/proc/device-tree: can't find root\n");
 		return;
 	}
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 62604be9f58d..ad8a77f94beb 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -41,4 +41,4 @@ static int __init proc_softirqs_init(void)
 	proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
 	return 0;
 }
-module_init(proc_softirqs_init);
+fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 1cf86c0e8689..6f599c62f0cc 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -221,4 +221,4 @@ static int __init proc_stat_init(void)
 	proc_create("stat", 0, NULL, &proc_stat_operations);
 	return 0;
 }
-module_init(proc_stat_init);
+fs_initcall(proc_stat_init);
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 061894625903..7141b8d0ca9e 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -49,4 +49,4 @@ static int __init proc_uptime_init(void)
 	proc_create("uptime", 0, NULL, &uptime_proc_fops);
 	return 0;
 }
-module_init(proc_uptime_init);
+fs_initcall(proc_uptime_init);
diff --git a/fs/proc/version.c b/fs/proc/version.c
index 76817a60678c..d2154eb6d78f 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -31,4 +31,4 @@ static int __init proc_version_init(void)
 	proc_create("version", 0, NULL, &version_proc_fops);
 	return 0;
 }
-module_init(proc_version_init);
+fs_initcall(proc_version_init);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9100d6959886..88d4585b30f1 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -468,17 +468,24 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
 			return rc;
 		}
 		nhdr_ptr = notes_section;
-		while (real_sz < max_sz) {
-			if (nhdr_ptr->n_namesz == 0)
-				break;
+		while (nhdr_ptr->n_namesz != 0) {
 			sz = sizeof(Elf64_Nhdr) +
 				((nhdr_ptr->n_namesz + 3) & ~3) +
 				((nhdr_ptr->n_descsz + 3) & ~3);
+			if ((real_sz + sz) > max_sz) {
+				pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
+					nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
+				break;
+			}
 			real_sz += sz;
 			nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
 		}
 		kfree(notes_section);
 		phdr_ptr->p_memsz = real_sz;
+		if (real_sz == 0) {
+			pr_warn("Warning: Zero PT_NOTE entries found\n");
+			return -EINVAL;
+		}
 	}
 
 	return 0;
@@ -648,17 +655,24 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
 			return rc;
 		}
 		nhdr_ptr = notes_section;
-		while (real_sz < max_sz) {
-			if (nhdr_ptr->n_namesz == 0)
-				break;
+		while (nhdr_ptr->n_namesz != 0) {
 			sz = sizeof(Elf32_Nhdr) +
 				((nhdr_ptr->n_namesz + 3) & ~3) +
 				((nhdr_ptr->n_descsz + 3) & ~3);
+			if ((real_sz + sz) > max_sz) {
+				pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
+					nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
+				break;
+			}
 			real_sz += sz;
 			nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
 		}
 		kfree(notes_section);
 		phdr_ptr->p_memsz = real_sz;
+		if (real_sz == 0) {
+			pr_warn("Warning: Zero PT_NOTE entries found\n");
+			return -EINVAL;
+		}
 	}
 
 	return 0;
@@ -1082,7 +1096,7 @@ static int __init vmcore_init(void)
 	proc_vmcore->size = vmcore_size;
 	return 0;
 }
-module_init(vmcore_init)
+fs_initcall(vmcore_init);
 
 /* Cleanup function for vmcore module. */
 void vmcore_cleanup(void)
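The reworked PT_NOTE walk trusts the note headers themselves rather than p_memsz: it stops at a terminating zero n_namesz and, as the fix adds, refuses to account a note that would overrun the buffer. The same logic, isolated as a userspace function over an in-memory buffer (a bounds check on the header read is added here for safety; only <elf.h> types are assumed):

    #include <elf.h>
    #include <stddef.h>

    /* Returns the byte size of the valid notes in buf; 0 if none. */
    static size_t notes_real_size(const void *buf, size_t max_sz)
    {
    	const Elf64_Nhdr *n = buf;
    	size_t real_sz = 0;

    	while (real_sz + sizeof(*n) <= max_sz && n->n_namesz != 0) {
    		size_t sz = sizeof(*n) +
    			    ((n->n_namesz + 3) & ~3UL) +	/* 4-byte aligned name */
    			    ((n->n_descsz + 3) & ~3UL);	/* 4-byte aligned desc */

    		if (real_sz + sz > max_sz)	/* corrupt entry: drop it */
    			break;
    		real_sz += sz;
    		n = (const Elf64_Nhdr *)((const char *)n + sz);
    	}
    	return real_sz;
    }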
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 439406e081af..7be26f03a3f5 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -234,17 +234,12 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 
 	rcu_read_lock();
 	nsp = task_nsproxy(task);
-	if (!nsp) {
+	if (!nsp || !nsp->mnt_ns) {
 		rcu_read_unlock();
 		put_task_struct(task);
 		goto err;
 	}
 	ns = nsp->mnt_ns;
-	if (!ns) {
-		rcu_read_unlock();
-		put_task_struct(task);
-		goto err;
-	}
 	get_mnt_ns(ns);
 	rcu_read_unlock();
 	task_lock(task);
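Folding the two NULL tests is safe because both loads already happen under the same rcu_read_lock() section; only the duplicated unlock/put error path goes away. The shape of the check, pulled out as a helper with a hypothetical name:

    /* sketch; mirrors the consolidated check in mounts_open_common() above */
    static struct mnt_namespace *task_grab_mnt_ns(struct task_struct *task)
    {
    	struct nsproxy *nsp;
    	struct mnt_namespace *ns = NULL;

    	rcu_read_lock();
    	nsp = task_nsproxy(task);
    	if (nsp && nsp->mnt_ns) {
    		ns = nsp->mnt_ns;
    		get_mnt_ns(ns);		/* pin before leaving the RCU section */
    	}
    	rcu_read_unlock();
    	return ns;			/* NULL if the task is exiting */
    }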
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2e8caa62da78..89558810381c 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -27,7 +27,6 @@
 
 static const struct super_operations qnx4_sops;
 
-static void qnx4_put_super(struct super_block *sb);
 static struct inode *qnx4_alloc_inode(struct super_block *sb);
 static void qnx4_destroy_inode(struct inode *inode);
 static int qnx4_remount(struct super_block *sb, int *flags, char *data);
@@ -37,7 +36,6 @@ static const struct super_operations qnx4_sops =
 {
 	.alloc_inode	= qnx4_alloc_inode,
 	.destroy_inode	= qnx4_destroy_inode,
-	.put_super	= qnx4_put_super,
 	.statfs		= qnx4_statfs,
 	.remount_fs	= qnx4_remount,
 };
@@ -148,18 +146,19 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
  * it really _is_ a qnx4 filesystem, and to check the size
  * of the directory entry.
  */
-static const char *qnx4_checkroot(struct super_block *sb)
+static const char *qnx4_checkroot(struct super_block *sb,
+				  struct qnx4_super_block *s)
 {
 	struct buffer_head *bh;
 	struct qnx4_inode_entry *rootdir;
 	int rd, rl;
 	int i, j;
 
-	if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/')
+	if (s->RootDir.di_fname[0] != '/' || s->RootDir.di_fname[1] != '\0')
 		return "no qnx4 filesystem (no root dir).";
 	QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
-	rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
-	rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
+	rd = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_blk) - 1;
+	rl = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_size);
 	for (j = 0; j < rl; j++) {
 		bh = sb_bread(sb, rd + j);	/* root dir, first block */
 		if (bh == NULL)
@@ -189,7 +188,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	struct inode *root;
 	const char *errmsg;
 	struct qnx4_sb_info *qs;
-	int ret = -EINVAL;
 
 	qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
 	if (!qs)
@@ -198,67 +196,50 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 
 	sb_set_blocksize(s, QNX4_BLOCK_SIZE);
 
+	s->s_op = &qnx4_sops;
+	s->s_magic = QNX4_SUPER_MAGIC;
+	s->s_flags |= MS_RDONLY;	/* Yup, read-only yet */
+
 	/* Check the superblock signature. Since the qnx4 code is
 	   dangerous, we should leave as quickly as possible
 	   if we don't belong here... */
 	bh = sb_bread(s, 1);
 	if (!bh) {
 		printk(KERN_ERR "qnx4: unable to read the superblock\n");
-		goto outnobh;
+		return -EINVAL;
 	}
-	if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) {
-		if (!silent)
-			printk(KERN_ERR "qnx4: wrong fsid in superblock.\n");
-		goto out;
-	}
-	s->s_op = &qnx4_sops;
-	s->s_magic = QNX4_SUPER_MAGIC;
-	s->s_flags |= MS_RDONLY;	/* Yup, read-only yet */
-	qnx4_sb(s)->sb_buf = bh;
-	qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data;
-
 
 	/* check before allocating dentries, inodes, .. */
-	errmsg = qnx4_checkroot(s);
+	errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data);
+	brelse(bh);
 	if (errmsg != NULL) {
 		if (!silent)
 			printk(KERN_ERR "qnx4: %s\n", errmsg);
-		goto out;
+		return -EINVAL;
 	}
 
 	/* does root not have inode number QNX4_ROOT_INO ?? */
 	root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK);
 	if (IS_ERR(root)) {
 		printk(KERN_ERR "qnx4: get inode failed\n");
-		ret = PTR_ERR(root);
-		goto outb;
+		return PTR_ERR(root);
 	}
 
-	ret = -ENOMEM;
 	s->s_root = d_make_root(root);
 	if (s->s_root == NULL)
-		goto outb;
+		return -ENOMEM;
 
-	brelse(bh);
 	return 0;
-
-      outb:
-	kfree(qs->BitMap);
-      out:
-	brelse(bh);
-      outnobh:
-	kfree(qs);
-	s->s_fs_info = NULL;
-	return ret;
 }
 
-static void qnx4_put_super(struct super_block *sb)
+static void qnx4_kill_sb(struct super_block *sb)
 {
 	struct qnx4_sb_info *qs = qnx4_sb(sb);
-	kfree( qs->BitMap );
-	kfree( qs );
-	sb->s_fs_info = NULL;
-	return;
+	kill_block_super(sb);
+	if (qs) {
+		kfree(qs->BitMap);
+		kfree(qs);
+	}
 }
 
 static int qnx4_readpage(struct file *file, struct page *page)
@@ -409,7 +390,7 @@ static struct file_system_type qnx4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "qnx4",
 	.mount		= qnx4_mount,
-	.kill_sb	= kill_block_super,
+	.kill_sb	= qnx4_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 MODULE_ALIAS_FS("qnx4");
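Replacing ->put_super with a ->kill_sb wrapper is a common simplification: kill_block_super() performs the full generic teardown first, after which the private sb_info can be freed unconditionally, so every error path in fill_super can just return. The pattern in general form, with hypothetical foofs_* names standing in:

    static void foofs_kill_sb(struct super_block *sb)
    {
    	struct foofs_sb_info *sbi = sb->s_fs_info;

    	kill_block_super(sb);	/* generic teardown, also after a failed mount */
    	kfree(sbi);		/* kfree(NULL) is a no-op */
    }

    static struct file_system_type foofs_fs_type = {
    	.owner		= THIS_MODULE,
    	.name		= "foofs",
    	.mount		= foofs_mount,		/* hypothetical */
    	.kill_sb	= foofs_kill_sb,
    	.fs_flags	= FS_REQUIRES_DEV,
    };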
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
index 34e2d329c97e..c9b1be2c164d 100644
--- a/fs/qnx4/qnx4.h
+++ b/fs/qnx4/qnx4.h
@@ -10,8 +10,6 @@
 #endif
 
 struct qnx4_sb_info {
-	struct buffer_head	*sb_buf;	/* superblock buffer */
-	struct qnx4_super_block	*sb;		/* our superblock */
 	unsigned int		Version;	/* may be useful */
 	struct qnx4_inode_entry	*BitMap;	/* useful */
 };
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 831d49a4111f..cfc8dcc16043 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -581,9 +581,17 @@ int dquot_scan_active(struct super_block *sb,
 		dqstats_inc(DQST_LOOKUPS);
 		dqput(old_dquot);
 		old_dquot = dquot;
-		ret = fn(dquot, priv);
-		if (ret < 0)
-			goto out;
+		/*
+		 * ->release_dquot() can be racing with us. Our reference
+		 * protects us from new calls to it so just wait for any
+		 * outstanding call and recheck the DQ_ACTIVE_B after that.
+		 */
+		wait_on_dquot(dquot);
+		if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+			ret = fn(dquot, priv);
+			if (ret < 0)
+				goto out;
+		}
 		spin_lock(&dq_list_lock);
 		/* We are safe to continue now because our dquot could not
 		 * be moved out of the inuse list while we hold the reference */
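The dquot fix is a wait-then-recheck: holding a reference keeps the dquot from being freed, but not from being deactivated by a racing ->release_dquot(), so the scan must wait the release out and re-test DQ_ACTIVE_B before calling into the filesystem. Reduced to its skeleton (illustrative fragment, not compilable on its own):

    /* inside the scan loop, dquot referenced and on the inuse list */
    wait_on_dquot(dquot);				/* let a racing release finish */
    if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {	/* still active afterwards? */
    	ret = fn(dquot, priv);			/* only then run the callback */
    	if (ret < 0)
    		goto out;
    }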
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 4884ac5ae9be..1e56a4e8cf7c 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -30,13 +30,6 @@
 
 #include "internal.h"
 
-const struct address_space_operations ramfs_aops = {
-	.readpage	= simple_readpage,
-	.write_begin	= simple_write_begin,
-	.write_end	= simple_write_end,
-	.set_page_dirty	= __set_page_dirty_no_writeback,
-};
-
 const struct file_operations ramfs_file_operations = {
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 8d5b438cc188..0b3d8e4cb2fa 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -27,13 +27,12 @@
 #include "internal.h"
 
 static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
-
-const struct address_space_operations ramfs_aops = {
-	.readpage	= simple_readpage,
-	.write_begin	= simple_write_begin,
-	.write_end	= simple_write_end,
-	.set_page_dirty	= __set_page_dirty_no_writeback,
-};
+static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+						   unsigned long addr,
+						   unsigned long len,
+						   unsigned long pgoff,
+						   unsigned long flags);
+static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
 
 const struct file_operations ramfs_file_operations = {
 	.mmap		= ramfs_nommu_mmap,
@@ -197,7 +196,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
  * - the pages to be mapped must exist
  * - the pages be physically contiguous in sequence
  */
-unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 					    unsigned long addr, unsigned long len,
 					    unsigned long pgoff, unsigned long flags)
 {
@@ -256,7 +255,7 @@ out:
 /*
  * set up a mapping for shared memory segments
  */
-int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
+static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	if (!(vma->vm_flags & VM_SHARED))
 		return -ENOSYS;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 39d14659a8d3..d365b1c4eb3c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -43,6 +43,13 @@
 static const struct super_operations ramfs_ops;
 static const struct inode_operations ramfs_dir_inode_operations;
 
+static const struct address_space_operations ramfs_aops = {
+	.readpage	= simple_readpage,
+	.write_begin	= simple_write_begin,
+	.write_end	= simple_write_end,
+	.set_page_dirty	= __set_page_dirty_no_writeback,
+};
+
 static struct backing_dev_info ramfs_backing_dev_info = {
 	.name		= "ramfs",
 	.ra_pages	= 0,	/* No readahead */
@@ -275,4 +282,4 @@ int __init init_ramfs_fs(void)
 
 	return err;
 }
-module_init(init_ramfs_fs)
+fs_initcall(init_ramfs_fs);
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
index 6b330639b51d..a9d8ae88fa15 100644
--- a/fs/ramfs/internal.h
+++ b/fs/ramfs/internal.h
@@ -10,5 +10,4 @@
  */
 
 
-extern const struct address_space_operations ramfs_aops;
 extern const struct inode_operations ramfs_file_inode_operations;
diff --git a/fs/read_write.c b/fs/read_write.c
index 58e440df1bc6..28cc9c810744 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -264,10 +264,22 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
 }
 EXPORT_SYMBOL(vfs_llseek);
 
+static inline struct fd fdget_pos(int fd)
+{
+	return __to_fd(__fdget_pos(fd));
+}
+
+static inline void fdput_pos(struct fd f)
+{
+	if (f.flags & FDPUT_POS_UNLOCK)
+		mutex_unlock(&f.file->f_pos_lock);
+	fdput(f);
+}
+
 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 {
 	off_t retval;
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	if (!f.file)
 		return -EBADF;
 
@@ -278,7 +290,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
 		if (res != (loff_t)retval)
 			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
 	}
-	fdput(f);
+	fdput_pos(f);
 	return retval;
 }
 
@@ -295,7 +307,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 		unsigned int, whence)
 {
 	int retval;
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	loff_t offset;
 
 	if (!f.file)
@@ -315,7 +327,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 		retval = 0;
 	}
 out_putf:
-	fdput(f);
+	fdput_pos(f);
 	return retval;
 }
 #endif
@@ -498,7 +510,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
 
 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
@@ -506,7 +518,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 		ret = vfs_read(f.file, buf, count, &pos);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
-		fdput(f);
+		fdput_pos(f);
 	}
 	return ret;
 }
@@ -514,7 +526,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 		size_t, count)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
@@ -522,7 +534,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 		ret = vfs_write(f.file, buf, count, &pos);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
-		fdput(f);
+		fdput_pos(f);
 	}
 
 	return ret;
@@ -797,7 +809,7 @@ EXPORT_SYMBOL(vfs_writev);
 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 		unsigned long, vlen)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
@@ -805,7 +817,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 		ret = vfs_readv(f.file, vec, vlen, &pos);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
-		fdput(f);
+		fdput_pos(f);
 	}
 
 	if (ret > 0)
@@ -817,7 +829,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 		unsigned long, vlen)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret = -EBADF;
 
 	if (f.file) {
@@ -825,7 +837,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 		ret = vfs_writev(f.file, vec, vlen, &pos);
 		if (ret >= 0)
 			file_pos_write(f.file, pos);
-		fdput(f);
+		fdput_pos(f);
 	}
 
 	if (ret > 0)
@@ -901,10 +913,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 	io_fn_t fn;
 	iov_fn_t fnv;
 
-	ret = -EFAULT;
-	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
-		goto out;
-
 	ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
 					   UIO_FASTIOV, iovstack, &iov);
 	if (ret <= 0)
@@ -968,11 +976,11 @@ out:
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
 		const struct compat_iovec __user *,vec,
-		unsigned long, vlen)
+		compat_ulong_t, vlen)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret;
 	loff_t pos;
 
@@ -982,7 +990,7 @@ COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
 	ret = compat_readv(f.file, vec, vlen, &pos);
 	if (ret >= 0)
 		f.file->f_pos = pos;
-	fdput(f);
+	fdput_pos(f);
 	return ret;
 }
 
@@ -1005,9 +1013,9 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
 		const struct compat_iovec __user *,vec,
-		unsigned long, vlen, u32, pos_low, u32, pos_high)
+		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 	return compat_sys_preadv64(fd, vec, vlen, pos);
@@ -1035,11 +1043,11 @@ out:
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
 		const struct compat_iovec __user *, vec,
-		unsigned long, vlen)
+		compat_ulong_t, vlen)
 {
-	struct fd f = fdget(fd);
+	struct fd f = fdget_pos(fd);
 	ssize_t ret;
 	loff_t pos;
 
@@ -1049,7 +1057,7 @@ COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
 	ret = compat_writev(f.file, vec, vlen, &pos);
 	if (ret >= 0)
 		f.file->f_pos = pos;
-	fdput(f);
+	fdput_pos(f);
 	return ret;
 }
 
@@ -1072,9 +1080,9 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
 		const struct compat_iovec __user *,vec,
-		unsigned long, vlen, u32, pos_low, u32, pos_high)
+		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 	return compat_sys_pwritev64(fd, vec, vlen, pos);
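All of the read/write/seek entry points above switch from fdget() to fdget_pos(), which (via __fdget_pos()) takes f_pos_lock when the struct file is shared, making the read-modify-write of the file position atomic against concurrent syscalls on the same open file. The resulting shape of such a syscall, sketched with a hypothetical do_foo() standing in for the real vfs_* call:

    SYSCALL_DEFINE3(foo, unsigned int, fd, char __user *, buf, size_t, count)
    {
    	struct fd f = fdget_pos(fd);	/* may take f.file->f_pos_lock */
    	ssize_t ret = -EBADF;

    	if (f.file) {
    		loff_t pos = f.file->f_pos;	/* stable while the lock is held */
    		ret = do_foo(f.file, buf, count, &pos);
    		if (ret >= 0)
    			f.file->f_pos = pos;
    		fdput_pos(f);		/* unlocks if fdget_pos() locked */
    	}
    	return ret;
    }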
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
index f096b80e73d8..4a211f5b34b8 100644
--- a/fs/reiserfs/acl.h
+++ b/fs/reiserfs/acl.h
@@ -48,18 +48,18 @@ static inline int reiserfs_acl_count(size_t size)
 
 #ifdef CONFIG_REISERFS_FS_POSIX_ACL
 struct posix_acl *reiserfs_get_acl(struct inode *inode, int type);
+int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
 int reiserfs_acl_chmod(struct inode *inode);
 int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
 				 struct inode *dir, struct dentry *dentry,
 				 struct inode *inode);
 int reiserfs_cache_default_acl(struct inode *dir);
-extern const struct xattr_handler reiserfs_posix_acl_default_handler;
-extern const struct xattr_handler reiserfs_posix_acl_access_handler;
 
 #else
 
 #define reiserfs_cache_default_acl(inode) 0
 #define reiserfs_get_acl NULL
+#define reiserfs_set_acl NULL
 
 static inline int reiserfs_acl_chmod(struct inode *inode)
 {
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 2b7882b508db..9a3c68cf6026 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -324,23 +324,17 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 	switch (flag) {
 	case M_INSERT:	/* insert item into L[0] */
 
-		if (item_pos == tb->lnum[0] - 1
-		    && tb->lbytes != -1) {
+		if (item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
 			/* part of new item falls into L[0] */
 			int new_item_len;
 			int version;
 
-			ret_val =
-			    leaf_shift_left(tb, tb->lnum[0] - 1,
-					    -1);
+			ret_val = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
 
 			/* Calculate item length to insert to S[0] */
-			new_item_len =
-			    ih_item_len(ih) - tb->lbytes;
+			new_item_len = ih_item_len(ih) - tb->lbytes;
 			/* Calculate and check item length to insert to L[0] */
-			put_ih_item_len(ih,
-					ih_item_len(ih) -
-					new_item_len);
+			put_ih_item_len(ih, ih_item_len(ih) - new_item_len);
 
 			RFALSE(ih_item_len(ih) <= 0,
 			       "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d",
@@ -349,30 +343,18 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 			/* Insert new item into L[0] */
 			buffer_info_init_left(tb, &bi);
 			leaf_insert_into_buf(&bi,
-					     n + item_pos -
-					     ret_val, ih, body,
-					     zeros_num >
-					     ih_item_len(ih) ?
-					     ih_item_len(ih) :
-					     zeros_num);
+					n + item_pos - ret_val, ih, body,
+					zeros_num > ih_item_len(ih) ? ih_item_len(ih) : zeros_num);
 
 			version = ih_version(ih);
 
 			/* Calculate key component, item length and body to insert into S[0] */
-			set_le_ih_k_offset(ih,
-					   le_ih_k_offset(ih) +
-					   (tb->
-					    lbytes <<
-					    (is_indirect_le_ih
-					     (ih) ? tb->tb_sb->
-					     s_blocksize_bits -
-					     UNFM_P_SHIFT :
-					     0)));
+			set_le_ih_k_offset(ih, le_ih_k_offset(ih) +
+					(tb-> lbytes << (is_indirect_le_ih(ih) ? tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT : 0)));
 
 			put_ih_item_len(ih, new_item_len);
 			if (tb->lbytes > zeros_num) {
-				body +=
-				    (tb->lbytes - zeros_num);
+				body += (tb->lbytes - zeros_num);
 				zeros_num = 0;
 			} else
 				zeros_num -= tb->lbytes;
@@ -383,15 +365,10 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 		} else {
 			/* new item in whole falls into L[0] */
 			/* Shift lnum[0]-1 items to L[0] */
-			ret_val =
-			    leaf_shift_left(tb, tb->lnum[0] - 1,
-					    tb->lbytes);
+			ret_val = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes);
 			/* Insert new item into L[0] */
 			buffer_info_init_left(tb, &bi);
-			leaf_insert_into_buf(&bi,
-					     n + item_pos -
-					     ret_val, ih, body,
-					     zeros_num);
+			leaf_insert_into_buf(&bi, n + item_pos - ret_val, ih, body, zeros_num);
 			tb->insert_size[0] = 0;
 			zeros_num = 0;
 		}
@@ -399,264 +376,117 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 
 	case M_PASTE:	/* append item in L[0] */
 
-		if (item_pos == tb->lnum[0] - 1
-		    && tb->lbytes != -1) {
+		if (item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
 			/* we must shift the part of the appended item */
-			if (is_direntry_le_ih
-			    (B_N_PITEM_HEAD(tbS0, item_pos))) {
+			if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) {
 
 				RFALSE(zeros_num,
 				       "PAP-12090: invalid parameter in case of a directory");
 				/* directory item */
 				if (tb->lbytes > pos_in_item) {
 					/* new directory entry falls into L[0] */
-					struct item_head
-					    *pasted;
-					int l_pos_in_item =
-					    pos_in_item;
+					struct item_head *pasted;
+					int l_pos_in_item = pos_in_item;
 
 					/* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */
-					ret_val =
-					    leaf_shift_left(tb,
-							    tb->
-							    lnum
-							    [0],
-							    tb->
-							    lbytes
-							    -
-							    1);
-					if (ret_val
-					    && !item_pos) {
-						pasted =
-						    B_N_PITEM_HEAD
-						    (tb->L[0],
-						     B_NR_ITEMS
-						     (tb->
-						      L[0]) -
-						     1);
-						l_pos_in_item +=
-						    I_ENTRY_COUNT
-						    (pasted) -
-						    (tb->
-						     lbytes -
-						     1);
+					ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes-1);
+					if (ret_val && !item_pos) {
+						pasted = B_N_PITEM_HEAD(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1);
+						l_pos_in_item += I_ENTRY_COUNT(pasted) - (tb->lbytes -1);
 					}
 
 					/* Append given directory entry to directory item */
 					buffer_info_init_left(tb, &bi);
-					leaf_paste_in_buffer
-					    (&bi,
-					     n + item_pos -
-					     ret_val,
-					     l_pos_in_item,
-					     tb->insert_size[0],
-					     body, zeros_num);
+					leaf_paste_in_buffer(&bi, n + item_pos - ret_val, l_pos_in_item, tb->insert_size[0], body, zeros_num);
 
 					/* previous string prepared space for pasting new entry, following string pastes this entry */
 
 					/* when we have merge directory item, pos_in_item has been changed too */
 
 					/* paste new directory entry. 1 is entry number */
-					leaf_paste_entries(&bi,
-							   n +
-							   item_pos
-							   -
-							   ret_val,
-							   l_pos_in_item,
-							   1,
-							   (struct
-							    reiserfs_de_head
-							    *)
-							   body,
-							   body
-							   +
-							   DEH_SIZE,
-							   tb->
-							   insert_size
-							   [0]
-					    );
+					leaf_paste_entries(&bi, n + item_pos - ret_val, l_pos_in_item,
+							   1, (struct reiserfs_de_head *) body,
+							   body + DEH_SIZE, tb->insert_size[0]);
 					tb->insert_size[0] = 0;
 				} else {
 					/* new directory item doesn't fall into L[0] */
 					/* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */
-					leaf_shift_left(tb,
-							tb->
-							lnum[0],
-							tb->
-							lbytes);
+					leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
 				}
 				/* Calculate new position to append in item body */
 				pos_in_item -= tb->lbytes;
 			} else {
 				/* regular object */
-				RFALSE(tb->lbytes <= 0,
-				       "PAP-12095: there is nothing to shift to L[0]. lbytes=%d",
-				       tb->lbytes);
-				RFALSE(pos_in_item !=
-				       ih_item_len
-				       (B_N_PITEM_HEAD
-					(tbS0, item_pos)),
+				RFALSE(tb->lbytes <= 0, "PAP-12095: there is nothing to shift to L[0]. lbytes=%d", tb->lbytes);
+				RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)),
 				       "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d",
-				       ih_item_len
-				       (B_N_PITEM_HEAD
-					(tbS0, item_pos)),
-				       pos_in_item);
+				       ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)),pos_in_item);
 
 				if (tb->lbytes >= pos_in_item) {
 					/* appended item will be in L[0] in whole */
 					int l_n;
 
 					/* this bytes number must be appended to the last item of L[h] */
-					l_n =
-					    tb->lbytes -
-					    pos_in_item;
+					l_n = tb->lbytes - pos_in_item;
 
 					/* Calculate new insert_size[0] */
-					tb->insert_size[0] -=
-					    l_n;
+					tb->insert_size[0] -= l_n;
 
-					RFALSE(tb->
-					       insert_size[0] <=
-					       0,
+					RFALSE(tb->insert_size[0] <= 0,
 					       "PAP-12105: there is nothing to paste into L[0]. insert_size=%d",
-					       tb->
-					       insert_size[0]);
-					ret_val =
-					    leaf_shift_left(tb,
-							    tb->
-							    lnum
-							    [0],
-							    ih_item_len
-							    (B_N_PITEM_HEAD
-							     (tbS0,
-							      item_pos)));
+					       tb->insert_size[0]);
+					ret_val = leaf_shift_left(tb, tb->lnum[0], ih_item_len
+							    (B_N_PITEM_HEAD(tbS0, item_pos)));
 					/* Append to body of item in L[0] */
 					buffer_info_init_left(tb, &bi);
 					leaf_paste_in_buffer
-					    (&bi,
-					     n + item_pos -
-					     ret_val,
-					     ih_item_len
-					     (B_N_PITEM_HEAD
-					      (tb->L[0],
-					       n + item_pos -
-					       ret_val)), l_n,
-					     body,
-					     zeros_num >
-					     l_n ? l_n :
-					     zeros_num);
+					    (&bi, n + item_pos - ret_val, ih_item_len
+					     (B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val)),
+					     l_n, body,
+					     zeros_num > l_n ? l_n : zeros_num);
 					/* 0-th item in S0 can be only of DIRECT type when l_n != 0 */
 					{
 						int version;
-						int temp_l =
-						    l_n;
-
-						RFALSE
-						    (ih_item_len
-						     (B_N_PITEM_HEAD
-						      (tbS0,
-						       0)),
+						int temp_l = l_n;
+
+						RFALSE(ih_item_len(B_N_PITEM_HEAD(tbS0, 0)),
 						     "PAP-12106: item length must be 0");
-						RFALSE
-						    (comp_short_le_keys
-						     (B_N_PKEY
-						      (tbS0, 0),
-						      B_N_PKEY
-						      (tb->L[0],
-						       n +
-						       item_pos
-						       -
-						       ret_val)),
+						RFALSE(comp_short_le_keys(B_N_PKEY(tbS0, 0), B_N_PKEY
+						     (tb->L[0], n + item_pos - ret_val)),
 						     "PAP-12107: items must be of the same file");
 						if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val))) {
-							temp_l =
-							    l_n
-							    <<
-							    (tb->
-							     tb_sb->
-							     s_blocksize_bits
-							     -
-							     UNFM_P_SHIFT);
+							temp_l = l_n << (tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT);
 						}
 						/* update key of first item in S0 */
-						version =
-						    ih_version
-						    (B_N_PITEM_HEAD
-						     (tbS0, 0));
-						set_le_key_k_offset
-						    (version,
-						     B_N_PKEY
-						     (tbS0, 0),
-						     le_key_k_offset
-						     (version,
-						      B_N_PKEY
-						      (tbS0,
-						       0)) +
-						     temp_l);
+						version = ih_version(B_N_PITEM_HEAD(tbS0, 0));
+						set_le_key_k_offset(version, B_N_PKEY(tbS0, 0),
+						     le_key_k_offset(version,B_N_PKEY(tbS0, 0)) + temp_l);
 						/* update left delimiting key */
-						set_le_key_k_offset
-						    (version,
-						     B_N_PDELIM_KEY
-						     (tb->
-						      CFL[0],
-						      tb->
-						      lkey[0]),
-						     le_key_k_offset
-						     (version,
-						      B_N_PDELIM_KEY
-						      (tb->
-						       CFL[0],
-						       tb->
-						       lkey[0]))
-						     + temp_l);
+						set_le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]),
+						     le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0])) + temp_l);
 					}
 
 					/* Calculate new body, position in item and insert_size[0] */
 					if (l_n > zeros_num) {
-						body +=
-						    (l_n -
-						     zeros_num);
+						body += (l_n - zeros_num);
 						zeros_num = 0;
 					} else
-						zeros_num -=
-						    l_n;
+						zeros_num -= l_n;
 					pos_in_item = 0;
 
-					RFALSE
-					    (comp_short_le_keys
-					     (B_N_PKEY(tbS0, 0),
-					      B_N_PKEY(tb->L[0],
-						       B_NR_ITEMS
-						       (tb->
-							L[0]) -
-						       1))
-					     ||
-					     !op_is_left_mergeable
-					     (B_N_PKEY(tbS0, 0),
-					      tbS0->b_size)
-					     ||
-					     !op_is_left_mergeable
-					     (B_N_PDELIM_KEY
-					      (tb->CFL[0],
-					       tb->lkey[0]),
-					      tbS0->b_size),
+					RFALSE(comp_short_le_keys(B_N_PKEY(tbS0, 0), B_N_PKEY(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1))
+					     || !op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)
+					     || !op_is_left_mergeable(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), tbS0->b_size),
 					     "PAP-12120: item must be merge-able with left neighboring item");
 				} else {	/* only part of the appended item will be in L[0] */
 
 					/* Calculate position in item for append in S[0] */
-					pos_in_item -=
-					    tb->lbytes;
+					pos_in_item -= tb->lbytes;
 
-					RFALSE(pos_in_item <= 0,
-					       "PAP-12125: no place for paste. pos_in_item=%d",
-					       pos_in_item);
+					RFALSE(pos_in_item <= 0, "PAP-12125: no place for paste. pos_in_item=%d", pos_in_item);
 
 					/* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
-					leaf_shift_left(tb,
-							tb->
-							lnum[0],
-							tb->
-							lbytes);
+					leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
 				}
 			}
 		} else {	/* appended item will be in L[0] in whole */
@@ -665,52 +495,30 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 
 			if (!item_pos && op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)) {	/* if we paste into first item of S[0] and it is left mergable */
 				/* then increment pos_in_item by the size of the last item in L[0] */
-				pasted =
-				    B_N_PITEM_HEAD(tb->L[0],
-						   n - 1);
+				pasted = B_N_PITEM_HEAD(tb->L[0], n - 1);
 				if (is_direntry_le_ih(pasted))
-					pos_in_item +=
-					    ih_entry_count
-					    (pasted);
+					pos_in_item += ih_entry_count(pasted);
 				else
-					pos_in_item +=
-					    ih_item_len(pasted);
+					pos_in_item += ih_item_len(pasted);
 			}
 
 			/* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
-			ret_val =
-			    leaf_shift_left(tb, tb->lnum[0],
-					    tb->lbytes);
+			ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
 			/* Append to body of item in L[0] */
 			buffer_info_init_left(tb, &bi);
-			leaf_paste_in_buffer(&bi,
-					     n + item_pos -
-					     ret_val,
+			leaf_paste_in_buffer(&bi, n + item_pos - ret_val,
 					     pos_in_item,
 					     tb->insert_size[0],
 					     body, zeros_num);
 
 			/* if appended item is directory, paste entry */
-			pasted =
-			    B_N_PITEM_HEAD(tb->L[0],
-					   n + item_pos -
-					   ret_val);
+			pasted = B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val);
 			if (is_direntry_le_ih(pasted))
-				leaf_paste_entries(&bi,
-						   n +
-						   item_pos -
-						   ret_val,
-						   pos_in_item,
-						   1,
-						   (struct
-						    reiserfs_de_head
-						    *)body,
-						   body +
-						   DEH_SIZE,
-						   tb->
-						   insert_size
-						   [0]
-				    );
+				leaf_paste_entries(&bi, n + item_pos - ret_val,
+						   pos_in_item, 1,
+						   (struct reiserfs_de_head *) body,
+						   body + DEH_SIZE,
+						   tb->insert_size[0]);
 			/* if appended item is indirect item, put unformatted node into un list */
 			if (is_indirect_le_ih(pasted))
 				set_ih_free_space(pasted, 0);
@@ -722,13 +530,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 			reiserfs_panic(tb->tb_sb, "PAP-12130",
 				       "lnum > 0: unexpected mode: "
 				       " %s(%d)",
-				       (flag ==
-					M_DELETE) ? "DELETE" : ((flag ==
-								 M_CUT)
-								? "CUT"
-								:
-								"UNKNOWN"),
-				       flag);
+				       (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
 		}
 	} else {
 		/* new item doesn't fall into L[0] */
@@ -748,14 +550,12 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 	case M_INSERT:	/* insert item */
 		if (n - tb->rnum[0] < item_pos) {	/* new item or its part falls to R[0] */
 			if (item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {	/* part of new item falls into R[0] */
-				loff_t old_key_comp, old_len,
-				    r_zeros_number;
+				loff_t old_key_comp, old_len, r_zeros_number;
 				const char *r_body;
 				int version;
 				loff_t offset;
 
-				leaf_shift_right(tb, tb->rnum[0] - 1,
-						 -1);
+				leaf_shift_right(tb, tb->rnum[0] - 1, -1);
 
 				version = ih_version(ih);
 				/* Remember key component and item length */
@@ -763,29 +563,17 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 				old_len = ih_item_len(ih);
 
 				/* Calculate key component and item length to insert into R[0] */
-				offset =
-				    le_ih_k_offset(ih) +
-				    ((old_len -
-				      tb->
-				      rbytes) << (is_indirect_le_ih(ih)
-						  ? tb->tb_sb->
-						  s_blocksize_bits -
-						  UNFM_P_SHIFT : 0));
+				offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << (is_indirect_le_ih(ih) ? tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT : 0));
 				set_le_ih_k_offset(ih, offset);
 				put_ih_item_len(ih, tb->rbytes);
 				/* Insert part of the item into R[0] */
 				buffer_info_init_right(tb, &bi);
 				if ((old_len - tb->rbytes) > zeros_num) {
 					r_zeros_number = 0;
-					r_body =
-					    body + (old_len -
-						    tb->rbytes) -
-					    zeros_num;
+					r_body = body + (old_len - tb->rbytes) - zeros_num;
 				} else {
 					r_body = body;
-					r_zeros_number =
-					    zeros_num - (old_len -
-							 tb->rbytes);
+					r_zeros_number = zeros_num - (old_len - tb->rbytes);
 					zeros_num -= r_zeros_number;
 				}
 
@@ -798,25 +586,18 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 
 				/* Calculate key component and item length to insert into S[0] */
 				set_le_ih_k_offset(ih, old_key_comp);
-				put_ih_item_len(ih,
-						old_len - tb->rbytes);
+				put_ih_item_len(ih, old_len - tb->rbytes);
 
 				tb->insert_size[0] -= tb->rbytes;
 
 			} else {	/* whole new item falls into R[0] */
 
 				/* Shift rnum[0]-1 items to R[0] */
-				ret_val =
-				    leaf_shift_right(tb,
-						     tb->rnum[0] - 1,
-						     tb->rbytes);
+				ret_val = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
 				/* Insert new item into R[0] */
 				buffer_info_init_right(tb, &bi);
-				leaf_insert_into_buf(&bi,
-						     item_pos - n +
-						     tb->rnum[0] - 1,
-						     ih, body,
-						     zeros_num);
+				leaf_insert_into_buf(&bi, item_pos - n + tb->rnum[0] - 1,
+						     ih, body, zeros_num);
 
 				if (item_pos - n + tb->rnum[0] - 1 == 0) {
 					replace_key(tb, tb->CFR[0],
@@ -841,200 +622,97 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
 
 				RFALSE(zeros_num,
 				       "PAP-12145: invalid parameter in case of a directory");
-				entry_count =
-				    I_ENTRY_COUNT(B_N_PITEM_HEAD
-						  (tbS0,
-						   item_pos));
+				entry_count = I_ENTRY_COUNT(B_N_PITEM_HEAD
+						  (tbS0, item_pos));
 				if (entry_count - tb->rbytes <
 				    pos_in_item)
 					/* new directory entry falls into R[0] */
 				{
 					int paste_entry_position;
 
-					RFALSE(tb->rbytes - 1 >=
-					       entry_count
-					       || !tb->
-					       insert_size[0],
+					RFALSE(tb->rbytes - 1 >= entry_count || !tb-> insert_size[0],
 					       "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d",
-					       tb->rbytes,
-					       entry_count);
+					       tb->rbytes, entry_count);
 					/* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */
-					leaf_shift_right(tb,
-							 tb->
-							 rnum
-							 [0],
-							 tb->
-							 rbytes
-							 - 1);
+					leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1);
 					/* Paste given directory entry to directory item */
-					paste_entry_position =
-					    pos_in_item -
-					    entry_count +
-					    tb->rbytes - 1;
+					paste_entry_position = pos_in_item - entry_count + tb->rbytes - 1;
 					buffer_info_init_right(tb, &bi);
-					leaf_paste_in_buffer
-					    (&bi, 0,
-					     paste_entry_position,
-					     tb->insert_size[0],
-					     body, zeros_num);
+					leaf_paste_in_buffer(&bi, 0, paste_entry_position, tb->insert_size[0], body, zeros_num);
 					/* paste entry */
-					leaf_paste_entries(&bi,
-							   0,
-							   paste_entry_position,
-							   1,
-							   (struct
-							    reiserfs_de_head
-							    *)
-							   body,
-							   body
-							   +
-							   DEH_SIZE,
-							   tb->
-							   insert_size
-							   [0]
-					    );
-
-					if (paste_entry_position
-					    == 0) {
+					leaf_paste_entries(&bi, 0, paste_entry_position, 1,
+							   (struct reiserfs_de_head *) body,
+							   body + DEH_SIZE, tb->insert_size[0]);
+
+					if (paste_entry_position == 0) {
 						/* change delimiting keys */
-						replace_key(tb,
-							    tb->
-							    CFR
-							    [0],
-							    tb->
-							    rkey
-							    [0],
-							    tb->
-							    R
-							    [0],
-							    0);
+						replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0],0);
 					}
 
 					tb->insert_size[0] = 0;
 					pos_in_item++;
 				} else {	/* new directory entry doesn't fall into R[0] */
 
-					leaf_shift_right(tb,
-							 tb->
-							 rnum
-							 [0],
-							 tb->
-							 rbytes);
+					leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
 				}
 			} else {	/* regular object */
 
-				int n_shift, n_rem,
-				    r_zeros_number;
+				int n_shift, n_rem, r_zeros_number;
 				const char *r_body;
 
 				/* Calculate number of bytes which must be shifted from appended item */
931 if ((n_shift = 664 if ((n_shift = tb->rbytes - tb->insert_size[0]) < 0)
932 tb->rbytes -
933 tb->insert_size[0]) < 0)
934 n_shift = 0; 665 n_shift = 0;
935 666
936 RFALSE(pos_in_item != 667 RFALSE(pos_in_item != ih_item_len
937 ih_item_len 668 (B_N_PITEM_HEAD(tbS0, item_pos)),
938 (B_N_PITEM_HEAD
939 (tbS0, item_pos)),
940 "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d", 669 "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d",
941 pos_in_item, 670 pos_in_item, ih_item_len
942 ih_item_len 671 (B_N_PITEM_HEAD(tbS0, item_pos)));
943 (B_N_PITEM_HEAD 672
944 (tbS0, item_pos))); 673 leaf_shift_right(tb, tb->rnum[0], n_shift);
945
946 leaf_shift_right(tb,
947 tb->rnum[0],
948 n_shift);
949 /* Calculate number of bytes which must remain in body after appending to R[0] */ 674 /* Calculate number of bytes which must remain in body after appending to R[0] */
950 if ((n_rem = 675 if ((n_rem = tb->insert_size[0] - tb->rbytes) < 0)
951 tb->insert_size[0] -
952 tb->rbytes) < 0)
953 n_rem = 0; 676 n_rem = 0;
954 677
955 { 678 {
956 int version; 679 int version;
957 unsigned long temp_rem = 680 unsigned long temp_rem = n_rem;
958 n_rem; 681
959 682 version = ih_version(B_N_PITEM_HEAD(tb->R[0], 0));
960 version = 683 if (is_indirect_le_key(version, B_N_PKEY(tb->R[0], 0))) {
961 ih_version 684 temp_rem = n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT);
962 (B_N_PITEM_HEAD
963 (tb->R[0], 0));
964 if (is_indirect_le_key
965 (version,
966 B_N_PKEY(tb->R[0],
967 0))) {
968 temp_rem =
969 n_rem <<
970 (tb->tb_sb->
971 s_blocksize_bits
972 -
973 UNFM_P_SHIFT);
974 } 685 }
975 set_le_key_k_offset 686 set_le_key_k_offset(version, B_N_PKEY(tb->R[0], 0),
976 (version, 687 le_key_k_offset(version, B_N_PKEY(tb->R[0], 0)) + temp_rem);
977 B_N_PKEY(tb->R[0], 688 set_le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]),
978 0), 689 le_key_k_offset(version, B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0])) + temp_rem);
979 le_key_k_offset
980 (version,
981 B_N_PKEY(tb->R[0],
982 0)) +
983 temp_rem);
984 set_le_key_k_offset
985 (version,
986 B_N_PDELIM_KEY(tb->
987 CFR
988 [0],
989 tb->
990 rkey
991 [0]),
992 le_key_k_offset
993 (version,
994 B_N_PDELIM_KEY
995 (tb->CFR[0],
996 tb->rkey[0])) +
997 temp_rem);
998 } 690 }
999/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem; 691/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem;
1000 k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/ 692 k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/
1001 do_balance_mark_internal_dirty 693 do_balance_mark_internal_dirty(tb, tb->CFR[0], 0);
1002 (tb, tb->CFR[0], 0);
1003 694
1004 /* Append part of body into R[0] */ 695 /* Append part of body into R[0] */
1005 buffer_info_init_right(tb, &bi); 696 buffer_info_init_right(tb, &bi);
1006 if (n_rem > zeros_num) { 697 if (n_rem > zeros_num) {
1007 r_zeros_number = 0; 698 r_zeros_number = 0;
1008 r_body = 699 r_body = body + n_rem - zeros_num;
1009 body + n_rem -
1010 zeros_num;
1011 } else { 700 } else {
1012 r_body = body; 701 r_body = body;
1013 r_zeros_number = 702 r_zeros_number = zeros_num - n_rem;
1014 zeros_num - n_rem; 703 zeros_num -= r_zeros_number;
1015 zeros_num -=
1016 r_zeros_number;
1017 } 704 }
1018 705
1019 leaf_paste_in_buffer(&bi, 0, 706 leaf_paste_in_buffer(&bi, 0, n_shift,
1020 n_shift, 707 tb->insert_size[0] - n_rem,
1021 tb-> 708 r_body, r_zeros_number);
1022 insert_size 709
1023 [0] - 710 if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->R[0], 0))) {
1024 n_rem,
1025 r_body,
1026 r_zeros_number);
1027
1028 if (is_indirect_le_ih
1029 (B_N_PITEM_HEAD
1030 (tb->R[0], 0))) {
1031#if 0 711#if 0
1032 RFALSE(n_rem, 712 RFALSE(n_rem,
1033 "PAP-12160: paste more than one unformatted node pointer"); 713 "PAP-12160: paste more than one unformatted node pointer");
1034#endif 714#endif
1035 set_ih_free_space 715 set_ih_free_space(B_N_PITEM_HEAD(tb->R[0], 0), 0);
1036 (B_N_PITEM_HEAD
1037 (tb->R[0], 0), 0);
1038 } 716 }
1039 tb->insert_size[0] = n_rem; 717 tb->insert_size[0] = n_rem;
1040 if (!n_rem) 718 if (!n_rem)
@@ -1044,58 +722,28 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1044 722
1045 struct item_head *pasted; 723 struct item_head *pasted;
1046 724
1047 ret_val = 725 ret_val = leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
1048 leaf_shift_right(tb, tb->rnum[0],
1049 tb->rbytes);
1050 /* append item in R[0] */ 726 /* append item in R[0] */
1051 if (pos_in_item >= 0) { 727 if (pos_in_item >= 0) {
1052 buffer_info_init_right(tb, &bi); 728 buffer_info_init_right(tb, &bi);
1053 leaf_paste_in_buffer(&bi, 729 leaf_paste_in_buffer(&bi, item_pos - n + tb->rnum[0], pos_in_item,
1054 item_pos - 730 tb->insert_size[0], body, zeros_num);
1055 n +
1056 tb->
1057 rnum[0],
1058 pos_in_item,
1059 tb->
1060 insert_size
1061 [0], body,
1062 zeros_num);
1063 } 731 }
1064 732
1065 /* paste new entry, if item is directory item */ 733 /* paste new entry, if item is directory item */
1066 pasted = 734 pasted = B_N_PITEM_HEAD(tb->R[0], item_pos - n + tb->rnum[0]);
1067 B_N_PITEM_HEAD(tb->R[0], 735 if (is_direntry_le_ih(pasted) && pos_in_item >= 0) {
1068 item_pos - n + 736 leaf_paste_entries(&bi, item_pos - n + tb->rnum[0],
1069 tb->rnum[0]); 737 pos_in_item, 1,
1070 if (is_direntry_le_ih(pasted) 738 (struct reiserfs_de_head *) body,
1071 && pos_in_item >= 0) { 739 body + DEH_SIZE, tb->insert_size[0]);
1072 leaf_paste_entries(&bi,
1073 item_pos -
1074 n +
1075 tb->rnum[0],
1076 pos_in_item,
1077 1,
1078 (struct
1079 reiserfs_de_head
1080 *)body,
1081 body +
1082 DEH_SIZE,
1083 tb->
1084 insert_size
1085 [0]
1086 );
1087 if (!pos_in_item) { 740 if (!pos_in_item) {
1088 741
1089 RFALSE(item_pos - n + 742 RFALSE(item_pos - n + tb->rnum[0],
1090 tb->rnum[0],
1091 "PAP-12165: directory item must be first item of node when pasting is in 0th position"); 743 "PAP-12165: directory item must be first item of node when pasting is in 0th position");
1092 744
1093 /* update delimiting keys */ 745 /* update delimiting keys */
1094 replace_key(tb, 746 replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
1095 tb->CFR[0],
1096 tb->rkey[0],
1097 tb->R[0],
1098 0);
1099 } 747 }
1100 } 748 }
1101 749
@@ -1111,22 +759,16 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1111 default: /* cases d and t */ 759 default: /* cases d and t */
1112 reiserfs_panic(tb->tb_sb, "PAP-12175", 760 reiserfs_panic(tb->tb_sb, "PAP-12175",
1113 "rnum > 0: unexpected mode: %s(%d)", 761 "rnum > 0: unexpected mode: %s(%d)",
1114 (flag == 762 (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
1115 M_DELETE) ? "DELETE" : ((flag ==
1116 M_CUT) ? "CUT"
1117 : "UNKNOWN"),
1118 flag);
1119 } 763 }
1120 764
1121 } 765 }
1122 766
1123 /* tb->rnum[0] > 0 */ 767 /* tb->rnum[0] > 0 */
1124 RFALSE(tb->blknum[0] > 3, 768 RFALSE(tb->blknum[0] > 3,
1125 "PAP-12180: blknum can not be %d. It must be <= 3", 769 "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
1126 tb->blknum[0]);
1127 RFALSE(tb->blknum[0] < 0, 770 RFALSE(tb->blknum[0] < 0,
1128 "PAP-12185: blknum can not be %d. It must be >= 0", 771 "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
1129 tb->blknum[0]);
1130 772
1131 /* if while adding to a node we discover that it is possible to split 773 /* if while adding to a node we discover that it is possible to split
1132 it in two, and merge the left part into the left neighbor and the 774 it in two, and merge the left part into the left neighbor and the
@@ -1177,8 +819,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1177 819
1178 if (n - snum[i] < item_pos) { /* new item or it's part falls to first new node S_new[i] */ 820 if (n - snum[i] < item_pos) { /* new item or it's part falls to first new node S_new[i] */
1179 if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) { /* part of new item falls into S_new[i] */ 821 if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) { /* part of new item falls into S_new[i] */
1180 int old_key_comp, old_len, 822 int old_key_comp, old_len, r_zeros_number;
1181 r_zeros_number;
1182 const char *r_body; 823 const char *r_body;
1183 int version; 824 int version;
1184 825
@@ -1192,15 +833,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1192 old_len = ih_item_len(ih); 833 old_len = ih_item_len(ih);
1193 834
1194 /* Calculate key component and item length to insert into S_new[i] */ 835 /* Calculate key component and item length to insert into S_new[i] */
1195 set_le_ih_k_offset(ih, 836 set_le_ih_k_offset(ih, le_ih_k_offset(ih) +
1196 le_ih_k_offset(ih) + 837 ((old_len - sbytes[i]) << (is_indirect_le_ih(ih) ? tb->tb_sb-> s_blocksize_bits - UNFM_P_SHIFT : 0)));
1197 ((old_len -
1198 sbytes[i]) <<
1199 (is_indirect_le_ih
1200 (ih) ? tb->tb_sb->
1201 s_blocksize_bits -
1202 UNFM_P_SHIFT :
1203 0)));
1204 838
1205 put_ih_item_len(ih, sbytes[i]); 839 put_ih_item_len(ih, sbytes[i]);
1206 840
@@ -1209,39 +843,29 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1209 843
1210 if ((old_len - sbytes[i]) > zeros_num) { 844 if ((old_len - sbytes[i]) > zeros_num) {
1211 r_zeros_number = 0; 845 r_zeros_number = 0;
1212 r_body = 846 r_body = body + (old_len - sbytes[i]) - zeros_num;
1213 body + (old_len -
1214 sbytes[i]) -
1215 zeros_num;
1216 } else { 847 } else {
1217 r_body = body; 848 r_body = body;
1218 r_zeros_number = 849 r_zeros_number = zeros_num - (old_len - sbytes[i]);
1219 zeros_num - (old_len -
1220 sbytes[i]);
1221 zeros_num -= r_zeros_number; 850 zeros_num -= r_zeros_number;
1222 } 851 }
1223 852
1224 leaf_insert_into_buf(&bi, 0, ih, r_body, 853 leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeros_number);
1225 r_zeros_number);
1226 854
1227 /* Calculate key component and item length to insert into S[i] */ 855 /* Calculate key component and item length to insert into S[i] */
1228 set_le_ih_k_offset(ih, old_key_comp); 856 set_le_ih_k_offset(ih, old_key_comp);
1229 put_ih_item_len(ih, 857 put_ih_item_len(ih, old_len - sbytes[i]);
1230 old_len - sbytes[i]);
1231 tb->insert_size[0] -= sbytes[i]; 858 tb->insert_size[0] -= sbytes[i];
1232 } else { /* whole new item falls into S_new[i] */ 859 } else { /* whole new item falls into S_new[i] */
1233 860
1234 /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */ 861 /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */
1235 leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, 862 leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
1236 snum[i] - 1, sbytes[i], 863 snum[i] - 1, sbytes[i], S_new[i]);
1237 S_new[i]);
1238 864
1239 /* Insert new item into S_new[i] */ 865 /* Insert new item into S_new[i] */
1240 buffer_info_init_bh(tb, &bi, S_new[i]); 866 buffer_info_init_bh(tb, &bi, S_new[i]);
1241 leaf_insert_into_buf(&bi, 867 leaf_insert_into_buf(&bi, item_pos - n + snum[i] - 1,
1242 item_pos - n + 868 ih, body, zeros_num);
1243 snum[i] - 1, ih,
1244 body, zeros_num);
1245 869
1246 zeros_num = tb->insert_size[0] = 0; 870 zeros_num = tb->insert_size[0] = 0;
1247 } 871 }
@@ -1268,150 +892,73 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1268 892
1269 int entry_count; 893 int entry_count;
1270 894
1271 entry_count = 895 entry_count = ih_entry_count(aux_ih);
1272 ih_entry_count(aux_ih);
1273 896
1274 if (entry_count - sbytes[i] < 897 if (entry_count - sbytes[i] < pos_in_item && pos_in_item <= entry_count) {
1275 pos_in_item
1276 && pos_in_item <=
1277 entry_count) {
1278 /* new directory entry falls into S_new[i] */ 898 /* new directory entry falls into S_new[i] */
1279 899
1280 RFALSE(!tb-> 900 RFALSE(!tb->insert_size[0], "PAP-12215: insert_size is already 0");
1281 insert_size[0], 901 RFALSE(sbytes[i] - 1 >= entry_count,
1282 "PAP-12215: insert_size is already 0");
1283 RFALSE(sbytes[i] - 1 >=
1284 entry_count,
1285 "PAP-12220: there are no so much entries (%d), only %d", 902 "PAP-12220: there are no so much entries (%d), only %d",
1286 sbytes[i] - 1, 903 sbytes[i] - 1, entry_count);
1287 entry_count);
1288 904
1289 /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */ 905 /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */
1290 leaf_move_items 906 leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i] - 1, S_new[i]);
1291 (LEAF_FROM_S_TO_SNEW,
1292 tb, snum[i],
1293 sbytes[i] - 1,
1294 S_new[i]);
1295 /* Paste given directory entry to directory item */ 907 /* Paste given directory entry to directory item */
1296 buffer_info_init_bh(tb, &bi, S_new[i]); 908 buffer_info_init_bh(tb, &bi, S_new[i]);
1297 leaf_paste_in_buffer 909 leaf_paste_in_buffer(&bi, 0, pos_in_item - entry_count + sbytes[i] - 1,
1298 (&bi, 0, 910 tb->insert_size[0], body, zeros_num);
1299 pos_in_item -
1300 entry_count +
1301 sbytes[i] - 1,
1302 tb->insert_size[0],
1303 body, zeros_num);
1304 /* paste new directory entry */ 911 /* paste new directory entry */
1305 leaf_paste_entries(&bi, 912 leaf_paste_entries(&bi, 0, pos_in_item - entry_count + sbytes[i] - 1, 1,
1306 0, 913 (struct reiserfs_de_head *) body,
1307 pos_in_item 914 body + DEH_SIZE, tb->insert_size[0]);
1308 -
1309 entry_count
1310 +
1311 sbytes
1312 [i] -
1313 1, 1,
1314 (struct
1315 reiserfs_de_head
1316 *)
1317 body,
1318 body
1319 +
1320 DEH_SIZE,
1321 tb->
1322 insert_size
1323 [0]
1324 );
1325 tb->insert_size[0] = 0; 915 tb->insert_size[0] = 0;
1326 pos_in_item++; 916 pos_in_item++;
1327 } else { /* new directory entry doesn't fall into S_new[i] */ 917 } else { /* new directory entry doesn't fall into S_new[i] */
1328 leaf_move_items 918 leaf_move_items(LEAF_FROM_S_TO_SNEW,tb, snum[i], sbytes[i], S_new[i]);
1329 (LEAF_FROM_S_TO_SNEW,
1330 tb, snum[i],
1331 sbytes[i],
1332 S_new[i]);
1333 } 919 }
1334 } else { /* regular object */ 920 } else { /* regular object */
1335 921
1336 int n_shift, n_rem, 922 int n_shift, n_rem, r_zeros_number;
1337 r_zeros_number;
1338 const char *r_body; 923 const char *r_body;
1339 924
1340 RFALSE(pos_in_item != 925 RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)) || tb->insert_size[0] <= 0,
1341 ih_item_len
1342 (B_N_PITEM_HEAD
1343 (tbS0, item_pos))
1344 || tb->insert_size[0] <=
1345 0,
1346 "PAP-12225: item too short or insert_size <= 0"); 926 "PAP-12225: item too short or insert_size <= 0");
1347 927
1348 /* Calculate number of bytes which must be shifted from appended item */ 928 /* Calculate number of bytes which must be shifted from appended item */
1349 n_shift = 929 n_shift = sbytes[i] - tb->insert_size[0];
1350 sbytes[i] -
1351 tb->insert_size[0];
1352 if (n_shift < 0) 930 if (n_shift < 0)
1353 n_shift = 0; 931 n_shift = 0;
1354 leaf_move_items 932 leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, snum[i], n_shift, S_new[i]);
1355 (LEAF_FROM_S_TO_SNEW, tb,
1356 snum[i], n_shift,
1357 S_new[i]);
1358 933
1359 /* Calculate number of bytes which must remain in body after append to S_new[i] */ 934 /* Calculate number of bytes which must remain in body after append to S_new[i] */
1360 n_rem = 935 n_rem = tb->insert_size[0] - sbytes[i];
1361 tb->insert_size[0] -
1362 sbytes[i];
1363 if (n_rem < 0) 936 if (n_rem < 0)
1364 n_rem = 0; 937 n_rem = 0;
1365 /* Append part of body into S_new[0] */ 938 /* Append part of body into S_new[0] */
1366 buffer_info_init_bh(tb, &bi, S_new[i]); 939 buffer_info_init_bh(tb, &bi, S_new[i]);
1367 if (n_rem > zeros_num) { 940 if (n_rem > zeros_num) {
1368 r_zeros_number = 0; 941 r_zeros_number = 0;
1369 r_body = 942 r_body = body + n_rem - zeros_num;
1370 body + n_rem -
1371 zeros_num;
1372 } else { 943 } else {
1373 r_body = body; 944 r_body = body;
1374 r_zeros_number = 945 r_zeros_number = zeros_num - n_rem;
1375 zeros_num - n_rem; 946 zeros_num -= r_zeros_number;
1376 zeros_num -=
1377 r_zeros_number;
1378 } 947 }
1379 948
1380 leaf_paste_in_buffer(&bi, 0, 949 leaf_paste_in_buffer(&bi, 0, n_shift,
1381 n_shift, 950 tb->insert_size[0] - n_rem,
1382 tb-> 951 r_body, r_zeros_number);
1383 insert_size
1384 [0] -
1385 n_rem,
1386 r_body,
1387 r_zeros_number);
1388 { 952 {
1389 struct item_head *tmp; 953 struct item_head *tmp;
1390 954
1391 tmp = 955 tmp = B_N_PITEM_HEAD(S_new[i], 0);
1392 B_N_PITEM_HEAD(S_new
1393 [i],
1394 0);
1395 if (is_indirect_le_ih 956 if (is_indirect_le_ih
1396 (tmp)) { 957 (tmp)) {
1397 set_ih_free_space 958 set_ih_free_space(tmp, 0);
1398 (tmp, 0); 959 set_le_ih_k_offset(tmp, le_ih_k_offset(tmp) + (n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT)));
1399 set_le_ih_k_offset
1400 (tmp,
1401 le_ih_k_offset
1402 (tmp) +
1403 (n_rem <<
1404 (tb->
1405 tb_sb->
1406 s_blocksize_bits
1407 -
1408 UNFM_P_SHIFT)));
1409 } else { 960 } else {
1410 set_le_ih_k_offset 961 set_le_ih_k_offset(tmp, le_ih_k_offset(tmp) + n_rem);
1411 (tmp,
1412 le_ih_k_offset
1413 (tmp) +
1414 n_rem);
1415 } 962 }
1416 } 963 }
1417 964
@@ -1426,8 +973,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1426 struct item_head *pasted; 973 struct item_head *pasted;
1427 974
1428#ifdef CONFIG_REISERFS_CHECK 975#ifdef CONFIG_REISERFS_CHECK
1429 struct item_head *ih_check = 976 struct item_head *ih_check = B_N_PITEM_HEAD(tbS0, item_pos);
1430 B_N_PITEM_HEAD(tbS0, item_pos);
1431 977
1432 if (!is_direntry_le_ih(ih_check) 978 if (!is_direntry_le_ih(ih_check)
1433 && (pos_in_item != ih_item_len(ih_check) 979 && (pos_in_item != ih_item_len(ih_check)
@@ -1439,8 +985,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1439 "to ih_item_len"); 985 "to ih_item_len");
1440#endif /* CONFIG_REISERFS_CHECK */ 986#endif /* CONFIG_REISERFS_CHECK */
1441 987
1442 leaf_mi = 988 leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW,
1443 leaf_move_items(LEAF_FROM_S_TO_SNEW,
1444 tb, snum[i], 989 tb, snum[i],
1445 sbytes[i], 990 sbytes[i],
1446 S_new[i]); 991 S_new[i]);
@@ -1452,30 +997,19 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1452 /* paste into item */ 997 /* paste into item */
1453 buffer_info_init_bh(tb, &bi, S_new[i]); 998 buffer_info_init_bh(tb, &bi, S_new[i]);
1454 leaf_paste_in_buffer(&bi, 999 leaf_paste_in_buffer(&bi,
1455 item_pos - n + 1000 item_pos - n + snum[i],
1456 snum[i],
1457 pos_in_item, 1001 pos_in_item,
1458 tb->insert_size[0], 1002 tb->insert_size[0],
1459 body, zeros_num); 1003 body, zeros_num);
1460 1004
1461 pasted = 1005 pasted = B_N_PITEM_HEAD(S_new[i], item_pos - n + snum[i]);
1462 B_N_PITEM_HEAD(S_new[i],
1463 item_pos - n +
1464 snum[i]);
1465 if (is_direntry_le_ih(pasted)) { 1006 if (is_direntry_le_ih(pasted)) {
1466 leaf_paste_entries(&bi, 1007 leaf_paste_entries(&bi,
1467 item_pos - 1008 item_pos - n + snum[i],
1468 n + snum[i], 1009 pos_in_item, 1,
1469 pos_in_item, 1010 (struct reiserfs_de_head *)body,
1470 1, 1011 body + DEH_SIZE,
1471 (struct 1012 tb->insert_size[0]
1472 reiserfs_de_head
1473 *)body,
1474 body +
1475 DEH_SIZE,
1476 tb->
1477 insert_size
1478 [0]
1479 ); 1013 );
1480 } 1014 }
1481 1015
@@ -1495,11 +1029,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1495 default: /* cases d and t */ 1029 default: /* cases d and t */
1496 reiserfs_panic(tb->tb_sb, "PAP-12245", 1030 reiserfs_panic(tb->tb_sb, "PAP-12245",
1497 "blknum > 2: unexpected mode: %s(%d)", 1031 "blknum > 2: unexpected mode: %s(%d)",
1498 (flag == 1032 (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
1499 M_DELETE) ? "DELETE" : ((flag ==
1500 M_CUT) ? "CUT"
1501 : "UNKNOWN"),
1502 flag);
1503 } 1033 }
1504 1034
1505 memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE); 1035 memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE);
@@ -1524,9 +1054,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1524 /* If we insert the first key change the delimiting key */ 1054 /* If we insert the first key change the delimiting key */
1525 if (item_pos == 0) { 1055 if (item_pos == 0) {
1526 if (tb->CFL[0]) /* can be 0 in reiserfsck */ 1056 if (tb->CFL[0]) /* can be 0 in reiserfsck */
1527 replace_key(tb, tb->CFL[0], tb->lkey[0], 1057 replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
1528 tbS0, 0);
1529
1530 } 1058 }
1531 break; 1059 break;
1532 1060
@@ -1536,53 +1064,27 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1536 pasted = B_N_PITEM_HEAD(tbS0, item_pos); 1064 pasted = B_N_PITEM_HEAD(tbS0, item_pos);
1537 /* when directory, may be new entry already pasted */ 1065 /* when directory, may be new entry already pasted */
1538 if (is_direntry_le_ih(pasted)) { 1066 if (is_direntry_le_ih(pasted)) {
1539 if (pos_in_item >= 0 && 1067 if (pos_in_item >= 0 && pos_in_item <= ih_entry_count(pasted)) {
1540 pos_in_item <=
1541 ih_entry_count(pasted)) {
1542 1068
1543 RFALSE(!tb->insert_size[0], 1069 RFALSE(!tb->insert_size[0],
1544 "PAP-12260: insert_size is 0 already"); 1070 "PAP-12260: insert_size is 0 already");
1545 1071
1546 /* prepare space */ 1072 /* prepare space */
1547 buffer_info_init_tbS0(tb, &bi); 1073 buffer_info_init_tbS0(tb, &bi);
1548 leaf_paste_in_buffer(&bi, 1074 leaf_paste_in_buffer(&bi, item_pos, pos_in_item,
1549 item_pos, 1075 tb->insert_size[0], body,
1550 pos_in_item,
1551 tb->
1552 insert_size
1553 [0], body,
1554 zeros_num); 1076 zeros_num);
1555 1077
1556 /* paste entry */ 1078 /* paste entry */
1557 leaf_paste_entries(&bi, 1079 leaf_paste_entries(&bi, item_pos, pos_in_item, 1,
1558 item_pos, 1080 (struct reiserfs_de_head *)body,
1559 pos_in_item, 1081 body + DEH_SIZE,
1560 1, 1082 tb->insert_size[0]);
1561 (struct
1562 reiserfs_de_head
1563 *)body,
1564 body +
1565 DEH_SIZE,
1566 tb->
1567 insert_size
1568 [0]
1569 );
1570 if (!item_pos && !pos_in_item) { 1083 if (!item_pos && !pos_in_item) {
1571 RFALSE(!tb->CFL[0] 1084 RFALSE(!tb->CFL[0] || !tb->L[0],
1572 || !tb->L[0],
1573 "PAP-12270: CFL[0]/L[0] must be specified"); 1085 "PAP-12270: CFL[0]/L[0] must be specified");
1574 if (tb->CFL[0]) { 1086 if (tb->CFL[0])
1575 replace_key(tb, 1087 replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
1576 tb->
1577 CFL
1578 [0],
1579 tb->
1580 lkey
1581 [0],
1582 tbS0,
1583 0);
1584
1585 }
1586 } 1088 }
1587 tb->insert_size[0] = 0; 1089 tb->insert_size[0] = 0;
1588 } 1090 }
@@ -1593,13 +1095,8 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1593 "PAP-12275: insert size must not be %d", 1095 "PAP-12275: insert size must not be %d",
1594 tb->insert_size[0]); 1096 tb->insert_size[0]);
1595 buffer_info_init_tbS0(tb, &bi); 1097 buffer_info_init_tbS0(tb, &bi);
1596 leaf_paste_in_buffer(&bi, 1098 leaf_paste_in_buffer(&bi, item_pos, pos_in_item,
1597 item_pos, 1099 tb->insert_size[0], body, zeros_num);
1598 pos_in_item,
1599 tb->
1600 insert_size
1601 [0], body,
1602 zeros_num);
1603 1100
1604 if (is_indirect_le_ih(pasted)) { 1101 if (is_indirect_le_ih(pasted)) {
1605#if 0 1102#if 0
@@ -1611,8 +1108,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1611 tb-> 1108 tb->
1612 insert_size[0]); 1109 insert_size[0]);
1613#endif 1110#endif
1614 set_ih_free_space 1111 set_ih_free_space(pasted, 0);
1615 (pasted, 0);
1616 } 1112 }
1617 tb->insert_size[0] = 0; 1113 tb->insert_size[0] = 0;
1618 } 1114 }
@@ -1620,8 +1116,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
1620 else { 1116 else {
1621 if (tb->insert_size[0]) { 1117 if (tb->insert_size[0]) {
1622 print_cur_tb("12285"); 1118 print_cur_tb("12285");
1623 reiserfs_panic(tb-> 1119 reiserfs_panic(tb->tb_sb,
1624 tb_sb,
1625 "PAP-12285", 1120 "PAP-12285",
1626 "insert_size " 1121 "insert_size "
1627 "must be 0 " 1122 "must be 0 "
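A note on the balance_leaf() hunks above: the expression that keeps reappearing, offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << (is_indirect_le_ih(ih) ? tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT : 0)), converts a count of item-body bytes into a key-offset delta. For an indirect item every 4-byte unformatted-node pointer addresses one whole block, so byte counts scale by blocksize / 4 (a shift by s_blocksize_bits - UNFM_P_SHIFT, where UNFM_P_SHIFT is 2); direct items map one-to-one. A standalone sketch of that calculation (illustrative userspace C, not kernel code; blocksize_bits chosen for a 4K-block filesystem):

#include <stdio.h>

/* Sketch: how many key-offset bytes a run of `len_delta` item-body
 * bytes represents, for direct vs. indirect reiserfs items.
 * UNFM_P_SHIFT is log2(size of an unformatted-node pointer) == 2. */
#define UNFM_P_SHIFT 2

static unsigned long offset_delta(unsigned long len_delta,
                                  int is_indirect,
                                  int blocksize_bits)
{
        return len_delta << (is_indirect ? blocksize_bits - UNFM_P_SHIFT : 0);
}

int main(void)
{
        /* 12 bytes of an indirect item == 3 block pointers == 3 blocks. */
        printf("%lu\n", offset_delta(12, 1, 12));   /* 12288 on 4K blocks */
        printf("%lu\n", offset_delta(12, 0, 12));   /* 12: direct maps 1:1 */
        return 0;
}

So splitting 12 bytes off an indirect item on a 4K-block filesystem moves the right neighbor's first key forward by 12 KiB, which is what the set_le_ih_k_offset()/set_le_key_k_offset() calls in these hunks implement.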
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index dcaafcfc23b0..ed58d843d578 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -260,4 +260,5 @@ const struct inode_operations reiserfs_file_inode_operations = {
260 .removexattr = reiserfs_removexattr, 260 .removexattr = reiserfs_removexattr,
261 .permission = reiserfs_permission, 261 .permission = reiserfs_permission,
262 .get_acl = reiserfs_get_acl, 262 .get_acl = reiserfs_get_acl,
263 .set_acl = reiserfs_set_acl,
263}; 264};
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index dc5236f6de1b..e825f8b63e6b 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1522,6 +1522,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
1522 .removexattr = reiserfs_removexattr, 1522 .removexattr = reiserfs_removexattr,
1523 .permission = reiserfs_permission, 1523 .permission = reiserfs_permission,
1524 .get_acl = reiserfs_get_acl, 1524 .get_acl = reiserfs_get_acl,
1525 .set_acl = reiserfs_set_acl,
1525}; 1526};
1526 1527
1527/* 1528/*
@@ -1538,8 +1539,6 @@ const struct inode_operations reiserfs_symlink_inode_operations = {
1538 .listxattr = reiserfs_listxattr, 1539 .listxattr = reiserfs_listxattr,
1539 .removexattr = reiserfs_removexattr, 1540 .removexattr = reiserfs_removexattr,
1540 .permission = reiserfs_permission, 1541 .permission = reiserfs_permission,
1541 .get_acl = reiserfs_get_acl,
1542
1543}; 1542};
1544 1543
1545/* 1544/*
@@ -1553,4 +1552,5 @@ const struct inode_operations reiserfs_special_inode_operations = {
1553 .removexattr = reiserfs_removexattr, 1552 .removexattr = reiserfs_removexattr,
1554 .permission = reiserfs_permission, 1553 .permission = reiserfs_permission,
1555 .get_acl = reiserfs_get_acl, 1554 .get_acl = reiserfs_get_acl,
1555 .set_acl = reiserfs_set_acl,
1556}; 1556};
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index a958444a75fc..02b0b7d0f7d5 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -419,7 +419,7 @@ int reiserfs_proc_info_init(struct super_block *sb)
419 char *s; 419 char *s;
420 420
421 /* Some block devices use /'s */ 421 /* Some block devices use /'s */
422 strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); 422 strlcpy(b, sb->s_id, BDEVNAME_SIZE);
423 s = strchr(b, '/'); 423 s = strchr(b, '/');
424 if (s) 424 if (s)
425 *s = '!'; 425 *s = '!';
@@ -449,7 +449,7 @@ int reiserfs_proc_info_done(struct super_block *sb)
449 char *s; 449 char *s;
450 450
451 /* Some block devices use /'s */ 451 /* Some block devices use /'s */
452 strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); 452 strlcpy(b, sb->s_id, BDEVNAME_SIZE);
453 s = strchr(b, '/'); 453 s = strchr(b, '/');
454 if (s) 454 if (s)
455 *s = '!'; 455 *s = '!';
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index f8adaee537c2..8d06adf89948 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -608,14 +608,6 @@ int reiserfs_resize(struct super_block *, unsigned long);
608 608
609#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->) 609#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->)
610 610
611/* A safe version of the "bdevname", which returns the "s_id" field of
612 * a superblock or else "Null superblock" if the super block is NULL.
613 */
614static inline char *reiserfs_bdevname(struct super_block *s)
615{
616 return (s == NULL) ? "Null superblock" : s->s_id;
617}
618
619#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal))) 611#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal)))
620static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal 612static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
621 *journal) 613 *journal)
@@ -1958,8 +1950,6 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
1958#define MAX_US_INT 0xffff 1950#define MAX_US_INT 0xffff
1959 1951
1960// reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset 1952// reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset
1961#define U32_MAX (~(__u32)0)
1962
1963static inline loff_t max_reiserfs_offset(struct inode *inode) 1953static inline loff_t max_reiserfs_offset(struct inode *inode)
1964{ 1954{
1965 if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5) 1955 if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
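The deleted U32_MAX definition above is part of a tree-wide cleanup: v3.14 gained central integer-limit macros (U8_MAX through U64_MAX and their signed counterparts) in include/linux/kernel.h, so per-filesystem copies like this one go away. The generic definition is equivalent to the sketch below (paraphrased, not quoted from kernel.h):

#include <linux/types.h>

/* Shape of the centralized macro that replaces the local copy. */
#define U32_MAX ((u32)~0U)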
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3ead145dadc4..2c803353f8ac 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1479,7 +1479,7 @@ static int read_super_block(struct super_block *s, int offset)
1479 if (!bh) { 1479 if (!bh) {
1480 reiserfs_warning(s, "sh-2006", 1480 reiserfs_warning(s, "sh-2006",
1481 "bread failed (dev %s, block %lu, size %lu)", 1481 "bread failed (dev %s, block %lu, size %lu)",
1482 reiserfs_bdevname(s), offset / s->s_blocksize, 1482 s->s_id, offset / s->s_blocksize,
1483 s->s_blocksize); 1483 s->s_blocksize);
1484 return 1; 1484 return 1;
1485 } 1485 }
@@ -1500,7 +1500,7 @@ static int read_super_block(struct super_block *s, int offset)
1500 if (!bh) { 1500 if (!bh) {
1501 reiserfs_warning(s, "sh-2007", 1501 reiserfs_warning(s, "sh-2007",
1502 "bread failed (dev %s, block %lu, size %lu)", 1502 "bread failed (dev %s, block %lu, size %lu)",
1503 reiserfs_bdevname(s), offset / s->s_blocksize, 1503 s->s_id, offset / s->s_blocksize,
1504 s->s_blocksize); 1504 s->s_blocksize);
1505 return 1; 1505 return 1;
1506 } 1506 }
@@ -1509,7 +1509,7 @@ static int read_super_block(struct super_block *s, int offset)
1509 if (sb_blocksize(rs) != s->s_blocksize) { 1509 if (sb_blocksize(rs) != s->s_blocksize) {
1510 reiserfs_warning(s, "sh-2011", "can't find a reiserfs " 1510 reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
1511 "filesystem on (dev %s, block %Lu, size %lu)", 1511 "filesystem on (dev %s, block %Lu, size %lu)",
1512 reiserfs_bdevname(s), 1512 s->s_id,
1513 (unsigned long long)bh->b_blocknr, 1513 (unsigned long long)bh->b_blocknr,
1514 s->s_blocksize); 1514 s->s_blocksize);
1515 brelse(bh); 1515 brelse(bh);
@@ -1825,7 +1825,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1825 /* try new format (64-th 1k block), which can contain reiserfs super block */ 1825 /* try new format (64-th 1k block), which can contain reiserfs super block */
1826 else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { 1826 else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
1827 SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", 1827 SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
1828 reiserfs_bdevname(s)); 1828 s->s_id);
1829 goto error_unlocked; 1829 goto error_unlocked;
1830 } 1830 }
1831 1831
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8a9e2dcfe004..5cdfbd638b5c 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -50,6 +50,7 @@
50#include <linux/stat.h> 50#include <linux/stat.h>
51#include <linux/quotaops.h> 51#include <linux/quotaops.h>
52#include <linux/security.h> 52#include <linux/security.h>
53#include <linux/posix_acl_xattr.h>
53 54
54#define PRIVROOT_NAME ".reiserfs_priv" 55#define PRIVROOT_NAME ".reiserfs_priv"
55#define XAROOT_NAME "xattrs" 56#define XAROOT_NAME "xattrs"
@@ -904,8 +905,8 @@ static const struct xattr_handler *reiserfs_xattr_handlers[] = {
904 &reiserfs_xattr_security_handler, 905 &reiserfs_xattr_security_handler,
905#endif 906#endif
906#ifdef CONFIG_REISERFS_FS_POSIX_ACL 907#ifdef CONFIG_REISERFS_FS_POSIX_ACL
907 &reiserfs_posix_acl_access_handler, 908 &posix_acl_access_xattr_handler,
908 &reiserfs_posix_acl_default_handler, 909 &posix_acl_default_xattr_handler,
909#endif 910#endif
910 NULL 911 NULL
911}; 912};
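With posix_acl_access_xattr_handler and posix_acl_default_xattr_handler installed above, the xattr-to-ACL decoding and permission checks reiserfs used to open-code now live in generic VFS code, which dispatches through the inode's ->get_acl/->set_acl operations. A rough sketch of the set-side flow under v3.14 (illustrative only; example_acl_set is a stand-in, not the actual fs/posix_acl.c function):

#include <linux/fs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>

/* Sketch of what the generic handler does before the filesystem ever
 * sees the request: check the caller, decode the raw xattr blob into a
 * struct posix_acl, then call the inode operation. */
static int example_acl_set(struct inode *inode, const void *value,
                           size_t size, int type)
{
        struct posix_acl *acl = NULL;
        int error = 0;

        if (!inode_owner_or_capable(inode))
                return -EPERM;

        if (value) {
                acl = posix_acl_from_xattr(&init_user_ns, value, size);
                if (IS_ERR(acl))
                        return PTR_ERR(acl);
                if (acl) {
                        error = posix_acl_valid(acl);
                        if (error)
                                goto out;
                }
        }

        /* For reiserfs this lands in reiserfs_set_acl() above. */
        error = inode->i_op->set_acl(inode, acl, type);
out:
        posix_acl_release(acl);
        return error;
}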
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 06c04f73da65..a6ce532402dc 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -11,35 +11,19 @@
11#include "acl.h" 11#include "acl.h"
12#include <asm/uaccess.h> 12#include <asm/uaccess.h>
13 13
14static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, 14static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
15 struct inode *inode, int type, 15 struct inode *inode, int type,
16 struct posix_acl *acl); 16 struct posix_acl *acl);
17 17
18static int 18
19posix_acl_set(struct dentry *dentry, const char *name, const void *value, 19int
20 size_t size, int flags, int type) 20reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
21{ 21{
22 struct inode *inode = dentry->d_inode;
23 struct posix_acl *acl;
24 int error, error2; 22 int error, error2;
25 struct reiserfs_transaction_handle th; 23 struct reiserfs_transaction_handle th;
26 size_t jcreate_blocks; 24 size_t jcreate_blocks;
27 if (!reiserfs_posixacl(inode->i_sb)) 25 int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
28 return -EOPNOTSUPP; 26
29 if (!inode_owner_or_capable(inode))
30 return -EPERM;
31
32 if (value) {
33 acl = posix_acl_from_xattr(&init_user_ns, value, size);
34 if (IS_ERR(acl)) {
35 return PTR_ERR(acl);
36 } else if (acl) {
37 error = posix_acl_valid(acl);
38 if (error)
39 goto release_and_out;
40 }
41 } else
42 acl = NULL;
43 27
44 /* Pessimism: We can't assume that anything from the xattr root up 28 /* Pessimism: We can't assume that anything from the xattr root up
45 * has been created. */ 29 * has been created. */
@@ -51,7 +35,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
51 error = journal_begin(&th, inode->i_sb, jcreate_blocks); 35 error = journal_begin(&th, inode->i_sb, jcreate_blocks);
52 reiserfs_write_unlock(inode->i_sb); 36 reiserfs_write_unlock(inode->i_sb);
53 if (error == 0) { 37 if (error == 0) {
54 error = reiserfs_set_acl(&th, inode, type, acl); 38 error = __reiserfs_set_acl(&th, inode, type, acl);
55 reiserfs_write_lock(inode->i_sb); 39 reiserfs_write_lock(inode->i_sb);
56 error2 = journal_end(&th, inode->i_sb, jcreate_blocks); 40 error2 = journal_end(&th, inode->i_sb, jcreate_blocks);
57 reiserfs_write_unlock(inode->i_sb); 41 reiserfs_write_unlock(inode->i_sb);
@@ -59,36 +43,13 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
59 error = error2; 43 error = error2;
60 } 44 }
61 45
62 release_and_out:
63 posix_acl_release(acl);
64 return error;
65}
66
67static int
68posix_acl_get(struct dentry *dentry, const char *name, void *buffer,
69 size_t size, int type)
70{
71 struct posix_acl *acl;
72 int error;
73
74 if (!reiserfs_posixacl(dentry->d_sb))
75 return -EOPNOTSUPP;
76
77 acl = reiserfs_get_acl(dentry->d_inode, type);
78 if (IS_ERR(acl))
79 return PTR_ERR(acl);
80 if (acl == NULL)
81 return -ENODATA;
82 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
83 posix_acl_release(acl);
84
85 return error; 46 return error;
86} 47}
87 48
88/* 49/*
89 * Convert from filesystem to in-memory representation. 50 * Convert from filesystem to in-memory representation.
90 */ 51 */
91static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) 52static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size)
92{ 53{
93 const char *end = (char *)value + size; 54 const char *end = (char *)value + size;
94 int n, count; 55 int n, count;
@@ -158,7 +119,7 @@ static struct posix_acl *posix_acl_from_disk(const void *value, size_t size)
158/* 119/*
159 * Convert from in-memory to filesystem representation. 120 * Convert from in-memory to filesystem representation.
160 */ 121 */
161static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) 122static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
162{ 123{
163 reiserfs_acl_header *ext_acl; 124 reiserfs_acl_header *ext_acl;
164 char *e; 125 char *e;
@@ -221,10 +182,6 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
221 int size; 182 int size;
222 int retval; 183 int retval;
223 184
224 acl = get_cached_acl(inode, type);
225 if (acl != ACL_NOT_CACHED)
226 return acl;
227
228 switch (type) { 185 switch (type) {
229 case ACL_TYPE_ACCESS: 186 case ACL_TYPE_ACCESS:
230 name = POSIX_ACL_XATTR_ACCESS; 187 name = POSIX_ACL_XATTR_ACCESS;
@@ -257,7 +214,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
257 } else if (retval < 0) { 214 } else if (retval < 0) {
258 acl = ERR_PTR(retval); 215 acl = ERR_PTR(retval);
259 } else { 216 } else {
260 acl = posix_acl_from_disk(value, retval); 217 acl = reiserfs_posix_acl_from_disk(value, retval);
261 } 218 }
262 if (!IS_ERR(acl)) 219 if (!IS_ERR(acl))
263 set_cached_acl(inode, type, acl); 220 set_cached_acl(inode, type, acl);
@@ -273,7 +230,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
273 * BKL held [before 2.5.x] 230 * BKL held [before 2.5.x]
274 */ 231 */
275static int 232static int
276reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, 233__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
277 int type, struct posix_acl *acl) 234 int type, struct posix_acl *acl)
278{ 235{
279 char *name; 236 char *name;
@@ -281,9 +238,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
281 size_t size = 0; 238 size_t size = 0;
282 int error; 239 int error;
283 240
284 if (S_ISLNK(inode->i_mode))
285 return -EOPNOTSUPP;
286
287 switch (type) { 241 switch (type) {
288 case ACL_TYPE_ACCESS: 242 case ACL_TYPE_ACCESS:
289 name = POSIX_ACL_XATTR_ACCESS; 243 name = POSIX_ACL_XATTR_ACCESS;
@@ -307,7 +261,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
307 } 261 }
308 262
309 if (acl) { 263 if (acl) {
310 value = posix_acl_to_disk(acl, &size); 264 value = reiserfs_posix_acl_to_disk(acl, &size);
311 if (IS_ERR(value)) 265 if (IS_ERR(value))
312 return (int)PTR_ERR(value); 266 return (int)PTR_ERR(value);
313 } 267 }
@@ -343,7 +297,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
343 struct inode *dir, struct dentry *dentry, 297 struct inode *dir, struct dentry *dentry,
344 struct inode *inode) 298 struct inode *inode)
345{ 299{
346 struct posix_acl *acl; 300 struct posix_acl *default_acl, *acl;
347 int err = 0; 301 int err = 0;
348 302
349 /* ACLs only get applied to files and directories */ 303 /* ACLs only get applied to files and directories */
@@ -363,37 +317,28 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
363 goto apply_umask; 317 goto apply_umask;
364 } 318 }
365 319
366 acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT); 320 err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
367 if (IS_ERR(acl)) 321 if (err)
368 return PTR_ERR(acl); 322 return err;
369 323
324 if (default_acl) {
325 err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
326 default_acl);
327 posix_acl_release(default_acl);
328 }
370 if (acl) { 329 if (acl) {
371 /* Copy the default ACL to the default ACL of a new directory */ 330 if (!err)
372 if (S_ISDIR(inode->i_mode)) { 331 err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS,
373 err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, 332 acl);
374 acl);
375 if (err)
376 goto cleanup;
377 }
378
379 /* Now we reconcile the new ACL and the mode,
380 potentially modifying both */
381 err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
382 if (err < 0)
383 return err;
384
385 /* If we need an ACL.. */
386 if (err > 0)
387 err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl);
388 cleanup:
389 posix_acl_release(acl); 333 posix_acl_release(acl);
390 } else {
391 apply_umask:
392 /* no ACL, apply umask */
393 inode->i_mode &= ~current_umask();
394 } 334 }
395 335
396 return err; 336 return err;
337
338 apply_umask:
339 /* no ACL, apply umask */
340 inode->i_mode &= ~current_umask();
341 return err;
397} 342}
398 343
399/* This is used to cache the default acl before a new object is created. 344/* This is used to cache the default acl before a new object is created.
@@ -442,84 +387,11 @@ int reiserfs_cache_default_acl(struct inode *inode)
442 */ 387 */
443int reiserfs_acl_chmod(struct inode *inode) 388int reiserfs_acl_chmod(struct inode *inode)
444{ 389{
445 struct reiserfs_transaction_handle th;
446 struct posix_acl *acl;
447 size_t size;
448 int error;
449
450 if (IS_PRIVATE(inode)) 390 if (IS_PRIVATE(inode))
451 return 0; 391 return 0;
452
453 if (S_ISLNK(inode->i_mode))
454 return -EOPNOTSUPP;
455
456 if (get_inode_sd_version(inode) == STAT_DATA_V1 || 392 if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
457 !reiserfs_posixacl(inode->i_sb)) { 393 !reiserfs_posixacl(inode->i_sb))
458 return 0; 394 return 0;
459 }
460 395
461 acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); 396 return posix_acl_chmod(inode, inode->i_mode);
462 if (!acl)
463 return 0;
464 if (IS_ERR(acl))
465 return PTR_ERR(acl);
466 error = posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode);
467 if (error)
468 return error;
469
470 size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count));
471 reiserfs_write_lock(inode->i_sb);
472 error = journal_begin(&th, inode->i_sb, size * 2);
473 reiserfs_write_unlock(inode->i_sb);
474 if (!error) {
475 int error2;
476 error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl);
477 reiserfs_write_lock(inode->i_sb);
478 error2 = journal_end(&th, inode->i_sb, size * 2);
479 reiserfs_write_unlock(inode->i_sb);
480 if (error2)
481 error = error2;
482 }
483 posix_acl_release(acl);
484 return error;
485}
486
487static size_t posix_acl_access_list(struct dentry *dentry, char *list,
488 size_t list_size, const char *name,
489 size_t name_len, int type)
490{
491 const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
492 if (!reiserfs_posixacl(dentry->d_sb))
493 return 0;
494 if (list && size <= list_size)
495 memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
496 return size;
497} 397}
498
499const struct xattr_handler reiserfs_posix_acl_access_handler = {
500 .prefix = POSIX_ACL_XATTR_ACCESS,
501 .flags = ACL_TYPE_ACCESS,
502 .get = posix_acl_get,
503 .set = posix_acl_set,
504 .list = posix_acl_access_list,
505};
506
507static size_t posix_acl_default_list(struct dentry *dentry, char *list,
508 size_t list_size, const char *name,
509 size_t name_len, int type)
510{
511 const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
512 if (!reiserfs_posixacl(dentry->d_sb))
513 return 0;
514 if (list && size <= list_size)
515 memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
516 return size;
517}
518
519const struct xattr_handler reiserfs_posix_acl_default_handler = {
520 .prefix = POSIX_ACL_XATTR_DEFAULT,
521 .flags = ACL_TYPE_DEFAULT,
522 .get = posix_acl_get,
523 .set = posix_acl_set,
524 .list = posix_acl_default_list,
525};
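The rewritten reiserfs_inherit_default_acl() above follows the common v3.14 idiom: posix_acl_create() reads the parent directory's default ACL, applies the umask where appropriate, and hands back both the default and access ACLs with inode->i_mode already adjusted, so the filesystem only has to store them. A condensed sketch of that idiom, with set_one_acl() as a hypothetical stand-in for the filesystem's own setter (__reiserfs_set_acl() here):

#include <linux/fs.h>
#include <linux/posix_acl.h>

/* Stand-in for the filesystem-specific setter. */
static int set_one_acl(struct inode *inode, int type, struct posix_acl *acl)
{
        return 0;
}

static int example_init_acl(struct inode *inode, struct inode *dir)
{
        struct posix_acl *default_acl, *acl;
        int err;

        /* Computes both ACLs and fixes up inode->i_mode in one call. */
        err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
        if (err)
                return err;

        if (default_acl) {
                err = set_one_acl(inode, ACL_TYPE_DEFAULT, default_acl);
                posix_acl_release(default_acl);
        }
        if (acl) {
                if (!err)
                        err = set_one_acl(inode, ACL_TYPE_ACCESS, acl);
                posix_acl_release(acl);
        }
        return err;
}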
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index ff1d3d42e72a..d8418782862b 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -533,16 +533,14 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
533 533
534 root = romfs_iget(sb, pos); 534 root = romfs_iget(sb, pos);
535 if (IS_ERR(root)) 535 if (IS_ERR(root))
536 goto error; 536 return PTR_ERR(root);
537 537
538 sb->s_root = d_make_root(root); 538 sb->s_root = d_make_root(root);
539 if (!sb->s_root) 539 if (!sb->s_root)
540 goto error; 540 return -ENOMEM;
541 541
542 return 0; 542 return 0;
543 543
544error:
545 return -EINVAL;
546error_rsb_inval: 544error_rsb_inval:
547 ret = -EINVAL; 545 ret = -EINVAL;
548error_rsb: 546error_rsb:
diff --git a/fs/splice.c b/fs/splice.c
index 46a08f772d7d..12028fa41def 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -555,6 +555,24 @@ static const struct pipe_buf_operations default_pipe_buf_ops = {
555 .get = generic_pipe_buf_get, 555 .get = generic_pipe_buf_get,
556}; 556};
557 557
558static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
559 struct pipe_buffer *buf)
560{
561 return 1;
562}
563
564/* Pipe buffer operations for a socket and similar. */
565const struct pipe_buf_operations nosteal_pipe_buf_ops = {
566 .can_merge = 0,
567 .map = generic_pipe_buf_map,
568 .unmap = generic_pipe_buf_unmap,
569 .confirm = generic_pipe_buf_confirm,
570 .release = generic_pipe_buf_release,
571 .steal = generic_pipe_buf_nosteal,
572 .get = generic_pipe_buf_get,
573};
574EXPORT_SYMBOL(nosteal_pipe_buf_ops);
575
558static ssize_t kernel_readv(struct file *file, const struct iovec *vec, 576static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
559 unsigned long vlen, loff_t offset) 577 unsigned long vlen, loff_t offset)
560{ 578{
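nosteal_pipe_buf_ops, added and exported above, is for producers whose pages must never be stolen out of the pipe: its .steal hook always returns nonzero, so splice falls back to copying rather than taking ownership of the page. A hedged sketch of a producer installing it on a buffer (hypothetical driver code; field names as in struct pipe_buffer):

#include <linux/pipe_fs_i.h>

/* Hypothetical producer: expose PAGE_SIZE bytes of `page` through a
 * pipe buffer that readers may only copy, never steal. */
static void example_fill_buf(struct pipe_buffer *buf, struct page *page)
{
        buf->page = page;
        buf->offset = 0;
        buf->len = PAGE_SIZE;
        buf->ops = &nosteal_pipe_buf_ops;   /* .steal fails -> always copy */
        buf->flags = 0;
}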
diff --git a/fs/super.c b/fs/super.c
index e5f6c2cfac38..80d5cf2ca765 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -166,6 +166,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
166 if (!s) 166 if (!s)
167 return NULL; 167 return NULL;
168 168
169 INIT_LIST_HEAD(&s->s_mounts);
170
169 if (security_sb_alloc(s)) 171 if (security_sb_alloc(s))
170 goto fail; 172 goto fail;
171 173
@@ -188,7 +190,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
188 if (list_lru_init(&s->s_inode_lru)) 190 if (list_lru_init(&s->s_inode_lru))
189 goto fail; 191 goto fail;
190 192
191 INIT_LIST_HEAD(&s->s_mounts);
192 init_rwsem(&s->s_umount); 193 init_rwsem(&s->s_umount);
193 lockdep_set_class(&s->s_umount, &type->s_umount_key); 194 lockdep_set_class(&s->s_umount, &type->s_umount_key);
194 /* 195 /*
@@ -702,7 +703,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
702 if (flags & MS_RDONLY) 703 if (flags & MS_RDONLY)
703 acct_auto_close(sb); 704 acct_auto_close(sb);
704 shrink_dcache_sb(sb); 705 shrink_dcache_sb(sb);
705 sync_filesystem(sb);
706 706
707 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 707 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
708 708
@@ -719,6 +719,8 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
719 } 719 }
720 } 720 }
721 721
722 sync_filesystem(sb);
723
722 if (sb->s_op->remount_fs) { 724 if (sb->s_op->remount_fs) {
723 retval = sb->s_op->remount_fs(sb, &flags, data); 725 retval = sb->s_op->remount_fs(sb, &flags, data);
724 if (retval) { 726 if (retval) {
diff --git a/fs/sync.c b/fs/sync.c
index f15537452231..b28d1dd10e8b 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,11 +27,10 @@
27 * wait == 1 case since in that case write_inode() functions do 27 * wait == 1 case since in that case write_inode() functions do
28 * sync_dirty_buffer() and thus effectively write one block at a time. 28 * sync_dirty_buffer() and thus effectively write one block at a time.
29 */ 29 */
30static int __sync_filesystem(struct super_block *sb, int wait, 30static int __sync_filesystem(struct super_block *sb, int wait)
31 unsigned long start)
32{ 31{
33 if (wait) 32 if (wait)
34 sync_inodes_sb(sb, start); 33 sync_inodes_sb(sb);
35 else 34 else
36 writeback_inodes_sb(sb, WB_REASON_SYNC); 35 writeback_inodes_sb(sb, WB_REASON_SYNC);
37 36
@@ -48,7 +47,6 @@ static int __sync_filesystem(struct super_block *sb, int wait,
48int sync_filesystem(struct super_block *sb) 47int sync_filesystem(struct super_block *sb)
49{ 48{
50 int ret; 49 int ret;
51 unsigned long start = jiffies;
52 50
53 /* 51 /*
54 * We need to be protected against the filesystem going from 52 * We need to be protected against the filesystem going from
@@ -62,17 +60,17 @@ int sync_filesystem(struct super_block *sb)
62 if (sb->s_flags & MS_RDONLY) 60 if (sb->s_flags & MS_RDONLY)
63 return 0; 61 return 0;
64 62
65 ret = __sync_filesystem(sb, 0, start); 63 ret = __sync_filesystem(sb, 0);
66 if (ret < 0) 64 if (ret < 0)
67 return ret; 65 return ret;
68 return __sync_filesystem(sb, 1, start); 66 return __sync_filesystem(sb, 1);
69} 67}
70EXPORT_SYMBOL_GPL(sync_filesystem); 68EXPORT_SYMBOL_GPL(sync_filesystem);
71 69
72static void sync_inodes_one_sb(struct super_block *sb, void *arg) 70static void sync_inodes_one_sb(struct super_block *sb, void *arg)
73{ 71{
74 if (!(sb->s_flags & MS_RDONLY)) 72 if (!(sb->s_flags & MS_RDONLY))
75 sync_inodes_sb(sb, *((unsigned long *)arg)); 73 sync_inodes_sb(sb);
76} 74}
77 75
78static void sync_fs_one_sb(struct super_block *sb, void *arg) 76static void sync_fs_one_sb(struct super_block *sb, void *arg)
@@ -104,10 +102,9 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
104SYSCALL_DEFINE0(sync) 102SYSCALL_DEFINE0(sync)
105{ 103{
106 int nowait = 0, wait = 1; 104 int nowait = 0, wait = 1;
107 unsigned long start = jiffies;
108 105
109 wakeup_flusher_threads(0, WB_REASON_SYNC); 106 wakeup_flusher_threads(0, WB_REASON_SYNC);
110 iterate_supers(sync_inodes_one_sb, &start); 107 iterate_supers(sync_inodes_one_sb, NULL);
111 iterate_supers(sync_fs_one_sb, &nowait); 108 iterate_supers(sync_fs_one_sb, &nowait);
112 iterate_supers(sync_fs_one_sb, &wait); 109 iterate_supers(sync_fs_one_sb, &wait);
113 iterate_bdevs(fdatawrite_one_bdev, NULL); 110 iterate_bdevs(fdatawrite_one_bdev, NULL);
@@ -222,23 +219,6 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
222 return do_fsync(fd, 1); 219 return do_fsync(fd, 1);
223} 220}
224 221
225/**
226 * generic_write_sync - perform syncing after a write if file / inode is sync
227 * @file: file to which the write happened
228 * @pos: offset where the write started
229 * @count: length of the write
230 *
231 * This is just a simple wrapper about our general syncing function.
232 */
233int generic_write_sync(struct file *file, loff_t pos, loff_t count)
234{
235 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
236 return 0;
237 return vfs_fsync_range(file, pos, pos + count - 1,
238 (file->f_flags & __O_SYNC) ? 0 : 1);
239}
240EXPORT_SYMBOL(generic_write_sync);
241
242/* 222/*
243 * sys_sync_file_range() permits finely controlled syncing over a segment of 223 * sys_sync_file_range() permits finely controlled syncing over a segment of
244 * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is 224 * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
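The fs/sync.c hunks above come from reverting the "writeback: do not sync data dirtied after sync start" approach: sync_inodes_sb() loses its start-time argument, so __sync_filesystem() and the sync(2) path stop threading a jiffies value through. The resulting calling convention is a plain two-pass pattern, sketched below (same shape as the new __sync_filesystem(), shown as a standalone caller):

#include <linux/fs.h>
#include <linux/writeback.h>

/* Sketch of the post-revert two-pass sync: kick off writeback without
 * waiting, then do the waiting pass over all dirty inodes. */
static void example_two_pass_sync(struct super_block *sb)
{
        writeback_inodes_sb(sb, WB_REASON_SYNC);    /* wait == 0 pass */
        sync_inodes_sb(sb);                         /* wait == 1 pass */
}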
diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile
index 8876ac183373..6eff6e1205a5 100644
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -2,4 +2,4 @@
2# Makefile for the sysfs virtual filesystem 2# Makefile for the sysfs virtual filesystem
3# 3#
4 4
5obj-y := inode.o file.o dir.o symlink.o mount.o group.o 5obj-y := file.o dir.o symlink.o mount.o group.o
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5e73d6626e50..ee0d761c3179 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -13,465 +13,31 @@
13#undef DEBUG 13#undef DEBUG
14 14
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/mount.h>
17#include <linux/module.h>
18#include <linux/kobject.h> 16#include <linux/kobject.h>
19#include <linux/namei.h>
20#include <linux/idr.h>
21#include <linux/completion.h>
22#include <linux/mutex.h>
23#include <linux/slab.h> 17#include <linux/slab.h>
24#include <linux/security.h>
25#include <linux/hash.h>
26#include "sysfs.h" 18#include "sysfs.h"
27 19
28DEFINE_MUTEX(sysfs_mutex);
29DEFINE_SPINLOCK(sysfs_symlink_target_lock); 20DEFINE_SPINLOCK(sysfs_symlink_target_lock);
30 21
31#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
32
33static DEFINE_SPINLOCK(sysfs_ino_lock);
34static DEFINE_IDA(sysfs_ino_ida);
35
36/**
37 * sysfs_name_hash
38 * @name: Null terminated string to hash
39 * @ns: Namespace tag to hash
40 *
41 * Returns 31 bit hash of ns + name (so it fits in an off_t )
42 */
43static unsigned int sysfs_name_hash(const char *name, const void *ns)
44{
45 unsigned long hash = init_name_hash();
46 unsigned int len = strlen(name);
47 while (len--)
48 hash = partial_name_hash(*name++, hash);
49 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
50 hash &= 0x7fffffffU;
51 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
52 if (hash < 1)
53 hash += 2;
54 if (hash >= INT_MAX)
55 hash = INT_MAX - 1;
56 return hash;
57}
58
59static int sysfs_name_compare(unsigned int hash, const char *name,
60 const void *ns, const struct sysfs_dirent *sd)
61{
62 if (hash != sd->s_hash)
63 return hash - sd->s_hash;
64 if (ns != sd->s_ns)
65 return ns - sd->s_ns;
66 return strcmp(name, sd->s_name);
67}
68
69static int sysfs_sd_compare(const struct sysfs_dirent *left,
70 const struct sysfs_dirent *right)
71{
72 return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns,
73 right);
74}
75
76/**
77 * sysfs_link_sibling - link sysfs_dirent into sibling rbtree
78 * @sd: sysfs_dirent of interest
79 *
80 * Link @sd into its sibling rbtree which starts from
81 * sd->s_parent->s_dir.children.
82 *
83 * Locking:
84 * mutex_lock(sysfs_mutex)
85 *
86 * RETURNS:
87 * 0 on susccess -EEXIST on failure.
88 */
89static int sysfs_link_sibling(struct sysfs_dirent *sd)
90{
91 struct rb_node **node = &sd->s_parent->s_dir.children.rb_node;
92 struct rb_node *parent = NULL;
93
94 if (sysfs_type(sd) == SYSFS_DIR)
95 sd->s_parent->s_dir.subdirs++;
96
97 while (*node) {
98 struct sysfs_dirent *pos;
99 int result;
100
101 pos = to_sysfs_dirent(*node);
102 parent = *node;
103 result = sysfs_sd_compare(sd, pos);
104 if (result < 0)
105 node = &pos->s_rb.rb_left;
106 else if (result > 0)
107 node = &pos->s_rb.rb_right;
108 else
109 return -EEXIST;
110 }
111 /* add new node and rebalance the tree */
112 rb_link_node(&sd->s_rb, parent, node);
113 rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children);
114 return 0;
115}
116
117/**
118 * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree
119 * @sd: sysfs_dirent of interest
120 *
121 * Unlink @sd from its sibling rbtree which starts from
122 * sd->s_parent->s_dir.children.
123 *
124 * Locking:
125 * mutex_lock(sysfs_mutex)
126 */
127static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
128{
129 if (sysfs_type(sd) == SYSFS_DIR)
130 sd->s_parent->s_dir.subdirs--;
131
132 rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
133}
134
135/**
136 * sysfs_get_active - get an active reference to sysfs_dirent
137 * @sd: sysfs_dirent to get an active reference to
138 *
139 * Get an active reference of @sd. This function is a noop if @sd
140 * is NULL.
141 *
142 * RETURNS:
143 * Pointer to @sd on success, NULL on failure.
144 */
145struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
146{
147 if (unlikely(!sd))
148 return NULL;
149
150 if (!atomic_inc_unless_negative(&sd->s_active))
151 return NULL;
152
153 if (likely(!sysfs_ignore_lockdep(sd)))
154 rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
155 return sd;
156}
157
158/**
159 * sysfs_put_active - put an active reference to sysfs_dirent
160 * @sd: sysfs_dirent to put an active reference to
161 *
162 * Put an active reference to @sd. This function is a noop if @sd
163 * is NULL.
164 */
165void sysfs_put_active(struct sysfs_dirent *sd)
166{
167 int v;
168
169 if (unlikely(!sd))
170 return;
171
172 if (likely(!sysfs_ignore_lockdep(sd)))
173 rwsem_release(&sd->dep_map, 1, _RET_IP_);
174 v = atomic_dec_return(&sd->s_active);
175 if (likely(v != SD_DEACTIVATED_BIAS))
176 return;
177
178 /* atomic_dec_return() is a mb(), we'll always see the updated
179 * sd->u.completion.
180 */
181 complete(sd->u.completion);
182}
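A minimal caller sketch of the active-reference pairing above (do_work() is
a hypothetical payload, not part of this patch): every operation on an
attribute is bracketed by get/put so a concurrent removal can drain users.

	static ssize_t example_op(struct sysfs_dirent *sd)
	{
		ssize_t ret;

		if (!sysfs_get_active(sd))	/* node is being deactivated */
			return -ENODEV;
		ret = do_work(sd);		/* hypothetical work under active ref */
		sysfs_put_active(sd);		/* may complete a waiting sysfs_deactivate() */
		return ret;
	}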
183
184/**
185 * sysfs_deactivate - deactivate sysfs_dirent
186 * @sd: sysfs_dirent to deactivate
187 *
188 * Deny new active references and drain existing ones.
189 */
190static void sysfs_deactivate(struct sysfs_dirent *sd)
191{
192 DECLARE_COMPLETION_ONSTACK(wait);
193 int v;
194
195 BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
196
197 if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
198 return;
199
200 sd->u.completion = (void *)&wait;
201
202 rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
203 /* atomic_add_return() is a mb(), put_active() will always see
204 * the updated sd->u.completion.
205 */
206 v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
207
208 if (v != SD_DEACTIVATED_BIAS) {
209 lock_contended(&sd->dep_map, _RET_IP_);
210 wait_for_completion(&wait);
211 }
212
213 lock_acquired(&sd->dep_map, _RET_IP_);
214 rwsem_release(&sd->dep_map, 1, _RET_IP_);
215}
216
217static int sysfs_alloc_ino(unsigned int *pino)
218{
219 int ino, rc;
220
221 retry:
222 spin_lock(&sysfs_ino_lock);
223 rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino);
224 spin_unlock(&sysfs_ino_lock);
225
226 if (rc == -EAGAIN) {
227 if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL))
228 goto retry;
229 rc = -ENOMEM;
230 }
231
232 *pino = ino;
233 return rc;
234}
235
236static void sysfs_free_ino(unsigned int ino)
237{
238 spin_lock(&sysfs_ino_lock);
239 ida_remove(&sysfs_ino_ida, ino);
240 spin_unlock(&sysfs_ino_lock);
241}
242
243void release_sysfs_dirent(struct sysfs_dirent *sd)
244{
245 struct sysfs_dirent *parent_sd;
246
247 repeat:
248 /* Moving/renaming is always done while holding a reference.
249 * sd->s_parent won't change beneath us.
250 */
251 parent_sd = sd->s_parent;
252
253 WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED),
254 "sysfs: free using entry: %s/%s\n",
255 parent_sd ? parent_sd->s_name : "", sd->s_name);
256
257 if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
258 sysfs_put(sd->s_symlink.target_sd);
259 if (sysfs_type(sd) & SYSFS_COPY_NAME)
260 kfree(sd->s_name);
261 if (sd->s_iattr && sd->s_iattr->ia_secdata)
262 security_release_secctx(sd->s_iattr->ia_secdata,
263 sd->s_iattr->ia_secdata_len);
264 kfree(sd->s_iattr);
265 sysfs_free_ino(sd->s_ino);
266 kmem_cache_free(sysfs_dir_cachep, sd);
267
268 sd = parent_sd;
269 if (sd && atomic_dec_and_test(&sd->s_count))
270 goto repeat;
271}
272
273static int sysfs_dentry_delete(const struct dentry *dentry)
274{
275 struct sysfs_dirent *sd = dentry->d_fsdata;
276 return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED));
277}
278
279static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
280{
281 struct sysfs_dirent *sd;
282 int type;
283
284 if (flags & LOOKUP_RCU)
285 return -ECHILD;
286
287 sd = dentry->d_fsdata;
288 mutex_lock(&sysfs_mutex);
289
290 /* The sysfs dirent has been deleted */
291 if (sd->s_flags & SYSFS_FLAG_REMOVED)
292 goto out_bad;
293
294 /* The sysfs dirent has been moved? */
295 if (dentry->d_parent->d_fsdata != sd->s_parent)
296 goto out_bad;
297
298 /* The sysfs dirent has been renamed */
299 if (strcmp(dentry->d_name.name, sd->s_name) != 0)
300 goto out_bad;
301
302 /* The sysfs dirent has been moved to a different namespace */
303 type = KOBJ_NS_TYPE_NONE;
304 if (sd->s_parent) {
305 type = sysfs_ns_type(sd->s_parent);
306 if (type != KOBJ_NS_TYPE_NONE &&
307 sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns)
308 goto out_bad;
309 }
310
311 mutex_unlock(&sysfs_mutex);
312out_valid:
313 return 1;
314out_bad:
315 /* Remove the dentry from the dcache hashes.
316 * If this is a deleted dentry, we use d_drop instead of d_delete
317 * so sysfs doesn't need to cope with negative dentries.
318 *
319 * If this is a dentry that has simply been renamed, we
320 * use d_drop to remove it from the dcache lookup on its
321 * old parent. If this dentry persists, then when a lookup
322 * is later performed at its new name, the dentry will be re-added
323 * to the dcache hashes.
324 */
325 mutex_unlock(&sysfs_mutex);
326
327 /* If we have submounts we must allow the vfs caches
328 * to lie about the state of the filesystem to prevent
329 * leaks and other nasty things.
330 */
331 if (check_submounts_and_drop(dentry) != 0)
332 goto out_valid;
333
334 return 0;
335}
336
337static void sysfs_dentry_release(struct dentry *dentry)
338{
339 sysfs_put(dentry->d_fsdata);
340}
341
342const struct dentry_operations sysfs_dentry_ops = {
343 .d_revalidate = sysfs_dentry_revalidate,
344 .d_delete = sysfs_dentry_delete,
345 .d_release = sysfs_dentry_release,
346};
347
348struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
349{
350 char *dup_name = NULL;
351 struct sysfs_dirent *sd;
352
353 if (type & SYSFS_COPY_NAME) {
354 name = dup_name = kstrdup(name, GFP_KERNEL);
355 if (!name)
356 return NULL;
357 }
358
359 sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL);
360 if (!sd)
361 goto err_out1;
362
363 if (sysfs_alloc_ino(&sd->s_ino))
364 goto err_out2;
365
366 atomic_set(&sd->s_count, 1);
367 atomic_set(&sd->s_active, 0);
368
369 sd->s_name = name;
370 sd->s_mode = mode;
371 sd->s_flags = type | SYSFS_FLAG_REMOVED;
372
373 return sd;
374
375 err_out2:
376 kmem_cache_free(sysfs_dir_cachep, sd);
377 err_out1:
378 kfree(dup_name);
379 return NULL;
380}
381
382/**
383 * sysfs_addrm_start - prepare for sysfs_dirent add/remove
384 * @acxt: pointer to sysfs_addrm_cxt to be used
385 *
386 * This function is called when the caller is about to add or remove
387 * sysfs_dirent. This function acquires sysfs_mutex. @acxt is used
388 * to keep and pass context to other addrm functions.
389 *
390 * LOCKING:
391 * Kernel thread context (may sleep). sysfs_mutex is locked on
392 * return.
393 */
394void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt)
395 __acquires(sysfs_mutex)
396{
397 memset(acxt, 0, sizeof(*acxt));
398
399 mutex_lock(&sysfs_mutex);
400}
401
402/**
403 * __sysfs_add_one - add sysfs_dirent to parent without warning
404 * @acxt: addrm context to use
405 * @sd: sysfs_dirent to be added
406 * @parent_sd: the parent sysfs_dirent to add @sd to
407 *
408 * Get @parent_sd, set @sd->s_parent to it, increment nlink of the
409 * parent inode if @sd is a directory, and link @sd into the parent's
410 * children list.
411 *
412 * This function should be called between calls to
413 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
414 * passed the same @acxt as passed to sysfs_addrm_start().
415 *
416 * LOCKING:
417 * Determined by sysfs_addrm_start().
418 *
419 * RETURNS:
420 * 0 on success, -EEXIST if entry with the given name already
421 * exists.
422 */
423int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
424 struct sysfs_dirent *parent_sd)
425{
426 struct sysfs_inode_attrs *ps_iattr;
427 int ret;
428
429 if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) {
430 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
431 sysfs_ns_type(parent_sd) ? "required" : "invalid",
432 parent_sd->s_name, sd->s_name);
433 return -EINVAL;
434 }
435
436 sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
437 sd->s_parent = sysfs_get(parent_sd);
438
439 ret = sysfs_link_sibling(sd);
440 if (ret)
441 return ret;
442
443 /* Update timestamps on the parent */
444 ps_iattr = parent_sd->s_iattr;
445 if (ps_iattr) {
446 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
447 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
448 }
449
450 /* Mark the entry added into directory tree */
451 sd->s_flags &= ~SYSFS_FLAG_REMOVED;
452
453 return 0;
454}
455
456/** 22/**
457 * sysfs_pathname - return full path to sysfs dirent 23 * sysfs_pathname - return full path to sysfs dirent
458 * @sd: sysfs_dirent whose path we want 24 * @kn: kernfs_node whose path we want
459 * @path: caller allocated buffer of size PATH_MAX 25 * @path: caller allocated buffer of size PATH_MAX
460 * 26 *
461 * Gives the name "/" to the sysfs_root entry; any path returned 27 * Gives the name "/" to the sysfs_root entry; any path returned
462 * is relative to wherever sysfs is mounted. 28 * is relative to wherever sysfs is mounted.
463 */ 29 */
464static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) 30static char *sysfs_pathname(struct kernfs_node *kn, char *path)
465{ 31{
466 if (sd->s_parent) { 32 if (kn->parent) {
467 sysfs_pathname(sd->s_parent, path); 33 sysfs_pathname(kn->parent, path);
468 strlcat(path, "/", PATH_MAX); 34 strlcat(path, "/", PATH_MAX);
469 } 35 }
470 strlcat(path, sd->s_name, PATH_MAX); 36 strlcat(path, kn->name, PATH_MAX);
471 return path; 37 return path;
472} 38}
473 39
474void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) 40void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
475{ 41{
476 char *path; 42 char *path;
477 43
@@ -489,445 +55,34 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name)
489} 55}
490 56
491/** 57/**
492 * sysfs_add_one - add sysfs_dirent to parent
493 * @acxt: addrm context to use
494 * @sd: sysfs_dirent to be added
495 * @parent_sd: the parent sysfs_dirent to add @sd to
496 *
497 * Get @parent_sd, set @sd->s_parent to it, increment nlink of the
498 * parent inode if @sd is a directory, and link @sd into the parent's
499 * children list.
500 *
501 * This function should be called between calls to
502 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
503 * passed the same @acxt as passed to sysfs_addrm_start().
504 *
505 * LOCKING:
506 * Determined by sysfs_addrm_start().
507 *
508 * RETURNS:
509 * 0 on success, -EEXIST if entry with the given name already
510 * exists.
511 */
512int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
513 struct sysfs_dirent *parent_sd)
514{
515 int ret;
516
517 ret = __sysfs_add_one(acxt, sd, parent_sd);
518
519 if (ret == -EEXIST)
520 sysfs_warn_dup(parent_sd, sd->s_name);
521 return ret;
522}
523
524/**
525 * sysfs_remove_one - remove sysfs_dirent from parent
526 * @acxt: addrm context to use
527 * @sd: sysfs_dirent to be removed
528 *
529 * Mark @sd removed and drop nlink of parent inode if @sd is a
530 * directory. @sd is unlinked from the children list.
531 *
532 * This function should be called between calls to
533 * sysfs_addrm_start() and sysfs_addrm_finish() and should be
534 * passed the same @acxt as passed to sysfs_addrm_start().
535 *
536 * LOCKING:
537 * Determined by sysfs_addrm_start().
538 */
539static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
540 struct sysfs_dirent *sd)
541{
542 struct sysfs_inode_attrs *ps_iattr;
543
544 /*
545 * Removal can be called multiple times on the same node. Only the
546 * first invocation is effective and puts the base ref.
547 */
548 if (sd->s_flags & SYSFS_FLAG_REMOVED)
549 return;
550
551 sysfs_unlink_sibling(sd);
552
553 /* Update timestamps on the parent */
554 ps_iattr = sd->s_parent->s_iattr;
555 if (ps_iattr) {
556 struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
557 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
558 }
559
560 sd->s_flags |= SYSFS_FLAG_REMOVED;
561 sd->u.removed_list = acxt->removed;
562 acxt->removed = sd;
563}
564
565/**
566 * sysfs_addrm_finish - finish up sysfs_dirent add/remove
567 * @acxt: addrm context to finish up
568 *
569 * Finish up sysfs_dirent add/remove. Resources acquired by
570 * sysfs_addrm_start() are released and removed sysfs_dirents are
571 * cleaned up.
572 *
573 * LOCKING:
574 * sysfs_mutex is released.
575 */
576void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
577 __releases(sysfs_mutex)
578{
579 /* release resources acquired by sysfs_addrm_start() */
580 mutex_unlock(&sysfs_mutex);
581
582 /* kill removed sysfs_dirents */
583 while (acxt->removed) {
584 struct sysfs_dirent *sd = acxt->removed;
585
586 acxt->removed = sd->u.removed_list;
587
588 sysfs_deactivate(sd);
589 sysfs_unmap_bin_file(sd);
590 sysfs_put(sd);
591 }
592}
593
594/**
595 * sysfs_find_dirent - find sysfs_dirent with the given name
596 * @parent_sd: sysfs_dirent to search under
597 * @name: name to look for
598 * @ns: the namespace tag to use
599 *
600 * Look for sysfs_dirent with name @name under @parent_sd.
601 *
602 * LOCKING:
603 * mutex_lock(sysfs_mutex)
604 *
605 * RETURNS:
606 * Pointer to sysfs_dirent if found, NULL if not.
607 */
608struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
609 const unsigned char *name,
610 const void *ns)
611{
612 struct rb_node *node = parent_sd->s_dir.children.rb_node;
613 unsigned int hash;
614
615 if (!!sysfs_ns_type(parent_sd) != !!ns) {
616 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
617 sysfs_ns_type(parent_sd) ? "required" : "invalid",
618 parent_sd->s_name, name);
619 return NULL;
620 }
621
622 hash = sysfs_name_hash(name, ns);
623 while (node) {
624 struct sysfs_dirent *sd;
625 int result;
626
627 sd = to_sysfs_dirent(node);
628 result = sysfs_name_compare(hash, name, ns, sd);
629 if (result < 0)
630 node = node->rb_left;
631 else if (result > 0)
632 node = node->rb_right;
633 else
634 return sd;
635 }
636 return NULL;
637}
638
639/**
640 * sysfs_get_dirent_ns - find and get sysfs_dirent with the given name
641 * @parent_sd: sysfs_dirent to search under
642 * @name: name to look for
643 * @ns: the namespace tag to use
644 *
645 * Look for sysfs_dirent with name @name under @parent_sd and get
646 * it if found.
647 *
648 * LOCKING:
649 * Kernel thread context (may sleep). Grabs sysfs_mutex.
650 *
651 * RETURNS:
652 * Pointer to sysfs_dirent if found, NULL if not.
653 */
654struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
655 const unsigned char *name,
656 const void *ns)
657{
658 struct sysfs_dirent *sd;
659
660 mutex_lock(&sysfs_mutex);
661 sd = sysfs_find_dirent(parent_sd, name, ns);
662 sysfs_get(sd);
663 mutex_unlock(&sysfs_mutex);
664
665 return sd;
666}
667EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns);
668
669static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
670 enum kobj_ns_type type,
671 const char *name, const void *ns,
672 struct sysfs_dirent **p_sd)
673{
674 umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
675 struct sysfs_addrm_cxt acxt;
676 struct sysfs_dirent *sd;
677 int rc;
678
679 /* allocate */
680 sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
681 if (!sd)
682 return -ENOMEM;
683
684 sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
685 sd->s_ns = ns;
686 sd->s_dir.kobj = kobj;
687
688 /* link in */
689 sysfs_addrm_start(&acxt);
690 rc = sysfs_add_one(&acxt, sd, parent_sd);
691 sysfs_addrm_finish(&acxt);
692
693 if (rc == 0)
694 *p_sd = sd;
695 else
696 sysfs_put(sd);
697
698 return rc;
699}
700
701int sysfs_create_subdir(struct kobject *kobj, const char *name,
702 struct sysfs_dirent **p_sd)
703{
704 return create_dir(kobj, kobj->sd,
705 KOBJ_NS_TYPE_NONE, name, NULL, p_sd);
706}
707
708/**
709 * sysfs_read_ns_type: return associated ns_type
710 * @kobj: the kobject being queried
711 *
712 * Each kobject can be tagged with exactly one namespace type
713 * (i.e. network or user). Return the ns_type associated with
714 * this object if any
715 */
716static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
717{
718 const struct kobj_ns_type_operations *ops;
719 enum kobj_ns_type type;
720
721 ops = kobj_child_ns_ops(kobj);
722 if (!ops)
723 return KOBJ_NS_TYPE_NONE;
724
725 type = ops->type;
726 BUG_ON(type <= KOBJ_NS_TYPE_NONE);
727 BUG_ON(type >= KOBJ_NS_TYPES);
728 BUG_ON(!kobj_ns_type_registered(type));
729
730 return type;
731}
732
733/**
734 * sysfs_create_dir_ns - create a directory for an object with a namespace tag 58 * sysfs_create_dir_ns - create a directory for an object with a namespace tag
735 * @kobj: object we're creating directory for 59 * @kobj: object we're creating directory for
736 * @ns: the namespace tag to use 60 * @ns: the namespace tag to use
737 */ 61 */
738int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) 62int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
739{ 63{
740 enum kobj_ns_type type; 64 struct kernfs_node *parent, *kn;
741 struct sysfs_dirent *parent_sd, *sd;
742 int error = 0;
743 65
744 BUG_ON(!kobj); 66 BUG_ON(!kobj);
745 67
746 if (kobj->parent) 68 if (kobj->parent)
747 parent_sd = kobj->parent->sd; 69 parent = kobj->parent->sd;
748 else 70 else
749 parent_sd = &sysfs_root; 71 parent = sysfs_root_kn;
750 72
751 if (!parent_sd) 73 if (!parent)
752 return -ENOENT; 74 return -ENOENT;
753 75
754 type = sysfs_read_ns_type(kobj); 76 kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
755 77 S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);
756 error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd); 78 if (IS_ERR(kn)) {
757 if (!error) 79 if (PTR_ERR(kn) == -EEXIST)
758 kobj->sd = sd; 80 sysfs_warn_dup(parent, kobject_name(kobj));
759 return error; 81 return PTR_ERR(kn);
760}
761
762static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
763 unsigned int flags)
764{
765 struct dentry *ret = NULL;
766 struct dentry *parent = dentry->d_parent;
767 struct sysfs_dirent *parent_sd = parent->d_fsdata;
768 struct sysfs_dirent *sd;
769 struct inode *inode;
770 enum kobj_ns_type type;
771 const void *ns;
772
773 mutex_lock(&sysfs_mutex);
774
775 type = sysfs_ns_type(parent_sd);
776 ns = sysfs_info(dir->i_sb)->ns[type];
777
778 sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns);
779
780 /* no such entry */
781 if (!sd) {
782 ret = ERR_PTR(-ENOENT);
783 goto out_unlock;
784 }
785 dentry->d_fsdata = sysfs_get(sd);
786
787 /* attach dentry and inode */
788 inode = sysfs_get_inode(dir->i_sb, sd);
789 if (!inode) {
790 ret = ERR_PTR(-ENOMEM);
791 goto out_unlock;
792 }
793
794 /* instantiate and hash dentry */
795 ret = d_materialise_unique(dentry, inode);
796 out_unlock:
797 mutex_unlock(&sysfs_mutex);
798 return ret;
799}
800
801const struct inode_operations sysfs_dir_inode_operations = {
802 .lookup = sysfs_lookup,
803 .permission = sysfs_permission,
804 .setattr = sysfs_setattr,
805 .getattr = sysfs_getattr,
806 .setxattr = sysfs_setxattr,
807};
808
809static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos)
810{
811 struct sysfs_dirent *last;
812
813 while (true) {
814 struct rb_node *rbn;
815
816 last = pos;
817
818 if (sysfs_type(pos) != SYSFS_DIR)
819 break;
820
821 rbn = rb_first(&pos->s_dir.children);
822 if (!rbn)
823 break;
824
825 pos = to_sysfs_dirent(rbn);
826 }
827
828 return last;
829}
830
831/**
832 * sysfs_next_descendant_post - find the next descendant for post-order walk
833 * @pos: the current position (%NULL to initiate traversal)
834 * @root: sysfs_dirent whose descendants to walk
835 *
836 * Find the next descendant to visit for post-order traversal of @root's
837 * descendants. @root is included in the iteration and is the last node to be
838 * visited.
839 */
840static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos,
841 struct sysfs_dirent *root)
842{
843 struct rb_node *rbn;
844
845 lockdep_assert_held(&sysfs_mutex);
846
847 /* if first iteration, visit leftmost descendant which may be root */
848 if (!pos)
849 return sysfs_leftmost_descendant(root);
850
851 /* if we visited @root, we're done */
852 if (pos == root)
853 return NULL;
854
855 /* if there's an unvisited sibling, visit its leftmost descendant */
856 rbn = rb_next(&pos->s_rb);
857 if (rbn)
858 return sysfs_leftmost_descendant(to_sysfs_dirent(rbn));
859
860 /* no sibling left, visit parent */
861 return pos->s_parent;
862}
863
864static void __sysfs_remove(struct sysfs_addrm_cxt *acxt,
865 struct sysfs_dirent *sd)
866{
867 struct sysfs_dirent *pos, *next;
868
869 if (!sd)
870 return;
871
872 pr_debug("sysfs %s: removing\n", sd->s_name);
873
874 next = NULL;
875 do {
876 pos = next;
877 next = sysfs_next_descendant_post(pos, sd);
878 if (pos)
879 sysfs_remove_one(acxt, pos);
880 } while (next);
881}
882
883/**
884 * sysfs_remove - remove a sysfs_dirent recursively
885 * @sd: the sysfs_dirent to remove
886 *
887 * Remove @sd along with all its subdirectories and files.
888 */
889void sysfs_remove(struct sysfs_dirent *sd)
890{
891 struct sysfs_addrm_cxt acxt;
892
893 sysfs_addrm_start(&acxt);
894 __sysfs_remove(&acxt, sd);
895 sysfs_addrm_finish(&acxt);
896}
897
898/**
899 * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it
900 * @dir_sd: parent of the target
901 * @name: name of the sysfs_dirent to remove
902 * @ns: namespace tag of the sysfs_dirent to remove
903 *
904 * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove
905 * it. Returns 0 on success, -ENOENT if such entry doesn't exist.
906 */
907int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
908 const void *ns)
909{
910 struct sysfs_addrm_cxt acxt;
911 struct sysfs_dirent *sd;
912
913 if (!dir_sd) {
914 WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
915 name);
916 return -ENOENT;
917 } 82 }
918 83
919 sysfs_addrm_start(&acxt); 84 kobj->sd = kn;
920 85 return 0;
921 sd = sysfs_find_dirent(dir_sd, name, ns);
922 if (sd)
923 __sysfs_remove(&acxt, sd);
924
925 sysfs_addrm_finish(&acxt);
926
927 if (sd)
928 return 0;
929 else
930 return -ENOENT;
931} 86}
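A hedged usage sketch: drivers rarely call sysfs_create_dir_ns() directly;
kobject_add() and friends invoke it on their behalf. example_ktype is an
assumed kobj_type defined elsewhere.

	static struct kobject example_kobj;

	static int example_register(struct kobject *parent)
	{
		/* kobject_init_and_add() ends up in sysfs_create_dir_ns() */
		return kobject_init_and_add(&example_kobj, &example_ktype,
					    parent, "example");
	}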
932 87
933/** 88/**
@@ -940,207 +95,47 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
940 */ 95 */
941void sysfs_remove_dir(struct kobject *kobj) 96void sysfs_remove_dir(struct kobject *kobj)
942{ 97{
943 struct sysfs_dirent *sd = kobj->sd; 98 struct kernfs_node *kn = kobj->sd;
944 99
945 /* 100 /*
946 * In general, the kobject owner is responsible for ensuring removal 101 * In general, the kobject owner is responsible for ensuring removal
947 * doesn't race with other operations and sysfs doesn't provide any 102 * doesn't race with other operations and sysfs doesn't provide any
948 * protection; however, when @kobj is used as a symlink target, the 103 * protection; however, when @kobj is used as a symlink target, the
949 * symlinking entity usually doesn't own @kobj and thus has no 104 * symlinking entity usually doesn't own @kobj and thus has no
950 * control over removal. @kobj->sd may be removed anytime and 105 * control over removal. @kobj->sd may be removed anytime
951 * symlink code may end up dereferencing an already freed sd. 106 * and symlink code may end up dereferencing an already freed node.
952 * 107 *
953 * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation 108 * sysfs_symlink_target_lock synchronizes @kobj->sd
954 * against symlink operations so that symlink code can safely 109 * disassociation against symlink operations so that symlink code
955 * dereference @kobj->sd. 110 * can safely dereference @kobj->sd.
956 */ 111 */
957 spin_lock(&sysfs_symlink_target_lock); 112 spin_lock(&sysfs_symlink_target_lock);
958 kobj->sd = NULL; 113 kobj->sd = NULL;
959 spin_unlock(&sysfs_symlink_target_lock); 114 spin_unlock(&sysfs_symlink_target_lock);
960 115
961 if (sd) { 116 if (kn) {
962 WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); 117 WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
963 sysfs_remove(sd); 118 kernfs_remove(kn);
964 } 119 }
965} 120}
966 121
967int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
968 const char *new_name, const void *new_ns)
969{
970 int error;
971
972 mutex_lock(&sysfs_mutex);
973
974 error = 0;
975 if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
976 (strcmp(sd->s_name, new_name) == 0))
977 goto out; /* nothing to rename */
978
979 error = -EEXIST;
980 if (sysfs_find_dirent(new_parent_sd, new_name, new_ns))
981 goto out;
982
983 /* rename sysfs_dirent */
984 if (strcmp(sd->s_name, new_name) != 0) {
985 error = -ENOMEM;
986 new_name = kstrdup(new_name, GFP_KERNEL);
987 if (!new_name)
988 goto out;
989
990 kfree(sd->s_name);
991 sd->s_name = new_name;
992 }
993
994 /*
995 * Move to the appropriate place in the appropriate directories rbtree.
996 */
997 sysfs_unlink_sibling(sd);
998 sysfs_get(new_parent_sd);
999 sysfs_put(sd->s_parent);
1000 sd->s_ns = new_ns;
1001 sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
1002 sd->s_parent = new_parent_sd;
1003 sysfs_link_sibling(sd);
1004
1005 error = 0;
1006 out:
1007 mutex_unlock(&sysfs_mutex);
1008 return error;
1009}
1010
1011int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, 122int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
1012 const void *new_ns) 123 const void *new_ns)
1013{ 124{
1014 struct sysfs_dirent *parent_sd = kobj->sd->s_parent; 125 struct kernfs_node *parent = kobj->sd->parent;
1015 126
1016 return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns); 127 return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
1017} 128}
1018 129
1019int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, 130int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
1020 const void *new_ns) 131 const void *new_ns)
1021{ 132{
1022 struct sysfs_dirent *sd = kobj->sd; 133 struct kernfs_node *kn = kobj->sd;
1023 struct sysfs_dirent *new_parent_sd; 134 struct kernfs_node *new_parent;
1024 135
1025 BUG_ON(!sd->s_parent); 136 BUG_ON(!kn->parent);
1026 new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? 137 new_parent = new_parent_kobj && new_parent_kobj->sd ?
1027 new_parent_kobj->sd : &sysfs_root; 138 new_parent_kobj->sd : sysfs_root_kn;
1028 139
1029 return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns); 140 return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
1030} 141}
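Both helpers are reached through the public kobject API; a sketch with the
hypothetical kobject from the earlier example:

	static int example_relocate(struct kobject *new_parent)
	{
		int err;

		err = kobject_rename(&example_kobj, "renamed"); /* -> sysfs_rename_dir_ns() */
		if (err)
			return err;
		return kobject_move(&example_kobj, new_parent); /* -> sysfs_move_dir_ns() */
	}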
1031
1032/* Relationship between s_mode and the DT_xxx types */
1033static inline unsigned char dt_type(struct sysfs_dirent *sd)
1034{
1035 return (sd->s_mode >> 12) & 15;
1036}
1037
1038static int sysfs_dir_release(struct inode *inode, struct file *filp)
1039{
1040 sysfs_put(filp->private_data);
1041 return 0;
1042}
1043
1044static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
1045 struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos)
1046{
1047 if (pos) {
1048 int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
1049 pos->s_parent == parent_sd &&
1050 hash == pos->s_hash;
1051 sysfs_put(pos);
1052 if (!valid)
1053 pos = NULL;
1054 }
1055 if (!pos && (hash > 1) && (hash < INT_MAX)) {
1056 struct rb_node *node = parent_sd->s_dir.children.rb_node;
1057 while (node) {
1058 pos = to_sysfs_dirent(node);
1059
1060 if (hash < pos->s_hash)
1061 node = node->rb_left;
1062 else if (hash > pos->s_hash)
1063 node = node->rb_right;
1064 else
1065 break;
1066 }
1067 }
1068 /* Skip over entries in the wrong namespace */
1069 while (pos && pos->s_ns != ns) {
1070 struct rb_node *node = rb_next(&pos->s_rb);
1071 if (!node)
1072 pos = NULL;
1073 else
1074 pos = to_sysfs_dirent(node);
1075 }
1076 return pos;
1077}
1078
1079static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
1080 struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
1081{
1082 pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
1083 if (pos)
1084 do {
1085 struct rb_node *node = rb_next(&pos->s_rb);
1086 if (!node)
1087 pos = NULL;
1088 else
1089 pos = to_sysfs_dirent(node);
1090 } while (pos && pos->s_ns != ns);
1091 return pos;
1092}
1093
1094static int sysfs_readdir(struct file *file, struct dir_context *ctx)
1095{
1096 struct dentry *dentry = file->f_path.dentry;
1097 struct sysfs_dirent *parent_sd = dentry->d_fsdata;
1098 struct sysfs_dirent *pos = file->private_data;
1099 enum kobj_ns_type type;
1100 const void *ns;
1101
1102 type = sysfs_ns_type(parent_sd);
1103 ns = sysfs_info(dentry->d_sb)->ns[type];
1104
1105 if (!dir_emit_dots(file, ctx))
1106 return 0;
1107 mutex_lock(&sysfs_mutex);
1108 for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
1109 pos;
1110 pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
1111 const char *name = pos->s_name;
1112 unsigned int type = dt_type(pos);
1113 int len = strlen(name);
1114 ino_t ino = pos->s_ino;
1115 ctx->pos = pos->s_hash;
1116 file->private_data = sysfs_get(pos);
1117
1118 mutex_unlock(&sysfs_mutex);
1119 if (!dir_emit(ctx, name, len, ino, type))
1120 return 0;
1121 mutex_lock(&sysfs_mutex);
1122 }
1123 mutex_unlock(&sysfs_mutex);
1124 file->private_data = NULL;
1125 ctx->pos = INT_MAX;
1126 return 0;
1127}
1128
1129static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
1130{
1131 struct inode *inode = file_inode(file);
1132 loff_t ret;
1133
1134 mutex_lock(&inode->i_mutex);
1135 ret = generic_file_llseek(file, offset, whence);
1136 mutex_unlock(&inode->i_mutex);
1137
1138 return ret;
1139}
1140
1141const struct file_operations sysfs_dir_operations = {
1142 .read = generic_read_dir,
1143 .iterate = sysfs_readdir,
1144 .release = sysfs_dir_release,
1145 .llseek = sysfs_dir_llseek,
1146};
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 35e7d08fe629..810cf6e613e5 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -14,70 +14,23 @@
14#include <linux/kobject.h> 14#include <linux/kobject.h>
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/fsnotify.h>
18#include <linux/namei.h>
19#include <linux/poll.h>
20#include <linux/list.h> 17#include <linux/list.h>
21#include <linux/mutex.h> 18#include <linux/mutex.h>
22#include <linux/limits.h>
23#include <linux/uaccess.h>
24#include <linux/seq_file.h> 19#include <linux/seq_file.h>
25#include <linux/mm.h>
26 20
27#include "sysfs.h" 21#include "sysfs.h"
22#include "../kernfs/kernfs-internal.h"
28 23
29/* 24/*
30 * There's one sysfs_open_file for each open file and one sysfs_open_dirent 25 * Determine ktype->sysfs_ops for the given kernfs_node. This function
31 * for each sysfs_dirent with one or more open files.
32 *
33 * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open is
34 * protected by sysfs_open_dirent_lock.
35 *
36 * filp->private_data points to seq_file whose ->private points to
37 * sysfs_open_file. sysfs_open_files are chained at
38 * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex.
39 */
40static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
41static DEFINE_MUTEX(sysfs_open_file_mutex);
42
43struct sysfs_open_dirent {
44 atomic_t refcnt;
45 atomic_t event;
46 wait_queue_head_t poll;
47 struct list_head files; /* goes through sysfs_open_file.list */
48};
49
50struct sysfs_open_file {
51 struct sysfs_dirent *sd;
52 struct file *file;
53 struct mutex mutex;
54 int event;
55 struct list_head list;
56
57 bool mmapped;
58 const struct vm_operations_struct *vm_ops;
59};
60
61static bool sysfs_is_bin(struct sysfs_dirent *sd)
62{
63 return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR;
64}
65
66static struct sysfs_open_file *sysfs_of(struct file *file)
67{
68 return ((struct seq_file *)file->private_data)->private;
69}
70
71/*
72 * Determine ktype->sysfs_ops for the given sysfs_dirent. This function
73 * must be called while holding an active reference. 26 * must be called while holding an active reference.
74 */ 27 */
75static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) 28static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
76{ 29{
77 struct kobject *kobj = sd->s_parent->s_dir.kobj; 30 struct kobject *kobj = kn->parent->priv;
78 31
79 if (!sysfs_ignore_lockdep(sd)) 32 if (kn->flags & KERNFS_LOCKDEP)
80 lockdep_assert_held(sd); 33 lockdep_assert_held(kn);
81 return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; 34 return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
82} 35}
83 36
@@ -86,13 +39,13 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
86 * details like buffering and seeking. The following function pipes 39 * details like buffering and seeking. The following function pipes
87 * sysfs_ops->show() result through seq_file. 40 * sysfs_ops->show() result through seq_file.
88 */ 41 */
89static int sysfs_seq_show(struct seq_file *sf, void *v) 42static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
90{ 43{
91 struct sysfs_open_file *of = sf->private; 44 struct kernfs_open_file *of = sf->private;
92 struct kobject *kobj = of->sd->s_parent->s_dir.kobj; 45 struct kobject *kobj = of->kn->parent->priv;
93 const struct sysfs_ops *ops; 46 const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
94 char *buf;
95 ssize_t count; 47 ssize_t count;
48 char *buf;
96 49
97 /* acquire buffer and ensure that it's >= PAGE_SIZE */ 50 /* acquire buffer and ensure that it's >= PAGE_SIZE */
98 count = seq_get_buf(sf, &buf); 51 count = seq_get_buf(sf, &buf);
@@ -102,34 +55,15 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
102 } 55 }
103 56
104 /* 57 /*
105 * Need @of->sd for attr and ops, its parent for kobj. @of->mutex 58 * Invoke show(). Control may reach here via seq file lseek even
106 * nests outside active ref and is just to ensure that the ops 59 * if @ops->show() isn't implemented.
107 * aren't called concurrently for the same open file.
108 */ 60 */
109 mutex_lock(&of->mutex); 61 if (ops->show) {
110 if (!sysfs_get_active(of->sd)) { 62 count = ops->show(kobj, of->kn->priv, buf);
111 mutex_unlock(&of->mutex); 63 if (count < 0)
112 return -ENODEV; 64 return count;
113 } 65 }
114 66
115 of->event = atomic_read(&of->sd->s_attr.open->event);
116
117 /*
118 * Lookup @ops and invoke show(). Control may reach here via seq
119 * file lseek even if @ops->show() isn't implemented.
120 */
121 ops = sysfs_file_ops(of->sd);
122 if (ops->show)
123 count = ops->show(kobj, of->sd->s_attr.attr, buf);
124 else
125 count = 0;
126
127 sysfs_put_active(of->sd);
128 mutex_unlock(&of->mutex);
129
130 if (count < 0)
131 return count;
132
133 /* 67 /*
134 * The code works fine with PAGE_SIZE return but it's likely to 68 * The code works fine with PAGE_SIZE return but it's likely to
135 * indicate truncated result or overflow in normal use cases. 69 * indicate truncated result or overflow in normal use cases.
@@ -144,726 +78,194 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
144 return 0; 78 return 0;
145} 79}
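A hedged example of the sysfs_ops->show() that the seq path above ends up
calling (example_value is hypothetical). The buffer is one page, so
scnprintf() keeps the result within PAGE_SIZE:

	static int example_value;

	static ssize_t example_show(struct kobject *kobj, struct attribute *attr,
				    char *buf)
	{
		return scnprintf(buf, PAGE_SIZE, "%d\n", example_value);
	}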
146 80
147/* 81static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
148 * Read method for bin files. As reading a bin file can have side-effects, 82 size_t count, loff_t pos)
149 * the exact offset and bytes specified in read(2) call should be passed to
150 * the read callback making it difficult to use seq_file. Implement
151 * simplistic custom buffering for bin files.
152 */
153static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf,
154 size_t bytes, loff_t *off)
155{ 83{
156 struct sysfs_open_file *of = sysfs_of(file); 84 struct bin_attribute *battr = of->kn->priv;
157 struct bin_attribute *battr = of->sd->s_attr.bin_attr; 85 struct kobject *kobj = of->kn->parent->priv;
158 struct kobject *kobj = of->sd->s_parent->s_dir.kobj; 86 loff_t size = file_inode(of->file)->i_size;
159 loff_t size = file_inode(file)->i_size;
160 int count = min_t(size_t, bytes, PAGE_SIZE);
161 loff_t offs = *off;
162 char *buf;
163 87
164 if (!bytes) 88 if (!count)
165 return 0; 89 return 0;
166 90
167 if (size) { 91 if (size) {
168 if (offs > size) 92 if (pos > size)
169 return 0; 93 return 0;
170 if (offs + count > size) 94 if (pos + count > size)
171 count = size - offs; 95 count = size - pos;
172 }
173
174 buf = kmalloc(count, GFP_KERNEL);
175 if (!buf)
176 return -ENOMEM;
177
178 /* need of->sd for battr, its parent for kobj */
179 mutex_lock(&of->mutex);
180 if (!sysfs_get_active(of->sd)) {
181 count = -ENODEV;
182 mutex_unlock(&of->mutex);
183 goto out_free;
184 }
185
186 if (battr->read)
187 count = battr->read(file, kobj, battr, buf, offs, count);
188 else
189 count = -EIO;
190
191 sysfs_put_active(of->sd);
192 mutex_unlock(&of->mutex);
193
194 if (count < 0)
195 goto out_free;
196
197 if (copy_to_user(userbuf, buf, count)) {
198 count = -EFAULT;
199 goto out_free;
200 } 96 }
201 97
202 pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); 98 if (!battr->read)
203 99 return -EIO;
204 *off = offs + count;
205 100
206 out_free: 101 return battr->read(of->file, kobj, battr, buf, pos, count);
207 kfree(buf);
208 return count;
209} 102}
210 103
211/** 104/* kernfs write callback for regular sysfs files */
212 * flush_write_buffer - push buffer to kobject 105static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
213 * @of: open file 106 size_t count, loff_t pos)
214 * @buf: data buffer for file
215 * @off: file offset to write to
216 * @count: number of bytes
217 *
218 * Get the correct pointers for the kobject and the attribute we're dealing
219 * with, then call the store() method for it with @buf.
220 */
221static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off,
222 size_t count)
223{ 107{
224 struct kobject *kobj = of->sd->s_parent->s_dir.kobj; 108 const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
225 int rc = 0; 109 struct kobject *kobj = of->kn->parent->priv;
226
227 /*
228 * Need @of->sd for attr and ops, its parent for kobj. @of->mutex
229 * nests outside active ref and is just to ensure that the ops
230 * aren't called concurrently for the same open file.
231 */
232 mutex_lock(&of->mutex);
233 if (!sysfs_get_active(of->sd)) {
234 mutex_unlock(&of->mutex);
235 return -ENODEV;
236 }
237 110
238 if (sysfs_is_bin(of->sd)) { 111 if (!count)
239 struct bin_attribute *battr = of->sd->s_attr.bin_attr; 112 return 0;
240
241 rc = -EIO;
242 if (battr->write)
243 rc = battr->write(of->file, kobj, battr, buf, off,
244 count);
245 } else {
246 const struct sysfs_ops *ops = sysfs_file_ops(of->sd);
247
248 rc = ops->store(kobj, of->sd->s_attr.attr, buf, count);
249 }
250
251 sysfs_put_active(of->sd);
252 mutex_unlock(&of->mutex);
253 113
254 return rc; 114 return ops->store(kobj, of->kn->priv, buf, count);
255} 115}
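The matching sysfs_ops->store() sketch (same hypothetical example_value);
the buffer kernfs passes in is NUL-terminated kernel memory, so it can be
parsed directly:

	static ssize_t example_store(struct kobject *kobj, struct attribute *attr,
				     const char *buf, size_t count)
	{
		unsigned long val;
		int err = kstrtoul(buf, 0, &val);

		if (err)
			return err;
		example_value = val;
		return count;	/* consume the whole buffer */
	}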
256 116
257/** 117/* kernfs write callback for bin sysfs files */
258 * sysfs_write_file - write an attribute 118static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
259 * @file: file pointer 119 size_t count, loff_t pos)
260 * @user_buf: data to write
261 * @count: number of bytes
262 * @ppos: starting offset
263 *
264 * Copy data in from userland and pass it to the matching
265 * sysfs_ops->store() by invoking flush_write_buffer().
266 *
267 * There is no easy way for us to know if userspace is only doing a partial
268 * write, so we don't support them. We expect the entire buffer to come on
269 * the first write. Hint: if you're writing a value, first read the file,
270 * modify only the the value you're changing, then write entire buffer
271 * back.
272 */
273static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf,
274 size_t count, loff_t *ppos)
275{ 120{
276 struct sysfs_open_file *of = sysfs_of(file); 121 struct bin_attribute *battr = of->kn->priv;
277 ssize_t len = min_t(size_t, count, PAGE_SIZE); 122 struct kobject *kobj = of->kn->parent->priv;
278 loff_t size = file_inode(file)->i_size; 123 loff_t size = file_inode(of->file)->i_size;
279 char *buf;
280 124
281 if (sysfs_is_bin(of->sd) && size) { 125 if (size) {
282 if (size <= *ppos) 126 if (size <= pos)
283 return 0; 127 return 0;
284 len = min_t(ssize_t, len, size - *ppos); 128 count = min_t(ssize_t, count, size - pos);
285 } 129 }
286 130 if (!count)
287 if (!len)
288 return 0; 131 return 0;
289 132
290 buf = kmalloc(len + 1, GFP_KERNEL); 133 if (!battr->write)
291 if (!buf) 134 return -EIO;
292 return -ENOMEM;
293 135
294 if (copy_from_user(buf, user_buf, len)) { 136 return battr->write(of->file, kobj, battr, buf, pos, count);
295 len = -EFAULT;
296 goto out_free;
297 }
298 buf[len] = '\0'; /* guarantee string termination */
299
300 len = flush_write_buffer(of, buf, *ppos, len);
301 if (len > 0)
302 *ppos += len;
303out_free:
304 kfree(buf);
305 return len;
306}
307
308static void sysfs_bin_vma_open(struct vm_area_struct *vma)
309{
310 struct file *file = vma->vm_file;
311 struct sysfs_open_file *of = sysfs_of(file);
312
313 if (!of->vm_ops)
314 return;
315
316 if (!sysfs_get_active(of->sd))
317 return;
318
319 if (of->vm_ops->open)
320 of->vm_ops->open(vma);
321
322 sysfs_put_active(of->sd);
323} 137}
324 138
325static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 139static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
140 struct vm_area_struct *vma)
326{ 141{
327 struct file *file = vma->vm_file; 142 struct bin_attribute *battr = of->kn->priv;
328 struct sysfs_open_file *of = sysfs_of(file); 143 struct kobject *kobj = of->kn->parent->priv;
329 int ret;
330 144
331 if (!of->vm_ops) 145 return battr->mmap(of->file, kobj, battr, vma);
332 return VM_FAULT_SIGBUS;
333
334 if (!sysfs_get_active(of->sd))
335 return VM_FAULT_SIGBUS;
336
337 ret = VM_FAULT_SIGBUS;
338 if (of->vm_ops->fault)
339 ret = of->vm_ops->fault(vma, vmf);
340
341 sysfs_put_active(of->sd);
342 return ret;
343} 146}
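A sketch of a bin_attribute these callbacks would service (the name, size
and zero-filled payload are all hypothetical):

	static ssize_t example_bin_read(struct file *file, struct kobject *kobj,
					struct bin_attribute *attr, char *buf,
					loff_t pos, size_t count)
	{
		memset(buf, 0, count);	/* placeholder payload */
		return count;
	}

	static struct bin_attribute example_bin = {
		.attr	= { .name = "blob", .mode = S_IRUGO },
		.size	= 512,
		.read	= example_bin_read,	/* reached via sysfs_kf_bin_read() */
	};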
344 147
345static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma, 148void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
346 struct vm_fault *vmf)
347{ 149{
348 struct file *file = vma->vm_file; 150 struct kernfs_node *kn = kobj->sd, *tmp;
349 struct sysfs_open_file *of = sysfs_of(file);
350 int ret;
351
352 if (!of->vm_ops)
353 return VM_FAULT_SIGBUS;
354 151
355 if (!sysfs_get_active(of->sd)) 152 if (kn && dir)
356 return VM_FAULT_SIGBUS; 153 kn = kernfs_find_and_get(kn, dir);
357
358 ret = 0;
359 if (of->vm_ops->page_mkwrite)
360 ret = of->vm_ops->page_mkwrite(vma, vmf);
361 else 154 else
362 file_update_time(file); 155 kernfs_get(kn);
363
364 sysfs_put_active(of->sd);
365 return ret;
366}
367
368static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr,
369 void *buf, int len, int write)
370{
371 struct file *file = vma->vm_file;
372 struct sysfs_open_file *of = sysfs_of(file);
373 int ret;
374
375 if (!of->vm_ops)
376 return -EINVAL;
377
378 if (!sysfs_get_active(of->sd))
379 return -EINVAL;
380
381 ret = -EINVAL;
382 if (of->vm_ops->access)
383 ret = of->vm_ops->access(vma, addr, buf, len, write);
384
385 sysfs_put_active(of->sd);
386 return ret;
387}
388
389#ifdef CONFIG_NUMA
390static int sysfs_bin_set_policy(struct vm_area_struct *vma,
391 struct mempolicy *new)
392{
393 struct file *file = vma->vm_file;
394 struct sysfs_open_file *of = sysfs_of(file);
395 int ret;
396
397 if (!of->vm_ops)
398 return 0;
399
400 if (!sysfs_get_active(of->sd))
401 return -EINVAL;
402
403 ret = 0;
404 if (of->vm_ops->set_policy)
405 ret = of->vm_ops->set_policy(vma, new);
406
407 sysfs_put_active(of->sd);
408 return ret;
409}
410
411static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma,
412 unsigned long addr)
413{
414 struct file *file = vma->vm_file;
415 struct sysfs_open_file *of = sysfs_of(file);
416 struct mempolicy *pol;
417
418 if (!of->vm_ops)
419 return vma->vm_policy;
420
421 if (!sysfs_get_active(of->sd))
422 return vma->vm_policy;
423
424 pol = vma->vm_policy;
425 if (of->vm_ops->get_policy)
426 pol = of->vm_ops->get_policy(vma, addr);
427
428 sysfs_put_active(of->sd);
429 return pol;
430}
431
432static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
433 const nodemask_t *to, unsigned long flags)
434{
435 struct file *file = vma->vm_file;
436 struct sysfs_open_file *of = sysfs_of(file);
437 int ret;
438
439 if (!of->vm_ops)
440 return 0;
441
442 if (!sysfs_get_active(of->sd))
443 return 0;
444
445 ret = 0;
446 if (of->vm_ops->migrate)
447 ret = of->vm_ops->migrate(vma, from, to, flags);
448
449 sysfs_put_active(of->sd);
450 return ret;
451}
452#endif
453
454static const struct vm_operations_struct sysfs_bin_vm_ops = {
455 .open = sysfs_bin_vma_open,
456 .fault = sysfs_bin_fault,
457 .page_mkwrite = sysfs_bin_page_mkwrite,
458 .access = sysfs_bin_access,
459#ifdef CONFIG_NUMA
460 .set_policy = sysfs_bin_set_policy,
461 .get_policy = sysfs_bin_get_policy,
462 .migrate = sysfs_bin_migrate,
463#endif
464};
465
466static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma)
467{
468 struct sysfs_open_file *of = sysfs_of(file);
469 struct bin_attribute *battr = of->sd->s_attr.bin_attr;
470 struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
471 int rc;
472
473 mutex_lock(&of->mutex);
474
475 /* need of->sd for battr, its parent for kobj */
476 rc = -ENODEV;
477 if (!sysfs_get_active(of->sd))
478 goto out_unlock;
479
480 if (!battr->mmap)
481 goto out_put;
482
483 rc = battr->mmap(file, kobj, battr, vma);
484 if (rc)
485 goto out_put;
486
487 /*
488 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
489 * to satisfy versions of X which crash if the mmap fails: that
490 * substitutes a new vm_file, and we don't then want bin_vm_ops.
491 */
492 if (vma->vm_file != file)
493 goto out_put;
494
495 rc = -EINVAL;
496 if (of->mmapped && of->vm_ops != vma->vm_ops)
497 goto out_put;
498 156
499 /* 157 if (kn && attr) {
500 * It is not possible to successfully wrap close. 158 tmp = kernfs_find_and_get(kn, attr);
501 * So error if someone is trying to use close. 159 kernfs_put(kn);
502 */ 160 kn = tmp;
503 rc = -EINVAL;
504 if (vma->vm_ops && vma->vm_ops->close)
505 goto out_put;
506
507 rc = 0;
508 of->mmapped = 1;
509 of->vm_ops = vma->vm_ops;
510 vma->vm_ops = &sysfs_bin_vm_ops;
511out_put:
512 sysfs_put_active(of->sd);
513out_unlock:
514 mutex_unlock(&of->mutex);
515
516 return rc;
517}
518
519/**
520 * sysfs_get_open_dirent - get or create sysfs_open_dirent
521 * @sd: target sysfs_dirent
522 * @of: sysfs_open_file for this instance of open
523 *
524 * If @sd->s_attr.open exists, increment its reference count;
525 * otherwise, create one. @of is chained to the files list.
526 *
527 * LOCKING:
528 * Kernel thread context (may sleep).
529 *
530 * RETURNS:
531 * 0 on success, -errno on failure.
532 */
533static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
534 struct sysfs_open_file *of)
535{
536 struct sysfs_open_dirent *od, *new_od = NULL;
537
538 retry:
539 mutex_lock(&sysfs_open_file_mutex);
540 spin_lock_irq(&sysfs_open_dirent_lock);
541
542 if (!sd->s_attr.open && new_od) {
543 sd->s_attr.open = new_od;
544 new_od = NULL;
545 } 161 }
546 162
547 od = sd->s_attr.open; 163 if (kn) {
548 if (od) { 164 kernfs_notify(kn);
549 atomic_inc(&od->refcnt); 165 kernfs_put(kn);
550 list_add_tail(&of->list, &od->files);
551 }
552
553 spin_unlock_irq(&sysfs_open_dirent_lock);
554 mutex_unlock(&sysfs_open_file_mutex);
555
556 if (od) {
557 kfree(new_od);
558 return 0;
559 } 166 }
167}
168EXPORT_SYMBOL_GPL(sysfs_notify);
560 169
561 /* not there, initialize a new one and retry */ 170static const struct kernfs_ops sysfs_file_kfops_empty = {
562 new_od = kmalloc(sizeof(*new_od), GFP_KERNEL); 171};
563 if (!new_od)
564 return -ENOMEM;
565 172
566 atomic_set(&new_od->refcnt, 0); 173static const struct kernfs_ops sysfs_file_kfops_ro = {
567 atomic_set(&new_od->event, 1); 174 .seq_show = sysfs_kf_seq_show,
568 init_waitqueue_head(&new_od->poll); 175};
569 INIT_LIST_HEAD(&new_od->files);
570 goto retry;
571}
572 176
573/** 177static const struct kernfs_ops sysfs_file_kfops_wo = {
574 * sysfs_put_open_dirent - put sysfs_open_dirent 178 .write = sysfs_kf_write,
575 * @sd: target sysfs_dirent 179};
576 * @of: associated sysfs_open_file
577 *
578 * Put @sd->s_attr.open and unlink @of from the files list. If
579 * reference count reaches zero, disassociate and free it.
580 *
581 * LOCKING:
582 * None.
583 */
584static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
585 struct sysfs_open_file *of)
586{
587 struct sysfs_open_dirent *od = sd->s_attr.open;
588 unsigned long flags;
589 180
590 mutex_lock(&sysfs_open_file_mutex); 181static const struct kernfs_ops sysfs_file_kfops_rw = {
591 spin_lock_irqsave(&sysfs_open_dirent_lock, flags); 182 .seq_show = sysfs_kf_seq_show,
183 .write = sysfs_kf_write,
184};
592 185
593 if (of) 186static const struct kernfs_ops sysfs_bin_kfops_ro = {
594 list_del(&of->list); 187 .read = sysfs_kf_bin_read,
188};
595 189
596 if (atomic_dec_and_test(&od->refcnt)) 190static const struct kernfs_ops sysfs_bin_kfops_wo = {
597 sd->s_attr.open = NULL; 191 .write = sysfs_kf_bin_write,
598 else 192};
599 od = NULL;
600 193
601 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); 194static const struct kernfs_ops sysfs_bin_kfops_rw = {
602 mutex_unlock(&sysfs_open_file_mutex); 195 .read = sysfs_kf_bin_read,
196 .write = sysfs_kf_bin_write,
197};
603 198
604 kfree(od); 199static const struct kernfs_ops sysfs_bin_kfops_mmap = {
605} 200 .read = sysfs_kf_bin_read,
201 .write = sysfs_kf_bin_write,
202 .mmap = sysfs_kf_bin_mmap,
203};
606 204
607static int sysfs_open_file(struct inode *inode, struct file *file) 205int sysfs_add_file_mode_ns(struct kernfs_node *parent,
206 const struct attribute *attr, bool is_bin,
207 umode_t mode, const void *ns)
608{ 208{
609 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 209 struct lock_class_key *key = NULL;
610 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 210 const struct kernfs_ops *ops;
611 struct sysfs_open_file *of; 211 struct kernfs_node *kn;
612 bool has_read, has_write; 212 loff_t size;
613 int error = -EACCES;
614
615 /* need attr_sd for attr and ops, its parent for kobj */
616 if (!sysfs_get_active(attr_sd))
617 return -ENODEV;
618 213
619 if (sysfs_is_bin(attr_sd)) { 214 if (!is_bin) {
620 struct bin_attribute *battr = attr_sd->s_attr.bin_attr; 215 struct kobject *kobj = parent->priv;
621 216 const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
622 has_read = battr->read || battr->mmap;
623 has_write = battr->write || battr->mmap;
624 } else {
625 const struct sysfs_ops *ops = sysfs_file_ops(attr_sd);
626 217
627 /* every kobject with an attribute needs a ktype assigned */ 218 /* every kobject with an attribute needs a ktype assigned */
628 if (WARN(!ops, KERN_ERR 219 if (WARN(!sysfs_ops, KERN_ERR
629 "missing sysfs attribute operations for kobject: %s\n", 220 "missing sysfs attribute operations for kobject: %s\n",
630 kobject_name(kobj))) 221 kobject_name(kobj)))
631 goto err_out; 222 return -EINVAL;
632 223
633 has_read = ops->show; 224 if (sysfs_ops->show && sysfs_ops->store)
634 has_write = ops->store; 225 ops = &sysfs_file_kfops_rw;
635 } 226 else if (sysfs_ops->show)
636 227 ops = &sysfs_file_kfops_ro;
637 /* check perms and supported operations */ 228 else if (sysfs_ops->store)
638 if ((file->f_mode & FMODE_WRITE) && 229 ops = &sysfs_file_kfops_wo;
639 (!(inode->i_mode & S_IWUGO) || !has_write)) 230 else
640 goto err_out; 231 ops = &sysfs_file_kfops_empty;
641 232
642 if ((file->f_mode & FMODE_READ) && 233 size = PAGE_SIZE;
643 (!(inode->i_mode & S_IRUGO) || !has_read)) 234 } else {
644 goto err_out; 235 struct bin_attribute *battr = (void *)attr;
645 236
646 /* allocate a sysfs_open_file for the file */ 237 if (battr->mmap)
647 error = -ENOMEM; 238 ops = &sysfs_bin_kfops_mmap;
648 of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL); 239 else if (battr->read && battr->write)
649 if (!of) 240 ops = &sysfs_bin_kfops_rw;
650 goto err_out; 241 else if (battr->read)
651 242 ops = &sysfs_bin_kfops_ro;
652 /* 243 else if (battr->write)
653 * The following is done to give a different lockdep key to 244 ops = &sysfs_bin_kfops_wo;
654 * @of->mutex for files which implement mmap. This is a rather 245 else
655 * crude way to avoid false positive lockdep warning around 246 ops = &sysfs_file_kfops_empty;
656 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and 247
657 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under 248 size = battr->size;
658 * which mm->mmap_sem nests, while holding @of->mutex. As each
659 * open file has a separate mutex, it's okay as long as those don't
660 * happen on the same file. At this point, we can't easily give
661 * each file a separate locking class. Let's differentiate on
662 * whether the file is bin or not for now.
663 */
664 if (sysfs_is_bin(attr_sd))
665 mutex_init(&of->mutex);
666 else
667 mutex_init(&of->mutex);
668
669 of->sd = attr_sd;
670 of->file = file;
671
672 /*
673 * Always instantiate seq_file even if read access doesn't use
674 * seq_file or is not requested. This unifies private data access
675 * and readable regular files are the vast majority anyway.
676 */
677 if (sysfs_is_bin(attr_sd))
678 error = single_open(file, NULL, of);
679 else
680 error = single_open(file, sysfs_seq_show, of);
681 if (error)
682 goto err_free;
683
684 /* seq_file clears PWRITE unconditionally, restore it if WRITE */
685 if (file->f_mode & FMODE_WRITE)
686 file->f_mode |= FMODE_PWRITE;
687
688 /* make sure we have open dirent struct */
689 error = sysfs_get_open_dirent(attr_sd, of);
690 if (error)
691 goto err_close;
692
693 /* open succeeded, put active references */
694 sysfs_put_active(attr_sd);
695 return 0;
696
697err_close:
698 single_release(inode, file);
699err_free:
700 kfree(of);
701err_out:
702 sysfs_put_active(attr_sd);
703 return error;
704}
705
706static int sysfs_release(struct inode *inode, struct file *filp)
707{
708 struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata;
709 struct sysfs_open_file *of = sysfs_of(filp);
710
711 sysfs_put_open_dirent(sd, of);
712 single_release(inode, filp);
713 kfree(of);
714
715 return 0;
716}
717
718void sysfs_unmap_bin_file(struct sysfs_dirent *sd)
719{
720 struct sysfs_open_dirent *od;
721 struct sysfs_open_file *of;
722
723 if (!sysfs_is_bin(sd))
724 return;
725
726 spin_lock_irq(&sysfs_open_dirent_lock);
727 od = sd->s_attr.open;
728 if (od)
729 atomic_inc(&od->refcnt);
730 spin_unlock_irq(&sysfs_open_dirent_lock);
731 if (!od)
732 return;
733
734 mutex_lock(&sysfs_open_file_mutex);
735 list_for_each_entry(of, &od->files, list) {
736 struct inode *inode = file_inode(of->file);
737 unmap_mapping_range(inode->i_mapping, 0, 0, 1);
738 } 249 }
739 mutex_unlock(&sysfs_open_file_mutex);
740
741 sysfs_put_open_dirent(sd, NULL);
742}
743
744/* Sysfs attribute files are pollable. The idea is that you read
745 * the content and then you use 'poll' or 'select' to wait for
746 * the content to change. When the content changes (assuming the
747 * manager for the kobject supports notification), poll will
748 * return POLLERR|POLLPRI, and select will return the fd whether
749 * it is waiting for read, write, or exceptions.
750 * Once poll/select indicates that the value has changed, you
751 * need to close and re-open the file, or seek to 0 and read again.
752 * Reminder: this only works for attributes which actively support
753 * it, and it is not possible to test an attribute from userspace
754 * to see if it supports poll (neither 'poll' nor 'select' returns
755 * an appropriate error code). When in doubt, set a suitable timeout value.
756 */
757static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
758{
759 struct sysfs_open_file *of = sysfs_of(filp);
760 struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
761 struct sysfs_open_dirent *od = attr_sd->s_attr.open;
762
763 /* need parent for the kobj, grab both */
764 if (!sysfs_get_active(attr_sd))
765 goto trigger;
766
767 poll_wait(filp, &od->poll, wait);
768 250
769 sysfs_put_active(attr_sd); 251#ifdef CONFIG_DEBUG_LOCK_ALLOC
770 252 if (!attr->ignore_lockdep)
771 if (of->event != atomic_read(&od->event)) 253 key = attr->key ?: (struct lock_class_key *)&attr->skey;
772 goto trigger; 254#endif
773 255 kn = __kernfs_create_file(parent, attr->name, mode, size, ops,
774 return DEFAULT_POLLMASK; 256 (void *)attr, ns, true, key);
775 257 if (IS_ERR(kn)) {
776 trigger: 258 if (PTR_ERR(kn) == -EEXIST)
777 return DEFAULT_POLLMASK|POLLERR|POLLPRI; 259 sysfs_warn_dup(parent, attr->name);
778} 260 return PTR_ERR(kn);
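A hedged userspace sketch of the protocol described in the comment above:
read once, wait for POLLERR|POLLPRI, then seek back to 0 and re-read (the
attribute path is hypothetical):

	#include <fcntl.h>
	#include <poll.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[256];
		ssize_t n;
		int fd = open("/sys/devices/example/state", O_RDONLY);
		struct pollfd pfd = { .fd = fd, .events = POLLPRI };

		if (fd < 0)
			return 1;
		read(fd, buf, sizeof(buf));	/* must read before polling */
		while (poll(&pfd, 1, -1) > 0) {
			lseek(fd, 0, SEEK_SET);	/* value changed: re-read from 0 */
			n = read(fd, buf, sizeof(buf));
			if (n > 0)
				fwrite(buf, 1, (size_t)n, stdout);
		}
		close(fd);
		return 0;
	}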
779
780void sysfs_notify_dirent(struct sysfs_dirent *sd)
781{
782 struct sysfs_open_dirent *od;
783 unsigned long flags;
784
785 spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
786
787 if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
788 od = sd->s_attr.open;
789 if (od) {
790 atomic_inc(&od->event);
791 wake_up_interruptible(&od->poll);
792 }
793 } 261 }
794 262 return 0;
795 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
796}
797EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
798
799void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
800{
801 struct sysfs_dirent *sd = k->sd;
802
803 mutex_lock(&sysfs_mutex);
804
805 if (sd && dir)
806 sd = sysfs_find_dirent(sd, dir, NULL);
807 if (sd && attr)
808 sd = sysfs_find_dirent(sd, attr, NULL);
809 if (sd)
810 sysfs_notify_dirent(sd);
811
812 mutex_unlock(&sysfs_mutex);
813}
814EXPORT_SYMBOL_GPL(sysfs_notify);
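On the producer side, a driver only has to call sysfs_notify() after updating its backing state; that increments od->event and wakes the waitqueue used by sysfs_poll() above. A sketch with an invented foo_device and attribute name, where only sysfs_notify() is real API:

#include <linux/device.h>
#include <linux/sysfs.h>

/* hypothetical driver state; only sysfs_notify() below is real API */
struct foo_device {
	struct device *dev;
	int alarm;
};

static void foo_set_alarm(struct foo_device *foo, int val)
{
	foo->alarm = val;
	/* bump the event count and wake anyone poll()ing "alarm" */
	sysfs_notify(&foo->dev->kobj, NULL, "alarm");
}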
815
816const struct file_operations sysfs_file_operations = {
817 .read = seq_read,
818 .write = sysfs_write_file,
819 .llseek = generic_file_llseek,
820 .open = sysfs_open_file,
821 .release = sysfs_release,
822 .poll = sysfs_poll,
823};
824
825const struct file_operations sysfs_bin_operations = {
826 .read = sysfs_bin_read,
827 .write = sysfs_write_file,
828 .llseek = generic_file_llseek,
829 .mmap = sysfs_bin_mmap,
830 .open = sysfs_open_file,
831 .release = sysfs_release,
832 .poll = sysfs_poll,
833};
834
835int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
836 const struct attribute *attr, int type,
837 umode_t amode, const void *ns)
838{
839 umode_t mode = (amode & S_IALLUGO) | S_IFREG;
840 struct sysfs_addrm_cxt acxt;
841 struct sysfs_dirent *sd;
842 int rc;
843
844 sd = sysfs_new_dirent(attr->name, mode, type);
845 if (!sd)
846 return -ENOMEM;
847
848 sd->s_ns = ns;
849 sd->s_attr.attr = (void *)attr;
850 sysfs_dirent_init_lockdep(sd);
851
852 sysfs_addrm_start(&acxt);
853 rc = sysfs_add_one(&acxt, sd, dir_sd);
854 sysfs_addrm_finish(&acxt);
855
856 if (rc)
857 sysfs_put(sd);
858
859 return rc;
860} 263}
861 264
862 265int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr,
863int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, 266 bool is_bin)
864 int type)
865{ 267{
866 return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL); 268 return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL);
867} 269}
868 270
869/** 271/**
@@ -877,8 +279,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
877{ 279{
878 BUG_ON(!kobj || !kobj->sd || !attr); 280 BUG_ON(!kobj || !kobj->sd || !attr);
879 281
880 return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR, 282 return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);
881 attr->mode, ns);
882 283
883} 284}
884EXPORT_SYMBOL_GPL(sysfs_create_file_ns); 285EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
@@ -906,19 +307,21 @@ EXPORT_SYMBOL_GPL(sysfs_create_files);
906int sysfs_add_file_to_group(struct kobject *kobj, 307int sysfs_add_file_to_group(struct kobject *kobj,
907 const struct attribute *attr, const char *group) 308 const struct attribute *attr, const char *group)
908{ 309{
909 struct sysfs_dirent *dir_sd; 310 struct kernfs_node *parent;
910 int error; 311 int error;
911 312
912 if (group) 313 if (group) {
913 dir_sd = sysfs_get_dirent(kobj->sd, group); 314 parent = kernfs_find_and_get(kobj->sd, group);
914 else 315 } else {
915 dir_sd = sysfs_get(kobj->sd); 316 parent = kobj->sd;
317 kernfs_get(parent);
318 }
916 319
917 if (!dir_sd) 320 if (!parent)
918 return -ENOENT; 321 return -ENOENT;
919 322
920 error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR); 323 error = sysfs_add_file(parent, attr, false);
921 sysfs_put(dir_sd); 324 kernfs_put(parent);
922 325
923 return error; 326 return error;
924} 327}
@@ -934,23 +337,20 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
934int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, 337int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
935 umode_t mode) 338 umode_t mode)
936{ 339{
937 struct sysfs_dirent *sd; 340 struct kernfs_node *kn;
938 struct iattr newattrs; 341 struct iattr newattrs;
939 int rc; 342 int rc;
940 343
941 mutex_lock(&sysfs_mutex); 344 kn = kernfs_find_and_get(kobj->sd, attr->name);
942 345 if (!kn)
943 rc = -ENOENT; 346 return -ENOENT;
944 sd = sysfs_find_dirent(kobj->sd, attr->name, NULL);
945 if (!sd)
946 goto out;
947 347
948 newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); 348 newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO);
949 newattrs.ia_valid = ATTR_MODE; 349 newattrs.ia_valid = ATTR_MODE;
950 rc = sysfs_sd_setattr(sd, &newattrs);
951 350
952 out: 351 rc = kernfs_setattr(kn, &newattrs);
953 mutex_unlock(&sysfs_mutex); 352
353 kernfs_put(kn);
954 return rc; 354 return rc;
955} 355}
956EXPORT_SYMBOL_GPL(sysfs_chmod_file); 356EXPORT_SYMBOL_GPL(sysfs_chmod_file);
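This hunk shows the conversion recipe that repeats through the rest of the series: where the old code took sysfs_mutex and called sysfs_find_dirent(), the new code calls kernfs_find_and_get(), which returns the node with its own reference (or NULL) and is paired with kernfs_put(). Condensed into a standalone sketch, with "frob" as a hypothetical attribute name:

#include <linux/fs.h>
#include <linux/kernfs.h>
#include <linux/kobject.h>
#include <linux/stat.h>

static int frob_chmod(struct kobject *kobj, umode_t mode)
{
	struct kernfs_node *kn;
	struct iattr newattrs;
	int rc;

	kn = kernfs_find_and_get(kobj->sd, "frob");	/* takes a reference */
	if (!kn)
		return -ENOENT;

	newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO);
	newattrs.ia_valid = ATTR_MODE;
	rc = kernfs_setattr(kn, &newattrs);		/* locks internally */

	kernfs_put(kn);					/* drop the reference */
	return rc;
}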
@@ -966,9 +366,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
966void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, 366void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
967 const void *ns) 367 const void *ns)
968{ 368{
969 struct sysfs_dirent *dir_sd = kobj->sd; 369 struct kernfs_node *parent = kobj->sd;
970 370
971 sysfs_hash_and_remove(dir_sd, attr->name, ns); 371 kernfs_remove_by_name_ns(parent, attr->name, ns);
972} 372}
973EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); 373EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
974 374
@@ -989,15 +389,18 @@ EXPORT_SYMBOL_GPL(sysfs_remove_files);
989void sysfs_remove_file_from_group(struct kobject *kobj, 389void sysfs_remove_file_from_group(struct kobject *kobj,
990 const struct attribute *attr, const char *group) 390 const struct attribute *attr, const char *group)
991{ 391{
992 struct sysfs_dirent *dir_sd; 392 struct kernfs_node *parent;
993 393
994 if (group) 394 if (group) {
995 dir_sd = sysfs_get_dirent(kobj->sd, group); 395 parent = kernfs_find_and_get(kobj->sd, group);
996 else 396 } else {
997 dir_sd = sysfs_get(kobj->sd); 397 parent = kobj->sd;
998 if (dir_sd) { 398 kernfs_get(parent);
999 sysfs_hash_and_remove(dir_sd, attr->name, NULL); 399 }
1000 sysfs_put(dir_sd); 400
401 if (parent) {
402 kernfs_remove_by_name(parent, attr->name);
403 kernfs_put(parent);
1001 } 404 }
1002} 405}
1003EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); 406EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
@@ -1012,7 +415,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
1012{ 415{
1013 BUG_ON(!kobj || !kobj->sd || !attr); 416 BUG_ON(!kobj || !kobj->sd || !attr);
1014 417
1015 return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); 418 return sysfs_add_file(kobj->sd, &attr->attr, true);
1016} 419}
1017EXPORT_SYMBOL_GPL(sysfs_create_bin_file); 420EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
1018 421
@@ -1024,7 +427,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
1024void sysfs_remove_bin_file(struct kobject *kobj, 427void sysfs_remove_bin_file(struct kobject *kobj,
1025 const struct bin_attribute *attr) 428 const struct bin_attribute *attr)
1026{ 429{
1027 sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL); 430 kernfs_remove_by_name(kobj->sd, attr->attr.name);
1028} 431}
1029EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); 432EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
1030 433
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 1898a10e38ce..6b579387c67a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -18,7 +18,7 @@
18#include "sysfs.h" 18#include "sysfs.h"
19 19
20 20
21static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 21static void remove_files(struct kernfs_node *parent, struct kobject *kobj,
22 const struct attribute_group *grp) 22 const struct attribute_group *grp)
23{ 23{
24 struct attribute *const *attr; 24 struct attribute *const *attr;
@@ -26,13 +26,13 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
26 26
27 if (grp->attrs) 27 if (grp->attrs)
28 for (attr = grp->attrs; *attr; attr++) 28 for (attr = grp->attrs; *attr; attr++)
29 sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); 29 kernfs_remove_by_name(parent, (*attr)->name);
30 if (grp->bin_attrs) 30 if (grp->bin_attrs)
31 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) 31 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
32 sysfs_remove_bin_file(kobj, *bin_attr); 32 sysfs_remove_bin_file(kobj, *bin_attr);
33} 33}
34 34
35static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, 35static int create_files(struct kernfs_node *parent, struct kobject *kobj,
36 const struct attribute_group *grp, int update) 36 const struct attribute_group *grp, int update)
37{ 37{
38 struct attribute *const *attr; 38 struct attribute *const *attr;
@@ -49,22 +49,20 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
49 * re-adding (if required) the file. 49 * re-adding (if required) the file.
50 */ 50 */
51 if (update) 51 if (update)
52 sysfs_hash_and_remove(dir_sd, (*attr)->name, 52 kernfs_remove_by_name(parent, (*attr)->name);
53 NULL);
54 if (grp->is_visible) { 53 if (grp->is_visible) {
55 mode = grp->is_visible(kobj, *attr, i); 54 mode = grp->is_visible(kobj, *attr, i);
56 if (!mode) 55 if (!mode)
57 continue; 56 continue;
58 } 57 }
59 error = sysfs_add_file_mode_ns(dir_sd, *attr, 58 error = sysfs_add_file_mode_ns(parent, *attr, false,
60 SYSFS_KOBJ_ATTR,
61 (*attr)->mode | mode, 59 (*attr)->mode | mode,
62 NULL); 60 NULL);
63 if (unlikely(error)) 61 if (unlikely(error))
64 break; 62 break;
65 } 63 }
66 if (error) { 64 if (error) {
67 remove_files(dir_sd, kobj, grp); 65 remove_files(parent, kobj, grp);
68 goto exit; 66 goto exit;
69 } 67 }
70 } 68 }
@@ -78,7 +76,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
78 break; 76 break;
79 } 77 }
80 if (error) 78 if (error)
81 remove_files(dir_sd, kobj, grp); 79 remove_files(parent, kobj, grp);
82 } 80 }
83exit: 81exit:
84 return error; 82 return error;
@@ -88,7 +86,7 @@ exit:
88static int internal_create_group(struct kobject *kobj, int update, 86static int internal_create_group(struct kobject *kobj, int update,
89 const struct attribute_group *grp) 87 const struct attribute_group *grp)
90{ 88{
91 struct sysfs_dirent *sd; 89 struct kernfs_node *kn;
92 int error; 90 int error;
93 91
94 BUG_ON(!kobj || (!update && !kobj->sd)); 92 BUG_ON(!kobj || (!update && !kobj->sd));
@@ -102,18 +100,22 @@ static int internal_create_group(struct kobject *kobj, int update,
102 return -EINVAL; 100 return -EINVAL;
103 } 101 }
104 if (grp->name) { 102 if (grp->name) {
105 error = sysfs_create_subdir(kobj, grp->name, &sd); 103 kn = kernfs_create_dir(kobj->sd, grp->name,
106 if (error) 104 S_IRWXU | S_IRUGO | S_IXUGO, kobj);
107 return error; 105 if (IS_ERR(kn)) {
106 if (PTR_ERR(kn) == -EEXIST)
107 sysfs_warn_dup(kobj->sd, grp->name);
108 return PTR_ERR(kn);
109 }
108 } else 110 } else
109 sd = kobj->sd; 111 kn = kobj->sd;
110 sysfs_get(sd); 112 kernfs_get(kn);
111 error = create_files(sd, kobj, grp, update); 113 error = create_files(kn, kobj, grp, update);
112 if (error) { 114 if (error) {
113 if (grp->name) 115 if (grp->name)
114 sysfs_remove(sd); 116 kernfs_remove(kn);
115 } 117 }
116 sysfs_put(sd); 118 kernfs_put(kn);
117 return error; 119 return error;
118} 120}
119 121
@@ -203,25 +205,27 @@ EXPORT_SYMBOL_GPL(sysfs_update_group);
203void sysfs_remove_group(struct kobject *kobj, 205void sysfs_remove_group(struct kobject *kobj,
204 const struct attribute_group *grp) 206 const struct attribute_group *grp)
205{ 207{
206 struct sysfs_dirent *dir_sd = kobj->sd; 208 struct kernfs_node *parent = kobj->sd;
207 struct sysfs_dirent *sd; 209 struct kernfs_node *kn;
208 210
209 if (grp->name) { 211 if (grp->name) {
210 sd = sysfs_get_dirent(dir_sd, grp->name); 212 kn = kernfs_find_and_get(parent, grp->name);
211 if (!sd) { 213 if (!kn) {
212 WARN(!sd, KERN_WARNING 214 WARN(!kn, KERN_WARNING
213 "sysfs group %p not found for kobject '%s'\n", 215 "sysfs group %p not found for kobject '%s'\n",
214 grp, kobject_name(kobj)); 216 grp, kobject_name(kobj));
215 return; 217 return;
216 } 218 }
217 } else 219 } else {
218 sd = sysfs_get(dir_sd); 220 kn = parent;
221 kernfs_get(kn);
222 }
219 223
220 remove_files(sd, kobj, grp); 224 remove_files(kn, kobj, grp);
221 if (grp->name) 225 if (grp->name)
222 sysfs_remove(sd); 226 kernfs_remove(kn);
223 227
224 sysfs_put(sd); 228 kernfs_put(kn);
225} 229}
226EXPORT_SYMBOL_GPL(sysfs_remove_group); 230EXPORT_SYMBOL_GPL(sysfs_remove_group);
227 231
@@ -257,22 +261,22 @@ EXPORT_SYMBOL_GPL(sysfs_remove_groups);
257int sysfs_merge_group(struct kobject *kobj, 261int sysfs_merge_group(struct kobject *kobj,
258 const struct attribute_group *grp) 262 const struct attribute_group *grp)
259{ 263{
260 struct sysfs_dirent *dir_sd; 264 struct kernfs_node *parent;
261 int error = 0; 265 int error = 0;
262 struct attribute *const *attr; 266 struct attribute *const *attr;
263 int i; 267 int i;
264 268
265 dir_sd = sysfs_get_dirent(kobj->sd, grp->name); 269 parent = kernfs_find_and_get(kobj->sd, grp->name);
266 if (!dir_sd) 270 if (!parent)
267 return -ENOENT; 271 return -ENOENT;
268 272
269 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) 273 for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
270 error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); 274 error = sysfs_add_file(parent, *attr, false);
271 if (error) { 275 if (error) {
272 while (--i >= 0) 276 while (--i >= 0)
273 sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL); 277 kernfs_remove_by_name(parent, (*--attr)->name);
274 } 278 }
275 sysfs_put(dir_sd); 279 kernfs_put(parent);
276 280
277 return error; 281 return error;
278} 282}
@@ -286,14 +290,14 @@ EXPORT_SYMBOL_GPL(sysfs_merge_group);
286void sysfs_unmerge_group(struct kobject *kobj, 290void sysfs_unmerge_group(struct kobject *kobj,
287 const struct attribute_group *grp) 291 const struct attribute_group *grp)
288{ 292{
289 struct sysfs_dirent *dir_sd; 293 struct kernfs_node *parent;
290 struct attribute *const *attr; 294 struct attribute *const *attr;
291 295
292 dir_sd = sysfs_get_dirent(kobj->sd, grp->name); 296 parent = kernfs_find_and_get(kobj->sd, grp->name);
293 if (dir_sd) { 297 if (parent) {
294 for (attr = grp->attrs; *attr; ++attr) 298 for (attr = grp->attrs; *attr; ++attr)
295 sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); 299 kernfs_remove_by_name(parent, (*attr)->name);
296 sysfs_put(dir_sd); 300 kernfs_put(parent);
297 } 301 }
298} 302}
299EXPORT_SYMBOL_GPL(sysfs_unmerge_group); 303EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
@@ -308,15 +312,15 @@ EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
308int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, 312int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
309 struct kobject *target, const char *link_name) 313 struct kobject *target, const char *link_name)
310{ 314{
311 struct sysfs_dirent *dir_sd; 315 struct kernfs_node *parent;
312 int error = 0; 316 int error = 0;
313 317
314 dir_sd = sysfs_get_dirent(kobj->sd, group_name); 318 parent = kernfs_find_and_get(kobj->sd, group_name);
315 if (!dir_sd) 319 if (!parent)
316 return -ENOENT; 320 return -ENOENT;
317 321
318 error = sysfs_create_link_sd(dir_sd, target, link_name); 322 error = sysfs_create_link_sd(parent, target, link_name);
319 sysfs_put(dir_sd); 323 kernfs_put(parent);
320 324
321 return error; 325 return error;
322} 326}
@@ -331,12 +335,12 @@ EXPORT_SYMBOL_GPL(sysfs_add_link_to_group);
331void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, 335void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
332 const char *link_name) 336 const char *link_name)
333{ 337{
334 struct sysfs_dirent *dir_sd; 338 struct kernfs_node *parent;
335 339
336 dir_sd = sysfs_get_dirent(kobj->sd, group_name); 340 parent = kernfs_find_and_get(kobj->sd, group_name);
337 if (dir_sd) { 341 if (parent) {
338 sysfs_hash_and_remove(dir_sd, link_name, NULL); 342 kernfs_remove_by_name(parent, link_name);
339 sysfs_put(dir_sd); 343 kernfs_put(parent);
340 } 344 }
341} 345}
342EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); 346EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
deleted file mode 100644
index 1750f790af3b..000000000000
--- a/fs/sysfs/inode.c
+++ /dev/null
@@ -1,331 +0,0 @@
1/*
2 * fs/sysfs/inode.c - basic sysfs inode and dentry operations
3 *
4 * Copyright (c) 2001-3 Patrick Mochel
5 * Copyright (c) 2007 SUSE Linux Products GmbH
6 * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
7 *
8 * This file is released under the GPLv2.
9 *
10 * Please see Documentation/filesystems/sysfs.txt for more information.
11 */
12
13#undef DEBUG
14
15#include <linux/pagemap.h>
16#include <linux/namei.h>
17#include <linux/backing-dev.h>
18#include <linux/capability.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/slab.h>
22#include <linux/sysfs.h>
23#include <linux/xattr.h>
24#include <linux/security.h>
25#include "sysfs.h"
26
27static const struct address_space_operations sysfs_aops = {
28 .readpage = simple_readpage,
29 .write_begin = simple_write_begin,
30 .write_end = simple_write_end,
31};
32
33static struct backing_dev_info sysfs_backing_dev_info = {
34 .name = "sysfs",
35 .ra_pages = 0, /* No readahead */
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
37};
38
39static const struct inode_operations sysfs_inode_operations = {
40 .permission = sysfs_permission,
41 .setattr = sysfs_setattr,
42 .getattr = sysfs_getattr,
43 .setxattr = sysfs_setxattr,
44};
45
46int __init sysfs_inode_init(void)
47{
48 return bdi_init(&sysfs_backing_dev_info);
49}
50
51static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
52{
53 struct sysfs_inode_attrs *attrs;
54 struct iattr *iattrs;
55
56 attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
57 if (!attrs)
58 return NULL;
59 iattrs = &attrs->ia_iattr;
60
61 /* assign default attributes */
62 iattrs->ia_mode = sd->s_mode;
63 iattrs->ia_uid = GLOBAL_ROOT_UID;
64 iattrs->ia_gid = GLOBAL_ROOT_GID;
65 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
66
67 return attrs;
68}
69
70int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr)
71{
72 struct sysfs_inode_attrs *sd_attrs;
73 struct iattr *iattrs;
74 unsigned int ia_valid = iattr->ia_valid;
75
76 sd_attrs = sd->s_iattr;
77
78 if (!sd_attrs) {
79 /* setting attributes for the first time, allocate now */
80 sd_attrs = sysfs_init_inode_attrs(sd);
81 if (!sd_attrs)
82 return -ENOMEM;
83 sd->s_iattr = sd_attrs;
84 }
85	/* attributes were changed at least once in the past */
86 iattrs = &sd_attrs->ia_iattr;
87
88 if (ia_valid & ATTR_UID)
89 iattrs->ia_uid = iattr->ia_uid;
90 if (ia_valid & ATTR_GID)
91 iattrs->ia_gid = iattr->ia_gid;
92 if (ia_valid & ATTR_ATIME)
93 iattrs->ia_atime = iattr->ia_atime;
94 if (ia_valid & ATTR_MTIME)
95 iattrs->ia_mtime = iattr->ia_mtime;
96 if (ia_valid & ATTR_CTIME)
97 iattrs->ia_ctime = iattr->ia_ctime;
98 if (ia_valid & ATTR_MODE) {
99 umode_t mode = iattr->ia_mode;
100 iattrs->ia_mode = sd->s_mode = mode;
101 }
102 return 0;
103}
104
105int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
106{
107 struct inode *inode = dentry->d_inode;
108 struct sysfs_dirent *sd = dentry->d_fsdata;
109 int error;
110
111 if (!sd)
112 return -EINVAL;
113
114 mutex_lock(&sysfs_mutex);
115 error = inode_change_ok(inode, iattr);
116 if (error)
117 goto out;
118
119 error = sysfs_sd_setattr(sd, iattr);
120 if (error)
121 goto out;
122
123 /* this ignores size changes */
124 setattr_copy(inode, iattr);
125
126out:
127 mutex_unlock(&sysfs_mutex);
128 return error;
129}
130
131static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata,
132 u32 *secdata_len)
133{
134 struct sysfs_inode_attrs *iattrs;
135 void *old_secdata;
136 size_t old_secdata_len;
137
138 if (!sd->s_iattr) {
139 sd->s_iattr = sysfs_init_inode_attrs(sd);
140 if (!sd->s_iattr)
141 return -ENOMEM;
142 }
143
144 iattrs = sd->s_iattr;
145 old_secdata = iattrs->ia_secdata;
146 old_secdata_len = iattrs->ia_secdata_len;
147
148 iattrs->ia_secdata = *secdata;
149 iattrs->ia_secdata_len = *secdata_len;
150
151 *secdata = old_secdata;
152 *secdata_len = old_secdata_len;
153 return 0;
154}
155
156int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
157 size_t size, int flags)
158{
159 struct sysfs_dirent *sd = dentry->d_fsdata;
160 void *secdata;
161 int error;
162 u32 secdata_len = 0;
163
164 if (!sd)
165 return -EINVAL;
166
167 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
168 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
169 error = security_inode_setsecurity(dentry->d_inode, suffix,
170 value, size, flags);
171 if (error)
172 goto out;
173 error = security_inode_getsecctx(dentry->d_inode,
174 &secdata, &secdata_len);
175 if (error)
176 goto out;
177
178 mutex_lock(&sysfs_mutex);
179 error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len);
180 mutex_unlock(&sysfs_mutex);
181
182 if (secdata)
183 security_release_secctx(secdata, secdata_len);
184 } else
185 return -EINVAL;
186out:
187 return error;
188}
189
190static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
191{
192 inode->i_mode = mode;
193 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
194}
195
196static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
197{
198 inode->i_uid = iattr->ia_uid;
199 inode->i_gid = iattr->ia_gid;
200 inode->i_atime = iattr->ia_atime;
201 inode->i_mtime = iattr->ia_mtime;
202 inode->i_ctime = iattr->ia_ctime;
203}
204
205static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
206{
207 struct sysfs_inode_attrs *iattrs = sd->s_iattr;
208
209 inode->i_mode = sd->s_mode;
210 if (iattrs) {
211		/* sysfs_dirent has non-default attributes;
212		 * get them from the persistent copy in sysfs_dirent
213 */
214 set_inode_attr(inode, &iattrs->ia_iattr);
215 security_inode_notifysecctx(inode,
216 iattrs->ia_secdata,
217 iattrs->ia_secdata_len);
218 }
219
220 if (sysfs_type(sd) == SYSFS_DIR)
221 set_nlink(inode, sd->s_dir.subdirs + 2);
222}
223
224int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
225 struct kstat *stat)
226{
227 struct sysfs_dirent *sd = dentry->d_fsdata;
228 struct inode *inode = dentry->d_inode;
229
230 mutex_lock(&sysfs_mutex);
231 sysfs_refresh_inode(sd, inode);
232 mutex_unlock(&sysfs_mutex);
233
234 generic_fillattr(inode, stat);
235 return 0;
236}
237
238static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
239{
240 struct bin_attribute *bin_attr;
241
242 inode->i_private = sysfs_get(sd);
243 inode->i_mapping->a_ops = &sysfs_aops;
244 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
245 inode->i_op = &sysfs_inode_operations;
246
247 set_default_inode_attr(inode, sd->s_mode);
248 sysfs_refresh_inode(sd, inode);
249
250 /* initialize inode according to type */
251 switch (sysfs_type(sd)) {
252 case SYSFS_DIR:
253 inode->i_op = &sysfs_dir_inode_operations;
254 inode->i_fop = &sysfs_dir_operations;
255 break;
256 case SYSFS_KOBJ_ATTR:
257 inode->i_size = PAGE_SIZE;
258 inode->i_fop = &sysfs_file_operations;
259 break;
260 case SYSFS_KOBJ_BIN_ATTR:
261 bin_attr = sd->s_attr.bin_attr;
262 inode->i_size = bin_attr->size;
263 inode->i_fop = &sysfs_bin_operations;
264 break;
265 case SYSFS_KOBJ_LINK:
266 inode->i_op = &sysfs_symlink_inode_operations;
267 break;
268 default:
269 BUG();
270 }
271
272 unlock_new_inode(inode);
273}
274
275/**
276 * sysfs_get_inode - get inode for sysfs_dirent
277 * @sb: super block
278 * @sd: sysfs_dirent to allocate inode for
279 *
280 * Get the inode for @sd. If no such inode exists, a new one is
281 * allocated and initialized; sysfs_init_inode() unlocks it before
282 * it is returned.
283 *
284 * LOCKING:
285 * Kernel thread context (may sleep).
286 *
287 * RETURNS:
288 * Pointer to allocated inode on success, NULL on failure.
289 */
290struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
291{
292 struct inode *inode;
293
294 inode = iget_locked(sb, sd->s_ino);
295 if (inode && (inode->i_state & I_NEW))
296 sysfs_init_inode(sd, inode);
297
298 return inode;
299}
300
301/*
302 * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
303 * To prevent the sysfs inode numbers from being freed prematurely we take a
304 * reference to sysfs_dirent from the sysfs inode. A
305 * super_operations.evict_inode() implementation is needed to drop that
306 * reference upon inode destruction.
307 */
308void sysfs_evict_inode(struct inode *inode)
309{
310 struct sysfs_dirent *sd = inode->i_private;
311
312 truncate_inode_pages(&inode->i_data, 0);
313 clear_inode(inode);
314 sysfs_put(sd);
315}
316
317int sysfs_permission(struct inode *inode, int mask)
318{
319 struct sysfs_dirent *sd;
320
321 if (mask & MAY_NOT_BLOCK)
322 return -ECHILD;
323
324 sd = inode->i_private;
325
326 mutex_lock(&sysfs_mutex);
327 sysfs_refresh_inode(sd, inode);
328 mutex_unlock(&sysfs_mutex);
329
330 return generic_permission(inode, mask);
331}
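The deleted sysfs_get_inode() above is an instance of the standard VFS get-or-create idiom: iget_locked() returns either the existing inode or a fresh one flagged I_NEW and locked, and only the allocating caller initializes it and drops the lock with unlock_new_inode(). The skeleton, with my_fill_inode() as a placeholder initializer:

#include <linux/fs.h>

static void my_fill_inode(struct inode *inode)
{
	/* placeholder: set i_op, i_fop, mode, etc. for the new inode */
}

static struct inode *my_get_inode(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	inode = iget_locked(sb, ino);
	if (!inode)
		return NULL;

	if (inode->i_state & I_NEW) {
		/* we allocated it: initialize, then publish */
		my_fill_inode(inode);
		unlock_new_inode(inode);
	}
	return inode;
}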
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 834ec2cdb7a3..3eaf5c6622eb 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -14,146 +14,42 @@
14 14
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/pagemap.h>
18#include <linux/init.h> 17#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/magic.h>
21#include <linux/slab.h>
22#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
23 19
24#include "sysfs.h" 20#include "sysfs.h"
25 21
26 22static struct kernfs_root *sysfs_root;
27static struct vfsmount *sysfs_mnt; 23struct kernfs_node *sysfs_root_kn;
28struct kmem_cache *sysfs_dir_cachep;
29
30static const struct super_operations sysfs_ops = {
31 .statfs = simple_statfs,
32 .drop_inode = generic_delete_inode,
33 .evict_inode = sysfs_evict_inode,
34};
35
36struct sysfs_dirent sysfs_root = {
37 .s_name = "",
38 .s_count = ATOMIC_INIT(1),
39 .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
40 .s_mode = S_IFDIR | S_IRUGO | S_IXUGO,
41 .s_ino = 1,
42};
43
44static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
45{
46 struct inode *inode;
47 struct dentry *root;
48
49 sb->s_blocksize = PAGE_CACHE_SIZE;
50 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
51 sb->s_magic = SYSFS_MAGIC;
52 sb->s_op = &sysfs_ops;
53 sb->s_time_gran = 1;
54
55 /* get root inode, initialize and unlock it */
56 mutex_lock(&sysfs_mutex);
57 inode = sysfs_get_inode(sb, &sysfs_root);
58 mutex_unlock(&sysfs_mutex);
59 if (!inode) {
60 pr_debug("sysfs: could not get root inode\n");
61 return -ENOMEM;
62 }
63
64 /* instantiate and link root dentry */
65 root = d_make_root(inode);
66 if (!root) {
67 pr_debug("%s: could not get root dentry!\n", __func__);
68 return -ENOMEM;
69 }
70 root->d_fsdata = &sysfs_root;
71 sb->s_root = root;
72 sb->s_d_op = &sysfs_dentry_ops;
73 return 0;
74}
75
76static int sysfs_test_super(struct super_block *sb, void *data)
77{
78 struct sysfs_super_info *sb_info = sysfs_info(sb);
79 struct sysfs_super_info *info = data;
80 enum kobj_ns_type type;
81 int found = 1;
82
83 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
84 if (sb_info->ns[type] != info->ns[type])
85 found = 0;
86 }
87 return found;
88}
89
90static int sysfs_set_super(struct super_block *sb, void *data)
91{
92 int error;
93 error = set_anon_super(sb, data);
94 if (!error)
95 sb->s_fs_info = data;
96 return error;
97}
98
99static void free_sysfs_super_info(struct sysfs_super_info *info)
100{
101 int type;
102 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
103 kobj_ns_drop(type, info->ns[type]);
104 kfree(info);
105}
106 24
107static struct dentry *sysfs_mount(struct file_system_type *fs_type, 25static struct dentry *sysfs_mount(struct file_system_type *fs_type,
108 int flags, const char *dev_name, void *data) 26 int flags, const char *dev_name, void *data)
109{ 27{
110 struct sysfs_super_info *info; 28 struct dentry *root;
111 enum kobj_ns_type type; 29 void *ns;
112 struct super_block *sb; 30 bool new_sb;
113 int error;
114 31
115 if (!(flags & MS_KERNMOUNT)) { 32 if (!(flags & MS_KERNMOUNT)) {
116 if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) 33 if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
117 return ERR_PTR(-EPERM); 34 return ERR_PTR(-EPERM);
118 35
119 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { 36 if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
120 if (!kobj_ns_current_may_mount(type)) 37 return ERR_PTR(-EPERM);
121 return ERR_PTR(-EPERM);
122 }
123 }
124
125 info = kzalloc(sizeof(*info), GFP_KERNEL);
126 if (!info)
127 return ERR_PTR(-ENOMEM);
128
129 for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
130 info->ns[type] = kobj_ns_grab_current(type);
131
132 sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info);
133 if (IS_ERR(sb) || sb->s_fs_info != info)
134 free_sysfs_super_info(info);
135 if (IS_ERR(sb))
136 return ERR_CAST(sb);
137 if (!sb->s_root) {
138 error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
139 if (error) {
140 deactivate_locked_super(sb);
141 return ERR_PTR(error);
142 }
143 sb->s_flags |= MS_ACTIVE;
144 } 38 }
145 39
146 return dget(sb->s_root); 40 ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
41 root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns);
42 if (IS_ERR(root) || !new_sb)
43 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
44 return root;
147} 45}
148 46
149static void sysfs_kill_sb(struct super_block *sb) 47static void sysfs_kill_sb(struct super_block *sb)
150{ 48{
151 struct sysfs_super_info *info = sysfs_info(sb); 49 void *ns = (void *)kernfs_super_ns(sb);
152 /* Remove the superblock from fs_supers/s_instances 50
153 * so we can't find it, before freeing sysfs_super_info. 51 kernfs_kill_sb(sb);
154 */ 52 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
155 kill_anon_super(sb);
156 free_sysfs_super_info(info);
157} 53}
158 54
159static struct file_system_type sysfs_fs_type = { 55static struct file_system_type sysfs_fs_type = {
@@ -165,48 +61,19 @@ static struct file_system_type sysfs_fs_type = {
165 61
166int __init sysfs_init(void) 62int __init sysfs_init(void)
167{ 63{
168 int err = -ENOMEM; 64 int err;
169 65
170 sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", 66 sysfs_root = kernfs_create_root(NULL, NULL);
171 sizeof(struct sysfs_dirent), 67 if (IS_ERR(sysfs_root))
172 0, 0, NULL); 68 return PTR_ERR(sysfs_root);
173 if (!sysfs_dir_cachep)
174 goto out;
175 69
176 err = sysfs_inode_init(); 70 sysfs_root_kn = sysfs_root->kn;
177 if (err)
178 goto out_err;
179 71
180 err = register_filesystem(&sysfs_fs_type); 72 err = register_filesystem(&sysfs_fs_type);
181 if (!err) { 73 if (err) {
182 sysfs_mnt = kern_mount(&sysfs_fs_type); 74 kernfs_destroy_root(sysfs_root);
183 if (IS_ERR(sysfs_mnt)) { 75 return err;
184 printk(KERN_ERR "sysfs: could not mount!\n"); 76 }
185 err = PTR_ERR(sysfs_mnt);
186 sysfs_mnt = NULL;
187 unregister_filesystem(&sysfs_fs_type);
188 goto out_err;
189 }
190 } else
191 goto out_err;
192out:
193 return err;
194out_err:
195 kmem_cache_destroy(sysfs_dir_cachep);
196 sysfs_dir_cachep = NULL;
197 goto out;
198}
199
200#undef sysfs_get
201struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
202{
203 return __sysfs_get(sd);
204}
205EXPORT_SYMBOL_GPL(sysfs_get);
206 77
207#undef sysfs_put 78 return 0;
208void sysfs_put(struct sysfs_dirent *sd)
209{
210 __sysfs_put(sd);
211} 79}
212EXPORT_SYMBOL_GPL(sysfs_put);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3ae3f1bf1a09..aecb15f84557 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,109 +11,73 @@
11 */ 11 */
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/gfp.h>
15#include <linux/mount.h>
16#include <linux/module.h> 14#include <linux/module.h>
17#include <linux/kobject.h> 15#include <linux/kobject.h>
18#include <linux/namei.h>
19#include <linux/mutex.h> 16#include <linux/mutex.h>
20#include <linux/security.h> 17#include <linux/security.h>
21 18
22#include "sysfs.h" 19#include "sysfs.h"
23 20
24static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, 21static int sysfs_do_create_link_sd(struct kernfs_node *parent,
25 struct kobject *target, 22 struct kobject *target_kobj,
26 const char *name, int warn) 23 const char *name, int warn)
27{ 24{
28 struct sysfs_dirent *target_sd = NULL; 25 struct kernfs_node *kn, *target = NULL;
29 struct sysfs_dirent *sd = NULL;
30 struct sysfs_addrm_cxt acxt;
31 enum kobj_ns_type ns_type;
32 int error;
33 26
34 BUG_ON(!name || !parent_sd); 27 BUG_ON(!name || !parent);
35 28
36 /* 29 /*
37 * We don't own @target and it may be removed at any time. 30 * We don't own @target_kobj and it may be removed at any time.
38 * Synchronize using sysfs_symlink_target_lock. See 31 * Synchronize using sysfs_symlink_target_lock. See
39 * sysfs_remove_dir() for details. 32 * sysfs_remove_dir() for details.
40 */ 33 */
41 spin_lock(&sysfs_symlink_target_lock); 34 spin_lock(&sysfs_symlink_target_lock);
42 if (target->sd) 35 if (target_kobj->sd) {
43 target_sd = sysfs_get(target->sd); 36 target = target_kobj->sd;
37 kernfs_get(target);
38 }
44 spin_unlock(&sysfs_symlink_target_lock); 39 spin_unlock(&sysfs_symlink_target_lock);
45 40
46 error = -ENOENT; 41 if (!target)
47 if (!target_sd) 42 return -ENOENT;
48 goto out_put;
49
50 error = -ENOMEM;
51 sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK);
52 if (!sd)
53 goto out_put;
54 43
55 ns_type = sysfs_ns_type(parent_sd); 44 kn = kernfs_create_link(parent, name, target);
56 if (ns_type) 45 kernfs_put(target);
57 sd->s_ns = target_sd->s_ns;
58 sd->s_symlink.target_sd = target_sd;
59 target_sd = NULL; /* reference is now owned by the symlink */
60
61 sysfs_addrm_start(&acxt);
62 /* Symlinks must be between directories with the same ns_type */
63 if (!ns_type ||
64 (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
65 if (warn)
66 error = sysfs_add_one(&acxt, sd, parent_sd);
67 else
68 error = __sysfs_add_one(&acxt, sd, parent_sd);
69 } else {
70 error = -EINVAL;
71 WARN(1, KERN_WARNING
72 "sysfs: symlink across ns_types %s/%s -> %s/%s\n",
73 parent_sd->s_name,
74 sd->s_name,
75 sd->s_symlink.target_sd->s_parent->s_name,
76 sd->s_symlink.target_sd->s_name);
77 }
78 sysfs_addrm_finish(&acxt);
79 46
80 if (error) 47 if (!IS_ERR(kn))
81 goto out_put; 48 return 0;
82 49
83 return 0; 50 if (warn && PTR_ERR(kn) == -EEXIST)
84 51 sysfs_warn_dup(parent, name);
85 out_put: 52 return PTR_ERR(kn);
86 sysfs_put(target_sd);
87 sysfs_put(sd);
88 return error;
89} 53}
90 54
91/** 55/**
92 * sysfs_create_link_sd - create symlink to a given object. 56 * sysfs_create_link_sd - create symlink to a given object.
93 * @sd: directory we're creating the link in. 57 * @kn: directory we're creating the link in.
94 * @target: object we're pointing to. 58 * @target: object we're pointing to.
95 * @name: name of the symlink. 59 * @name: name of the symlink.
96 */ 60 */
97int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, 61int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
98 const char *name) 62 const char *name)
99{ 63{
100 return sysfs_do_create_link_sd(sd, target, name, 1); 64 return sysfs_do_create_link_sd(kn, target, name, 1);
101} 65}
102 66
103static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, 67static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
104 const char *name, int warn) 68 const char *name, int warn)
105{ 69{
106 struct sysfs_dirent *parent_sd = NULL; 70 struct kernfs_node *parent = NULL;
107 71
108 if (!kobj) 72 if (!kobj)
109 parent_sd = &sysfs_root; 73 parent = sysfs_root_kn;
110 else 74 else
111 parent_sd = kobj->sd; 75 parent = kobj->sd;
112 76
113 if (!parent_sd) 77 if (!parent)
114 return -EFAULT; 78 return -EFAULT;
115 79
116 return sysfs_do_create_link_sd(parent_sd, target, name, warn); 80 return sysfs_do_create_link_sd(parent, target, name, warn);
117} 81}
118 82
119/** 83/**
@@ -164,10 +128,10 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
164 * sysfs_remove_dir() for details. 128 * sysfs_remove_dir() for details.
165 */ 129 */
166 spin_lock(&sysfs_symlink_target_lock); 130 spin_lock(&sysfs_symlink_target_lock);
167 if (targ->sd && sysfs_ns_type(kobj->sd)) 131 if (targ->sd && kernfs_ns_enabled(kobj->sd))
168 ns = targ->sd->s_ns; 132 ns = targ->sd->ns;
169 spin_unlock(&sysfs_symlink_target_lock); 133 spin_unlock(&sysfs_symlink_target_lock);
170 sysfs_hash_and_remove(kobj->sd, name, ns); 134 kernfs_remove_by_name_ns(kobj->sd, name, ns);
171} 135}
172 136
173/** 137/**
@@ -177,14 +141,14 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
177 */ 141 */
178void sysfs_remove_link(struct kobject *kobj, const char *name) 142void sysfs_remove_link(struct kobject *kobj, const char *name)
179{ 143{
180 struct sysfs_dirent *parent_sd = NULL; 144 struct kernfs_node *parent = NULL;
181 145
182 if (!kobj) 146 if (!kobj)
183 parent_sd = &sysfs_root; 147 parent = sysfs_root_kn;
184 else 148 else
185 parent_sd = kobj->sd; 149 parent = kobj->sd;
186 150
187 sysfs_hash_and_remove(parent_sd, name, NULL); 151 kernfs_remove_by_name(parent, name);
188} 152}
189EXPORT_SYMBOL_GPL(sysfs_remove_link); 153EXPORT_SYMBOL_GPL(sysfs_remove_link);
190 154
@@ -201,130 +165,33 @@ EXPORT_SYMBOL_GPL(sysfs_remove_link);
201int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, 165int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
202 const char *old, const char *new, const void *new_ns) 166 const char *old, const char *new, const void *new_ns)
203{ 167{
204 struct sysfs_dirent *parent_sd, *sd = NULL; 168 struct kernfs_node *parent, *kn = NULL;
205 const void *old_ns = NULL; 169 const void *old_ns = NULL;
206 int result; 170 int result;
207 171
208 if (!kobj) 172 if (!kobj)
209 parent_sd = &sysfs_root; 173 parent = sysfs_root_kn;
210 else 174 else
211 parent_sd = kobj->sd; 175 parent = kobj->sd;
212 176
213 if (targ->sd) 177 if (targ->sd)
214 old_ns = targ->sd->s_ns; 178 old_ns = targ->sd->ns;
215 179
216 result = -ENOENT; 180 result = -ENOENT;
217 sd = sysfs_get_dirent_ns(parent_sd, old, old_ns); 181 kn = kernfs_find_and_get_ns(parent, old, old_ns);
218 if (!sd) 182 if (!kn)
219 goto out; 183 goto out;
220 184
221 result = -EINVAL; 185 result = -EINVAL;
222 if (sysfs_type(sd) != SYSFS_KOBJ_LINK) 186 if (kernfs_type(kn) != KERNFS_LINK)
223 goto out; 187 goto out;
224 if (sd->s_symlink.target_sd->s_dir.kobj != targ) 188 if (kn->symlink.target_kn->priv != targ)
225 goto out; 189 goto out;
226 190
227 result = sysfs_rename(sd, parent_sd, new, new_ns); 191 result = kernfs_rename_ns(kn, parent, new, new_ns);
228 192
229out: 193out:
230 sysfs_put(sd); 194 kernfs_put(kn);
231 return result; 195 return result;
232} 196}
233EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); 197EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
234
235static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
236 struct sysfs_dirent *target_sd, char *path)
237{
238 struct sysfs_dirent *base, *sd;
239 char *s = path;
240 int len = 0;
241
242 /* go up to the root, stop at the base */
243 base = parent_sd;
244 while (base->s_parent) {
245 sd = target_sd->s_parent;
246 while (sd->s_parent && base != sd)
247 sd = sd->s_parent;
248
249 if (base == sd)
250 break;
251
252 strcpy(s, "../");
253 s += 3;
254 base = base->s_parent;
255 }
256
257 /* determine end of target string for reverse fillup */
258 sd = target_sd;
259 while (sd->s_parent && sd != base) {
260 len += strlen(sd->s_name) + 1;
261 sd = sd->s_parent;
262 }
263
264 /* check limits */
265 if (len < 2)
266 return -EINVAL;
267 len--;
268 if ((s - path) + len > PATH_MAX)
269 return -ENAMETOOLONG;
270
271 /* reverse fillup of target string from target to base */
272 sd = target_sd;
273 while (sd->s_parent && sd != base) {
274 int slen = strlen(sd->s_name);
275
276 len -= slen;
277 strncpy(s + len, sd->s_name, slen);
278 if (len)
279 s[--len] = '/';
280
281 sd = sd->s_parent;
282 }
283
284 return 0;
285}
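A concrete trace of the helper being deleted here: for a link created in /sys/class/net whose target is the node backing /sys/devices/virtual/net/lo, the first loop climbs from the link's directory to the common ancestor (the root, in this case), emitting one "../" per level, and the reverse fill-up then writes the target's component names back to front. The result is "../../devices/virtual/net/lo", exactly the shape of the symlinks visible under /sys.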
286
287static int sysfs_getlink(struct dentry *dentry, char *path)
288{
289 struct sysfs_dirent *sd = dentry->d_fsdata;
290 struct sysfs_dirent *parent_sd = sd->s_parent;
291 struct sysfs_dirent *target_sd = sd->s_symlink.target_sd;
292 int error;
293
294 mutex_lock(&sysfs_mutex);
295 error = sysfs_get_target_path(parent_sd, target_sd, path);
296 mutex_unlock(&sysfs_mutex);
297
298 return error;
299}
300
301static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
302{
303 int error = -ENOMEM;
304 unsigned long page = get_zeroed_page(GFP_KERNEL);
305 if (page) {
306 error = sysfs_getlink(dentry, (char *) page);
307 if (error < 0)
308 free_page((unsigned long)page);
309 }
310 nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
311 return NULL;
312}
313
314static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd,
315 void *cookie)
316{
317 char *page = nd_get_link(nd);
318 if (!IS_ERR(page))
319 free_page((unsigned long)page);
320}
321
322const struct inode_operations sysfs_symlink_inode_operations = {
323 .setxattr = sysfs_setxattr,
324 .readlink = generic_readlink,
325 .follow_link = sysfs_follow_link,
326 .put_link = sysfs_put_link,
327 .setattr = sysfs_setattr,
328 .getattr = sysfs_getattr,
329 .permission = sysfs_permission,
330};
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 0af09fbfb3f6..0e2f1cccb812 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,248 +8,36 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/lockdep.h> 11#ifndef __SYSFS_INTERNAL_H
12#include <linux/kobject_ns.h> 12#define __SYSFS_INTERNAL_H
13#include <linux/fs.h>
14#include <linux/rbtree.h>
15 13
16struct sysfs_open_dirent; 14#include <linux/sysfs.h>
17
18/* type-specific structures for sysfs_dirent->s_* union members */
19struct sysfs_elem_dir {
20 struct kobject *kobj;
21
22 unsigned long subdirs;
23 /* children rbtree starts here and goes through sd->s_rb */
24 struct rb_root children;
25};
26
27struct sysfs_elem_symlink {
28 struct sysfs_dirent *target_sd;
29};
30
31struct sysfs_elem_attr {
32 union {
33 struct attribute *attr;
34 struct bin_attribute *bin_attr;
35 };
36 struct sysfs_open_dirent *open;
37};
38
39struct sysfs_inode_attrs {
40 struct iattr ia_iattr;
41 void *ia_secdata;
42 u32 ia_secdata_len;
43};
44
45/*
46 * sysfs_dirent - the building block of the sysfs hierarchy. Each and
47 * every sysfs node is represented by a single sysfs_dirent.
48 *
49 * As long as an s_count reference is held, the sysfs_dirent itself is
50 * accessible. Dereferencing s_elem or any other outer entity
51 * requires an s_active reference.
52 */
53struct sysfs_dirent {
54 atomic_t s_count;
55 atomic_t s_active;
56#ifdef CONFIG_DEBUG_LOCK_ALLOC
57 struct lockdep_map dep_map;
58#endif
59 struct sysfs_dirent *s_parent;
60 const char *s_name;
61
62 struct rb_node s_rb;
63
64 union {
65 struct completion *completion;
66 struct sysfs_dirent *removed_list;
67 } u;
68
69 const void *s_ns; /* namespace tag */
70 unsigned int s_hash; /* ns + name hash */
71 union {
72 struct sysfs_elem_dir s_dir;
73 struct sysfs_elem_symlink s_symlink;
74 struct sysfs_elem_attr s_attr;
75 };
76
77 unsigned short s_flags;
78 umode_t s_mode;
79 unsigned int s_ino;
80 struct sysfs_inode_attrs *s_iattr;
81};
82
83#define SD_DEACTIVATED_BIAS INT_MIN
84
85#define SYSFS_TYPE_MASK 0x00ff
86#define SYSFS_DIR 0x0001
87#define SYSFS_KOBJ_ATTR 0x0002
88#define SYSFS_KOBJ_BIN_ATTR 0x0004
89#define SYSFS_KOBJ_LINK 0x0008
90#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
91#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
92
93/* identify any namespace tag on sysfs_dirents */
94#define SYSFS_NS_TYPE_MASK 0xf00
95#define SYSFS_NS_TYPE_SHIFT 8
96
97#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
98#define SYSFS_FLAG_REMOVED 0x02000
99
100static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
101{
102 return sd->s_flags & SYSFS_TYPE_MASK;
103}
104
105/*
106 * Return any namespace tags on this dirent.
107 * enum kobj_ns_type is defined in linux/kobject.h
108 */
109static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
110{
111 return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
112}
113
114#ifdef CONFIG_DEBUG_LOCK_ALLOC
115
116#define sysfs_dirent_init_lockdep(sd) \
117do { \
118 struct attribute *attr = sd->s_attr.attr; \
119 struct lock_class_key *key = attr->key; \
120 if (!key) \
121 key = &attr->skey; \
122 \
123 lockdep_init_map(&sd->dep_map, "s_active", key, 0); \
124} while (0)
125
126/* Test for attributes that want to ignore lockdep for read-locking */
127static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
128{
129 int type = sysfs_type(sd);
130
131 return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) &&
132 sd->s_attr.attr->ignore_lockdep;
133}
134
135#else
136
137#define sysfs_dirent_init_lockdep(sd) do {} while (0)
138
139static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
140{
141 return true;
142}
143
144#endif
145
146/*
147 * Context structure to be used while adding/removing nodes.
148 */
149struct sysfs_addrm_cxt {
150 struct sysfs_dirent *removed;
151};
152 15
153/* 16/*
154 * mount.c 17 * mount.c
155 */ 18 */
156 19extern struct kernfs_node *sysfs_root_kn;
157/*
158 * Each sb is associated with a set of namespace tags (i.e.
159 * the network namespace of the task which mounted this sysfs
160 * instance).
161 */
162struct sysfs_super_info {
163 void *ns[KOBJ_NS_TYPES];
164};
165#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
166extern struct sysfs_dirent sysfs_root;
167extern struct kmem_cache *sysfs_dir_cachep;
168 20
169/* 21/*
170 * dir.c 22 * dir.c
171 */ 23 */
172extern struct mutex sysfs_mutex;
173extern spinlock_t sysfs_symlink_target_lock; 24extern spinlock_t sysfs_symlink_target_lock;
174extern const struct dentry_operations sysfs_dentry_ops;
175
176extern const struct file_operations sysfs_dir_operations;
177extern const struct inode_operations sysfs_dir_inode_operations;
178 25
179struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); 26void sysfs_warn_dup(struct kernfs_node *parent, const char *name);
180void sysfs_put_active(struct sysfs_dirent *sd);
181void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt);
182void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name);
183int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
184 struct sysfs_dirent *parent_sd);
185int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
186 struct sysfs_dirent *parent_sd);
187void sysfs_remove(struct sysfs_dirent *sd);
188int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
189 const void *ns);
190void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
191
192struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
193 const unsigned char *name,
194 const void *ns);
195struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
196
197void release_sysfs_dirent(struct sysfs_dirent *sd);
198
199int sysfs_create_subdir(struct kobject *kobj, const char *name,
200 struct sysfs_dirent **p_sd);
201
202int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
203 const char *new_name, const void *new_ns);
204
205static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
206{
207 if (sd) {
208 WARN_ON(!atomic_read(&sd->s_count));
209 atomic_inc(&sd->s_count);
210 }
211 return sd;
212}
213#define sysfs_get(sd) __sysfs_get(sd)
214
215static inline void __sysfs_put(struct sysfs_dirent *sd)
216{
217 if (sd && atomic_dec_and_test(&sd->s_count))
218 release_sysfs_dirent(sd);
219}
220#define sysfs_put(sd) __sysfs_put(sd)
221
222/*
223 * inode.c
224 */
225struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
226void sysfs_evict_inode(struct inode *inode);
227int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
228int sysfs_permission(struct inode *inode, int mask);
229int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
230int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
231 struct kstat *stat);
232int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
233 size_t size, int flags);
234int sysfs_inode_init(void);
235 27
236/* 28/*
237 * file.c 29 * file.c
238 */ 30 */
239extern const struct file_operations sysfs_file_operations; 31int sysfs_add_file(struct kernfs_node *parent,
240extern const struct file_operations sysfs_bin_operations; 32 const struct attribute *attr, bool is_bin);
241 33int sysfs_add_file_mode_ns(struct kernfs_node *parent,
242int sysfs_add_file(struct sysfs_dirent *dir_sd, 34 const struct attribute *attr, bool is_bin,
243 const struct attribute *attr, int type);
244
245int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
246 const struct attribute *attr, int type,
247 umode_t amode, const void *ns); 35 umode_t amode, const void *ns);
248void sysfs_unmap_bin_file(struct sysfs_dirent *sd);
249 36
250/* 37/*
251 * symlink.c 38 * symlink.c
252 */ 39 */
253extern const struct inode_operations sysfs_symlink_inode_operations; 40int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
254int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
255 const char *name); 41 const char *name);
42
43#endif /* __SYSFS_INTERNAL_H */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index cc1febd8fadf..5157b866a853 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2118,26 +2118,10 @@ out_free:
2118 */ 2118 */
2119static void free_inodes(struct fsck_data *fsckd) 2119static void free_inodes(struct fsck_data *fsckd)
2120{ 2120{
2121 struct rb_node *this = fsckd->inodes.rb_node; 2121 struct fsck_inode *fscki, *n;
2122 struct fsck_inode *fscki;
2123 2122
2124 while (this) { 2123 rbtree_postorder_for_each_entry_safe(fscki, n, &fsckd->inodes, rb)
2125 if (this->rb_left) 2124 kfree(fscki);
2126 this = this->rb_left;
2127 else if (this->rb_right)
2128 this = this->rb_right;
2129 else {
2130 fscki = rb_entry(this, struct fsck_inode, rb);
2131 this = rb_parent(this);
2132 if (this) {
2133 if (this->rb_left == &fscki->rb)
2134 this->rb_left = NULL;
2135 else
2136 this->rb_right = NULL;
2137 }
2138 kfree(fscki);
2139 }
2140 }
2141} 2125}
2142 2126
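This and the following ubifs hunks all apply the same transformation: the hand-rolled postorder walks that pruned parent pointers by hand are replaced with rbtree_postorder_for_each_entry_safe() (in include/linux/rbtree.h since v3.12), which visits every node after its children and therefore lets the body free the entry as it goes. A self-contained sketch with an invented entry type:

#include <linux/rbtree.h>
#include <linux/slab.h>

struct item {
	struct rb_node rb;	/* linkage inside the tree */
	int key;
};

static void free_all_items(struct rb_root *root)
{
	struct item *it, *n;

	/*
	 * Postorder guarantees the children of 'it' were already
	 * visited, so freeing 'it' here cannot invalidate the walk.
	 * No rb_erase(), hence no rebalancing during teardown.
	 */
	rbtree_postorder_for_each_entry_safe(it, n, root, rb)
		kfree(it);

	*root = RB_ROOT;
}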
2143/** 2127/**
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 36bd4efd0819..a902c5919e42 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -574,27 +574,10 @@ static int done_already(struct rb_root *done_tree, int lnum)
574 */ 574 */
575static void destroy_done_tree(struct rb_root *done_tree) 575static void destroy_done_tree(struct rb_root *done_tree)
576{ 576{
577 struct rb_node *this = done_tree->rb_node; 577 struct done_ref *dr, *n;
578 struct done_ref *dr;
579 578
580 while (this) { 579 rbtree_postorder_for_each_entry_safe(dr, n, done_tree, rb)
581 if (this->rb_left) {
582 this = this->rb_left;
583 continue;
584 } else if (this->rb_right) {
585 this = this->rb_right;
586 continue;
587 }
588 dr = rb_entry(this, struct done_ref, rb);
589 this = rb_parent(this);
590 if (this) {
591 if (this->rb_left == &dr->rb)
592 this->rb_left = NULL;
593 else
594 this->rb_right = NULL;
595 }
596 kfree(dr); 580 kfree(dr);
597 }
598} 581}
599 582
600/** 583/**
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index ba32da3fe08a..f1c3e5a1b315 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -815,27 +815,10 @@ static int dbg_find_check_orphan(struct rb_root *root, ino_t inum)
815 815
816static void dbg_free_check_tree(struct rb_root *root) 816static void dbg_free_check_tree(struct rb_root *root)
817{ 817{
818 struct rb_node *this = root->rb_node; 818 struct check_orphan *o, *n;
819 struct check_orphan *o;
820 819
821 while (this) { 820 rbtree_postorder_for_each_entry_safe(o, n, root, rb)
822 if (this->rb_left) {
823 this = this->rb_left;
824 continue;
825 } else if (this->rb_right) {
826 this = this->rb_right;
827 continue;
828 }
829 o = rb_entry(this, struct check_orphan, rb);
830 this = rb_parent(this);
831 if (this) {
832 if (this->rb_left == &o->rb)
833 this->rb_left = NULL;
834 else
835 this->rb_right = NULL;
836 }
837 kfree(o); 821 kfree(o);
838 }
839} 822}
840 823
841static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, 824static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 065096e36ed9..c14adb2f420c 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -1335,29 +1335,14 @@ static void remove_ino(struct ubifs_info *c, ino_t inum)
1335 */ 1335 */
1336void ubifs_destroy_size_tree(struct ubifs_info *c) 1336void ubifs_destroy_size_tree(struct ubifs_info *c)
1337{ 1337{
1338 struct rb_node *this = c->size_tree.rb_node; 1338 struct size_entry *e, *n;
1339 struct size_entry *e;
1340 1339
1341 while (this) { 1340 rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) {
1342 if (this->rb_left) {
1343 this = this->rb_left;
1344 continue;
1345 } else if (this->rb_right) {
1346 this = this->rb_right;
1347 continue;
1348 }
1349 e = rb_entry(this, struct size_entry, rb);
1350 if (e->inode) 1341 if (e->inode)
1351 iput(e->inode); 1342 iput(e->inode);
1352 this = rb_parent(this);
1353 if (this) {
1354 if (this->rb_left == &e->rb)
1355 this->rb_left = NULL;
1356 else
1357 this->rb_right = NULL;
1358 }
1359 kfree(e); 1343 kfree(e);
1360 } 1344 }
1345
1361 c->size_tree = RB_ROOT; 1346 c->size_tree = RB_ROOT;
1362} 1347}
1363 1348
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f69daa514a57..5ded8490c0c6 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -873,26 +873,10 @@ static void free_orphans(struct ubifs_info *c)
873 */ 873 */
874static void free_buds(struct ubifs_info *c) 874static void free_buds(struct ubifs_info *c)
875{ 875{
876 struct rb_node *this = c->buds.rb_node; 876 struct ubifs_bud *bud, *n;
877 struct ubifs_bud *bud; 877
878 878 rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb)
879 while (this) { 879 kfree(bud);
880 if (this->rb_left)
881 this = this->rb_left;
882 else if (this->rb_right)
883 this = this->rb_right;
884 else {
885 bud = rb_entry(this, struct ubifs_bud, rb);
886 this = rb_parent(this);
887 if (this) {
888 if (this->rb_left == &bud->rb)
889 this->rb_left = NULL;
890 else
891 this->rb_right = NULL;
892 }
893 kfree(bud);
894 }
895 }
896} 880}
897 881
898/** 882/**
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 349f31a30f40..9083bc7ed4ae 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -178,27 +178,11 @@ static int ins_clr_old_idx_znode(struct ubifs_info *c,
178 */ 178 */
179void destroy_old_idx(struct ubifs_info *c) 179void destroy_old_idx(struct ubifs_info *c)
180{ 180{
181 struct rb_node *this = c->old_idx.rb_node; 181 struct ubifs_old_idx *old_idx, *n;
182 struct ubifs_old_idx *old_idx;
183 182
184 while (this) { 183 rbtree_postorder_for_each_entry_safe(old_idx, n, &c->old_idx, rb)
185 if (this->rb_left) {
186 this = this->rb_left;
187 continue;
188 } else if (this->rb_right) {
189 this = this->rb_right;
190 continue;
191 }
192 old_idx = rb_entry(this, struct ubifs_old_idx, rb);
193 this = rb_parent(this);
194 if (this) {
195 if (this->rb_left == &old_idx->rb)
196 this->rb_left = NULL;
197 else
198 this->rb_right = NULL;
199 }
200 kfree(old_idx); 184 kfree(old_idx);
201 } 185
202 c->old_idx = RB_ROOT; 186 c->old_idx = RB_ROOT;
203} 187}
204 188
diff --git a/fs/udf/file.c b/fs/udf/file.c
index c02a27a19c6d..1037637957c7 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -144,6 +144,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
144 size_t count = iocb->ki_nbytes; 144 size_t count = iocb->ki_nbytes;
145 struct udf_inode_info *iinfo = UDF_I(inode); 145 struct udf_inode_info *iinfo = UDF_I(inode);
146 146
147 mutex_lock(&inode->i_mutex);
147 down_write(&iinfo->i_data_sem); 148 down_write(&iinfo->i_data_sem);
148 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 149 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
149 if (file->f_flags & O_APPEND) 150 if (file->f_flags & O_APPEND)
@@ -156,6 +157,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
156 pos + count)) { 157 pos + count)) {
157 err = udf_expand_file_adinicb(inode); 158 err = udf_expand_file_adinicb(inode);
158 if (err) { 159 if (err) {
160 mutex_unlock(&inode->i_mutex);
159 udf_debug("udf_expand_adinicb: err=%d\n", err); 161 udf_debug("udf_expand_adinicb: err=%d\n", err);
160 return err; 162 return err;
161 } 163 }
@@ -169,9 +171,17 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
169 } else 171 } else
170 up_write(&iinfo->i_data_sem); 172 up_write(&iinfo->i_data_sem);
171 173
172 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); 174 retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
173 if (retval > 0) 175 mutex_unlock(&inode->i_mutex);
176
177 if (retval > 0) {
178 ssize_t err;
179
174 mark_inode_dirty(inode); 180 mark_inode_dirty(inode);
181 err = generic_write_sync(file, iocb->ki_pos - retval, retval);
182 if (err < 0)
183 retval = err;
184 }
175 185
176 return retval; 186 return retval;
177} 187}
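The udf_file_aio_write() change keeps the shape of generic_file_aio_write() but puts the filesystem in charge of the lock: i_mutex is taken before the in-ICB checks so udf_expand_file_adinicb() runs under it (the WARN_ON_ONCE added in inode.c below enforces that), the __generic_file_aio_write() variant that expects i_mutex to be held performs the write, the lock is dropped, and only then is the O_SYNC work done. Because a successful write has already advanced ->ki_pos, the range to sync starts at ki_pos - retval. Condensed, assuming the same names as the hunk:

	mutex_lock(&inode->i_mutex);
	/* ... in-ICB expansion and other fs-private work under i_mutex ... */
	retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
	mutex_unlock(&inode->i_mutex);

	if (retval > 0) {
		ssize_t err;

		mark_inode_dirty(inode);
		/* ki_pos is already past the write: sync [ki_pos - retval, ki_pos) */
		err = generic_write_sync(file, iocb->ki_pos - retval, retval);
		if (err < 0)
			retval = err;
	}
	return retval;

The same ki_pos - ret start offset shows up again in the xfs_file.c hunk further down.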
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 062b7925bca0..982ce05c87ed 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -265,6 +265,7 @@ int udf_expand_file_adinicb(struct inode *inode)
265 .nr_to_write = 1, 265 .nr_to_write = 1,
266 }; 266 };
267 267
268 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
268 if (!iinfo->i_lenAlloc) { 269 if (!iinfo->i_lenAlloc) {
269 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 270 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
270 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; 271 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 5f6fc17d6bc5..9737cba1357d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1010,6 +1010,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1010 else 1010 else
1011 udf_truncate_tail_extent(inode); 1011 udf_truncate_tail_extent(inode);
1012 mark_inode_dirty(inode); 1012 mark_inode_dirty(inode);
1013 up_write(&iinfo->i_data_sem);
1013 1014
1014 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1015 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1015 if (!fi) 1016 if (!fi)
@@ -1023,7 +1024,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1023 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 1024 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1024 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1025 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
1025 mark_inode_dirty(dir); 1026 mark_inode_dirty(dir);
1026 up_write(&iinfo->i_data_sem);
1027 if (fibh.sbh != fibh.ebh) 1027 if (fibh.sbh != fibh.ebh)
1028 brelse(fibh.ebh); 1028 brelse(fibh.ebh);
1029 brelse(fibh.sbh); 1029 brelse(fibh.sbh);
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
deleted file mode 100644
index 9fbea87fdb6e..000000000000
--- a/fs/xattr_acl.c
+++ /dev/null
@@ -1,180 +0,0 @@
1/*
2 * linux/fs/xattr_acl.c
3 *
4 * Almost all from linux/fs/ext2/acl.c:
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */
7
8#include <linux/export.h>
9#include <linux/fs.h>
10#include <linux/posix_acl_xattr.h>
11#include <linux/gfp.h>
12#include <linux/user_namespace.h>
13
14/*
15 * Fix up the uids and gids in posix acl extended attributes in place.
16 */
17static void posix_acl_fix_xattr_userns(
18 struct user_namespace *to, struct user_namespace *from,
19 void *value, size_t size)
20{
21 posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
22 posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
23 int count;
24 kuid_t uid;
25 kgid_t gid;
26
27 if (!value)
28 return;
29 if (size < sizeof(posix_acl_xattr_header))
30 return;
31 if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
32 return;
33
34 count = posix_acl_xattr_count(size);
35 if (count < 0)
36 return;
37 if (count == 0)
38 return;
39
40 for (end = entry + count; entry != end; entry++) {
41 switch(le16_to_cpu(entry->e_tag)) {
42 case ACL_USER:
43 uid = make_kuid(from, le32_to_cpu(entry->e_id));
44 entry->e_id = cpu_to_le32(from_kuid(to, uid));
45 break;
46 case ACL_GROUP:
47 gid = make_kgid(from, le32_to_cpu(entry->e_id));
48 entry->e_id = cpu_to_le32(from_kgid(to, gid));
49 break;
50 default:
51 break;
52 }
53 }
54}
55
56void posix_acl_fix_xattr_from_user(void *value, size_t size)
57{
58 struct user_namespace *user_ns = current_user_ns();
59 if (user_ns == &init_user_ns)
60 return;
61 posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
62}
63
64void posix_acl_fix_xattr_to_user(void *value, size_t size)
65{
66 struct user_namespace *user_ns = current_user_ns();
67 if (user_ns == &init_user_ns)
68 return;
69 posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
70}
71
72/*
73 * Convert from extended attribute to in-memory representation.
74 */
75struct posix_acl *
76posix_acl_from_xattr(struct user_namespace *user_ns,
77 const void *value, size_t size)
78{
79 posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
80 posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
81 int count;
82 struct posix_acl *acl;
83 struct posix_acl_entry *acl_e;
84
85 if (!value)
86 return NULL;
87 if (size < sizeof(posix_acl_xattr_header))
88 return ERR_PTR(-EINVAL);
89 if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
90 return ERR_PTR(-EOPNOTSUPP);
91
92 count = posix_acl_xattr_count(size);
93 if (count < 0)
94 return ERR_PTR(-EINVAL);
95 if (count == 0)
96 return NULL;
97
98 acl = posix_acl_alloc(count, GFP_NOFS);
99 if (!acl)
100 return ERR_PTR(-ENOMEM);
101 acl_e = acl->a_entries;
102
103 for (end = entry + count; entry != end; acl_e++, entry++) {
104 acl_e->e_tag = le16_to_cpu(entry->e_tag);
105 acl_e->e_perm = le16_to_cpu(entry->e_perm);
106
107 switch(acl_e->e_tag) {
108 case ACL_USER_OBJ:
109 case ACL_GROUP_OBJ:
110 case ACL_MASK:
111 case ACL_OTHER:
112 break;
113
114 case ACL_USER:
115 acl_e->e_uid =
116 make_kuid(user_ns,
117 le32_to_cpu(entry->e_id));
118 if (!uid_valid(acl_e->e_uid))
119 goto fail;
120 break;
121 case ACL_GROUP:
122 acl_e->e_gid =
123 make_kgid(user_ns,
124 le32_to_cpu(entry->e_id));
125 if (!gid_valid(acl_e->e_gid))
126 goto fail;
127 break;
128
129 default:
130 goto fail;
131 }
132 }
133 return acl;
134
135fail:
136 posix_acl_release(acl);
137 return ERR_PTR(-EINVAL);
138}
139EXPORT_SYMBOL (posix_acl_from_xattr);
140
141/*
142 * Convert from in-memory to extended attribute representation.
143 */
144int
145posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
146 void *buffer, size_t size)
147{
148 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
149 posix_acl_xattr_entry *ext_entry = ext_acl->a_entries;
150 int real_size, n;
151
152 real_size = posix_acl_xattr_size(acl->a_count);
153 if (!buffer)
154 return real_size;
155 if (real_size > size)
156 return -ERANGE;
157
158 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
159
160 for (n=0; n < acl->a_count; n++, ext_entry++) {
161 const struct posix_acl_entry *acl_e = &acl->a_entries[n];
162 ext_entry->e_tag = cpu_to_le16(acl_e->e_tag);
163 ext_entry->e_perm = cpu_to_le16(acl_e->e_perm);
164 switch(acl_e->e_tag) {
165 case ACL_USER:
166 ext_entry->e_id =
167 cpu_to_le32(from_kuid(user_ns, acl_e->e_uid));
168 break;
169 case ACL_GROUP:
170 ext_entry->e_id =
171 cpu_to_le32(from_kgid(user_ns, acl_e->e_gid));
172 break;
173 default:
174 ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
175 break;
176 }
177 }
178 return real_size;
179}
180EXPORT_SYMBOL (posix_acl_to_xattr);
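fs/xattr_acl.c is deleted outright rather than reworked; as far as I can tell this merge folds these helpers unchanged into fs/posix_acl.c alongside the new generic ACL code, so the exports survive the file's removal. For reference, the calling convention the pair gives you — posix_acl_to_xattr() sizes the buffer when handed NULL, exactly as the deleted body above implements it (value/value_len below stand in for a caller's raw xattr payload):

	struct posix_acl *acl;
	void *buf;
	int size;

	/* decode a raw xattr value into the in-memory representation */
	acl = posix_acl_from_xattr(&init_user_ns, value, value_len);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	if (!acl)
		return 0;		/* empty value: nothing to apply */

	/* encode: a NULL buffer is a size probe */
	size = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
	buf = kmalloc(size, GFP_KERNEL);
	if (buf)
		size = posix_acl_to_xattr(&init_user_ns, acl, buf, size);
	/* ... hand buf/size to the xattr layer, then ... */
	kfree(buf);
	posix_acl_release(acl);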
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 370eb3e121d1..0ecec1896f25 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -124,16 +124,12 @@ struct posix_acl *
124xfs_get_acl(struct inode *inode, int type) 124xfs_get_acl(struct inode *inode, int type)
125{ 125{
126 struct xfs_inode *ip = XFS_I(inode); 126 struct xfs_inode *ip = XFS_I(inode);
127 struct posix_acl *acl; 127 struct posix_acl *acl = NULL;
128 struct xfs_acl *xfs_acl; 128 struct xfs_acl *xfs_acl;
129 unsigned char *ea_name; 129 unsigned char *ea_name;
130 int error; 130 int error;
131 int len; 131 int len;
132 132
133 acl = get_cached_acl(inode, type);
134 if (acl != ACL_NOT_CACHED)
135 return acl;
136
137 trace_xfs_get_acl(ip); 133 trace_xfs_get_acl(ip);
138 134
139 switch (type) { 135 switch (type) {
@@ -164,10 +160,8 @@ xfs_get_acl(struct inode *inode, int type)
164 * cache entry, for any other error assume it is transient and 160 * cache entry, for any other error assume it is transient and
165 * leave the cache entry as ACL_NOT_CACHED. 161 * leave the cache entry as ACL_NOT_CACHED.
166 */ 162 */
167 if (error == -ENOATTR) { 163 if (error == -ENOATTR)
168 acl = NULL;
169 goto out_update_cache; 164 goto out_update_cache;
170 }
171 goto out; 165 goto out;
172 } 166 }
173 167
@@ -183,15 +177,12 @@ out:
183} 177}
184 178
185STATIC int 179STATIC int
186xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) 180__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
187{ 181{
188 struct xfs_inode *ip = XFS_I(inode); 182 struct xfs_inode *ip = XFS_I(inode);
189 unsigned char *ea_name; 183 unsigned char *ea_name;
190 int error; 184 int error;
191 185
192 if (S_ISLNK(inode->i_mode))
193 return -EOPNOTSUPP;
194
195 switch (type) { 186 switch (type) {
196 case ACL_TYPE_ACCESS: 187 case ACL_TYPE_ACCESS:
197 ea_name = SGI_ACL_FILE; 188 ea_name = SGI_ACL_FILE;
@@ -282,131 +273,23 @@ posix_acl_default_exists(struct inode *inode)
282 return xfs_acl_exists(inode, SGI_ACL_DEFAULT); 273 return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
283} 274}
284 275
285/*
286 * No need for i_mutex because the inode is not yet exposed to the VFS.
287 */
288int 276int
289xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) 277xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
290{ 278{
291 umode_t mode = inode->i_mode;
292 int error = 0, inherit = 0;
293
294 if (S_ISDIR(inode->i_mode)) {
295 error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
296 if (error)
297 goto out;
298 }
299
300 error = posix_acl_create(&acl, GFP_KERNEL, &mode);
301 if (error < 0)
302 return error;
303
304 /*
305 * If posix_acl_create returns a positive value we need to
306 * inherit a permission that can't be represented using the Unix
307 * mode bits and we actually need to set an ACL.
308 */
309 if (error > 0)
310 inherit = 1;
311
312 error = xfs_set_mode(inode, mode);
313 if (error)
314 goto out;
315
316 if (inherit)
317 error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl);
318
319out:
320 posix_acl_release(acl);
321 return error;
322}
323
324int
325xfs_acl_chmod(struct inode *inode)
326{
327 struct posix_acl *acl;
328 int error;
329
330 if (S_ISLNK(inode->i_mode))
331 return -EOPNOTSUPP;
332
333 acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
334 if (IS_ERR(acl) || !acl)
335 return PTR_ERR(acl);
336
337 error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
338 if (error)
339 return error;
340
341 error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl);
342 posix_acl_release(acl);
343 return error;
344}
345
346static int
347xfs_xattr_acl_get(struct dentry *dentry, const char *name,
348 void *value, size_t size, int type)
349{
350 struct posix_acl *acl;
351 int error;
352
353 acl = xfs_get_acl(dentry->d_inode, type);
354 if (IS_ERR(acl))
355 return PTR_ERR(acl);
356 if (acl == NULL)
357 return -ENODATA;
358
359 error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
360 posix_acl_release(acl);
361
362 return error;
363}
364
365static int
366xfs_xattr_acl_set(struct dentry *dentry, const char *name,
367 const void *value, size_t size, int flags, int type)
368{
369 struct inode *inode = dentry->d_inode;
370 struct posix_acl *acl = NULL;
371 int error = 0; 279 int error = 0;
372 280
373 if (flags & XATTR_CREATE) 281 if (!acl)
374 return -EINVAL;
375 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
376 return value ? -EACCES : 0;
377 if (!inode_owner_or_capable(inode))
378 return -EPERM;
379
380 if (!value)
381 goto set_acl; 282 goto set_acl;
382 283
383 acl = posix_acl_from_xattr(&init_user_ns, value, size);
384 if (!acl) {
385 /*
386 * acl_set_file(3) may request that we set default ACLs with
387 * zero length -- defend (gracefully) against that here.
388 */
389 goto out;
390 }
391 if (IS_ERR(acl)) {
392 error = PTR_ERR(acl);
393 goto out;
394 }
395
396 error = posix_acl_valid(acl);
397 if (error)
398 goto out_release;
399
400 error = -EINVAL; 284 error = -EINVAL;
401 if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) 285 if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
402 goto out_release; 286 return error;
403 287
404 if (type == ACL_TYPE_ACCESS) { 288 if (type == ACL_TYPE_ACCESS) {
405 umode_t mode = inode->i_mode; 289 umode_t mode = inode->i_mode;
406 error = posix_acl_equiv_mode(acl, &mode); 290 error = posix_acl_equiv_mode(acl, &mode);
407 291
408 if (error <= 0) { 292 if (error <= 0) {
409 posix_acl_release(acl);
410 acl = NULL; 293 acl = NULL;
411 294
412 if (error < 0) 295 if (error < 0)
@@ -415,27 +298,9 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
415 298
416 error = xfs_set_mode(inode, mode); 299 error = xfs_set_mode(inode, mode);
417 if (error) 300 if (error)
418 goto out_release; 301 return error;
419 } 302 }
420 303
421 set_acl: 304 set_acl:
422 error = xfs_set_acl(inode, type, acl); 305 return __xfs_set_acl(inode, type, acl);
423 out_release:
424 posix_acl_release(acl);
425 out:
426 return error;
427} 306}
428
429const struct xattr_handler xfs_xattr_acl_access_handler = {
430 .prefix = POSIX_ACL_XATTR_ACCESS,
431 .flags = ACL_TYPE_ACCESS,
432 .get = xfs_xattr_acl_get,
433 .set = xfs_xattr_acl_set,
434};
435
436const struct xattr_handler xfs_xattr_acl_default_handler = {
437 .prefix = POSIX_ACL_XATTR_DEFAULT,
438 .flags = ACL_TYPE_DEFAULT,
439 .get = xfs_xattr_acl_get,
440 .set = xfs_xattr_acl_set,
441};
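Net effect of the xfs_acl.c rewrite: the get_cached_acl() fast path leaves xfs_get_acl() (the VFS now consults the cache before calling in), and the setter loses its validation, permission checks, zero-length-value quirk and the two per-prefix xattr handlers — all of which moved into the 3.14 generic POSIX ACL code (a ->set_acl() inode operation plus shared posix_acl xattr handlers). What remains is purely the XFS-specific on-disk work. Illustrative sketch of how a filesystem plugs in after this series; the real XFS wiring lives in xfs_iops.c, which this diff does not show:

	static const struct inode_operations sketch_inode_ops = {
		.get_acl = xfs_get_acl,	/* called only on an ACL cache miss */
		.set_acl = xfs_set_acl,	/* acl/type already validated by the VFS */
		/* ... */
	};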
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 4016a567b83c..5dc163744511 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -60,20 +60,15 @@ struct xfs_acl {
60 60
61#ifdef CONFIG_XFS_POSIX_ACL 61#ifdef CONFIG_XFS_POSIX_ACL
62extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); 62extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
63extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); 63extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
64extern int xfs_acl_chmod(struct inode *inode);
65extern int posix_acl_access_exists(struct inode *inode); 64extern int posix_acl_access_exists(struct inode *inode);
66extern int posix_acl_default_exists(struct inode *inode); 65extern int posix_acl_default_exists(struct inode *inode);
67
68extern const struct xattr_handler xfs_xattr_acl_access_handler;
69extern const struct xattr_handler xfs_xattr_acl_default_handler;
70#else 66#else
71static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) 67static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
72{ 68{
73 return NULL; 69 return NULL;
74} 70}
75# define xfs_inherit_acl(inode, default_acl) 0 71# define xfs_set_acl NULL
76# define xfs_acl_chmod(inode) 0
77# define posix_acl_access_exists(inode) 0 72# define posix_acl_access_exists(inode) 0
78# define posix_acl_default_exists(inode) 0 73# define posix_acl_default_exists(inode) 0
79#endif /* CONFIG_XFS_POSIX_ACL */ 74#endif /* CONFIG_XFS_POSIX_ACL */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 71c8c9d2b882..db2cfb067d0b 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -407,7 +407,7 @@ xfs_alloc_ioend_bio(
407 struct bio *bio = bio_alloc(GFP_NOIO, nvecs); 407 struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
408 408
409 ASSERT(bio->bi_private == NULL); 409 ASSERT(bio->bi_private == NULL);
410 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 410 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
411 bio->bi_bdev = bh->b_bdev; 411 bio->bi_bdev = bh->b_bdev;
412 return bio; 412 return bio;
413} 413}
@@ -1217,7 +1217,7 @@ __xfs_get_blocks(
1217 lockmode = XFS_ILOCK_EXCL; 1217 lockmode = XFS_ILOCK_EXCL;
1218 xfs_ilock(ip, lockmode); 1218 xfs_ilock(ip, lockmode);
1219 } else { 1219 } else {
1220 lockmode = xfs_ilock_map_shared(ip); 1220 lockmode = xfs_ilock_data_map_shared(ip);
1221 } 1221 }
1222 1222
1223 ASSERT(offset <= mp->m_super->s_maxbytes); 1223 ASSERT(offset <= mp->m_super->s_maxbytes);
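The bio->bi_sector to bio->bi_iter.bi_sector change here (and bi_size/bi_sector likewise in xfs_buf.c below) is mechanical fallout from 3.14's immutable-biovec work: the completion cursor — sector, remaining byte count, vector index — was gathered into an embedded struct bvec_iter so the bio itself can stay immutable while the block layer walks it. For a submitter only the spelling changes; a sketch with hypothetical names for everything not shown in the hunk:

	struct bio *bio = bio_alloc(GFP_NOIO, nr_vecs);

	bio->bi_bdev = bdev;
	/* the cursor now lives in the embedded iterator ... */
	bio->bi_iter.bi_sector = blkno * (blocksize >> 9);	/* 512-byte units */
	bio->bi_end_io = sketch_end_io;
	bio->bi_private = ctx;
	/* ... and bio->bi_iter.bi_size grows as pages are added */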
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b86127072ac3..01b6a0102fbd 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -164,6 +164,7 @@ xfs_attr_get(
164{ 164{
165 int error; 165 int error;
166 struct xfs_name xname; 166 struct xfs_name xname;
167 uint lock_mode;
167 168
168 XFS_STATS_INC(xs_attr_get); 169 XFS_STATS_INC(xs_attr_get);
169 170
@@ -174,9 +175,9 @@ xfs_attr_get(
174 if (error) 175 if (error)
175 return error; 176 return error;
176 177
177 xfs_ilock(ip, XFS_ILOCK_SHARED); 178 lock_mode = xfs_ilock_attr_map_shared(ip);
178 error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags); 179 error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags);
179 xfs_iunlock(ip, XFS_ILOCK_SHARED); 180 xfs_iunlock(ip, lock_mode);
180 return(error); 181 return(error);
181} 182}
182 183
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 2d174b128153..01db96f60cf0 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -507,17 +507,17 @@ xfs_attr_list_int(
507{ 507{
508 int error; 508 int error;
509 xfs_inode_t *dp = context->dp; 509 xfs_inode_t *dp = context->dp;
510 uint lock_mode;
510 511
511 XFS_STATS_INC(xs_attr_list); 512 XFS_STATS_INC(xs_attr_list);
512 513
513 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 514 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
514 return EIO; 515 return EIO;
515 516
516 xfs_ilock(dp, XFS_ILOCK_SHARED);
517
518 /* 517 /*
519 * Decide on what work routines to call based on the inode size. 518 * Decide on what work routines to call based on the inode size.
520 */ 519 */
520 lock_mode = xfs_ilock_attr_map_shared(dp);
521 if (!xfs_inode_hasattr(dp)) { 521 if (!xfs_inode_hasattr(dp)) {
522 error = 0; 522 error = 0;
523 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { 523 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
@@ -527,9 +527,7 @@ xfs_attr_list_int(
527 } else { 527 } else {
528 error = xfs_attr_node_list(context); 528 error = xfs_attr_node_list(context);
529 } 529 }
530 530 xfs_iunlock(dp, lock_mode);
531 xfs_iunlock(dp, XFS_ILOCK_SHARED);
532
533 return error; 531 return error;
534} 532}
535 533
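Both attr-side callers stop hard-coding XFS_ILOCK_SHARED in favour of xfs_ilock_attr_map_shared(); its data-fork twin, xfs_ilock_data_map_shared(), does the same job in the readdir, quota, bmap-util and file.c hunks below. My reading of the helpers: they take the ILOCK exclusively when the relevant in-core extent map still has to be read in (reading it modifies the fork) and shared otherwise, returning whichever mode was taken — which is why every conversion also threads a lock_mode variable through to xfs_iunlock():

	uint lock_mode;

	lock_mode = xfs_ilock_attr_map_shared(ip);	/* SHARED, or EXCL on first use */
	error = do_the_attr_work(ip);			/* hypothetical */
	xfs_iunlock(ip, lock_mode);			/* must echo the returned mode */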
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 739e0a52deda..5549d69ddb45 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -110,7 +110,7 @@ xfs_attr3_rmt_verify(
110 if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) 110 if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
111 return false; 111 return false;
112 if (be32_to_cpu(rmt->rm_offset) + 112 if (be32_to_cpu(rmt->rm_offset) +
113 be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) 113 be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
114 return false; 114 return false;
115 if (rmt->rm_owner == 0) 115 if (rmt->rm_owner == 0)
116 return false; 116 return false;
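The xfs_attr3_rmt_verify() change is a one-character off-by-one fix: a remote attribute that ends exactly at XATTR_SIZE_MAX is legal, and the old >= test rejected it. Worked through with the real limit (XATTR_SIZE_MAX is 65536 in the uapi limits header), as a standalone check:

	#include <assert.h>

	#define XATTR_SIZE_MAX 65536	/* from include/uapi/linux/limits.h */

	int main(void)
	{
		unsigned int off = 65024, bytes = 512;	/* ends exactly at the cap */

		assert(off + bytes >= XATTR_SIZE_MAX);	 /* old test fired: bogus reject */
		assert(!(off + bytes > XATTR_SIZE_MAX)); /* new test passes: accepted */
		return 0;
	}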
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3b2c14b6f0fb..152543c4ca70 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4013,6 +4013,7 @@ xfs_bmapi_read(
4013 ASSERT(*nmap >= 1); 4013 ASSERT(*nmap >= 1);
4014 ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| 4014 ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
4015 XFS_BMAPI_IGSTATE))); 4015 XFS_BMAPI_IGSTATE)));
4016 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
4016 4017
4017 if (unlikely(XFS_TEST_ERROR( 4018 if (unlikely(XFS_TEST_ERROR(
4018 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 4019 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -4207,6 +4208,7 @@ xfs_bmapi_delay(
4207 ASSERT(*nmap >= 1); 4208 ASSERT(*nmap >= 1);
4208 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); 4209 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
4209 ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); 4210 ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
4211 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
4210 4212
4211 if (unlikely(XFS_TEST_ERROR( 4213 if (unlikely(XFS_TEST_ERROR(
4212 (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && 4214 (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
@@ -4500,6 +4502,7 @@ xfs_bmapi_write(
4500 ASSERT(tp != NULL); 4502 ASSERT(tp != NULL);
4501 ASSERT(len > 0); 4503 ASSERT(len > 0);
4502 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); 4504 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
4505 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
4503 4506
4504 if (unlikely(XFS_TEST_ERROR( 4507 if (unlikely(XFS_TEST_ERROR(
4505 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 4508 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5051,6 +5054,7 @@ xfs_bunmapi(
5051 if (XFS_FORCED_SHUTDOWN(mp)) 5054 if (XFS_FORCED_SHUTDOWN(mp))
5052 return XFS_ERROR(EIO); 5055 return XFS_ERROR(EIO);
5053 5056
5057 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
5054 ASSERT(len > 0); 5058 ASSERT(len > 0);
5055 ASSERT(nexts >= 0); 5059 ASSERT(nexts >= 0);
5056 5060
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 1394106ed22d..f264616080ca 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -287,6 +287,7 @@ xfs_bmapi_allocate(
287 INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); 287 INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
288 queue_work(xfs_alloc_wq, &args->work); 288 queue_work(xfs_alloc_wq, &args->work);
289 wait_for_completion(&done); 289 wait_for_completion(&done);
290 destroy_work_on_stack(&args->work);
290 return args->result; 291 return args->result;
291} 292}
292 293
@@ -617,22 +618,27 @@ xfs_getbmap(
617 return XFS_ERROR(ENOMEM); 618 return XFS_ERROR(ENOMEM);
618 619
619 xfs_ilock(ip, XFS_IOLOCK_SHARED); 620 xfs_ilock(ip, XFS_IOLOCK_SHARED);
620 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { 621 if (whichfork == XFS_DATA_FORK) {
621 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { 622 if (!(iflags & BMV_IF_DELALLOC) &&
623 (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
622 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); 624 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
623 if (error) 625 if (error)
624 goto out_unlock_iolock; 626 goto out_unlock_iolock;
627
628 /*
629 * Even after flushing the inode, there can still be
630 * delalloc blocks on the inode beyond EOF due to
631 * speculative preallocation. These are not removed
632 * until the release function is called or the inode
633 * is inactivated. Hence we cannot assert here that
634 * ip->i_delayed_blks == 0.
635 */
625 } 636 }
626 /*
627 * even after flushing the inode, there can still be delalloc
628 * blocks on the inode beyond EOF due to speculative
629 * preallocation. These are not removed until the release
630 * function is called or the inode is inactivated. Hence we
631 * cannot assert here that ip->i_delayed_blks == 0.
632 */
633 }
634 637
635 lock = xfs_ilock_map_shared(ip); 638 lock = xfs_ilock_data_map_shared(ip);
639 } else {
640 lock = xfs_ilock_attr_map_shared(ip);
641 }
636 642
637 /* 643 /*
638 * Don't let nex be bigger than the number of extents 644 * Don't let nex be bigger than the number of extents
@@ -737,7 +743,7 @@ xfs_getbmap(
737 out_free_map: 743 out_free_map:
738 kmem_free(map); 744 kmem_free(map);
739 out_unlock_ilock: 745 out_unlock_ilock:
740 xfs_iunlock_map_shared(ip, lock); 746 xfs_iunlock(ip, lock);
741 out_unlock_iolock: 747 out_unlock_iolock:
742 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 748 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
743 749
@@ -1168,9 +1174,15 @@ xfs_zero_remaining_bytes(
1168 xfs_buf_unlock(bp); 1174 xfs_buf_unlock(bp);
1169 1175
1170 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { 1176 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1177 uint lock_mode;
1178
1171 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1179 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1172 nimap = 1; 1180 nimap = 1;
1181
1182 lock_mode = xfs_ilock_data_map_shared(ip);
1173 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); 1183 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
1184 xfs_iunlock(ip, lock_mode);
1185
1174 if (error || nimap < 1) 1186 if (error || nimap < 1)
1175 break; 1187 break;
1176 ASSERT(imap.br_blockcount >= 1); 1188 ASSERT(imap.br_blockcount >= 1);
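The destroy_work_on_stack() one-liner in xfs_bmapi_allocate() pairs with the INIT_WORK_ONSTACK() just above it: under CONFIG_DEBUG_OBJECTS_WORK the init registers the on-stack work item with debugobjects, and it has to be deregistered before the frame is torn down or the tracker is left pointing at dead stack memory. The whole pattern, with a hypothetical argument block standing in for the hunk's args structure:

	#include <linux/completion.h>
	#include <linux/workqueue.h>

	struct sketch_args {
		struct work_struct work;
		struct completion done;
		int result;
	};

	static void sketch_worker(struct work_struct *work)
	{
		struct sketch_args *args =
			container_of(work, struct sketch_args, work);

		args->result = 0;	/* ... the actual work ... */
		complete(&args->done);
	}

	static int run_on_wq(struct workqueue_struct *wq, struct sketch_args *args)
	{
		init_completion(&args->done);
		INIT_WORK_ONSTACK(&args->work, sketch_worker);
		queue_work(wq, &args->work);
		wait_for_completion(&args->done);
		destroy_work_on_stack(&args->work);	/* pairs with INIT_WORK_ONSTACK */
		return args->result;
	}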
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index afe7645e4b2b..9c061ef2b0d9 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -445,8 +445,8 @@ _xfs_buf_find(
445 numbytes = BBTOB(numblks); 445 numbytes = BBTOB(numblks);
446 446
447 /* Check for IOs smaller than the sector size / not sector aligned */ 447 /* Check for IOs smaller than the sector size / not sector aligned */
448 ASSERT(!(numbytes < (1 << btp->bt_sshift))); 448 ASSERT(!(numbytes < btp->bt_meta_sectorsize));
449 ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); 449 ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
450 450
451 /* 451 /*
452 * Corrupted block numbers can get through to here, unfortunately, so we 452 * Corrupted block numbers can get through to here, unfortunately, so we
@@ -1240,7 +1240,7 @@ next_chunk:
1240 1240
1241 bio = bio_alloc(GFP_NOIO, nr_pages); 1241 bio = bio_alloc(GFP_NOIO, nr_pages);
1242 bio->bi_bdev = bp->b_target->bt_bdev; 1242 bio->bi_bdev = bp->b_target->bt_bdev;
1243 bio->bi_sector = sector; 1243 bio->bi_iter.bi_sector = sector;
1244 bio->bi_end_io = xfs_buf_bio_end_io; 1244 bio->bi_end_io = xfs_buf_bio_end_io;
1245 bio->bi_private = bp; 1245 bio->bi_private = bp;
1246 1246
@@ -1262,7 +1262,7 @@ next_chunk:
1262 total_nr_pages--; 1262 total_nr_pages--;
1263 } 1263 }
1264 1264
1265 if (likely(bio->bi_size)) { 1265 if (likely(bio->bi_iter.bi_size)) {
1266 if (xfs_buf_is_vmapped(bp)) { 1266 if (xfs_buf_is_vmapped(bp)) {
1267 flush_kernel_vmap_range(bp->b_addr, 1267 flush_kernel_vmap_range(bp->b_addr,
1268 xfs_buf_vmap_len(bp)); 1268 xfs_buf_vmap_len(bp));
@@ -1593,16 +1593,15 @@ xfs_free_buftarg(
1593 kmem_free(btp); 1593 kmem_free(btp);
1594} 1594}
1595 1595
1596STATIC int 1596int
1597xfs_setsize_buftarg_flags( 1597xfs_setsize_buftarg(
1598 xfs_buftarg_t *btp, 1598 xfs_buftarg_t *btp,
1599 unsigned int blocksize, 1599 unsigned int blocksize,
1600 unsigned int sectorsize, 1600 unsigned int sectorsize)
1601 int verbose)
1602{ 1601{
1603 btp->bt_bsize = blocksize; 1602 /* Set up metadata sector size info */
1604 btp->bt_sshift = ffs(sectorsize) - 1; 1603 btp->bt_meta_sectorsize = sectorsize;
1605 btp->bt_smask = sectorsize - 1; 1604 btp->bt_meta_sectormask = sectorsize - 1;
1606 1605
1607 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1606 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1608 char name[BDEVNAME_SIZE]; 1607 char name[BDEVNAME_SIZE];
@@ -1615,30 +1614,25 @@ xfs_setsize_buftarg_flags(
1615 return EINVAL; 1614 return EINVAL;
1616 } 1615 }
1617 1616
1617 /* Set up device logical sector size mask */
1618 btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
1619 btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
1620
1618 return 0; 1621 return 0;
1619} 1622}
1620 1623
1621/* 1624/*
1622 * When allocating the initial buffer target we have not yet 1625 * When allocating the initial buffer target we have not yet
1623 * read in the superblock, so don't know what sized sectors 1626 * read in the superblock, so don't know what sized sectors
1624 * are being used at this early stage. Play safe. 1627 * are being used at this early stage. Play safe.
1625 */ 1628 */
1626STATIC int 1629STATIC int
1627xfs_setsize_buftarg_early( 1630xfs_setsize_buftarg_early(
1628 xfs_buftarg_t *btp, 1631 xfs_buftarg_t *btp,
1629 struct block_device *bdev) 1632 struct block_device *bdev)
1630{ 1633{
1631 return xfs_setsize_buftarg_flags(btp, 1634 return xfs_setsize_buftarg(btp, PAGE_SIZE,
1632 PAGE_SIZE, bdev_logical_block_size(bdev), 0); 1635 bdev_logical_block_size(bdev));
1633}
1634
1635int
1636xfs_setsize_buftarg(
1637 xfs_buftarg_t *btp,
1638 unsigned int blocksize,
1639 unsigned int sectorsize)
1640{
1641 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1642} 1636}
1643 1637
1644xfs_buftarg_t * 1638xfs_buftarg_t *
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 1cf21a4a9f22..995339534db6 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -88,14 +88,28 @@ typedef unsigned int xfs_buf_flags_t;
88 */ 88 */
89#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ 89#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
90 90
91/*
92 * The xfs_buftarg contains 2 notions of "sector size" -
93 *
94 * 1) The metadata sector size, which is the minimum unit and
95 * alignment of IO which will be performed by metadata operations.
96 * 2) The device logical sector size
97 *
98 * The first is specified at mkfs time, and is stored on-disk in the
99 * superblock's sb_sectsize.
100 *
101 * The latter is derived from the underlying device, and controls direct IO
102 * alignment constraints.
103 */
91typedef struct xfs_buftarg { 104typedef struct xfs_buftarg {
92 dev_t bt_dev; 105 dev_t bt_dev;
93 struct block_device *bt_bdev; 106 struct block_device *bt_bdev;
94 struct backing_dev_info *bt_bdi; 107 struct backing_dev_info *bt_bdi;
95 struct xfs_mount *bt_mount; 108 struct xfs_mount *bt_mount;
96 unsigned int bt_bsize; 109 unsigned int bt_meta_sectorsize;
97 unsigned int bt_sshift; 110 size_t bt_meta_sectormask;
98 size_t bt_smask; 111 size_t bt_logical_sectorsize;
112 size_t bt_logical_sectormask;
99 113
100 /* LRU control structures */ 114 /* LRU control structures */
101 struct shrinker bt_shrinker; 115 struct shrinker bt_shrinker;
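The new comment block pins down the split: bt_meta_sectorsize is the mkfs-time sb_sectsize and bounds metadata IO, while bt_logical_sectorsize comes from bdev_logical_block_size() and bounds direct-IO alignment (used by the xfs_file.c hunks below). Both are powers of two, so each mask is simply size - 1 and an alignment test collapses to a single AND. A user-space illustration of the arithmetic, including the OR trick the file.c hunks use to test offset and length together:

	#include <stdio.h>

	#define SECTOR_SIZE	4096ULL			/* power of two */
	#define SECTOR_MASK	(SECTOR_SIZE - 1)	/* 0xfff */

	/* same shape as "(pos | count) & bt_logical_sectormask" */
	static int sector_aligned(unsigned long long pos, unsigned long long len)
	{
		return ((pos | len) & SECTOR_MASK) == 0;
	}

	int main(void)
	{
		printf("%d\n", sector_aligned(8192, 4096));	/* 1 */
		printf("%d\n", sector_aligned(8192, 512));	/* 0 */
		return 0;
	}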
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2227b9b050bb..33149113e333 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -182,21 +182,47 @@ xfs_buf_item_size(
182 trace_xfs_buf_item_size(bip); 182 trace_xfs_buf_item_size(bip);
183} 183}
184 184
185static struct xfs_log_iovec * 185static inline void
186xfs_buf_item_copy_iovec(
187 struct xfs_log_vec *lv,
188 struct xfs_log_iovec **vecp,
189 struct xfs_buf *bp,
190 uint offset,
191 int first_bit,
192 uint nbits)
193{
194 offset += first_bit * XFS_BLF_CHUNK;
195 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
196 xfs_buf_offset(bp, offset),
197 nbits * XFS_BLF_CHUNK);
198}
199
200static inline bool
201xfs_buf_item_straddle(
202 struct xfs_buf *bp,
203 uint offset,
204 int next_bit,
205 int last_bit)
206{
207 return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) !=
208 (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) +
209 XFS_BLF_CHUNK);
210}
211
212static void
186xfs_buf_item_format_segment( 213xfs_buf_item_format_segment(
187 struct xfs_buf_log_item *bip, 214 struct xfs_buf_log_item *bip,
188 struct xfs_log_iovec *vecp, 215 struct xfs_log_vec *lv,
216 struct xfs_log_iovec **vecp,
189 uint offset, 217 uint offset,
190 struct xfs_buf_log_format *blfp) 218 struct xfs_buf_log_format *blfp)
191{ 219{
192 struct xfs_buf *bp = bip->bli_buf; 220 struct xfs_buf *bp = bip->bli_buf;
193 uint base_size; 221 uint base_size;
194 uint nvecs;
195 int first_bit; 222 int first_bit;
196 int last_bit; 223 int last_bit;
197 int next_bit; 224 int next_bit;
198 uint nbits; 225 uint nbits;
199 uint buffer_offset;
200 226
201 /* copy the flags across from the base format item */ 227 /* copy the flags across from the base format item */
202 blfp->blf_flags = bip->__bli_format.blf_flags; 228 blfp->blf_flags = bip->__bli_format.blf_flags;
@@ -208,21 +234,17 @@ xfs_buf_item_format_segment(
208 */ 234 */
209 base_size = xfs_buf_log_format_size(blfp); 235 base_size = xfs_buf_log_format_size(blfp);
210 236
211 nvecs = 0;
212 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); 237 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
213 if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { 238 if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
214 /* 239 /*
215 * If the map is not be dirty in the transaction, mark 240 * If the map is not be dirty in the transaction, mark
216 * the size as zero and do not advance the vector pointer. 241 * the size as zero and do not advance the vector pointer.
217 */ 242 */
218 goto out; 243 return;
219 } 244 }
220 245
221 vecp->i_addr = blfp; 246 blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
222 vecp->i_len = base_size; 247 blfp->blf_size = 1;
223 vecp->i_type = XLOG_REG_TYPE_BFORMAT;
224 vecp++;
225 nvecs = 1;
226 248
227 if (bip->bli_flags & XFS_BLI_STALE) { 249 if (bip->bli_flags & XFS_BLI_STALE) {
228 /* 250 /*
@@ -232,14 +254,13 @@ xfs_buf_item_format_segment(
232 */ 254 */
233 trace_xfs_buf_item_format_stale(bip); 255 trace_xfs_buf_item_format_stale(bip);
234 ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); 256 ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
235 goto out; 257 return;
236 } 258 }
237 259
238 260
239 /* 261 /*
240 * Fill in an iovec for each set of contiguous chunks. 262 * Fill in an iovec for each set of contiguous chunks.
241 */ 263 */
242
243 last_bit = first_bit; 264 last_bit = first_bit;
244 nbits = 1; 265 nbits = 1;
245 for (;;) { 266 for (;;) {
@@ -252,42 +273,22 @@ xfs_buf_item_format_segment(
252 next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 273 next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
253 (uint)last_bit + 1); 274 (uint)last_bit + 1);
254 /* 275 /*
255 * If we run out of bits fill in the last iovec and get 276 * If we run out of bits fill in the last iovec and get out of
256 * out of the loop. 277 * the loop. Else if we start a new set of bits then fill in
257 * Else if we start a new set of bits then fill in the 278 * the iovec for the series we were looking at and start
258 * iovec for the series we were looking at and start 279 * counting the bits in the new one. Else we're still in the
259 * counting the bits in the new one. 280 * same set of bits so just keep counting and scanning.
260 * Else we're still in the same set of bits so just
261 * keep counting and scanning.
262 */ 281 */
263 if (next_bit == -1) { 282 if (next_bit == -1) {
264 buffer_offset = offset + first_bit * XFS_BLF_CHUNK; 283 xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
265 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 284 first_bit, nbits);
266 vecp->i_len = nbits * XFS_BLF_CHUNK; 285 blfp->blf_size++;
267 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
268 nvecs++;
269 break; 286 break;
270 } else if (next_bit != last_bit + 1) { 287 } else if (next_bit != last_bit + 1 ||
271 buffer_offset = offset + first_bit * XFS_BLF_CHUNK; 288 xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
272 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 289 xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
273 vecp->i_len = nbits * XFS_BLF_CHUNK; 290 first_bit, nbits);
274 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 291 blfp->blf_size++;
275 nvecs++;
276 vecp++;
277 first_bit = next_bit;
278 last_bit = next_bit;
279 nbits = 1;
280 } else if (xfs_buf_offset(bp, offset +
281 (next_bit << XFS_BLF_SHIFT)) !=
282 (xfs_buf_offset(bp, offset +
283 (last_bit << XFS_BLF_SHIFT)) +
284 XFS_BLF_CHUNK)) {
285 buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
286 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
287 vecp->i_len = nbits * XFS_BLF_CHUNK;
288 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
289 nvecs++;
290 vecp++;
291 first_bit = next_bit; 292 first_bit = next_bit;
292 last_bit = next_bit; 293 last_bit = next_bit;
293 nbits = 1; 294 nbits = 1;
@@ -296,9 +297,6 @@ xfs_buf_item_format_segment(
296 nbits++; 297 nbits++;
297 } 298 }
298 } 299 }
299out:
300 blfp->blf_size = nvecs;
301 return vecp;
302} 300}
303 301
304/* 302/*
@@ -310,10 +308,11 @@ out:
310STATIC void 308STATIC void
311xfs_buf_item_format( 309xfs_buf_item_format(
312 struct xfs_log_item *lip, 310 struct xfs_log_item *lip,
313 struct xfs_log_iovec *vecp) 311 struct xfs_log_vec *lv)
314{ 312{
315 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 313 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
316 struct xfs_buf *bp = bip->bli_buf; 314 struct xfs_buf *bp = bip->bli_buf;
315 struct xfs_log_iovec *vecp = NULL;
317 uint offset = 0; 316 uint offset = 0;
318 int i; 317 int i;
319 318
@@ -354,8 +353,8 @@ xfs_buf_item_format(
354 } 353 }
355 354
356 for (i = 0; i < bip->bli_format_count; i++) { 355 for (i = 0; i < bip->bli_format_count; i++) {
357 vecp = xfs_buf_item_format_segment(bip, vecp, offset, 356 xfs_buf_item_format_segment(bip, lv, &vecp, offset,
358 &bip->bli_formats[i]); 357 &bip->bli_formats[i]);
359 offset += bp->b_maps[i].bm_len; 358 offset += bp->b_maps[i].bm_len;
360 } 359 }
361 360
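The xfs_buf_item_format_segment() rewrite is the same log-vector API conversion as the dquot and EFI/EFD items below, plus two extracted helpers that collapse the old loop's three near-identical branches. One subtlety worth noting: xlog_copy_iovec() copies the region into the log vector's own buffer and returns the address of that copy, which is why blfp is reassigned before blf_size is touched — the count must be bumped in the logged copy, not the source. Reduced to its shape, the new loop body reads:

	if (next_bit == -1) {
		/* out of dirty bits: emit the final run */
		xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits);
		blfp->blf_size++;
		break;
	} else if (next_bit != last_bit + 1 ||
		   xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
		/* run broken by a gap in the bitmap, or by discontiguous
		 * backing pages: close this run, start the next */
		xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits);
		blfp->blf_size++;
		first_bit = last_bit = next_bit;
		nbits = 1;
	} else {
		last_bit = next_bit;
		nbits++;
	}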
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index c4e50c6ed584..aead369e1c30 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -674,6 +674,7 @@ xfs_readdir(
674{ 674{
675 int rval; /* return value */ 675 int rval; /* return value */
676 int v; /* type-checking value */ 676 int v; /* type-checking value */
677 uint lock_mode;
677 678
678 trace_xfs_readdir(dp); 679 trace_xfs_readdir(dp);
679 680
@@ -683,6 +684,7 @@ xfs_readdir(
683 ASSERT(S_ISDIR(dp->i_d.di_mode)); 684 ASSERT(S_ISDIR(dp->i_d.di_mode));
684 XFS_STATS_INC(xs_dir_getdents); 685 XFS_STATS_INC(xs_dir_getdents);
685 686
687 lock_mode = xfs_ilock_data_map_shared(dp);
686 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 688 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
687 rval = xfs_dir2_sf_getdents(dp, ctx); 689 rval = xfs_dir2_sf_getdents(dp, ctx);
688 else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) 690 else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
@@ -691,5 +693,7 @@ xfs_readdir(
691 rval = xfs_dir2_block_getdents(dp, ctx); 693 rval = xfs_dir2_block_getdents(dp, ctx);
692 else 694 else
693 rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize); 695 rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
696 xfs_iunlock(dp, lock_mode);
697
694 return rval; 698 return rval;
695} 699}
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index aafc6e46cb58..3725fb1b902b 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -170,6 +170,7 @@ xfs_dir2_block_to_sf(
170 char *ptr; /* current data pointer */ 170 char *ptr; /* current data pointer */
171 xfs_dir2_sf_entry_t *sfep; /* shortform entry */ 171 xfs_dir2_sf_entry_t *sfep; /* shortform entry */
172 xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ 172 xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */
173 xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */
173 174
174 trace_xfs_dir2_block_to_sf(args); 175 trace_xfs_dir2_block_to_sf(args);
175 176
@@ -177,35 +178,20 @@ xfs_dir2_block_to_sf(
177 mp = dp->i_mount; 178 mp = dp->i_mount;
178 179
179 /* 180 /*
180 * Make a copy of the block data, so we can shrink the inode 181 * allocate a temporary destination buffer the size of the inode
181 * and add local data. 182 * to format the data into. Once we have formatted the data, we
183 * can free the block and copy the formatted data into the inode literal
184 * area.
182 */ 185 */
183 hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP); 186 dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
184 memcpy(hdr, bp->b_addr, mp->m_dirblksize); 187 hdr = bp->b_addr;
185 logflags = XFS_ILOG_CORE;
186 if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
187 ASSERT(error != ENOSPC);
188 goto out;
189 }
190 188
191 /* 189 /*
192 * The buffer is now unconditionally gone, whether
193 * xfs_dir2_shrink_inode worked or not.
194 *
195 * Convert the inode to local format.
196 */
197 dp->i_df.if_flags &= ~XFS_IFEXTENTS;
198 dp->i_df.if_flags |= XFS_IFINLINE;
199 dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
200 ASSERT(dp->i_df.if_bytes == 0);
201 xfs_idata_realloc(dp, size, XFS_DATA_FORK);
202 logflags |= XFS_ILOG_DDATA;
203 /*
204 * Copy the header into the newly allocate local space. 190 * Copy the header into the newly allocate local space.
205 */ 191 */
206 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 192 sfp = (xfs_dir2_sf_hdr_t *)dst;
207 memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); 193 memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
208 dp->i_d.di_size = size; 194
209 /* 195 /*
210 * Set up to loop over the block's entries. 196 * Set up to loop over the block's entries.
211 */ 197 */
@@ -258,10 +244,34 @@ xfs_dir2_block_to_sf(
258 ptr += dp->d_ops->data_entsize(dep->namelen); 244 ptr += dp->d_ops->data_entsize(dep->namelen);
259 } 245 }
260 ASSERT((char *)sfep - (char *)sfp == size); 246 ASSERT((char *)sfep - (char *)sfp == size);
247
248 /* now we are done with the block, we can shrink the inode */
249 logflags = XFS_ILOG_CORE;
250 error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp);
251 if (error) {
252 ASSERT(error != ENOSPC);
253 goto out;
254 }
255
256 /*
257 * The buffer is now unconditionally gone, whether
258 * xfs_dir2_shrink_inode worked or not.
259 *
260 * Convert the inode to local format and copy the data in.
261 */
262 dp->i_df.if_flags &= ~XFS_IFEXTENTS;
263 dp->i_df.if_flags |= XFS_IFINLINE;
264 dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
265 ASSERT(dp->i_df.if_bytes == 0);
266 xfs_idata_realloc(dp, size, XFS_DATA_FORK);
267
268 logflags |= XFS_ILOG_DDATA;
269 memcpy(dp->i_df.if_u1.if_data, dst, size);
270 dp->i_d.di_size = size;
261 xfs_dir2_sf_check(args); 271 xfs_dir2_sf_check(args);
262out: 272out:
263 xfs_trans_log_inode(args->trans, dp, logflags); 273 xfs_trans_log_inode(args->trans, dp, logflags);
264 kmem_free(hdr); 274 kmem_free(dst);
265 return error; 275 return error;
266} 276}
267 277
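xfs_dir2_block_to_sf() keeps the same steps but reorders them: the old code freed the directory block first (working from a block-sized copy) and built the shortform entries directly into the inode literal area; the new code formats the complete shortform image into a temporary buffer first, and only then shrinks the inode, flips the fork to local format, and copies the finished image in. The temporary allocation also drops from m_dirblksize to sb_inodesize, which any shortform directory must fit inside. The shape, per the hunk:

	dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
	/* 1. format the shortform image into dst */
	/* 2. xfs_dir2_shrink_inode(): the block is now gone either way */
	/* 3. flip the fork to XFS_DINODE_FMT_LOCAL */
	memcpy(dp->i_df.if_u1.if_data, dst, size);	/* 4. install the image */
	kmem_free(dst);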
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 6b1e695caf0e..7aeb4c895b32 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -469,16 +469,17 @@ xfs_qm_dqtobp(
469 struct xfs_mount *mp = dqp->q_mount; 469 struct xfs_mount *mp = dqp->q_mount;
470 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); 470 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
471 struct xfs_trans *tp = (tpp ? *tpp : NULL); 471 struct xfs_trans *tp = (tpp ? *tpp : NULL);
472 uint lock_mode;
472 473
473 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; 474 dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
474 475
475 xfs_ilock(quotip, XFS_ILOCK_SHARED); 476 lock_mode = xfs_ilock_data_map_shared(quotip);
476 if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { 477 if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
477 /* 478 /*
478 * Return if this type of quotas is turned off while we 479 * Return if this type of quotas is turned off while we
479 * didn't have the quota inode lock. 480 * didn't have the quota inode lock.
480 */ 481 */
481 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 482 xfs_iunlock(quotip, lock_mode);
482 return ESRCH; 483 return ESRCH;
483 } 484 }
484 485
@@ -488,7 +489,7 @@ xfs_qm_dqtobp(
488 error = xfs_bmapi_read(quotip, dqp->q_fileoffset, 489 error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
489 XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); 490 XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
490 491
491 xfs_iunlock(quotip, XFS_ILOCK_SHARED); 492 xfs_iunlock(quotip, lock_mode);
492 if (error) 493 if (error)
493 return error; 494 return error;
494 495
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 92e5f62eefc6..f33fbaaa4d8a 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -57,20 +57,24 @@ xfs_qm_dquot_logitem_size(
57STATIC void 57STATIC void
58xfs_qm_dquot_logitem_format( 58xfs_qm_dquot_logitem_format(
59 struct xfs_log_item *lip, 59 struct xfs_log_item *lip,
60 struct xfs_log_iovec *logvec) 60 struct xfs_log_vec *lv)
61{ 61{
62 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); 62 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
63 63 struct xfs_log_iovec *vecp = NULL;
64 logvec->i_addr = &qlip->qli_format; 64 struct xfs_dq_logformat *qlf;
65 logvec->i_len = sizeof(xfs_dq_logformat_t); 65
66 logvec->i_type = XLOG_REG_TYPE_QFORMAT; 66 qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
67 logvec++; 67 qlf->qlf_type = XFS_LI_DQUOT;
68 logvec->i_addr = &qlip->qli_dquot->q_core; 68 qlf->qlf_size = 2;
69 logvec->i_len = sizeof(xfs_disk_dquot_t); 69 qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id);
70 logvec->i_type = XLOG_REG_TYPE_DQUOT; 70 qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
71 71 qlf->qlf_len = 1;
72 qlip->qli_format.qlf_size = 2; 72 qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
73 73 xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));
74
75 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT,
76 &qlip->qli_dquot->q_core,
77 sizeof(struct xfs_disk_dquot));
74} 78}
75 79
76/* 80/*
@@ -257,18 +261,6 @@ xfs_qm_dquot_logitem_init(
257 xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, 261 xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
258 &xfs_dquot_item_ops); 262 &xfs_dquot_item_ops);
259 lp->qli_dquot = dqp; 263 lp->qli_dquot = dqp;
260 lp->qli_format.qlf_type = XFS_LI_DQUOT;
261 lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
262 lp->qli_format.qlf_blkno = dqp->q_blkno;
263 lp->qli_format.qlf_len = 1;
264 /*
265 * This is just the offset of this dquot within its buffer
266 * (which is currently 1 FSB and probably won't change).
267 * Hence 32 bits for this offset should be just fine.
268 * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
269 * here, and recompute it at recovery time.
270 */
271 lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
272} 264}
273 265
274/*------------------ QUOTAOFF LOG ITEMS -------------------*/ 266/*------------------ QUOTAOFF LOG ITEMS -------------------*/
@@ -294,26 +286,20 @@ xfs_qm_qoff_logitem_size(
294 *nbytes += sizeof(struct xfs_qoff_logitem); 286 *nbytes += sizeof(struct xfs_qoff_logitem);
295} 287}
296 288
297/*
298 * This is called to fill in the vector of log iovecs for the
299 * given quotaoff log item. We use only 1 iovec, and we point that
300 * at the quotaoff_log_format structure embedded in the quotaoff item.
301 * It is at this point that we assert that all of the extent
302 * slots in the quotaoff item have been filled.
303 */
304STATIC void 289STATIC void
305xfs_qm_qoff_logitem_format( 290xfs_qm_qoff_logitem_format(
306 struct xfs_log_item *lip, 291 struct xfs_log_item *lip,
307 struct xfs_log_iovec *log_vector) 292 struct xfs_log_vec *lv)
308{ 293{
309 struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); 294 struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
310 295 struct xfs_log_iovec *vecp = NULL;
311 ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); 296 struct xfs_qoff_logformat *qlf;
312 297
313 log_vector->i_addr = &qflip->qql_format; 298 qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF);
314 log_vector->i_len = sizeof(xfs_qoff_logitem_t); 299 qlf->qf_type = XFS_LI_QUOTAOFF;
315 log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; 300 qlf->qf_size = 1;
316 qflip->qql_format.qf_size = 1; 301 qlf->qf_flags = qflip->qql_flags;
302 xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem));
317} 303}
318 304
319/* 305/*
@@ -453,8 +439,7 @@ xfs_qm_qoff_logitem_init(
453 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? 439 xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
454 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); 440 &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
455 qf->qql_item.li_mountp = mp; 441 qf->qql_item.li_mountp = mp;
456 qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
457 qf->qql_format.qf_flags = flags;
458 qf->qql_start_lip = start; 442 qf->qql_start_lip = start;
443 qf->qql_flags = flags;
459 return qf; 444 return qf;
460} 445}
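Same conversion again, with one structural twist: the dquot and quotaoff items used to carry a pre-built log-format structure inside the log item (filled in once at init time) and merely point an iovec at it. With the new API the format record is assembled directly inside the log vector at format time, so the embedded copies — qli_format and qql_format, see the header hunk below — can be deleted; only the flags survive, as qql_flags. xlog_prepare_iovec()/xlog_finish_iovec() exist for exactly this build-in-place case, versus xlog_copy_iovec() for data that already lives somewhere:

	struct xfs_log_iovec *vecp = NULL;
	struct xfs_dq_logformat *qlf;

	/* reserve an iovec and get a pointer into the log vector's buffer */
	qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
	qlf->qlf_type = XFS_LI_DQUOT;	/* ... build the record in place ... */
	xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));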
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 5acae2ada70b..502e9464634a 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -27,13 +27,12 @@ typedef struct xfs_dq_logitem {
27 xfs_log_item_t qli_item; /* common portion */ 27 xfs_log_item_t qli_item; /* common portion */
28 struct xfs_dquot *qli_dquot; /* dquot ptr */ 28 struct xfs_dquot *qli_dquot; /* dquot ptr */
29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ 29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
30 xfs_dq_logformat_t qli_format; /* logged structure */
31} xfs_dq_logitem_t; 30} xfs_dq_logitem_t;
32 31
33typedef struct xfs_qoff_logitem { 32typedef struct xfs_qoff_logitem {
34 xfs_log_item_t qql_item; /* common portion */ 33 xfs_log_item_t qql_item; /* common portion */
35 struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ 34 struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
36 xfs_qoff_logformat_t qql_format; /* logged structure */ 35 unsigned int qql_flags;
37} xfs_qoff_logitem_t; 36} xfs_qoff_logitem_t;
38 37
39 38
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 3680d04f973f..fb7a4c1ce1c5 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -26,6 +26,7 @@
26#include "xfs_trans_priv.h" 26#include "xfs_trans_priv.h"
27#include "xfs_buf_item.h" 27#include "xfs_buf_item.h"
28#include "xfs_extfree_item.h" 28#include "xfs_extfree_item.h"
29#include "xfs_log.h"
29 30
30 31
31kmem_zone_t *xfs_efi_zone; 32kmem_zone_t *xfs_efi_zone;
@@ -101,9 +102,10 @@ xfs_efi_item_size(
101STATIC void 102STATIC void
102xfs_efi_item_format( 103xfs_efi_item_format(
103 struct xfs_log_item *lip, 104 struct xfs_log_item *lip,
104 struct xfs_log_iovec *log_vector) 105 struct xfs_log_vec *lv)
105{ 106{
106 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 107 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
108 struct xfs_log_iovec *vecp = NULL;
107 109
108 ASSERT(atomic_read(&efip->efi_next_extent) == 110 ASSERT(atomic_read(&efip->efi_next_extent) ==
109 efip->efi_format.efi_nextents); 111 efip->efi_format.efi_nextents);
@@ -111,10 +113,9 @@ xfs_efi_item_format(
111 efip->efi_format.efi_type = XFS_LI_EFI; 113 efip->efi_format.efi_type = XFS_LI_EFI;
112 efip->efi_format.efi_size = 1; 114 efip->efi_format.efi_size = 1;
113 115
114 log_vector->i_addr = &efip->efi_format; 116 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
115 log_vector->i_len = xfs_efi_item_sizeof(efip); 117 &efip->efi_format,
116 log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; 118 xfs_efi_item_sizeof(efip));
117 ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t));
118} 119}
119 120
120 121
@@ -368,19 +369,19 @@ xfs_efd_item_size(
368STATIC void 369STATIC void
369xfs_efd_item_format( 370xfs_efd_item_format(
370 struct xfs_log_item *lip, 371 struct xfs_log_item *lip,
371 struct xfs_log_iovec *log_vector) 372 struct xfs_log_vec *lv)
372{ 373{
373 struct xfs_efd_log_item *efdp = EFD_ITEM(lip); 374 struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
375 struct xfs_log_iovec *vecp = NULL;
374 376
375 ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); 377 ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
376 378
377 efdp->efd_format.efd_type = XFS_LI_EFD; 379 efdp->efd_format.efd_type = XFS_LI_EFD;
378 efdp->efd_format.efd_size = 1; 380 efdp->efd_format.efd_size = 1;
379 381
380 log_vector->i_addr = &efdp->efd_format; 382 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
381 log_vector->i_len = xfs_efd_item_sizeof(efdp); 383 &efdp->efd_format,
382 log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; 384 xfs_efd_item_sizeof(efdp));
383 ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t));
384} 385}
385 386
386/* 387/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 52c91e143725..64b48eade91d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -261,7 +261,8 @@ xfs_file_aio_read(
261 xfs_buftarg_t *target = 261 xfs_buftarg_t *target =
262 XFS_IS_REALTIME_INODE(ip) ? 262 XFS_IS_REALTIME_INODE(ip) ?
263 mp->m_rtdev_targp : mp->m_ddev_targp; 263 mp->m_rtdev_targp : mp->m_ddev_targp;
264 if ((pos & target->bt_smask) || (size & target->bt_smask)) { 264 /* DIO must be aligned to device logical sector size */
265 if ((pos | size) & target->bt_logical_sectormask) {
265 if (pos == i_size_read(inode)) 266 if (pos == i_size_read(inode))
266 return 0; 267 return 0;
267 return -XFS_ERROR(EINVAL); 268 return -XFS_ERROR(EINVAL);
@@ -641,9 +642,11 @@ xfs_file_dio_aio_write(
641 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 642 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
642 mp->m_rtdev_targp : mp->m_ddev_targp; 643 mp->m_rtdev_targp : mp->m_ddev_targp;
643 644
644 if ((pos & target->bt_smask) || (count & target->bt_smask)) 645 /* DIO must be aligned to device logical sector size */
646 if ((pos | count) & target->bt_logical_sectormask)
645 return -XFS_ERROR(EINVAL); 647 return -XFS_ERROR(EINVAL);
646 648
649 /* "unaligned" here means not aligned to a filesystem block */
647 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) 650 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
648 unaligned_io = 1; 651 unaligned_io = 1;
649 652
@@ -796,7 +799,7 @@ xfs_file_aio_write(
796 XFS_STATS_ADD(xs_write_bytes, ret); 799 XFS_STATS_ADD(xs_write_bytes, ret);
797 800
798 /* Handle various SYNC-type writes */ 801 /* Handle various SYNC-type writes */
799 err = generic_write_sync(file, pos, ret); 802 err = generic_write_sync(file, iocb->ki_pos - ret, ret);
800 if (err < 0) 803 if (err < 0)
801 ret = err; 804 ret = err;
802 } 805 }
@@ -912,7 +915,7 @@ xfs_dir_open(
912 * If there are any blocks, read-ahead block 0 as we're almost 915 * If there are any blocks, read-ahead block 0 as we're almost
913 * certain to have the next operation be a read there. 916 * certain to have the next operation be a read there.
914 */ 917 */
915 mode = xfs_ilock_map_shared(ip); 918 mode = xfs_ilock_data_map_shared(ip);
916 if (ip->i_d.di_nextents > 0) 919 if (ip->i_d.di_nextents > 0)
917 xfs_dir3_data_readahead(NULL, ip, 0, -1); 920 xfs_dir3_data_readahead(NULL, ip, 0, -1);
918 xfs_iunlock(ip, mode); 921 xfs_iunlock(ip, mode);
@@ -1215,7 +1218,7 @@ xfs_seek_data(
1215 uint lock; 1218 uint lock;
1216 int error; 1219 int error;
1217 1220
1218 lock = xfs_ilock_map_shared(ip); 1221 lock = xfs_ilock_data_map_shared(ip);
1219 1222
1220 isize = i_size_read(inode); 1223 isize = i_size_read(inode);
1221 if (start >= isize) { 1224 if (start >= isize) {
@@ -1294,7 +1297,7 @@ out:
1294 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 1297 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1295 1298
1296out_unlock: 1299out_unlock:
1297 xfs_iunlock_map_shared(ip, lock); 1300 xfs_iunlock(ip, lock);
1298 1301
1299 if (error) 1302 if (error)
1300 return -error; 1303 return -error;
@@ -1319,7 +1322,7 @@ xfs_seek_hole(
1319 if (XFS_FORCED_SHUTDOWN(mp)) 1322 if (XFS_FORCED_SHUTDOWN(mp))
1320 return -XFS_ERROR(EIO); 1323 return -XFS_ERROR(EIO);
1321 1324
1322 lock = xfs_ilock_map_shared(ip); 1325 lock = xfs_ilock_data_map_shared(ip);
1323 1326
1324 isize = i_size_read(inode); 1327 isize = i_size_read(inode);
1325 if (start >= isize) { 1328 if (start >= isize) {
@@ -1402,7 +1405,7 @@ out:
1402 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 1405 offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
1403 1406
1404out_unlock: 1407out_unlock:
1405 xfs_iunlock_map_shared(ip, lock); 1408 xfs_iunlock(ip, lock);
1406 1409
1407 if (error) 1410 if (error)
1408 return -error; 1411 return -error;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index e87719c5bebe..5d7f105a1c82 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -52,7 +52,7 @@ xfs_ialloc_cluster_alignment(
52{ 52{
53 if (xfs_sb_version_hasalign(&args->mp->m_sb) && 53 if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
54 args->mp->m_sb.sb_inoalignmt >= 54 args->mp->m_sb.sb_inoalignmt >=
55 XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) 55 XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
56 return args->mp->m_sb.sb_inoalignmt; 56 return args->mp->m_sb.sb_inoalignmt;
57 return 1; 57 return 1;
58} 58}
@@ -170,27 +170,20 @@ xfs_ialloc_inode_init(
170{ 170{
171 struct xfs_buf *fbuf; 171 struct xfs_buf *fbuf;
172 struct xfs_dinode *free; 172 struct xfs_dinode *free;
173 int blks_per_cluster, nbufs, ninodes; 173 int nbufs, blks_per_cluster, inodes_per_cluster;
174 int version; 174 int version;
175 int i, j; 175 int i, j;
176 xfs_daddr_t d; 176 xfs_daddr_t d;
177 xfs_ino_t ino = 0; 177 xfs_ino_t ino = 0;
178 178
179 /* 179 /*
180 * Loop over the new block(s), filling in the inodes. 180 * Loop over the new block(s), filling in the inodes. For small block
181 * For small block sizes, manipulate the inodes in buffers 181 * sizes, manipulate the inodes in buffers which are multiples of the
182 * which are multiples of the blocks size. 182 * blocks size.
183 */ 183 */
184 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 184 blks_per_cluster = xfs_icluster_size_fsb(mp);
185 blks_per_cluster = 1; 185 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
186 nbufs = length; 186 nbufs = length / blks_per_cluster;
187 ninodes = mp->m_sb.sb_inopblock;
188 } else {
189 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
190 mp->m_sb.sb_blocksize;
191 nbufs = length / blks_per_cluster;
192 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
193 }
194 187
195 /* 188 /*
196 * Figure out what version number to use in the inodes we create. If 189 * Figure out what version number to use in the inodes we create. If
@@ -225,7 +218,7 @@ xfs_ialloc_inode_init(
225 * they track in the AIL as if they were physically logged. 218 * they track in the AIL as if they were physically logged.
226 */ 219 */
227 if (tp) 220 if (tp)
228 xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp), 221 xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
229 mp->m_sb.sb_inodesize, length, gen); 222 mp->m_sb.sb_inodesize, length, gen);
230 } else if (xfs_sb_version_hasnlink(&mp->m_sb)) 223 } else if (xfs_sb_version_hasnlink(&mp->m_sb))
231 version = 2; 224 version = 2;
@@ -246,7 +239,7 @@ xfs_ialloc_inode_init(
246 /* Initialize the inode buffers and log them appropriately. */ 239 /* Initialize the inode buffers and log them appropriately. */
247 fbuf->b_ops = &xfs_inode_buf_ops; 240 fbuf->b_ops = &xfs_inode_buf_ops;
248 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); 241 xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
249 for (i = 0; i < ninodes; i++) { 242 for (i = 0; i < inodes_per_cluster; i++) {
250 int ioffset = i << mp->m_sb.sb_inodelog; 243 int ioffset = i << mp->m_sb.sb_inodelog;
251 uint isize = xfs_dinode_size(version); 244 uint isize = xfs_dinode_size(version);
252 245
@@ -329,11 +322,11 @@ xfs_ialloc_ag_alloc(
329 * Locking will ensure that we don't have two callers in here 322 * Locking will ensure that we don't have two callers in here
330 * at one time. 323 * at one time.
331 */ 324 */
332 newlen = XFS_IALLOC_INODES(args.mp); 325 newlen = args.mp->m_ialloc_inos;
333 if (args.mp->m_maxicount && 326 if (args.mp->m_maxicount &&
334 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) 327 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
335 return XFS_ERROR(ENOSPC); 328 return XFS_ERROR(ENOSPC);
336 args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); 329 args.minlen = args.maxlen = args.mp->m_ialloc_blks;
337 /* 330 /*
338 * First try to allocate inodes contiguous with the last-allocated 331 * First try to allocate inodes contiguous with the last-allocated
339 * chunk of inodes. If the filesystem is striped, this will fill 332 * chunk of inodes. If the filesystem is striped, this will fill
@@ -343,7 +336,7 @@ xfs_ialloc_ag_alloc(
343 newino = be32_to_cpu(agi->agi_newino); 336 newino = be32_to_cpu(agi->agi_newino);
344 agno = be32_to_cpu(agi->agi_seqno); 337 agno = be32_to_cpu(agi->agi_seqno);
345 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + 338 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
346 XFS_IALLOC_BLOCKS(args.mp); 339 args.mp->m_ialloc_blks;
347 if (likely(newino != NULLAGINO && 340 if (likely(newino != NULLAGINO &&
348 (args.agbno < be32_to_cpu(agi->agi_length)))) { 341 (args.agbno < be32_to_cpu(agi->agi_length)))) {
349 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); 342 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -585,7 +578,7 @@ xfs_ialloc_ag_select(
585 * Is there enough free space for the file plus a block of 578 * Is there enough free space for the file plus a block of
586 * inodes (if we need to allocate some)? 579 * inodes (if we need to allocate some)?
587 */ 580 */
588 ineed = XFS_IALLOC_BLOCKS(mp); 581 ineed = mp->m_ialloc_blks;
589 longest = pag->pagf_longest; 582 longest = pag->pagf_longest;
590 if (!longest) 583 if (!longest)
591 longest = pag->pagf_flcount > 0; 584 longest = pag->pagf_flcount > 0;
@@ -999,7 +992,7 @@ xfs_dialloc(
999 * inode. 992 * inode.
1000 */ 993 */
1001 if (mp->m_maxicount && 994 if (mp->m_maxicount &&
1002 mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { 995 mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
1003 noroom = 1; 996 noroom = 1;
1004 okalloc = 0; 997 okalloc = 0;
1005 } 998 }
@@ -1202,7 +1195,7 @@ xfs_difree(
1202 * When an inode cluster is free, it becomes eligible for removal 1195 * When an inode cluster is free, it becomes eligible for removal
1203 */ 1196 */
1204 if (!(mp->m_flags & XFS_MOUNT_IKEEP) && 1197 if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
1205 (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { 1198 (rec.ir_freecount == mp->m_ialloc_inos)) {
1206 1199
1207 *delete = 1; 1200 *delete = 1;
1208 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); 1201 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
@@ -1212,7 +1205,7 @@ xfs_difree(
1212 * AGI and Superblock inode counts, and mark the disk space 1205 * AGI and Superblock inode counts, and mark the disk space
1213 * to be freed when the transaction is committed. 1206 * to be freed when the transaction is committed.
1214 */ 1207 */
1215 ilen = XFS_IALLOC_INODES(mp); 1208 ilen = mp->m_ialloc_inos;
1216 be32_add_cpu(&agi->agi_count, -ilen); 1209 be32_add_cpu(&agi->agi_count, -ilen);
1217 be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); 1210 be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1218 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); 1211 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1228,9 +1221,9 @@ xfs_difree(
1228 goto error0; 1221 goto error0;
1229 } 1222 }
1230 1223
1231 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, 1224 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
1232 agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), 1225 XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
1233 XFS_IALLOC_BLOCKS(mp), flist, mp); 1226 mp->m_ialloc_blks, flist, mp);
1234 } else { 1227 } else {
1235 *delete = 0; 1228 *delete = 0;
1236 1229
@@ -1311,7 +1304,7 @@ xfs_imap_lookup(
1311 1304
1312 /* check that the returned record contains the required inode */ 1305 /* check that the returned record contains the required inode */
1313 if (rec.ir_startino > agino || 1306 if (rec.ir_startino > agino ||
1314 rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) 1307 rec.ir_startino + mp->m_ialloc_inos <= agino)
1315 return EINVAL; 1308 return EINVAL;
1316 1309
1317 /* for untrusted inodes check it is allocated first */ 1310 /* for untrusted inodes check it is allocated first */
@@ -1384,7 +1377,7 @@ xfs_imap(
1384 return XFS_ERROR(EINVAL); 1377 return XFS_ERROR(EINVAL);
1385 } 1378 }
1386 1379
1387 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; 1380 blks_per_cluster = xfs_icluster_size_fsb(mp);
1388 1381
1389 /* 1382 /*
1390 * For bulkstat and handle lookups, we have an untrusted inode number 1383 * For bulkstat and handle lookups, we have an untrusted inode number
@@ -1405,7 +1398,7 @@ xfs_imap(
1405 * If the inode cluster size is the same as the blocksize or 1398 * If the inode cluster size is the same as the blocksize or
1406 * smaller, we get to the buffer by simple arithmetic. 1399 * smaller, we get to the buffer by simple arithmetic.
1407 */ 1400 */
1408 if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { 1401 if (blks_per_cluster == 1) {
1409 offset = XFS_INO_TO_OFFSET(mp, ino); 1402 offset = XFS_INO_TO_OFFSET(mp, ino);
1410 ASSERT(offset < mp->m_sb.sb_inopblock); 1403 ASSERT(offset < mp->m_sb.sb_inopblock);
1411 1404
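A quick equivalence note on the xfs_imap() change above: the new "blks_per_cluster == 1" test means exactly what the old cluster-size-versus-blocksize comparison meant. A minimal, self-contained sketch of that equivalence (the struct and field names below are simplified stand-ins, not the kernel types):

#include <assert.h>

struct demo_mount {
	unsigned int	blocksize;		/* stands in for sb_blocksize */
	unsigned int	blocklog;		/* log2(blocksize), sb_blocklog */
	unsigned int	inode_cluster_size;	/* m_inode_cluster_size */
};

/* mirrors the xfs_icluster_size_fsb() helper introduced below */
static int demo_icluster_size_fsb(const struct demo_mount *mp)
{
	if (mp->blocksize >= mp->inode_cluster_size)
		return 1;
	return mp->inode_cluster_size >> mp->blocklog;
}

int main(void)
{
	struct demo_mount mp = { 4096, 12, 8192 };

	/* old test: cluster <= blocksize; new test: one block per cluster */
	assert((mp.inode_cluster_size <= mp.blocksize) ==
	       (demo_icluster_size_fsb(&mp) == 1));

	mp.inode_cluster_size = 2048;	/* cluster smaller than a block */
	assert(demo_icluster_size_fsb(&mp) == 1);
	return 0;
}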
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index a8f76a5ff418..812365d17e67 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -25,17 +25,18 @@ struct xfs_mount;
25struct xfs_trans; 25struct xfs_trans;
26struct xfs_btree_cur; 26struct xfs_btree_cur;
27 27
28/* 28/* Move inodes in clusters of this size */
29 * Allocation parameters for inode allocation.
30 */
31#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos
32#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks
33
34/*
35 * Move inodes in clusters of this size.
36 */
37#define XFS_INODE_BIG_CLUSTER_SIZE 8192 29#define XFS_INODE_BIG_CLUSTER_SIZE 8192
38#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size 30
31/* Calculate and return the number of filesystem blocks per inode cluster */
32static inline int
33xfs_icluster_size_fsb(
34 struct xfs_mount *mp)
35{
36 if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
37 return 1;
38 return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
39}
39 40
40/* 41/*
41 * Make an inode pointer out of the buffer/offset. 42 * Make an inode pointer out of the buffer/offset.
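The new helper centralizes geometry that several call sites previously open-coded. A short sketch of the derived quantities as the converted callers in this series compute them (fragment, not a standalone program; sb_inopblog is log2 of inodes per filesystem block):

	/* blocks per inode cluster, from the new helper */
	int blks_per_cluster = xfs_icluster_size_fsb(mp);

	/* inodes per cluster: blocks per cluster times inodes per block */
	int inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;

	/* buffers needed to cover a chunk of "length" filesystem blocks */
	int nbufs = length / blks_per_cluster;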
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index d2eaccfa73f4..7e4549233251 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -28,6 +28,7 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30#include "xfs_icreate_item.h" 30#include "xfs_icreate_item.h"
31#include "xfs_log.h"
31 32
32kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ 33kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
33 34
@@ -58,13 +59,14 @@ xfs_icreate_item_size(
58STATIC void 59STATIC void
59xfs_icreate_item_format( 60xfs_icreate_item_format(
60 struct xfs_log_item *lip, 61 struct xfs_log_item *lip,
61 struct xfs_log_iovec *log_vector) 62 struct xfs_log_vec *lv)
62{ 63{
63 struct xfs_icreate_item *icp = ICR_ITEM(lip); 64 struct xfs_icreate_item *icp = ICR_ITEM(lip);
65 struct xfs_log_iovec *vecp = NULL;
64 66
65 log_vector->i_addr = (xfs_caddr_t)&icp->ic_format; 67 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE,
66 log_vector->i_len = sizeof(struct xfs_icreate_log); 68 &icp->ic_format,
67 log_vector->i_type = XLOG_REG_TYPE_ICREATE; 69 sizeof(struct xfs_icreate_log));
68} 70}
69 71
70 72
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 001aa893ed59..3a137e9f9a7d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -77,48 +77,44 @@ xfs_get_extsz_hint(
77} 77}
78 78
79/* 79/*
80 * This is a wrapper routine around the xfs_ilock() routine used to centralize 80 * These two are wrapper routines around the xfs_ilock() routine used to
81 * some grungy code. It is used in places that wish to lock the inode solely 81 * centralize some grungy code. They are used in places that wish to lock the
82 * for reading the extents. The reason these places can't just call 82 * inode solely for reading the extents. The reason these places can't just
83 * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the 83 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
84 * extents from disk for a file in b-tree format. If the inode is in b-tree 84 * bringing in of the extents from disk for a file in b-tree format. If the
85 * format, then we need to lock the inode exclusively until the extents are read 85 * inode is in b-tree format, then we need to lock the inode exclusively until
86 * in. Locking it exclusively all the time would limit our parallelism 86 * the extents are read in. Locking it exclusively all the time would limit
87 * unnecessarily, though. What we do instead is check to see if the extents 87 * our parallelism unnecessarily, though. What we do instead is check to see
88 * have been read in yet, and only lock the inode exclusively if they have not. 88 * if the extents have been read in yet, and only lock the inode exclusively
89 * if they have not.
89 * 90 *
90 * The function returns a value which should be given to the corresponding 91 * The functions return a value which should be given to the corresponding
91 * xfs_iunlock_map_shared(). This value is the mode in which the lock was 92 * xfs_iunlock() call.
92 * actually taken.
93 */ 93 */
94uint 94uint
95xfs_ilock_map_shared( 95xfs_ilock_data_map_shared(
96 xfs_inode_t *ip) 96 struct xfs_inode *ip)
97{ 97{
98 uint lock_mode; 98 uint lock_mode = XFS_ILOCK_SHARED;
99 99
100 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && 100 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
101 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { 101 (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
102 lock_mode = XFS_ILOCK_EXCL; 102 lock_mode = XFS_ILOCK_EXCL;
103 } else {
104 lock_mode = XFS_ILOCK_SHARED;
105 }
106
107 xfs_ilock(ip, lock_mode); 103 xfs_ilock(ip, lock_mode);
108
109 return lock_mode; 104 return lock_mode;
110} 105}
111 106
112/* 107uint
113 * This is simply the unlock routine to go with xfs_ilock_map_shared(). 108xfs_ilock_attr_map_shared(
114 * All it does is call xfs_iunlock() with the given lock_mode. 109 struct xfs_inode *ip)
115 */
116void
117xfs_iunlock_map_shared(
118 xfs_inode_t *ip,
119 unsigned int lock_mode)
120{ 110{
121 xfs_iunlock(ip, lock_mode); 111 uint lock_mode = XFS_ILOCK_SHARED;
112
113 if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
114 (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
115 lock_mode = XFS_ILOCK_EXCL;
116 xfs_ilock(ip, lock_mode);
117 return lock_mode;
122} 118}
123 119
124/* 120/*
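The practical upshot of this rework is the calling convention: there is no longer a dedicated unlock wrapper, so callers hand whatever mode the lock helper returned straight back to xfs_iunlock(). A sketch of the pattern, taken from the xfs_lookup() hunk that follows:

	uint	lock_mode;

	lock_mode = xfs_ilock_data_map_shared(dp);
	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
	xfs_iunlock(dp, lock_mode);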
@@ -588,9 +584,9 @@ xfs_lookup(
588 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 584 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
589 return XFS_ERROR(EIO); 585 return XFS_ERROR(EIO);
590 586
591 lock_mode = xfs_ilock_map_shared(dp); 587 lock_mode = xfs_ilock_data_map_shared(dp);
592 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); 588 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
593 xfs_iunlock_map_shared(dp, lock_mode); 589 xfs_iunlock(dp, lock_mode);
594 590
595 if (error) 591 if (error)
596 goto out; 592 goto out;
@@ -2141,8 +2137,8 @@ xfs_ifree_cluster(
2141{ 2137{
2142 xfs_mount_t *mp = free_ip->i_mount; 2138 xfs_mount_t *mp = free_ip->i_mount;
2143 int blks_per_cluster; 2139 int blks_per_cluster;
2140 int inodes_per_cluster;
2144 int nbufs; 2141 int nbufs;
2145 int ninodes;
2146 int i, j; 2142 int i, j;
2147 xfs_daddr_t blkno; 2143 xfs_daddr_t blkno;
2148 xfs_buf_t *bp; 2144 xfs_buf_t *bp;
@@ -2152,18 +2148,11 @@ xfs_ifree_cluster(
2152 struct xfs_perag *pag; 2148 struct xfs_perag *pag;
2153 2149
2154 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); 2150 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2155 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { 2151 blks_per_cluster = xfs_icluster_size_fsb(mp);
2156 blks_per_cluster = 1; 2152 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
2157 ninodes = mp->m_sb.sb_inopblock; 2153 nbufs = mp->m_ialloc_blks / blks_per_cluster;
2158 nbufs = XFS_IALLOC_BLOCKS(mp);
2159 } else {
2160 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
2161 mp->m_sb.sb_blocksize;
2162 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
2163 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
2164 }
2165 2154
2166 for (j = 0; j < nbufs; j++, inum += ninodes) { 2155 for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
2167 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2156 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2168 XFS_INO_TO_AGBNO(mp, inum)); 2157 XFS_INO_TO_AGBNO(mp, inum));
2169 2158
@@ -2225,7 +2214,7 @@ xfs_ifree_cluster(
2225 * transaction stale above, which means there is no point in 2214 * transaction stale above, which means there is no point in
2226 * even trying to lock them. 2215 * even trying to lock them.
2227 */ 2216 */
2228 for (i = 0; i < ninodes; i++) { 2217 for (i = 0; i < inodes_per_cluster; i++) {
2229retry: 2218retry:
2230 rcu_read_lock(); 2219 rcu_read_lock();
2231 ip = radix_tree_lookup(&pag->pag_ici_root, 2220 ip = radix_tree_lookup(&pag->pag_ici_root,
@@ -2906,13 +2895,13 @@ xfs_iflush_cluster(
2906 2895
2907 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2896 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2908 2897
2909 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2898 inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
2910 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2899 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2911 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); 2900 ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2912 if (!ilist) 2901 if (!ilist)
2913 goto out_put; 2902 goto out_put;
2914 2903
2915 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 2904 mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
2916 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; 2905 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2917 rcu_read_lock(); 2906 rcu_read_lock();
2918 /* really need a gang lookup range call here */ 2907 /* really need a gang lookup range call here */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 9e6efccbae04..65e2350f449c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -337,8 +337,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint);
337void xfs_iunlock(xfs_inode_t *, uint); 337void xfs_iunlock(xfs_inode_t *, uint);
338void xfs_ilock_demote(xfs_inode_t *, uint); 338void xfs_ilock_demote(xfs_inode_t *, uint);
339int xfs_isilocked(xfs_inode_t *, uint); 339int xfs_isilocked(xfs_inode_t *, uint);
340uint xfs_ilock_map_shared(xfs_inode_t *); 340uint xfs_ilock_data_map_shared(struct xfs_inode *);
341void xfs_iunlock_map_shared(xfs_inode_t *, uint); 341uint xfs_ilock_attr_map_shared(struct xfs_inode *);
342int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, 342int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
343 xfs_nlink_t, xfs_dev_t, prid_t, int, 343 xfs_nlink_t, xfs_dev_t, prid_t, int,
344 struct xfs_buf **, xfs_inode_t **); 344 struct xfs_buf **, xfs_inode_t **);
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
index cfee14a83cfe..73514c0486b7 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/xfs_inode_fork.c
@@ -431,6 +431,8 @@ xfs_iread_extents(
431 xfs_ifork_t *ifp; 431 xfs_ifork_t *ifp;
432 xfs_extnum_t nextents; 432 xfs_extnum_t nextents;
433 433
434 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
435
434 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { 436 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
435 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, 437 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
436 ip->i_mount); 438 ip->i_mount);
@@ -721,15 +723,16 @@ xfs_idestroy_fork(
721} 723}
722 724
723/* 725/*
724 * xfs_iextents_copy() 726 * Convert in-core extents to on-disk form
725 * 727 *
726 * This is called to copy the REAL extents (as opposed to the delayed 728 * For either the data or attr fork in extent format, we need to endian convert
727 * allocation extents) from the inode into the given buffer. It 729 * the in-core extent as we place them into the on-disk inode.
728 * returns the number of bytes copied into the buffer.
729 * 730 *
730 * If there are no delayed allocation extents, then we can just 731 * In the case of the data fork, the in-core and on-disk fork sizes can be
731 * memcpy() the extents into the buffer. Otherwise, we need to 732 * different due to delayed allocation extents. We only copy on-disk extents
732 * examine each extent in turn and skip those which are delayed. 733 * here, so callers must always use the physical fork size to determine the
734 * size of the buffer passed to this routine. We will return the size actually
735 * used.
733 */ 736 */
734int 737int
735xfs_iextents_copy( 738xfs_iextents_copy(
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7c0d391f9a6e..686889b4a1e5 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -30,6 +30,7 @@
30#include "xfs_trace.h" 30#include "xfs_trace.h"
31#include "xfs_trans_priv.h" 31#include "xfs_trans_priv.h"
32#include "xfs_dinode.h" 32#include "xfs_dinode.h"
33#include "xfs_log.h"
33 34
34 35
35kmem_zone_t *xfs_ili_zone; /* inode log item zone */ 36kmem_zone_t *xfs_ili_zone; /* inode log item zone */
@@ -39,27 +40,14 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
39 return container_of(lip, struct xfs_inode_log_item, ili_item); 40 return container_of(lip, struct xfs_inode_log_item, ili_item);
40} 41}
41 42
42
43/*
44 * This returns the number of iovecs needed to log the given inode item.
45 *
46 * We need one iovec for the inode log format structure, one for the
47 * inode core, and possibly one for the inode data/extents/b-tree root
48 * and one for the inode attribute data/extents/b-tree root.
49 */
50STATIC void 43STATIC void
51xfs_inode_item_size( 44xfs_inode_item_data_fork_size(
52 struct xfs_log_item *lip, 45 struct xfs_inode_log_item *iip,
53 int *nvecs, 46 int *nvecs,
54 int *nbytes) 47 int *nbytes)
55{ 48{
56 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
57 struct xfs_inode *ip = iip->ili_inode; 49 struct xfs_inode *ip = iip->ili_inode;
58 50
59 *nvecs += 2;
60 *nbytes += sizeof(struct xfs_inode_log_format) +
61 xfs_icdinode_size(ip->i_d.di_version);
62
63 switch (ip->i_d.di_format) { 51 switch (ip->i_d.di_format) {
64 case XFS_DINODE_FMT_EXTENTS: 52 case XFS_DINODE_FMT_EXTENTS:
65 if ((iip->ili_fields & XFS_ILOG_DEXT) && 53 if ((iip->ili_fields & XFS_ILOG_DEXT) &&
@@ -70,7 +58,6 @@ xfs_inode_item_size(
70 *nvecs += 1; 58 *nvecs += 1;
71 } 59 }
72 break; 60 break;
73
74 case XFS_DINODE_FMT_BTREE: 61 case XFS_DINODE_FMT_BTREE:
75 if ((iip->ili_fields & XFS_ILOG_DBROOT) && 62 if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
76 ip->i_df.if_broot_bytes > 0) { 63 ip->i_df.if_broot_bytes > 0) {
@@ -78,7 +65,6 @@ xfs_inode_item_size(
78 *nvecs += 1; 65 *nvecs += 1;
79 } 66 }
80 break; 67 break;
81
82 case XFS_DINODE_FMT_LOCAL: 68 case XFS_DINODE_FMT_LOCAL:
83 if ((iip->ili_fields & XFS_ILOG_DDATA) && 69 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
84 ip->i_df.if_bytes > 0) { 70 ip->i_df.if_bytes > 0) {
@@ -90,19 +76,20 @@ xfs_inode_item_size(
90 case XFS_DINODE_FMT_DEV: 76 case XFS_DINODE_FMT_DEV:
91 case XFS_DINODE_FMT_UUID: 77 case XFS_DINODE_FMT_UUID:
92 break; 78 break;
93
94 default: 79 default:
95 ASSERT(0); 80 ASSERT(0);
96 break; 81 break;
97 } 82 }
83}
98 84
99 if (!XFS_IFORK_Q(ip)) 85STATIC void
100 return; 86xfs_inode_item_attr_fork_size(
101 87 struct xfs_inode_log_item *iip,
88 int *nvecs,
89 int *nbytes)
90{
91 struct xfs_inode *ip = iip->ili_inode;
102 92
103 /*
104 * Log any necessary attribute data.
105 */
106 switch (ip->i_d.di_aformat) { 93 switch (ip->i_d.di_aformat) {
107 case XFS_DINODE_FMT_EXTENTS: 94 case XFS_DINODE_FMT_EXTENTS:
108 if ((iip->ili_fields & XFS_ILOG_AEXT) && 95 if ((iip->ili_fields & XFS_ILOG_AEXT) &&
@@ -113,7 +100,6 @@ xfs_inode_item_size(
113 *nvecs += 1; 100 *nvecs += 1;
114 } 101 }
115 break; 102 break;
116
117 case XFS_DINODE_FMT_BTREE: 103 case XFS_DINODE_FMT_BTREE:
118 if ((iip->ili_fields & XFS_ILOG_ABROOT) && 104 if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
119 ip->i_afp->if_broot_bytes > 0) { 105 ip->i_afp->if_broot_bytes > 0) {
@@ -121,7 +107,6 @@ xfs_inode_item_size(
121 *nvecs += 1; 107 *nvecs += 1;
122 } 108 }
123 break; 109 break;
124
125 case XFS_DINODE_FMT_LOCAL: 110 case XFS_DINODE_FMT_LOCAL:
126 if ((iip->ili_fields & XFS_ILOG_ADATA) && 111 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
127 ip->i_afp->if_bytes > 0) { 112 ip->i_afp->if_bytes > 0) {
@@ -129,7 +114,6 @@ xfs_inode_item_size(
129 *nvecs += 1; 114 *nvecs += 1;
130 } 115 }
131 break; 116 break;
132
133 default: 117 default:
134 ASSERT(0); 118 ASSERT(0);
135 break; 119 break;
@@ -137,98 +121,67 @@ xfs_inode_item_size(
137} 121}
138 122
139/* 123/*
140 * xfs_inode_item_format_extents - convert in-core extents to on-disk form 124 * This returns the number of iovecs needed to log the given inode item.
141 *
142 * For either the data or attr fork in extent format, we need to endian convert
143 * the in-core extent as we place them into the on-disk inode. In this case, we
144 * need to do this conversion before we write the extents into the log. Because
145 * we don't have the disk inode to write into here, we allocate a buffer and
146 * format the extents into it via xfs_iextents_copy(). We free the buffer in
147 * the unlock routine after the copy for the log has been made.
148 * 125 *
149 * In the case of the data fork, the in-core and on-disk fork sizes can be 126 * We need one iovec for the inode log format structure, one for the
150 * different due to delayed allocation extents. We only log on-disk extents 127 * inode core, and possibly one for the inode data/extents/b-tree root
151 * here, so always use the physical fork size to determine the size of the 128 * and one for the inode attribute data/extents/b-tree root.
152 * buffer we need to allocate.
153 */ 129 */
154STATIC void 130STATIC void
155xfs_inode_item_format_extents( 131xfs_inode_item_size(
156 struct xfs_inode *ip, 132 struct xfs_log_item *lip,
157 struct xfs_log_iovec *vecp, 133 int *nvecs,
158 int whichfork, 134 int *nbytes)
159 int type)
160{ 135{
161 xfs_bmbt_rec_t *ext_buffer; 136 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
137 struct xfs_inode *ip = iip->ili_inode;
162 138
163 ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); 139 *nvecs += 2;
164 if (whichfork == XFS_DATA_FORK) 140 *nbytes += sizeof(struct xfs_inode_log_format) +
165 ip->i_itemp->ili_extents_buf = ext_buffer; 141 xfs_icdinode_size(ip->i_d.di_version);
166 else
167 ip->i_itemp->ili_aextents_buf = ext_buffer;
168 142
169 vecp->i_addr = ext_buffer; 143 xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
170 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); 144 if (XFS_IFORK_Q(ip))
171 vecp->i_type = type; 145 xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
172} 146}
173 147
174/* 148/*
175 * This is called to fill in the vector of log iovecs for the 149 * If this is a v1 format inode, then we need to log it as such. This means
176 * given inode log item. It fills the first item with an inode 150 * that we have to copy the link count from the new field to the old. We
177 * log format structure, the second with the on-disk inode structure, 151 * don't have to worry about the new fields, because nothing trusts them as
178 * and a possible third and/or fourth with the inode data/extents/b-tree 152 * long as the old inode version number is there.
179 * root and inode attributes data/extents/b-tree root.
180 */ 153 */
181STATIC void 154STATIC void
182xfs_inode_item_format( 155xfs_inode_item_format_v1_inode(
183 struct xfs_log_item *lip, 156 struct xfs_inode *ip)
184 struct xfs_log_iovec *vecp) 157{
158 if (!xfs_sb_version_hasnlink(&ip->i_mount->m_sb)) {
159 /*
160 * Convert it back.
161 */
162 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
163 ip->i_d.di_onlink = ip->i_d.di_nlink;
164 } else {
165 /*
166 * The superblock version has already been bumped,
167 * so just make the conversion to the new inode
168 * format permanent.
169 */
170 ip->i_d.di_version = 2;
171 ip->i_d.di_onlink = 0;
172 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
173 }
174}
175
176STATIC void
177xfs_inode_item_format_data_fork(
178 struct xfs_inode_log_item *iip,
179 struct xfs_inode_log_format *ilf,
180 struct xfs_log_vec *lv,
181 struct xfs_log_iovec **vecp)
185{ 182{
186 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
187 struct xfs_inode *ip = iip->ili_inode; 183 struct xfs_inode *ip = iip->ili_inode;
188 uint nvecs;
189 size_t data_bytes; 184 size_t data_bytes;
190 xfs_mount_t *mp;
191
192 vecp->i_addr = &iip->ili_format;
193 vecp->i_len = sizeof(xfs_inode_log_format_t);
194 vecp->i_type = XLOG_REG_TYPE_IFORMAT;
195 vecp++;
196 nvecs = 1;
197
198 vecp->i_addr = &ip->i_d;
199 vecp->i_len = xfs_icdinode_size(ip->i_d.di_version);
200 vecp->i_type = XLOG_REG_TYPE_ICORE;
201 vecp++;
202 nvecs++;
203
204 /*
205 * If this is really an old format inode, then we need to
206 * log it as such. This means that we have to copy the link
207 * count from the new field to the old. We don't have to worry
208 * about the new fields, because nothing trusts them as long as
209 * the old inode version number is there. If the superblock already
210 * has a new version number, then we don't bother converting back.
211 */
212 mp = ip->i_mount;
213 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
214 if (ip->i_d.di_version == 1) {
215 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
216 /*
217 * Convert it back.
218 */
219 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
220 ip->i_d.di_onlink = ip->i_d.di_nlink;
221 } else {
222 /*
223 * The superblock version has already been bumped,
224 * so just make the conversion to the new inode
225 * format permanent.
226 */
227 ip->i_d.di_version = 2;
228 ip->i_d.di_onlink = 0;
229 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
230 }
231 }
232 185
233 switch (ip->i_d.di_format) { 186 switch (ip->i_d.di_format) {
234 case XFS_DINODE_FMT_EXTENTS: 187 case XFS_DINODE_FMT_EXTENTS:
@@ -239,36 +192,23 @@ xfs_inode_item_format(
239 if ((iip->ili_fields & XFS_ILOG_DEXT) && 192 if ((iip->ili_fields & XFS_ILOG_DEXT) &&
240 ip->i_d.di_nextents > 0 && 193 ip->i_d.di_nextents > 0 &&
241 ip->i_df.if_bytes > 0) { 194 ip->i_df.if_bytes > 0) {
195 struct xfs_bmbt_rec *p;
196
242 ASSERT(ip->i_df.if_u1.if_extents != NULL); 197 ASSERT(ip->i_df.if_u1.if_extents != NULL);
243 ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); 198 ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
244 ASSERT(iip->ili_extents_buf == NULL); 199
245 200 p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
246#ifdef XFS_NATIVE_HOST 201 data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
247 if (ip->i_d.di_nextents == ip->i_df.if_bytes / 202 xlog_finish_iovec(lv, *vecp, data_bytes);
248 (uint)sizeof(xfs_bmbt_rec_t)) { 203
249 /* 204 ASSERT(data_bytes <= ip->i_df.if_bytes);
250 * There are no delayed allocation 205
251 * extents, so just point to the 206 ilf->ilf_dsize = data_bytes;
252 * real extents array. 207 ilf->ilf_size++;
253 */
254 vecp->i_addr = ip->i_df.if_u1.if_extents;
255 vecp->i_len = ip->i_df.if_bytes;
256 vecp->i_type = XLOG_REG_TYPE_IEXT;
257 } else
258#endif
259 {
260 xfs_inode_item_format_extents(ip, vecp,
261 XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
262 }
263 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
264 iip->ili_format.ilf_dsize = vecp->i_len;
265 vecp++;
266 nvecs++;
267 } else { 208 } else {
268 iip->ili_fields &= ~XFS_ILOG_DEXT; 209 iip->ili_fields &= ~XFS_ILOG_DEXT;
269 } 210 }
270 break; 211 break;
271
272 case XFS_DINODE_FMT_BTREE: 212 case XFS_DINODE_FMT_BTREE:
273 iip->ili_fields &= 213 iip->ili_fields &=
274 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | 214 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
@@ -277,80 +217,70 @@ xfs_inode_item_format(
277 if ((iip->ili_fields & XFS_ILOG_DBROOT) && 217 if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
278 ip->i_df.if_broot_bytes > 0) { 218 ip->i_df.if_broot_bytes > 0) {
279 ASSERT(ip->i_df.if_broot != NULL); 219 ASSERT(ip->i_df.if_broot != NULL);
280 vecp->i_addr = ip->i_df.if_broot; 220 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT,
281 vecp->i_len = ip->i_df.if_broot_bytes; 221 ip->i_df.if_broot,
282 vecp->i_type = XLOG_REG_TYPE_IBROOT; 222 ip->i_df.if_broot_bytes);
283 vecp++; 223 ilf->ilf_dsize = ip->i_df.if_broot_bytes;
284 nvecs++; 224 ilf->ilf_size++;
285 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
286 } else { 225 } else {
287 ASSERT(!(iip->ili_fields & 226 ASSERT(!(iip->ili_fields &
288 XFS_ILOG_DBROOT)); 227 XFS_ILOG_DBROOT));
289 iip->ili_fields &= ~XFS_ILOG_DBROOT; 228 iip->ili_fields &= ~XFS_ILOG_DBROOT;
290 } 229 }
291 break; 230 break;
292
293 case XFS_DINODE_FMT_LOCAL: 231 case XFS_DINODE_FMT_LOCAL:
294 iip->ili_fields &= 232 iip->ili_fields &=
295 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | 233 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
296 XFS_ILOG_DEV | XFS_ILOG_UUID); 234 XFS_ILOG_DEV | XFS_ILOG_UUID);
297 if ((iip->ili_fields & XFS_ILOG_DDATA) && 235 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
298 ip->i_df.if_bytes > 0) { 236 ip->i_df.if_bytes > 0) {
299 ASSERT(ip->i_df.if_u1.if_data != NULL);
300 ASSERT(ip->i_d.di_size > 0);
301
302 vecp->i_addr = ip->i_df.if_u1.if_data;
303 /* 237 /*
304 * Round i_bytes up to a word boundary. 238 * Round i_bytes up to a word boundary.
305 * The underlying memory is guaranteed 239 * The underlying memory is guaranteed
306 * to be there by xfs_idata_realloc(). 240 * to be there by xfs_idata_realloc().
307 */ 241 */
308 data_bytes = roundup(ip->i_df.if_bytes, 4); 242 data_bytes = roundup(ip->i_df.if_bytes, 4);
309 ASSERT((ip->i_df.if_real_bytes == 0) || 243 ASSERT(ip->i_df.if_real_bytes == 0 ||
310 (ip->i_df.if_real_bytes == data_bytes)); 244 ip->i_df.if_real_bytes == data_bytes);
311 vecp->i_len = (int)data_bytes; 245 ASSERT(ip->i_df.if_u1.if_data != NULL);
312 vecp->i_type = XLOG_REG_TYPE_ILOCAL; 246 ASSERT(ip->i_d.di_size > 0);
313 vecp++; 247 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
314 nvecs++; 248 ip->i_df.if_u1.if_data, data_bytes);
315 iip->ili_format.ilf_dsize = (unsigned)data_bytes; 249 ilf->ilf_dsize = (unsigned)data_bytes;
250 ilf->ilf_size++;
316 } else { 251 } else {
317 iip->ili_fields &= ~XFS_ILOG_DDATA; 252 iip->ili_fields &= ~XFS_ILOG_DDATA;
318 } 253 }
319 break; 254 break;
320
321 case XFS_DINODE_FMT_DEV: 255 case XFS_DINODE_FMT_DEV:
322 iip->ili_fields &= 256 iip->ili_fields &=
323 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 257 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
324 XFS_ILOG_DEXT | XFS_ILOG_UUID); 258 XFS_ILOG_DEXT | XFS_ILOG_UUID);
325 if (iip->ili_fields & XFS_ILOG_DEV) { 259 if (iip->ili_fields & XFS_ILOG_DEV)
326 iip->ili_format.ilf_u.ilfu_rdev = 260 ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev;
327 ip->i_df.if_u2.if_rdev;
328 }
329 break; 261 break;
330
331 case XFS_DINODE_FMT_UUID: 262 case XFS_DINODE_FMT_UUID:
332 iip->ili_fields &= 263 iip->ili_fields &=
333 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | 264 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
334 XFS_ILOG_DEXT | XFS_ILOG_DEV); 265 XFS_ILOG_DEXT | XFS_ILOG_DEV);
335 if (iip->ili_fields & XFS_ILOG_UUID) { 266 if (iip->ili_fields & XFS_ILOG_UUID)
336 iip->ili_format.ilf_u.ilfu_uuid = 267 ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid;
337 ip->i_df.if_u2.if_uuid;
338 }
339 break; 268 break;
340
341 default: 269 default:
342 ASSERT(0); 270 ASSERT(0);
343 break; 271 break;
344 } 272 }
273}
345 274
346 /* 275STATIC void
347 * If there are no attributes associated with the file, then we're done. 276xfs_inode_item_format_attr_fork(
348 */ 277 struct xfs_inode_log_item *iip,
349 if (!XFS_IFORK_Q(ip)) { 278 struct xfs_inode_log_format *ilf,
350 iip->ili_fields &= 279 struct xfs_log_vec *lv,
351 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); 280 struct xfs_log_iovec **vecp)
352 goto out; 281{
353 } 282 struct xfs_inode *ip = iip->ili_inode;
283 size_t data_bytes;
354 284
355 switch (ip->i_d.di_aformat) { 285 switch (ip->i_d.di_aformat) {
356 case XFS_DINODE_FMT_EXTENTS: 286 case XFS_DINODE_FMT_EXTENTS:
@@ -360,30 +290,22 @@ xfs_inode_item_format(
360 if ((iip->ili_fields & XFS_ILOG_AEXT) && 290 if ((iip->ili_fields & XFS_ILOG_AEXT) &&
361 ip->i_d.di_anextents > 0 && 291 ip->i_d.di_anextents > 0 &&
362 ip->i_afp->if_bytes > 0) { 292 ip->i_afp->if_bytes > 0) {
293 struct xfs_bmbt_rec *p;
294
363 ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == 295 ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
364 ip->i_d.di_anextents); 296 ip->i_d.di_anextents);
365 ASSERT(ip->i_afp->if_u1.if_extents != NULL); 297 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
366#ifdef XFS_NATIVE_HOST 298
367 /* 299 p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
368 * There are not delayed allocation extents 300 data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
369 * for attributes, so just point at the array. 301 xlog_finish_iovec(lv, *vecp, data_bytes);
370 */ 302
371 vecp->i_addr = ip->i_afp->if_u1.if_extents; 303 ilf->ilf_asize = data_bytes;
372 vecp->i_len = ip->i_afp->if_bytes; 304 ilf->ilf_size++;
373 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
374#else
375 ASSERT(iip->ili_aextents_buf == NULL);
376 xfs_inode_item_format_extents(ip, vecp,
377 XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
378#endif
379 iip->ili_format.ilf_asize = vecp->i_len;
380 vecp++;
381 nvecs++;
382 } else { 305 } else {
383 iip->ili_fields &= ~XFS_ILOG_AEXT; 306 iip->ili_fields &= ~XFS_ILOG_AEXT;
384 } 307 }
385 break; 308 break;
386
387 case XFS_DINODE_FMT_BTREE: 309 case XFS_DINODE_FMT_BTREE:
388 iip->ili_fields &= 310 iip->ili_fields &=
389 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); 311 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
@@ -392,61 +314,89 @@ xfs_inode_item_format(
392 ip->i_afp->if_broot_bytes > 0) { 314 ip->i_afp->if_broot_bytes > 0) {
393 ASSERT(ip->i_afp->if_broot != NULL); 315 ASSERT(ip->i_afp->if_broot != NULL);
394 316
395 vecp->i_addr = ip->i_afp->if_broot; 317 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
396 vecp->i_len = ip->i_afp->if_broot_bytes; 318 ip->i_afp->if_broot,
397 vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; 319 ip->i_afp->if_broot_bytes);
398 vecp++; 320 ilf->ilf_asize = ip->i_afp->if_broot_bytes;
399 nvecs++; 321 ilf->ilf_size++;
400 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
401 } else { 322 } else {
402 iip->ili_fields &= ~XFS_ILOG_ABROOT; 323 iip->ili_fields &= ~XFS_ILOG_ABROOT;
403 } 324 }
404 break; 325 break;
405
406 case XFS_DINODE_FMT_LOCAL: 326 case XFS_DINODE_FMT_LOCAL:
407 iip->ili_fields &= 327 iip->ili_fields &=
408 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); 328 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
409 329
410 if ((iip->ili_fields & XFS_ILOG_ADATA) && 330 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
411 ip->i_afp->if_bytes > 0) { 331 ip->i_afp->if_bytes > 0) {
412 ASSERT(ip->i_afp->if_u1.if_data != NULL);
413
414 vecp->i_addr = ip->i_afp->if_u1.if_data;
415 /* 332 /*
416 * Round i_bytes up to a word boundary. 333 * Round i_bytes up to a word boundary.
417 * The underlying memory is guaranteed 334 * The underlying memory is guaranteed
418 * to be there by xfs_idata_realloc(). 335 * to be there by xfs_idata_realloc().
419 */ 336 */
420 data_bytes = roundup(ip->i_afp->if_bytes, 4); 337 data_bytes = roundup(ip->i_afp->if_bytes, 4);
421 ASSERT((ip->i_afp->if_real_bytes == 0) || 338 ASSERT(ip->i_afp->if_real_bytes == 0 ||
422 (ip->i_afp->if_real_bytes == data_bytes)); 339 ip->i_afp->if_real_bytes == data_bytes);
423 vecp->i_len = (int)data_bytes; 340 ASSERT(ip->i_afp->if_u1.if_data != NULL);
424 vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; 341 xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
425 vecp++; 342 ip->i_afp->if_u1.if_data,
426 nvecs++; 343 data_bytes);
427 iip->ili_format.ilf_asize = (unsigned)data_bytes; 344 ilf->ilf_asize = (unsigned)data_bytes;
345 ilf->ilf_size++;
428 } else { 346 } else {
429 iip->ili_fields &= ~XFS_ILOG_ADATA; 347 iip->ili_fields &= ~XFS_ILOG_ADATA;
430 } 348 }
431 break; 349 break;
432
433 default: 350 default:
434 ASSERT(0); 351 ASSERT(0);
435 break; 352 break;
436 } 353 }
437
438out:
439 /*
440 * Now update the log format that goes out to disk from the in-core
441 * values. We always write the inode core to make the arithmetic
442 * games in recovery easier, which isn't a big deal as just about any
443 * transaction would dirty it anyway.
444 */
445 iip->ili_format.ilf_fields = XFS_ILOG_CORE |
446 (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
447 iip->ili_format.ilf_size = nvecs;
448} 354}
449 355
356/*
357 * This is called to fill in the vector of log iovecs for the given inode
358 * log item. It fills the first item with an inode log format structure,
359 * the second with the on-disk inode structure, and a possible third and/or
360 * fourth with the inode data/extents/b-tree root and inode attributes
361 * data/extents/b-tree root.
362 */
363STATIC void
364xfs_inode_item_format(
365 struct xfs_log_item *lip,
366 struct xfs_log_vec *lv)
367{
368 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
369 struct xfs_inode *ip = iip->ili_inode;
370 struct xfs_inode_log_format *ilf;
371 struct xfs_log_iovec *vecp = NULL;
372
373 ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
374 ilf->ilf_type = XFS_LI_INODE;
375 ilf->ilf_ino = ip->i_ino;
376 ilf->ilf_blkno = ip->i_imap.im_blkno;
377 ilf->ilf_len = ip->i_imap.im_len;
378 ilf->ilf_boffset = ip->i_imap.im_boffset;
379 ilf->ilf_fields = XFS_ILOG_CORE;
380 ilf->ilf_size = 2; /* format + core */
381 xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
382
383 if (ip->i_d.di_version == 1)
384 xfs_inode_item_format_v1_inode(ip);
385 xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE,
386 &ip->i_d,
387 xfs_icdinode_size(ip->i_d.di_version));
388
389 xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
390 if (XFS_IFORK_Q(ip)) {
391 xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
392 } else {
393 iip->ili_fields &=
394 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
395 }
396
397 /* update the format with the exact fields we actually logged */
398 ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
399}
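The refactored format routine keeps its bookkeeping in the log-format header it prepares as the first iovec. Reduced to a skeleton (an outline of the hunk above, not compilable on its own):

	ilf->ilf_fields = XFS_ILOG_CORE;
	ilf->ilf_size = 2;		/* format header + inode core */

	/* each fork helper that emits an extra iovec then does: */
	ilf->ilf_dsize = data_bytes;	/* ilf_asize for the attr fork */
	ilf->ilf_size++;

	/* and the final step records only what was actually logged: */
	ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);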
450 400
451/* 401/*
452 * This is called to pin the inode associated with the inode log 402 * This is called to pin the inode associated with the inode log
@@ -563,27 +513,6 @@ xfs_inode_item_unlock(
563 ASSERT(ip->i_itemp != NULL); 513 ASSERT(ip->i_itemp != NULL);
564 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 514 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
565 515
566 /*
567 * If the inode needed a separate buffer with which to log
568 * its extents, then free it now.
569 */
570 if (iip->ili_extents_buf != NULL) {
571 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
572 ASSERT(ip->i_d.di_nextents > 0);
573 ASSERT(iip->ili_fields & XFS_ILOG_DEXT);
574 ASSERT(ip->i_df.if_bytes > 0);
575 kmem_free(iip->ili_extents_buf);
576 iip->ili_extents_buf = NULL;
577 }
578 if (iip->ili_aextents_buf != NULL) {
579 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
580 ASSERT(ip->i_d.di_anextents > 0);
581 ASSERT(iip->ili_fields & XFS_ILOG_AEXT);
582 ASSERT(ip->i_afp->if_bytes > 0);
583 kmem_free(iip->ili_aextents_buf);
584 iip->ili_aextents_buf = NULL;
585 }
586
587 lock_flags = iip->ili_lock_flags; 516 lock_flags = iip->ili_lock_flags;
588 iip->ili_lock_flags = 0; 517 iip->ili_lock_flags = 0;
589 if (lock_flags) 518 if (lock_flags)
@@ -670,11 +599,6 @@ xfs_inode_item_init(
670 iip->ili_inode = ip; 599 iip->ili_inode = ip;
671 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, 600 xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
672 &xfs_inode_item_ops); 601 &xfs_inode_item_ops);
673 iip->ili_format.ilf_type = XFS_LI_INODE;
674 iip->ili_format.ilf_ino = ip->i_ino;
675 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
676 iip->ili_format.ilf_len = ip->i_imap.im_len;
677 iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
678} 602}
679 603
680/* 604/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index dce4d656768c..488d81254e28 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -34,11 +34,6 @@ typedef struct xfs_inode_log_item {
34 unsigned short ili_logged; /* flushed logged data */ 34 unsigned short ili_logged; /* flushed logged data */
35 unsigned int ili_last_fields; /* fields when flushed */ 35 unsigned int ili_last_fields; /* fields when flushed */
36 unsigned int ili_fields; /* fields to be logged */ 36 unsigned int ili_fields; /* fields to be logged */
37 struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
38 data exts */
39 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
40 attr exts */
41 xfs_inode_log_format_t ili_format; /* logged structure */
42} xfs_inode_log_item_t; 37} xfs_inode_log_item_t;
43 38
44static inline int xfs_inode_clean(xfs_inode_t *ip) 39static inline int xfs_inode_clean(xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 33ad9a77791f..bcfe61202115 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -112,15 +112,11 @@ xfs_find_handle(
112 memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); 112 memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
113 hsize = sizeof(xfs_fsid_t); 113 hsize = sizeof(xfs_fsid_t);
114 } else { 114 } else {
115 int lock_mode;
116
117 lock_mode = xfs_ilock_map_shared(ip);
118 handle.ha_fid.fid_len = sizeof(xfs_fid_t) - 115 handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
119 sizeof(handle.ha_fid.fid_len); 116 sizeof(handle.ha_fid.fid_len);
120 handle.ha_fid.fid_pad = 0; 117 handle.ha_fid.fid_pad = 0;
121 handle.ha_fid.fid_gen = ip->i_d.di_gen; 118 handle.ha_fid.fid_gen = ip->i_d.di_gen;
122 handle.ha_fid.fid_ino = ip->i_ino; 119 handle.ha_fid.fid_ino = ip->i_ino;
123 xfs_iunlock_map_shared(ip, lock_mode);
124 120
125 hsize = XFS_HSIZE(handle); 121 hsize = XFS_HSIZE(handle);
126 } 122 }
@@ -1587,7 +1583,7 @@ xfs_file_ioctl(
1587 XFS_IS_REALTIME_INODE(ip) ? 1583 XFS_IS_REALTIME_INODE(ip) ?
1588 mp->m_rtdev_targp : mp->m_ddev_targp; 1584 mp->m_rtdev_targp : mp->m_ddev_targp;
1589 1585
1590 da.d_mem = da.d_miniosz = 1 << target->bt_sshift; 1586 da.d_mem = da.d_miniosz = target->bt_logical_sectorsize;
1591 da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); 1587 da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
1592 1588
1593 if (copy_to_user(arg, &da, sizeof(da))) 1589 if (copy_to_user(arg, &da, sizeof(da)))
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 104455b8046c..9ddfb8190ca1 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -123,7 +123,7 @@ xfs_vn_mknod(
123{ 123{
124 struct inode *inode; 124 struct inode *inode;
125 struct xfs_inode *ip = NULL; 125 struct xfs_inode *ip = NULL;
126 struct posix_acl *default_acl = NULL; 126 struct posix_acl *default_acl, *acl;
127 struct xfs_name name; 127 struct xfs_name name;
128 int error; 128 int error;
129 129
@@ -139,14 +139,9 @@ xfs_vn_mknod(
139 rdev = 0; 139 rdev = 0;
140 } 140 }
141 141
142 if (IS_POSIXACL(dir)) { 142 error = posix_acl_create(dir, &mode, &default_acl, &acl);
143 default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); 143 if (error)
144 if (IS_ERR(default_acl)) 144 return error;
145 return PTR_ERR(default_acl);
146
147 if (!default_acl)
148 mode &= ~current_umask();
149 }
150 145
151 xfs_dentry_to_name(&name, dentry, mode); 146 xfs_dentry_to_name(&name, dentry, mode);
152 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); 147 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
@@ -159,22 +154,30 @@ xfs_vn_mknod(
159 if (unlikely(error)) 154 if (unlikely(error))
160 goto out_cleanup_inode; 155 goto out_cleanup_inode;
161 156
157#ifdef CONFIG_XFS_POSIX_ACL
162 if (default_acl) { 158 if (default_acl) {
163 error = -xfs_inherit_acl(inode, default_acl); 159 error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
164 default_acl = NULL; 160 if (error)
165 if (unlikely(error))
166 goto out_cleanup_inode; 161 goto out_cleanup_inode;
167 } 162 }
168 163 if (acl) {
164 error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
165 if (error)
166 goto out_cleanup_inode;
167 }
168#endif
169 169
170 d_instantiate(dentry, inode); 170 d_instantiate(dentry, inode);
171 out_free_acl:
172 if (default_acl)
173 posix_acl_release(default_acl);
174 if (acl)
175 posix_acl_release(acl);
171 return -error; 176 return -error;
172 177
173 out_cleanup_inode: 178 out_cleanup_inode:
174 xfs_cleanup_inode(dir, inode, dentry); 179 xfs_cleanup_inode(dir, inode, dentry);
175 out_free_acl: 180 goto out_free_acl;
176 posix_acl_release(default_acl);
177 return -error;
178} 181}
179 182
180STATIC int 183STATIC int
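For context, the hunk above moves XFS onto the generic VFS ACL helpers. A minimal sketch of the posix_acl_create() pattern it adopts (error paths trimmed; posix_acl_release() tolerates NULL, so both pointers can be released unconditionally):

	struct posix_acl *default_acl, *acl;
	int error;

	error = posix_acl_create(dir, &mode, &default_acl, &acl);
	if (error)
		return error;

	/* ... create the inode with the possibly umask-adjusted mode ... */

	if (default_acl)
		error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
	if (!error && acl)
		error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);

	posix_acl_release(default_acl);
	posix_acl_release(acl);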
@@ -391,18 +394,6 @@ xfs_vn_follow_link(
391 return NULL; 394 return NULL;
392} 395}
393 396
394STATIC void
395xfs_vn_put_link(
396 struct dentry *dentry,
397 struct nameidata *nd,
398 void *p)
399{
400 char *s = nd_get_link(nd);
401
402 if (!IS_ERR(s))
403 kfree(s);
404}
405
406STATIC int 397STATIC int
407xfs_vn_getattr( 398xfs_vn_getattr(
408 struct vfsmount *mnt, 399 struct vfsmount *mnt,
@@ -459,14 +450,12 @@ xfs_vn_getattr(
459 450
460static void 451static void
461xfs_setattr_mode( 452xfs_setattr_mode(
462 struct xfs_trans *tp,
463 struct xfs_inode *ip, 453 struct xfs_inode *ip,
464 struct iattr *iattr) 454 struct iattr *iattr)
465{ 455{
466 struct inode *inode = VFS_I(ip); 456 struct inode *inode = VFS_I(ip);
467 umode_t mode = iattr->ia_mode; 457 umode_t mode = iattr->ia_mode;
468 458
469 ASSERT(tp);
470 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 459 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
471 460
472 ip->i_d.di_mode &= S_IFMT; 461 ip->i_d.di_mode &= S_IFMT;
@@ -476,6 +465,32 @@ xfs_setattr_mode(
476 inode->i_mode |= mode & ~S_IFMT; 465 inode->i_mode |= mode & ~S_IFMT;
477} 466}
478 467
468static void
469xfs_setattr_time(
470 struct xfs_inode *ip,
471 struct iattr *iattr)
472{
473 struct inode *inode = VFS_I(ip);
474
475 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
476
477 if (iattr->ia_valid & ATTR_ATIME) {
478 inode->i_atime = iattr->ia_atime;
479 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
480 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
481 }
482 if (iattr->ia_valid & ATTR_CTIME) {
483 inode->i_ctime = iattr->ia_ctime;
484 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
485 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
486 }
487 if (iattr->ia_valid & ATTR_MTIME) {
488 inode->i_mtime = iattr->ia_mtime;
489 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
490 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
491 }
492}
493
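With xfs_setattr_time() factored out, both setattr paths collapse their mode and timestamp handling to the same two calls; a sketch of the consolidated call site as it appears in the hunks below:

	if (mask & ATTR_MODE)
		xfs_setattr_mode(ip, iattr);
	if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
		xfs_setattr_time(ip, iattr);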
479int 494int
480xfs_setattr_nonsize( 495xfs_setattr_nonsize(
481 struct xfs_inode *ip, 496 struct xfs_inode *ip,
@@ -630,30 +645,10 @@ xfs_setattr_nonsize(
630 } 645 }
631 } 646 }
632 647
633 /*
634 * Change file access modes.
635 */
636 if (mask & ATTR_MODE) 648 if (mask & ATTR_MODE)
637 xfs_setattr_mode(tp, ip, iattr); 649 xfs_setattr_mode(ip, iattr);
638 650 if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
639 /* 651 xfs_setattr_time(ip, iattr);
640 * Change file access or modified times.
641 */
642 if (mask & ATTR_ATIME) {
643 inode->i_atime = iattr->ia_atime;
644 ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
645 ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
646 }
647 if (mask & ATTR_CTIME) {
648 inode->i_ctime = iattr->ia_ctime;
649 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
650 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
651 }
652 if (mask & ATTR_MTIME) {
653 inode->i_mtime = iattr->ia_mtime;
654 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
655 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
656 }
657 652
658 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 653 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
659 654
@@ -684,7 +679,7 @@ xfs_setattr_nonsize(
684 * Posix ACL code seems to care about this issue either. 679 * Posix ACL code seems to care about this issue either.
685 */ 680 */
686 if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { 681 if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
687 error = -xfs_acl_chmod(inode); 682 error = -posix_acl_chmod(inode, inode->i_mode);
688 if (error) 683 if (error)
689 return XFS_ERROR(error); 684 return XFS_ERROR(error);
690 } 685 }
@@ -710,7 +705,6 @@ xfs_setattr_size(
710{ 705{
711 struct xfs_mount *mp = ip->i_mount; 706 struct xfs_mount *mp = ip->i_mount;
712 struct inode *inode = VFS_I(ip); 707 struct inode *inode = VFS_I(ip);
713 int mask = iattr->ia_valid;
714 xfs_off_t oldsize, newsize; 708 xfs_off_t oldsize, newsize;
715 struct xfs_trans *tp; 709 struct xfs_trans *tp;
716 int error; 710 int error;
@@ -731,8 +725,8 @@ xfs_setattr_size(
731 725
732 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 726 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
733 ASSERT(S_ISREG(ip->i_d.di_mode)); 727 ASSERT(S_ISREG(ip->i_d.di_mode));
734 ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| 728 ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
735 ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); 729 ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
736 730
737 oldsize = inode->i_size; 731 oldsize = inode->i_size;
738 newsize = iattr->ia_size; 732 newsize = iattr->ia_size;
@@ -741,7 +735,7 @@ xfs_setattr_size(
741 * Short circuit the truncate case for zero length files. 735 * Short circuit the truncate case for zero length files.
742 */ 736 */
743 if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { 737 if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
744 if (!(mask & (ATTR_CTIME|ATTR_MTIME))) 738 if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME)))
745 return 0; 739 return 0;
746 740
747 /* 741 /*
@@ -829,10 +823,11 @@ xfs_setattr_size(
829 * these flags set. For all other operations the VFS set these flags 823 * these flags set. For all other operations the VFS set these flags
830 * explicitly if it wants a timestamp update. 824 * explicitly if it wants a timestamp update.
831 */ 825 */
832 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { 826 if (newsize != oldsize &&
827 !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
833 iattr->ia_ctime = iattr->ia_mtime = 828 iattr->ia_ctime = iattr->ia_mtime =
834 current_fs_time(inode->i_sb); 829 current_fs_time(inode->i_sb);
835 mask |= ATTR_CTIME | ATTR_MTIME; 830 iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
836 } 831 }
837 832
838 /* 833 /*
@@ -868,22 +863,10 @@ xfs_setattr_size(
868 xfs_inode_clear_eofblocks_tag(ip); 863 xfs_inode_clear_eofblocks_tag(ip);
869 } 864 }
870 865
871 /* 866 if (iattr->ia_valid & ATTR_MODE)
872 * Change file access modes. 867 xfs_setattr_mode(ip, iattr);
873 */ 868 if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
874 if (mask & ATTR_MODE) 869 xfs_setattr_time(ip, iattr);
875 xfs_setattr_mode(tp, ip, iattr);
876
877 if (mask & ATTR_CTIME) {
878 inode->i_ctime = iattr->ia_ctime;
879 ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
880 ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
881 }
882 if (mask & ATTR_MTIME) {
883 inode->i_mtime = iattr->ia_mtime;
884 ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
885 ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
886 }
887 870
888 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 871 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
889 872
@@ -1053,6 +1036,7 @@ xfs_vn_fiemap(
1053 1036
1054static const struct inode_operations xfs_inode_operations = { 1037static const struct inode_operations xfs_inode_operations = {
1055 .get_acl = xfs_get_acl, 1038 .get_acl = xfs_get_acl,
1039 .set_acl = xfs_set_acl,
1056 .getattr = xfs_vn_getattr, 1040 .getattr = xfs_vn_getattr,
1057 .setattr = xfs_vn_setattr, 1041 .setattr = xfs_vn_setattr,
1058 .setxattr = generic_setxattr, 1042 .setxattr = generic_setxattr,
@@ -1080,6 +1064,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
1080 .mknod = xfs_vn_mknod, 1064 .mknod = xfs_vn_mknod,
1081 .rename = xfs_vn_rename, 1065 .rename = xfs_vn_rename,
1082 .get_acl = xfs_get_acl, 1066 .get_acl = xfs_get_acl,
1067 .set_acl = xfs_set_acl,
1083 .getattr = xfs_vn_getattr, 1068 .getattr = xfs_vn_getattr,
1084 .setattr = xfs_vn_setattr, 1069 .setattr = xfs_vn_setattr,
1085 .setxattr = generic_setxattr, 1070 .setxattr = generic_setxattr,
@@ -1106,6 +1091,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
1106 .mknod = xfs_vn_mknod, 1091 .mknod = xfs_vn_mknod,
1107 .rename = xfs_vn_rename, 1092 .rename = xfs_vn_rename,
1108 .get_acl = xfs_get_acl, 1093 .get_acl = xfs_get_acl,
1094 .set_acl = xfs_set_acl,
1109 .getattr = xfs_vn_getattr, 1095 .getattr = xfs_vn_getattr,
1110 .setattr = xfs_vn_setattr, 1096 .setattr = xfs_vn_setattr,
1111 .setxattr = generic_setxattr, 1097 .setxattr = generic_setxattr,
@@ -1118,8 +1104,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
1118static const struct inode_operations xfs_symlink_inode_operations = { 1104static const struct inode_operations xfs_symlink_inode_operations = {
1119 .readlink = generic_readlink, 1105 .readlink = generic_readlink,
1120 .follow_link = xfs_vn_follow_link, 1106 .follow_link = xfs_vn_follow_link,
1121 .put_link = xfs_vn_put_link, 1107 .put_link = kfree_put_link,
1122 .get_acl = xfs_get_acl,
1123 .getattr = xfs_vn_getattr, 1108 .getattr = xfs_vn_getattr,
1124 .setattr = xfs_vn_setattr, 1109 .setattr = xfs_vn_setattr,
1125 .setxattr = generic_setxattr, 1110 .setxattr = generic_setxattr,
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index d2c5057b5cc4..1c34e4335920 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -30,7 +30,7 @@ extern void xfs_setup_inode(struct xfs_inode *);
30/* 30/*
31 * Internal setattr interfaces. 31 * Internal setattr interfaces.
32 */ 32 */
33#define XFS_ATTR_NOACL 0x01 /* Don't call xfs_acl_chmod */ 33#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */
34 34
35extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, 35extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
36 int flags); 36 int flags);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c237ad15d500..f46338285152 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -209,9 +209,8 @@ xfs_bulkstat(
 	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
 	xfs_inobt_rec_incore_t	*irbufend; /* end of good irec buffer entries */
 	xfs_ino_t		lastino; /* last inode number returned */
-	int			nbcluster; /* # of blocks in a cluster */
-	int			nicluster; /* # of inodes in a cluster */
-	int			nimask;	/* mask for inode clusters */
+	int			blks_per_cluster; /* # of blocks per cluster */
+	int			inodes_per_cluster;/* # of inodes per cluster */
 	int			nirbuf;	/* size of irbuf */
 	int			rval;	/* return value error code */
 	int			tmp;	/* result value from btree calls */
@@ -243,11 +242,8 @@ xfs_bulkstat(
 	*done = 0;
 	fmterror = 0;
 	ubufp = ubuffer;
-	nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ?
-		mp->m_sb.sb_inopblock :
-		(XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
-	nimask = ~(nicluster - 1);
-	nbcluster = nicluster >> mp->m_sb.sb_inopblog;
+	blks_per_cluster = xfs_icluster_size_fsb(mp);
+	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
 	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
 	if (!irbuf)
 		return ENOMEM;
@@ -390,12 +386,12 @@ xfs_bulkstat(
 			agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
 			for (chunkidx = 0;
 			     chunkidx < XFS_INODES_PER_CHUNK;
-			     chunkidx += nicluster,
-			     agbno += nbcluster) {
-				if (xfs_inobt_maskn(chunkidx, nicluster)
-				    & ~r.ir_free)
+			     chunkidx += inodes_per_cluster,
+			     agbno += blks_per_cluster) {
+				if (xfs_inobt_maskn(chunkidx,
+						inodes_per_cluster) & ~r.ir_free)
 					xfs_btree_reada_bufs(mp, agno,
-						agbno, nbcluster,
+						agbno, blks_per_cluster,
 						&xfs_inode_buf_ops);
 			}
 			blk_finish_plug(&plug);
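
A worked example of the cluster geometry arithmetic used above, as a minimal standalone C sketch. The struct and helper names here are invented stand-ins for the xfs_mount fields and xfs_icluster_size_fsb(); the numbers (4 KiB blocks, 8 KiB inode clusters, 512-byte inodes) are illustrative only:

	#include <stdio.h>

	/* Invented stand-in for the handful of xfs_mount fields used above. */
	struct geom {
		unsigned int blocksize;		/* sb_blocksize */
		unsigned int blocklog;		/* log2(blocksize) */
		unsigned int inopblog;		/* log2(inodes per block) */
		unsigned int inode_cluster;	/* m_inode_cluster_size */
	};

	/* Mirrors what xfs_icluster_size_fsb() computes: the inode cluster
	 * size in filesystem blocks, never less than a single block. */
	static unsigned int icluster_size_fsb(const struct geom *g)
	{
		if (g->blocksize >= g->inode_cluster)
			return 1;
		return g->inode_cluster >> g->blocklog;
	}

	int main(void)
	{
		/* 4 KiB blocks, 8 KiB clusters, 512 B inodes -> 8 inodes/block. */
		struct geom g = { 4096, 12, 3, 8192 };
		unsigned int blks_per_cluster = icluster_size_fsb(&g);
		unsigned int inodes_per_cluster = blks_per_cluster << g.inopblog;

		/* Prints blks_per_cluster=2 inodes_per_cluster=16. */
		printf("blks_per_cluster=%u inodes_per_cluster=%u\n",
		       blks_per_cluster, inodes_per_cluster);
		return 0;
	}
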
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index e148719e0a5d..b0f4ef77fa70 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -30,6 +30,52 @@ struct xfs_log_vec {
 
 #define XFS_LOG_VEC_ORDERED	(-1)
 
+static inline void *
+xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+		uint type)
+{
+	struct xfs_log_iovec *vec = *vecp;
+
+	if (vec) {
+		ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+		vec++;
+	} else {
+		vec = &lv->lv_iovecp[0];
+	}
+
+	vec->i_type = type;
+	vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+	ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
+
+	*vecp = vec;
+	return vec->i_addr;
+}
+
+static inline void
+xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
+{
+	/*
+	 * We need to make sure the next buffer is naturally aligned for the
+	 * biggest basic data type we put into it. We already accounted for
+	 * this when sizing the buffer.
+	 */
+	lv->lv_buf_len += round_up(len, sizeof(uint64_t));
+	vec->i_len = len;
+}
+
+static inline void *
+xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+		uint type, void *data, int len)
+{
+	void *buf;
+
+	buf = xlog_prepare_iovec(lv, vecp, type);
+	memcpy(buf, data, len);
+	xlog_finish_iovec(lv, *vecp, len);
+	return buf;
+}
+
 /*
  * Structure used to pass callback function and the function's argument
  * to the log manager.
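
For context, a hedged sketch of how a converted log item's ->iop_format might use these helpers. The item type, the FOO_ITEM() accessor, the fli_* fields and the XLOG_REG_TYPE_FOO_* region types are all invented for illustration; only the helper calls themselves match the signatures added above:

	/* Hypothetical two-region item: a fixed header plus a payload. */
	static void
	xfs_foo_item_format(
		struct xfs_log_item	*lip,
		struct xfs_log_vec	*lv)
	{
		struct xfs_foo_log_item	*fip = FOO_ITEM(lip);	/* invented */
		struct xfs_log_iovec	*vecp = NULL;
		void			*buf;

		/* Copy the fixed in-core format structure into region one. */
		xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_FOO_FORMAT,
				&fip->fli_format, sizeof(fip->fli_format));

		/*
		 * Format the variable-length payload in place. Note that
		 * xlog_finish_iovec() records the real length but rounds the
		 * buffer usage up to 8 bytes, so the next region starts
		 * naturally aligned.
		 */
		buf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_FOO_DATA);
		memcpy(buf, fip->fli_data, fip->fli_len);
		xlog_finish_iovec(lv, vecp, fip->fli_len);
	}
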
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 5eb51fc5eb84..4ef6fdbced78 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -82,36 +82,6 @@ xlog_cil_init_post_recovery(
 			log->l_curr_block);
 }
 
-STATIC int
-xlog_cil_lv_item_format(
-	struct xfs_log_item	*lip,
-	struct xfs_log_vec	*lv)
-{
-	int	index;
-	char	*ptr;
-
-	/* format new vectors into array */
-	lip->li_ops->iop_format(lip, lv->lv_iovecp);
-
-	/* copy data into existing array */
-	ptr = lv->lv_buf;
-	for (index = 0; index < lv->lv_niovecs; index++) {
-		struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
-
-		memcpy(ptr, vec->i_addr, vec->i_len);
-		vec->i_addr = ptr;
-		ptr += vec->i_len;
-	}
-
-	/*
-	 * some size calculations for log vectors over-estimate, so the caller
-	 * doesn't know the amount of space actually used by the item. Return
-	 * the byte count to the caller so they can check and store it
-	 * appropriately.
-	 */
-	return ptr - lv->lv_buf;
-}
-
 /*
  * Prepare the log item for insertion into the CIL. Calculate the difference in
  * log space and vectors it will consume, and if it is a new item pin it as
@@ -232,12 +202,28 @@ xlog_cil_insert_format_items(
 			nbytes = 0;
 		}
 
+		/*
+		 * We 64-bit align the length of each iovec so that the start
+		 * of the next one is naturally aligned. We'll need to
+		 * account for that slack space here. Then round nbytes up
+		 * to 64-bit alignment so that the initial buffer alignment is
+		 * easy to calculate and verify.
+		 */
+		nbytes += niovecs * sizeof(uint64_t);
+		nbytes = round_up(nbytes, sizeof(uint64_t));
+
 		/* grab the old item if it exists for reservation accounting */
 		old_lv = lip->li_lv;
 
-		/* calc buffer size */
-		buf_size = sizeof(struct xfs_log_vec) + nbytes +
-				niovecs * sizeof(struct xfs_log_iovec);
+		/*
+		 * The data buffer needs to start 64-bit aligned, so round up
+		 * that space to ensure we can align it appropriately and not
+		 * overrun the buffer.
+		 */
+		buf_size = nbytes +
+			round_up((sizeof(struct xfs_log_vec) +
+				  niovecs * sizeof(struct xfs_log_iovec)),
+				 sizeof(uint64_t));
 
 		/* compare to existing item size */
 		if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
@@ -254,34 +240,29 @@ xlog_cil_insert_format_items(
 		 */
 		*diff_iovecs -= lv->lv_niovecs;
 		*diff_len -= lv->lv_buf_len;
-
-		/* Ensure the lv is set up according to ->iop_size */
-		lv->lv_niovecs = niovecs;
-		lv->lv_buf = (char *)lv + buf_size - nbytes;
-
-		lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
-		goto insert;
+	} else {
+		/* allocate new data chunk */
+		lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+		lv->lv_item = lip;
+		lv->lv_size = buf_size;
+		if (ordered) {
+			/* track as an ordered logvec */
+			ASSERT(lip->li_lv == NULL);
+			lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+			goto insert;
+		}
+		lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
 	}
 
-	/* allocate new data chunk */
-	lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
-	lv->lv_item = lip;
-	lv->lv_size = buf_size;
+	/* Ensure the lv is set up according to ->iop_size */
 	lv->lv_niovecs = niovecs;
-	if (ordered) {
-		/* track as an ordered logvec */
-		ASSERT(lip->li_lv == NULL);
-		lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
-		goto insert;
-	}
-
-	/* The allocated iovec region lies beyond the log vector. */
-	lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
 
 	/* The allocated data region lies beyond the iovec region */
+	lv->lv_buf_len = 0;
 	lv->lv_buf = (char *)lv + buf_size - nbytes;
+	ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
 
-	lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
+	lip->li_ops->iop_format(lip, lv);
 insert:
 	ASSERT(lv->lv_buf_len <= nbytes);
 	xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
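
The new size accounting can be checked with a small standalone C sketch. The struct layouts are simplified stand-ins for the real XFS headers, and the example numbers (two iovecs, 52 payload bytes) are invented:

	#include <stdio.h>
	#include <stdint.h>

	#define round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))

	/* Simplified stand-ins for the real structures in the XFS headers. */
	struct iovec_hdr { void *i_addr; int i_len; uint32_t i_type; };
	struct vec_hdr	 { void *next; struct iovec_hdr *iovecp; char *buf;
			   int niovecs; int size; int buf_len; void *item; };

	int main(void)
	{
		int niovecs = 2;
		int nbytes = 52;	/* raw bytes reported by ->iop_size, say */
		int buf_size;

		/* Slack for rounding each iovec, then align the total:
		 * 52 + 2*8 = 68, rounded up to 72. */
		nbytes += niovecs * sizeof(uint64_t);
		nbytes = round_up(nbytes, sizeof(uint64_t));

		/* Header area rounded up so the data buffer that follows it
		 * starts 64-bit aligned. */
		buf_size = nbytes +
			round_up(sizeof(struct vec_hdr) +
				 niovecs * sizeof(struct iovec_hdr),
				 sizeof(uint64_t));

		printf("nbytes=%d buf_size=%d\n", nbytes, buf_size);
		return 0;
	}
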
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index eae16920655b..bce53ac81096 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1654,6 +1654,7 @@ xlog_recover_reorder_trans(
 	int			pass)
 {
 	xlog_recover_item_t	*item, *n;
+	int			error = 0;
 	LIST_HEAD(sort_list);
 	LIST_HEAD(cancel_list);
 	LIST_HEAD(buffer_list);
@@ -1695,9 +1696,17 @@ xlog_recover_reorder_trans(
 				"%s: unrecognized type of log operation",
 				__func__);
 			ASSERT(0);
-			return XFS_ERROR(EIO);
+			/*
+			 * return the remaining items back to the transaction
+			 * item list so they can be freed in caller.
+			 */
+			if (!list_empty(&sort_list))
+				list_splice_init(&sort_list, &trans->r_itemq);
+			error = XFS_ERROR(EIO);
+			goto out;
 		}
 	}
+out:
 	ASSERT(list_empty(&sort_list));
 	if (!list_empty(&buffer_list))
 		list_splice(&buffer_list, &trans->r_itemq);
@@ -1707,7 +1716,7 @@ xlog_recover_reorder_trans(
 		list_splice_tail(&inode_buffer_list, &trans->r_itemq);
 	if (!list_empty(&cancel_list))
 		list_splice_tail(&cancel_list, &trans->r_itemq);
-	return 0;
+	return error;
 }
 
 /*
@@ -2517,19 +2526,19 @@ xlog_recover_buffer_pass2(
 	 *
 	 * Also make sure that only inode buffers with good sizes stay in
 	 * the buffer cache. The kernel moves inodes in buffers of 1 block
-	 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
+	 * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode
 	 * buffers in the log can be a different size if the log was generated
 	 * by an older kernel using unclustered inode buffers or a newer kernel
 	 * running with a different inode cluster size. Regardless, if the
-	 * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
-	 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
+	 * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
+	 * for *our* value of mp->m_inode_cluster_size, then we need to keep
 	 * the buffer out of the buffer cache so that the buffer won't
 	 * overlap with future reads of those inodes.
 	 */
 	if (XFS_DINODE_MAGIC ==
 	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
 	    (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
-			(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
+			(__uint32_t)log->l_mp->m_inode_cluster_size))) {
 		xfs_buf_stale(bp);
 		error = xfs_bwrite(bp);
 	} else {
@@ -3202,10 +3211,10 @@ xlog_recover_do_icreate_pass2(
 	}
 
 	/* existing allocation is fixed value */
-	ASSERT(count == XFS_IALLOC_INODES(mp));
-	ASSERT(length == XFS_IALLOC_BLOCKS(mp));
-	if (count != XFS_IALLOC_INODES(mp) ||
-	     length != XFS_IALLOC_BLOCKS(mp)) {
+	ASSERT(count == mp->m_ialloc_inos);
+	ASSERT(length == mp->m_ialloc_blks);
+	if (count != mp->m_ialloc_inos ||
+	     length != mp->m_ialloc_blks) {
 		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
 		return EINVAL;
 	}
@@ -3611,8 +3620,10 @@ xlog_recover_process_data(
 				error = XFS_ERROR(EIO);
 				break;
 			}
-			if (error)
+			if (error) {
+				xlog_recover_free_trans(trans);
 				return error;
+			}
 		}
 		dp += be32_to_cpu(ohead->oh_len);
 		num_logops--;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 02df7b408a26..f96c05669a9e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -282,22 +282,29 @@ xfs_readsb(
 	struct xfs_sb	*sbp = &mp->m_sb;
 	int		error;
 	int		loud = !(flags & XFS_MFSI_QUIET);
+	const struct xfs_buf_ops *buf_ops;
 
 	ASSERT(mp->m_sb_bp == NULL);
 	ASSERT(mp->m_ddev_targp != NULL);
 
 	/*
+	 * For the initial read, we must guess at the sector
+	 * size based on the block device. It's enough to
+	 * get the sb_sectsize out of the superblock and
+	 * then reread with the proper length.
+	 * We don't verify it yet, because it may not be complete.
+	 */
+	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
+	buf_ops = NULL;
+
+	/*
 	 * Allocate a (locked) buffer to hold the superblock.
 	 * This will be kept around at all times to optimize
 	 * access to the superblock.
 	 */
-	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-
 reread:
 	bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
-				   BTOBB(sector_size), 0,
-				   loud ? &xfs_sb_buf_ops
-					: &xfs_sb_quiet_buf_ops);
+				   BTOBB(sector_size), 0, buf_ops);
 	if (!bp) {
 		if (loud)
 			xfs_warn(mp, "SB buffer read failed");
@@ -328,12 +335,13 @@ reread:
 	}
 
 	/*
-	 * If device sector size is smaller than the superblock size,
-	 * re-read the superblock so the buffer is correctly sized.
+	 * Re-read the superblock so the buffer is correctly sized,
+	 * and properly verified.
 	 */
-	if (sector_size < sbp->sb_sectsize) {
+	if (buf_ops == NULL) {
 		xfs_buf_relse(bp);
 		sector_size = sbp->sb_sectsize;
+		buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
 		goto reread;
 	}
 
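
The reread loop above now uses buf_ops as the "second pass" flag: pass one reads with a guessed sector size and no verifier, pass two rereads with the real sb_sectsize and a verifier attached. A minimal standalone C sketch of that pattern; every type and helper here (guess_sector_size(), read_sb(), the *_sb_ops tables) is a hypothetical stand-in for the real XFS buffer interfaces:

	#include <stdio.h>
	#include <stdlib.h>

	struct buf_ops { const char *name; };
	struct buf { int size; };
	struct mount { int sb_sectsize; };

	static const struct buf_ops verbose_sb_ops = { "verbose" };
	static const struct buf_ops quiet_sb_ops = { "quiet" };

	/* Pretend the device advertises 512-byte sectors... */
	static int guess_sector_size(struct mount *mp) { (void)mp; return 512; }

	/* ...but the superblock was actually written with 4096-byte sectors. */
	static struct buf *read_sb(struct mount *mp, int size,
				   const struct buf_ops *ops)
	{
		struct buf *bp = malloc(sizeof(*bp));
		bp->size = size;
		printf("read %d bytes, verifier=%s\n",
		       size, ops ? ops->name : "none");
		mp->sb_sectsize = 4096;	/* stands in for parsing the sb */
		return bp;
	}

	static int read_superblock(struct mount *mp, int loud)
	{
		const struct buf_ops *buf_ops = NULL;	/* pass 1: no verifier */
		int sector_size = guess_sector_size(mp);
		struct buf *bp;

	reread:
		bp = read_sb(mp, sector_size, buf_ops);
		if (!bp)
			return -5;
		if (buf_ops == NULL) {
			/* Pass 2: correct size, and this time verify it. */
			free(bp);
			sector_size = mp->sb_sectsize;
			buf_ops = loud ? &verbose_sb_ops : &quiet_sb_ops;
			goto reread;
		}
		free(bp);
		return 0;
	}

	int main(void)
	{
		struct mount mp;
		return read_superblock(&mp, 1);
	}
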
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index dd88f0e27bd8..348e4d2ed6e6 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1222,16 +1222,18 @@ xfs_qm_dqiterate(
 	lblkno = 0;
 	maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
 	do {
+		uint		lock_mode;
+
 		nmaps = XFS_DQITER_MAP_SIZE;
 		/*
 		 * We aren't changing the inode itself. Just changing
 		 * some of its data. No new blocks are added here, and
 		 * the inode is never added to the transaction.
 		 */
-		xfs_ilock(qip, XFS_ILOCK_SHARED);
+		lock_mode = xfs_ilock_data_map_shared(qip);
 		error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
 				       map, &nmaps, 0);
-		xfs_iunlock(qip, XFS_ILOCK_SHARED);
+		xfs_iunlock(qip, lock_mode);
 		if (error)
 			break;
 
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index a788b66a5cb1..797fd4636273 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -20,13 +20,29 @@
 
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
-#include "xfs_quota_priv.h"
 
 struct xfs_inode;
 
 extern struct kmem_zone	*xfs_qm_dqtrxzone;
 
 /*
+ * Number of bmaps that we ask from bmapi when doing a quotacheck.
+ * We make this restriction to keep the memory usage to a minimum.
+ */
+#define XFS_DQITER_MAP_SIZE	10
+
+#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
+	!dqp->q_core.d_blk_hardlimit && \
+	!dqp->q_core.d_blk_softlimit && \
+	!dqp->q_core.d_rtb_hardlimit && \
+	!dqp->q_core.d_rtb_softlimit && \
+	!dqp->q_core.d_ino_hardlimit && \
+	!dqp->q_core.d_ino_softlimit && \
+	!dqp->q_core.d_bcount && \
+	!dqp->q_core.d_rtbcount && \
+	!dqp->q_core.d_icount)
+
+/*
  * This defines the unit of allocation of dquots.
  * Currently, it is just one file system block, and a 4K blk contains 30
  * (136 * 30 = 4080) dquots. It's probably not worth trying to make
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 437c9198031a..3daf5ea1eb8d 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -278,7 +278,7 @@ xfs_qm_scall_trunc_qfiles(
 	xfs_mount_t	*mp,
 	uint		flags)
 {
-	int		error = 0, error2 = 0;
+	int		error;
 
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
 		xfs_debug(mp, "%s: flags=%x m_qflags=%x",
@@ -286,14 +286,20 @@ xfs_qm_scall_trunc_qfiles(
 		return XFS_ERROR(EINVAL);
 	}
 
-	if (flags & XFS_DQ_USER)
+	if (flags & XFS_DQ_USER) {
 		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
-	if (flags & XFS_DQ_GROUP)
-		error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+		if (error)
+			return error;
+	}
+	if (flags & XFS_DQ_GROUP) {
+		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+		if (error)
+			return error;
+	}
 	if (flags & XFS_DQ_PROJ)
-		error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
+		error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
 
-	return error ? error : error2;
+	return error;
 }
 
 /*
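
The rewrite above matters because the old version funneled both the group and project truncations through the same error2 variable, so a group failure could be silently overwritten by a later project success. A small standalone C sketch of the fixed flow, with an invented trunc() that fails for the group file:

	#include <stdio.h>

	#define DQ_USER		0x1
	#define DQ_GROUP	0x2
	#define DQ_PROJ		0x4

	/* Hypothetical per-file truncation; here the group file fails. */
	static int trunc(int type)
	{
		return type == DQ_GROUP ? -5 : 0;
	}

	static int trunc_qfiles(unsigned int flags)
	{
		int error;

		if (flags & DQ_USER) {
			error = trunc(DQ_USER);
			if (error)
				return error;
		}
		if (flags & DQ_GROUP) {
			error = trunc(DQ_GROUP);
			if (error)
				return error;	/* old error2 could lose this */
		}
		if (flags & DQ_PROJ)
			return trunc(DQ_PROJ);
		return 0;
	}

	int main(void)
	{
		/* Prints -5; the pre-patch logic would have returned 0 here,
		 * because the project pass overwrote error2 with 0. */
		printf("%d\n", trunc_qfiles(DQ_USER | DQ_GROUP | DQ_PROJ));
		return 0;
	}
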
diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
deleted file mode 100644
index 6d86219d93da..000000000000
--- a/fs/xfs/xfs_quota_priv.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_QUOTA_PRIV_H__
-#define __XFS_QUOTA_PRIV_H__
-
-/*
- * Number of bmaps that we ask from bmapi when doing a quotacheck.
- * We make this restriction to keep the memory usage to a minimum.
- */
-#define XFS_DQITER_MAP_SIZE	10
-
-#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
-	!dqp->q_core.d_blk_hardlimit && \
-	!dqp->q_core.d_blk_softlimit && \
-	!dqp->q_core.d_rtb_hardlimit && \
-	!dqp->q_core.d_rtb_softlimit && \
-	!dqp->q_core.d_ino_hardlimit && \
-	!dqp->q_core.d_ino_softlimit && \
-	!dqp->q_core.d_bcount && \
-	!dqp->q_core.d_rtbcount && \
-	!dqp->q_core.d_icount)
-
-#define DQFLAGTO_TYPESTR(d)	(((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
-				 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
-				  (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
-
-#endif	/* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index b7c9aea77f8f..1e116794bb66 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -295,8 +295,7 @@ xfs_mount_validate_sb(
 	    sbp->sb_dblocks == 0 ||
 	    sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
 	    sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
-		XFS_CORRUPTION_ERROR("SB sanity check failed",
-				XFS_ERRLEVEL_LOW, mp, sbp);
+		xfs_notice(mp, "SB sanity check failed");
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
@@ -611,10 +610,10 @@ xfs_sb_read_verify(
 					XFS_SB_VERSION_5) ||
 	     dsb->sb_crc != 0)) {
 
-		if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
+		if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
 				      offsetof(struct xfs_sb, sb_crc))) {
 			/* Only fail bad secondaries on a known V5 filesystem */
-			if (bp->b_bn != XFS_SB_DADDR &&
+			if (bp->b_bn == XFS_SB_DADDR ||
 			    xfs_sb_version_hascrc(&mp->m_sb)) {
 				error = EFSCORRUPTED;
 				goto out_error;
@@ -625,7 +624,7 @@ xfs_sb_read_verify(
 
 out_error:
 	if (error) {
-		if (error != EWRONGFS)
+		if (error == EFSCORRUPTED)
 			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
 					     mp, bp->b_addr);
 		xfs_buf_ioerror(bp, error);
@@ -644,7 +643,6 @@ xfs_sb_quiet_read_verify(
 {
 	struct xfs_dsb	*dsb = XFS_BUF_TO_SBP(bp);
 
-
 	if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
 		/* XFS filesystem, verify noisily! */
 		xfs_sb_read_verify(bp);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f317488263dd..d971f4932b5d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -913,7 +913,7 @@ xfs_flush_inodes(
 	struct super_block	*sb = mp->m_super;
 
 	if (down_read_trylock(&sb->s_umount)) {
-		sync_inodes_sb(sb, jiffies);
+		sync_inodes_sb(sb);
 		up_read(&sb->s_umount);
 	}
 }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9b96d35e483d..b5bc1ab3c4da 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -64,7 +64,7 @@ typedef struct xfs_log_item {
 
 struct xfs_item_ops {
 	void (*iop_size)(xfs_log_item_t *, int *, int *);
-	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
+	void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *);
 	void (*iop_pin)(xfs_log_item_t *);
 	void (*iop_unpin)(xfs_log_item_t *, int remove);
 	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index cd2a10e15d3a..41172861e857 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -295,8 +295,8 @@ xfs_trans_mod_dquot(
 /*
  * Given an array of dqtrx structures, lock all the dquots associated and join
  * them to the transaction, provided they have been modified. We know that the
- * highest number of dquots of one type - usr, grp OR prj - involved in a
- * transaction is 2 so we don't need to make this very generic.
+ * highest number of dquots of one type - usr, grp and prj - involved in a
+ * transaction is 3 so we don't need to make this very generic.
  */
 STATIC void
 xfs_trans_dqlockedjoin(
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index 2fd59c0dae66..2ffd3e331b49 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -174,7 +174,7 @@ xfs_calc_itruncate_reservation(
 	    xfs_calc_buf_res(5, 0) +
 	    xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
 			     XFS_FSB_TO_B(mp, 1)) +
-	    xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+	    xfs_calc_buf_res(2 + mp->m_ialloc_blks +
 			     mp->m_in_maxlevels, 0)));
 }
 
@@ -282,7 +282,7 @@ xfs_calc_create_resv_modify(
  * For create we can allocate some inodes giving:
  * the agi and agf of the ag getting the new inodes: 2 * sectorsize
  * the superblock for the nlink flag: sector size
- * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ * the inode blocks allocated: mp->m_ialloc_blks * blocksize
  * the inode btree: max depth * blocksize
  * the allocation btrees: 2 trees * (max depth - 1) * block size
  */
@@ -292,7 +292,7 @@ xfs_calc_create_resv_alloc(
 {
 	return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
 		mp->m_sb.sb_sectsize +
-		xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+		xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
 		xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
 		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
 				 XFS_FSB_TO_B(mp, 1));
@@ -385,9 +385,9 @@ xfs_calc_ifree_reservation(
 		xfs_calc_inode_res(mp, 1) +
 		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
 		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-		max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) +
+		max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) +
 		xfs_calc_buf_res(1, 0) +
-		xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+		xfs_calc_buf_res(2 + mp->m_ialloc_blks +
 				 mp->m_in_maxlevels, 0) +
 		xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
 				 XFS_FSB_TO_B(mp, 1));
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 7d2c920dfb9c..af5dbe06cb65 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -47,7 +47,7 @@
 #define XFS_DIRREMOVE_SPACE_RES(mp)	\
 	XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
 #define XFS_IALLOC_SPACE_RES(mp)	\
-	(XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
+	((mp)->m_ialloc_blks + (mp)->m_in_maxlevels - 1)
 
 /*
  * Space reservation values for various transactions.
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index 3e8e797c6d11..e8a77383c0d5 100644
--- a/fs/xfs/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
@@ -35,15 +35,6 @@ struct attrlist_cursor_kern;
 	{ IO_INVIS,	"INVIS"}
 
 /*
- * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
- */
-#define FI_NONE			0	/* none */
-#define FI_REMAPF		1	/* Do a remapf prior to the operation */
-#define FI_REMAPF_LOCKED	2	/* Do a remapf prior to the operation.
-					   Prevent VM access to the pages until
-					   the operation completes. */
-
-/*
  * Some useful predicates.
  */
 #define VN_MAPPED(vp)	mapping_mapped(vp->i_mapping)
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 9d479073ba41..78ed92a46fdd 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -102,8 +102,8 @@ const struct xattr_handler *xfs_xattr_handlers[] = {
 	&xfs_xattr_trusted_handler,
 	&xfs_xattr_security_handler,
 #ifdef CONFIG_XFS_POSIX_ACL
-	&xfs_xattr_acl_access_handler,
-	&xfs_xattr_acl_default_handler,
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
 #endif
 	NULL
 };