Diffstat (limited to 'fs')
-rw-r--r--	fs/Kconfig	15
-rw-r--r--	fs/Makefile	1
-rw-r--r--	fs/adfs/inode.c	15
-rw-r--r--	fs/affs/file.c	18
-rw-r--r--	fs/affs/inode.c	5
-rw-r--r--	fs/attr.c	11
-rw-r--r--	fs/autofs4/autofs_i.h	8
-rw-r--r--	fs/autofs4/dev-ioctl.c	4
-rw-r--r--	fs/autofs4/expire.c	9
-rw-r--r--	fs/autofs4/inode.c	24
-rw-r--r--	fs/autofs4/root.c	83
-rw-r--r--	fs/autofs4/waitq.c	5
-rw-r--r--	fs/bad_inode.c	2
-rw-r--r--	fs/bfs/file.c	15
-rw-r--r--	fs/binfmt_aout.c	5
-rw-r--r--	fs/binfmt_elf.c	9
-rw-r--r--	fs/binfmt_elf_fdpic.c	6
-rw-r--r--	fs/binfmt_em86.c	5
-rw-r--r--	fs/binfmt_flat.c	5
-rw-r--r--	fs/binfmt_misc.c	15
-rw-r--r--	fs/binfmt_script.c	12
-rw-r--r--	fs/binfmt_som.c	5
-rw-r--r--	fs/block_dev.c	4
-rw-r--r--	fs/btrfs/Makefile	2
-rw-r--r--	fs/btrfs/acl.c	2
-rw-r--r--	fs/btrfs/backref.c	16
-rw-r--r--	fs/btrfs/btrfs_inode.h	4
-rw-r--r--	fs/btrfs/check-integrity.c	31
-rw-r--r--	fs/btrfs/compression.c	6
-rw-r--r--	fs/btrfs/ctree.c	229
-rw-r--r--	fs/btrfs/ctree.h	184
-rw-r--r--	fs/btrfs/delayed-inode.c	11
-rw-r--r--	fs/btrfs/dev-replace.c	856
-rw-r--r--	fs/btrfs/dev-replace.h	44
-rw-r--r--	fs/btrfs/dir-item.c	59
-rw-r--r--	fs/btrfs/disk-io.c	146
-rw-r--r--	fs/btrfs/disk-io.h	4
-rw-r--r--	fs/btrfs/extent-tree.c	233
-rw-r--r--	fs/btrfs/extent_io.c	37
-rw-r--r--	fs/btrfs/extent_io.h	4
-rw-r--r--	fs/btrfs/extent_map.c	40
-rw-r--r--	fs/btrfs/extent_map.h	3
-rw-r--r--	fs/btrfs/file-item.c	25
-rw-r--r--	fs/btrfs/file.c	435
-rw-r--r--	fs/btrfs/free-space-cache.c	71
-rw-r--r--	fs/btrfs/inode-map.c	5
-rw-r--r--	fs/btrfs/inode.c	617
-rw-r--r--	fs/btrfs/ioctl.c	406
-rw-r--r--	fs/btrfs/ioctl.h	48
-rw-r--r--	fs/btrfs/math.h	44
-rw-r--r--	fs/btrfs/ordered-data.c	90
-rw-r--r--	fs/btrfs/ordered-data.h	9
-rw-r--r--	fs/btrfs/print-tree.c	3
-rw-r--r--	fs/btrfs/qgroup.c	20
-rw-r--r--	fs/btrfs/reada.c	31
-rw-r--r--	fs/btrfs/relocation.c	40
-rw-r--r--	fs/btrfs/root-tree.c	4
-rw-r--r--	fs/btrfs/scrub.c	1836
-rw-r--r--	fs/btrfs/send.c	12
-rw-r--r--	fs/btrfs/super.c	50
-rw-r--r--	fs/btrfs/transaction.c	187
-rw-r--r--	fs/btrfs/transaction.h	2
-rw-r--r--	fs/btrfs/tree-log.c	485
-rw-r--r--	fs/btrfs/volumes.c	981
-rw-r--r--	fs/btrfs/volumes.h	35
-rw-r--r--	fs/btrfs/xattr.c	13
-rw-r--r--	fs/buffer.c	19
-rw-r--r--	fs/cachefiles/interface.c	57
-rw-r--r--	fs/cachefiles/internal.h	2
-rw-r--r--	fs/cachefiles/key.c	2
-rw-r--r--	fs/cachefiles/namei.c	3
-rw-r--r--	fs/cachefiles/rdwr.c	114
-rw-r--r--	fs/cachefiles/xattr.c	2
-rw-r--r--	fs/ceph/addr.c	60
-rw-r--r--	fs/ceph/caps.c	18
-rw-r--r--	fs/ceph/dir.c	4
-rw-r--r--	fs/ceph/export.c	4
-rw-r--r--	fs/ceph/file.c	79
-rw-r--r--	fs/ceph/inode.c	15
-rw-r--r--	fs/ceph/mds_client.c	11
-rw-r--r--	fs/ceph/super.c	4
-rw-r--r--	fs/cifs/Kconfig	10
-rw-r--r--	fs/cifs/README	2
-rw-r--r--	fs/cifs/cifs_debug.h	72
-rw-r--r--	fs/cifs/cifs_dfs_ref.c	2
-rw-r--r--	fs/cifs/cifsacl.c	758
-rw-r--r--	fs/cifs/cifsacl.h	66
-rw-r--r--	fs/cifs/cifsfs.c	25
-rw-r--r--	fs/cifs/cifsglob.h	36
-rw-r--r--	fs/cifs/cifsproto.h	10
-rw-r--r--	fs/cifs/connect.c	313
-rw-r--r--	fs/cifs/dir.c	32
-rw-r--r--	fs/cifs/file.c	209
-rw-r--r--	fs/cifs/inode.c	7
-rw-r--r--	fs/cifs/netmisc.c	14
-rw-r--r--	fs/cifs/readdir.c	69
-rw-r--r--	fs/cifs/smb1ops.c	40
-rw-r--r--	fs/cifs/smb2file.c	12
-rw-r--r--	fs/cifs/smb2ops.c	105
-rw-r--r--	fs/cifs/smb2pdu.c	5
-rw-r--r--	fs/cifs/smb2proto.h	4
-rw-r--r--	fs/cifs/smb2transport.c	13
-rw-r--r--	fs/cifs/transport.c	6
-rw-r--r--	fs/compat_ioctl.c	3
-rw-r--r--	fs/configfs/dir.c	4
-rw-r--r--	fs/coredump.c	4
-rw-r--r--	fs/dcache.c	35
-rw-r--r--	fs/debugfs/inode.c	3
-rw-r--r--	fs/devpts/inode.c	61
-rw-r--r--	fs/dlm/Kconfig	2
-rw-r--r--	fs/dlm/dlm_internal.h	1
-rw-r--r--	fs/dlm/lock.c	16
-rw-r--r--	fs/dlm/lowcomms.c	5
-rw-r--r--	fs/dlm/recover.c	37
-rw-r--r--	fs/dlm/user.c	8
-rw-r--r--	fs/ecryptfs/crypto.c	2
-rw-r--r--	fs/ecryptfs/kthread.c	6
-rw-r--r--	fs/ecryptfs/mmap.c	12
-rw-r--r--	fs/eventfd.c	20
-rw-r--r--	fs/eventpoll.c	50
-rw-r--r--	fs/exec.c	89
-rw-r--r--	fs/exofs/inode.c	16
-rw-r--r--	fs/exportfs/expfs.c	23
-rw-r--r--	fs/ext3/dir.c	6
-rw-r--r--	fs/ext3/inode.c	3
-rw-r--r--	fs/ext3/super.c	3
-rw-r--r--	fs/ext4/Kconfig	17
-rw-r--r--	fs/ext4/Makefile	4
-rw-r--r--	fs/ext4/acl.c	6
-rw-r--r--	fs/ext4/dir.c	47
-rw-r--r--	fs/ext4/ext4.h	167
-rw-r--r--	fs/ext4/ext4_extents.h	40
-rw-r--r--	fs/ext4/ext4_jbd2.h	7
-rw-r--r--	fs/ext4/extents.c	502
-rw-r--r--	fs/ext4/extents_status.c	500
-rw-r--r--	fs/ext4/extents_status.h	45
-rw-r--r--	fs/ext4/file.c	346
-rw-r--r--	fs/ext4/fsync.c	8
-rw-r--r--	fs/ext4/ialloc.c	6
-rw-r--r--	fs/ext4/indirect.c	5
-rw-r--r--	fs/ext4/inline.c	1884
-rw-r--r--	fs/ext4/inode.c	728
-rw-r--r--	fs/ext4/mballoc.c	60
-rw-r--r--	fs/ext4/migrate.c	1
-rw-r--r--	fs/ext4/move_extent.c	1
-rw-r--r--	fs/ext4/namei.c	535
-rw-r--r--	fs/ext4/page-io.c	3
-rw-r--r--	fs/ext4/resize.c	17
-rw-r--r--	fs/ext4/super.c	87
-rw-r--r--	fs/ext4/symlink.c	4
-rw-r--r--	fs/ext4/xattr.c	110
-rw-r--r--	fs/ext4/xattr.h	158
-rw-r--r--	fs/f2fs/Kconfig	53
-rw-r--r--	fs/f2fs/Makefile	7
-rw-r--r--	fs/f2fs/acl.c	412
-rw-r--r--	fs/f2fs/acl.h	57
-rw-r--r--	fs/f2fs/checkpoint.c	793
-rw-r--r--	fs/f2fs/data.c	718
-rw-r--r--	fs/f2fs/debug.c	353
-rw-r--r--	fs/f2fs/dir.c	674
-rw-r--r--	fs/f2fs/f2fs.h	1087
-rw-r--r--	fs/f2fs/file.c	646
-rw-r--r--	fs/f2fs/gc.c	716
-rw-r--r--	fs/f2fs/gc.h	117
-rw-r--r--	fs/f2fs/hash.c	101
-rw-r--r--	fs/f2fs/inode.c	272
-rw-r--r--	fs/f2fs/namei.c	503
-rw-r--r--	fs/f2fs/node.c	1760
-rw-r--r--	fs/f2fs/node.h	353
-rw-r--r--	fs/f2fs/recovery.c	377
-rw-r--r--	fs/f2fs/segment.c	1757
-rw-r--r--	fs/f2fs/segment.h	631
-rw-r--r--	fs/f2fs/super.c	701
-rw-r--r--	fs/f2fs/xattr.c	443
-rw-r--r--	fs/f2fs/xattr.h	145
-rw-r--r--	fs/fat/dir.c	5
-rw-r--r--	fs/fat/fat.h	3
-rw-r--r--	fs/fat/inode.c	57
-rw-r--r--	fs/fat/misc.c	13
-rw-r--r--	fs/fhandle.c	6
-rw-r--r--	fs/file.c	8
-rw-r--r--	fs/file_table.c	2
-rw-r--r--	fs/fs-writeback.c	2
-rw-r--r--	fs/fs_struct.c	24
-rw-r--r--	fs/fscache/cache.c	8
-rw-r--r--	fs/fscache/cookie.c	78
-rw-r--r--	fs/fscache/internal.h	15
-rw-r--r--	fs/fscache/object-list.c	2
-rw-r--r--	fs/fscache/object.c	101
-rw-r--r--	fs/fscache/operation.c	140
-rw-r--r--	fs/fscache/page.c	195
-rw-r--r--	fs/fscache/stats.c	17
-rw-r--r--	fs/fuse/Kconfig	16
-rw-r--r--	fs/fuse/cuse.c	36
-rw-r--r--	fs/fuse/dev.c	9
-rw-r--r--	fs/fuse/dir.c	20
-rw-r--r--	fs/fuse/file.c	13
-rw-r--r--	fs/fuse/fuse_i.h	4
-rw-r--r--	fs/fuse/inode.c	23
-rw-r--r--	fs/gfs2/aops.c	2
-rw-r--r--	fs/gfs2/bmap.c	54
-rw-r--r--	fs/gfs2/dir.c	7
-rw-r--r--	fs/gfs2/file.c	14
-rw-r--r--	fs/gfs2/glock.c	42
-rw-r--r--	fs/gfs2/glock.h	54
-rw-r--r--	fs/gfs2/glops.c	19
-rw-r--r--	fs/gfs2/incore.h	6
-rw-r--r--	fs/gfs2/inode.c	209
-rw-r--r--	fs/gfs2/lock_dlm.c	26
-rw-r--r--	fs/gfs2/ops_fstype.c	3
-rw-r--r--	fs/gfs2/quota.c	10
-rw-r--r--	fs/gfs2/rgrp.c	174
-rw-r--r--	fs/gfs2/rgrp.h	3
-rw-r--r--	fs/gfs2/trace_gfs2.h	2
-rw-r--r--	fs/gfs2/xattr.c	2
-rw-r--r--	fs/hfs/inode.c	26
-rw-r--r--	fs/hfsplus/bitmap.c	13
-rw-r--r--	fs/hfsplus/btree.c	5
-rw-r--r--	fs/hfsplus/extents.c	24
-rw-r--r--	fs/hfsplus/hfsplus_fs.h	2
-rw-r--r--	fs/hfsplus/inode.c	27
-rw-r--r--	fs/hfsplus/super.c	15
-rw-r--r--	fs/hpfs/file.c	20
-rw-r--r--	fs/hpfs/hpfs_fn.h	1
-rw-r--r--	fs/hpfs/inode.c	5
-rw-r--r--	fs/hppfs/hppfs.c	2
-rw-r--r--	fs/hugetlbfs/inode.c	111
-rw-r--r--	fs/inode.c	2
-rw-r--r--	fs/jbd/journal.c	3
-rw-r--r--	fs/jbd/transaction.c	2
-rw-r--r--	fs/jbd2/journal.c	1
-rw-r--r--	fs/jbd2/transaction.c	43
-rw-r--r--	fs/jffs2/nodemgmt.c	6
-rw-r--r--	fs/jfs/file.c	6
-rw-r--r--	fs/jfs/inode.c	20
-rw-r--r--	fs/libfs.c	6
-rw-r--r--	fs/lockd/clnt4xdr.c	8
-rw-r--r--	fs/lockd/clntproc.c	3
-rw-r--r--	fs/lockd/clntxdr.c	8
-rw-r--r--	fs/lockd/host.c	15
-rw-r--r--	fs/lockd/mon.c	3
-rw-r--r--	fs/logfs/inode.c	2
-rw-r--r--	fs/logfs/readwrite.c	10
-rw-r--r--	fs/minix/file.c	6
-rw-r--r--	fs/minix/inode.c	17
-rw-r--r--	fs/mount.h	3
-rw-r--r--	fs/namei.c	118
-rw-r--r--	fs/namespace.c	214
-rw-r--r--	fs/ncpfs/inode.c	4
-rw-r--r--	fs/ncpfs/mmap.c	2
-rw-r--r--	fs/nfs/Makefile	2
-rw-r--r--	fs/nfs/blocklayout/blocklayout.c	1
-rw-r--r--	fs/nfs/cache_lib.c	1
-rw-r--r--	fs/nfs/callback.h	4
-rw-r--r--	fs/nfs/callback_proc.c	19
-rw-r--r--	fs/nfs/callback_xdr.c	5
-rw-r--r--	fs/nfs/client.c	9
-rw-r--r--	fs/nfs/dir.c	44
-rw-r--r--	fs/nfs/direct.c	17
-rw-r--r--	fs/nfs/file.c	10
-rw-r--r--	fs/nfs/fscache.c	1
-rw-r--r--	fs/nfs/fscache.h	21
-rw-r--r--	fs/nfs/idmap.c	12
-rw-r--r--	fs/nfs/inode.c	30
-rw-r--r--	fs/nfs/internal.h	42
-rw-r--r--	fs/nfs/mount_clnt.c	7
-rw-r--r--	fs/nfs/namespace.c	20
-rw-r--r--	fs/nfs/nfs2xdr.c	4
-rw-r--r--	fs/nfs/nfs3proc.c	6
-rw-r--r--	fs/nfs/nfs3xdr.c	7
-rw-r--r--	fs/nfs/nfs4_fs.h	29
-rw-r--r--	fs/nfs/nfs4client.c	67
-rw-r--r--	fs/nfs/nfs4file.c	3
-rw-r--r--	fs/nfs/nfs4filelayout.c	45
-rw-r--r--	fs/nfs/nfs4filelayoutdev.c	3
-rw-r--r--	fs/nfs/nfs4proc.c	839
-rw-r--r--	fs/nfs/nfs4session.c	552
-rw-r--r--	fs/nfs/nfs4session.h	142
-rw-r--r--	fs/nfs/nfs4state.c	165
-rw-r--r--	fs/nfs/nfs4super.c	1
-rw-r--r--	fs/nfs/nfs4xdr.c	52
-rw-r--r--	fs/nfs/objlayout/objlayout.c	11
-rw-r--r--	fs/nfs/pnfs.c	19
-rw-r--r--	fs/nfs/proc.c	43
-rw-r--r--	fs/nfs/read.c	10
-rw-r--r--	fs/nfs/super.c	45
-rw-r--r--	fs/nfs/write.c	44
-rw-r--r--	fs/nfsd/fault_inject.c	113
-rw-r--r--	fs/nfsd/fault_inject.h	28
-rw-r--r--	fs/nfsd/netns.h	66
-rw-r--r--	fs/nfsd/nfs2acl.c	2
-rw-r--r--	fs/nfsd/nfs3acl.c	2
-rw-r--r--	fs/nfsd/nfs3proc.c	6
-rw-r--r--	fs/nfsd/nfs3xdr.c	47
-rw-r--r--	fs/nfsd/nfs4callback.c	69
-rw-r--r--	fs/nfsd/nfs4proc.c	74
-rw-r--r--	fs/nfsd/nfs4recover.c	561
-rw-r--r--	fs/nfsd/nfs4state.c	1015
-rw-r--r--	fs/nfsd/nfs4xdr.c	324
-rw-r--r--	fs/nfsd/nfsctl.c	100
-rw-r--r--	fs/nfsd/nfsd.h	36
-rw-r--r--	fs/nfsd/nfsfh.c	4
-rw-r--r--	fs/nfsd/nfssvc.c	203
-rw-r--r--	fs/nfsd/nfsxdr.c	11
-rw-r--r--	fs/nfsd/state.h	64
-rw-r--r--	fs/nfsd/vfs.c	52
-rw-r--r--	fs/nfsd/xdr4.h	15
-rw-r--r--	fs/nilfs2/file.c	1
-rw-r--r--	fs/nilfs2/inode.c	24
-rw-r--r--	fs/nilfs2/ioctl.c	5
-rw-r--r--	fs/nilfs2/nilfs.h	1
-rw-r--r--	fs/nilfs2/page.c	2
-rw-r--r--	fs/nilfs2/recovery.c	3
-rw-r--r--	fs/notify/Makefile	2
-rw-r--r--	fs/notify/dnotify/dnotify.c	4
-rw-r--r--	fs/notify/fanotify/Kconfig	2
-rw-r--r--	fs/notify/fanotify/fanotify.c	6
-rw-r--r--	fs/notify/fanotify/fanotify_user.c	39
-rw-r--r--	fs/notify/fdinfo.c	179
-rw-r--r--	fs/notify/fdinfo.h	27
-rw-r--r--	fs/notify/group.c	47
-rw-r--r--	fs/notify/inode_mark.c	19
-rw-r--r--	fs/notify/inotify/inotify_fsnotify.c	4
-rw-r--r--	fs/notify/inotify/inotify_user.c	36
-rw-r--r--	fs/notify/mark.c	91
-rw-r--r--	fs/notify/notification.c	3
-rw-r--r--	fs/notify/vfsmount_mark.c	14
-rw-r--r--	fs/ntfs/file.c	16
-rw-r--r--	fs/ntfs/inode.c	8
-rw-r--r--	fs/ntfs/inode.h	4
-rw-r--r--	fs/ocfs2/extent_map.c	12
-rw-r--r--	fs/ocfs2/file.c	29
-rw-r--r--	fs/omfs/file.c	22
-rw-r--r--	fs/open.c	99
-rw-r--r--	fs/pnode.h	1
-rw-r--r--	fs/proc/Makefile	1
-rw-r--r--	fs/proc/array.c	27
-rw-r--r--	fs/proc/base.c	186
-rw-r--r--	fs/proc/fd.c	2
-rw-r--r--	fs/proc/generic.c	48
-rw-r--r--	fs/proc/inode.c	6
-rw-r--r--	fs/proc/internal.h	1
-rw-r--r--	fs/proc/kcore.c	2
-rw-r--r--	fs/proc/namespaces.c	185
-rw-r--r--	fs/proc/proc_devtree.c	6
-rw-r--r--	fs/proc/proc_sysctl.c	16
-rw-r--r--	fs/proc/root.c	17
-rw-r--r--	fs/proc/self.c	59
-rw-r--r--	fs/proc/task_mmu.c	61
-rw-r--r--	fs/pstore/ftrace.c	4
-rw-r--r--	fs/pstore/inode.c	13
-rw-r--r--	fs/pstore/internal.h	2
-rw-r--r--	fs/pstore/platform.c	13
-rw-r--r--	fs/pstore/ram.c	43
-rw-r--r--	fs/pstore/ram_core.c	9
-rw-r--r--	fs/quota/quota.c	4
-rw-r--r--	fs/read_write.c	42
-rw-r--r--	fs/reiserfs/file.c	3
-rw-r--r--	fs/reiserfs/inode.c	15
-rw-r--r--	fs/reiserfs/reiserfs.h	1
-rw-r--r--	fs/seq_file.c	6
-rw-r--r--	fs/signalfd.c	18
-rw-r--r--	fs/splice.c	9
-rw-r--r--	fs/stat.c	16
-rw-r--r--	fs/statfs.c	9
-rw-r--r--	fs/sysfs/file.c	4
-rw-r--r--	fs/sysfs/mount.c	1
-rw-r--r--	fs/sysv/file.c	5
-rw-r--r--	fs/sysv/itree.c	17
-rw-r--r--	fs/ubifs/debug.c	8
-rw-r--r--	fs/ubifs/dir.c	4
-rw-r--r--	fs/udf/inode.c	14
-rw-r--r--	fs/udf/super.c	3
-rw-r--r--	fs/ufs/inode.c	15
-rw-r--r--	fs/utimes.c	6
-rw-r--r--	fs/xattr.c	72
-rw-r--r--	fs/xfs/Kconfig	1
-rw-r--r--	fs/xfs/Makefile	4
-rw-r--r--	fs/xfs/uuid.h	6
-rw-r--r--	fs/xfs/xfs_ag.h	5
-rw-r--r--	fs/xfs/xfs_alloc.c	140
-rw-r--r--	fs/xfs/xfs_alloc.h	3
-rw-r--r--	fs/xfs/xfs_alloc_btree.c	77
-rw-r--r--	fs/xfs/xfs_alloc_btree.h	2
-rw-r--r--	fs/xfs/xfs_aops.c	85
-rw-r--r--	fs/xfs/xfs_attr.c	103
-rw-r--r--	fs/xfs/xfs_attr_leaf.c	143
-rw-r--r--	fs/xfs/xfs_attr_leaf.h	6
-rw-r--r--	fs/xfs/xfs_bmap.c	70
-rw-r--r--	fs/xfs/xfs_bmap_btree.c	63
-rw-r--r--	fs/xfs/xfs_bmap_btree.h	1
-rw-r--r--	fs/xfs/xfs_btree.c	111
-rw-r--r--	fs/xfs/xfs_btree.h	22
-rw-r--r--	fs/xfs/xfs_buf.c	91
-rw-r--r--	fs/xfs/xfs_buf.h	33
-rw-r--r--	fs/xfs/xfs_buf_item.c	61
-rw-r--r--	fs/xfs/xfs_buf_item.h	2
-rw-r--r--	fs/xfs/xfs_cksum.h	63
-rw-r--r--	fs/xfs/xfs_da_btree.c	141
-rw-r--r--	fs/xfs/xfs_da_btree.h	10
-rw-r--r--	fs/xfs/xfs_dfrag.c	13
-rw-r--r--	fs/xfs/xfs_dir2_block.c	434
-rw-r--r--	fs/xfs/xfs_dir2_data.c	170
-rw-r--r--	fs/xfs/xfs_dir2_leaf.c	172
-rw-r--r--	fs/xfs/xfs_dir2_node.c	288
-rw-r--r--	fs/xfs/xfs_dir2_priv.h	19
-rw-r--r--	fs/xfs/xfs_dquot.c	134
-rw-r--r--	fs/xfs/xfs_dquot.h	2
-rw-r--r--	fs/xfs/xfs_export.c	1
-rw-r--r--	fs/xfs/xfs_file.c	42
-rw-r--r--	fs/xfs/xfs_fs.h	33
-rw-r--r--	fs/xfs/xfs_fs_subr.c	96
-rw-r--r--	fs/xfs/xfs_fsops.c	141
-rw-r--r--	fs/xfs/xfs_globals.c	4
-rw-r--r--	fs/xfs/xfs_ialloc.c	83
-rw-r--r--	fs/xfs/xfs_ialloc.h	4
-rw-r--r--	fs/xfs/xfs_ialloc_btree.c	55
-rw-r--r--	fs/xfs/xfs_ialloc_btree.h	2
-rw-r--r--	fs/xfs/xfs_icache.c (renamed from fs/xfs/xfs_sync.c)	914
-rw-r--r--	fs/xfs/xfs_icache.h (renamed from fs/xfs/xfs_sync.h)	28
-rw-r--r--	fs/xfs/xfs_iget.c	705
-rw-r--r--	fs/xfs/xfs_inode.c	437
-rw-r--r--	fs/xfs/xfs_inode.h	12
-rw-r--r--	fs/xfs/xfs_ioctl.c	21
-rw-r--r--	fs/xfs/xfs_iomap.c	40
-rw-r--r--	fs/xfs/xfs_iops.c	8
-rw-r--r--	fs/xfs/xfs_itable.c	4
-rw-r--r--	fs/xfs/xfs_linux.h	2
-rw-r--r--	fs/xfs/xfs_log.c	241
-rw-r--r--	fs/xfs/xfs_log.h	4
-rw-r--r--	fs/xfs/xfs_log_priv.h	12
-rw-r--r--	fs/xfs/xfs_log_recover.c	146
-rw-r--r--	fs/xfs/xfs_mount.c	163
-rw-r--r--	fs/xfs/xfs_mount.h	13
-rw-r--r--	fs/xfs/xfs_qm.c	22
-rw-r--r--	fs/xfs/xfs_qm_syscalls.c	10
-rw-r--r--	fs/xfs/xfs_rtalloc.c	16
-rw-r--r--	fs/xfs/xfs_sb.h	7
-rw-r--r--	fs/xfs/xfs_super.c	148
-rw-r--r--	fs/xfs/xfs_super.h	1
-rw-r--r--	fs/xfs/xfs_sysctl.c	9
-rw-r--r--	fs/xfs/xfs_sysctl.h	1
-rw-r--r--	fs/xfs/xfs_trace.h	61
-rw-r--r--	fs/xfs/xfs_trans.h	19
-rw-r--r--	fs/xfs/xfs_trans_buf.c	36
-rw-r--r--	fs/xfs/xfs_vnodeops.c	168
-rw-r--r--	fs/xfs/xfs_vnodeops.h	9
447 files changed, 33404 insertions, 9981 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index f95ae3a027f3..780725a463b1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -28,8 +28,8 @@ config FS_MBCACHE
 	tristate
 	default y if EXT2_FS=y && EXT2_FS_XATTR
 	default y if EXT3_FS=y && EXT3_FS_XATTR
-	default y if EXT4_FS=y && EXT4_FS_XATTR
-	default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
+	default y if EXT4_FS=y
+	default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS
 
 source "fs/reiserfs/Kconfig"
 source "fs/jfs/Kconfig"
@@ -68,16 +68,6 @@ source "fs/quota/Kconfig"
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
 
-config CUSE
-	tristate "Character device in Userspace support"
-	depends on FUSE_FS
-	help
-	  This FUSE extension allows character devices to be
-	  implemented in userspace.
-
-	  If you want to develop or use userspace character device
-	  based on CUSE, answer Y or M.
-
 config GENERIC_ACL
 	bool
 	select FS_POSIX_ACL
@@ -220,6 +210,7 @@ source "fs/pstore/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
+source "fs/f2fs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Makefile b/fs/Makefile
index 1d7af79288a0..9d53192236fc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
+obj-$(CONFIG_F2FS_FS)		+= f2fs/
 obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e9bad5093a3f..5f95d1ed9c6d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -45,6 +45,14 @@ static int adfs_readpage(struct file *file, struct page *page)
 	return block_read_full_page(page, adfs_get_block);
 }
 
+static void adfs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size)
+		truncate_pagecache(inode, to, inode->i_size);
+}
+
 static int adfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
@@ -55,11 +63,8 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,
 	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				adfs_get_block,
 				&ADFS_I(mapping->host)->mmu_private);
-	if (unlikely(ret)) {
-		loff_t isize = mapping->host->i_size;
-		if (pos + len > isize)
-			vmtruncate(mapping->host, isize);
-	}
+	if (unlikely(ret))
+		adfs_write_failed(mapping, pos + len);
 
 	return ret;
 }
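The adfs hunks above are one instance of a pattern this series applies across several filesystems (affs and bfs below get the same treatment): the deprecated vmtruncate() call in the write_begin() error path is replaced by a small per-filesystem write_failed() helper that drops pagecache instantiated beyond the old i_size. A minimal sketch of the pattern, using hypothetical foofs_* names rather than any one filesystem:

	/* Sketch of the pattern introduced above; foofs_get_block is a
	 * stand-in for the filesystem's own get_block callback. */
	static void foofs_write_failed(struct address_space *mapping, loff_t to)
	{
		struct inode *inode = mapping->host;

		/* a short write may have instantiated blocks past i_size */
		if (to > inode->i_size)
			truncate_pagecache(inode, to, inode->i_size);
	}

	static int foofs_write_begin(struct file *file,
				     struct address_space *mapping,
				     loff_t pos, unsigned len, unsigned flags,
				     struct page **pagep, void **fsdata)
	{
		int ret;

		ret = block_write_begin(mapping, pos, len, flags, pagep,
					foofs_get_block);
		if (unlikely(ret))
			foofs_write_failed(mapping, pos + len);
		return ret;
	}

Filesystems that also track on-disk allocation in a private truncate routine (affs below is one) additionally call that routine from the helper after truncate_pagecache().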
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 2f4c935cb327..af3261b78102 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -39,7 +39,6 @@ const struct file_operations affs_file_operations = {
 };
 
 const struct inode_operations affs_file_inode_operations = {
-	.truncate	= affs_truncate,
 	.setattr	= affs_notify_change,
 };
 
@@ -402,6 +401,16 @@ static int affs_readpage(struct file *file, struct page *page)
 	return block_read_full_page(page, affs_get_block);
 }
 
+static void affs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size) {
+		truncate_pagecache(inode, to, inode->i_size);
+		affs_truncate(inode);
+	}
+}
+
 static int affs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
@@ -412,11 +421,8 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
 	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
 				affs_get_block,
 				&AFFS_I(mapping->host)->mmu_private);
-	if (unlikely(ret)) {
-		loff_t isize = mapping->host->i_size;
-		if (pos + len > isize)
-			vmtruncate(mapping->host, isize);
-	}
+	if (unlikely(ret))
+		affs_write_failed(mapping, pos + len);
 
 	return ret;
 }
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 15c484268229..0e092d08680e 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -237,9 +237,12 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
 
 	if ((attr->ia_valid & ATTR_SIZE) &&
 	    attr->ia_size != i_size_read(inode)) {
-		error = vmtruncate(inode, attr->ia_size);
+		error = inode_newsize_ok(inode, attr->ia_size);
 		if (error)
 			return error;
+
+		truncate_setsize(inode, attr->ia_size);
+		affs_truncate(inode);
 	}
 
 	setattr_copy(inode, attr);
diff --git a/fs/attr.c b/fs/attr.c
index cce7df53b694..1449adb14ef6 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -49,14 +49,15 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
 	/* Make sure a caller can chown. */
 	if ((ia_valid & ATTR_UID) &&
 	    (!uid_eq(current_fsuid(), inode->i_uid) ||
-	     !uid_eq(attr->ia_uid, inode->i_uid)) && !capable(CAP_CHOWN))
+	     !uid_eq(attr->ia_uid, inode->i_uid)) &&
+	    !inode_capable(inode, CAP_CHOWN))
 		return -EPERM;
 
 	/* Make sure caller can chgrp. */
 	if ((ia_valid & ATTR_GID) &&
 	    (!uid_eq(current_fsuid(), inode->i_uid) ||
 	    (!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
-	    !capable(CAP_CHOWN))
+	    !inode_capable(inode, CAP_CHOWN))
 		return -EPERM;
 
 	/* Make sure a caller can chmod. */
@@ -65,7 +66,8 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
 		return -EPERM;
 	/* Also check the setgid bit! */
 	if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
-			inode->i_gid) && !capable(CAP_FSETID))
+			inode->i_gid) &&
+	    !inode_capable(inode, CAP_FSETID))
 		attr->ia_mode &= ~S_ISGID;
 	}
 
@@ -157,7 +159,8 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;
 
-		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+		if (!in_group_p(inode->i_gid) &&
+		    !inode_capable(inode, CAP_FSETID))
 			mode &= ~S_ISGID;
 		inode->i_mode = mode;
 	}
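These attr.c hunks swap capable() for inode_capable() in the chown/chgrp and setgid-stripping checks, making them user-namespace aware. As a loosely held assumption (the function itself is defined outside this diff, in the user-namespace series), inode_capable() pairs a namespace-scoped capability test with a check that the inode's owner actually maps into the caller's namespace, roughly:

	/* Sketch only; assumed shape of inode_capable(), which is not
	 * part of this diff. */
	bool inode_capable(const struct inode *inode, int cap)
	{
		struct user_namespace *ns = current_user_ns();

		/* capable in the caller's namespace, and the inode's
		 * owner must have a mapping there */
		return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
	}

This keeps a namespace-local root from chowning files whose owner is not even representable inside that namespace.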
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 908e18455413..b785e7707959 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -74,8 +74,8 @@ struct autofs_info {
 	unsigned long last_used;
 	atomic_t count;
 
-	uid_t uid;
-	gid_t gid;
+	kuid_t uid;
+	kgid_t gid;
 };
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
@@ -89,8 +89,8 @@ struct autofs_wait_queue {
 	struct qstr name;
 	u32 dev;
 	u64 ino;
-	uid_t uid;
-	gid_t gid;
+	kuid_t uid;
+	kgid_t gid;
 	pid_t pid;
 	pid_t tgid;
 	/* This is for status reporting upon return */
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index a16214109d31..9f68a37bb2b2 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -437,8 +437,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 		err = 0;
 		autofs4_expire_wait(path.dentry);
 		spin_lock(&sbi->fs_lock);
-		param->requester.uid = ino->uid;
-		param->requester.gid = ino->gid;
+		param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
+		param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
 		spin_unlock(&sbi->fs_lock);
 	}
 	path_put(&path);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 842d00048a65..01443ce43ee7 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -548,15 +548,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 
 	spin_lock(&sbi->fs_lock);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
-	spin_lock(&dentry->d_lock);
-	if (!ret) {
-		if ((IS_ROOT(dentry) ||
-		    (autofs_type_indirect(sbi->type) &&
-		     IS_ROOT(dentry->d_parent))) &&
-		    !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-			__managed_dentry_set_automount(dentry);
-	}
-	spin_unlock(&dentry->d_lock);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 	dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 8a4fed8ead30..b104726e2d0a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -36,8 +36,8 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
 
 void autofs4_clean_ino(struct autofs_info *ino)
 {
-	ino->uid = 0;
-	ino->gid = 0;
+	ino->uid = GLOBAL_ROOT_UID;
+	ino->gid = GLOBAL_ROOT_GID;
 	ino->last_used = jiffies;
 }
 
@@ -79,10 +79,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
 		return 0;
 
 	seq_printf(m, ",fd=%d", sbi->pipefd);
-	if (root_inode->i_uid != 0)
-		seq_printf(m, ",uid=%u", root_inode->i_uid);
-	if (root_inode->i_gid != 0)
-		seq_printf(m, ",gid=%u", root_inode->i_gid);
+	if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID))
+		seq_printf(m, ",uid=%u",
+			from_kuid_munged(&init_user_ns, root_inode->i_uid));
+	if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
+		seq_printf(m, ",gid=%u",
+			from_kgid_munged(&init_user_ns, root_inode->i_gid));
 	seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
 	seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
 	seq_printf(m, ",minproto=%d", sbi->min_proto);
@@ -126,7 +128,7 @@ static const match_table_t tokens = {
 	{Opt_err, NULL}
 };
 
-static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
+static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
 		pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
 {
 	char *p;
@@ -159,12 +161,16 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
 		case Opt_uid:
 			if (match_int(args, &option))
 				return 1;
-			*uid = option;
+			*uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(*uid))
+				return 1;
 			break;
 		case Opt_gid:
 			if (match_int(args, &option))
 				return 1;
-			*gid = option;
+			*gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(*gid))
+				return 1;
 			break;
 		case Opt_pgrp:
 			if (match_int(args, &option))
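The autofs4 hunks follow the standard kuid_t/kgid_t conversion recipe: raw integers from userspace are converted with make_kuid()/make_kgid() at the boundary and validated, while values reported back out are converted with from_kuid_munged()/from_kgid_munged(). Condensed from the hunks above into one round trip ("option" stands for any integer parsed from a mount option):

	/* mount-option parsing: userspace integer -> kernel kuid_t */
	kuid_t uid = make_kuid(current_user_ns(), option);
	if (!uid_valid(uid))
		return 1;	/* no mapping in this namespace: reject */

	/* showing options: kernel kuid_t -> userspace integer */
	seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, uid));

The waitq.c hunk further below performs the same from_kuid_munged() conversion, but relative to the daemon pipe opener's namespace (sbi->pipe->f_cred->user_ns), so the automount daemon sees ids expressed in its own namespace.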
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 91b11650722e..c93447604da8 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -124,13 +124,10 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	 * it.
 	 */
 	spin_lock(&sbi->lookup_lock);
-	spin_lock(&dentry->d_lock);
-	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-		spin_unlock(&dentry->d_lock);
+	if (!d_mountpoint(dentry) && simple_empty(dentry)) {
 		spin_unlock(&sbi->lookup_lock);
 		return -ENOENT;
 	}
-	spin_unlock(&dentry->d_lock);
 	spin_unlock(&sbi->lookup_lock);
 
 out:
@@ -355,7 +352,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 		status = autofs4_mount_wait(dentry);
 		if (status)
 			return ERR_PTR(status);
-		spin_lock(&sbi->fs_lock);
 		goto done;
 	}
 
@@ -364,8 +360,11 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 	 * having d_mountpoint() true, so there's no need to call back
 	 * to the daemon.
 	 */
-	if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
+	if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+		spin_unlock(&sbi->fs_lock);
 		goto done;
+	}
+
 	if (!d_mountpoint(dentry)) {
 		/*
 		 * It's possible that user space hasn't removed directories
@@ -379,15 +378,13 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 		 * require user space behave.
 		 */
 		if (sbi->version > 4) {
-			if (have_submounts(dentry))
+			if (have_submounts(dentry)) {
+				spin_unlock(&sbi->fs_lock);
 				goto done;
+			}
 		} else {
-			spin_lock(&dentry->d_lock);
-			if (!list_empty(&dentry->d_subdirs)) {
-				spin_unlock(&dentry->d_lock);
+			if (!simple_empty(dentry))
 				goto done;
-			}
-			spin_unlock(&dentry->d_lock);
 		}
 		ino->flags |= AUTOFS_INF_PENDING;
 		spin_unlock(&sbi->fs_lock);
@@ -399,28 +396,8 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 			return ERR_PTR(status);
 		}
 	}
-done:
-	if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
-		/*
-		 * Any needed mounting has been completed and the path
-		 * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
-		 * call ->d_automount() on rootless multi-mounts since
-		 * it can lead to an incorrect ELOOP error return.
-		 *
-		 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
-		 * symlinks as in all other cases the dentry will be covered by
-		 * an actual mount so ->d_automount() won't be called during
-		 * the follow.
-		 */
-		spin_lock(&dentry->d_lock);
-		if ((!d_mountpoint(dentry) &&
-		    !list_empty(&dentry->d_subdirs)) ||
-		    (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
-			__managed_dentry_clear_automount(dentry);
-		spin_unlock(&dentry->d_lock);
-	}
 	spin_unlock(&sbi->fs_lock);
-
+done:
 	/* Mount succeeded, check if we ended up with a new dentry */
 	dentry = autofs4_mountpoint_changed(path);
 	if (!dentry)
@@ -432,6 +409,8 @@ done:
 int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	int status;
 
 	DPRINTK("dentry=%p %.*s",
 		dentry, dentry->d_name.len, dentry->d_name.name);
@@ -456,7 +435,32 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 	 * This dentry may be under construction so wait on mount
 	 * completion.
 	 */
-	return autofs4_mount_wait(dentry);
+	status = autofs4_mount_wait(dentry);
+	if (status)
+		return status;
+
+	spin_lock(&sbi->fs_lock);
+	/*
+	 * If the dentry has been selected for expire while we slept
+	 * on the lock then it might go away. We'll deal with that in
+	 * ->d_automount() and wait on a new mount if the expire
+	 * succeeds or return here if it doesn't (since there's no
+	 * mount to follow with a rootless multi-mount).
+	 */
+	if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
+		/*
+		 * Any needed mounting has been completed and the path
+		 * updated so check if this is a rootless multi-mount so
+		 * we can avoid needless calls ->d_automount() and avoid
+		 * an incorrect ELOOP error return.
+		 */
+		if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
+		    (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
+			status = -EISDIR;
+	}
+	spin_unlock(&sbi->fs_lock);
+
+	return status;
 }
 
 /* Lookups in the root directory */
@@ -599,9 +603,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 
 	spin_lock(&sbi->lookup_lock);
 	__autofs4_add_expiring(dentry);
-	spin_lock(&dentry->d_lock);
-	__d_drop(dentry);
-	spin_unlock(&dentry->d_lock);
+	d_drop(dentry);
 	spin_unlock(&sbi->lookup_lock);
 
 	return 0;
@@ -672,15 +674,12 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 		return -EACCES;
 
 	spin_lock(&sbi->lookup_lock);
-	spin_lock(&dentry->d_lock);
-	if (!list_empty(&dentry->d_subdirs)) {
-		spin_unlock(&dentry->d_lock);
+	if (!simple_empty(dentry)) {
 		spin_unlock(&sbi->lookup_lock);
 		return -ENOTEMPTY;
 	}
 	__autofs4_add_expiring(dentry);
-	__d_drop(dentry);
-	spin_unlock(&dentry->d_lock);
+	d_drop(dentry);
 	spin_unlock(&sbi->lookup_lock);
 
 	if (sbi->version < 5)
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index dce436e595c1..03bc1d347d8e 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -154,6 +154,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 	case autofs_ptype_expire_direct:
 	{
 		struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
+		struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns;
 
 		pktsz = sizeof(*packet);
 
@@ -163,8 +164,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 		packet->name[wq->name.len] = '\0';
 		packet->dev = wq->dev;
 		packet->ino = wq->ino;
-		packet->uid = wq->uid;
-		packet->gid = wq->gid;
+		packet->uid = from_kuid_munged(user_ns, wq->uid);
+		packet->gid = from_kgid_munged(user_ns, wq->gid);
 		packet->pid = wq->pid;
 		packet->tgid = wq->tgid;
 		break;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index b1342ffb3cf6..922ad460bff9 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -16,7 +16,7 @@
 #include <linux/poll.h>
 
 
-static loff_t bad_file_llseek(struct file *file, loff_t offset, int origin)
+static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence)
 {
 	return -EIO;
 }
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index f20e8a71062f..ad3ea1497cc3 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -161,6 +161,14 @@ static int bfs_readpage(struct file *file, struct page *page)
 	return block_read_full_page(page, bfs_get_block);
 }
 
+static void bfs_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+
+	if (to > inode->i_size)
+		truncate_pagecache(inode, to, inode->i_size);
+}
+
 static int bfs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
@@ -169,11 +177,8 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,
 
 	ret = block_write_begin(mapping, pos, len, flags, pagep,
 				bfs_get_block);
-	if (unlikely(ret)) {
-		loff_t isize = mapping->host->i_size;
-		if (pos + len > isize)
-			vmtruncate(mapping->host, isize);
-	}
+	if (unlikely(ret))
+		bfs_write_failed(mapping, pos + len);
 
 	return ret;
 }
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 0e7a6f81ae36..6043567b95c2 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -30,7 +30,7 @@
 #include <asm/cacheflush.h>
 #include <asm/a.out-core.h>
 
-static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_aout_binary(struct linux_binprm *);
 static int load_aout_library(struct file*);
 
 #ifdef CONFIG_COREDUMP
@@ -201,8 +201,9 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
  * libraries.  There is no binary dependent code anywhere else.
  */
 
-static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_aout_binary(struct linux_binprm * bprm)
 {
+	struct pt_regs *regs = current_pt_regs();
 	struct exec ex;
 	unsigned long error;
 	unsigned long fd_offset;
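binfmt_aout is the first of several loaders in this diff (ELF, ELF-FDPIC, em86, flat, misc, script, SOM) converted to the new load_binary prototype: the struct pt_regs argument is gone, and each loader fetches the current task's registers itself via current_pt_regs(). A minimal sketch of the new shape; load_foo_binary, entry_point and stack_pointer are hypothetical names, not from this diff:

	static int load_foo_binary(struct linux_binprm *bprm)
	{
		struct pt_regs *regs = current_pt_regs();
		unsigned long entry_point, stack_pointer;

		/* parse bprm->buf, map segments, compute entry_point and
		 * stack_pointer, then hand control to the new image: */
		start_thread(regs, entry_point, stack_pointer);
		return 0;
	}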
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fbd9f60bd763..0c42cdbabecf 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -44,7 +44,7 @@
 #define user_siginfo_t siginfo_t
 #endif
 
-static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
+static int load_elf_binary(struct linux_binprm *bprm);
 static int load_elf_library(struct file *);
 static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
 				int, int, unsigned long);
@@ -558,7 +558,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
 #endif
 }
 
-static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+static int load_elf_binary(struct linux_binprm *bprm)
 {
 	struct file *interpreter = NULL; /* to shut gcc up */
 	unsigned long load_addr = 0, load_bias = 0;
@@ -575,6 +575,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	unsigned long reloc_func_desc __maybe_unused = 0;
 	int executable_stack = EXSTACK_DEFAULT;
 	unsigned long def_flags = 0;
+	struct pt_regs *regs = current_pt_regs();
 	struct {
 		struct elfhdr elf_ex;
 		struct elfhdr interp_elf_ex;
@@ -1600,8 +1601,10 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	info->thread = NULL;
 
 	psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
-	if (psinfo == NULL)
+	if (psinfo == NULL) {
+		info->psinfo.data = NULL; /* So we don't free this wrongly */
 		return 0;
+	}
 
 	fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a46049154107..dc84732e554f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -56,7 +56,7 @@ typedef char *elf_caddr_t;
 
 MODULE_LICENSE("GPL");
 
-static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *);
+static int load_elf_fdpic_binary(struct linux_binprm *);
 static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *);
 static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *,
 			      struct mm_struct *, const char *);
@@ -164,10 +164,10 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params,
 /*
  * load an fdpic binary into various bits of memory
  */
-static int load_elf_fdpic_binary(struct linux_binprm *bprm,
-				 struct pt_regs *regs)
+static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 {
 	struct elf_fdpic_params exec_params, interp_params;
+	struct pt_regs *regs = current_pt_regs();
 	struct elf_phdr *phdr;
 	unsigned long stack_size, entryaddr;
 #ifdef ELF_FDPIC_PLAT_INIT
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 2790c7e1912e..037a3e2b045b 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -22,7 +22,7 @@
 #define EM86_INTERP	"/usr/bin/em86"
 #define EM86_I_NAME	"em86"
 
-static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
+static int load_em86(struct linux_binprm *bprm)
 {
 	char *interp, *i_name, *i_arg;
 	struct file * file;
@@ -42,7 +42,6 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
 		return -ENOEXEC;
 	}
 
-	bprm->recursion_depth++; /* Well, the bang-shell is implicit... */
 	allow_write_access(bprm->file);
 	fput(bprm->file);
 	bprm->file = NULL;
@@ -90,7 +89,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
 	if (retval < 0)
 		return retval;
 
-	return search_binary_handler(bprm, regs);
+	return search_binary_handler(bprm);
 }
 
 static struct linux_binfmt em86_format = {
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e280352b28f9..b56371981d16 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -88,7 +88,7 @@ struct lib_info {
 static int load_flat_shared_library(int id, struct lib_info *p);
 #endif
 
-static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_flat_binary(struct linux_binprm *);
 static int flat_core_dump(struct coredump_params *cprm);
 
 static struct linux_binfmt flat_format = {
@@ -858,9 +858,10 @@ out:
  * libraries.  There is no binary dependent code anywhere else.
  */
 
-static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_flat_binary(struct linux_binprm * bprm)
 {
 	struct lib_info libinfo;
+	struct pt_regs *regs = current_pt_regs();
 	unsigned long p = bprm->p;
 	unsigned long stack_len;
 	unsigned long start_addr;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 790b3cddca67..0c8869fdd14e 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -104,7 +104,7 @@ static Node *check_file(struct linux_binprm *bprm)
 /*
  * the loader itself
  */
-static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+static int load_misc_binary(struct linux_binprm *bprm)
 {
 	Node *fmt;
 	struct file * interp_file = NULL;
@@ -117,10 +117,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	if (!enabled)
 		goto _ret;
 
-	retval = -ENOEXEC;
-	if (bprm->recursion_depth > BINPRM_MAX_RECURSION)
-		goto _ret;
-
 	/* to keep locking time low, we copy the interpreter string */
 	read_lock(&entries_lock);
 	fmt = check_file(bprm);
@@ -176,7 +172,10 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto _error;
 	bprm->argc ++;
 
-	bprm->interp = iname;	/* for binfmt_script */
+	/* Update interp in case binfmt_script needs it. */
+	retval = bprm_change_interp(iname, bprm);
+	if (retval < 0)
+		goto _error;
 
 	interp_file = open_exec (iname);
 	retval = PTR_ERR (interp_file);
@@ -197,9 +196,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	if (retval < 0)
 		goto _error;
 
-	bprm->recursion_depth++;
-
-	retval = search_binary_handler (bprm, regs);
+	retval = search_binary_handler(bprm);
 	if (retval < 0)
 		goto _error;
 
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index d3b8c1f63155..5027a3e14922 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,7 +14,7 @@
 #include <linux/err.h>
 #include <linux/fs.h>
 
-static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
+static int load_script(struct linux_binprm *bprm)
 {
 	const char *i_arg, *i_name;
 	char *cp;
@@ -22,15 +22,13 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 	char interp[BINPRM_BUF_SIZE];
 	int retval;
 
-	if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') ||
-	    (bprm->recursion_depth > BINPRM_MAX_RECURSION))
+	if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
 		return -ENOEXEC;
 	/*
 	 * This section does the #! interpretation.
 	 * Sorta complicated, but hopefully it will work.  -TYT
 	 */
 
-	bprm->recursion_depth++;
 	allow_write_access(bprm->file);
 	fput(bprm->file);
 	bprm->file = NULL;
@@ -82,7 +80,9 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 	retval = copy_strings_kernel(1, &i_name, bprm);
 	if (retval) return retval;
 	bprm->argc++;
-	bprm->interp = interp;
+	retval = bprm_change_interp(interp, bprm);
+	if (retval < 0)
+		return retval;
 
 	/*
 	 * OK, now restart the process with the interpreter's dentry.
@@ -95,7 +95,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 	retval = prepare_binprm(bprm);
 	if (retval < 0)
 		return retval;
-	return search_binary_handler(bprm,regs);
+	return search_binary_handler(bprm);
 }
 
 static struct linux_binfmt script_format = {
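Both binfmt_misc and binfmt_script stop assigning bprm->interp directly and instead call bprm_change_interp(), and the per-loader recursion_depth bookkeeping disappears from these files; the hunks only show the removal, so presumably the recursion limit is now enforced once in the core exec path rather than in each loader. The re-dispatch tail common to such interpreter loaders now looks like this (condensed from the binfmt_script hunks above; interp is the path the loader extracted from the #! line):

	retval = bprm_change_interp(interp, bprm);
	if (retval < 0)
		return retval;

	/* restart the handler search with the interpreter's file */
	retval = prepare_binprm(bprm);
	if (retval < 0)
		return retval;
	return search_binary_handler(bprm);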
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 4517aaff61b4..4e00ed68d4a6 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -35,7 +35,7 @@
 
 #include <linux/elf.h>
 
-static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs);
+static int load_som_binary(struct linux_binprm * bprm);
 static int load_som_library(struct file *);
 
 /*
@@ -180,13 +180,14 @@ out:
  */
 
 static int
-load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+load_som_binary(struct linux_binprm * bprm)
 {
 	int retval;
 	unsigned int size;
 	unsigned long som_entry;
 	struct som_hdr *som_ex;
 	struct som_exec_auxhdr *hpuxhdr;
+	struct pt_regs *regs = current_pt_regs();
 
 	/* Get the exec-header */
 	som_ex = (struct som_hdr *) bprm->buf;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ab3a456f6650..172f8491a2bd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -321,7 +321,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
  * for a block special file file->f_path.dentry->d_inode->i_size is zero
  * so we compute the size by hand (just as in block_read/write above)
  */
-static loff_t block_llseek(struct file *file, loff_t offset, int origin)
+static loff_t block_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *bd_inode = file->f_mapping->host;
 	loff_t size;
@@ -331,7 +331,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
 	size = i_size_read(bd_inode);
 
 	retval = -EINVAL;
-	switch (origin) {
+	switch (whence) {
 		case SEEK_END:
 			offset += size;
 			break;
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d7fcdba141a2..7df3e0f0ee51 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0c16e3dbfd56..e15d2b0d8d3b 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 		ret = posix_acl_equiv_mode(acl, &inode->i_mode);
 		if (ret < 0)
 			return ret;
+		if (ret == 0)
+			acl = NULL;
 	}
 	ret = 0;
 	break;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 208d8aa5b07e..04edf69be875 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
 		     pos2 = n2, n2 = pos2->next) {
 			struct __prelim_ref *ref2;
 			struct __prelim_ref *xchg;
+			struct extent_inode_elem *eie;
 
 			ref2 = list_entry(pos2, struct __prelim_ref, list);
 
@@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
 					ref1 = ref2;
 					ref2 = xchg;
 				}
-				ref1->count += ref2->count;
 			} else {
 				if (ref1->parent != ref2->parent)
 					continue;
-				ref1->count += ref2->count;
 			}
+
+			eie = ref1->inode_list;
+			while (eie && eie->next)
+				eie = eie->next;
+			if (eie)
+				eie->next = ref2->inode_list;
+			else
+				ref1->inode_list = ref2->inode_list;
+			ref1->count += ref2->count;
+
 			list_del(&ref2->list);
 			kfree(ref2);
 		}
@@ -890,8 +899,7 @@ again:
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		list_del(&ref->list);
-		if (ref->count < 0)
-			WARN_ON(1);
+		WARN_ON(ref->count < 0);
 		if (ref->count && ref->root_id && ref->parent == 0) {
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
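The key fix in __merge_refs() is that merging two preliminary refs now splices their inode_list chains before summing the counts, where the old code only summed the counts and lost ref2's inode list when ref2 was freed. The splice the hunk adds, rewritten here as a standalone helper for clarity (a sketch; prelim_ref_splice is a hypothetical name, and extent_inode_elem is chained through ->next):

	static void prelim_ref_splice(struct __prelim_ref *ref1,
				      struct __prelim_ref *ref2)
	{
		struct extent_inode_elem *eie = ref1->inode_list;

		while (eie && eie->next)	/* walk to the tail: O(list length) */
			eie = eie->next;
		if (eie)
			eie->next = ref2->inode_list;	/* append ref2's chain */
		else
			ref1->inode_list = ref2->inode_list;
		ref1->count += ref2->count;
	}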
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ed8ca7ca5eff..2a8c242bc4f5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -39,6 +39,7 @@
 #define BTRFS_INODE_HAS_ORPHAN_ITEM		5
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
+#define BTRFS_INODE_COPY_EVERYTHING		8
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -90,6 +91,9 @@ struct btrfs_inode {
 
 	unsigned long runtime_flags;
 
+	/* Keep track of who's O_SYNC/fsycing currently */
+	atomic_t sync_writers;
+
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 	 * enough field for this.
 	 */
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 5a3e45db642a..11d47bfb62b4 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -137,7 +137,7 @@ struct btrfsic_block {
 	unsigned int never_written:1;	/* block was added because it was
 					 * referenced, not because it was
 					 * written */
-	unsigned int mirror_num:2;	/* large enough to hold
+	unsigned int mirror_num;	/* large enough to hold
 					 * BTRFS_SUPER_MIRROR_MAX */
 	struct btrfsic_dev_state *dev_state;
 	u64 dev_bytenr;		/* key, physical byte num on disk */
@@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, state->metablock_size);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, state->metablock_size);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
 	*next_blockp = NULL;
 	if (0 == *num_copiesp) {
 		*num_copiesp =
-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+		    btrfs_num_copies(state->root->fs_info,
 				     next_bytenr, state->metablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data(
 		chunk_len = num_bytes;
 
 		num_copies =
-		    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+		    btrfs_num_copies(state->root->fs_info,
 				     next_bytenr, state->datablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1582,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	struct btrfs_device *device;
 
 	length = len;
-	ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+	ret = btrfs_map_block(state->root->fs_info, READ,
 			      bytenr, &length, &multi, mirror_num);
 
+	if (ret) {
+		block_ctx_out->start = 0;
+		block_ctx_out->dev_bytenr = 0;
+		block_ctx_out->len = 0;
+		block_ctx_out->dev = NULL;
+		block_ctx_out->datav = NULL;
+		block_ctx_out->pagev = NULL;
+		block_ctx_out->mem_to_free = NULL;
+
+		return ret;
+	}
+
 	device = multi->stripes[0].dev;
 	block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
 	block_ctx_out->dev_bytenr = multi->stripes[0].physical;
@@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	block_ctx_out->pagev = NULL;
 	block_ctx_out->mem_to_free = NULL;
 
-	if (0 == ret)
-		kfree(multi);
+	kfree(multi);
 	if (NULL == block_ctx_out->dev) {
 		ret = -ENXIO;
 		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
@@ -2463,7 +2474,7 @@ static int btrfsic_process_written_superblock(
 	}
 
 	num_copies =
-	    btrfs_num_copies(&state->root->fs_info->mapping_tree,
+	    btrfs_num_copies(state->root->fs_info,
 			     next_bytenr, BTRFS_SUPER_INFO_SIZE);
 	if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 		printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -2960,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2960 struct btrfsic_block_data_ctx block_ctx; 2971 struct btrfsic_block_data_ctx block_ctx;
2961 int match = 0; 2972 int match = 0;
2962 2973
2963 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree, 2974 num_copies = btrfs_num_copies(state->root->fs_info,
2964 bytenr, state->metablock_size); 2975 bytenr, state->metablock_size);
2965 2976
2966 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { 2977 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
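
Editor's note: all of these call sites now pass the fs_info instead of the mapping tree, but the usage pattern is unchanged — ask how many copies of a byte range exist, then probe mirrors 1..num_copies in order. A self-contained sketch of that loop with stubbed lookup and read functions (both stubs are assumptions for illustration):

        #include <stdio.h>

        /* Stub: pretend every range has two copies. */
        static int num_copies_for(unsigned long long bytenr, unsigned len)
        {
                (void)bytenr; (void)len;
                return 2;
        }

        /* Stub: pretend mirror 1 is bad and mirror 2 reads fine. */
        static int read_mirror(unsigned long long bytenr, unsigned len,
                               int mirror_num)
        {
                (void)bytenr; (void)len;
                return mirror_num == 2 ? 0 : -1;
        }

        /* Mirrors are numbered from 1, as in the loop above: try each
         * copy until one can be read. */
        static int read_any_mirror(unsigned long long bytenr, unsigned len)
        {
                int num_copies = num_copies_for(bytenr, len);
                int mirror_num;

                for (mirror_num = 1; mirror_num <= num_copies; mirror_num++)
                        if (read_mirror(bytenr, len, mirror_num) == 0)
                                return 0;
                return -1; /* all copies failed */
        }

        int main(void)
        {
                printf("%d\n", read_any_mirror(4096, 4096)); /* prints 0 */
                return 0;
        }
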
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c6467aa88bee..94ab2f80e7e3 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
687 687
688 ret = btrfs_map_bio(root, READ, comp_bio, 688 ret = btrfs_map_bio(root, READ, comp_bio,
689 mirror_num, 0); 689 mirror_num, 0);
690 BUG_ON(ret); /* -ENOMEM */ 690 if (ret)
691 bio_endio(comp_bio, ret);
691 692
692 bio_put(comp_bio); 693 bio_put(comp_bio);
693 694
@@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
712 } 713 }
713 714
714 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); 715 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
715 BUG_ON(ret); /* -ENOMEM */ 716 if (ret)
717 bio_endio(comp_bio, ret);
716 718
717 bio_put(comp_bio); 719 bio_put(comp_bio);
718 return 0; 720 return 0;
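
Editor's note: both hunks replace a BUG_ON with completing the bio carrying the error, so a failed submission degrades to an I/O error for the reader instead of crashing the machine. A small standalone model of the fail-the-request-instead-of-asserting pattern (the request type and callback are invented for illustration):

        #include <stdio.h>

        /* Invented stand-in for a struct bio with a completion callback. */
        struct request {
                int status;
                void (*end_io)(struct request *req, int err);
        };

        static void end_io(struct request *req, int err)
        {
                req->status = err;      /* waiters observe the error */
        }

        static int submit(struct request *req)
        {
                (void)req;
                return -5;              /* pretend the submission failed (-EIO) */
        }

        int main(void)
        {
                struct request req = { 0, end_io };
                int ret = submit(&req);

                /* Old code: BUG_ON(ret).  New code: complete the request
                 * with the error and let the caller deal with it. */
                if (ret)
                        req.end_io(&req, ret);
                printf("status=%d\n", req.status);
                return 0;
        }
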
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cdfb4c49a806..eea5da7a2b9a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
38 struct extent_buffer *dst_buf, 38 struct extent_buffer *dst_buf,
39 struct extent_buffer *src_buf); 39 struct extent_buffer *src_buf);
40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 40static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
41 struct btrfs_path *path, int level, int slot, 41 struct btrfs_path *path, int level, int slot);
42 int tree_mod_log);
43static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, 42static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
44 struct extent_buffer *eb); 43 struct extent_buffer *eb);
45struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, 44struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -776,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
776 775
777static noinline void 776static noinline void
778tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, 777tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
779 struct extent_buffer *eb, 778 struct extent_buffer *eb, int slot, int atomic)
780 struct btrfs_disk_key *disk_key, int slot, int atomic)
781{ 779{
782 int ret; 780 int ret;
783 781
@@ -1361,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1361 u64 search_start; 1359 u64 search_start;
1362 int ret; 1360 int ret;
1363 1361
1364 if (trans->transaction != root->fs_info->running_transaction) { 1362 if (trans->transaction != root->fs_info->running_transaction)
1365 printk(KERN_CRIT "trans %llu running %llu\n", 1363 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1366 (unsigned long long)trans->transid, 1364 (unsigned long long)trans->transid,
1367 (unsigned long long) 1365 (unsigned long long)
1368 root->fs_info->running_transaction->transid); 1366 root->fs_info->running_transaction->transid);
1369 WARN_ON(1); 1367
1370 } 1368 if (trans->transid != root->fs_info->generation)
1371 if (trans->transid != root->fs_info->generation) { 1369 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1372 printk(KERN_CRIT "trans %llu running %llu\n",
1373 (unsigned long long)trans->transid, 1370 (unsigned long long)trans->transid,
1374 (unsigned long long)root->fs_info->generation); 1371 (unsigned long long)root->fs_info->generation);
1375 WARN_ON(1);
1376 }
1377 1372
1378 if (!should_cow_block(trans, root, buf)) { 1373 if (!should_cow_block(trans, root, buf)) {
1379 *cow_ret = buf; 1374 *cow_ret = buf;
@@ -1469,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1469 if (cache_only && parent_level != 1) 1464 if (cache_only && parent_level != 1)
1470 return 0; 1465 return 0;
1471 1466
1472 if (trans->transaction != root->fs_info->running_transaction) 1467 WARN_ON(trans->transaction != root->fs_info->running_transaction);
1473 WARN_ON(1); 1468 WARN_ON(trans->transid != root->fs_info->generation);
1474 if (trans->transid != root->fs_info->generation)
1475 WARN_ON(1);
1476 1469
1477 parent_nritems = btrfs_header_nritems(parent); 1470 parent_nritems = btrfs_header_nritems(parent);
1478 blocksize = btrfs_level_size(root, parent_level - 1); 1471 blocksize = btrfs_level_size(root, parent_level - 1);
@@ -1827,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1827 if (btrfs_header_nritems(right) == 0) { 1820 if (btrfs_header_nritems(right) == 0) {
1828 clean_tree_block(trans, root, right); 1821 clean_tree_block(trans, root, right);
1829 btrfs_tree_unlock(right); 1822 btrfs_tree_unlock(right);
1830 del_ptr(trans, root, path, level + 1, pslot + 1, 1); 1823 del_ptr(trans, root, path, level + 1, pslot + 1);
1831 root_sub_used(root, right->len); 1824 root_sub_used(root, right->len);
1832 btrfs_free_tree_block(trans, root, right, 0, 1); 1825 btrfs_free_tree_block(trans, root, right, 0, 1);
1833 free_extent_buffer_stale(right); 1826 free_extent_buffer_stale(right);
@@ -1836,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1836 struct btrfs_disk_key right_key; 1829 struct btrfs_disk_key right_key;
1837 btrfs_node_key(right, &right_key, 0); 1830 btrfs_node_key(right, &right_key, 0);
1838 tree_mod_log_set_node_key(root->fs_info, parent, 1831 tree_mod_log_set_node_key(root->fs_info, parent,
1839 &right_key, pslot + 1, 0); 1832 pslot + 1, 0);
1840 btrfs_set_node_key(parent, &right_key, pslot + 1); 1833 btrfs_set_node_key(parent, &right_key, pslot + 1);
1841 btrfs_mark_buffer_dirty(parent); 1834 btrfs_mark_buffer_dirty(parent);
1842 } 1835 }
@@ -1871,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1871 if (btrfs_header_nritems(mid) == 0) { 1864 if (btrfs_header_nritems(mid) == 0) {
1872 clean_tree_block(trans, root, mid); 1865 clean_tree_block(trans, root, mid);
1873 btrfs_tree_unlock(mid); 1866 btrfs_tree_unlock(mid);
1874 del_ptr(trans, root, path, level + 1, pslot, 1); 1867 del_ptr(trans, root, path, level + 1, pslot);
1875 root_sub_used(root, mid->len); 1868 root_sub_used(root, mid->len);
1876 btrfs_free_tree_block(trans, root, mid, 0, 1); 1869 btrfs_free_tree_block(trans, root, mid, 0, 1);
1877 free_extent_buffer_stale(mid); 1870 free_extent_buffer_stale(mid);
@@ -1880,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1880 /* update the parent key to reflect our changes */ 1873 /* update the parent key to reflect our changes */
1881 struct btrfs_disk_key mid_key; 1874 struct btrfs_disk_key mid_key;
1882 btrfs_node_key(mid, &mid_key, 0); 1875 btrfs_node_key(mid, &mid_key, 0);
1883 tree_mod_log_set_node_key(root->fs_info, parent, &mid_key, 1876 tree_mod_log_set_node_key(root->fs_info, parent,
1884 pslot, 0); 1877 pslot, 0);
1885 btrfs_set_node_key(parent, &mid_key, pslot); 1878 btrfs_set_node_key(parent, &mid_key, pslot);
1886 btrfs_mark_buffer_dirty(parent); 1879 btrfs_mark_buffer_dirty(parent);
@@ -1980,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1980 orig_slot += left_nr; 1973 orig_slot += left_nr;
1981 btrfs_node_key(mid, &disk_key, 0); 1974 btrfs_node_key(mid, &disk_key, 0);
1982 tree_mod_log_set_node_key(root->fs_info, parent, 1975 tree_mod_log_set_node_key(root->fs_info, parent,
1983 &disk_key, pslot, 0); 1976 pslot, 0);
1984 btrfs_set_node_key(parent, &disk_key, pslot); 1977 btrfs_set_node_key(parent, &disk_key, pslot);
1985 btrfs_mark_buffer_dirty(parent); 1978 btrfs_mark_buffer_dirty(parent);
1986 if (btrfs_header_nritems(left) > orig_slot) { 1979 if (btrfs_header_nritems(left) > orig_slot) {
@@ -2033,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2033 2026
2034 btrfs_node_key(right, &disk_key, 0); 2027 btrfs_node_key(right, &disk_key, 0);
2035 tree_mod_log_set_node_key(root->fs_info, parent, 2028 tree_mod_log_set_node_key(root->fs_info, parent,
2036 &disk_key, pslot + 1, 0); 2029 pslot + 1, 0);
2037 btrfs_set_node_key(parent, &disk_key, pslot + 1); 2030 btrfs_set_node_key(parent, &disk_key, pslot + 1);
2038 btrfs_mark_buffer_dirty(parent); 2031 btrfs_mark_buffer_dirty(parent);
2039 2032
@@ -2219,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
2219 int no_skips = 0; 2212 int no_skips = 0;
2220 struct extent_buffer *t; 2213 struct extent_buffer *t;
2221 2214
2215 if (path->really_keep_locks)
2216 return;
2217
2222 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2218 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2223 if (!path->nodes[i]) 2219 if (!path->nodes[i])
2224 break; 2220 break;
@@ -2266,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2266{ 2262{
2267 int i; 2263 int i;
2268 2264
2269 if (path->keep_locks) 2265 if (path->keep_locks || path->really_keep_locks)
2270 return; 2266 return;
2271 2267
2272 for (i = level; i < BTRFS_MAX_LEVEL; i++) { 2268 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2499,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2499 if (!cow) 2495 if (!cow)
2500 write_lock_level = -1; 2496 write_lock_level = -1;
2501 2497
2502 if (cow && (p->keep_locks || p->lowest_level)) 2498 if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
2503 write_lock_level = BTRFS_MAX_LEVEL; 2499 write_lock_level = BTRFS_MAX_LEVEL;
2504 2500
2505 min_write_lock_level = write_lock_level; 2501 min_write_lock_level = write_lock_level;
@@ -2568,7 +2564,10 @@ again:
2568 * must have write locks on this node and the 2564 * must have write locks on this node and the
2569 * parent 2565 * parent
2570 */ 2566 */
2571 if (level + 1 > write_lock_level) { 2567 if (level > write_lock_level ||
2568 (level + 1 > write_lock_level &&
2569 level + 1 < BTRFS_MAX_LEVEL &&
2570 p->nodes[level + 1])) {
2572 write_lock_level = level + 1; 2571 write_lock_level = level + 1;
2573 btrfs_release_path(p); 2572 btrfs_release_path(p);
2574 goto again; 2573 goto again;
@@ -2917,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
2917 if (!path->nodes[i]) 2916 if (!path->nodes[i])
2918 break; 2917 break;
2919 t = path->nodes[i]; 2918 t = path->nodes[i];
2920 tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1); 2919 tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
2921 btrfs_set_node_key(t, key, tslot); 2920 btrfs_set_node_key(t, key, tslot);
2922 btrfs_mark_buffer_dirty(path->nodes[i]); 2921 btrfs_mark_buffer_dirty(path->nodes[i]);
2923 if (tslot != 0) 2922 if (tslot != 0)
@@ -3302,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3302 */ 3301 */
3303static int leaf_space_used(struct extent_buffer *l, int start, int nr) 3302static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3304{ 3303{
3304 struct btrfs_item *start_item;
3305 struct btrfs_item *end_item;
3306 struct btrfs_map_token token;
3305 int data_len; 3307 int data_len;
3306 int nritems = btrfs_header_nritems(l); 3308 int nritems = btrfs_header_nritems(l);
3307 int end = min(nritems, start + nr) - 1; 3309 int end = min(nritems, start + nr) - 1;
3308 3310
3309 if (!nr) 3311 if (!nr)
3310 return 0; 3312 return 0;
3311 data_len = btrfs_item_end_nr(l, start); 3313 btrfs_init_map_token(&token);
3312 data_len = data_len - btrfs_item_offset_nr(l, end); 3314 start_item = btrfs_item_nr(l, start);
3315 end_item = btrfs_item_nr(l, end);
3316 data_len = btrfs_token_item_offset(l, start_item, &token) +
3317 btrfs_token_item_size(l, start_item, &token);
3318 data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3313 data_len += sizeof(struct btrfs_item) * nr; 3319 data_len += sizeof(struct btrfs_item) * nr;
3314 WARN_ON(data_len < 0); 3320 WARN_ON(data_len < 0);
3315 return data_len; 3321 return data_len;
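
Editor's note: the reworked leaf_space_used() computes the same quantity as before, but through a map token so repeated item-header reads stay within one mapped page. The arithmetic itself is simple: item data grows downward, so the bytes used by items start..end are offset(start) + size(start) - offset(end), plus one item header per slot. A toy standalone version of that calculation:

        #include <stdio.h>

        struct item { unsigned offset, size; }; /* toy btrfs_item */

        /* Space used by items [start, start+nr) in a leaf whose item data
         * grows downward: the highest end is offset+size of the first item,
         * the lowest start is the offset of the last one, plus one item
         * header per slot.  Mirrors the reworked leaf_space_used() above. */
        static unsigned space_used(const struct item *it, int start, int nr)
        {
                int end = start + nr - 1;
                unsigned data_len;

                if (!nr)
                        return 0;
                data_len = it[start].offset + it[start].size - it[end].offset;
                return data_len + nr * (unsigned)sizeof(struct item);
        }

        int main(void)
        {
                /* Data laid out downward: item 0 at [90,100), item 1 at [70,90). */
                struct item leaf[] = { { 90, 10 }, { 70, 20 } };

                printf("%u\n", space_used(leaf, 0, 2)); /* 30 data bytes + headers */
                return 0;
        }
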
@@ -3403,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3403 if (push_items == 0) 3409 if (push_items == 0)
3404 goto out_unlock; 3410 goto out_unlock;
3405 3411
3406 if (!empty && push_items == left_nritems) 3412 WARN_ON(!empty && push_items == left_nritems);
3407 WARN_ON(1);
3408 3413
3409 /* push left to right */ 3414 /* push left to right */
3410 right_nritems = btrfs_header_nritems(right); 3415 right_nritems = btrfs_header_nritems(right);
@@ -3642,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3642 btrfs_set_header_nritems(left, old_left_nritems + push_items); 3647 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3643 3648
3644 /* fixup right node */ 3649 /* fixup right node */
3645 if (push_items > right_nritems) { 3650 if (push_items > right_nritems)
3646 printk(KERN_CRIT "push items %d nr %u\n", push_items, 3651 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3647 right_nritems); 3652 right_nritems);
3648 WARN_ON(1);
3649 }
3650 3653
3651 if (push_items < right_nritems) { 3654 if (push_items < right_nritems) {
3652 push_space = btrfs_item_offset_nr(right, push_items - 1) - 3655 push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -4602,8 +4605,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4602 * empty a node. 4605 * empty a node.
4603 */ 4606 */
4604static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, 4607static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4605 struct btrfs_path *path, int level, int slot, 4608 struct btrfs_path *path, int level, int slot)
4606 int tree_mod_log)
4607{ 4609{
4608 struct extent_buffer *parent = path->nodes[level]; 4610 struct extent_buffer *parent = path->nodes[level];
4609 u32 nritems; 4611 u32 nritems;
@@ -4611,7 +4613,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4611 4613
4612 nritems = btrfs_header_nritems(parent); 4614 nritems = btrfs_header_nritems(parent);
4613 if (slot != nritems - 1) { 4615 if (slot != nritems - 1) {
4614 if (tree_mod_log && level) 4616 if (level)
4615 tree_mod_log_eb_move(root->fs_info, parent, slot, 4617 tree_mod_log_eb_move(root->fs_info, parent, slot,
4616 slot + 1, nritems - slot - 1); 4618 slot + 1, nritems - slot - 1);
4617 memmove_extent_buffer(parent, 4619 memmove_extent_buffer(parent,
@@ -4619,7 +4621,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4619 btrfs_node_key_ptr_offset(slot + 1), 4621 btrfs_node_key_ptr_offset(slot + 1),
4620 sizeof(struct btrfs_key_ptr) * 4622 sizeof(struct btrfs_key_ptr) *
4621 (nritems - slot - 1)); 4623 (nritems - slot - 1));
4622 } else if (tree_mod_log && level) { 4624 } else if (level) {
4623 ret = tree_mod_log_insert_key(root->fs_info, parent, slot, 4625 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4624 MOD_LOG_KEY_REMOVE); 4626 MOD_LOG_KEY_REMOVE);
4625 BUG_ON(ret < 0); 4627 BUG_ON(ret < 0);
@@ -4656,7 +4658,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4656 struct extent_buffer *leaf) 4658 struct extent_buffer *leaf)
4657{ 4659{
4658 WARN_ON(btrfs_header_generation(leaf) != trans->transid); 4660 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4659 del_ptr(trans, root, path, 1, path->slots[1], 1); 4661 del_ptr(trans, root, path, 1, path->slots[1]);
4660 4662
4661 /* 4663 /*
4662 * btrfs_free_extent is expensive, we want to make sure we 4664 * btrfs_free_extent is expensive, we want to make sure we
@@ -5123,13 +5125,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5123 right_path->search_commit_root = 1; 5125 right_path->search_commit_root = 1;
5124 right_path->skip_locking = 1; 5126 right_path->skip_locking = 1;
5125 5127
5126 spin_lock(&left_root->root_times_lock); 5128 spin_lock(&left_root->root_item_lock);
5127 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item); 5129 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5128 spin_unlock(&left_root->root_times_lock); 5130 spin_unlock(&left_root->root_item_lock);
5129 5131
5130 spin_lock(&right_root->root_times_lock); 5132 spin_lock(&right_root->root_item_lock);
5131 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item); 5133 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5132 spin_unlock(&right_root->root_times_lock); 5134 spin_unlock(&right_root->root_item_lock);
5133 5135
5134 trans = btrfs_join_transaction(left_root); 5136 trans = btrfs_join_transaction(left_root);
5135 if (IS_ERR(trans)) { 5137 if (IS_ERR(trans)) {
@@ -5224,15 +5226,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5224 goto out; 5226 goto out;
5225 } 5227 }
5226 5228
5227 spin_lock(&left_root->root_times_lock); 5229 spin_lock(&left_root->root_item_lock);
5228 ctransid = btrfs_root_ctransid(&left_root->root_item); 5230 ctransid = btrfs_root_ctransid(&left_root->root_item);
5229 spin_unlock(&left_root->root_times_lock); 5231 spin_unlock(&left_root->root_item_lock);
5230 if (ctransid != left_start_ctransid) 5232 if (ctransid != left_start_ctransid)
5231 left_start_ctransid = 0; 5233 left_start_ctransid = 0;
5232 5234
5233 spin_lock(&right_root->root_times_lock); 5235 spin_lock(&right_root->root_item_lock);
5234 ctransid = btrfs_root_ctransid(&right_root->root_item); 5236 ctransid = btrfs_root_ctransid(&right_root->root_item);
5235 spin_unlock(&right_root->root_times_lock); 5237 spin_unlock(&right_root->root_item_lock);
5236 if (ctransid != right_start_ctransid) 5238 if (ctransid != right_start_ctransid)
5237 right_start_ctransid = 0; 5239 right_start_ctransid = 0;
5238 5240
@@ -5496,6 +5498,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5496 return btrfs_next_old_leaf(root, path, 0); 5498 return btrfs_next_old_leaf(root, path, 0);
5497} 5499}
5498 5500
5501/* Release the path up to but not including the given level */
5502static void btrfs_release_level(struct btrfs_path *path, int level)
5503{
5504 int i;
5505
5506 for (i = 0; i < level; i++) {
5507 path->slots[i] = 0;
5508 if (!path->nodes[i])
5509 continue;
5510 if (path->locks[i]) {
5511 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
5512 path->locks[i] = 0;
5513 }
5514 free_extent_buffer(path->nodes[i]);
5515 path->nodes[i] = NULL;
5516 }
5517}
5518
5519/*
5520 * This function assumes 2 things
5521 *
5522 * 1) You are using path->keep_locks
5523 * 2) You are not inserting items.
5524 *
 5525 * If either of these is not true, do not use this function. If you need a next
 5526 * leaf with either of these not holding, this function can easily be
 5527 * adapted to do that, but at the moment these are the limitations.
5528 */
5529int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
5530 struct btrfs_root *root, struct btrfs_path *path,
5531 int del)
5532{
5533 struct extent_buffer *b;
5534 struct btrfs_key key;
5535 u32 nritems;
5536 int level = 1;
5537 int slot;
5538 int ret = 1;
5539 int write_lock_level = BTRFS_MAX_LEVEL;
5540 int ins_len = del ? -1 : 0;
5541
5542 WARN_ON(!(path->keep_locks || path->really_keep_locks));
5543
5544 nritems = btrfs_header_nritems(path->nodes[0]);
5545 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5546
5547 while (path->nodes[level]) {
5548 nritems = btrfs_header_nritems(path->nodes[level]);
5549 if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
5550search:
5551 btrfs_release_path(path);
5552 ret = btrfs_search_slot(trans, root, &key, path,
5553 ins_len, 1);
5554 if (ret < 0)
5555 goto out;
5556 level = 1;
5557 continue;
5558 }
5559
5560 if (path->slots[level] >= nritems - 1) {
5561 level++;
5562 continue;
5563 }
5564
5565 btrfs_release_level(path, level);
5566 break;
5567 }
5568
5569 if (!path->nodes[level]) {
5570 ret = 1;
5571 goto out;
5572 }
5573
5574 path->slots[level]++;
5575 b = path->nodes[level];
5576
5577 while (b) {
5578 level = btrfs_header_level(b);
5579
5580 if (!should_cow_block(trans, root, b))
5581 goto cow_done;
5582
5583 btrfs_set_path_blocking(path);
5584 ret = btrfs_cow_block(trans, root, b,
5585 path->nodes[level + 1],
5586 path->slots[level + 1], &b);
5587 if (ret)
5588 goto out;
5589cow_done:
5590 path->nodes[level] = b;
5591 btrfs_clear_path_blocking(path, NULL, 0);
5592 if (level != 0) {
5593 ret = setup_nodes_for_search(trans, root, path, b,
5594 level, ins_len,
5595 &write_lock_level);
5596 if (ret == -EAGAIN)
5597 goto search;
5598 if (ret)
5599 goto out;
5600
5601 b = path->nodes[level];
5602 slot = path->slots[level];
5603
5604 ret = read_block_for_search(trans, root, path,
5605 &b, level, slot, &key, 0);
5606 if (ret == -EAGAIN)
5607 goto search;
5608 if (ret)
5609 goto out;
5610 level = btrfs_header_level(b);
5611 if (!btrfs_try_tree_write_lock(b)) {
5612 btrfs_set_path_blocking(path);
5613 btrfs_tree_lock(b);
5614 btrfs_clear_path_blocking(path, b,
5615 BTRFS_WRITE_LOCK);
5616 }
5617 path->locks[level] = BTRFS_WRITE_LOCK;
5618 path->nodes[level] = b;
5619 path->slots[level] = 0;
5620 } else {
5621 path->slots[level] = 0;
5622 ret = 0;
5623 break;
5624 }
5625 }
5626
5627out:
5628 if (ret)
5629 btrfs_release_path(path);
5630
5631 return ret;
5632}
5633
5499int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 5634int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5500 u64 time_seq) 5635 u64 time_seq)
5501{ 5636{
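
Editor's note: btrfs_next_leaf_write() gives a write-locked variant of the next-leaf walk for callers that hold locks across iterations. A hedged usage sketch, kernel-style rather than standalone (not from the patch: process_leaf() is an assumed helper, and trans/root/path/key come from the usual btrfs setup):

        /* Assumed caller context: a started transaction, an allocated
         * path, and a search key; process_leaf() is hypothetical. */
        path->really_keep_locks = 1;
        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        while (ret == 0) {
                process_leaf(path->nodes[0]);   /* hypothetical per-leaf work */
                ret = btrfs_next_leaf_write(trans, root, path, 0 /* no del */);
        }
        path->really_keep_locks = 0;
        btrfs_release_path(path);
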
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c72ead869507..547b7b05727f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
48 48
49#define BTRFS_MAGIC "_BHRfS_M" 49#define BTRFS_MAGIC "_BHRfS_M"
50 50
51#define BTRFS_MAX_MIRRORS 2 51#define BTRFS_MAX_MIRRORS 3
52 52
53#define BTRFS_MAX_LEVEL 8 53#define BTRFS_MAX_LEVEL 8
54 54
@@ -142,6 +142,8 @@ struct btrfs_ordered_sum;
142 142
143#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 143#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
144 144
145#define BTRFS_DEV_REPLACE_DEVID 0
146
145/* 147/*
146 * the max metadata block size. This limit is somewhat artificial, 148 * the max metadata block size. This limit is somewhat artificial,
147 * but the memmove costs go through the roof for larger blocks. 149 * but the memmove costs go through the roof for larger blocks.
@@ -172,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
172/* four bytes for CRC32 */ 174/* four bytes for CRC32 */
173#define BTRFS_EMPTY_DIR_SIZE 0 175#define BTRFS_EMPTY_DIR_SIZE 0
174 176
 177/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
178#define REQ_GET_READ_MIRRORS (1 << 30)
179
175#define BTRFS_FT_UNKNOWN 0 180#define BTRFS_FT_UNKNOWN 0
176#define BTRFS_FT_REG_FILE 1 181#define BTRFS_FT_REG_FILE 1
177#define BTRFS_FT_DIR 2 182#define BTRFS_FT_DIR 2
@@ -413,7 +418,7 @@ struct btrfs_root_backup {
413 __le64 bytes_used; 418 __le64 bytes_used;
414 __le64 num_devices; 419 __le64 num_devices;
415 /* future */ 420 /* future */
416 __le64 unsed_64[4]; 421 __le64 unused_64[4];
417 422
418 u8 tree_root_level; 423 u8 tree_root_level;
419 u8 chunk_root_level; 424 u8 chunk_root_level;
@@ -571,6 +576,7 @@ struct btrfs_path {
571 unsigned int skip_locking:1; 576 unsigned int skip_locking:1;
572 unsigned int leave_spinning:1; 577 unsigned int leave_spinning:1;
573 unsigned int search_commit_root:1; 578 unsigned int search_commit_root:1;
579 unsigned int really_keep_locks:1;
574}; 580};
575 581
576/* 582/*
@@ -885,6 +891,59 @@ struct btrfs_dev_stats_item {
885 __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; 891 __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
886} __attribute__ ((__packed__)); 892} __attribute__ ((__packed__));
887 893
894#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
895#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
896#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
897#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
898#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
899#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
900#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
901
902struct btrfs_dev_replace {
903 u64 replace_state; /* see #define above */
904 u64 time_started; /* seconds since 1-Jan-1970 */
905 u64 time_stopped; /* seconds since 1-Jan-1970 */
906 atomic64_t num_write_errors;
907 atomic64_t num_uncorrectable_read_errors;
908
909 u64 cursor_left;
910 u64 committed_cursor_left;
911 u64 cursor_left_last_write_of_item;
912 u64 cursor_right;
913
914 u64 cont_reading_from_srcdev_mode; /* see #define above */
915
916 int is_valid;
917 int item_needs_writeback;
918 struct btrfs_device *srcdev;
919 struct btrfs_device *tgtdev;
920
921 pid_t lock_owner;
922 atomic_t nesting_level;
923 struct mutex lock_finishing_cancel_unmount;
924 struct mutex lock_management_lock;
925 struct mutex lock;
926
927 struct btrfs_scrub_progress scrub_progress;
928};
929
930struct btrfs_dev_replace_item {
931 /*
932 * grow this item struct at the end for future enhancements and keep
933 * the existing values unchanged
934 */
935 __le64 src_devid;
936 __le64 cursor_left;
937 __le64 cursor_right;
938 __le64 cont_reading_from_srcdev_mode;
939
940 __le64 replace_state;
941 __le64 time_started;
942 __le64 time_stopped;
943 __le64 num_write_errors;
944 __le64 num_uncorrectable_read_errors;
945} __attribute__ ((__packed__));
946
888/* different types of block groups (and chunks) */ 947/* different types of block groups (and chunks) */
889#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) 948#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
890#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) 949#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
@@ -1333,6 +1392,7 @@ struct btrfs_fs_info {
1333 struct btrfs_workers generic_worker; 1392 struct btrfs_workers generic_worker;
1334 struct btrfs_workers workers; 1393 struct btrfs_workers workers;
1335 struct btrfs_workers delalloc_workers; 1394 struct btrfs_workers delalloc_workers;
1395 struct btrfs_workers flush_workers;
1336 struct btrfs_workers endio_workers; 1396 struct btrfs_workers endio_workers;
1337 struct btrfs_workers endio_meta_workers; 1397 struct btrfs_workers endio_meta_workers;
1338 struct btrfs_workers endio_meta_write_workers; 1398 struct btrfs_workers endio_meta_write_workers;
@@ -1429,6 +1489,8 @@ struct btrfs_fs_info {
1429 struct rw_semaphore scrub_super_lock; 1489 struct rw_semaphore scrub_super_lock;
1430 int scrub_workers_refcnt; 1490 int scrub_workers_refcnt;
1431 struct btrfs_workers scrub_workers; 1491 struct btrfs_workers scrub_workers;
1492 struct btrfs_workers scrub_wr_completion_workers;
1493 struct btrfs_workers scrub_nocow_workers;
1432 1494
1433#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1495#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1434 u32 check_integrity_print_mask; 1496 u32 check_integrity_print_mask;
@@ -1470,6 +1532,11 @@ struct btrfs_fs_info {
1470 int backup_root_index; 1532 int backup_root_index;
1471 1533
1472 int num_tolerated_disk_barrier_failures; 1534 int num_tolerated_disk_barrier_failures;
1535
1536 /* device replace state */
1537 struct btrfs_dev_replace dev_replace;
1538
1539 atomic_t mutually_exclusive_operation_running;
1473}; 1540};
1474 1541
1475/* 1542/*
@@ -1579,7 +1646,7 @@ struct btrfs_root {
1579 1646
1580 int force_cow; 1647 int force_cow;
1581 1648
1582 spinlock_t root_times_lock; 1649 spinlock_t root_item_lock;
1583}; 1650};
1584 1651
1585struct btrfs_ioctl_defrag_range_args { 1652struct btrfs_ioctl_defrag_range_args {
@@ -1723,6 +1790,12 @@ struct btrfs_ioctl_defrag_range_args {
1723#define BTRFS_DEV_STATS_KEY 249 1790#define BTRFS_DEV_STATS_KEY 249
1724 1791
1725/* 1792/*
 1793 * Persistently stores the device replace state in the device tree.
1794 * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
1795 */
1796#define BTRFS_DEV_REPLACE_KEY 250
1797
1798/*
1726 * string items are for debugging. They just store a short string of 1799 * string items are for debugging. They just store a short string of
1727 * data in the FS 1800 * data in the FS
1728 */ 1801 */
@@ -1787,7 +1860,7 @@ struct btrfs_map_token {
1787 1860
1788static inline void btrfs_init_map_token (struct btrfs_map_token *token) 1861static inline void btrfs_init_map_token (struct btrfs_map_token *token)
1789{ 1862{
1790 memset(token, 0, sizeof(*token)); 1863 token->kaddr = NULL;
1791} 1864}
1792 1865
1793/* some macros to generate set/get funcs for the struct fields. This 1866/* some macros to generate set/get funcs for the struct fields. This
@@ -2755,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
2755BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, 2828BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
2756 rsv_excl, 64); 2829 rsv_excl, 64);
2757 2830
2831/* btrfs_dev_replace_item */
2832BTRFS_SETGET_FUNCS(dev_replace_src_devid,
2833 struct btrfs_dev_replace_item, src_devid, 64);
2834BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
2835 struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
2836 64);
2837BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
2838 replace_state, 64);
2839BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
2840 time_started, 64);
2841BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
2842 time_stopped, 64);
2843BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
2844 num_write_errors, 64);
2845BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
2846 struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
2847 64);
2848BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
2849 cursor_left, 64);
2850BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
2851 cursor_right, 64);
2852
2853BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
2854 struct btrfs_dev_replace_item, src_devid, 64);
2855BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
2856 struct btrfs_dev_replace_item,
2857 cont_reading_from_srcdev_mode, 64);
2858BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
2859 struct btrfs_dev_replace_item, replace_state, 64);
2860BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
2861 struct btrfs_dev_replace_item, time_started, 64);
2862BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
2863 struct btrfs_dev_replace_item, time_stopped, 64);
2864BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
2865 struct btrfs_dev_replace_item, num_write_errors, 64);
2866BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
2867 struct btrfs_dev_replace_item,
2868 num_uncorrectable_read_errors, 64);
2869BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
2870 struct btrfs_dev_replace_item, cursor_left, 64);
2871BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
2872 struct btrfs_dev_replace_item, cursor_right, 64);
2873
2758static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 2874static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2759{ 2875{
2760 return sb->s_fs_info; 2876 return sb->s_fs_info;
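
Editor's note: the BTRFS_SETGET_FUNCS/BTRFS_SETGET_STACK_FUNCS invocations above generate endian-safe accessors for every on-disk field of the new dev-replace item. A compressed userspace sketch of the macro technique with a toy struct (endian conversion elided; the kernel versions go through le64_to_cpu/cpu_to_le64):

        #include <stdio.h>
        #include <stdint.h>

        /* Toy on-disk struct standing in for btrfs_dev_replace_item. */
        struct disk_item {
                uint64_t src_devid;
                uint64_t cursor_left;
        };

        /* Generate a get/set pair per field, the same trick as
         * BTRFS_SETGET_STACK_FUNCS. */
        #define SETGET_FUNCS(name, type, member)                                \
        static uint64_t get_##name(const type *s) { return s->member; }         \
        static void set_##name(type *s, uint64_t v) { s->member = v; }

        SETGET_FUNCS(src_devid, struct disk_item, src_devid)
        SETGET_FUNCS(cursor_left, struct disk_item, cursor_left)

        int main(void)
        {
                struct disk_item it;

                set_src_devid(&it, 1);
                set_cursor_left(&it, 4096);
                printf("%llu %llu\n",
                       (unsigned long long)get_src_devid(&it),
                       (unsigned long long)get_cursor_left(&it));
                return 0;
        }
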
@@ -2900,6 +3016,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
2900u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 3016u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2901u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 3017u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2902void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 3018void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
3019
3020enum btrfs_reserve_flush_enum {
 3021 /* If we are in the transaction, we can't flush anything. */
3022 BTRFS_RESERVE_NO_FLUSH,
3023 /*
 3024 * Flushing delalloc may cause a deadlock somewhere; in that
 3025 * case, use FLUSH_LIMIT.
3026 */
3027 BTRFS_RESERVE_FLUSH_LIMIT,
3028 BTRFS_RESERVE_FLUSH_ALL,
3029};
3030
2903int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 3031int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2904void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 3032void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2905void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 3033void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
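
Editor's note: the new enum replaces the *_noflush function variants with an explicit argument — callers state how aggressively the reservation code may reclaim space. A small illustrative mapping from caller context to flush level (the two context flags are assumptions; the enum values are the ones added above):

        /* Illustrative mapping from caller context to flush level. */
        enum btrfs_reserve_flush_enum {
                BTRFS_RESERVE_NO_FLUSH,
                BTRFS_RESERVE_FLUSH_LIMIT,
                BTRFS_RESERVE_FLUSH_ALL,
        };

        static enum btrfs_reserve_flush_enum
        pick_flush(int in_transaction, int in_delalloc_path)
        {
                if (in_transaction)
                        return BTRFS_RESERVE_NO_FLUSH;    /* can't flush here */
                if (in_delalloc_path)
                        return BTRFS_RESERVE_FLUSH_LIMIT; /* avoid deadlock */
                return BTRFS_RESERVE_FLUSH_ALL;
        }
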
@@ -2919,19 +3047,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
2919void btrfs_free_block_rsv(struct btrfs_root *root, 3047void btrfs_free_block_rsv(struct btrfs_root *root,
2920 struct btrfs_block_rsv *rsv); 3048 struct btrfs_block_rsv *rsv);
2921int btrfs_block_rsv_add(struct btrfs_root *root, 3049int btrfs_block_rsv_add(struct btrfs_root *root,
2922 struct btrfs_block_rsv *block_rsv, 3050 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
2923 u64 num_bytes); 3051 enum btrfs_reserve_flush_enum flush);
2924int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2925 struct btrfs_block_rsv *block_rsv,
2926 u64 num_bytes);
2927int btrfs_block_rsv_check(struct btrfs_root *root, 3052int btrfs_block_rsv_check(struct btrfs_root *root,
2928 struct btrfs_block_rsv *block_rsv, int min_factor); 3053 struct btrfs_block_rsv *block_rsv, int min_factor);
2929int btrfs_block_rsv_refill(struct btrfs_root *root, 3054int btrfs_block_rsv_refill(struct btrfs_root *root,
2930 struct btrfs_block_rsv *block_rsv, 3055 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
2931 u64 min_reserved); 3056 enum btrfs_reserve_flush_enum flush);
2932int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
2933 struct btrfs_block_rsv *block_rsv,
2934 u64 min_reserved);
2935int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3057int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2936 struct btrfs_block_rsv *dst_rsv, 3058 struct btrfs_block_rsv *dst_rsv,
2937 u64 num_bytes); 3059 u64 num_bytes);
@@ -2955,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
2955int btrfs_init_space_info(struct btrfs_fs_info *fs_info); 3077int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
2956int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3078int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2957 struct btrfs_fs_info *fs_info); 3079 struct btrfs_fs_info *fs_info);
3080int __get_raid_index(u64 flags);
2958/* ctree.c */ 3081/* ctree.c */
2959int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3082int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2960 int level, int *slot); 3083 int level, int *slot);
@@ -3065,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
3065} 3188}
3066 3189
3067int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 3190int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
3191int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
3192 struct btrfs_root *root, struct btrfs_path *path,
3193 int del);
3068int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, 3194int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
3069 u64 time_seq); 3195 u64 time_seq);
3070static inline int btrfs_next_old_item(struct btrfs_root *root, 3196static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3157,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
3157 struct btrfs_root *root); 3283 struct btrfs_root *root);
3158 3284
3159/* dir-item.c */ 3285/* dir-item.c */
3286int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
3287 const char *name, int name_len);
3160int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 3288int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
3161 struct btrfs_root *root, const char *name, 3289 struct btrfs_root *root, const char *name,
3162 int name_len, struct inode *dir, 3290 int name_len, struct inode *dir,
@@ -3256,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
3256 struct btrfs_root *root, 3384 struct btrfs_root *root,
3257 struct btrfs_path *path, u64 objectid, 3385 struct btrfs_path *path, u64 objectid,
3258 u64 bytenr, int mod); 3386 u64 bytenr, int mod);
3387u64 btrfs_file_extent_length(struct btrfs_path *path);
3259int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 3388int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
3260 struct btrfs_root *root, 3389 struct btrfs_root *root,
3261 struct btrfs_ordered_sum *sums); 3390 struct btrfs_ordered_sum *sums);
@@ -3271,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
3271int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 3400int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
3272 struct list_head *list, int search_commit); 3401 struct list_head *list, int search_commit);
3273/* inode.c */ 3402/* inode.c */
3403struct btrfs_delalloc_work {
3404 struct inode *inode;
3405 int wait;
3406 int delay_iput;
3407 struct completion completion;
3408 struct list_head list;
3409 struct btrfs_work work;
3410};
3411
3412struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
3413 int wait, int delay_iput);
3414void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
3415
3274struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, 3416struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
3275 size_t pg_offset, u64 start, u64 len, 3417 size_t pg_offset, u64 start, u64 len,
3276 int create); 3418 int create);
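
Editor's note: the btrfs_delalloc_work helpers wrap per-inode flushing in a btrfs_work plus a completion, so callers can fan work out to the new flush_workers pool and then block until it finishes. A hedged usage sketch (queueing via btrfs_queue_worker matches how contemporaneous callers use worker pools, but treat the exact call as an assumption here):

        struct btrfs_delalloc_work *work;

        work = btrfs_alloc_delalloc_work(inode, 1 /* wait */, 0 /* delay_iput */);
        if (!work)
                return -ENOMEM;
        btrfs_queue_worker(&root->fs_info->flush_workers, &work->work);
        btrfs_wait_and_free_delalloc_work(work); /* blocks on work->completion */
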
@@ -3370,9 +3512,12 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
3370 struct btrfs_ioctl_space_info *space); 3512 struct btrfs_ioctl_space_info *space);
3371 3513
3372/* file.c */ 3514/* file.c */
3515int btrfs_auto_defrag_init(void);
3516void btrfs_auto_defrag_exit(void);
3373int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 3517int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
3374 struct inode *inode); 3518 struct inode *inode);
3375int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); 3519int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
3520void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
3376int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); 3521int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
3377void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 3522void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
3378 int skip_pinned); 3523 int skip_pinned);
@@ -3519,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
3519 struct btrfs_pending_snapshot *pending); 3664 struct btrfs_pending_snapshot *pending);
3520 3665
3521/* scrub.c */ 3666/* scrub.c */
3522int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 3667int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3523 struct btrfs_scrub_progress *progress, int readonly); 3668 u64 end, struct btrfs_scrub_progress *progress,
3669 int readonly, int is_dev_replace);
3524void btrfs_scrub_pause(struct btrfs_root *root); 3670void btrfs_scrub_pause(struct btrfs_root *root);
3525void btrfs_scrub_pause_super(struct btrfs_root *root); 3671void btrfs_scrub_pause_super(struct btrfs_root *root);
3526void btrfs_scrub_continue(struct btrfs_root *root); 3672void btrfs_scrub_continue(struct btrfs_root *root);
3527void btrfs_scrub_continue_super(struct btrfs_root *root); 3673void btrfs_scrub_continue_super(struct btrfs_root *root);
3528int __btrfs_scrub_cancel(struct btrfs_fs_info *info); 3674int btrfs_scrub_cancel(struct btrfs_fs_info *info);
3529int btrfs_scrub_cancel(struct btrfs_root *root); 3675int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
3530int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev); 3676 struct btrfs_device *dev);
3531int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); 3677int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
3532int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 3678int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3533 struct btrfs_scrub_progress *progress); 3679 struct btrfs_scrub_progress *progress);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 478f66bdc57b..34836036f01b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata(
651 */ 651 */
652 if (!src_rsv || (!trans->bytes_reserved && 652 if (!src_rsv || (!trans->bytes_reserved &&
653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { 653 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
654 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 654 ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
655 BTRFS_RESERVE_NO_FLUSH);
655 /* 656 /*
656 * Since we're under a transaction reserve_metadata_bytes could 657 * Since we're under a transaction reserve_metadata_bytes could
657 * try to commit the transaction which will make it return 658 * try to commit the transaction which will make it return
@@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
686 * reserve something strictly for us. If not be a pain and try 687 * reserve something strictly for us. If not be a pain and try
687 * to steal from the delalloc block rsv. 688 * to steal from the delalloc block rsv.
688 */ 689 */
689 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); 690 ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
691 BTRFS_RESERVE_NO_FLUSH);
690 if (!ret) 692 if (!ret)
691 goto out; 693 goto out;
692 694
@@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1255 struct btrfs_delayed_node *delayed_node = NULL; 1257 struct btrfs_delayed_node *delayed_node = NULL;
1256 struct btrfs_root *root; 1258 struct btrfs_root *root;
1257 struct btrfs_block_rsv *block_rsv; 1259 struct btrfs_block_rsv *block_rsv;
1258 unsigned long nr = 0;
1259 int need_requeue = 0; 1260 int need_requeue = 0;
1260 int ret; 1261 int ret;
1261 1262
@@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1316 delayed_node); 1317 delayed_node);
1317 mutex_unlock(&delayed_node->mutex); 1318 mutex_unlock(&delayed_node->mutex);
1318 1319
1319 nr = trans->blocks_used;
1320
1321 trans->block_rsv = block_rsv; 1320 trans->block_rsv = block_rsv;
1322 btrfs_end_transaction_dmeta(trans, root); 1321 btrfs_end_transaction_dmeta(trans, root);
1323 __btrfs_btree_balance_dirty(root, nr); 1322 btrfs_btree_balance_dirty_nodelay(root);
1324free_path: 1323free_path:
1325 btrfs_free_path(path); 1324 btrfs_free_path(path);
1326out: 1325out:
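
Editor's note: both hunks show the mechanical conversion — every btrfs_block_rsv_add_noflush(root, rsv, bytes) call becomes btrfs_block_rsv_add(root, rsv, bytes, BTRFS_RESERVE_NO_FLUSH). A standalone model of why the mode matters, with stubbed reservation logic (the threshold is invented):

        #include <stdio.h>

        enum flush_mode { NO_FLUSH, FLUSH_LIMIT, FLUSH_ALL }; /* cf. enum above */

        /* One function with an explicit mode replaces the old pair
         * (btrfs_block_rsv_add + btrfs_block_rsv_add_noflush); the
         * reservation logic here is a stub for illustration. */
        static int rsv_add(unsigned long long num_bytes, enum flush_mode flush)
        {
                if (flush == NO_FLUSH)
                        return num_bytes <= 4096 ? 0 : -28; /* -ENOSPC: no reclaim allowed */
                return 0;       /* pretend flushing always finds space */
        }

        int main(void)
        {
                printf("%d\n", rsv_add(8192, NO_FLUSH));  /* fails without reclaim */
                printf("%d\n", rsv_add(8192, FLUSH_ALL)); /* reclaim path succeeds */
                return 0;
        }
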
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 000000000000..66dbc8dbddf7
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/slab.h>
21#include <linux/buffer_head.h>
22#include <linux/blkdev.h>
23#include <linux/random.h>
24#include <linux/iocontext.h>
25#include <linux/capability.h>
26#include <linux/kthread.h>
27#include <linux/math64.h>
28#include <asm/div64.h>
29#include "compat.h"
30#include "ctree.h"
31#include "extent_map.h"
32#include "disk-io.h"
33#include "transaction.h"
34#include "print-tree.h"
35#include "volumes.h"
36#include "async-thread.h"
37#include "check-integrity.h"
38#include "rcu-string.h"
39#include "dev-replace.h"
40
41static u64 btrfs_get_seconds_since_1970(void);
42static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
43 int scrub_ret);
44static void btrfs_dev_replace_update_device_in_mapping_tree(
45 struct btrfs_fs_info *fs_info,
46 struct btrfs_device *srcdev,
47 struct btrfs_device *tgtdev);
48static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
49 char *srcdev_name,
50 struct btrfs_device **device);
51static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
52static int btrfs_dev_replace_kthread(void *data);
53static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
54
55
56int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
57{
58 struct btrfs_key key;
59 struct btrfs_root *dev_root = fs_info->dev_root;
60 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
61 struct extent_buffer *eb;
62 int slot;
63 int ret = 0;
64 struct btrfs_path *path = NULL;
65 int item_size;
66 struct btrfs_dev_replace_item *ptr;
67 u64 src_devid;
68
69 path = btrfs_alloc_path();
70 if (!path) {
71 ret = -ENOMEM;
72 goto out;
73 }
74
75 key.objectid = 0;
76 key.type = BTRFS_DEV_REPLACE_KEY;
77 key.offset = 0;
78 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
79 if (ret) {
80no_valid_dev_replace_entry_found:
81 ret = 0;
82 dev_replace->replace_state =
83 BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
84 dev_replace->cont_reading_from_srcdev_mode =
85 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
86 dev_replace->replace_state = 0;
87 dev_replace->time_started = 0;
88 dev_replace->time_stopped = 0;
89 atomic64_set(&dev_replace->num_write_errors, 0);
90 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
91 dev_replace->cursor_left = 0;
92 dev_replace->committed_cursor_left = 0;
93 dev_replace->cursor_left_last_write_of_item = 0;
94 dev_replace->cursor_right = 0;
95 dev_replace->srcdev = NULL;
96 dev_replace->tgtdev = NULL;
97 dev_replace->is_valid = 0;
98 dev_replace->item_needs_writeback = 0;
99 goto out;
100 }
101 slot = path->slots[0];
102 eb = path->nodes[0];
103 item_size = btrfs_item_size_nr(eb, slot);
104 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
105
106 if (item_size != sizeof(struct btrfs_dev_replace_item)) {
 107 pr_warn("btrfs: dev_replace entry found has unexpected size, ignoring entry\n");
108 goto no_valid_dev_replace_entry_found;
109 }
110
111 src_devid = btrfs_dev_replace_src_devid(eb, ptr);
112 dev_replace->cont_reading_from_srcdev_mode =
113 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
114 dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
115 dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
116 dev_replace->time_stopped =
117 btrfs_dev_replace_time_stopped(eb, ptr);
118 atomic64_set(&dev_replace->num_write_errors,
119 btrfs_dev_replace_num_write_errors(eb, ptr));
120 atomic64_set(&dev_replace->num_uncorrectable_read_errors,
121 btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
122 dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
123 dev_replace->committed_cursor_left = dev_replace->cursor_left;
124 dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
125 dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
126 dev_replace->is_valid = 1;
127
128 dev_replace->item_needs_writeback = 0;
129 switch (dev_replace->replace_state) {
130 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
131 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
132 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
133 dev_replace->srcdev = NULL;
134 dev_replace->tgtdev = NULL;
135 break;
136 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
137 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
138 dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
139 NULL, NULL);
140 dev_replace->tgtdev = btrfs_find_device(fs_info,
141 BTRFS_DEV_REPLACE_DEVID,
142 NULL, NULL);
143 /*
144 * allow 'btrfs dev replace_cancel' if src/tgt device is
145 * missing
146 */
147 if (!dev_replace->srcdev &&
148 !btrfs_test_opt(dev_root, DEGRADED)) {
149 ret = -EIO;
150 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
151 (unsigned long long)src_devid);
152 }
153 if (!dev_replace->tgtdev &&
154 !btrfs_test_opt(dev_root, DEGRADED)) {
155 ret = -EIO;
156 pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
157 (unsigned long long)BTRFS_DEV_REPLACE_DEVID);
158 }
159 if (dev_replace->tgtdev) {
160 if (dev_replace->srcdev) {
161 dev_replace->tgtdev->total_bytes =
162 dev_replace->srcdev->total_bytes;
163 dev_replace->tgtdev->disk_total_bytes =
164 dev_replace->srcdev->disk_total_bytes;
165 dev_replace->tgtdev->bytes_used =
166 dev_replace->srcdev->bytes_used;
167 }
168 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
169 btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
170 dev_replace->tgtdev);
171 }
172 break;
173 }
174
175out:
176 if (path)
177 btrfs_free_path(path);
178 return ret;
179}
180
181/*
182 * called from commit_transaction. Writes changed device replace state to
183 * disk.
184 */
185int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
186 struct btrfs_fs_info *fs_info)
187{
188 int ret;
189 struct btrfs_root *dev_root = fs_info->dev_root;
190 struct btrfs_path *path;
191 struct btrfs_key key;
192 struct extent_buffer *eb;
193 struct btrfs_dev_replace_item *ptr;
194 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
195
196 btrfs_dev_replace_lock(dev_replace);
197 if (!dev_replace->is_valid ||
198 !dev_replace->item_needs_writeback) {
199 btrfs_dev_replace_unlock(dev_replace);
200 return 0;
201 }
202 btrfs_dev_replace_unlock(dev_replace);
203
204 key.objectid = 0;
205 key.type = BTRFS_DEV_REPLACE_KEY;
206 key.offset = 0;
207
208 path = btrfs_alloc_path();
209 if (!path) {
210 ret = -ENOMEM;
211 goto out;
212 }
213 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
214 if (ret < 0) {
215 pr_warn("btrfs: error %d while searching for dev_replace item!\n",
216 ret);
217 goto out;
218 }
219
220 if (ret == 0 &&
221 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
222 /*
223 * need to delete old one and insert a new one.
224 * Since no attempt is made to recover any old state, if the
225 * dev_replace state is 'running', the data on the target
226 * drive is lost.
227 * It would be possible to recover the state: just make sure
228 * that the beginning of the item is never changed and always
229 * contains all the essential information. Then read this
230 * minimal set of information and use it as a base for the
231 * new state.
232 */
233 ret = btrfs_del_item(trans, dev_root, path);
234 if (ret != 0) {
235 pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
236 ret);
237 goto out;
238 }
239 ret = 1;
240 }
241
242 if (ret == 1) {
243 /* need to insert a new item */
244 btrfs_release_path(path);
245 ret = btrfs_insert_empty_item(trans, dev_root, path,
246 &key, sizeof(*ptr));
247 if (ret < 0) {
248 pr_warn("btrfs: insert dev_replace item failed %d!\n",
249 ret);
250 goto out;
251 }
252 }
253
254 eb = path->nodes[0];
255 ptr = btrfs_item_ptr(eb, path->slots[0],
256 struct btrfs_dev_replace_item);
257
258 btrfs_dev_replace_lock(dev_replace);
259 if (dev_replace->srcdev)
260 btrfs_set_dev_replace_src_devid(eb, ptr,
261 dev_replace->srcdev->devid);
262 else
263 btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
264 btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
265 dev_replace->cont_reading_from_srcdev_mode);
266 btrfs_set_dev_replace_replace_state(eb, ptr,
267 dev_replace->replace_state);
268 btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
269 btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
270 btrfs_set_dev_replace_num_write_errors(eb, ptr,
271 atomic64_read(&dev_replace->num_write_errors));
272 btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
273 atomic64_read(&dev_replace->num_uncorrectable_read_errors));
274 dev_replace->cursor_left_last_write_of_item =
275 dev_replace->cursor_left;
276 btrfs_set_dev_replace_cursor_left(eb, ptr,
277 dev_replace->cursor_left_last_write_of_item);
278 btrfs_set_dev_replace_cursor_right(eb, ptr,
279 dev_replace->cursor_right);
280 dev_replace->item_needs_writeback = 0;
281 btrfs_dev_replace_unlock(dev_replace);
282
283 btrfs_mark_buffer_dirty(eb);
284
285out:
286 btrfs_free_path(path);
287
288 return ret;
289}
290
291void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
292{
293 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
294
295 dev_replace->committed_cursor_left =
296 dev_replace->cursor_left_last_write_of_item;
297}
298
299static u64 btrfs_get_seconds_since_1970(void)
300{
301 struct timespec t = CURRENT_TIME_SEC;
302
303 return t.tv_sec;
304}
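/*
 * Editorial aside (illustrative, not part of the patch): time_started
 * and time_stopped are stored as plain seconds since the epoch, so a
 * consumer can derive an elapsed time with no conversion.  A minimal
 * sketch, assuming a populated dev_replace:
 *
 *	u64 elapsed = dev_replace->time_stopped -
 *		      dev_replace->time_started;
 *	pr_info("btrfs: replace ran for %llu seconds\n", elapsed);
 */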
305
306int btrfs_dev_replace_start(struct btrfs_root *root,
307 struct btrfs_ioctl_dev_replace_args *args)
308{
309 struct btrfs_trans_handle *trans;
310 struct btrfs_fs_info *fs_info = root->fs_info;
311 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
312 int ret;
313 struct btrfs_device *tgt_device = NULL;
314 struct btrfs_device *src_device = NULL;
315
316 switch (args->start.cont_reading_from_srcdev_mode) {
317 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
318 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
319 break;
320 default:
321 return -EINVAL;
322 }
323
324 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
325 args->start.tgtdev_name[0] == '\0')
326 return -EINVAL;
327
328 mutex_lock(&fs_info->volume_mutex);
329 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
330 &tgt_device);
331 if (ret) {
332 pr_err("btrfs: target device %s is invalid!\n",
333 args->start.tgtdev_name);
334 mutex_unlock(&fs_info->volume_mutex);
335 return -EINVAL;
336 }
337
338 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
339 args->start.srcdev_name,
340 &src_device);
341 mutex_unlock(&fs_info->volume_mutex);
342 if (ret) {
343 ret = -EINVAL;
344 goto leave_no_lock;
345 }
346
347 if (tgt_device->total_bytes < src_device->total_bytes) {
348 pr_err("btrfs: target device is smaller than source device!\n");
349 ret = -EINVAL;
350 goto leave_no_lock;
351 }
352
353 btrfs_dev_replace_lock(dev_replace);
354 switch (dev_replace->replace_state) {
355 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
356 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
357 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
358 break;
359 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
360 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
361 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
362 goto leave;
363 }
364
365 dev_replace->cont_reading_from_srcdev_mode =
366 args->start.cont_reading_from_srcdev_mode;
367 WARN_ON(!src_device);
368 dev_replace->srcdev = src_device;
369 WARN_ON(!tgt_device);
370 dev_replace->tgtdev = tgt_device;
371
372 printk_in_rcu(KERN_INFO
373 "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
374 src_device->missing ? "<missing disk>" :
375 rcu_str_deref(src_device->name),
376 src_device->devid,
377 rcu_str_deref(tgt_device->name));
378
379 tgt_device->total_bytes = src_device->total_bytes;
380 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
381 tgt_device->bytes_used = src_device->bytes_used;
382
383 /*
384 * from now on, the writes to the srcdev are all duplicated to
385 * go to the tgtdev as well (refer to btrfs_map_block()).
386 */
387 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
388 dev_replace->time_started = btrfs_get_seconds_since_1970();
389 dev_replace->cursor_left = 0;
390 dev_replace->committed_cursor_left = 0;
391 dev_replace->cursor_left_last_write_of_item = 0;
392 dev_replace->cursor_right = 0;
393 dev_replace->is_valid = 1;
394 dev_replace->item_needs_writeback = 1;
395 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
396 btrfs_dev_replace_unlock(dev_replace);
397
398 btrfs_wait_ordered_extents(root, 0);
399
400 /* force writing the updated state information to disk */
401 trans = btrfs_start_transaction(root, 0);
402 if (IS_ERR(trans)) {
403 ret = PTR_ERR(trans);
404 btrfs_dev_replace_lock(dev_replace);
405 goto leave;
406 }
407
408 ret = btrfs_commit_transaction(trans, root);
409 WARN_ON(ret);
410
411 /* the disk copy procedure reuses the scrub code */
412 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
413 src_device->total_bytes,
414 &dev_replace->scrub_progress, 0, 1);
415
416 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
417 WARN_ON(ret);
418
419 return 0;
420
421leave:
422 dev_replace->srcdev = NULL;
423 dev_replace->tgtdev = NULL;
424 btrfs_dev_replace_unlock(dev_replace);
425leave_no_lock:
426 if (tgt_device)
427 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
428 return ret;
429}
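/*
 * Editorial aside: a minimal userspace sketch of driving the start
 * path above.  It assumes the BTRFS_IOC_DEV_REPLACE ioctl number, the
 * args.cmd field, and the BTRFS_IOCTL_DEV_REPLACE_CMD_START constant
 * from ioctl.h in this series; error handling is trimmed.
 *
 *	struct btrfs_ioctl_dev_replace_args args;
 *	int fd = open("/mnt", O_RDONLY);
 *
 *	memset(&args, 0, sizeof(args));
 *	args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_START;
 *	args.start.cont_reading_from_srcdev_mode =
 *		BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID;
 *	strcpy((char *)args.start.srcdev_name, "/dev/sdb");
 *	strcpy((char *)args.start.tgtdev_name, "/dev/sdc");
 *	if (ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args) == 0 &&
 *	    args.result == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
 *		puts("replace started");
 */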
430
431static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
432 int scrub_ret)
433{
434 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
435 struct btrfs_device *tgt_device;
436 struct btrfs_device *src_device;
437 struct btrfs_root *root = fs_info->tree_root;
438 u8 uuid_tmp[BTRFS_UUID_SIZE];
439 struct btrfs_trans_handle *trans;
440 int ret = 0;
441
442 /* don't allow cancel or unmount to disturb the finishing procedure */
443 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
444
445 btrfs_dev_replace_lock(dev_replace);
446 /* was the operation canceled, or is it finished? */
447 if (dev_replace->replace_state !=
448 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
449 btrfs_dev_replace_unlock(dev_replace);
450 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
451 return 0;
452 }
453
454 tgt_device = dev_replace->tgtdev;
455 src_device = dev_replace->srcdev;
456 btrfs_dev_replace_unlock(dev_replace);
457
458 /* replace old device with new one in mapping tree */
459 if (!scrub_ret)
460 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
461 src_device,
462 tgt_device);
463
464 /*
465 * flush all outstanding I/O and inode extent mappings before the
466	 * copy operation is declared finished
467 */
468 btrfs_start_delalloc_inodes(root, 0);
469 btrfs_wait_ordered_extents(root, 0);
470
471 trans = btrfs_start_transaction(root, 0);
472 if (IS_ERR(trans)) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return PTR_ERR(trans);
475 }
476 ret = btrfs_commit_transaction(trans, root);
477 WARN_ON(ret);
478
479 /* keep away write_all_supers() during the finishing procedure */
480 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
481 btrfs_dev_replace_lock(dev_replace);
482 dev_replace->replace_state =
483 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
484 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
485 dev_replace->tgtdev = NULL;
486 dev_replace->srcdev = NULL;
487 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
488 dev_replace->item_needs_writeback = 1;
489
490 if (scrub_ret) {
491 printk_in_rcu(KERN_ERR
492 "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
493 src_device->missing ? "<missing disk>" :
494 rcu_str_deref(src_device->name),
495 src_device->devid,
496 rcu_str_deref(tgt_device->name), scrub_ret);
497 btrfs_dev_replace_unlock(dev_replace);
498 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
499 if (tgt_device)
500 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
501 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
502
503 return 0;
504 }
505
506 printk_in_rcu(KERN_INFO
507 "btrfs: dev_replace from %s (devid %llu) to %s) finished\n",
508 src_device->missing ? "<missing disk>" :
509 rcu_str_deref(src_device->name),
510 src_device->devid,
511 rcu_str_deref(tgt_device->name));
512 tgt_device->is_tgtdev_for_dev_replace = 0;
513 tgt_device->devid = src_device->devid;
514 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
515 tgt_device->bytes_used = src_device->bytes_used;
516 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
517 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
518 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
519 tgt_device->total_bytes = src_device->total_bytes;
520 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
521 tgt_device->bytes_used = src_device->bytes_used;
522 if (fs_info->sb->s_bdev == src_device->bdev)
523 fs_info->sb->s_bdev = tgt_device->bdev;
524 if (fs_info->fs_devices->latest_bdev == src_device->bdev)
525 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
526 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
527
528 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
529 if (src_device->bdev) {
530 /* zero out the old super */
531 btrfs_scratch_superblock(src_device);
532 }
533 /*
534 * this is again a consistent state where no dev_replace procedure
535 * is running, the target device is part of the filesystem, the
536 * source device is not part of the filesystem anymore and its 1st
537 * superblock is scratched out so that it is no longer marked to
538 * belong to this filesystem.
539 */
540 btrfs_dev_replace_unlock(dev_replace);
541 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
542
543 /* write back the superblocks */
544 trans = btrfs_start_transaction(root, 0);
545 if (!IS_ERR(trans))
546 btrfs_commit_transaction(trans, root);
547
548 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
549
550 return 0;
551}
552
553static void btrfs_dev_replace_update_device_in_mapping_tree(
554 struct btrfs_fs_info *fs_info,
555 struct btrfs_device *srcdev,
556 struct btrfs_device *tgtdev)
557{
558 struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
559 struct extent_map *em;
560 struct map_lookup *map;
561 u64 start = 0;
562 int i;
563
564 write_lock(&em_tree->lock);
565 do {
566 em = lookup_extent_mapping(em_tree, start, (u64)-1);
567 if (!em)
568 break;
569 map = (struct map_lookup *)em->bdev;
570 for (i = 0; i < map->num_stripes; i++)
571 if (srcdev == map->stripes[i].dev)
572 map->stripes[i].dev = tgtdev;
573 start = em->start + em->len;
574 free_extent_map(em);
575 } while (start);
576 write_unlock(&em_tree->lock);
577}
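/*
 * Editorial aside (illustrative): lookup_extent_mapping() returns the
 * mapping with an elevated refcount, which is why every iteration
 * above pairs it with free_extent_map().  Advancing with
 * start = em->start + em->len visits each chunk exactly once: with
 * chunks at [0, 1G) and [1G, 2G), the walk runs with start = 0, then
 * start = 1G, then the lookup returns NULL and the loop ends.
 */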
578
579static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
580 char *srcdev_name,
581 struct btrfs_device **device)
582{
583 int ret;
584
585 if (srcdevid) {
586 ret = 0;
587 *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
588 NULL);
589 if (!*device)
590 ret = -ENOENT;
591 } else {
592 ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
593 device);
594 }
595 return ret;
596}
597
598void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
599 struct btrfs_ioctl_dev_replace_args *args)
600{
601 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
602
603 btrfs_dev_replace_lock(dev_replace);
604	/* even if !dev_replace->is_valid, the values are good enough for
605 * the replace_status ioctl */
606 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
607 args->status.replace_state = dev_replace->replace_state;
608 args->status.time_started = dev_replace->time_started;
609 args->status.time_stopped = dev_replace->time_stopped;
610 args->status.num_write_errors =
611 atomic64_read(&dev_replace->num_write_errors);
612 args->status.num_uncorrectable_read_errors =
613 atomic64_read(&dev_replace->num_uncorrectable_read_errors);
614 switch (dev_replace->replace_state) {
615 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
616 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
617 args->status.progress_1000 = 0;
618 break;
619 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
620 args->status.progress_1000 = 1000;
621 break;
622 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
623 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
624 args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
625 div64_u64(dev_replace->srcdev->total_bytes, 1000));
626 break;
627 }
628 btrfs_dev_replace_unlock(dev_replace);
629}
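/*
 * Editorial aside (illustrative numbers): progress_1000 expresses
 * completion in tenths of a percent.  With a 1 TiB srcdev and
 * cursor_left at 256 GiB, div64_u64(256G, div64_u64(1T, 1000))
 * yields 250, i.e. 25.0%; the resume kthread below prints it as a
 * percentage after do_div(progress, 10).
 */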
630
631int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
632 struct btrfs_ioctl_dev_replace_args *args)
633{
634 args->result = __btrfs_dev_replace_cancel(fs_info);
635 return 0;
636}
637
638static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
639{
640 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
641 struct btrfs_device *tgt_device = NULL;
642 struct btrfs_trans_handle *trans;
643 struct btrfs_root *root = fs_info->tree_root;
644 u64 result;
645 int ret;
646
647 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
648 btrfs_dev_replace_lock(dev_replace);
649 switch (dev_replace->replace_state) {
650 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
651 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
652 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
653 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
654 btrfs_dev_replace_unlock(dev_replace);
655 goto leave;
656 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
657 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
658 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
659 tgt_device = dev_replace->tgtdev;
660 dev_replace->tgtdev = NULL;
661 dev_replace->srcdev = NULL;
662 break;
663 }
664 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
665 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
666 dev_replace->item_needs_writeback = 1;
667 btrfs_dev_replace_unlock(dev_replace);
668 btrfs_scrub_cancel(fs_info);
669
670 trans = btrfs_start_transaction(root, 0);
671 if (IS_ERR(trans)) {
672 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
673 return PTR_ERR(trans);
674 }
675 ret = btrfs_commit_transaction(trans, root);
676 WARN_ON(ret);
677 if (tgt_device)
678 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
679
680leave:
681 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
682 return result;
683}
684
685void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
686{
687 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
688
689 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
690 btrfs_dev_replace_lock(dev_replace);
691 switch (dev_replace->replace_state) {
692 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
693 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
694 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
695 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
696 break;
697 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
698 dev_replace->replace_state =
699 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
700 dev_replace->time_stopped = btrfs_get_seconds_since_1970();
701 dev_replace->item_needs_writeback = 1;
702 pr_info("btrfs: suspending dev_replace for unmount\n");
703 break;
704 }
705
706 btrfs_dev_replace_unlock(dev_replace);
707 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
708}
709
710/* resume dev_replace procedure that was interrupted by unmount */
711int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
712{
713 struct task_struct *task;
714 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
715
716 btrfs_dev_replace_lock(dev_replace);
717 switch (dev_replace->replace_state) {
718 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
719 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
720 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
721 btrfs_dev_replace_unlock(dev_replace);
722 return 0;
723 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
724 break;
725 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
726 dev_replace->replace_state =
727 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
728 break;
729 }
730 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
731 pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
732 "btrfs: you may cancel the operation after 'mount -o degraded'\n");
733 btrfs_dev_replace_unlock(dev_replace);
734 return 0;
735 }
736 btrfs_dev_replace_unlock(dev_replace);
737
738 WARN_ON(atomic_xchg(
739 &fs_info->mutually_exclusive_operation_running, 1));
740 task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
741 return PTR_RET(task);
742}
743
744static int btrfs_dev_replace_kthread(void *data)
745{
746 struct btrfs_fs_info *fs_info = data;
747 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
748 struct btrfs_ioctl_dev_replace_args *status_args;
749 u64 progress;
750
751 status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
752 if (status_args) {
753 btrfs_dev_replace_status(fs_info, status_args);
754 progress = status_args->status.progress_1000;
755 kfree(status_args);
756 do_div(progress, 10);
757 printk_in_rcu(KERN_INFO
758 "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
759 dev_replace->srcdev->missing ? "<missing disk>" :
760 rcu_str_deref(dev_replace->srcdev->name),
761 dev_replace->srcdev->devid,
762 dev_replace->tgtdev ?
763 rcu_str_deref(dev_replace->tgtdev->name) :
764 "<missing target disk>",
765 (unsigned int)progress);
766 }
767 btrfs_dev_replace_continue_on_mount(fs_info);
768 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
769
770 return 0;
771}
772
773static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
774{
775 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
776 int ret;
777
778 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
779 dev_replace->committed_cursor_left,
780 dev_replace->srcdev->total_bytes,
781 &dev_replace->scrub_progress, 0, 1);
782 ret = btrfs_dev_replace_finishing(fs_info, ret);
783 WARN_ON(ret);
784 return 0;
785}
786
787int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
788{
789 if (!dev_replace->is_valid)
790 return 0;
791
792 switch (dev_replace->replace_state) {
793 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
794 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
795 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
796 return 0;
797 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
798 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
799		/*
800		 * return true even if tgtdev is missing (this is
801		 * something that can happen if the dev_replace
802		 * procedure is suspended by an umount, "btrfs dev
803		 * scan" is not called for the missing tgtdev, and
804		 * the filesystem is then remounted in degraded
805		 * state). This does not stop the dev_replace
806		 * procedure. It needs to be canceled manually
807		 * if the cancellation is wanted.
808		 */
809 break;
810 }
811 return 1;
812}
813
814void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
815{
816 /* the beginning is just an optimization for the typical case */
817 if (atomic_read(&dev_replace->nesting_level) == 0) {
818acquire_lock:
819 /* this is not a nested case where the same thread
820		 * is trying to acquire the same lock twice */
821 mutex_lock(&dev_replace->lock);
822 mutex_lock(&dev_replace->lock_management_lock);
823 dev_replace->lock_owner = current->pid;
824 atomic_inc(&dev_replace->nesting_level);
825 mutex_unlock(&dev_replace->lock_management_lock);
826 return;
827 }
828
829 mutex_lock(&dev_replace->lock_management_lock);
830 if (atomic_read(&dev_replace->nesting_level) > 0 &&
831 dev_replace->lock_owner == current->pid) {
832 WARN_ON(!mutex_is_locked(&dev_replace->lock));
833 atomic_inc(&dev_replace->nesting_level);
834 mutex_unlock(&dev_replace->lock_management_lock);
835 return;
836 }
837
838 mutex_unlock(&dev_replace->lock_management_lock);
839 goto acquire_lock;
840}
841
842void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
843{
844 WARN_ON(!mutex_is_locked(&dev_replace->lock));
845 mutex_lock(&dev_replace->lock_management_lock);
846 WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
847 WARN_ON(dev_replace->lock_owner != current->pid);
848 atomic_dec(&dev_replace->nesting_level);
849 if (atomic_read(&dev_replace->nesting_level) == 0) {
850 dev_replace->lock_owner = 0;
851 mutex_unlock(&dev_replace->lock_management_lock);
852 mutex_unlock(&dev_replace->lock);
853 } else {
854 mutex_unlock(&dev_replace->lock_management_lock);
855 }
856}
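/*
 * Editorial aside: btrfs_dev_replace_lock/unlock above hand-roll a
 * recursive mutex: lock_management_lock guards (lock_owner,
 * nesting_level), and only the first acquisition by a task takes the
 * real mutex.  A minimal userspace sketch of the same pattern,
 * assuming pthreads (illustrative only, not part of the patch):
 */
#include <pthread.h>

struct rec_lock {
	pthread_mutex_t lock;	/* the lock callers care about */
	pthread_mutex_t mgmt;	/* guards owner + nesting */
	pthread_t owner;
	int nesting;
};

static void rec_lock_acquire(struct rec_lock *l)
{
	pthread_mutex_lock(&l->mgmt);
	if (l->nesting > 0 && pthread_equal(l->owner, pthread_self())) {
		l->nesting++;	/* nested: we already hold l->lock */
		pthread_mutex_unlock(&l->mgmt);
		return;
	}
	pthread_mutex_unlock(&l->mgmt);
	pthread_mutex_lock(&l->lock);	/* first acquisition */
	pthread_mutex_lock(&l->mgmt);
	l->owner = pthread_self();
	l->nesting = 1;
	pthread_mutex_unlock(&l->mgmt);
}

static void rec_lock_release(struct rec_lock *l)
{
	pthread_mutex_lock(&l->mgmt);
	if (--l->nesting == 0)
		pthread_mutex_unlock(&l->lock);	/* last unlock drops it */
	pthread_mutex_unlock(&l->mgmt);
}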
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644
index 000000000000..20035cbbf021
--- /dev/null
+++ b/fs/btrfs/dev-replace.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (C) STRATO AG 2012. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#if !defined(__BTRFS_DEV_REPLACE__)
20#define __BTRFS_DEV_REPLACE__
21
22struct btrfs_ioctl_dev_replace_args;
23
24int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
25int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
26 struct btrfs_fs_info *fs_info);
27void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
28int btrfs_dev_replace_start(struct btrfs_root *root,
29 struct btrfs_ioctl_dev_replace_args *args);
30void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
31 struct btrfs_ioctl_dev_replace_args *args);
32int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
33 struct btrfs_ioctl_dev_replace_args *args);
34void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
35int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
36int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
37void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
38void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
39
40static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
41{
42 atomic64_inc(stat_value);
43}
44#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c1a074d0696f..502c2158167c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
213 return btrfs_match_dir_item_name(root, path, name, name_len); 213 return btrfs_match_dir_item_name(root, path, name, name_len);
214} 214}
215 215
216int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
217 const char *name, int name_len)
218{
219 int ret;
220 struct btrfs_key key;
221 struct btrfs_dir_item *di;
222 int data_size;
223 struct extent_buffer *leaf;
224 int slot;
225 struct btrfs_path *path;
226
227
228 path = btrfs_alloc_path();
229 if (!path)
230 return -ENOMEM;
231
232 key.objectid = dir;
233 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
234 key.offset = btrfs_name_hash(name, name_len);
235
236 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
237
238	/* return any errors */
239 if (ret < 0)
240 goto out;
241
242 /* nothing found, we're safe */
243 if (ret > 0) {
244 ret = 0;
245 goto out;
246 }
247
248 /* we found an item, look for our name in the item */
249 di = btrfs_match_dir_item_name(root, path, name, name_len);
250 if (di) {
251 /* our exact name was found */
252 ret = -EEXIST;
253 goto out;
254 }
255
256 /*
257 * see if there is room in the item to insert this
258 * name
259 */
260 data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
261 leaf = path->nodes[0];
262 slot = path->slots[0];
263 if (data_size + btrfs_item_size_nr(leaf, slot) +
264 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
265 ret = -EOVERFLOW;
266 } else {
267 /* plenty of insertion room */
268 ret = 0;
269 }
270out:
271 btrfs_free_path(path);
272 return ret;
273}
274
216/* 275/*
217 * lookup a directory item based on index. 'dir' is the objectid 276 * lookup a directory item based on index. 'dir' is the objectid
218 * we're searching in, and 'mod' tells us if you plan on deleting the 277 * we're searching in, and 'mod' tells us if you plan on deleting the
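/*
 * Editorial aside on btrfs_check_dir_item_collision() above
 * (illustrative): it lets a caller find out, before starting any
 * modification, whether inserting (name, dir) would either collide
 * exactly (-EEXIST) or no longer fit next to the colliding hash item
 * in the leaf (-EOVERFLOW).  The space test mirrors the insert path:
 * payload (struct btrfs_dir_item + name_len) plus per-slot
 * struct btrfs_item overhead, checked against
 * BTRFS_LEAF_DATA_SIZE(root).
 */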
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7cda51995c1e..a8f652dc940b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -45,6 +45,7 @@
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h"
48 49
49#ifdef CONFIG_X86 50#ifdef CONFIG_X86
50#include <asm/cpufeature.h> 51#include <asm/cpufeature.h>
@@ -387,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
387 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) 388 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
388 break; 389 break;
389 390
390 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 391 num_copies = btrfs_num_copies(root->fs_info,
391 eb->start, eb->len); 392 eb->start, eb->len);
392 if (num_copies == 1) 393 if (num_copies == 1)
393 break; 394 break;
@@ -852,11 +853,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
852 int mirror_num, unsigned long bio_flags, 853 int mirror_num, unsigned long bio_flags,
853 u64 bio_offset) 854 u64 bio_offset)
854{ 855{
856 int ret;
857
855 /* 858 /*
856 * when we're called for a write, we're already in the async 859 * when we're called for a write, we're already in the async
857 * submission context. Just jump into btrfs_map_bio 860 * submission context. Just jump into btrfs_map_bio
858 */ 861 */
859 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); 862 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
863 if (ret)
864 bio_endio(bio, ret);
865 return ret;
860} 866}
861 867
862static int check_async_write(struct inode *inode, unsigned long bio_flags) 868static int check_async_write(struct inode *inode, unsigned long bio_flags)
@@ -878,7 +884,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
878 int ret; 884 int ret;
879 885
880 if (!(rw & REQ_WRITE)) { 886 if (!(rw & REQ_WRITE)) {
881
882 /* 887 /*
883 * called for a read, do the setup so that checksum validation 888 * called for a read, do the setup so that checksum validation
884 * can happen in the async kernel threads 889 * can happen in the async kernel threads
@@ -886,26 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
886 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 891 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
887 bio, 1); 892 bio, 1);
888 if (ret) 893 if (ret)
889 return ret; 894 goto out_w_error;
890 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 895 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
891 mirror_num, 0); 896 mirror_num, 0);
892 } else if (!async) { 897 } else if (!async) {
893 ret = btree_csum_one_bio(bio); 898 ret = btree_csum_one_bio(bio);
894 if (ret) 899 if (ret)
895 return ret; 900 goto out_w_error;
896 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 901 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
897 mirror_num, 0); 902 mirror_num, 0);
903 } else {
904 /*
905 * kthread helpers are used to submit writes so that
906 * checksumming can happen in parallel across all CPUs
907 */
908 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
909 inode, rw, bio, mirror_num, 0,
910 bio_offset,
911 __btree_submit_bio_start,
912 __btree_submit_bio_done);
898 } 913 }
899 914
900 /* 915 if (ret) {
901 * kthread helpers are used to submit writes so that checksumming 916out_w_error:
902 * can happen in parallel across all CPUs 917 bio_endio(bio, ret);
903 */ 918 }
904 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 919 return ret;
905 inode, rw, bio, mirror_num, 0,
906 bio_offset,
907 __btree_submit_bio_start,
908 __btree_submit_bio_done);
909} 920}
910 921
911#ifdef CONFIG_MIGRATION 922#ifdef CONFIG_MIGRATION
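/*
 * Editorial aside (illustrative): the rework above makes every
 * early-error path in the submission hook complete the bio itself,
 * i.e.
 *
 *	ret = btrfs_bio_wq_end_io(...);
 *	if (ret)
 *		goto out_w_error;	// ends with bio_endio(bio, ret)
 *
 * Without that, a failed submission would just return the error while
 * the waiter on the bio slept forever, since nothing else would ever
 * call bio_endio() on it.
 */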
@@ -990,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
990 1001
991static int btree_set_page_dirty(struct page *page) 1002static int btree_set_page_dirty(struct page *page)
992{ 1003{
1004#ifdef DEBUG
993 struct extent_buffer *eb; 1005 struct extent_buffer *eb;
994 1006
995 BUG_ON(!PagePrivate(page)); 1007 BUG_ON(!PagePrivate(page));
@@ -998,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
998 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 1010 BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
999 BUG_ON(!atomic_read(&eb->refs)); 1011 BUG_ON(!atomic_read(&eb->refs));
1000 btrfs_assert_tree_locked(eb); 1012 btrfs_assert_tree_locked(eb);
1013#endif
1001 return __set_page_dirty_nobuffers(page); 1014 return __set_page_dirty_nobuffers(page);
1002} 1015}
1003 1016
@@ -1129,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1129 root->fs_info->dirty_metadata_bytes); 1142 root->fs_info->dirty_metadata_bytes);
1130 } 1143 }
1131 spin_unlock(&root->fs_info->delalloc_lock); 1144 spin_unlock(&root->fs_info->delalloc_lock);
1132 }
1133 1145
1134 /* ugh, clear_extent_buffer_dirty needs to lock the page */ 1146 /* ugh, clear_extent_buffer_dirty needs to lock the page */
1135 btrfs_set_lock_blocking(buf); 1147 btrfs_set_lock_blocking(buf);
1136 clear_extent_buffer_dirty(buf); 1148 clear_extent_buffer_dirty(buf);
1149 }
1137 } 1150 }
1138} 1151}
1139 1152
@@ -1193,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1193 root->root_key.objectid = objectid; 1206 root->root_key.objectid = objectid;
1194 root->anon_dev = 0; 1207 root->anon_dev = 0;
1195 1208
1196 spin_lock_init(&root->root_times_lock); 1209 spin_lock_init(&root->root_item_lock);
1197} 1210}
1198 1211
1199static int __must_check find_and_setup_root(struct btrfs_root *tree_root, 1212static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -2131,6 +2144,11 @@ int open_ctree(struct super_block *sb,
2131 init_rwsem(&fs_info->extent_commit_sem); 2144 init_rwsem(&fs_info->extent_commit_sem);
2132 init_rwsem(&fs_info->cleanup_work_sem); 2145 init_rwsem(&fs_info->cleanup_work_sem);
2133 init_rwsem(&fs_info->subvol_sem); 2146 init_rwsem(&fs_info->subvol_sem);
2147 fs_info->dev_replace.lock_owner = 0;
2148 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2149 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2150 mutex_init(&fs_info->dev_replace.lock_management_lock);
2151 mutex_init(&fs_info->dev_replace.lock);
2134 2152
2135 spin_lock_init(&fs_info->qgroup_lock); 2153 spin_lock_init(&fs_info->qgroup_lock);
2136 fs_info->qgroup_tree = RB_ROOT; 2154 fs_info->qgroup_tree = RB_ROOT;
@@ -2279,6 +2297,10 @@ int open_ctree(struct super_block *sb,
2279 fs_info->thread_pool_size, 2297 fs_info->thread_pool_size,
2280 &fs_info->generic_worker); 2298 &fs_info->generic_worker);
2281 2299
2300 btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
2301 fs_info->thread_pool_size,
2302 &fs_info->generic_worker);
2303
2282 btrfs_init_workers(&fs_info->submit_workers, "submit", 2304 btrfs_init_workers(&fs_info->submit_workers, "submit",
2283 min_t(u64, fs_devices->num_devices, 2305 min_t(u64, fs_devices->num_devices,
2284 fs_info->thread_pool_size), 2306 fs_info->thread_pool_size),
@@ -2350,6 +2372,7 @@ int open_ctree(struct super_block *sb,
2350 ret |= btrfs_start_workers(&fs_info->delayed_workers); 2372 ret |= btrfs_start_workers(&fs_info->delayed_workers);
2351 ret |= btrfs_start_workers(&fs_info->caching_workers); 2373 ret |= btrfs_start_workers(&fs_info->caching_workers);
2352 ret |= btrfs_start_workers(&fs_info->readahead_workers); 2374 ret |= btrfs_start_workers(&fs_info->readahead_workers);
2375 ret |= btrfs_start_workers(&fs_info->flush_workers);
2353 if (ret) { 2376 if (ret) {
2354 err = -ENOMEM; 2377 err = -ENOMEM;
2355 goto fail_sb_buffer; 2378 goto fail_sb_buffer;
@@ -2418,7 +2441,11 @@ int open_ctree(struct super_block *sb,
2418 goto fail_tree_roots; 2441 goto fail_tree_roots;
2419 } 2442 }
2420 2443
2421 btrfs_close_extra_devices(fs_devices); 2444 /*
2445 * keep the device that is marked to be the target device for the
2446 * dev_replace procedure
2447 */
2448 btrfs_close_extra_devices(fs_info, fs_devices, 0);
2422 2449
2423 if (!fs_devices->latest_bdev) { 2450 if (!fs_devices->latest_bdev) {
2424 printk(KERN_CRIT "btrfs: failed to read devices on %s\n", 2451 printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2490,6 +2517,14 @@ retry_root_backup:
2490 goto fail_block_groups; 2517 goto fail_block_groups;
2491 } 2518 }
2492 2519
2520 ret = btrfs_init_dev_replace(fs_info);
2521 if (ret) {
2522 pr_err("btrfs: failed to init dev_replace: %d\n", ret);
2523 goto fail_block_groups;
2524 }
2525
2526 btrfs_close_extra_devices(fs_info, fs_devices, 1);
2527
2493 ret = btrfs_init_space_info(fs_info); 2528 ret = btrfs_init_space_info(fs_info);
2494 if (ret) { 2529 if (ret) {
2495 printk(KERN_ERR "Failed to initial space info: %d\n", ret); 2530 printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2503,6 +2538,13 @@ retry_root_backup:
2503 } 2538 }
2504 fs_info->num_tolerated_disk_barrier_failures = 2539 fs_info->num_tolerated_disk_barrier_failures =
2505 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 2540 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2541 if (fs_info->fs_devices->missing_devices >
2542 fs_info->num_tolerated_disk_barrier_failures &&
2543 !(sb->s_flags & MS_RDONLY)) {
2544 printk(KERN_WARNING
2545 "Btrfs: too many missing devices, writeable mount is not allowed\n");
2546 goto fail_block_groups;
2547 }
2506 2548
2507 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2549 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2508 "btrfs-cleaner"); 2550 "btrfs-cleaner");
@@ -2631,6 +2673,13 @@ retry_root_backup:
2631 return ret; 2673 return ret;
2632 } 2674 }
2633 2675
2676 ret = btrfs_resume_dev_replace_async(fs_info);
2677 if (ret) {
2678 pr_warn("btrfs: failed to resume dev_replace\n");
2679 close_ctree(tree_root);
2680 return ret;
2681 }
2682
2634 return 0; 2683 return 0;
2635 2684
2636fail_qgroup: 2685fail_qgroup:
@@ -2667,6 +2716,7 @@ fail_sb_buffer:
2667 btrfs_stop_workers(&fs_info->submit_workers); 2716 btrfs_stop_workers(&fs_info->submit_workers);
2668 btrfs_stop_workers(&fs_info->delayed_workers); 2717 btrfs_stop_workers(&fs_info->delayed_workers);
2669 btrfs_stop_workers(&fs_info->caching_workers); 2718 btrfs_stop_workers(&fs_info->caching_workers);
2719 btrfs_stop_workers(&fs_info->flush_workers);
2670fail_alloc: 2720fail_alloc:
2671fail_iput: 2721fail_iput:
2672 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2722 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -3270,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)
3270 smp_mb(); 3320 smp_mb();
3271 3321
3272 /* pause restriper - we want to resume on mount */ 3322 /* pause restriper - we want to resume on mount */
3273 btrfs_pause_balance(root->fs_info); 3323 btrfs_pause_balance(fs_info);
3324
3325 btrfs_dev_replace_suspend_for_unmount(fs_info);
3274 3326
3275 btrfs_scrub_cancel(root); 3327 btrfs_scrub_cancel(fs_info);
3276 3328
3277 /* wait for any defraggers to finish */ 3329 /* wait for any defraggers to finish */
3278 wait_event(fs_info->transaction_wait, 3330 wait_event(fs_info->transaction_wait,
3279 (atomic_read(&fs_info->defrag_running) == 0)); 3331 (atomic_read(&fs_info->defrag_running) == 0));
3280 3332
3281 /* clear out the rbtree of defraggable inodes */ 3333 /* clear out the rbtree of defraggable inodes */
3282 btrfs_run_defrag_inodes(fs_info); 3334 btrfs_cleanup_defrag_inodes(fs_info);
3283 3335
3284 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3336 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3285 ret = btrfs_commit_super(root); 3337 ret = btrfs_commit_super(root);
@@ -3339,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
3339 btrfs_stop_workers(&fs_info->delayed_workers); 3391 btrfs_stop_workers(&fs_info->delayed_workers);
3340 btrfs_stop_workers(&fs_info->caching_workers); 3392 btrfs_stop_workers(&fs_info->caching_workers);
3341 btrfs_stop_workers(&fs_info->readahead_workers); 3393 btrfs_stop_workers(&fs_info->readahead_workers);
3394 btrfs_stop_workers(&fs_info->flush_workers);
3342 3395
3343#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3396#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3344 if (btrfs_test_opt(root, CHECK_INTEGRITY)) 3397 if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3383,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3383 int was_dirty; 3436 int was_dirty;
3384 3437
3385 btrfs_assert_tree_locked(buf); 3438 btrfs_assert_tree_locked(buf);
3386 if (transid != root->fs_info->generation) { 3439 if (transid != root->fs_info->generation)
3387 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " 3440 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
3388 "found %llu running %llu\n", 3441 "found %llu running %llu\n",
3389 (unsigned long long)buf->start, 3442 (unsigned long long)buf->start,
3390 (unsigned long long)transid, 3443 (unsigned long long)transid,
3391 (unsigned long long)root->fs_info->generation); 3444 (unsigned long long)root->fs_info->generation);
3392 WARN_ON(1);
3393 }
3394 was_dirty = set_extent_buffer_dirty(buf); 3445 was_dirty = set_extent_buffer_dirty(buf);
3395 if (!was_dirty) { 3446 if (!was_dirty) {
3396 spin_lock(&root->fs_info->delalloc_lock); 3447 spin_lock(&root->fs_info->delalloc_lock);
@@ -3399,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
3399 } 3450 }
3400} 3451}
3401 3452
3402void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3453static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
3454 int flush_delayed)
3403{ 3455{
3404 /* 3456 /*
3405 * looks as though older kernels can get into trouble with 3457 * looks as though older kernels can get into trouble with
@@ -3411,36 +3463,26 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
3411 if (current->flags & PF_MEMALLOC) 3463 if (current->flags & PF_MEMALLOC)
3412 return; 3464 return;
3413 3465
3414 btrfs_balance_delayed_items(root); 3466 if (flush_delayed)
3467 btrfs_balance_delayed_items(root);
3415 3468
3416 num_dirty = root->fs_info->dirty_metadata_bytes; 3469 num_dirty = root->fs_info->dirty_metadata_bytes;
3417 3470
3418 if (num_dirty > thresh) { 3471 if (num_dirty > thresh) {
3419 balance_dirty_pages_ratelimited_nr( 3472 balance_dirty_pages_ratelimited(
3420 root->fs_info->btree_inode->i_mapping, 1); 3473 root->fs_info->btree_inode->i_mapping);
3421 } 3474 }
3422 return; 3475 return;
3423} 3476}
3424 3477
3425void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) 3478void btrfs_btree_balance_dirty(struct btrfs_root *root)
3426{ 3479{
3427 /* 3480 __btrfs_btree_balance_dirty(root, 1);
3428 * looks as though older kernels can get into trouble with 3481}
3429 * this code, they end up stuck in balance_dirty_pages forever
3430 */
3431 u64 num_dirty;
3432 unsigned long thresh = 32 * 1024 * 1024;
3433
3434 if (current->flags & PF_MEMALLOC)
3435 return;
3436
3437 num_dirty = root->fs_info->dirty_metadata_bytes;
3438 3482
3439 if (num_dirty > thresh) { 3483void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
3440 balance_dirty_pages_ratelimited_nr( 3484{
3441 root->fs_info->btree_inode->i_mapping, 1); 3485 __btrfs_btree_balance_dirty(root, 0);
3442 }
3443 return;
3444} 3486}
3445 3487
3446int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) 3488int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2025a9132c16..305c33efb0e3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 62struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
63 struct btrfs_key *location); 63 struct btrfs_key *location);
64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 64int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
65void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 65void btrfs_btree_balance_dirty(struct btrfs_root *root);
66void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 66void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 67void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
68void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 68void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 69int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d3e2c17d8d1..a8b8adc05070 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
33#include "volumes.h" 33#include "volumes.h"
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36#include "math.h"
36 37
37#undef SCRAMBLE_DELAYED_REFS 38#undef SCRAMBLE_DELAYED_REFS
38 39
@@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
649 rcu_read_unlock(); 650 rcu_read_unlock();
650} 651}
651 652
652static u64 div_factor(u64 num, int factor)
653{
654 if (factor == 10)
655 return num;
656 num *= factor;
657 do_div(num, 10);
658 return num;
659}
660
661static u64 div_factor_fine(u64 num, int factor)
662{
663 if (factor == 100)
664 return num;
665 num *= factor;
666 do_div(num, 100);
667 return num;
668}
669
670u64 btrfs_find_block_group(struct btrfs_root *root, 653u64 btrfs_find_block_group(struct btrfs_root *root,
671 u64 search_start, u64 search_hint, int owner) 654 u64 search_start, u64 search_hint, int owner)
672{ 655{
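/*
 * Editorial aside: the two helpers deleted above are not dropped but
 * consolidated into the new fs/btrfs/math.h added by this series (see
 * the "math.h" include at the top of this diff), so multiple files
 * can share them.  Worked values: div_factor(1000, 8) =
 * 1000 * 8 / 10 = 800, while div_factor_fine(1000, 8) =
 * 1000 * 8 / 100 = 80.
 */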
@@ -1835,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1835 1818
1836 1819
1837 /* Tell the block device(s) that the sectors can be discarded */ 1820 /* Tell the block device(s) that the sectors can be discarded */
1838 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1821 ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1839 bytenr, &num_bytes, &bbio, 0); 1822 bytenr, &num_bytes, &bbio, 0);
1840 /* Error condition is -ENOMEM */ 1823 /* Error condition is -ENOMEM */
1841 if (!ret) { 1824 if (!ret) {
@@ -2314,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2314 kfree(extent_op); 2297 kfree(extent_op);
2315 2298
2316 if (ret) { 2299 if (ret) {
2300 list_del_init(&locked_ref->cluster);
2301 mutex_unlock(&locked_ref->mutex);
2302
2317 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); 2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2318 spin_lock(&delayed_refs->lock); 2304 spin_lock(&delayed_refs->lock);
2319 return ret; 2305 return ret;
@@ -2356,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2356 count++; 2342 count++;
2357 2343
2358 if (ret) { 2344 if (ret) {
2345 if (locked_ref) {
2346 list_del_init(&locked_ref->cluster);
2347 mutex_unlock(&locked_ref->mutex);
2348 }
2359 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); 2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2360 spin_lock(&delayed_refs->lock); 2350 spin_lock(&delayed_refs->lock);
2361 return ret; 2351 return ret;
@@ -3661,7 +3651,7 @@ out:
3661 3651
3662static int can_overcommit(struct btrfs_root *root, 3652static int can_overcommit(struct btrfs_root *root,
3663 struct btrfs_space_info *space_info, u64 bytes, 3653 struct btrfs_space_info *space_info, u64 bytes,
3664 int flush) 3654 enum btrfs_reserve_flush_enum flush)
3665{ 3655{
3666 u64 profile = btrfs_get_alloc_profile(root, 0); 3656 u64 profile = btrfs_get_alloc_profile(root, 0);
3667 u64 avail; 3657 u64 avail;
@@ -3685,11 +3675,11 @@ static int can_overcommit(struct btrfs_root *root,
3685 avail >>= 1; 3675 avail >>= 1;
3686 3676
3687 /* 3677 /*
3688 * If we aren't flushing don't let us overcommit too much, say 3678 * If we aren't flushing all things, let us overcommit up to
3689 * 1/8th of the space. If we can flush, let it overcommit up to 3679 * 1/2 of the space. If we can flush, don't let us overcommit
3690 * 1/2 of the space. 3680 * too much, let it overcommit up to 1/8 of the space.
3691 */ 3681 */
3692 if (flush) 3682 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3693 avail >>= 3; 3683 avail >>= 3;
3694 else 3684 else
3695 avail >>= 1; 3685 avail >>= 1;
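/*
 * Editorial aside (illustrative numbers): with 8 GiB of unallocated
 * space, BTRFS_RESERVE_FLUSH_ALL caps the overcommit at
 * 8G >> 3 = 1 GiB, while the other modes allow 8G >> 1 = 4 GiB,
 * matching the reworked comment above.
 */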
@@ -3699,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root,
3699 return 0; 3689 return 0;
3700} 3690}
3701 3691
3692static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
3693 unsigned long nr_pages,
3694 enum wb_reason reason)
3695{
3696 if (!writeback_in_progress(sb->s_bdi) &&
3697 down_read_trylock(&sb->s_umount)) {
3698 writeback_inodes_sb_nr(sb, nr_pages, reason);
3699 up_read(&sb->s_umount);
3700 return 1;
3701 }
3702
3703 return 0;
3704}
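/*
 * Editorial aside (illustrative): unlike the
 * writeback_inodes_sb_nr_if_idle() it replaces, this variant only
 * proceeds when it can take s_umount with down_read_trylock(), so it
 * cannot block on (or race with) an in-flight unmount; it reports
 * whether writeback was actually kicked (1) or skipped (0).
 */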
3705
3702/* 3706/*
3703 * shrink metadata reservation for delalloc 3707 * shrink metadata reservation for delalloc
3704 */ 3708 */
@@ -3713,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3713 long time_left; 3717 long time_left;
3714 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3718 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3715 int loops = 0; 3719 int loops = 0;
3720 enum btrfs_reserve_flush_enum flush;
3716 3721
3717 trans = (struct btrfs_trans_handle *)current->journal_info; 3722 trans = (struct btrfs_trans_handle *)current->journal_info;
3718 block_rsv = &root->fs_info->delalloc_block_rsv; 3723 block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -3730,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3730 while (delalloc_bytes && loops < 3) { 3735 while (delalloc_bytes && loops < 3) {
3731 max_reclaim = min(delalloc_bytes, to_reclaim); 3736 max_reclaim = min(delalloc_bytes, to_reclaim);
3732 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3737 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3733 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, 3738 writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
3734 WB_REASON_FS_FREE_SPACE); 3739 nr_pages,
3740 WB_REASON_FS_FREE_SPACE);
3735 3741
3736 /* 3742 /*
3737 * We need to wait for the async pages to actually start before 3743 * We need to wait for the async pages to actually start before
@@ -3740,8 +3746,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3740 wait_event(root->fs_info->async_submit_wait, 3746 wait_event(root->fs_info->async_submit_wait,
3741 !atomic_read(&root->fs_info->async_delalloc_pages)); 3747 !atomic_read(&root->fs_info->async_delalloc_pages));
3742 3748
3749 if (!trans)
3750 flush = BTRFS_RESERVE_FLUSH_ALL;
3751 else
3752 flush = BTRFS_RESERVE_NO_FLUSH;
3743 spin_lock(&space_info->lock); 3753 spin_lock(&space_info->lock);
3744 if (can_overcommit(root, space_info, orig, !trans)) { 3754 if (can_overcommit(root, space_info, orig, flush)) {
3745 spin_unlock(&space_info->lock); 3755 spin_unlock(&space_info->lock);
3746 break; 3756 break;
3747 } 3757 }
@@ -3888,7 +3898,7 @@ static int flush_space(struct btrfs_root *root,
3888 * @root - the root we're allocating for 3898 * @root - the root we're allocating for
3889 * @block_rsv - the block_rsv we're allocating for 3899 * @block_rsv - the block_rsv we're allocating for
3890 * @orig_bytes - the number of bytes we want 3900 * @orig_bytes - the number of bytes we want
3891 * @flush - wether or not we can flush to make our reservation 3901 * @flush - whether or not we can flush to make our reservation
3892 * 3902 *
3893 * This will reserve orig_bytes number of bytes from the space info associated 3903
3894 * with the block_rsv. If there is not enough space it will make an attempt to 3904 * with the block_rsv. If there is not enough space it will make an attempt to
@@ -3899,7 +3909,8 @@ static int flush_space(struct btrfs_root *root,
3899 */ 3909 */
3900static int reserve_metadata_bytes(struct btrfs_root *root, 3910static int reserve_metadata_bytes(struct btrfs_root *root,
3901 struct btrfs_block_rsv *block_rsv, 3911 struct btrfs_block_rsv *block_rsv,
3902 u64 orig_bytes, int flush) 3912 u64 orig_bytes,
3913 enum btrfs_reserve_flush_enum flush)
3903{ 3914{
3904 struct btrfs_space_info *space_info = block_rsv->space_info; 3915 struct btrfs_space_info *space_info = block_rsv->space_info;
3905 u64 used; 3916 u64 used;
@@ -3912,10 +3923,11 @@ again:
3912 ret = 0; 3923 ret = 0;
3913 spin_lock(&space_info->lock); 3924 spin_lock(&space_info->lock);
3914 /* 3925 /*
3915 * We only want to wait if somebody other than us is flushing and we are 3926 * We only want to wait if somebody other than us is flushing and we
3916 * actually alloed to flush. 3927 * are actually allowed to flush all things.
3917 */ 3928 */
3918 while (flush && !flushing && space_info->flush) { 3929 while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
3930 space_info->flush) {
3919 spin_unlock(&space_info->lock); 3931 spin_unlock(&space_info->lock);
3920 /* 3932 /*
3921 * If we have a trans handle we can't wait because the flusher 3933 * If we have a trans handle we can't wait because the flusher
@@ -3981,23 +3993,40 @@ again:
3981 * Couldn't make our reservation, save our place so while we're trying 3993 * Couldn't make our reservation, save our place so while we're trying
3982 * to reclaim space we can actually use it instead of somebody else 3994 * to reclaim space we can actually use it instead of somebody else
3983 * stealing it from us. 3995 * stealing it from us.
3996 *
3997 * We make the other tasks wait for the flush only when we can flush
3998 * all things.
3984 */ 3999 */
3985 if (ret && flush) { 4000 if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
3986 flushing = true; 4001 flushing = true;
3987 space_info->flush = 1; 4002 space_info->flush = 1;
3988 } 4003 }
3989 4004
3990 spin_unlock(&space_info->lock); 4005 spin_unlock(&space_info->lock);
3991 4006
3992 if (!ret || !flush) 4007 if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
3993 goto out; 4008 goto out;
3994 4009
3995 ret = flush_space(root, space_info, num_bytes, orig_bytes, 4010 ret = flush_space(root, space_info, num_bytes, orig_bytes,
3996 flush_state); 4011 flush_state);
3997 flush_state++; 4012 flush_state++;
4013
4014 /*
4015 * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
4016 * would happen. So skip delalloc flush.
4017 */
4018 if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4019 (flush_state == FLUSH_DELALLOC ||
4020 flush_state == FLUSH_DELALLOC_WAIT))
4021 flush_state = ALLOC_CHUNK;
4022
3998 if (!ret) 4023 if (!ret)
3999 goto again; 4024 goto again;
4000 else if (flush_state <= COMMIT_TRANS) 4025 else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4026 flush_state < COMMIT_TRANS)
4027 goto again;
4028 else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4029 flush_state <= COMMIT_TRANS)
4001 goto again; 4030 goto again;
4002 4031
4003out: 4032out:
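/*
 * Editorial aside: for reference, the three-valued flush argument
 * threaded through above is (as defined in ctree.h by this series;
 * a sketch, exact formatting may differ):
 *
 *	enum btrfs_reserve_flush_enum {
 *		BTRFS_RESERVE_NO_FLUSH,
 *		BTRFS_RESERVE_FLUSH_LIMIT,
 *		BTRFS_RESERVE_FLUSH_ALL,
 *	};
 *
 * NO_FLUSH never blocks to reclaim, FLUSH_LIMIT reclaims but skips
 * the delalloc steps and stops short of committing the transaction
 * (see the flush_state checks above), and FLUSH_ALL may try
 * everything up to and including COMMIT_TRANS.
 */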
@@ -4148,9 +4177,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
4148 kfree(rsv); 4177 kfree(rsv);
4149} 4178}
4150 4179
4151static inline int __block_rsv_add(struct btrfs_root *root, 4180int btrfs_block_rsv_add(struct btrfs_root *root,
4152 struct btrfs_block_rsv *block_rsv, 4181 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4153 u64 num_bytes, int flush) 4182 enum btrfs_reserve_flush_enum flush)
4154{ 4183{
4155 int ret; 4184 int ret;
4156 4185
@@ -4166,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
4166 return ret; 4195 return ret;
4167} 4196}
4168 4197
4169int btrfs_block_rsv_add(struct btrfs_root *root,
4170 struct btrfs_block_rsv *block_rsv,
4171 u64 num_bytes)
4172{
4173 return __block_rsv_add(root, block_rsv, num_bytes, 1);
4174}
4175
4176int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
4177 struct btrfs_block_rsv *block_rsv,
4178 u64 num_bytes)
4179{
4180 return __block_rsv_add(root, block_rsv, num_bytes, 0);
4181}
4182
4183int btrfs_block_rsv_check(struct btrfs_root *root, 4198int btrfs_block_rsv_check(struct btrfs_root *root,
4184 struct btrfs_block_rsv *block_rsv, int min_factor) 4199 struct btrfs_block_rsv *block_rsv, int min_factor)
4185{ 4200{
@@ -4198,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
4198 return ret; 4213 return ret;
4199} 4214}
4200 4215
4201static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, 4216int btrfs_block_rsv_refill(struct btrfs_root *root,
4202 struct btrfs_block_rsv *block_rsv, 4217 struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4203 u64 min_reserved, int flush) 4218 enum btrfs_reserve_flush_enum flush)
4204{ 4219{
4205 u64 num_bytes = 0; 4220 u64 num_bytes = 0;
4206 int ret = -ENOSPC; 4221 int ret = -ENOSPC;
@@ -4228,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
4228 return ret; 4243 return ret;
4229} 4244}
4230 4245
4231int btrfs_block_rsv_refill(struct btrfs_root *root,
4232 struct btrfs_block_rsv *block_rsv,
4233 u64 min_reserved)
4234{
4235 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
4236}
4237
4238int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
4239 struct btrfs_block_rsv *block_rsv,
4240 u64 min_reserved)
4241{
4242 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
4243}
4244
4245int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 4246int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4246 struct btrfs_block_rsv *dst_rsv, 4247 struct btrfs_block_rsv *dst_rsv,
4247 u64 num_bytes) 4248 u64 num_bytes)
@@ -4532,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4532 u64 csum_bytes; 4533 u64 csum_bytes;
4533 unsigned nr_extents = 0; 4534 unsigned nr_extents = 0;
4534 int extra_reserve = 0; 4535 int extra_reserve = 0;
4535 int flush = 1; 4536 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4536 int ret; 4537 int ret;
4538 bool delalloc_lock = true;
4537 4539
4538 /* Need to be holding the i_mutex here if we aren't free space cache */ 4540 /* If we are a free space inode we need to not flush since we will be in
4539 if (btrfs_is_free_space_inode(inode)) 4541 * the middle of a transaction commit. We also don't need the delalloc
4540 flush = 0; 4542 * mutex since we won't race with anybody. We need this mostly to make
4543 * lockdep shut its filthy mouth.
4544 */
4545 if (btrfs_is_free_space_inode(inode)) {
4546 flush = BTRFS_RESERVE_NO_FLUSH;
4547 delalloc_lock = false;
4548 }
4541 4549
4542 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4550 if (flush != BTRFS_RESERVE_NO_FLUSH &&
4551 btrfs_transaction_in_commit(root->fs_info))
4543 schedule_timeout(1); 4552 schedule_timeout(1);
4544 4553
4545 mutex_lock(&BTRFS_I(inode)->delalloc_mutex); 4554 if (delalloc_lock)
4555 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4556
4546 num_bytes = ALIGN(num_bytes, root->sectorsize); 4557 num_bytes = ALIGN(num_bytes, root->sectorsize);
4547 4558
4548 spin_lock(&BTRFS_I(inode)->lock); 4559 spin_lock(&BTRFS_I(inode)->lock);
@@ -4572,7 +4583,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4572 ret = btrfs_qgroup_reserve(root, num_bytes + 4583 ret = btrfs_qgroup_reserve(root, num_bytes +
4573 nr_extents * root->leafsize); 4584 nr_extents * root->leafsize);
4574 if (ret) { 4585 if (ret) {
4575 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4586 spin_lock(&BTRFS_I(inode)->lock);
4587 calc_csum_metadata_size(inode, num_bytes, 0);
4588 spin_unlock(&BTRFS_I(inode)->lock);
4589 if (delalloc_lock)
4590 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4576 return ret; 4591 return ret;
4577 } 4592 }
4578 } 4593 }
@@ -4607,7 +4622,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4607 btrfs_ino(inode), 4622 btrfs_ino(inode),
4608 to_free, 0); 4623 to_free, 0);
4609 } 4624 }
4610 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4625 if (root->fs_info->quota_enabled) {
4626 btrfs_qgroup_free(root, num_bytes +
4627 nr_extents * root->leafsize);
4628 }
4629 if (delalloc_lock)
4630 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4611 return ret; 4631 return ret;
4612 } 4632 }
4613 4633
@@ -4619,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4619 } 4639 }
4620 BTRFS_I(inode)->reserved_extents += nr_extents; 4640 BTRFS_I(inode)->reserved_extents += nr_extents;
4621 spin_unlock(&BTRFS_I(inode)->lock); 4641 spin_unlock(&BTRFS_I(inode)->lock);
4622 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); 4642
4643 if (delalloc_lock)
4644 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4623 4645
4624 if (to_reserve) 4646 if (to_reserve)
4625 trace_btrfs_space_reservation(root->fs_info,"delalloc", 4647 trace_btrfs_space_reservation(root->fs_info,"delalloc",
@@ -4969,9 +4991,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_group_cache *cache = NULL;
+	struct btrfs_space_info *space_info;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	u64 len;
+	bool readonly;
 
 	while (start <= end) {
+		readonly = false;
 		if (!cache ||
 		    start >= cache->key.objectid + cache->key.offset) {
 			if (cache)
@@ -4989,15 +5015,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 		}
 
 		start += len;
+		space_info = cache->space_info;
 
-		spin_lock(&cache->space_info->lock);
+		spin_lock(&space_info->lock);
 		spin_lock(&cache->lock);
 		cache->pinned -= len;
-		cache->space_info->bytes_pinned -= len;
-		if (cache->ro)
-			cache->space_info->bytes_readonly += len;
+		space_info->bytes_pinned -= len;
+		if (cache->ro) {
+			space_info->bytes_readonly += len;
+			readonly = true;
+		}
 		spin_unlock(&cache->lock);
-		spin_unlock(&cache->space_info->lock);
+		if (!readonly && global_rsv->space_info == space_info) {
+			spin_lock(&global_rsv->lock);
+			if (!global_rsv->full) {
+				len = min(len, global_rsv->size -
+					  global_rsv->reserved);
+				global_rsv->reserved += len;
+				space_info->bytes_may_use += len;
+				if (global_rsv->reserved >= global_rsv->size)
+					global_rsv->full = 1;
+			}
+			spin_unlock(&global_rsv->lock);
+		}
+		spin_unlock(&space_info->lock);
 	}
 
 	if (cache)
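The new block feeds freed pinned space back into the global reserve first, clamping with min() so the reserve never exceeds its target size. The same clamp-and-top-up logic in a standalone sketch; the struct and field names are invented for illustration:

    #include <stdbool.h>
    #include <stdint.h>

    struct reserve {
        uint64_t size;     /* target size */
        uint64_t reserved; /* currently held */
        bool full;
    };

    /* Offer up to `len` freed bytes; returns how many the reserve absorbed. */
    static uint64_t refill(struct reserve *rsv, uint64_t len)
    {
        if (rsv->full)
            return 0;
        uint64_t room = rsv->size - rsv->reserved;
        if (len > room)
            len = room;          /* same clamp as min(len, size - reserved) */
        rsv->reserved += len;
        if (rsv->reserved >= rsv->size)
            rsv->full = true;
        return len;
    }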
@@ -5466,7 +5507,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 	return 0;
 }
 
-static int __get_block_group_index(u64 flags)
+int __get_raid_index(u64 flags)
 {
 	int index;
 
@@ -5486,7 +5527,7 @@ static int __get_block_group_index(u64 flags)
 
 static int get_block_group_index(struct btrfs_block_group_cache *cache)
 {
-	return __get_block_group_index(cache->flags);
+	return __get_raid_index(cache->flags);
 }
 
 enum btrfs_loop_type {
@@ -5519,7 +5560,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	int empty_cluster = 2 * 1024 * 1024;
 	struct btrfs_space_info *space_info;
 	int loop = 0;
-	int index = 0;
+	int index = __get_raid_index(data);
 	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
 	bool found_uncached_bg = false;
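Seeding index from the allocation profile lets find_free_extent() start at the matching raid list instead of scanning from index 0. A sketch of the flags-to-index mapping idea — the flag constants below are placeholders, not the real BTRFS_BLOCK_GROUP_* values:

    #include <stdint.h>

    #define BG_RAID10 (1ULL << 0)   /* illustrative bit assignments */
    #define BG_RAID1  (1ULL << 1)
    #define BG_DUP    (1ULL << 2)
    #define BG_RAID0  (1ULL << 3)

    enum raid_index { IDX_RAID10, IDX_RAID1, IDX_DUP, IDX_RAID0, IDX_SINGLE };

    /* map a profile bitmask to a dense index usable as an array subscript */
    static int get_raid_index(uint64_t flags)
    {
        if (flags & BG_RAID10) return IDX_RAID10;
        if (flags & BG_RAID1)  return IDX_RAID1;
        if (flags & BG_DUP)    return IDX_DUP;
        if (flags & BG_RAID0)  return IDX_RAID0;
        return IDX_SINGLE;     /* anything else falls through to single */
    }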
@@ -6269,7 +6310,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	block_rsv = get_block_rsv(trans, root);
 
 	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+		ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+					     BTRFS_RESERVE_NO_FLUSH);
 		/*
 		 * If we couldn't reserve metadata bytes try and use some from
 		 * the global reserve.
@@ -6292,11 +6334,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 		static DEFINE_RATELIMIT_STATE(_rs,
 					      DEFAULT_RATELIMIT_INTERVAL,
 					      /*DEFAULT_RATELIMIT_BURST*/ 2);
-		if (__ratelimit(&_rs)) {
-			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
-			WARN_ON(1);
-		}
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+		if (__ratelimit(&_rs))
+			WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
+			     ret);
+		ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+					     BTRFS_RESERVE_NO_FLUSH);
 		if (!ret) {
 			return block_rsv;
 		} else if (ret && block_rsv != global_rsv) {
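The printk + WARN_ON(1) pair collapses into a single WARN() under the existing ratelimit, so the message and backtrace stay together and are still throttled. A userspace approximation of interval/burst rate limiting, under the simplifying assumption that a fixed time window is close enough:

    #include <stdio.h>
    #include <time.h>

    /* allow at most `burst` messages per `interval` seconds; drop the rest */
    struct ratelimit {
        time_t window_start;
        int interval;
        int burst;
        int printed;
    };

    static int ratelimit_ok(struct ratelimit *rs)
    {
        time_t now = time(NULL);

        if (now - rs->window_start >= rs->interval) {
            rs->window_start = now;   /* start a fresh window */
            rs->printed = 0;
        }
        return rs->printed++ < rs->burst;
    }

    int main(void)
    {
        struct ratelimit rs = { .interval = 5, .burst = 2 };

        for (int i = 0; i < 10; i++)
            if (ratelimit_ok(&rs))       /* only the first two print */
                fprintf(stderr, "block rsv returned %d\n", -28);
        return 0;
    }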
@@ -6746,11 +6788,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 						       &wc->flags[level]);
 		if (ret < 0) {
 			btrfs_tree_unlock_rw(eb, path->locks[level]);
+			path->locks[level] = 0;
 			return ret;
 		}
 		BUG_ON(wc->refs[level] == 0);
 		if (wc->refs[level] == 1) {
 			btrfs_tree_unlock_rw(eb, path->locks[level]);
+			path->locks[level] = 0;
 			return 1;
 		}
 	}
@@ -7427,7 +7471,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 	 */
 	target = get_restripe_target(root->fs_info, block_group->flags);
 	if (target) {
-		index = __get_block_group_index(extended_to_chunk(target));
+		index = __get_raid_index(extended_to_chunk(target));
 	} else {
 		/*
 		 * this is just a balance, so if we were marked as full
@@ -7461,7 +7505,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
 		 * check to make sure we can actually find a chunk with enough
 		 * space to fit our block group in.
 		 */
-		if (device->total_bytes > device->bytes_used + min_free) {
+		if (device->total_bytes > device->bytes_used + min_free &&
+		    !device->is_tgtdev_for_dev_replace) {
 			ret = find_free_dev_extent(device, min_free,
 						   &dev_offset, NULL);
 			if (!ret)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 472873a94d96..1b319df29eee 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
 {
 	struct rb_node *node;
 
-	if (end < start) {
-		printk(KERN_ERR "btrfs end < start %llu %llu\n",
+	if (end < start)
+		WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
 		       (unsigned long long)end,
 		       (unsigned long long)start);
-		WARN_ON(1);
-	}
 	state->start = start;
 	state->end = end;
 
@@ -1919,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
 * the standard behavior is to write all copies in a raid setup. here we only
 * want to write the one bad copy. so we do the mapping for ourselves and issue
 * submit_bio directly.
- * to avoid any synchonization issues, wait for the data after writing, which
+ * to avoid any synchronization issues, wait for the data after writing, which
 * actually prevents the read that triggered the error from finishing.
 * currently, there can be no more than two copies of every data bit. thus,
 * exactly one rewrite is required.
 */
-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 			u64 length, u64 logical, struct page *page,
 			int mirror_num)
 {
@@ -1946,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
 	bio->bi_size = 0;
 	map_length = length;
 
-	ret = btrfs_map_block(map_tree, WRITE, logical,
+	ret = btrfs_map_block(fs_info, WRITE, logical,
 			      &map_length, &bbio, mirror_num);
 	if (ret) {
 		bio_put(bio);
@@ -1984,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 			 int mirror_num)
 {
-	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
 	u64 start = eb->start;
 	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
 	int ret = 0;
 
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = extent_buffer_page(eb, i);
-		ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+		ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
 					start, p, mirror_num);
 		if (ret)
 			break;
@@ -2010,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
 	u64 private;
 	u64 private_failure;
 	struct io_failure_record *failrec;
-	struct btrfs_mapping_tree *map_tree;
+	struct btrfs_fs_info *fs_info;
 	struct extent_state *state;
 	int num_copies;
 	int did_repair = 0;
@@ -2046,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
 	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
 
 	if (state && state->start == failrec->start) {
-		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
-		num_copies = btrfs_num_copies(map_tree, failrec->logical,
+		fs_info = BTRFS_I(inode)->root->fs_info;
+		num_copies = btrfs_num_copies(fs_info, failrec->logical,
 					      failrec->len);
 		if (num_copies > 1) {
-			ret = repair_io_failure(map_tree, start, failrec->len,
+			ret = repair_io_failure(fs_info, start, failrec->len,
 						failrec->logical, page,
 						failrec->failed_mirror);
 			did_repair = !ret;
@@ -2159,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
 	 * clean_io_failure() clean all those errors at once.
 	 */
 	}
-	num_copies = btrfs_num_copies(
-		      &BTRFS_I(inode)->root->fs_info->mapping_tree,
-		      failrec->logical, failrec->len);
+	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+				      failrec->logical, failrec->len);
 	if (num_copies == 1) {
 		/*
 		 * we only have a single copy of the data, so don't bother with
@@ -2466,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	return bio;
 }
 
-/*
- * Since writes are async, they will only return -ENOMEM.
- * Reads can return the full range of I/O error conditions.
- */
 static int __must_check submit_one_bio(int rw, struct bio *bio,
 				       int mirror_num, unsigned long bio_flags)
 {
@@ -4721,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	}
 
 	if (start + min_len > eb->len) {
-		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
 		       "wanted %lu %lu\n", (unsigned long long)eb->start,
 		       eb->len, start, min_len);
-		WARN_ON(1);
 		return -EINVAL;
 	}
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 711d12b80028..2eacfabd3263 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -337,9 +337,9 @@ struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags);
 
-struct btrfs_mapping_tree;
+struct btrfs_fs_info;
 
-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 		      u64 length, u64 logical, struct page *page,
 		      int mirror_num);
 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
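Because this header only passes the pointer through, a forward declaration of struct btrfs_fs_info is enough; only callers that dereference it need the full definition. The generic C idiom, sketched with invented names:

    /* consumer.h — forward-declaration (opaque pointer) idiom */
    #ifndef CONSUMER_H
    #define CONSUMER_H

    #include <stdint.h>

    struct fs_info;   /* opaque: the layout is not needed in this header */

    int repair_failure(struct fs_info *fs, uint64_t start, uint64_t length);

    #endif

This keeps header dependencies shallow: swapping the context type (as the patch does, mapping tree to fs_info) only touches the declaration, not every includer.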
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b8cbc8d5c7f7..2e8cae63d247 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
 struct extent_map *alloc_extent_map(void)
 {
 	struct extent_map *em;
-	em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
+	em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
 	if (!em)
 		return NULL;
 	em->in_tree = 0;
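kmem_cache_zalloc() hands back zeroed memory, so fields added later (like orig_block_len in this series) start at a safe default without auditing every allocation site. The userspace analogue is calloc():

    #include <stdlib.h>

    struct extent_map_like {
        unsigned long start, len, flags;
        int refs;
    };

    int main(void)
    {
        /* calloc() returns zeroed memory, like kmem_cache_zalloc() */
        struct extent_map_like *em = calloc(1, sizeof(*em));
        if (!em)
            return 1;
        /* every field already reads as 0 — no stray garbage in new fields */
        free(em);
        return 0;
    }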
@@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
 		return 0;
 
+	if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+	    test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+		return 0;
+
 	if (extent_map_end(prev) == next->start &&
 	    prev->flags == next->flags &&
 	    prev->bdev == next->bdev &&
@@ -198,16 +202,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		merge = rb_entry(rb, struct extent_map, rb_node);
 		if (rb && mergable_maps(merge, em)) {
 			em->start = merge->start;
+			em->orig_start = merge->orig_start;
 			em->len += merge->len;
 			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
-			if (merge->generation > em->generation) {
-				em->mod_start = em->start;
-				em->mod_len = em->len;
-				em->generation = merge->generation;
-				list_move(&em->list, &tree->modified_extents);
-			}
+			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+			em->mod_start = merge->mod_start;
+			em->generation = max(em->generation, merge->generation);
+			list_move(&em->list, &tree->modified_extents);
 
 			list_del_init(&merge->list);
 			rb_erase(&merge->rb_node, &tree->map);
@@ -223,23 +226,19 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		em->block_len += merge->len;
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
-		if (merge->generation > em->generation) {
-			em->mod_len = em->len;
-			em->generation = merge->generation;
-			list_move(&em->list, &tree->modified_extents);
-		}
+		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+		em->generation = max(em->generation, merge->generation);
 		list_del_init(&merge->list);
 		free_extent_map(merge);
 	}
 }
 
 /**
- * unpint_extent_cache - unpin an extent from the cache
+ * unpin_extent_cache - unpin an extent from the cache
 * @tree:	tree to unpin the extent in
 * @start:	logical offset in the file
 * @len:	length of the extent
 * @gen:	generation that this extent has been modified in
- * @prealloc:	if this is set we need to clear the prealloc flag
 *
 * Called after an extent has been written to disk properly.  Set the generation
 * to the generation that actually added the file item to the inode so we know
@@ -260,15 +259,16 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
 	if (!em)
 		goto out;
 
-	list_move(&em->list, &tree->modified_extents);
+	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+		list_move(&em->list, &tree->modified_extents);
 	em->generation = gen;
 	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 	em->mod_start = em->start;
 	em->mod_len = em->len;
 
-	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+	if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
 		prealloc = true;
-		clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+		clear_bit(EXTENT_FLAG_FILLING, &em->flags);
 	}
 
 	try_merge_map(tree, em);
@@ -285,6 +285,12 @@ out:
 
 }
 
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+	clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+	try_merge_map(tree, em);
+}
+
 /**
 * add_extent_mapping - add new extent map to the extent tree
 * @tree:	tree to insert new map in
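Both merge directions above now recompute the modified ("mod") window with end-minus-start arithmetic instead of conditionally widening it, so the merged extent's mod range always covers both inputs. A self-checking sketch of the backward-merge case:

    #include <assert.h>
    #include <stdint.h>

    struct range { uint64_t mod_start, mod_len; };

    /* merge `prev` (the earlier, adjacent range) into `cur` */
    static void merge_mod_window(struct range *cur, const struct range *prev)
    {
        /* new length = old end of cur minus the new, earlier start */
        cur->mod_len = (cur->mod_len + cur->mod_start) - prev->mod_start;
        cur->mod_start = prev->mod_start;
    }

    int main(void)
    {
        struct range cur  = { .mod_start = 100, .mod_len = 50 };  /* [100,150) */
        struct range prev = { .mod_start = 40,  .mod_len = 60 };  /* [40,100)  */

        merge_mod_window(&cur, &prev);
        assert(cur.mod_start == 40 && cur.mod_len == 110);        /* [40,150)  */
        return 0;
    }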
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 679225555f7b..c6598c89cff8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -14,6 +14,7 @@
 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
 #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
 #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
+#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
 
 struct extent_map {
 	struct rb_node rb_node;
@@ -24,6 +25,7 @@ struct extent_map {
 	u64 mod_start;
 	u64 mod_len;
 	u64 orig_start;
+	u64 orig_block_len;
 	u64 block_start;
 	u64 block_len;
 	u64 generation;
@@ -67,6 +69,7 @@ void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
 void extent_map_exit(void);
 int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len);
 #endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1ad08e4e4a15..94aa53b38721 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -133,7 +133,6 @@ fail:
 	return ERR_PTR(ret);
 }
 
-
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
@@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+u64 btrfs_file_extent_length(struct btrfs_path *path)
+{
+	int extent_type;
+	struct btrfs_file_extent_item *fi;
+	u64 len;
+
+	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			    struct btrfs_file_extent_item);
+	extent_type = btrfs_file_extent_type(path->nodes[0], fi);
+
+	if (extent_type == BTRFS_FILE_EXTENT_REG ||
+	    extent_type == BTRFS_FILE_EXTENT_PREALLOC)
+		len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
+	else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+		len = btrfs_file_extent_inline_len(path->nodes[0], fi);
+	else
+		BUG();
+
+	return len;
+}
 
 static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 				   struct inode *inode, struct bio *bio,
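The new btrfs_file_extent_length() dispatches on the extent type: regular and preallocated extents report their on-disk byte count, inline extents report their inline length, and anything else is a bug. The same shape reduced to a standalone sketch:

    #include <stdint.h>
    #include <stdlib.h>

    enum extent_type { EXT_REG, EXT_PREALLOC, EXT_INLINE };

    struct file_extent {
        enum extent_type type;
        uint64_t num_bytes;    /* valid for REG / PREALLOC */
        uint64_t inline_len;   /* valid for INLINE */
    };

    static uint64_t file_extent_length(const struct file_extent *fi)
    {
        switch (fi->type) {
        case EXT_REG:
        case EXT_PREALLOC:
            return fi->num_bytes;
        case EXT_INLINE:
            return fi->inline_len;
        }
        abort();   /* unknown type: mirrors the BUG() above */
    }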
@@ -441,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		if (!contig)
 			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 
-		if (!contig && (offset >= ordered->file_offset + ordered->len ||
-		    offset < ordered->file_offset)) {
+		if (offset >= ordered->file_offset + ordered->len ||
+		    offset < ordered->file_offset) {
 			unsigned long bytes_left;
 			sums->len = this_sum_bytes;
 			this_sum_bytes = 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9ab1bed88116..f76b1fd160d4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
41#include "compat.h" 41#include "compat.h"
42#include "volumes.h" 42#include "volumes.h"
43 43
44static struct kmem_cache *btrfs_inode_defrag_cachep;
44/* 45/*
45 * when auto defrag is enabled we 46 * when auto defrag is enabled we
46 * queue up these defrag structs to remember which 47 * queue up these defrag structs to remember which
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
 * If an existing record is found the defrag item you
 * pass in is freed
 */
-static void __btrfs_add_inode_defrag(struct inode *inode,
+static int __btrfs_add_inode_defrag(struct inode *inode,
 				    struct inode_defrag *defrag)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
 			entry->transid = defrag->transid;
 			if (defrag->last_offset > entry->last_offset)
 				entry->last_offset = defrag->last_offset;
-			goto exists;
+			return -EEXIST;
 		}
 	}
 	set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
 	rb_link_node(&defrag->rb_node, parent, p);
 	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
-	return;
+	return 0;
+}
 
-exists:
-	kfree(defrag);
-	return;
+static inline int __need_auto_defrag(struct btrfs_root *root)
+{
+	if (!btrfs_test_opt(root, AUTO_DEFRAG))
+		return 0;
+
+	if (btrfs_fs_closing(root->fs_info))
+		return 0;
 
+	return 1;
 }
 
 /*
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct inode_defrag *defrag;
 	u64 transid;
+	int ret;
 
-	if (!btrfs_test_opt(root, AUTO_DEFRAG))
-		return 0;
-
-	if (btrfs_fs_closing(root->fs_info))
+	if (!__need_auto_defrag(root))
 		return 0;
 
 	if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 	else
 		transid = BTRFS_I(inode)->root->last_trans;
 
-	defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
 	if (!defrag)
 		return -ENOMEM;
 
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 	defrag->root = root->root_key.objectid;
 
 	spin_lock(&root->fs_info->defrag_inodes_lock);
-	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
-		__btrfs_add_inode_defrag(inode, defrag);
-	else
-		kfree(defrag);
+	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+		/*
+		 * If we set IN_DEFRAG flag and evict the inode from memory,
+		 * and then re-read this inode, this new inode doesn't have
+		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
+		 */
+		ret = __btrfs_add_inode_defrag(inode, defrag);
+		if (ret)
+			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	} else {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	}
 	spin_unlock(&root->fs_info->defrag_inodes_lock);
 	return 0;
 }
 
 /*
- * must be called with the defrag_inodes lock held
+ * Requeue the defrag object. If there is a defrag object that points to
+ * the same inode in the tree, we will merge them together (by
+ * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
+ */
+void btrfs_requeue_inode_defrag(struct inode *inode,
+				struct inode_defrag *defrag)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	if (!__need_auto_defrag(root))
+		goto out;
+
+	/*
+	 * Here we don't check the IN_DEFRAG flag, because we need merge
+	 * them together.
+	 */
+	spin_lock(&root->fs_info->defrag_inodes_lock);
+	ret = __btrfs_add_inode_defrag(inode, defrag);
+	spin_unlock(&root->fs_info->defrag_inodes_lock);
+	if (ret)
+		goto out;
+	return;
+out:
+	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+}
+
+/*
+ * pick the defragable inode that we want, if it doesn't exist, we will get
+ * the next one.
 */
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
-					     u64 root, u64 ino,
-					     struct rb_node **next)
+static struct inode_defrag *
+btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
 {
 	struct inode_defrag *entry = NULL;
 	struct inode_defrag tmp;
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
 	tmp.ino = ino;
 	tmp.root = root;
 
-	p = info->defrag_inodes.rb_node;
+	spin_lock(&fs_info->defrag_inodes_lock);
+	p = fs_info->defrag_inodes.rb_node;
 	while (p) {
 		parent = p;
 		entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -201,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
 		else if (ret > 0)
 			p = parent->rb_right;
 		else
-			return entry;
+			goto out;
 	}
 
-	if (next) {
-		while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
-			parent = rb_next(parent);
-			entry = rb_entry(parent, struct inode_defrag, rb_node);
-		}
-		*next = parent;
-	}
-	return NULL;
+	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+		parent = rb_next(parent);
+		if (parent)
+			entry = rb_entry(parent, struct inode_defrag, rb_node);
+		else
+			entry = NULL;
+	}
+out:
+	if (entry)
+		rb_erase(parent, &fs_info->defrag_inodes);
+	spin_unlock(&fs_info->defrag_inodes_lock);
+	return entry;
 }
 
-/*
- * run through the list of inodes in the FS that need
- * defragging
- */
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
 {
 	struct inode_defrag *defrag;
+	struct rb_node *node;
+
+	spin_lock(&fs_info->defrag_inodes_lock);
+	node = rb_first(&fs_info->defrag_inodes);
+	while (node) {
+		rb_erase(node, &fs_info->defrag_inodes);
+		defrag = rb_entry(node, struct inode_defrag, rb_node);
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+		if (need_resched()) {
+			spin_unlock(&fs_info->defrag_inodes_lock);
+			cond_resched();
+			spin_lock(&fs_info->defrag_inodes_lock);
+		}
+
+		node = rb_first(&fs_info->defrag_inodes);
+	}
+	spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH	1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+				    struct inode_defrag *defrag)
+{
 	struct btrfs_root *inode_root;
 	struct inode *inode;
-	struct rb_node *n;
 	struct btrfs_key key;
 	struct btrfs_ioctl_defrag_range_args range;
-	u64 first_ino = 0;
-	u64 root_objectid = 0;
 	int num_defrag;
-	int defrag_batch = 1024;
 
+	/* get the inode */
+	key.objectid = defrag->root;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	key.offset = (u64)-1;
+	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(inode_root)) {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+		return PTR_ERR(inode_root);
+	}
+
+	key.objectid = defrag->ino;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.offset = 0;
+	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+	if (IS_ERR(inode)) {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+		return PTR_ERR(inode);
+	}
+
+	/* do a chunk of defrag */
+	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
 	memset(&range, 0, sizeof(range));
 	range.len = (u64)-1;
+	range.start = defrag->last_offset;
+
+	sb_start_write(fs_info->sb);
+	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+				       BTRFS_DEFRAG_BATCH);
+	sb_end_write(fs_info->sb);
+	/*
+	 * if we filled the whole defrag batch, there
+	 * must be more work to do.  Queue this defrag
+	 * again
+	 */
+	if (num_defrag == BTRFS_DEFRAG_BATCH) {
+		defrag->last_offset = range.start;
+		btrfs_requeue_inode_defrag(inode, defrag);
+	} else if (defrag->last_offset && !defrag->cycled) {
+		/*
+		 * we didn't fill our defrag batch, but
+		 * we didn't start at zero.  Make sure we loop
+		 * around to the start of the file.
+		 */
+		defrag->last_offset = 0;
+		defrag->cycled = 1;
+		btrfs_requeue_inode_defrag(inode, defrag);
+	} else {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	}
+
+	iput(inode);
+	return 0;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+	struct inode_defrag *defrag;
+	u64 first_ino = 0;
+	u64 root_objectid = 0;
 
 	atomic_inc(&fs_info->defrag_running);
-	spin_lock(&fs_info->defrag_inodes_lock);
 	while(1) {
-		n = NULL;
+		if (!__need_auto_defrag(fs_info->tree_root))
+			break;
 
 		/* find an inode to defrag */
-		defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
-						 first_ino, &n);
+		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+						 first_ino);
 		if (!defrag) {
-			if (n) {
-				defrag = rb_entry(n, struct inode_defrag,
-						  rb_node);
-			} else if (root_objectid || first_ino) {
+			if (root_objectid || first_ino) {
 				root_objectid = 0;
 				first_ino = 0;
 				continue;
@@ -255,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 			}
 		}
 
-		/* remove it from the rbtree */
 		first_ino = defrag->ino + 1;
 		root_objectid = defrag->root;
-		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
-
-		if (btrfs_fs_closing(fs_info))
-			goto next_free;
-
-		spin_unlock(&fs_info->defrag_inodes_lock);
-
-		/* get the inode */
-		key.objectid = defrag->root;
-		btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-		key.offset = (u64)-1;
-		inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
-		if (IS_ERR(inode_root))
-			goto next;
-
-		key.objectid = defrag->ino;
-		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-		key.offset = 0;
-
-		inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
-		if (IS_ERR(inode))
-			goto next;
-
-		/* do a chunk of defrag */
-		clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
-		range.start = defrag->last_offset;
-		num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
-					       defrag_batch);
-		/*
-		 * if we filled the whole defrag batch, there
-		 * must be more work to do.  Queue this defrag
-		 * again
-		 */
-		if (num_defrag == defrag_batch) {
-			defrag->last_offset = range.start;
-			__btrfs_add_inode_defrag(inode, defrag);
-			/*
-			 * we don't want to kfree defrag, we added it back to
-			 * the rbtree
-			 */
-			defrag = NULL;
-		} else if (defrag->last_offset && !defrag->cycled) {
-			/*
-			 * we didn't fill our defrag batch, but
-			 * we didn't start at zero.  Make sure we loop
-			 * around to the start of the file.
-			 */
-			defrag->last_offset = 0;
-			defrag->cycled = 1;
-			__btrfs_add_inode_defrag(inode, defrag);
-			defrag = NULL;
-		}
 
-		iput(inode);
-next:
-		spin_lock(&fs_info->defrag_inodes_lock);
-next_free:
-		kfree(defrag);
+		__btrfs_run_defrag_inode(fs_info, defrag);
 	}
-	spin_unlock(&fs_info->defrag_inodes_lock);
-
 	atomic_dec(&fs_info->defrag_running);
 
 	/*
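The rewrite turns btrfs_run_defrag_inodes() from "walk the rbtree while juggling the spinlock" into a drain loop: btrfs_pick_defrag_inode() unlinks one record under the lock, and all heavy work (btrfs_iget, the defrag itself, requeueing) runs with no lock held. The pattern in miniature, with a pthread mutex standing in for the spinlock and a list standing in for the rbtree:

    #include <pthread.h>
    #include <stdlib.h>

    struct work { struct work *next; int payload; };

    static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct work *q_head;

    /* unlink exactly one item while holding the lock */
    static struct work *pick_one(void)
    {
        pthread_mutex_lock(&q_lock);
        struct work *w = q_head;
        if (w)
            q_head = w->next;
        pthread_mutex_unlock(&q_lock);
        return w;
    }

    static void run_all(void)
    {
        struct work *w;
        while ((w = pick_one()) != NULL) {
            /* process with no lock held; this may block or requeue */
            free(w);
        }
    }

Because the item is already unlinked, the processing step can sleep, fail, or put the item back without ever touching the lock-hold time of other threads.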
@@ -526,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->block_len = em->block_len;
 		else
 			split->block_len = split->len;
+		split->orig_block_len = max(split->block_len,
+					    em->orig_block_len);
 		split->generation = gen;
 		split->bdev = em->bdev;
 		split->flags = flags;
@@ -547,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		split->flags = flags;
 		split->compress_type = em->compress_type;
 		split->generation = gen;
+		split->orig_block_len = max(em->block_len,
+					    em->orig_block_len);
 
 		if (compressed) {
 			split->block_len = em->block_len;
@@ -555,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		} else {
 			split->block_len = split->len;
 			split->block_start = em->block_start + diff;
-			split->orig_start = split->start;
+			split->orig_start = em->orig_start;
 		}
 
 		ret = add_extent_mapping(em_tree, split);
@@ -1346,10 +1412,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
 		cond_resched();
 
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-						   dirty_pages);
+		balance_dirty_pages_ratelimited(inode->i_mapping);
 		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
-			btrfs_btree_balance_dirty(root, 1);
+			btrfs_btree_balance_dirty(root);
 
 		pos += copied;
 		num_written += copied;
@@ -1398,6 +1463,24 @@ out:
 	return written ? written : err;
 }
 
+static void update_time_for_write(struct inode *inode)
+{
+	struct timespec now;
+
+	if (IS_NOCMTIME(inode))
+		return;
+
+	now = current_fs_time(inode->i_sb);
+	if (!timespec_equal(&inode->i_mtime, &now))
+		inode->i_mtime = now;
+
+	if (!timespec_equal(&inode->i_ctime, &now))
+		inode->i_ctime = now;
+
+	if (IS_I_VERSION(inode))
+		inode_inc_iversion(inode);
+}
+
 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 				    const struct iovec *iov,
 				    unsigned long nr_segs, loff_t pos)
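update_time_for_write() lets the write path update timestamps in memory without starting a transaction (file_update_time() would), since the inode is written back later together with the data. A sketch of the compare-then-assign part using plain structs rather than kernel inodes:

    #include <stdbool.h>
    #include <time.h>

    struct inode_times { struct timespec mtime, ctime; };

    static bool ts_equal(const struct timespec *a, const struct timespec *b)
    {
        return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
    }

    /* only touch fields that actually change, as update_time_for_write() does */
    static void update_times(struct inode_times *t)
    {
        struct timespec now;

        clock_gettime(CLOCK_REALTIME, &now);
        if (!ts_equal(&t->mtime, &now))
            t->mtime = now;
        if (!ts_equal(&t->ctime, &now))
            t->ctime = now;
    }

Skipping the assignment when nothing changed avoids dirtying the structure (and, in the kernel case, the inode) needlessly.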
@@ -1410,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	ssize_t num_written = 0;
 	ssize_t err = 0;
 	size_t count, ocount;
+	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
 
 	sb_start_write(inode->i_sb);
 
@@ -1452,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 		goto out;
 	}
 
-	err = file_update_time(file);
-	if (err) {
-		mutex_unlock(&inode->i_mutex);
-		goto out;
-	}
+	/*
+	 * We reserve space for updating the inode when we reserve space for the
+	 * extent we are going to write, so we will enospc out there.  We don't
+	 * need to start yet another transaction to update the inode as we will
+	 * update the inode when we finish writing whatever data we write.
+	 */
+	update_time_for_write(inode);
 
 	start_pos = round_down(pos, root->sectorsize);
 	if (start_pos > i_size_read(inode)) {
@@ -1467,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 		}
 	}
 
+	if (sync)
+		atomic_inc(&BTRFS_I(inode)->sync_writers);
+
 	if (unlikely(file->f_flags & O_DIRECT)) {
 		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
 						   pos, ppos, count, ocount);
@@ -1493,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	 * this will either be one more than the running transaction
 	 * or the generation used for the next transaction if there isn't
 	 * one running right now.
+	 *
+	 * We also have to set last_sub_trans to the current log transid,
+	 * otherwise subsequent syncs to a file that's been synced in this
+	 * transaction will appear to have already occured.
 	 */
 	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+	BTRFS_I(inode)->last_sub_trans = root->log_transid;
 	if (num_written > 0 || num_written == -EIOCBQUEUED) {
 		err = generic_write_sync(file, pos, num_written);
 		if (err < 0 && num_written > 0)
 			num_written = err;
 	}
 out:
+	if (sync)
+		atomic_dec(&BTRFS_I(inode)->sync_writers);
 	sb_end_write(inode->i_sb);
 	current->backing_dev_info = NULL;
 	return num_written ? num_written : err;
@@ -1551,7 +1647,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * out of the ->i_mutex. If so, we can flush the dirty pages by
 	 * multi-task, and make the performance up.
 	 */
+	atomic_inc(&BTRFS_I(inode)->sync_writers);
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	atomic_dec(&BTRFS_I(inode)->sync_writers);
 	if (ret)
 		return ret;
 
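The sync_writers counter is bumped around both the O_DSYNC write path and the fsync flush, giving other paths a cheap signal that a flush-minded writer is active. A sketch with C11 atomics; what a reader does with the counter is left to the caller:

    #include <stdatomic.h>

    static atomic_int sync_writers;   /* zero-initialized at file scope */

    /* writer side: announce that a flush-friendly writer is in flight */
    static void write_path(void)
    {
        atomic_fetch_add(&sync_writers, 1);
        /* ... write and/or flush data ... */
        atomic_fetch_sub(&sync_writers, 1);
    }

    /* other paths can sample the counter to tune their own flushing */
    static int writers_active(void)
    {
        return atomic_load(&sync_writers) > 0;
    }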
@@ -1562,7 +1660,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * range being left.
 	 */
 	atomic_inc(&root->log_batch);
-	btrfs_wait_ordered_range(inode, start, end);
+	btrfs_wait_ordered_range(inode, start, end - start + 1);
 	atomic_inc(&root->log_batch);
 
 	/*
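Note what this one-liner fixes: btrfs_wait_ordered_range() takes a length, and for the inclusive byte range [start, end] that length is end - start + 1, not end. The off-by-one, spelled out:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t start = 4096, end = 8191;   /* inclusive byte range */
        uint64_t len = end - start + 1;      /* 4096 bytes, not 4095 */

        assert(len == 4096);
        return 0;
    }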
@@ -1768,6 +1866,7 @@ out:
 
 	hole_em->block_start = EXTENT_MAP_HOLE;
 	hole_em->block_len = 0;
+	hole_em->orig_block_len = 0;
 	hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
 	hole_em->compress_type = BTRFS_COMPRESS_NONE;
 	hole_em->generation = trans->transid;
@@ -1797,48 +1896,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	struct btrfs_path *path;
 	struct btrfs_block_rsv *rsv;
 	struct btrfs_trans_handle *trans;
-	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
-	u64 lockstart = (offset + mask) & ~mask;
-	u64 lockend = ((offset + len) & ~mask) - 1;
+	u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+	u64 lockend = round_down(offset + len,
+				 BTRFS_I(inode)->root->sectorsize) - 1;
 	u64 cur_offset = lockstart;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 	u64 drop_end;
-	unsigned long nr;
 	int ret = 0;
 	int err = 0;
-	bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
-		((offset + len) >> PAGE_CACHE_SHIFT);
+	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+			  ((offset + len - 1) >> PAGE_CACHE_SHIFT));
 
 	btrfs_wait_ordered_range(inode, offset, len);
 
 	mutex_lock(&inode->i_mutex);
-	if (offset >= inode->i_size) {
-		mutex_unlock(&inode->i_mutex);
-		return 0;
-	}
-
+	/*
+	 * We needn't truncate any page which is beyond the end of the file
+	 * because we are sure there is no data there.
+	 */
 	/*
 	 * Only do this if we are in the same page and we aren't doing the
 	 * entire page.
 	 */
 	if (same_page && len < PAGE_CACHE_SIZE) {
-		ret = btrfs_truncate_page(inode, offset, len, 0);
+		if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+			ret = btrfs_truncate_page(inode, offset, len, 0);
 		mutex_unlock(&inode->i_mutex);
 		return ret;
 	}
 
 	/* zero back part of the first page */
-	ret = btrfs_truncate_page(inode, offset, 0, 0);
-	if (ret) {
-		mutex_unlock(&inode->i_mutex);
-		return ret;
+	if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+		ret = btrfs_truncate_page(inode, offset, 0, 0);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			return ret;
+		}
 	}
 
 	/* zero the front end of the last page */
-	ret = btrfs_truncate_page(inode, offset + len, 0, 1);
-	if (ret) {
-		mutex_unlock(&inode->i_mutex);
-		return ret;
+	if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+		ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			return ret;
+		}
 	}
 
 	if (lockend < lockstart) {
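The open-coded mask arithmetic gives way to round_up()/round_down(), which makes the hole-punch bounds readable: the locked region is the run of whole blocks strictly inside [offset, offset+len). The same computation for power-of-two block sizes, as a self-checking sketch:

    #include <assert.h>
    #include <stdint.h>

    /* power-of-two alignment helpers, matching the kernel's round_up/round_down */
    #define ROUND_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))
    #define ROUND_UP(x, a)   ROUND_DOWN((x) + (a) - 1, (a))

    int main(void)
    {
        uint64_t bs = 4096, offset = 1000, len = 10000;

        /* full blocks strictly inside [offset, offset + len) */
        uint64_t lockstart = ROUND_UP(offset, bs);             /* 4096 */
        uint64_t lockend = ROUND_DOWN(offset + len, bs) - 1;   /* 8191 */

        assert(lockstart == 4096 && lockend == 8191);
        return 0;
    }

The fallocate hunks further down make the same trade: round_down/round_up over hand-rolled `& ~mask` expressions.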
@@ -1931,9 +2033,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			break;
 		}
 
-		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
-		btrfs_btree_balance_dirty(root, 1);
+		btrfs_btree_balance_dirty(root);
 
 		trans = btrfs_start_transaction(root, 3);
 		if (IS_ERR(trans)) {
@@ -1964,11 +2065,13 @@ out_trans:
 	if (!trans)
 		goto out_free;
 
+	inode_inc_iversion(inode);
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
 	trans->block_rsv = &root->fs_info->trans_block_rsv;
 	ret = btrfs_update_inode(trans, root, inode);
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 out_free:
 	btrfs_free_path(path);
 	btrfs_free_block_rsv(root, rsv);
@@ -1992,12 +2095,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 	u64 alloc_end;
 	u64 alloc_hint = 0;
 	u64 locked_end;
-	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 	struct extent_map *em;
+	int blocksize = BTRFS_I(inode)->root->sectorsize;
 	int ret;
 
-	alloc_start = offset & ~mask;
-	alloc_end = (offset + len + mask) & ~mask;
+	alloc_start = round_down(offset, blocksize);
+	alloc_end = round_up(offset + len, blocksize);
 
 	/* Make sure we aren't being give some crap mode */
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2010,7 +2113,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 	 * Make sure we have enough space before we do the
 	 * allocation.
 	 */
-	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
 	if (ret)
 		return ret;
 
@@ -2078,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 		}
 		last_byte = min(extent_map_end(em), alloc_end);
 		actual_end = min_t(u64, extent_map_end(em), offset + len);
-		last_byte = (last_byte + mask) & ~mask;
+		last_byte = ALIGN(last_byte, blocksize);
 
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
@@ -2117,11 +2220,11 @@ static long btrfs_fallocate(struct file *file, int mode,
 out:
 	mutex_unlock(&inode->i_mutex);
 	/* Let go of our reservation. */
-	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 	return ret;
 }
 
-static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
+static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_map *em;
@@ -2138,6 +2241,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
 	if (lockend <= lockstart)
 		lockend = lockstart + root->sectorsize;
 
+	lockend--;
 	len = lockend - lockstart + 1;
 
 	len = max_t(u64, len, root->sectorsize);
@@ -2155,7 +2259,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
 	 * before the position we want in case there is outstanding delalloc
 	 * going on here.
 	 */
-	if (origin == SEEK_HOLE && start != 0) {
+	if (whence == SEEK_HOLE && start != 0) {
 		if (start <= root->sectorsize)
 			em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
 						     root->sectorsize, 0);
@@ -2189,13 +2293,13 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
 			}
 		}
 
-		if (origin == SEEK_HOLE) {
+		if (whence == SEEK_HOLE) {
 				*offset = start;
 				free_extent_map(em);
 				break;
 			}
 		} else {
-			if (origin == SEEK_DATA) {
+			if (whence == SEEK_DATA) {
 				if (em->block_start == EXTENT_MAP_DELALLOC) {
 					if (start >= inode->i_size) {
 						free_extent_map(em);
@@ -2204,9 +2308,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
 					}
 				}
 
-				*offset = start;
-				free_extent_map(em);
-				break;
+				if (!test_bit(EXTENT_FLAG_PREALLOC,
+					      &em->flags)) {
+					*offset = start;
+					free_extent_map(em);
+					break;
+				}
 			}
 		}
 
@@ -2232,16 +2339,16 @@ out:
 	return ret;
 }
 
-static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
+static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file->f_mapping->host;
 	int ret;
 
 	mutex_lock(&inode->i_mutex);
-	switch (origin) {
+	switch (whence) {
 	case SEEK_END:
 	case SEEK_CUR:
-		offset = generic_file_llseek(file, offset, origin);
+		offset = generic_file_llseek(file, offset, whence);
 		goto out;
 	case SEEK_DATA:
 	case SEEK_HOLE:
@@ -2250,7 +2357,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
 		return -ENXIO;
 	}
 
-	ret = find_desired_extent(inode, &offset, origin);
+	ret = find_desired_extent(inode, &offset, whence);
 	if (ret) {
 		mutex_unlock(&inode->i_mutex);
 		return ret;
@@ -2293,3 +2400,21 @@ const struct file_operations btrfs_file_operations = {
 	.compat_ioctl	= btrfs_ioctl,
 #endif
 };
+
+void btrfs_auto_defrag_exit(void)
+{
+	if (btrfs_inode_defrag_cachep)
+		kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int btrfs_auto_defrag_init(void)
+{
+	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+					sizeof(struct inode_defrag), 0,
+					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+					NULL);
+	if (!btrfs_inode_defrag_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
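The defrag records move from kzalloc/kfree to a dedicated slab cache created at module init and torn down at exit; init fails with -ENOMEM if the cache cannot be created. The same lifecycle in a toy userspace object cache — a thin malloc wrapper, purely illustrative:

    #include <stdlib.h>

    /* one fixed object size, zeroed handouts — loosely analogous to a
     * kmem_cache used only through *_zalloc()/*_free() */
    struct obj_cache { size_t obj_size; };

    static struct obj_cache *cache_create(size_t obj_size)
    {
        struct obj_cache *c = malloc(sizeof(*c));
        if (c)
            c->obj_size = obj_size;
        return c;
    }

    static void *cache_zalloc(struct obj_cache *c) { return calloc(1, c->obj_size); }
    static void cache_free(struct obj_cache *c, void *p) { (void)c; free(p); }
    static void cache_destroy(struct obj_cache *c) { free(c); }

    int main(void)
    {
        struct obj_cache *defrag_cache = cache_create(64);
        if (!defrag_cache)
            return 1;                  /* module init would return -ENOMEM */
        void *rec = cache_zalloc(defrag_cache);
        cache_free(defrag_cache, rec);
        cache_destroy(defrag_cache);   /* module exit tears the cache down */
        return 0;
    }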
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 1027b854b90c..0be7a8742a43 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
 
 static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
 {
-	WARN_ON(io_ctl->cur);
 	BUG_ON(io_ctl->index >= io_ctl->num_pages);
 	io_ctl->page = io_ctl->pages[io_ctl->index++];
 	io_ctl->cur = kmap(io_ctl->page);
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 			 * if previous extent entry covers the offset,
 			 * we should return it instead of the bitmap entry
 			 */
-			n = &entry->offset_index;
-			while (1) {
-				n = rb_prev(n);
-				if (!n)
-					break;
+			n = rb_prev(&entry->offset_index);
+			if (n) {
 				prev = rb_entry(n, struct btrfs_free_space,
 						offset_index);
-				if (!prev->bitmap) {
-					if (prev->offset + prev->bytes > offset)
-						entry = prev;
-					break;
-				}
+				if (!prev->bitmap &&
+				    prev->offset + prev->bytes > offset)
+					entry = prev;
 			}
 		}
 		return entry;
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 		}
 
 		if (entry->bitmap) {
-			n = &entry->offset_index;
-			while (1) {
-				n = rb_prev(n);
-				if (!n)
-					break;
+			n = rb_prev(&entry->offset_index);
+			if (n) {
 				prev = rb_entry(n, struct btrfs_free_space,
 						offset_index);
-				if (!prev->bitmap) {
-					if (prev->offset + prev->bytes > offset)
-						return prev;
-					break;
-				}
+				if (!prev->bitmap &&
+				    prev->offset + prev->bytes > offset)
+					return prev;
 			}
 			if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
 				return entry;
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1364 u64 bitmap_bytes; 1353 u64 bitmap_bytes;
1365 u64 extent_bytes; 1354 u64 extent_bytes;
1366 u64 size = block_group->key.offset; 1355 u64 size = block_group->key.offset;
1367 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; 1356 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1368 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1357 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1369 1358
1370 BUG_ON(ctl->total_bitmaps > max_bitmaps); 1359 BUG_ON(ctl->total_bitmaps > max_bitmaps);
@@ -1650,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1650 * some block groups are so tiny they can't be enveloped by a bitmap, so 1639 * some block groups are so tiny they can't be enveloped by a bitmap, so
1651 * don't even bother to create a bitmap for this 1640 * don't even bother to create a bitmap for this
1652 */ 1641 */
1653 if (BITS_PER_BITMAP * block_group->sectorsize > 1642 if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
1654 block_group->key.offset)
1655 return false; 1643 return false;
1656 1644
1657 return true; 1645 return true;
@@ -1874,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
1874{ 1862{
1875 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1863 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1876 struct btrfs_free_space *info; 1864 struct btrfs_free_space *info;
1877 int ret = 0; 1865 int ret;
1866 bool re_search = false;
1878 1867
1879 spin_lock(&ctl->tree_lock); 1868 spin_lock(&ctl->tree_lock);
1880 1869
1881again: 1870again:
1871 ret = 0;
1882 if (!bytes) 1872 if (!bytes)
1883 goto out_lock; 1873 goto out_lock;
1884 1874
@@ -1891,17 +1881,17 @@ again:
1891 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1881 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1892 1, 0); 1882 1, 0);
1893 if (!info) { 1883 if (!info) {
1894 /* the tree logging code might be calling us before we 1884 /*
1895 * have fully loaded the free space rbtree for this 1885 * If we found a partial bit of our free space in a
1896 * block group. So it is possible the entry won't 1886 * bitmap but then couldn't find the other part this may
1897 * be in the rbtree yet at all. The caching code 1887 * be a problem, so WARN about it.
1898 * will make sure not to put it in the rbtree if
1899 * the logging code has pinned it.
1900 */ 1888 */
1889 WARN_ON(re_search);
1901 goto out_lock; 1890 goto out_lock;
1902 } 1891 }
1903 } 1892 }
1904 1893
1894 re_search = false;
1905 if (!info->bitmap) { 1895 if (!info->bitmap) {
1906 unlink_free_space(ctl, info); 1896 unlink_free_space(ctl, info);
1907 if (offset == info->offset) { 1897 if (offset == info->offset) {
@@ -1947,8 +1937,10 @@ again:
1947 } 1937 }
1948 1938
1949 ret = remove_from_bitmap(ctl, info, &offset, &bytes); 1939 ret = remove_from_bitmap(ctl, info, &offset, &bytes);
1950 if (ret == -EAGAIN) 1940 if (ret == -EAGAIN) {
1941 re_search = true;
1951 goto again; 1942 goto again;
1943 }
1952 BUG_ON(ret); /* logic error */ 1944 BUG_ON(ret); /* logic error */
1953out_lock: 1945out_lock:
1954 spin_unlock(&ctl->tree_lock); 1946 spin_unlock(&ctl->tree_lock);
@@ -2298,10 +2290,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2298 unsigned long total_found = 0; 2290 unsigned long total_found = 0;
2299 int ret; 2291 int ret;
2300 2292
2301 i = offset_to_bit(entry->offset, block_group->sectorsize, 2293 i = offset_to_bit(entry->offset, ctl->unit,
2302 max_t(u64, offset, entry->offset)); 2294 max_t(u64, offset, entry->offset));
2303 want_bits = bytes_to_bits(bytes, block_group->sectorsize); 2295 want_bits = bytes_to_bits(bytes, ctl->unit);
2304 min_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2296 min_bits = bytes_to_bits(min_bytes, ctl->unit);
2305 2297
2306again: 2298again:
2307 found_bits = 0; 2299 found_bits = 0;
@@ -2325,23 +2317,22 @@ again:
2325 2317
2326 total_found += found_bits; 2318 total_found += found_bits;
2327 2319
2328 if (cluster->max_size < found_bits * block_group->sectorsize) 2320 if (cluster->max_size < found_bits * ctl->unit)
2329 cluster->max_size = found_bits * block_group->sectorsize; 2321 cluster->max_size = found_bits * ctl->unit;
2330 2322
2331 if (total_found < want_bits || cluster->max_size < cont1_bytes) { 2323 if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2332 i = next_zero + 1; 2324 i = next_zero + 1;
2333 goto again; 2325 goto again;
2334 } 2326 }
2335 2327
2336 cluster->window_start = start * block_group->sectorsize + 2328 cluster->window_start = start * ctl->unit + entry->offset;
2337 entry->offset;
2338 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2329 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2339 ret = tree_insert_offset(&cluster->root, entry->offset, 2330 ret = tree_insert_offset(&cluster->root, entry->offset,
2340 &entry->offset_index, 1); 2331 &entry->offset_index, 1);
2341 BUG_ON(ret); /* -EEXIST; Logic error */ 2332 BUG_ON(ret); /* -EEXIST; Logic error */
2342 2333
2343 trace_btrfs_setup_cluster(block_group, cluster, 2334 trace_btrfs_setup_cluster(block_group, cluster,
2344 total_found * block_group->sectorsize, 1); 2335 total_found * ctl->unit, 1);
2345 return 0; 2336 return 0;
2346} 2337}
2347 2338
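
Note: these hunks switch the bitmap arithmetic from block_group->sectorsize to ctl->unit so the same code serves caches whose granularity is not the sector size (the free-ino cache, for instance, uses a unit of one inode number). The two helpers being fed, paraphrased from free-space-cache.c of this era (treat the signatures as assumed):

    static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
                                              u64 offset)
    {
            BUG_ON(offset < bitmap_start);
            offset -= bitmap_start;
            return (unsigned long)div_u64(offset, unit);
    }

    static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
    {
            return (unsigned long)div_u64(bytes, unit);
    }
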
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b1a1c929ba80..d26f67a59e36 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
434 * 3 items for pre-allocation 434 * 3 items for pre-allocation
435 */ 435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); 436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, 437 ret = btrfs_block_rsv_add(root, trans->block_rsv,
438 trans->bytes_reserved); 438 trans->bytes_reserved,
439 BTRFS_RESERVE_NO_FLUSH);
439 if (ret) 440 if (ret)
440 goto out; 441 goto out;
441 trace_btrfs_space_reservation(root->fs_info, "ino_cache", 442 trace_btrfs_space_reservation(root->fs_info, "ino_cache",
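
Note: the _noflush call goes away because this series folds the reservation variants into one entry point that takes an explicit flush mode. The enum behind the new argument, as introduced in ctree.h by this series (the comments are an interpretation, not quoted):

    enum btrfs_reserve_flush_enum {
            /* fail immediately rather than flush anything */
            BTRFS_RESERVE_NO_FLUSH,
            /*
             * flush only from contexts that cannot deadlock here
             * (used on the truncate/evict paths)
             */
            BTRFS_RESERVE_FLUSH_LIMIT,
            /* flush anything that may help satisfy the reservation */
            BTRFS_RESERVE_FLUSH_ALL,
    };
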
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95542a1b3dfc..cc93b23ca352 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
71static struct extent_io_ops btrfs_extent_io_ops; 71static struct extent_io_ops btrfs_extent_io_ops;
72 72
73static struct kmem_cache *btrfs_inode_cachep; 73static struct kmem_cache *btrfs_inode_cachep;
74static struct kmem_cache *btrfs_delalloc_work_cachep;
74struct kmem_cache *btrfs_trans_handle_cachep; 75struct kmem_cache *btrfs_trans_handle_cachep;
75struct kmem_cache *btrfs_transaction_cachep; 76struct kmem_cache *btrfs_transaction_cachep;
76struct kmem_cache *btrfs_path_cachep; 77struct kmem_cache *btrfs_path_cachep;
@@ -87,13 +88,17 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
87 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 88 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
88}; 89};
89 90
90static int btrfs_setsize(struct inode *inode, loff_t newsize); 91static int btrfs_setsize(struct inode *inode, struct iattr *attr);
91static int btrfs_truncate(struct inode *inode); 92static int btrfs_truncate(struct inode *inode);
92static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); 93static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
93static noinline int cow_file_range(struct inode *inode, 94static noinline int cow_file_range(struct inode *inode,
94 struct page *locked_page, 95 struct page *locked_page,
95 u64 start, u64 end, int *page_started, 96 u64 start, u64 end, int *page_started,
96 unsigned long *nr_written, int unlock); 97 unsigned long *nr_written, int unlock);
98static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
99 u64 len, u64 orig_start,
100 u64 block_start, u64 block_len,
101 u64 orig_block_len, int type);
97 102
98static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 103static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
99 struct inode *inode, struct inode *dir, 104 struct inode *inode, struct inode *dir,
@@ -698,14 +703,19 @@ retry:
698 703
699 em->block_start = ins.objectid; 704 em->block_start = ins.objectid;
700 em->block_len = ins.offset; 705 em->block_len = ins.offset;
706 em->orig_block_len = ins.offset;
701 em->bdev = root->fs_info->fs_devices->latest_bdev; 707 em->bdev = root->fs_info->fs_devices->latest_bdev;
702 em->compress_type = async_extent->compress_type; 708 em->compress_type = async_extent->compress_type;
703 set_bit(EXTENT_FLAG_PINNED, &em->flags); 709 set_bit(EXTENT_FLAG_PINNED, &em->flags);
704 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 710 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
711 em->generation = -1;
705 712
706 while (1) { 713 while (1) {
707 write_lock(&em_tree->lock); 714 write_lock(&em_tree->lock);
708 ret = add_extent_mapping(em_tree, em); 715 ret = add_extent_mapping(em_tree, em);
716 if (!ret)
717 list_move(&em->list,
718 &em_tree->modified_extents);
709 write_unlock(&em_tree->lock); 719 write_unlock(&em_tree->lock);
710 if (ret != -EEXIST) { 720 if (ret != -EEXIST) {
711 free_extent_map(em); 721 free_extent_map(em);
@@ -803,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
803 * required to start IO on it. It may be clean and already done with 813 * required to start IO on it. It may be clean and already done with
804 * IO when we return. 814 * IO when we return.
805 */ 815 */
806static noinline int cow_file_range(struct inode *inode, 816static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
807 struct page *locked_page, 817 struct inode *inode,
808 u64 start, u64 end, int *page_started, 818 struct btrfs_root *root,
809 unsigned long *nr_written, 819 struct page *locked_page,
810 int unlock) 820 u64 start, u64 end, int *page_started,
821 unsigned long *nr_written,
822 int unlock)
811{ 823{
812 struct btrfs_root *root = BTRFS_I(inode)->root;
813 struct btrfs_trans_handle *trans;
814 u64 alloc_hint = 0; 824 u64 alloc_hint = 0;
815 u64 num_bytes; 825 u64 num_bytes;
816 unsigned long ram_size; 826 unsigned long ram_size;
@@ -823,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode,
823 int ret = 0; 833 int ret = 0;
824 834
825 BUG_ON(btrfs_is_free_space_inode(inode)); 835 BUG_ON(btrfs_is_free_space_inode(inode));
826 trans = btrfs_join_transaction(root);
827 if (IS_ERR(trans)) {
828 extent_clear_unlock_delalloc(inode,
829 &BTRFS_I(inode)->io_tree,
830 start, end, locked_page,
831 EXTENT_CLEAR_UNLOCK_PAGE |
832 EXTENT_CLEAR_UNLOCK |
833 EXTENT_CLEAR_DELALLOC |
834 EXTENT_CLEAR_DIRTY |
835 EXTENT_SET_WRITEBACK |
836 EXTENT_END_WRITEBACK);
837 return PTR_ERR(trans);
838 }
839 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
840 836
841 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 837 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
842 num_bytes = max(blocksize, num_bytes); 838 num_bytes = max(blocksize, num_bytes);
843 disk_num_bytes = num_bytes; 839 disk_num_bytes = num_bytes;
844 ret = 0;
845 840
846 /* if this is a small write inside eof, kick off defrag */ 841 /* if this is a small write inside eof, kick off defrag */
847 if (num_bytes < 64 * 1024 && 842 if (num_bytes < 64 * 1024 &&
@@ -900,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,
900 895
901 em->block_start = ins.objectid; 896 em->block_start = ins.objectid;
902 em->block_len = ins.offset; 897 em->block_len = ins.offset;
898 em->orig_block_len = ins.offset;
903 em->bdev = root->fs_info->fs_devices->latest_bdev; 899 em->bdev = root->fs_info->fs_devices->latest_bdev;
904 set_bit(EXTENT_FLAG_PINNED, &em->flags); 900 set_bit(EXTENT_FLAG_PINNED, &em->flags);
901 em->generation = -1;
905 902
906 while (1) { 903 while (1) {
907 write_lock(&em_tree->lock); 904 write_lock(&em_tree->lock);
908 ret = add_extent_mapping(em_tree, em); 905 ret = add_extent_mapping(em_tree, em);
906 if (!ret)
907 list_move(&em->list,
908 &em_tree->modified_extents);
909 write_unlock(&em_tree->lock); 909 write_unlock(&em_tree->lock);
910 if (ret != -EEXIST) { 910 if (ret != -EEXIST) {
911 free_extent_map(em); 911 free_extent_map(em);
@@ -952,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,
952 alloc_hint = ins.objectid + ins.offset; 952 alloc_hint = ins.objectid + ins.offset;
953 start += cur_alloc_size; 953 start += cur_alloc_size;
954 } 954 }
955 ret = 0;
956out: 955out:
957 btrfs_end_transaction(trans, root);
958
959 return ret; 956 return ret;
957
960out_unlock: 958out_unlock:
961 extent_clear_unlock_delalloc(inode, 959 extent_clear_unlock_delalloc(inode,
962 &BTRFS_I(inode)->io_tree, 960 &BTRFS_I(inode)->io_tree,
@@ -971,6 +969,39 @@ out_unlock:
971 goto out; 969 goto out;
972} 970}
973 971
972static noinline int cow_file_range(struct inode *inode,
973 struct page *locked_page,
974 u64 start, u64 end, int *page_started,
975 unsigned long *nr_written,
976 int unlock)
977{
978 struct btrfs_trans_handle *trans;
979 struct btrfs_root *root = BTRFS_I(inode)->root;
980 int ret;
981
982 trans = btrfs_join_transaction(root);
983 if (IS_ERR(trans)) {
984 extent_clear_unlock_delalloc(inode,
985 &BTRFS_I(inode)->io_tree,
986 start, end, locked_page,
987 EXTENT_CLEAR_UNLOCK_PAGE |
988 EXTENT_CLEAR_UNLOCK |
989 EXTENT_CLEAR_DELALLOC |
990 EXTENT_CLEAR_DIRTY |
991 EXTENT_SET_WRITEBACK |
992 EXTENT_END_WRITEBACK);
993 return PTR_ERR(trans);
994 }
995 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
996
997 ret = __cow_file_range(trans, inode, root, locked_page, start, end,
998 page_started, nr_written, unlock);
999
1000 btrfs_end_transaction(trans, root);
1001
1002 return ret;
1003}
1004
974/* 1005/*
975 * work queue call back to start compression on a file and pages 1006 * work queue call back to start compression on a file and pages
976 */ 1007 */
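
Note: hoisting the join/end pair into this thin cow_file_range() wrapper lets run_delalloc_nocow(), which already owns a transaction handle, call __cow_file_range() directly (see the hunks below). For contrast, the two ways of obtaining a handle (generic btrfs pattern, not part of this diff):

    /* piggy-back on the currently running transaction; no new reservation */
    trans = btrfs_join_transaction(root);

    /* start (or join) a transaction, reserving metadata for 'num_items' */
    trans = btrfs_start_transaction(root, num_items);
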
@@ -1126,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1126 u64 extent_offset; 1157 u64 extent_offset;
1127 u64 disk_bytenr; 1158 u64 disk_bytenr;
1128 u64 num_bytes; 1159 u64 num_bytes;
1160 u64 disk_num_bytes;
1129 int extent_type; 1161 int extent_type;
1130 int ret, err; 1162 int ret, err;
1131 int type; 1163 int type;
@@ -1228,6 +1260,8 @@ next_slot:
1228 extent_offset = btrfs_file_extent_offset(leaf, fi); 1260 extent_offset = btrfs_file_extent_offset(leaf, fi);
1229 extent_end = found_key.offset + 1261 extent_end = found_key.offset +
1230 btrfs_file_extent_num_bytes(leaf, fi); 1262 btrfs_file_extent_num_bytes(leaf, fi);
1263 disk_num_bytes =
1264 btrfs_file_extent_disk_num_bytes(leaf, fi);
1231 if (extent_end <= start) { 1265 if (extent_end <= start) {
1232 path->slots[0]++; 1266 path->slots[0]++;
1233 goto next_slot; 1267 goto next_slot;
@@ -1281,9 +1315,9 @@ out_check:
1281 1315
1282 btrfs_release_path(path); 1316 btrfs_release_path(path);
1283 if (cow_start != (u64)-1) { 1317 if (cow_start != (u64)-1) {
1284 ret = cow_file_range(inode, locked_page, cow_start, 1318 ret = __cow_file_range(trans, inode, root, locked_page,
1285 found_key.offset - 1, page_started, 1319 cow_start, found_key.offset - 1,
1286 nr_written, 1); 1320 page_started, nr_written, 1);
1287 if (ret) { 1321 if (ret) {
1288 btrfs_abort_transaction(trans, root, ret); 1322 btrfs_abort_transaction(trans, root, ret);
1289 goto error; 1323 goto error;
@@ -1298,16 +1332,21 @@ out_check:
1298 em = alloc_extent_map(); 1332 em = alloc_extent_map();
1299 BUG_ON(!em); /* -ENOMEM */ 1333 BUG_ON(!em); /* -ENOMEM */
1300 em->start = cur_offset; 1334 em->start = cur_offset;
1301 em->orig_start = em->start; 1335 em->orig_start = found_key.offset - extent_offset;
1302 em->len = num_bytes; 1336 em->len = num_bytes;
1303 em->block_len = num_bytes; 1337 em->block_len = num_bytes;
1304 em->block_start = disk_bytenr; 1338 em->block_start = disk_bytenr;
1339 em->orig_block_len = disk_num_bytes;
1305 em->bdev = root->fs_info->fs_devices->latest_bdev; 1340 em->bdev = root->fs_info->fs_devices->latest_bdev;
1306 set_bit(EXTENT_FLAG_PINNED, &em->flags); 1341 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1307 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 1342 set_bit(EXTENT_FLAG_FILLING, &em->flags);
1343 em->generation = -1;
1308 while (1) { 1344 while (1) {
1309 write_lock(&em_tree->lock); 1345 write_lock(&em_tree->lock);
1310 ret = add_extent_mapping(em_tree, em); 1346 ret = add_extent_mapping(em_tree, em);
1347 if (!ret)
1348 list_move(&em->list,
1349 &em_tree->modified_extents);
1311 write_unlock(&em_tree->lock); 1350 write_unlock(&em_tree->lock);
1312 if (ret != -EEXIST) { 1351 if (ret != -EEXIST) {
1313 free_extent_map(em); 1352 free_extent_map(em);
@@ -1352,8 +1391,9 @@ out_check:
1352 } 1391 }
1353 1392
1354 if (cow_start != (u64)-1) { 1393 if (cow_start != (u64)-1) {
1355 ret = cow_file_range(inode, locked_page, cow_start, end, 1394 ret = __cow_file_range(trans, inode, root, locked_page,
1356 page_started, nr_written, 1); 1395 cow_start, end,
1396 page_started, nr_written, 1);
1357 if (ret) { 1397 if (ret) {
1358 btrfs_abort_transaction(trans, root, ret); 1398 btrfs_abort_transaction(trans, root, ret);
1359 goto error; 1399 goto error;
@@ -1531,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1531 unsigned long bio_flags) 1571 unsigned long bio_flags)
1532{ 1572{
1533 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 1573 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1534 struct btrfs_mapping_tree *map_tree;
1535 u64 logical = (u64)bio->bi_sector << 9; 1574 u64 logical = (u64)bio->bi_sector << 9;
1536 u64 length = 0; 1575 u64 length = 0;
1537 u64 map_length; 1576 u64 map_length;
@@ -1541,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1541 return 0; 1580 return 0;
1542 1581
1543 length = bio->bi_size; 1582 length = bio->bi_size;
1544 map_tree = &root->fs_info->mapping_tree;
1545 map_length = length; 1583 map_length = length;
1546 ret = btrfs_map_block(map_tree, READ, logical, 1584 ret = btrfs_map_block(root->fs_info, READ, logical,
1547 &map_length, NULL, 0); 1585 &map_length, NULL, 0);
1548 /* Will always return 0 or 1 with map_multi == NULL */ 1586 /* Will always return 0 with map_multi == NULL */
1549 BUG_ON(ret < 0); 1587 BUG_ON(ret < 0);
1550 if (map_length < length + size) 1588 if (map_length < length + size)
1551 return 1; 1589 return 1;
@@ -1586,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1586 u64 bio_offset) 1624 u64 bio_offset)
1587{ 1625{
1588 struct btrfs_root *root = BTRFS_I(inode)->root; 1626 struct btrfs_root *root = BTRFS_I(inode)->root;
1589 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1627 int ret;
1628
1629 ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1630 if (ret)
1631 bio_endio(bio, ret);
1632 return ret;
1590} 1633}
1591 1634
1592/* 1635/*
@@ -1601,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1601 int ret = 0; 1644 int ret = 0;
1602 int skip_sum; 1645 int skip_sum;
1603 int metadata = 0; 1646 int metadata = 0;
1647 int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1604 1648
1605 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1649 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1606 1650
@@ -1610,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1610 if (!(rw & REQ_WRITE)) { 1654 if (!(rw & REQ_WRITE)) {
1611 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1655 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1612 if (ret) 1656 if (ret)
1613 return ret; 1657 goto out;
1614 1658
1615 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1659 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1616 return btrfs_submit_compressed_read(inode, bio, 1660 ret = btrfs_submit_compressed_read(inode, bio,
1617 mirror_num, bio_flags); 1661 mirror_num,
1662 bio_flags);
1663 goto out;
1618 } else if (!skip_sum) { 1664 } else if (!skip_sum) {
1619 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); 1665 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1620 if (ret) 1666 if (ret)
1621 return ret; 1667 goto out;
1622 } 1668 }
1623 goto mapit; 1669 goto mapit;
1624 } else if (!skip_sum) { 1670 } else if (async && !skip_sum) {
1625 /* csum items have already been cloned */ 1671 /* csum items have already been cloned */
1626 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) 1672 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1627 goto mapit; 1673 goto mapit;
1628 /* we're doing a write, do the async checksumming */ 1674 /* we're doing a write, do the async checksumming */
1629 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1675 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1630 inode, rw, bio, mirror_num, 1676 inode, rw, bio, mirror_num,
1631 bio_flags, bio_offset, 1677 bio_flags, bio_offset,
1632 __btrfs_submit_bio_start, 1678 __btrfs_submit_bio_start,
1633 __btrfs_submit_bio_done); 1679 __btrfs_submit_bio_done);
1680 goto out;
1681 } else if (!skip_sum) {
1682 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1683 if (ret)
1684 goto out;
1634 } 1685 }
1635 1686
1636mapit: 1687mapit:
1637 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 1688 ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1689
1690out:
1691 if (ret < 0)
1692 bio_endio(bio, ret);
1693 return ret;
1638} 1694}
1639 1695
1640/* 1696/*
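
Note: the new 'async' flag routes writes through the checksum worker threads only while no synchronous writer is active on the inode; otherwise the csum is computed inline via btrfs_csum_one_bio() to cut fsync latency. The counter is expected to be bumped around synchronous write-out, roughly as below (the exact call site is an assumption from this series, not quoted here):

    atomic_inc(&BTRFS_I(inode)->sync_writers);
    ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
    atomic_dec(&BTRFS_I(inode)->sync_writers);
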
@@ -1657,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1657int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 1713int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1658 struct extent_state **cached_state) 1714 struct extent_state **cached_state)
1659{ 1715{
1660 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1716 WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1661 WARN_ON(1);
1662 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1717 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1663 cached_state, GFP_NOFS); 1718 cached_state, GFP_NOFS);
1664} 1719}
@@ -1867,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1867 1922
1868 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1923 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1869 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 1924 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
1870 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1925 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1871 if (!ret) { 1926 if (nolock)
1872 if (nolock) 1927 trans = btrfs_join_transaction_nolock(root);
1873 trans = btrfs_join_transaction_nolock(root); 1928 else
1874 else 1929 trans = btrfs_join_transaction(root);
1875 trans = btrfs_join_transaction(root); 1930 if (IS_ERR(trans)) {
1876 if (IS_ERR(trans)) { 1931 ret = PTR_ERR(trans);
1877 ret = PTR_ERR(trans); 1932 trans = NULL;
1878 trans = NULL; 1933 goto out;
1879 goto out;
1880 }
1881 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1882 ret = btrfs_update_inode_fallback(trans, root, inode);
1883 if (ret) /* -ENOMEM or corruption */
1884 btrfs_abort_transaction(trans, root, ret);
1885 } 1934 }
1935 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1936 ret = btrfs_update_inode_fallback(trans, root, inode);
1937 if (ret) /* -ENOMEM or corruption */
1938 btrfs_abort_transaction(trans, root, ret);
1886 goto out; 1939 goto out;
1887 } 1940 }
1888 1941
@@ -1931,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
1931 add_pending_csums(trans, inode, ordered_extent->file_offset, 1984 add_pending_csums(trans, inode, ordered_extent->file_offset,
1932 &ordered_extent->list); 1985 &ordered_extent->list);
1933 1986
1934 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1987 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1935 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1988 ret = btrfs_update_inode_fallback(trans, root, inode);
1936 ret = btrfs_update_inode_fallback(trans, root, inode); 1989 if (ret) { /* -ENOMEM or corruption */
1937 if (ret) { /* -ENOMEM or corruption */ 1990 btrfs_abort_transaction(trans, root, ret);
1938 btrfs_abort_transaction(trans, root, ret); 1991 goto out_unlock;
1939 goto out_unlock;
1940 }
1941 } else {
1942 btrfs_set_inode_last_trans(trans, inode);
1943 } 1992 }
1944 ret = 0; 1993 ret = 0;
1945out_unlock: 1994out_unlock:
@@ -2429,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2429 continue; 2478 continue;
2430 } 2479 }
2431 nr_truncate++; 2480 nr_truncate++;
2481
2482 /* 1 for the orphan item deletion. */
2483 trans = btrfs_start_transaction(root, 1);
2484 if (IS_ERR(trans)) {
2485 ret = PTR_ERR(trans);
2486 goto out;
2487 }
2488 ret = btrfs_orphan_add(trans, inode);
2489 btrfs_end_transaction(trans, root);
2490 if (ret)
2491 goto out;
2492
2432 ret = btrfs_truncate(inode); 2493 ret = btrfs_truncate(inode);
2433 } else { 2494 } else {
2434 nr_unlink++; 2495 nr_unlink++;
@@ -3074,7 +3135,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3074 struct btrfs_trans_handle *trans; 3135 struct btrfs_trans_handle *trans;
3075 struct inode *inode = dentry->d_inode; 3136 struct inode *inode = dentry->d_inode;
3076 int ret; 3137 int ret;
3077 unsigned long nr = 0;
3078 3138
3079 trans = __unlink_start_trans(dir, dentry); 3139 trans = __unlink_start_trans(dir, dentry);
3080 if (IS_ERR(trans)) 3140 if (IS_ERR(trans))
@@ -3094,9 +3154,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3094 } 3154 }
3095 3155
3096out: 3156out:
3097 nr = trans->blocks_used;
3098 __unlink_end_trans(trans, root); 3157 __unlink_end_trans(trans, root);
3099 btrfs_btree_balance_dirty(root, nr); 3158 btrfs_btree_balance_dirty(root);
3100 return ret; 3159 return ret;
3101} 3160}
3102 3161
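
Note: as in every other call site in this file, the blocks_used bookkeeping disappears because btrfs_btree_balance_dirty() now decides for itself how much dirty metadata is outstanding. The prototype change implied by these hunks (sketch):

    /* before */
    void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);

    /* after */
    void btrfs_btree_balance_dirty(struct btrfs_root *root);
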
@@ -3186,7 +3245,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3186 int err = 0; 3245 int err = 0;
3187 struct btrfs_root *root = BTRFS_I(dir)->root; 3246 struct btrfs_root *root = BTRFS_I(dir)->root;
3188 struct btrfs_trans_handle *trans; 3247 struct btrfs_trans_handle *trans;
3189 unsigned long nr = 0;
3190 3248
3191 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 3249 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3192 return -ENOTEMPTY; 3250 return -ENOTEMPTY;
@@ -3215,9 +3273,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3215 if (!err) 3273 if (!err)
3216 btrfs_i_size_write(inode, 0); 3274 btrfs_i_size_write(inode, 0);
3217out: 3275out:
3218 nr = trans->blocks_used;
3219 __unlink_end_trans(trans, root); 3276 __unlink_end_trans(trans, root);
3220 btrfs_btree_balance_dirty(root, nr); 3277 btrfs_btree_balance_dirty(root);
3221 3278
3222 return err; 3279 return err;
3223} 3280}
@@ -3497,11 +3554,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
3497 if (ret) 3554 if (ret)
3498 goto out; 3555 goto out;
3499 3556
3500 ret = -ENOMEM;
3501again: 3557again:
3502 page = find_or_create_page(mapping, index, mask); 3558 page = find_or_create_page(mapping, index, mask);
3503 if (!page) { 3559 if (!page) {
3504 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3560 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3561 ret = -ENOMEM;
3505 goto out; 3562 goto out;
3506 } 3563 }
3507 3564
@@ -3550,7 +3607,6 @@ again:
3550 goto out_unlock; 3607 goto out_unlock;
3551 } 3608 }
3552 3609
3553 ret = 0;
3554 if (offset != PAGE_CACHE_SIZE) { 3610 if (offset != PAGE_CACHE_SIZE) {
3555 if (!len) 3611 if (!len)
3556 len = PAGE_CACHE_SIZE - offset; 3612 len = PAGE_CACHE_SIZE - offset;
@@ -3621,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3621 block_end - cur_offset, 0); 3677 block_end - cur_offset, 0);
3622 if (IS_ERR(em)) { 3678 if (IS_ERR(em)) {
3623 err = PTR_ERR(em); 3679 err = PTR_ERR(em);
3680 em = NULL;
3624 break; 3681 break;
3625 } 3682 }
3626 last_byte = min(extent_map_end(em), block_end); 3683 last_byte = min(extent_map_end(em), block_end);
@@ -3668,6 +3725,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3668 3725
3669 hole_em->block_start = EXTENT_MAP_HOLE; 3726 hole_em->block_start = EXTENT_MAP_HOLE;
3670 hole_em->block_len = 0; 3727 hole_em->block_len = 0;
3728 hole_em->orig_block_len = 0;
3671 hole_em->bdev = root->fs_info->fs_devices->latest_bdev; 3729 hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
3672 hole_em->compress_type = BTRFS_COMPRESS_NONE; 3730 hole_em->compress_type = BTRFS_COMPRESS_NONE;
3673 hole_em->generation = trans->transid; 3731 hole_em->generation = trans->transid;
@@ -3703,16 +3761,27 @@ next:
3703 return err; 3761 return err;
3704} 3762}
3705 3763
3706static int btrfs_setsize(struct inode *inode, loff_t newsize) 3764static int btrfs_setsize(struct inode *inode, struct iattr *attr)
3707{ 3765{
3708 struct btrfs_root *root = BTRFS_I(inode)->root; 3766 struct btrfs_root *root = BTRFS_I(inode)->root;
3709 struct btrfs_trans_handle *trans; 3767 struct btrfs_trans_handle *trans;
3710 loff_t oldsize = i_size_read(inode); 3768 loff_t oldsize = i_size_read(inode);
3769 loff_t newsize = attr->ia_size;
3770 int mask = attr->ia_valid;
3711 int ret; 3771 int ret;
3712 3772
3713 if (newsize == oldsize) 3773 if (newsize == oldsize)
3714 return 0; 3774 return 0;
3715 3775
3776 /*
3777 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
3778 * special case where we need to update the times despite not having
3780 * these flags set. For all other operations the VFS sets these flags
3780 * explicitly if it wants a timestamp update.
3781 */
3782 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
3783 inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
3784
3716 if (newsize > oldsize) { 3785 if (newsize > oldsize) {
3717 truncate_pagecache(inode, oldsize, newsize); 3786 truncate_pagecache(inode, oldsize, newsize);
3718 ret = btrfs_cont_expand(inode, oldsize, newsize); 3787 ret = btrfs_cont_expand(inode, oldsize, newsize);
@@ -3738,9 +3807,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3738 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 3807 set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
3739 &BTRFS_I(inode)->runtime_flags); 3808 &BTRFS_I(inode)->runtime_flags);
3740 3809
3810 /*
3811 * 1 for the orphan item we're going to add
3812 * 1 for the orphan item deletion.
3813 */
3814 trans = btrfs_start_transaction(root, 2);
3815 if (IS_ERR(trans))
3816 return PTR_ERR(trans);
3817
3818 /*
3819 * We need to do this in case we fail at _any_ point during the
3820 * actual truncate. Once we do the truncate_setsize we could
3821 * invalidate pages which forces any outstanding ordered io to
3822 * be instantly completed which will give us extents that need
3823 * to be truncated. If we fail to get an orphan inode down we
3824 * could have left over extents that were never meant to live,
3825 * so we need to guarantee from this point on that everything
3826 * will be consistent.
3827 */
3828 ret = btrfs_orphan_add(trans, inode);
3829 btrfs_end_transaction(trans, root);
3830 if (ret)
3831 return ret;
3832
3741 /* we don't support swapfiles, so vmtruncate shouldn't fail */ 3833 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3742 truncate_setsize(inode, newsize); 3834 truncate_setsize(inode, newsize);
3743 ret = btrfs_truncate(inode); 3835 ret = btrfs_truncate(inode);
3836 if (ret && inode->i_nlink)
3837 btrfs_orphan_del(NULL, inode);
3744 } 3838 }
3745 3839
3746 return ret; 3840 return ret;
@@ -3760,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3760 return err; 3854 return err;
3761 3855
3762 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3856 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3763 err = btrfs_setsize(inode, attr->ia_size); 3857 err = btrfs_setsize(inode, attr);
3764 if (err) 3858 if (err)
3765 return err; 3859 return err;
3766 } 3860 }
@@ -3783,7 +3877,6 @@ void btrfs_evict_inode(struct inode *inode)
3783 struct btrfs_root *root = BTRFS_I(inode)->root; 3877 struct btrfs_root *root = BTRFS_I(inode)->root;
3784 struct btrfs_block_rsv *rsv, *global_rsv; 3878 struct btrfs_block_rsv *rsv, *global_rsv;
3785 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 3879 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3786 unsigned long nr;
3787 int ret; 3880 int ret;
3788 3881
3789 trace_btrfs_inode_evict(inode); 3882 trace_btrfs_inode_evict(inode);
@@ -3829,7 +3922,8 @@ void btrfs_evict_inode(struct inode *inode)
3829 * inode item when doing the truncate. 3922 * inode item when doing the truncate.
3830 */ 3923 */
3831 while (1) { 3924 while (1) {
3832 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); 3925 ret = btrfs_block_rsv_refill(root, rsv, min_size,
3926 BTRFS_RESERVE_FLUSH_LIMIT);
3833 3927
3834 /* 3928 /*
3835 * Try and steal from the global reserve since we will 3929 * Try and steal from the global reserve since we will
@@ -3847,7 +3941,7 @@ void btrfs_evict_inode(struct inode *inode)
3847 goto no_delete; 3941 goto no_delete;
3848 } 3942 }
3849 3943
3850 trans = btrfs_start_transaction_noflush(root, 1); 3944 trans = btrfs_start_transaction_lflush(root, 1);
3851 if (IS_ERR(trans)) { 3945 if (IS_ERR(trans)) {
3852 btrfs_orphan_del(NULL, inode); 3946 btrfs_orphan_del(NULL, inode);
3853 btrfs_free_block_rsv(root, rsv); 3947 btrfs_free_block_rsv(root, rsv);
@@ -3864,10 +3958,9 @@ void btrfs_evict_inode(struct inode *inode)
3864 ret = btrfs_update_inode(trans, root, inode); 3958 ret = btrfs_update_inode(trans, root, inode);
3865 BUG_ON(ret); 3959 BUG_ON(ret);
3866 3960
3867 nr = trans->blocks_used;
3868 btrfs_end_transaction(trans, root); 3961 btrfs_end_transaction(trans, root);
3869 trans = NULL; 3962 trans = NULL;
3870 btrfs_btree_balance_dirty(root, nr); 3963 btrfs_btree_balance_dirty(root);
3871 } 3964 }
3872 3965
3873 btrfs_free_block_rsv(root, rsv); 3966 btrfs_free_block_rsv(root, rsv);
@@ -3883,9 +3976,8 @@ void btrfs_evict_inode(struct inode *inode)
3883 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3976 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3884 btrfs_return_ino(root, btrfs_ino(inode)); 3977 btrfs_return_ino(root, btrfs_ino(inode));
3885 3978
3886 nr = trans->blocks_used;
3887 btrfs_end_transaction(trans, root); 3979 btrfs_end_transaction(trans, root);
3888 btrfs_btree_balance_dirty(root, nr); 3980 btrfs_btree_balance_dirty(root);
3889no_delete: 3981no_delete:
3890 clear_inode(inode); 3982 clear_inode(inode);
3891 return; 3983 return;
@@ -4219,16 +4311,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4219 if (dentry->d_name.len > BTRFS_NAME_LEN) 4311 if (dentry->d_name.len > BTRFS_NAME_LEN)
4220 return ERR_PTR(-ENAMETOOLONG); 4312 return ERR_PTR(-ENAMETOOLONG);
4221 4313
4222 if (unlikely(d_need_lookup(dentry))) { 4314 ret = btrfs_inode_by_name(dir, dentry, &location);
4223 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
4224 kfree(dentry->d_fsdata);
4225 dentry->d_fsdata = NULL;
4226 /* This thing is hashed, drop it for now */
4227 d_drop(dentry);
4228 } else {
4229 ret = btrfs_inode_by_name(dir, dentry, &location);
4230 }
4231
4232 if (ret < 0) 4315 if (ret < 0)
4233 return ERR_PTR(ret); 4316 return ERR_PTR(ret);
4234 4317
@@ -4298,11 +4381,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4298 struct dentry *ret; 4381 struct dentry *ret;
4299 4382
4300 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 4383 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
4301 if (unlikely(d_need_lookup(dentry))) {
4302 spin_lock(&dentry->d_lock);
4303 dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
4304 spin_unlock(&dentry->d_lock);
4305 }
4306 return ret; 4384 return ret;
4307} 4385}
4308 4386
@@ -4775,8 +4853,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4775 if (S_ISREG(mode)) { 4853 if (S_ISREG(mode)) {
4776 if (btrfs_test_opt(root, NODATASUM)) 4854 if (btrfs_test_opt(root, NODATASUM))
4777 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4855 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4778 if (btrfs_test_opt(root, NODATACOW) || 4856 if (btrfs_test_opt(root, NODATACOW))
4779 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4780 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4857 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4781 } 4858 }
4782 4859
@@ -4842,7 +4919,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4842 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4919 ret = btrfs_insert_dir_item(trans, root, name, name_len,
4843 parent_inode, &key, 4920 parent_inode, &key,
4844 btrfs_inode_type(inode), index); 4921 btrfs_inode_type(inode), index);
4845 if (ret == -EEXIST) 4922 if (ret == -EEXIST || ret == -EOVERFLOW)
4846 goto fail_dir_item; 4923 goto fail_dir_item;
4847 else if (ret) { 4924 else if (ret) {
4848 btrfs_abort_transaction(trans, root, ret); 4925 btrfs_abort_transaction(trans, root, ret);
@@ -4897,7 +4974,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4897 int err; 4974 int err;
4898 int drop_inode = 0; 4975 int drop_inode = 0;
4899 u64 objectid; 4976 u64 objectid;
4900 unsigned long nr = 0;
4901 u64 index = 0; 4977 u64 index = 0;
4902 4978
4903 if (!new_valid_dev(rdev)) 4979 if (!new_valid_dev(rdev))
@@ -4930,6 +5006,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4930 goto out_unlock; 5006 goto out_unlock;
4931 } 5007 }
4932 5008
5009 err = btrfs_update_inode(trans, root, inode);
5010 if (err) {
5011 drop_inode = 1;
5012 goto out_unlock;
5013 }
5014
4933 /* 5015 /*
4934 * If the active LSM wants to access the inode during 5016 * If the active LSM wants to access the inode during
4935 * d_instantiate it needs these. Smack checks to see 5017 * d_instantiate it needs these. Smack checks to see
@@ -4947,9 +5029,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4947 d_instantiate(dentry, inode); 5029 d_instantiate(dentry, inode);
4948 } 5030 }
4949out_unlock: 5031out_unlock:
4950 nr = trans->blocks_used;
4951 btrfs_end_transaction(trans, root); 5032 btrfs_end_transaction(trans, root);
4952 btrfs_btree_balance_dirty(root, nr); 5033 btrfs_btree_balance_dirty(root);
4953 if (drop_inode) { 5034 if (drop_inode) {
4954 inode_dec_link_count(inode); 5035 inode_dec_link_count(inode);
4955 iput(inode); 5036 iput(inode);
@@ -4963,9 +5044,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4963 struct btrfs_trans_handle *trans; 5044 struct btrfs_trans_handle *trans;
4964 struct btrfs_root *root = BTRFS_I(dir)->root; 5045 struct btrfs_root *root = BTRFS_I(dir)->root;
4965 struct inode *inode = NULL; 5046 struct inode *inode = NULL;
4966 int drop_inode = 0; 5047 int drop_inode_on_err = 0;
4967 int err; 5048 int err;
4968 unsigned long nr = 0;
4969 u64 objectid; 5049 u64 objectid;
4970 u64 index = 0; 5050 u64 index = 0;
4971 5051
@@ -4989,12 +5069,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4989 err = PTR_ERR(inode); 5069 err = PTR_ERR(inode);
4990 goto out_unlock; 5070 goto out_unlock;
4991 } 5071 }
5072 drop_inode_on_err = 1;
4992 5073
4993 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 5074 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4994 if (err) { 5075 if (err)
4995 drop_inode = 1; 5076 goto out_unlock;
5077
5078 err = btrfs_update_inode(trans, root, inode);
5079 if (err)
4996 goto out_unlock; 5080 goto out_unlock;
4997 }
4998 5081
4999 /* 5082 /*
5000 * If the active LSM wants to access the inode during 5083 * If the active LSM wants to access the inode during
@@ -5007,21 +5090,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
5007 5090
5008 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 5091 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5009 if (err) 5092 if (err)
5010 drop_inode = 1; 5093 goto out_unlock;
5011 else { 5094
5012 inode->i_mapping->a_ops = &btrfs_aops; 5095 inode->i_mapping->a_ops = &btrfs_aops;
5013 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 5096 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5014 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 5097 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
5015 d_instantiate(dentry, inode); 5098 d_instantiate(dentry, inode);
5016 } 5099
5017out_unlock: 5100out_unlock:
5018 nr = trans->blocks_used;
5019 btrfs_end_transaction(trans, root); 5101 btrfs_end_transaction(trans, root);
5020 if (drop_inode) { 5102 if (err && drop_inode_on_err) {
5021 inode_dec_link_count(inode); 5103 inode_dec_link_count(inode);
5022 iput(inode); 5104 iput(inode);
5023 } 5105 }
5024 btrfs_btree_balance_dirty(root, nr); 5106 btrfs_btree_balance_dirty(root);
5025 return err; 5107 return err;
5026} 5108}
5027 5109
@@ -5032,7 +5114,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5032 struct btrfs_root *root = BTRFS_I(dir)->root; 5114 struct btrfs_root *root = BTRFS_I(dir)->root;
5033 struct inode *inode = old_dentry->d_inode; 5115 struct inode *inode = old_dentry->d_inode;
5034 u64 index; 5116 u64 index;
5035 unsigned long nr = 0;
5036 int err; 5117 int err;
5037 int drop_inode = 0; 5118 int drop_inode = 0;
5038 5119
@@ -5062,6 +5143,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5062 inode_inc_iversion(inode); 5143 inode_inc_iversion(inode);
5063 inode->i_ctime = CURRENT_TIME; 5144 inode->i_ctime = CURRENT_TIME;
5064 ihold(inode); 5145 ihold(inode);
5146 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
5065 5147
5066 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 5148 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
5067 5149
@@ -5076,14 +5158,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5076 btrfs_log_new_name(trans, inode, NULL, parent); 5158 btrfs_log_new_name(trans, inode, NULL, parent);
5077 } 5159 }
5078 5160
5079 nr = trans->blocks_used;
5080 btrfs_end_transaction(trans, root); 5161 btrfs_end_transaction(trans, root);
5081fail: 5162fail:
5082 if (drop_inode) { 5163 if (drop_inode) {
5083 inode_dec_link_count(inode); 5164 inode_dec_link_count(inode);
5084 iput(inode); 5165 iput(inode);
5085 } 5166 }
5086 btrfs_btree_balance_dirty(root, nr); 5167 btrfs_btree_balance_dirty(root);
5087 return err; 5168 return err;
5088} 5169}
5089 5170
@@ -5096,7 +5177,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5096 int drop_on_err = 0; 5177 int drop_on_err = 0;
5097 u64 objectid = 0; 5178 u64 objectid = 0;
5098 u64 index = 0; 5179 u64 index = 0;
5099 unsigned long nr = 1;
5100 5180
5101 /* 5181 /*
5102 * 2 items for inode and ref 5182 * 2 items for inode and ref
@@ -5142,11 +5222,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5142 drop_on_err = 0; 5222 drop_on_err = 0;
5143 5223
5144out_fail: 5224out_fail:
5145 nr = trans->blocks_used;
5146 btrfs_end_transaction(trans, root); 5225 btrfs_end_transaction(trans, root);
5147 if (drop_on_err) 5226 if (drop_on_err)
5148 iput(inode); 5227 iput(inode);
5149 btrfs_btree_balance_dirty(root, nr); 5228 btrfs_btree_balance_dirty(root);
5150 return err; 5229 return err;
5151} 5230}
5152 5231
@@ -5340,6 +5419,7 @@ again:
5340 if (start + len <= found_key.offset) 5419 if (start + len <= found_key.offset)
5341 goto not_found; 5420 goto not_found;
5342 em->start = start; 5421 em->start = start;
5422 em->orig_start = start;
5343 em->len = found_key.offset - start; 5423 em->len = found_key.offset - start;
5344 goto not_found_em; 5424 goto not_found_em;
5345 } 5425 }
@@ -5350,6 +5430,8 @@ again:
5350 em->len = extent_end - extent_start; 5430 em->len = extent_end - extent_start;
5351 em->orig_start = extent_start - 5431 em->orig_start = extent_start -
5352 btrfs_file_extent_offset(leaf, item); 5432 btrfs_file_extent_offset(leaf, item);
5433 em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
5434 item);
5353 bytenr = btrfs_file_extent_disk_bytenr(leaf, item); 5435 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
5354 if (bytenr == 0) { 5436 if (bytenr == 0) {
5355 em->block_start = EXTENT_MAP_HOLE; 5437 em->block_start = EXTENT_MAP_HOLE;
@@ -5359,8 +5441,7 @@ again:
5359 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5441 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5360 em->compress_type = compress_type; 5442 em->compress_type = compress_type;
5361 em->block_start = bytenr; 5443 em->block_start = bytenr;
5362 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5444 em->block_len = em->orig_block_len;
5363 item);
5364 } else { 5445 } else {
5365 bytenr += btrfs_file_extent_offset(leaf, item); 5446 bytenr += btrfs_file_extent_offset(leaf, item);
5366 em->block_start = bytenr; 5447 em->block_start = bytenr;
@@ -5390,7 +5471,8 @@ again:
5390 em->start = extent_start + extent_offset; 5471 em->start = extent_start + extent_offset;
5391 em->len = (copy_size + root->sectorsize - 1) & 5472 em->len = (copy_size + root->sectorsize - 1) &
5392 ~((u64)root->sectorsize - 1); 5473 ~((u64)root->sectorsize - 1);
5393 em->orig_start = EXTENT_MAP_INLINE; 5474 em->orig_block_len = em->len;
5475 em->orig_start = em->start;
5394 if (compress_type) { 5476 if (compress_type) {
5395 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5477 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5396 em->compress_type = compress_type; 5478 em->compress_type = compress_type;
@@ -5439,11 +5521,11 @@ again:
5439 extent_map_end(em) - 1, NULL, GFP_NOFS); 5521 extent_map_end(em) - 1, NULL, GFP_NOFS);
5440 goto insert; 5522 goto insert;
5441 } else { 5523 } else {
5442 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5524 WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
5443 WARN_ON(1);
5444 } 5525 }
5445not_found: 5526not_found:
5446 em->start = start; 5527 em->start = start;
5528 em->orig_start = start;
5447 em->len = len; 5529 em->len = len;
5448not_found_em: 5530not_found_em:
5449 em->block_start = EXTENT_MAP_HOLE; 5531 em->block_start = EXTENT_MAP_HOLE;
@@ -5539,10 +5621,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
5539 return em; 5621 return em;
5540 if (em) { 5622 if (em) {
5541 /* 5623 /*
5542 * if our em maps to a hole, there might 5624 * if our em maps to
5543 * actually be delalloc bytes behind it 5625 * - a hole or
5626 * - a pre-alloc extent,
5627 * there might actually be delalloc bytes behind it.
5544 */ 5628 */
5545 if (em->block_start != EXTENT_MAP_HOLE) 5629 if (em->block_start != EXTENT_MAP_HOLE &&
5630 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5546 return em; 5631 return em;
5547 else 5632 else
5548 hole_em = em; 5633 hole_em = em;
@@ -5624,6 +5709,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
5624 */ 5709 */
5625 em->block_start = hole_em->block_start; 5710 em->block_start = hole_em->block_start;
5626 em->block_len = hole_len; 5711 em->block_len = hole_len;
5712 if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
5713 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
5627 } else { 5714 } else {
5628 em->start = range_start; 5715 em->start = range_start;
5629 em->len = found; 5716 em->len = found;
@@ -5645,38 +5732,19 @@ out:
5645} 5732}
5646 5733
5647static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5734static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5648 struct extent_map *em,
5649 u64 start, u64 len) 5735 u64 start, u64 len)
5650{ 5736{
5651 struct btrfs_root *root = BTRFS_I(inode)->root; 5737 struct btrfs_root *root = BTRFS_I(inode)->root;
5652 struct btrfs_trans_handle *trans; 5738 struct btrfs_trans_handle *trans;
5653 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5739 struct extent_map *em;
5654 struct btrfs_key ins; 5740 struct btrfs_key ins;
5655 u64 alloc_hint; 5741 u64 alloc_hint;
5656 int ret; 5742 int ret;
5657 bool insert = false;
5658
5659 /*
5660 * Ok if the extent map we looked up is a hole and is for the exact
5661 * range we want, there is no reason to allocate a new one, however if
5662 * it is not right then we need to free this one and drop the cache for
5663 * our range.
5664 */
5665 if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5666 em->len != len) {
5667 free_extent_map(em);
5668 em = NULL;
5669 insert = true;
5670 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5671 }
5672 5743
5673 trans = btrfs_join_transaction(root); 5744 trans = btrfs_join_transaction(root);
5674 if (IS_ERR(trans)) 5745 if (IS_ERR(trans))
5675 return ERR_CAST(trans); 5746 return ERR_CAST(trans);
5676 5747
5677 if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
5678 btrfs_add_inode_defrag(trans, inode);
5679
5680 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5748 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5681 5749
5682 alloc_hint = get_extent_allocation_hint(inode, start, len); 5750 alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5687,37 +5755,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5687 goto out; 5755 goto out;
5688 } 5756 }
5689 5757
5690 if (!em) { 5758 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
5691 em = alloc_extent_map(); 5759 ins.offset, ins.offset, 0);
5692 if (!em) { 5760 if (IS_ERR(em))
5693 em = ERR_PTR(-ENOMEM); 5761 goto out;
5694 goto out;
5695 }
5696 }
5697
5698 em->start = start;
5699 em->orig_start = em->start;
5700 em->len = ins.offset;
5701
5702 em->block_start = ins.objectid;
5703 em->block_len = ins.offset;
5704 em->bdev = root->fs_info->fs_devices->latest_bdev;
5705
5706 /*
5707 * We need to do this because if we're using the original em we searched
5708 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5709 */
5710 em->flags = 0;
5711 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5712
5713 while (insert) {
5714 write_lock(&em_tree->lock);
5715 ret = add_extent_mapping(em_tree, em);
5716 write_unlock(&em_tree->lock);
5717 if (ret != -EEXIST)
5718 break;
5719 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5720 }
5721 5762
5722 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, 5763 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5723 ins.offset, ins.offset, 0); 5764 ins.offset, ins.offset, 0);
@@ -5894,7 +5935,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
5894static struct extent_map *create_pinned_em(struct inode *inode, u64 start, 5935static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5895 u64 len, u64 orig_start, 5936 u64 len, u64 orig_start,
5896 u64 block_start, u64 block_len, 5937 u64 block_start, u64 block_len,
5897 int type) 5938 u64 orig_block_len, int type)
5898{ 5939{
5899 struct extent_map_tree *em_tree; 5940 struct extent_map_tree *em_tree;
5900 struct extent_map *em; 5941 struct extent_map *em;
@@ -5912,15 +5953,20 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
5912 em->block_len = block_len; 5953 em->block_len = block_len;
5913 em->block_start = block_start; 5954 em->block_start = block_start;
5914 em->bdev = root->fs_info->fs_devices->latest_bdev; 5955 em->bdev = root->fs_info->fs_devices->latest_bdev;
5956 em->orig_block_len = orig_block_len;
5957 em->generation = -1;
5915 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5958 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5916 if (type == BTRFS_ORDERED_PREALLOC) 5959 if (type == BTRFS_ORDERED_PREALLOC)
5917 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 5960 set_bit(EXTENT_FLAG_FILLING, &em->flags);
5918 5961
5919 do { 5962 do {
5920 btrfs_drop_extent_cache(inode, em->start, 5963 btrfs_drop_extent_cache(inode, em->start,
5921 em->start + em->len - 1, 0); 5964 em->start + em->len - 1, 0);
5922 write_lock(&em_tree->lock); 5965 write_lock(&em_tree->lock);
5923 ret = add_extent_mapping(em_tree, em); 5966 ret = add_extent_mapping(em_tree, em);
5967 if (!ret)
5968 list_move(&em->list,
5969 &em_tree->modified_extents);
5924 write_unlock(&em_tree->lock); 5970 write_unlock(&em_tree->lock);
5925 } while (ret == -EEXIST); 5971 } while (ret == -EEXIST);
5926 5972
@@ -6047,13 +6093,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6047 goto must_cow; 6093 goto must_cow;
6048 6094
6049 if (can_nocow_odirect(trans, inode, start, len) == 1) { 6095 if (can_nocow_odirect(trans, inode, start, len) == 1) {
6050 u64 orig_start = em->start; 6096 u64 orig_start = em->orig_start;
6097 u64 orig_block_len = em->orig_block_len;
6051 6098
6052 if (type == BTRFS_ORDERED_PREALLOC) { 6099 if (type == BTRFS_ORDERED_PREALLOC) {
6053 free_extent_map(em); 6100 free_extent_map(em);
6054 em = create_pinned_em(inode, start, len, 6101 em = create_pinned_em(inode, start, len,
6055 orig_start, 6102 orig_start,
6056 block_start, len, type); 6103 block_start, len,
6104 orig_block_len, type);
6057 if (IS_ERR(em)) { 6105 if (IS_ERR(em)) {
6058 btrfs_end_transaction(trans, root); 6106 btrfs_end_transaction(trans, root);
6059 goto unlock_err; 6107 goto unlock_err;
@@ -6077,7 +6125,8 @@ must_cow:
6077 * it above 6125 * it above
6078 */ 6126 */
6079 len = bh_result->b_size; 6127 len = bh_result->b_size;
6080 em = btrfs_new_extent_direct(inode, em, start, len); 6128 free_extent_map(em);
6129 em = btrfs_new_extent_direct(inode, start, len);
6081 if (IS_ERR(em)) { 6130 if (IS_ERR(em)) {
6082 ret = PTR_ERR(em); 6131 ret = PTR_ERR(em);
6083 goto unlock_err; 6132 goto unlock_err;
@@ -6318,6 +6367,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6318 struct btrfs_root *root = BTRFS_I(inode)->root; 6367 struct btrfs_root *root = BTRFS_I(inode)->root;
6319 int ret; 6368 int ret;
6320 6369
6370 if (async_submit)
6371 async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
6372
6321 bio_get(bio); 6373 bio_get(bio);
6322 6374
6323 if (!write) { 6375 if (!write) {
@@ -6362,7 +6414,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6362{ 6414{
6363 struct inode *inode = dip->inode; 6415 struct inode *inode = dip->inode;
6364 struct btrfs_root *root = BTRFS_I(inode)->root; 6416 struct btrfs_root *root = BTRFS_I(inode)->root;
6365 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
6366 struct bio *bio; 6417 struct bio *bio;
6367 struct bio *orig_bio = dip->orig_bio; 6418 struct bio *orig_bio = dip->orig_bio;
6368 struct bio_vec *bvec = orig_bio->bi_io_vec; 6419 struct bio_vec *bvec = orig_bio->bi_io_vec;
@@ -6375,7 +6426,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6375 int async_submit = 0; 6426 int async_submit = 0;
6376 6427
6377 map_length = orig_bio->bi_size; 6428 map_length = orig_bio->bi_size;
6378 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6429 ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
6379 &map_length, NULL, 0); 6430 &map_length, NULL, 0);
6380 if (ret) { 6431 if (ret) {
6381 bio_put(orig_bio); 6432 bio_put(orig_bio);
@@ -6429,7 +6480,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6429 bio->bi_end_io = btrfs_end_dio_bio; 6480 bio->bi_end_io = btrfs_end_dio_bio;
6430 6481
6431 map_length = orig_bio->bi_size; 6482 map_length = orig_bio->bi_size;
6432 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6483 ret = btrfs_map_block(root->fs_info, READ,
6484 start_sector << 9,
6433 &map_length, NULL, 0); 6485 &map_length, NULL, 0);
6434 if (ret) { 6486 if (ret) {
6435 bio_put(bio); 6487 bio_put(bio);
@@ -6582,9 +6634,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6582 btrfs_submit_direct, 0); 6634 btrfs_submit_direct, 0);
6583} 6635}
6584 6636
6637#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
6638
6585static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6639static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6586 __u64 start, __u64 len) 6640 __u64 start, __u64 len)
6587{ 6641{
6642 int ret;
6643
6644 ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
6645 if (ret)
6646 return ret;
6647
6588 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); 6648 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6589} 6649}
6590 6650
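With the hunk above, btrfs now validates caller-supplied fiemap flags through fiemap_check_flags() before mapping extents, accepting only FIEMAP_FLAG_SYNC. A minimal userspace sketch of the call this guards (struct and flag names from linux/fiemap.h; unsupported flags are typically reported back as EBADR):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* header plus room for 32 extent records */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;
	fm->fm_flags = FIEMAP_FLAG_SYNC;   /* the only flag btrfs accepts here */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");   /* rejected flags surface as EBADR */
		return 1;
	}
	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("extent %u: logical=%llu len=%llu\n", i,
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_length);
	free(fm);
	close(fd);
	return 0;
}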
@@ -6855,7 +6915,6 @@ static int btrfs_truncate(struct inode *inode)
6855 int ret; 6915 int ret;
6856 int err = 0; 6916 int err = 0;
6857 struct btrfs_trans_handle *trans; 6917 struct btrfs_trans_handle *trans;
6858 unsigned long nr;
6859 u64 mask = root->sectorsize - 1; 6918 u64 mask = root->sectorsize - 1;
6860 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 6919 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6861 6920
@@ -6910,11 +6969,9 @@ static int btrfs_truncate(struct inode *inode)
6910 6969
6911 /* 6970 /*
6912 * 1 for the truncate slack space 6971 * 1 for the truncate slack space
6913 * 1 for the orphan item we're going to add
6914 * 1 for the orphan item deletion
6915 * 1 for updating the inode. 6972 * 1 for updating the inode.
6916 */ 6973 */
6917 trans = btrfs_start_transaction(root, 4); 6974 trans = btrfs_start_transaction(root, 2);
6918 if (IS_ERR(trans)) { 6975 if (IS_ERR(trans)) {
6919 err = PTR_ERR(trans); 6976 err = PTR_ERR(trans);
6920 goto out; 6977 goto out;
@@ -6925,12 +6982,6 @@ static int btrfs_truncate(struct inode *inode)
6925 min_size); 6982 min_size);
6926 BUG_ON(ret); 6983 BUG_ON(ret);
6927 6984
6928 ret = btrfs_orphan_add(trans, inode);
6929 if (ret) {
6930 btrfs_end_transaction(trans, root);
6931 goto out;
6932 }
6933
6934 /* 6985 /*
6935 * setattr is responsible for setting the ordered_data_close flag, 6986 * setattr is responsible for setting the ordered_data_close flag,
6936 * but that is only tested during the last file release. That 6987 * but that is only tested during the last file release. That
@@ -6978,9 +7029,8 @@ static int btrfs_truncate(struct inode *inode)
6978 break; 7029 break;
6979 } 7030 }
6980 7031
6981 nr = trans->blocks_used;
6982 btrfs_end_transaction(trans, root); 7032 btrfs_end_transaction(trans, root);
6983 btrfs_btree_balance_dirty(root, nr); 7033 btrfs_btree_balance_dirty(root);
6984 7034
6985 trans = btrfs_start_transaction(root, 2); 7035 trans = btrfs_start_transaction(root, 2);
6986 if (IS_ERR(trans)) { 7036 if (IS_ERR(trans)) {
@@ -7000,12 +7050,6 @@ static int btrfs_truncate(struct inode *inode)
7000 ret = btrfs_orphan_del(trans, inode); 7050 ret = btrfs_orphan_del(trans, inode);
7001 if (ret) 7051 if (ret)
7002 err = ret; 7052 err = ret;
7003 } else if (ret && inode->i_nlink > 0) {
7004 /*
7005 * Failed to do the truncate, remove us from the in memory
7006 * orphan list.
7007 */
7008 ret = btrfs_orphan_del(NULL, inode);
7009 } 7053 }
7010 7054
7011 if (trans) { 7055 if (trans) {
@@ -7014,9 +7058,8 @@ static int btrfs_truncate(struct inode *inode)
7014 if (ret && !err) 7058 if (ret && !err)
7015 err = ret; 7059 err = ret;
7016 7060
7017 nr = trans->blocks_used;
7018 ret = btrfs_end_transaction(trans, root); 7061 ret = btrfs_end_transaction(trans, root);
7019 btrfs_btree_balance_dirty(root, nr); 7062 btrfs_btree_balance_dirty(root);
7020 } 7063 }
7021 7064
7022out: 7065out:
@@ -7093,6 +7136,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
7093 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 7136 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
7094 ei->io_tree.track_uptodate = 1; 7137 ei->io_tree.track_uptodate = 1;
7095 ei->io_failure_tree.track_uptodate = 1; 7138 ei->io_failure_tree.track_uptodate = 1;
7139 atomic_set(&ei->sync_writers, 0);
7096 mutex_init(&ei->log_mutex); 7140 mutex_init(&ei->log_mutex);
7097 mutex_init(&ei->delalloc_mutex); 7141 mutex_init(&ei->delalloc_mutex);
7098 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 7142 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -7203,6 +7247,8 @@ void btrfs_destroy_cachep(void)
7203 kmem_cache_destroy(btrfs_path_cachep); 7247 kmem_cache_destroy(btrfs_path_cachep);
7204 if (btrfs_free_space_cachep) 7248 if (btrfs_free_space_cachep)
7205 kmem_cache_destroy(btrfs_free_space_cachep); 7249 kmem_cache_destroy(btrfs_free_space_cachep);
7250 if (btrfs_delalloc_work_cachep)
7251 kmem_cache_destroy(btrfs_delalloc_work_cachep);
7206} 7252}
7207 7253
7208int btrfs_init_cachep(void) 7254int btrfs_init_cachep(void)
@@ -7237,6 +7283,13 @@ int btrfs_init_cachep(void)
7237 if (!btrfs_free_space_cachep) 7283 if (!btrfs_free_space_cachep)
7238 goto fail; 7284 goto fail;
7239 7285
7286 btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
7287 sizeof(struct btrfs_delalloc_work), 0,
7288 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
7289 NULL);
7290 if (!btrfs_delalloc_work_cachep)
7291 goto fail;
7292
7240 return 0; 7293 return 0;
7241fail: 7294fail:
7242 btrfs_destroy_cachep(); 7295 btrfs_destroy_cachep();
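The new btrfs_delalloc_work_cachep is a dedicated slab for the per-inode flush work items introduced further down; SLAB_RECLAIM_ACCOUNT marks the objects as reclaimable to the VM and SLAB_MEM_SPREAD spreads them across cpuset memory nodes. A minimal sketch of the same cache lifecycle, with illustrative names (not btrfs code):

#include <linux/errno.h>
#include <linux/slab.h>

struct my_work {
	int payload;
};

static struct kmem_cache *my_work_cachep;

static int my_cache_init(void)
{
	my_work_cachep = kmem_cache_create("my_work",
			sizeof(struct my_work), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!my_work_cachep)
		return -ENOMEM;
	return 0;
}

static void my_cache_use(void)
{
	/* zeroed allocation, as kmem_cache_zalloc() is used above */
	struct my_work *w = kmem_cache_zalloc(my_work_cachep, GFP_NOFS);

	if (!w)
		return;
	/* ... use w ... */
	kmem_cache_free(my_work_cachep, w);
}

static void my_cache_exit(void)
{
	kmem_cache_destroy(my_work_cachep);
}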
@@ -7308,6 +7361,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7308 if (S_ISDIR(old_inode->i_mode) && new_inode && 7361 if (S_ISDIR(old_inode->i_mode) && new_inode &&
7309 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 7362 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
7310 return -ENOTEMPTY; 7363 return -ENOTEMPTY;
7364
7365
7366 /* check for collisions, even if the name isn't there */
7367 ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
7368 new_dentry->d_name.name,
7369 new_dentry->d_name.len);
7370
7371 if (ret) {
7372 if (ret == -EEXIST) {
7373 /* we shouldn't get -EEXIST
7374 * without a new_inode */
7375 if (!new_inode) {
7376 WARN_ON(1);
7377 return ret;
7378 }
7379 } else {
7380 /* maybe -EOVERFLOW */
7381 return ret;
7382 }
7383 }
7384 ret = 0;
7385
7311 /* 7386 /*
7312 * we're using rename to replace one file with another. 7387 * we're using rename to replace one file with another.
7313 * and the replacement file is large. Start IO on it now so 7388 * and the replacement file is large. Start IO on it now so
@@ -7447,39 +7522,110 @@ out_notrans:
7447 return ret; 7522 return ret;
7448} 7523}
7449 7524
7525static void btrfs_run_delalloc_work(struct btrfs_work *work)
7526{
7527 struct btrfs_delalloc_work *delalloc_work;
7528
7529 delalloc_work = container_of(work, struct btrfs_delalloc_work,
7530 work);
7531 if (delalloc_work->wait)
7532 btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
7533 else
7534 filemap_flush(delalloc_work->inode->i_mapping);
7535
7536 if (delalloc_work->delay_iput)
7537 btrfs_add_delayed_iput(delalloc_work->inode);
7538 else
7539 iput(delalloc_work->inode);
7540 complete(&delalloc_work->completion);
7541}
7542
7543struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
7544 int wait, int delay_iput)
7545{
7546 struct btrfs_delalloc_work *work;
7547
7548 work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
7549 if (!work)
7550 return NULL;
7551
7552 init_completion(&work->completion);
7553 INIT_LIST_HEAD(&work->list);
7554 work->inode = inode;
7555 work->wait = wait;
7556 work->delay_iput = delay_iput;
7557 work->work.func = btrfs_run_delalloc_work;
7558
7559 return work;
7560}
7561
7562void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
7563{
7564 wait_for_completion(&work->completion);
7565 kmem_cache_free(btrfs_delalloc_work_cachep, work);
7566}
7567
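btrfs_alloc_delalloc_work() and btrfs_wait_and_free_delalloc_work() pair each queued flush with a completion, so callers can fan work out to flush_workers and rendezvous afterwards. A distilled sketch of that queue-then-wait shape using the generic workqueue API rather than btrfs_workers (illustrative names, module context assumed):

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct flush_job {
	struct work_struct work;
	struct completion done;
};

static void flush_job_fn(struct work_struct *work)
{
	struct flush_job *job = container_of(work, struct flush_job, work);

	/* ... flush one inode here ... */
	complete(&job->done);
}

static int run_one_flush_job(void)
{
	struct flush_job *job;

	job = kzalloc(sizeof(*job), GFP_NOFS);
	if (!job)
		return -ENOMEM;
	init_completion(&job->done);
	INIT_WORK(&job->work, flush_job_fn);
	schedule_work(&job->work);		/* fan out */
	wait_for_completion(&job->done);	/* rendezvous */
	kfree(job);
	return 0;
}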
7450/* 7568/*
7451 * some fairly slow code that needs optimization. This walks the list 7569 * some fairly slow code that needs optimization. This walks the list
7452 * of all the inodes with pending delalloc and forces them to disk. 7570 * of all the inodes with pending delalloc and forces them to disk.
7453 */ 7571 */
7454int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 7572int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7455{ 7573{
7456 struct list_head *head = &root->fs_info->delalloc_inodes;
7457 struct btrfs_inode *binode; 7574 struct btrfs_inode *binode;
7458 struct inode *inode; 7575 struct inode *inode;
7576 struct btrfs_delalloc_work *work, *next;
7577 struct list_head works;
7578 struct list_head splice;
7579 int ret = 0;
7459 7580
7460 if (root->fs_info->sb->s_flags & MS_RDONLY) 7581 if (root->fs_info->sb->s_flags & MS_RDONLY)
7461 return -EROFS; 7582 return -EROFS;
7462 7583
7584 INIT_LIST_HEAD(&works);
7585 INIT_LIST_HEAD(&splice);
7586again:
7463 spin_lock(&root->fs_info->delalloc_lock); 7587 spin_lock(&root->fs_info->delalloc_lock);
7464 while (!list_empty(head)) { 7588 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
7465 binode = list_entry(head->next, struct btrfs_inode, 7589 while (!list_empty(&splice)) {
7590 binode = list_entry(splice.next, struct btrfs_inode,
7466 delalloc_inodes); 7591 delalloc_inodes);
7592
7593 list_del_init(&binode->delalloc_inodes);
7594
7467 inode = igrab(&binode->vfs_inode); 7595 inode = igrab(&binode->vfs_inode);
7468 if (!inode) 7596 if (!inode)
7469 list_del_init(&binode->delalloc_inodes); 7597 continue;
7598
7599 list_add_tail(&binode->delalloc_inodes,
7600 &root->fs_info->delalloc_inodes);
7470 spin_unlock(&root->fs_info->delalloc_lock); 7601 spin_unlock(&root->fs_info->delalloc_lock);
7471 if (inode) { 7602
7472 filemap_flush(inode->i_mapping); 7603 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
7473 if (delay_iput) 7604 if (unlikely(!work)) {
7474 btrfs_add_delayed_iput(inode); 7605 ret = -ENOMEM;
7475 else 7606 goto out;
7476 iput(inode);
7477 } 7607 }
7608 list_add_tail(&work->list, &works);
7609 btrfs_queue_worker(&root->fs_info->flush_workers,
7610 &work->work);
7611
7478 cond_resched(); 7612 cond_resched();
7479 spin_lock(&root->fs_info->delalloc_lock); 7613 spin_lock(&root->fs_info->delalloc_lock);
7480 } 7614 }
7481 spin_unlock(&root->fs_info->delalloc_lock); 7615 spin_unlock(&root->fs_info->delalloc_lock);
7482 7616
7617 list_for_each_entry_safe(work, next, &works, list) {
7618 list_del_init(&work->list);
7619 btrfs_wait_and_free_delalloc_work(work);
7620 }
7621
7622 spin_lock(&root->fs_info->delalloc_lock);
7623 if (!list_empty(&root->fs_info->delalloc_inodes)) {
7624 spin_unlock(&root->fs_info->delalloc_lock);
7625 goto again;
7626 }
7627 spin_unlock(&root->fs_info->delalloc_lock);
7628
7483 /* the filemap_flush will queue IO into the worker threads, but 7629 /* the filemap_flush will queue IO into the worker threads, but
7484 * we have to make sure the IO is actually started and that 7630 * we have to make sure the IO is actually started and that
7485 * ordered extents get created before we return 7631 * ordered extents get created before we return
@@ -7493,6 +7639,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7493 } 7639 }
7494 atomic_dec(&root->fs_info->async_submit_draining); 7640 atomic_dec(&root->fs_info->async_submit_draining);
7495 return 0; 7641 return 0;
7642out:
7643 list_for_each_entry_safe(work, next, &works, list) {
7644 list_del_init(&work->list);
7645 btrfs_wait_and_free_delalloc_work(work);
7646 }
7647
7648 if (!list_empty_careful(&splice)) {
7649 spin_lock(&root->fs_info->delalloc_lock);
7650 list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
7651 spin_unlock(&root->fs_info->delalloc_lock);
7652 }
7653 return ret;
7496} 7654}
7497 7655
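The reworked loop uses the splice-and-drain idiom: steal the whole delalloc list under the spinlock with list_splice_init(), drop the lock to process entries, and re-splice leftovers back on failure. The idiom in isolation, under assumed names:

#include <linux/list.h>
#include <linux/spinlock.h>

struct item {
	struct list_head list;
};

static LIST_HEAD(shared_list);
static DEFINE_SPINLOCK(shared_lock);

static void drain_shared_list(void)
{
	LIST_HEAD(splice);
	struct item *it, *next;

	spin_lock(&shared_lock);
	list_splice_init(&shared_list, &splice);	/* steal everything */
	spin_unlock(&shared_lock);

	/* process without holding the lock */
	list_for_each_entry_safe(it, next, &splice, list) {
		list_del_init(&it->list);
		/* ... work on 'it' ... */
	}

	/* on error, leftovers would go back:
	 * spin_lock(&shared_lock);
	 * list_splice_tail(&splice, &shared_list);
	 * spin_unlock(&shared_lock);
	 */
}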
7498static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 7656static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
@@ -7512,7 +7670,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7512 unsigned long ptr; 7670 unsigned long ptr;
7513 struct btrfs_file_extent_item *ei; 7671 struct btrfs_file_extent_item *ei;
7514 struct extent_buffer *leaf; 7672 struct extent_buffer *leaf;
7515 unsigned long nr = 0;
7516 7673
7517 name_len = strlen(symname) + 1; 7674 name_len = strlen(symname) + 1;
7518 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 7675 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
@@ -7610,13 +7767,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7610out_unlock: 7767out_unlock:
7611 if (!err) 7768 if (!err)
7612 d_instantiate(dentry, inode); 7769 d_instantiate(dentry, inode);
7613 nr = trans->blocks_used;
7614 btrfs_end_transaction(trans, root); 7770 btrfs_end_transaction(trans, root);
7615 if (drop_inode) { 7771 if (drop_inode) {
7616 inode_dec_link_count(inode); 7772 inode_dec_link_count(inode);
7617 iput(inode); 7773 iput(inode);
7618 } 7774 }
7619 btrfs_btree_balance_dirty(root, nr); 7775 btrfs_btree_balance_dirty(root);
7620 return err; 7776 return err;
7621} 7777}
7622 7778
@@ -7679,6 +7835,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7679 em->len = ins.offset; 7835 em->len = ins.offset;
7680 em->block_start = ins.objectid; 7836 em->block_start = ins.objectid;
7681 em->block_len = ins.offset; 7837 em->block_len = ins.offset;
7838 em->orig_block_len = ins.offset;
7682 em->bdev = root->fs_info->fs_devices->latest_bdev; 7839 em->bdev = root->fs_info->fs_devices->latest_bdev;
7683 set_bit(EXTENT_FLAG_PREALLOC, &em->flags); 7840 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7684 em->generation = trans->transid; 7841 em->generation = trans->transid;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8fcf9a59c28d..5b22d45d3c6a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -55,6 +55,7 @@
55#include "backref.h" 55#include "backref.h"
56#include "rcu-string.h" 56#include "rcu-string.h"
57#include "send.h" 57#include "send.h"
58#include "dev-replace.h"
58 59
59/* Mask out flags that are inappropriate for the given type of inode. */ 60/* Mask out flags that are inappropriate for the given type of inode. */
60static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 61static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
140 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; 141 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
141 } 142 }
142 143
143 if (flags & BTRFS_INODE_NODATACOW) 144 if (flags & BTRFS_INODE_NODATACOW) {
144 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 145 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
146 if (S_ISREG(inode->i_mode))
147 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
148 }
145 149
146 btrfs_update_iflags(inode); 150 btrfs_update_iflags(inode);
147} 151}
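The inherit-flags hunk makes NODATACOW imply NODATASUM on regular files, since checksums cannot be kept coherent once data is overwritten in place. A hedged userspace sketch of requesting NODATACOW via the inode-flags ioctl (FS_NOCOW_FL is assumed to come from linux/fs.h, with a fallback to the value btrfs uses; the flag only takes effect on empty files):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

#ifndef FS_NOCOW_FL
#define FS_NOCOW_FL 0x00800000	/* assumed value if the header lacks it */
#endif

int main(int argc, char **argv)
{
	int fd, flags;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
		perror("FS_IOC_GETFLAGS");
		return 1;
	}
	flags |= FS_NOCOW_FL;	/* effective only on empty/new files */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
		perror("FS_IOC_SETFLAGS");
	close(fd);
	return 0;
}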
@@ -571,8 +575,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
571 ret = btrfs_commit_transaction(trans, 575 ret = btrfs_commit_transaction(trans,
572 root->fs_info->extent_root); 576 root->fs_info->extent_root);
573 } 577 }
574 if (ret) 578 if (ret) {
579 /* cleanup_transaction has freed this for us */
580 if (trans->aborted)
581 pending_snapshot = NULL;
575 goto fail; 582 goto fail;
583 }
576 584
577 ret = pending_snapshot->error; 585 ret = pending_snapshot->error;
578 if (ret) 586 if (ret)
@@ -705,6 +713,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
705 if (error) 713 if (error)
706 goto out_dput; 714 goto out_dput;
707 715
716 /*
717 * even if this name doesn't exist, we may get hash collisions.
718 * check for them now when we can safely fail
719 */
720 error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
721 dir->i_ino, name,
722 namelen);
723 if (error)
724 goto out_dput;
725
708 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); 726 down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
709 727
710 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) 728 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
@@ -1225,7 +1243,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1225 } 1243 }
1226 1244
1227 defrag_count += ret; 1245 defrag_count += ret;
1228 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1246 balance_dirty_pages_ratelimited(inode->i_mapping);
1229 mutex_unlock(&inode->i_mutex); 1247 mutex_unlock(&inode->i_mutex);
1230 1248
1231 if (newer_than) { 1249 if (newer_than) {
@@ -1293,12 +1311,13 @@ out_ra:
1293 return ret; 1311 return ret;
1294} 1312}
1295 1313
1296static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 1314static noinline int btrfs_ioctl_resize(struct file *file,
1297 void __user *arg) 1315 void __user *arg)
1298{ 1316{
1299 u64 new_size; 1317 u64 new_size;
1300 u64 old_size; 1318 u64 old_size;
1301 u64 devid = 1; 1319 u64 devid = 1;
1320 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1302 struct btrfs_ioctl_vol_args *vol_args; 1321 struct btrfs_ioctl_vol_args *vol_args;
1303 struct btrfs_trans_handle *trans; 1322 struct btrfs_trans_handle *trans;
1304 struct btrfs_device *device = NULL; 1323 struct btrfs_device *device = NULL;
@@ -1313,13 +1332,18 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1313 if (!capable(CAP_SYS_ADMIN)) 1332 if (!capable(CAP_SYS_ADMIN))
1314 return -EPERM; 1333 return -EPERM;
1315 1334
1316 mutex_lock(&root->fs_info->volume_mutex); 1335 ret = mnt_want_write_file(file);
1317 if (root->fs_info->balance_ctl) { 1336 if (ret)
1318 printk(KERN_INFO "btrfs: balance in progress\n"); 1337 return ret;
1319 ret = -EINVAL; 1338
1320 goto out; 1339 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1340 1)) {
1341 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
1342 mnt_drop_write_file(file);
1343 return -EINVAL;
1321 } 1344 }
1322 1345
1346 mutex_lock(&root->fs_info->volume_mutex);
1323 vol_args = memdup_user(arg, sizeof(*vol_args)); 1347 vol_args = memdup_user(arg, sizeof(*vol_args));
1324 if (IS_ERR(vol_args)) { 1348 if (IS_ERR(vol_args)) {
1325 ret = PTR_ERR(vol_args); 1349 ret = PTR_ERR(vol_args);
@@ -1339,16 +1363,18 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1339 printk(KERN_INFO "btrfs: resizing devid %llu\n", 1363 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1340 (unsigned long long)devid); 1364 (unsigned long long)devid);
1341 } 1365 }
1342 device = btrfs_find_device(root, devid, NULL, NULL); 1366
1367 device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
1343 if (!device) { 1368 if (!device) {
1344 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1369 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1345 (unsigned long long)devid); 1370 (unsigned long long)devid);
1346 ret = -EINVAL; 1371 ret = -EINVAL;
1347 goto out_free; 1372 goto out_free;
1348 } 1373 }
1349 if (device->fs_devices && device->fs_devices->seeding) { 1374
1375 if (!device->writeable) {
1350 printk(KERN_INFO "btrfs: resizer unable to apply on " 1376 printk(KERN_INFO "btrfs: resizer unable to apply on "
1351 "seeding device %llu\n", 1377 "readonly device %llu\n",
1352 (unsigned long long)devid); 1378 (unsigned long long)devid);
1353 ret = -EINVAL; 1379 ret = -EINVAL;
1354 goto out_free; 1380 goto out_free;
@@ -1371,6 +1397,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1371 } 1397 }
1372 } 1398 }
1373 1399
1400 if (device->is_tgtdev_for_dev_replace) {
1401 ret = -EINVAL;
1402 goto out_free;
1403 }
1404
1374 old_size = device->total_bytes; 1405 old_size = device->total_bytes;
1375 1406
1376 if (mod < 0) { 1407 if (mod < 0) {
@@ -1409,12 +1440,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1409 btrfs_commit_transaction(trans, root); 1440 btrfs_commit_transaction(trans, root);
1410 } else if (new_size < old_size) { 1441 } else if (new_size < old_size) {
1411 ret = btrfs_shrink_device(device, new_size); 1442 ret = btrfs_shrink_device(device, new_size);
1412 } 1443 } /* sizes equal, nothing to do */
1413 1444
1414out_free: 1445out_free:
1415 kfree(vol_args); 1446 kfree(vol_args);
1416out: 1447out:
1417 mutex_unlock(&root->fs_info->volume_mutex); 1448 mutex_unlock(&root->fs_info->volume_mutex);
1449 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
1450 mnt_drop_write_file(file);
1418 return ret; 1451 return ret;
1419} 1452}
1420 1453
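Resize (and, below, defrag, device add/remove, balance, and dev-replace start) now serialize through fs_info->mutually_exclusive_operation_running, taken with atomic_xchg(): xchg returns the previous value, so the caller that reads back 0 owns the slot and everyone else fails fast. The try-lock shape in isolation (illustrative, outside btrfs; the ioctls above return -EINVAL on contention, while dev-replace start returns -EINPROGRESS):

#include <linux/atomic.h>
#include <linux/errno.h>

static atomic_t op_running = ATOMIC_INIT(0);

static int try_exclusive_op(void)
{
	/* xchg returns the previous value: non-zero means another op owns it */
	if (atomic_xchg(&op_running, 1))
		return -EINPROGRESS;

	/* ... perform the device operation ... */

	atomic_set(&op_running, 0);	/* release the slot */
	return 0;
}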
@@ -2065,13 +2098,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2065 err = inode_permission(inode, MAY_WRITE | MAY_EXEC); 2098 err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
2066 if (err) 2099 if (err)
2067 goto out_dput; 2100 goto out_dput;
2068
2069 /* check if subvolume may be deleted by a non-root user */
2070 err = btrfs_may_delete(dir, dentry, 1);
2071 if (err)
2072 goto out_dput;
2073 } 2101 }
2074 2102
2103 /* check if subvolume may be deleted by a user */
2104 err = btrfs_may_delete(dir, dentry, 1);
2105 if (err)
2106 goto out_dput;
2107
2075 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { 2108 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
2076 err = -EINVAL; 2109 err = -EINVAL;
2077 goto out_dput; 2110 goto out_dput;
@@ -2153,13 +2186,22 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2153 struct btrfs_ioctl_defrag_range_args *range; 2186 struct btrfs_ioctl_defrag_range_args *range;
2154 int ret; 2187 int ret;
2155 2188
2156 if (btrfs_root_readonly(root))
2157 return -EROFS;
2158
2159 ret = mnt_want_write_file(file); 2189 ret = mnt_want_write_file(file);
2160 if (ret) 2190 if (ret)
2161 return ret; 2191 return ret;
2162 2192
2193 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2194 1)) {
2195 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2196 mnt_drop_write_file(file);
2197 return -EINVAL;
2198 }
2199
2200 if (btrfs_root_readonly(root)) {
2201 ret = -EROFS;
2202 goto out;
2203 }
2204
2163 switch (inode->i_mode & S_IFMT) { 2205 switch (inode->i_mode & S_IFMT) {
2164 case S_IFDIR: 2206 case S_IFDIR:
2165 if (!capable(CAP_SYS_ADMIN)) { 2207 if (!capable(CAP_SYS_ADMIN)) {
@@ -2209,6 +2251,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2209 ret = -EINVAL; 2251 ret = -EINVAL;
2210 } 2252 }
2211out: 2253out:
2254 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2212 mnt_drop_write_file(file); 2255 mnt_drop_write_file(file);
2213 return ret; 2256 return ret;
2214} 2257}
@@ -2221,13 +2264,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2221 if (!capable(CAP_SYS_ADMIN)) 2264 if (!capable(CAP_SYS_ADMIN))
2222 return -EPERM; 2265 return -EPERM;
2223 2266
2224 mutex_lock(&root->fs_info->volume_mutex); 2267 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2225 if (root->fs_info->balance_ctl) { 2268 1)) {
2226 printk(KERN_INFO "btrfs: balance in progress\n"); 2269 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2227 ret = -EINVAL; 2270 return -EINVAL;
2228 goto out;
2229 } 2271 }
2230 2272
2273 mutex_lock(&root->fs_info->volume_mutex);
2231 vol_args = memdup_user(arg, sizeof(*vol_args)); 2274 vol_args = memdup_user(arg, sizeof(*vol_args));
2232 if (IS_ERR(vol_args)) { 2275 if (IS_ERR(vol_args)) {
2233 ret = PTR_ERR(vol_args); 2276 ret = PTR_ERR(vol_args);
@@ -2240,27 +2283,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2240 kfree(vol_args); 2283 kfree(vol_args);
2241out: 2284out:
2242 mutex_unlock(&root->fs_info->volume_mutex); 2285 mutex_unlock(&root->fs_info->volume_mutex);
2286 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2243 return ret; 2287 return ret;
2244} 2288}
2245 2289
2246static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) 2290static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2247{ 2291{
2292 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
2248 struct btrfs_ioctl_vol_args *vol_args; 2293 struct btrfs_ioctl_vol_args *vol_args;
2249 int ret; 2294 int ret;
2250 2295
2251 if (!capable(CAP_SYS_ADMIN)) 2296 if (!capable(CAP_SYS_ADMIN))
2252 return -EPERM; 2297 return -EPERM;
2253 2298
2254 if (root->fs_info->sb->s_flags & MS_RDONLY) 2299 ret = mnt_want_write_file(file);
2255 return -EROFS; 2300 if (ret)
2301 return ret;
2256 2302
2257 mutex_lock(&root->fs_info->volume_mutex); 2303 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2258 if (root->fs_info->balance_ctl) { 2304 1)) {
2259 printk(KERN_INFO "btrfs: balance in progress\n"); 2305 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
2260 ret = -EINVAL; 2306 mnt_drop_write_file(file);
2261 goto out; 2307 return -EINVAL;
2262 } 2308 }
2263 2309
2310 mutex_lock(&root->fs_info->volume_mutex);
2264 vol_args = memdup_user(arg, sizeof(*vol_args)); 2311 vol_args = memdup_user(arg, sizeof(*vol_args));
2265 if (IS_ERR(vol_args)) { 2312 if (IS_ERR(vol_args)) {
2266 ret = PTR_ERR(vol_args); 2313 ret = PTR_ERR(vol_args);
@@ -2273,6 +2320,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2273 kfree(vol_args); 2320 kfree(vol_args);
2274out: 2321out:
2275 mutex_unlock(&root->fs_info->volume_mutex); 2322 mutex_unlock(&root->fs_info->volume_mutex);
2323 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2324 mnt_drop_write_file(file);
2276 return ret; 2325 return ret;
2277} 2326}
2278 2327
@@ -2328,7 +2377,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2328 s_uuid = di_args->uuid; 2377 s_uuid = di_args->uuid;
2329 2378
2330 mutex_lock(&fs_devices->device_list_mutex); 2379 mutex_lock(&fs_devices->device_list_mutex);
2331 dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); 2380 dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
2332 mutex_unlock(&fs_devices->device_list_mutex); 2381 mutex_unlock(&fs_devices->device_list_mutex);
2333 2382
2334 if (!dev) { 2383 if (!dev) {
@@ -2821,12 +2870,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2821 struct btrfs_disk_key disk_key; 2870 struct btrfs_disk_key disk_key;
2822 u64 objectid = 0; 2871 u64 objectid = 0;
2823 u64 dir_id; 2872 u64 dir_id;
2873 int ret;
2824 2874
2825 if (!capable(CAP_SYS_ADMIN)) 2875 if (!capable(CAP_SYS_ADMIN))
2826 return -EPERM; 2876 return -EPERM;
2827 2877
2828 if (copy_from_user(&objectid, argp, sizeof(objectid))) 2878 ret = mnt_want_write_file(file);
2829 return -EFAULT; 2879 if (ret)
2880 return ret;
2881
2882 if (copy_from_user(&objectid, argp, sizeof(objectid))) {
2883 ret = -EFAULT;
2884 goto out;
2885 }
2830 2886
2831 if (!objectid) 2887 if (!objectid)
2832 objectid = root->root_key.objectid; 2888 objectid = root->root_key.objectid;
@@ -2836,21 +2892,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2836 location.offset = (u64)-1; 2892 location.offset = (u64)-1;
2837 2893
2838 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 2894 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
2839 if (IS_ERR(new_root)) 2895 if (IS_ERR(new_root)) {
2840 return PTR_ERR(new_root); 2896 ret = PTR_ERR(new_root);
2897 goto out;
2898 }
2841 2899
2842 if (btrfs_root_refs(&new_root->root_item) == 0) 2900 if (btrfs_root_refs(&new_root->root_item) == 0) {
2843 return -ENOENT; 2901 ret = -ENOENT;
2902 goto out;
2903 }
2844 2904
2845 path = btrfs_alloc_path(); 2905 path = btrfs_alloc_path();
2846 if (!path) 2906 if (!path) {
2847 return -ENOMEM; 2907 ret = -ENOMEM;
2908 goto out;
2909 }
2848 path->leave_spinning = 1; 2910 path->leave_spinning = 1;
2849 2911
2850 trans = btrfs_start_transaction(root, 1); 2912 trans = btrfs_start_transaction(root, 1);
2851 if (IS_ERR(trans)) { 2913 if (IS_ERR(trans)) {
2852 btrfs_free_path(path); 2914 btrfs_free_path(path);
2853 return PTR_ERR(trans); 2915 ret = PTR_ERR(trans);
2916 goto out;
2854 } 2917 }
2855 2918
2856 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 2919 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
@@ -2861,7 +2924,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2861 btrfs_end_transaction(trans, root); 2924 btrfs_end_transaction(trans, root);
2862 printk(KERN_ERR "Umm, you don't have the default dir item, " 2925 printk(KERN_ERR "Umm, you don't have the default dir item, "
2863 "this isn't going to work\n"); 2926 "this isn't going to work\n");
2864 return -ENOENT; 2927 ret = -ENOENT;
2928 goto out;
2865 } 2929 }
2866 2930
2867 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); 2931 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2871,8 +2935,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2871 2935
2872 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); 2936 btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
2873 btrfs_end_transaction(trans, root); 2937 btrfs_end_transaction(trans, root);
2874 2938out:
2875 return 0; 2939 mnt_drop_write_file(file);
2940 return ret;
2876} 2941}
2877 2942
2878void btrfs_get_block_group_info(struct list_head *groups_list, 2943void btrfs_get_block_group_info(struct list_head *groups_list,
@@ -3036,32 +3101,38 @@ long btrfs_ioctl_trans_end(struct file *file)
3036 return 0; 3101 return 0;
3037} 3102}
3038 3103
3039static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) 3104static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3105 void __user *argp)
3040{ 3106{
3041 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3042 struct btrfs_trans_handle *trans; 3107 struct btrfs_trans_handle *trans;
3043 u64 transid; 3108 u64 transid;
3044 int ret; 3109 int ret;
3045 3110
3046 trans = btrfs_start_transaction(root, 0); 3111 trans = btrfs_attach_transaction(root);
3047 if (IS_ERR(trans)) 3112 if (IS_ERR(trans)) {
3048 return PTR_ERR(trans); 3113 if (PTR_ERR(trans) != -ENOENT)
3114 return PTR_ERR(trans);
3115
3116 /* No running transaction, don't bother */
3117 transid = root->fs_info->last_trans_committed;
3118 goto out;
3119 }
3049 transid = trans->transid; 3120 transid = trans->transid;
3050 ret = btrfs_commit_transaction_async(trans, root, 0); 3121 ret = btrfs_commit_transaction_async(trans, root, 0);
3051 if (ret) { 3122 if (ret) {
3052 btrfs_end_transaction(trans, root); 3123 btrfs_end_transaction(trans, root);
3053 return ret; 3124 return ret;
3054 } 3125 }
3055 3126out:
3056 if (argp) 3127 if (argp)
3057 if (copy_to_user(argp, &transid, sizeof(transid))) 3128 if (copy_to_user(argp, &transid, sizeof(transid)))
3058 return -EFAULT; 3129 return -EFAULT;
3059 return 0; 3130 return 0;
3060} 3131}
3061 3132
3062static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) 3133static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
3134 void __user *argp)
3063{ 3135{
3064 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3065 u64 transid; 3136 u64 transid;
3066 3137
3067 if (argp) { 3138 if (argp) {
@@ -3073,10 +3144,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
3073 return btrfs_wait_for_commit(root, transid); 3144 return btrfs_wait_for_commit(root, transid);
3074} 3145}
3075 3146
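With btrfs_attach_transaction(), START_SYNC no longer spins up a transaction when none is running; it simply reports last_trans_committed. From userspace the two ioctls are typically paired, as in this sketch (the BTRFS_IOC_* macros are assumed to come from btrfs-progs' btrfs/ioctl.h):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <btrfs/ioctl.h>	/* assumed header path */

int main(int argc, char **argv)
{
	__u64 transid = 0;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);	/* any file or dir on the fs */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* kick off an async commit and learn its transid ... */
	if (ioctl(fd, BTRFS_IOC_START_SYNC, &transid) < 0) {
		perror("BTRFS_IOC_START_SYNC");
		return 1;
	}
	/* ... then block until that transid is on disk */
	if (ioctl(fd, BTRFS_IOC_WAIT_SYNC, &transid) < 0) {
		perror("BTRFS_IOC_WAIT_SYNC");
		return 1;
	}
	printf("committed transid %llu\n", (unsigned long long)transid);
	close(fd);
	return 0;
}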
3076static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) 3147static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
3077{ 3148{
3078 int ret; 3149 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3079 struct btrfs_ioctl_scrub_args *sa; 3150 struct btrfs_ioctl_scrub_args *sa;
3151 int ret;
3080 3152
3081 if (!capable(CAP_SYS_ADMIN)) 3153 if (!capable(CAP_SYS_ADMIN))
3082 return -EPERM; 3154 return -EPERM;
@@ -3085,12 +3157,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
3085 if (IS_ERR(sa)) 3157 if (IS_ERR(sa))
3086 return PTR_ERR(sa); 3158 return PTR_ERR(sa);
3087 3159
3088 ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, 3160 if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
3089 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); 3161 ret = mnt_want_write_file(file);
3162 if (ret)
3163 goto out;
3164 }
3165
3166 ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
3167 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
3168 0);
3090 3169
3091 if (copy_to_user(arg, sa, sizeof(*sa))) 3170 if (copy_to_user(arg, sa, sizeof(*sa)))
3092 ret = -EFAULT; 3171 ret = -EFAULT;
3093 3172
3173 if (!(sa->flags & BTRFS_SCRUB_READONLY))
3174 mnt_drop_write_file(file);
3175out:
3094 kfree(sa); 3176 kfree(sa);
3095 return ret; 3177 return ret;
3096} 3178}
@@ -3100,7 +3182,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
3100 if (!capable(CAP_SYS_ADMIN)) 3182 if (!capable(CAP_SYS_ADMIN))
3101 return -EPERM; 3183 return -EPERM;
3102 3184
3103 return btrfs_scrub_cancel(root); 3185 return btrfs_scrub_cancel(root->fs_info);
3104} 3186}
3105 3187
3106static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, 3188static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@ -3149,6 +3231,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3149 return ret; 3231 return ret;
3150} 3232}
3151 3233
3234static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
3235{
3236 struct btrfs_ioctl_dev_replace_args *p;
3237 int ret;
3238
3239 if (!capable(CAP_SYS_ADMIN))
3240 return -EPERM;
3241
3242 p = memdup_user(arg, sizeof(*p));
3243 if (IS_ERR(p))
3244 return PTR_ERR(p);
3245
3246 switch (p->cmd) {
3247 case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
3248 if (atomic_xchg(
3249 &root->fs_info->mutually_exclusive_operation_running,
3250 1)) {
3251 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3252 ret = -EINPROGRESS;
3253 } else {
3254 ret = btrfs_dev_replace_start(root, p);
3255 atomic_set(
3256 &root->fs_info->mutually_exclusive_operation_running,
3257 0);
3258 }
3259 break;
3260 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
3261 btrfs_dev_replace_status(root->fs_info, p);
3262 ret = 0;
3263 break;
3264 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
3265 ret = btrfs_dev_replace_cancel(root->fs_info, p);
3266 break;
3267 default:
3268 ret = -EINVAL;
3269 break;
3270 }
3271
3272 if (copy_to_user(arg, p, sizeof(*p)))
3273 ret = -EFAULT;
3274
3275 kfree(p);
3276 return ret;
3277}
3278
3152static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) 3279static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3153{ 3280{
3154 int ret = 0; 3281 int ret = 0;
@@ -3314,6 +3441,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3314 struct btrfs_fs_info *fs_info = root->fs_info; 3441 struct btrfs_fs_info *fs_info = root->fs_info;
3315 struct btrfs_ioctl_balance_args *bargs; 3442 struct btrfs_ioctl_balance_args *bargs;
3316 struct btrfs_balance_control *bctl; 3443 struct btrfs_balance_control *bctl;
3444 bool need_unlock; /* for mut. excl. ops lock */
3317 int ret; 3445 int ret;
3318 3446
3319 if (!capable(CAP_SYS_ADMIN)) 3447 if (!capable(CAP_SYS_ADMIN))
@@ -3323,14 +3451,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3323 if (ret) 3451 if (ret)
3324 return ret; 3452 return ret;
3325 3453
3326 mutex_lock(&fs_info->volume_mutex); 3454again:
3455 if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
3456 mutex_lock(&fs_info->volume_mutex);
3457 mutex_lock(&fs_info->balance_mutex);
3458 need_unlock = true;
3459 goto locked;
3460 }
3461
3462 /*
3463 * mut. excl. ops lock is locked. Three possibilities:
3464 * (1) some other op is running
3465 * (2) balance is running
3466 * (3) balance is paused -- special case (think resume)
3467 */
3327 mutex_lock(&fs_info->balance_mutex); 3468 mutex_lock(&fs_info->balance_mutex);
3469 if (fs_info->balance_ctl) {
3470 /* this is either (2) or (3) */
3471 if (!atomic_read(&fs_info->balance_running)) {
3472 mutex_unlock(&fs_info->balance_mutex);
3473 if (!mutex_trylock(&fs_info->volume_mutex))
3474 goto again;
3475 mutex_lock(&fs_info->balance_mutex);
3476
3477 if (fs_info->balance_ctl &&
3478 !atomic_read(&fs_info->balance_running)) {
3479 /* this is (3) */
3480 need_unlock = false;
3481 goto locked;
3482 }
3483
3484 mutex_unlock(&fs_info->balance_mutex);
3485 mutex_unlock(&fs_info->volume_mutex);
3486 goto again;
3487 } else {
3488 /* this is (2) */
3489 mutex_unlock(&fs_info->balance_mutex);
3490 ret = -EINPROGRESS;
3491 goto out;
3492 }
3493 } else {
3494 /* this is (1) */
3495 mutex_unlock(&fs_info->balance_mutex);
3496 pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
3497 ret = -EINVAL;
3498 goto out;
3499 }
3500
3501locked:
3502 BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
3328 3503
3329 if (arg) { 3504 if (arg) {
3330 bargs = memdup_user(arg, sizeof(*bargs)); 3505 bargs = memdup_user(arg, sizeof(*bargs));
3331 if (IS_ERR(bargs)) { 3506 if (IS_ERR(bargs)) {
3332 ret = PTR_ERR(bargs); 3507 ret = PTR_ERR(bargs);
3333 goto out; 3508 goto out_unlock;
3334 } 3509 }
3335 3510
3336 if (bargs->flags & BTRFS_BALANCE_RESUME) { 3511 if (bargs->flags & BTRFS_BALANCE_RESUME) {
@@ -3374,11 +3549,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3374 } 3549 }
3375 3550
3376do_balance: 3551do_balance:
3377 ret = btrfs_balance(bctl, bargs);
3378 /* 3552 /*
3379 * bctl is freed in __cancel_balance or in free_fs_info if 3553 * Ownership of bctl and mutually_exclusive_operation_running
3380 * restriper was paused all the way until unmount 3554 * goes to btrfs_balance. bctl is freed in __cancel_balance,
3555 * or, if restriper was paused all the way until unmount, in
3556 * free_fs_info. mutually_exclusive_operation_running is
3557 * cleared in __cancel_balance.
3381 */ 3558 */
3559 need_unlock = false;
3560
3561 ret = btrfs_balance(bctl, bargs);
3562
3382 if (arg) { 3563 if (arg) {
3383 if (copy_to_user(arg, bargs, sizeof(*bargs))) 3564 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3384 ret = -EFAULT; 3565 ret = -EFAULT;
@@ -3386,9 +3567,12 @@ do_balance:
3386 3567
3387out_bargs: 3568out_bargs:
3388 kfree(bargs); 3569 kfree(bargs);
3389out: 3570out_unlock:
3390 mutex_unlock(&fs_info->balance_mutex); 3571 mutex_unlock(&fs_info->balance_mutex);
3391 mutex_unlock(&fs_info->volume_mutex); 3572 mutex_unlock(&fs_info->volume_mutex);
3573 if (need_unlock)
3574 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3575out:
3392 mnt_drop_write_file(file); 3576 mnt_drop_write_file(file);
3393 return ret; 3577 return ret;
3394} 3578}
@@ -3441,8 +3625,9 @@ out:
3441 return ret; 3625 return ret;
3442} 3626}
3443 3627
3444static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg) 3628static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3445{ 3629{
3630 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3446 struct btrfs_ioctl_quota_ctl_args *sa; 3631 struct btrfs_ioctl_quota_ctl_args *sa;
3447 struct btrfs_trans_handle *trans = NULL; 3632 struct btrfs_trans_handle *trans = NULL;
3448 int ret; 3633 int ret;
@@ -3451,12 +3636,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3451 if (!capable(CAP_SYS_ADMIN)) 3636 if (!capable(CAP_SYS_ADMIN))
3452 return -EPERM; 3637 return -EPERM;
3453 3638
3454 if (root->fs_info->sb->s_flags & MS_RDONLY) 3639 ret = mnt_want_write_file(file);
3455 return -EROFS; 3640 if (ret)
3641 return ret;
3456 3642
3457 sa = memdup_user(arg, sizeof(*sa)); 3643 sa = memdup_user(arg, sizeof(*sa));
3458 if (IS_ERR(sa)) 3644 if (IS_ERR(sa)) {
3459 return PTR_ERR(sa); 3645 ret = PTR_ERR(sa);
3646 goto drop_write;
3647 }
3460 3648
3461 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { 3649 if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
3462 trans = btrfs_start_transaction(root, 2); 3650 trans = btrfs_start_transaction(root, 2);
@@ -3489,14 +3677,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3489 if (err && !ret) 3677 if (err && !ret)
3490 ret = err; 3678 ret = err;
3491 } 3679 }
3492
3493out: 3680out:
3494 kfree(sa); 3681 kfree(sa);
3682drop_write:
3683 mnt_drop_write_file(file);
3495 return ret; 3684 return ret;
3496} 3685}
3497 3686
3498static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg) 3687static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
3499{ 3688{
3689 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3500 struct btrfs_ioctl_qgroup_assign_args *sa; 3690 struct btrfs_ioctl_qgroup_assign_args *sa;
3501 struct btrfs_trans_handle *trans; 3691 struct btrfs_trans_handle *trans;
3502 int ret; 3692 int ret;
@@ -3505,12 +3695,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3505 if (!capable(CAP_SYS_ADMIN)) 3695 if (!capable(CAP_SYS_ADMIN))
3506 return -EPERM; 3696 return -EPERM;
3507 3697
3508 if (root->fs_info->sb->s_flags & MS_RDONLY) 3698 ret = mnt_want_write_file(file);
3509 return -EROFS; 3699 if (ret)
3700 return ret;
3510 3701
3511 sa = memdup_user(arg, sizeof(*sa)); 3702 sa = memdup_user(arg, sizeof(*sa));
3512 if (IS_ERR(sa)) 3703 if (IS_ERR(sa)) {
3513 return PTR_ERR(sa); 3704 ret = PTR_ERR(sa);
3705 goto drop_write;
3706 }
3514 3707
3515 trans = btrfs_join_transaction(root); 3708 trans = btrfs_join_transaction(root);
3516 if (IS_ERR(trans)) { 3709 if (IS_ERR(trans)) {
@@ -3533,11 +3726,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3533 3726
3534out: 3727out:
3535 kfree(sa); 3728 kfree(sa);
3729drop_write:
3730 mnt_drop_write_file(file);
3536 return ret; 3731 return ret;
3537} 3732}
3538 3733
3539static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg) 3734static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
3540{ 3735{
3736 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3541 struct btrfs_ioctl_qgroup_create_args *sa; 3737 struct btrfs_ioctl_qgroup_create_args *sa;
3542 struct btrfs_trans_handle *trans; 3738 struct btrfs_trans_handle *trans;
3543 int ret; 3739 int ret;
@@ -3546,12 +3742,20 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3546 if (!capable(CAP_SYS_ADMIN)) 3742 if (!capable(CAP_SYS_ADMIN))
3547 return -EPERM; 3743 return -EPERM;
3548 3744
3549 if (root->fs_info->sb->s_flags & MS_RDONLY) 3745 ret = mnt_want_write_file(file);
3550 return -EROFS; 3746 if (ret)
3747 return ret;
3551 3748
3552 sa = memdup_user(arg, sizeof(*sa)); 3749 sa = memdup_user(arg, sizeof(*sa));
3553 if (IS_ERR(sa)) 3750 if (IS_ERR(sa)) {
3554 return PTR_ERR(sa); 3751 ret = PTR_ERR(sa);
3752 goto drop_write;
3753 }
3754
3755 if (!sa->qgroupid) {
3756 ret = -EINVAL;
3757 goto out;
3758 }
3555 3759
3556 trans = btrfs_join_transaction(root); 3760 trans = btrfs_join_transaction(root);
3557 if (IS_ERR(trans)) { 3761 if (IS_ERR(trans)) {
@@ -3573,11 +3777,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3573 3777
3574out: 3778out:
3575 kfree(sa); 3779 kfree(sa);
3780drop_write:
3781 mnt_drop_write_file(file);
3576 return ret; 3782 return ret;
3577} 3783}
3578 3784
3579static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg) 3785static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
3580{ 3786{
3787 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3581 struct btrfs_ioctl_qgroup_limit_args *sa; 3788 struct btrfs_ioctl_qgroup_limit_args *sa;
3582 struct btrfs_trans_handle *trans; 3789 struct btrfs_trans_handle *trans;
3583 int ret; 3790 int ret;
@@ -3587,12 +3794,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3587 if (!capable(CAP_SYS_ADMIN)) 3794 if (!capable(CAP_SYS_ADMIN))
3588 return -EPERM; 3795 return -EPERM;
3589 3796
3590 if (root->fs_info->sb->s_flags & MS_RDONLY) 3797 ret = mnt_want_write_file(file);
3591 return -EROFS; 3798 if (ret)
3799 return ret;
3592 3800
3593 sa = memdup_user(arg, sizeof(*sa)); 3801 sa = memdup_user(arg, sizeof(*sa));
3594 if (IS_ERR(sa)) 3802 if (IS_ERR(sa)) {
3595 return PTR_ERR(sa); 3803 ret = PTR_ERR(sa);
3804 goto drop_write;
3805 }
3596 3806
3597 trans = btrfs_join_transaction(root); 3807 trans = btrfs_join_transaction(root);
3598 if (IS_ERR(trans)) { 3808 if (IS_ERR(trans)) {
@@ -3615,6 +3825,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3615 3825
3616out: 3826out:
3617 kfree(sa); 3827 kfree(sa);
3828drop_write:
3829 mnt_drop_write_file(file);
3618 return ret; 3830 return ret;
3619} 3831}
3620 3832
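All of the quota/qgroup handlers above trade the bare MS_RDONLY test for the mnt_want_write_file()/mnt_drop_write_file() bracket, which both refuses read-only mounts and holds off remount-ro for the duration of the write. The canonical shape, as an illustrative handler (not btrfs code):

#include <linux/fs.h>
#include <linux/mount.h>

static long example_write_ioctl(struct file *file)
{
	int ret;

	ret = mnt_want_write_file(file);	/* -EROFS on read-only mounts */
	if (ret)
		return ret;

	/* ... modify the filesystem ... */

	mnt_drop_write_file(file);		/* re-allow remount-ro */
	return 0;
}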
@@ -3735,11 +3947,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3735 case BTRFS_IOC_DEFRAG_RANGE: 3947 case BTRFS_IOC_DEFRAG_RANGE:
3736 return btrfs_ioctl_defrag(file, argp); 3948 return btrfs_ioctl_defrag(file, argp);
3737 case BTRFS_IOC_RESIZE: 3949 case BTRFS_IOC_RESIZE:
3738 return btrfs_ioctl_resize(root, argp); 3950 return btrfs_ioctl_resize(file, argp);
3739 case BTRFS_IOC_ADD_DEV: 3951 case BTRFS_IOC_ADD_DEV:
3740 return btrfs_ioctl_add_dev(root, argp); 3952 return btrfs_ioctl_add_dev(root, argp);
3741 case BTRFS_IOC_RM_DEV: 3953 case BTRFS_IOC_RM_DEV:
3742 return btrfs_ioctl_rm_dev(root, argp); 3954 return btrfs_ioctl_rm_dev(file, argp);
3743 case BTRFS_IOC_FS_INFO: 3955 case BTRFS_IOC_FS_INFO:
3744 return btrfs_ioctl_fs_info(root, argp); 3956 return btrfs_ioctl_fs_info(root, argp);
3745 case BTRFS_IOC_DEV_INFO: 3957 case BTRFS_IOC_DEV_INFO:
@@ -3768,11 +3980,11 @@ long btrfs_ioctl(struct file *file, unsigned int
3768 btrfs_sync_fs(file->f_dentry->d_sb, 1); 3980 btrfs_sync_fs(file->f_dentry->d_sb, 1);
3769 return 0; 3981 return 0;
3770 case BTRFS_IOC_START_SYNC: 3982 case BTRFS_IOC_START_SYNC:
3771 return btrfs_ioctl_start_sync(file, argp); 3983 return btrfs_ioctl_start_sync(root, argp);
3772 case BTRFS_IOC_WAIT_SYNC: 3984 case BTRFS_IOC_WAIT_SYNC:
3773 return btrfs_ioctl_wait_sync(file, argp); 3985 return btrfs_ioctl_wait_sync(root, argp);
3774 case BTRFS_IOC_SCRUB: 3986 case BTRFS_IOC_SCRUB:
3775 return btrfs_ioctl_scrub(root, argp); 3987 return btrfs_ioctl_scrub(file, argp);
3776 case BTRFS_IOC_SCRUB_CANCEL: 3988 case BTRFS_IOC_SCRUB_CANCEL:
3777 return btrfs_ioctl_scrub_cancel(root, argp); 3989 return btrfs_ioctl_scrub_cancel(root, argp);
3778 case BTRFS_IOC_SCRUB_PROGRESS: 3990 case BTRFS_IOC_SCRUB_PROGRESS:
@@ -3790,13 +4002,15 @@ long btrfs_ioctl(struct file *file, unsigned int
3790 case BTRFS_IOC_GET_DEV_STATS: 4002 case BTRFS_IOC_GET_DEV_STATS:
3791 return btrfs_ioctl_get_dev_stats(root, argp); 4003 return btrfs_ioctl_get_dev_stats(root, argp);
3792 case BTRFS_IOC_QUOTA_CTL: 4004 case BTRFS_IOC_QUOTA_CTL:
3793 return btrfs_ioctl_quota_ctl(root, argp); 4005 return btrfs_ioctl_quota_ctl(file, argp);
3794 case BTRFS_IOC_QGROUP_ASSIGN: 4006 case BTRFS_IOC_QGROUP_ASSIGN:
3795 return btrfs_ioctl_qgroup_assign(root, argp); 4007 return btrfs_ioctl_qgroup_assign(file, argp);
3796 case BTRFS_IOC_QGROUP_CREATE: 4008 case BTRFS_IOC_QGROUP_CREATE:
3797 return btrfs_ioctl_qgroup_create(root, argp); 4009 return btrfs_ioctl_qgroup_create(file, argp);
3798 case BTRFS_IOC_QGROUP_LIMIT: 4010 case BTRFS_IOC_QGROUP_LIMIT:
3799 return btrfs_ioctl_qgroup_limit(root, argp); 4011 return btrfs_ioctl_qgroup_limit(file, argp);
4012 case BTRFS_IOC_DEV_REPLACE:
4013 return btrfs_ioctl_dev_replace(root, argp);
3800 } 4014 }
3801 4015
3802 return -ENOTTY; 4016 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 731e2875ab93..dabca9cc8c2e 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_DEVICE_PATH_NAME_MAX 1024
34
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) 35#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1) 36#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) 37#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2)
@@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args {
123 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; 125 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
124}; 126};
125 127
126#define BTRFS_DEVICE_PATH_NAME_MAX 1024 128#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
129#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
130struct btrfs_ioctl_dev_replace_start_params {
131 __u64 srcdevid; /* in, if 0, use srcdev_name instead */
132 __u64 cont_reading_from_srcdev_mode; /* in, see #define
133 * above */
134 __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
135 __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */
136};
137
138#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0
139#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1
140#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2
141#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3
142#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4
143struct btrfs_ioctl_dev_replace_status_params {
144 __u64 replace_state; /* out, see #define above */
145 __u64 progress_1000; /* out, 0 <= x <= 1000 */
146 __u64 time_started; /* out, seconds since 1-Jan-1970 */
147 __u64 time_stopped; /* out, seconds since 1-Jan-1970 */
148 __u64 num_write_errors; /* out */
149 __u64 num_uncorrectable_read_errors; /* out */
150};
151
152#define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0
153#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1
154#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2
155#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0
156#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1
157#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2
158struct btrfs_ioctl_dev_replace_args {
159 __u64 cmd; /* in */
160 __u64 result; /* out */
161
162 union {
163 struct btrfs_ioctl_dev_replace_start_params start;
164 struct btrfs_ioctl_dev_replace_status_params status;
165 }; /* in/out */
166
167 __u64 spare[64];
168};
169
127struct btrfs_ioctl_dev_info_args { 170struct btrfs_ioctl_dev_info_args {
128 __u64 devid; /* in/out */ 171 __u64 devid; /* in/out */
129 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ 172 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
@@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args {
453 struct btrfs_ioctl_qgroup_limit_args) 496 struct btrfs_ioctl_qgroup_limit_args)
454#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ 497#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
455 struct btrfs_ioctl_get_dev_stats) 498 struct btrfs_ioctl_get_dev_stats)
499#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
500 struct btrfs_ioctl_dev_replace_args)
501
456#endif 502#endif
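The new BTRFS_IOC_DEV_REPLACE ioctl multiplexes start/status/cancel through the args struct defined above. A hedged userspace sketch that queries replace status (header path assumed from btrfs-progs):

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <btrfs/ioctl.h>	/* assumed header carrying the structs above */

int main(int argc, char **argv)
{
	struct btrfs_ioctl_dev_replace_args args;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);	/* mount point */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS;
	if (ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args) < 0) {
		perror("BTRFS_IOC_DEV_REPLACE");
		return 1;
	}
	printf("state=%llu progress=%llu/1000\n",
	       (unsigned long long)args.status.replace_state,
	       (unsigned long long)args.status.progress_1000);
	close(fd);
	return 0;
}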
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
new file mode 100644
index 000000000000..b7816cefbd13
--- /dev/null
+++ b/fs/btrfs/math.h
@@ -0,0 +1,44 @@
1
2/*
3 * Copyright (C) 2012 Fujitsu. All rights reserved.
4 * Written by Miao Xie <miaox@cn.fujitsu.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License v2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public
16 * License along with this program; if not, write to the
17 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 021110-1307, USA.
19 */
20
21#ifndef __BTRFS_MATH_H
22#define __BTRFS_MATH_H
23
24#include <asm/div64.h>
25
26static inline u64 div_factor(u64 num, int factor)
27{
28 if (factor == 10)
29 return num;
30 num *= factor;
31 do_div(num, 10);
32 return num;
33}
34
35static inline u64 div_factor_fine(u64 num, int factor)
36{
37 if (factor == 100)
38 return num;
39 num *= factor;
40 do_div(num, 100);
41 return num;
42}
43
44#endif
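div_factor() and div_factor_fine() scale a u64 by tenths or hundredths, routing the divide through do_div() so it also works on 32-bit targets that lack native 64-by-32 division. A worked userspace equivalent (plain division stands in for do_div):

#include <stdio.h>
#include <stdint.h>

/* Userspace stand-ins: the kernel versions use do_div() so the
 * 64-bit divide also works on 32-bit targets. */
static inline uint64_t div_factor(uint64_t num, int factor)
{
	if (factor == 10)
		return num;
	return num * factor / 10;
}

static inline uint64_t div_factor_fine(uint64_t num, int factor)
{
	if (factor == 100)
		return num;
	return num * factor / 100;
}

int main(void)
{
	uint64_t total = 1000ULL << 20;	/* 1000 MiB */

	/* e.g. "90% of the device" style thresholds */
	printf("9/10 of total:   %llu\n",
	       (unsigned long long)div_factor(total, 9));
	printf("95/100 of total: %llu\n",
	       (unsigned long long)div_factor_fine(total, 95));
	return 0;
}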
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7772f02ba28e..f10731297040 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
211 init_waitqueue_head(&entry->wait); 211 init_waitqueue_head(&entry->wait);
212 INIT_LIST_HEAD(&entry->list); 212 INIT_LIST_HEAD(&entry->list);
213 INIT_LIST_HEAD(&entry->root_extent_list); 213 INIT_LIST_HEAD(&entry->root_extent_list);
214 INIT_LIST_HEAD(&entry->work_list);
215 init_completion(&entry->completion);
214 216
215 trace_btrfs_ordered_extent_add(inode, entry); 217 trace_btrfs_ordered_extent_add(inode, entry);
216 218
@@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
464 wake_up(&entry->wait); 466 wake_up(&entry->wait);
465} 467}
466 468
469static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
470{
471 struct btrfs_ordered_extent *ordered;
472
473 ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
474 btrfs_start_ordered_extent(ordered->inode, ordered, 1);
475 complete(&ordered->completion);
476}
477
467/* 478/*
468 * wait for all the ordered extents in a root. This is done when balancing 479 * wait for all the ordered extents in a root. This is done when balancing
469 * space between drives. 480 * space between drives.
470 */ 481 */
471void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) 482void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
472{ 483{
473 struct list_head splice; 484 struct list_head splice, works;
474 struct list_head *cur; 485 struct list_head *cur;
475 struct btrfs_ordered_extent *ordered; 486 struct btrfs_ordered_extent *ordered, *next;
476 struct inode *inode; 487 struct inode *inode;
477 488
478 INIT_LIST_HEAD(&splice); 489 INIT_LIST_HEAD(&splice);
490 INIT_LIST_HEAD(&works);
479 491
480 spin_lock(&root->fs_info->ordered_extent_lock); 492 spin_lock(&root->fs_info->ordered_extent_lock);
481 list_splice_init(&root->fs_info->ordered_extents, &splice); 493 list_splice_init(&root->fs_info->ordered_extents, &splice);
@@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
494 spin_unlock(&root->fs_info->ordered_extent_lock); 506 spin_unlock(&root->fs_info->ordered_extent_lock);
495 507
496 if (inode) { 508 if (inode) {
497 btrfs_start_ordered_extent(inode, ordered, 1); 509 ordered->flush_work.func = btrfs_run_ordered_extent_work;
498 btrfs_put_ordered_extent(ordered); 510 list_add_tail(&ordered->work_list, &works);
499 if (delay_iput) 511 btrfs_queue_worker(&root->fs_info->flush_workers,
500 btrfs_add_delayed_iput(inode); 512 &ordered->flush_work);
501 else
502 iput(inode);
503 } else { 513 } else {
504 btrfs_put_ordered_extent(ordered); 514 btrfs_put_ordered_extent(ordered);
505 } 515 }
506 516
517 cond_resched();
507 spin_lock(&root->fs_info->ordered_extent_lock); 518 spin_lock(&root->fs_info->ordered_extent_lock);
508 } 519 }
509 spin_unlock(&root->fs_info->ordered_extent_lock); 520 spin_unlock(&root->fs_info->ordered_extent_lock);
521
522 list_for_each_entry_safe(ordered, next, &works, work_list) {
523 list_del_init(&ordered->work_list);
524 wait_for_completion(&ordered->completion);
525
526 inode = ordered->inode;
527 btrfs_put_ordered_extent(ordered);
528 if (delay_iput)
529 btrfs_add_delayed_iput(inode);
530 else
531 iput(inode);
532
533 cond_resched();
534 }
510} 535}
511 536
512/* 537/*
@@ -519,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
519 * extra check to make sure the ordered operation list really is empty 544 * extra check to make sure the ordered operation list really is empty
520 * before we return 545 * before we return
521 */ 546 */
522void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) 547int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
523{ 548{
524 struct btrfs_inode *btrfs_inode; 549 struct btrfs_inode *btrfs_inode;
525 struct inode *inode; 550 struct inode *inode;
526 struct list_head splice; 551 struct list_head splice;
552 struct list_head works;
553 struct btrfs_delalloc_work *work, *next;
554 int ret = 0;
527 555
528 INIT_LIST_HEAD(&splice); 556 INIT_LIST_HEAD(&splice);
557 INIT_LIST_HEAD(&works);
529 558
530 mutex_lock(&root->fs_info->ordered_operations_mutex); 559 mutex_lock(&root->fs_info->ordered_operations_mutex);
531 spin_lock(&root->fs_info->ordered_extent_lock); 560 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -533,6 +562,7 @@ again:
533 list_splice_init(&root->fs_info->ordered_operations, &splice); 562 list_splice_init(&root->fs_info->ordered_operations, &splice);
534 563
535 while (!list_empty(&splice)) { 564 while (!list_empty(&splice)) {
565
536 btrfs_inode = list_entry(splice.next, struct btrfs_inode, 566 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
537 ordered_operations); 567 ordered_operations);
538 568
@@ -549,15 +579,26 @@ again:
549 list_add_tail(&BTRFS_I(inode)->ordered_operations, 579 list_add_tail(&BTRFS_I(inode)->ordered_operations,
550 &root->fs_info->ordered_operations); 580 &root->fs_info->ordered_operations);
551 } 581 }
582
583 if (!inode)
584 continue;
552 spin_unlock(&root->fs_info->ordered_extent_lock); 585 spin_unlock(&root->fs_info->ordered_extent_lock);
553 586
554 if (inode) { 587 work = btrfs_alloc_delalloc_work(inode, wait, 1);
555 if (wait) 588 if (!work) {
556 btrfs_wait_ordered_range(inode, 0, (u64)-1); 589 if (list_empty(&BTRFS_I(inode)->ordered_operations))
557 else 590 list_add_tail(&btrfs_inode->ordered_operations,
558 filemap_flush(inode->i_mapping); 591 &splice);
559 btrfs_add_delayed_iput(inode); 592 spin_lock(&root->fs_info->ordered_extent_lock);
593 list_splice_tail(&splice,
594 &root->fs_info->ordered_operations);
595 spin_unlock(&root->fs_info->ordered_extent_lock);
596 ret = -ENOMEM;
597 goto out;
560 } 598 }
599 list_add_tail(&work->list, &works);
600 btrfs_queue_worker(&root->fs_info->flush_workers,
601 &work->work);
561 602
562 cond_resched(); 603 cond_resched();
563 spin_lock(&root->fs_info->ordered_extent_lock); 604 spin_lock(&root->fs_info->ordered_extent_lock);
@@ -566,7 +607,13 @@ again:
566 goto again; 607 goto again;
567 608
568 spin_unlock(&root->fs_info->ordered_extent_lock); 609 spin_unlock(&root->fs_info->ordered_extent_lock);
610out:
611 list_for_each_entry_safe(work, next, &works, list) {
612 list_del_init(&work->list);
613 btrfs_wait_and_free_delalloc_work(work);
614 }
569 mutex_unlock(&root->fs_info->ordered_operations_mutex); 615 mutex_unlock(&root->fs_info->ordered_operations_mutex);
616 return ret;
570} 617}
571 618
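btrfs_run_ordered_operations() now returns int because queueing the per-inode delalloc work can fail: on a failed btrfs_alloc_delalloc_work() the current inode goes back onto the local splice list, the remainder is spliced back onto the fs-wide list under the lock, and -ENOMEM is returned, while any work already queued is still waited for at out:. A hedged sketch of that drain/requeue-on-failure shape, assuming BSD-style <sys/queue.h> lists and a mutex in place of the kernel primitives:

#include <pthread.h>
#include <stdio.h>
#include <sys/queue.h>

struct item {
	int id;
	TAILQ_ENTRY(item) entries;
};
TAILQ_HEAD(ilist, item);

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct ilist pending = TAILQ_HEAD_INITIALIZER(pending);

/* stands in for btrfs_alloc_delalloc_work(); fails for one item */
static int queue_async_flush(struct item *it)
{
	return it->id == 2 ? -1 : 0;
}

/* move every entry of src to the tail of dst, like list_splice_tail() */
static void splice_lists(struct ilist *dst, struct ilist *src)
{
	struct item *it;

	while ((it = TAILQ_FIRST(src)) != NULL) {
		TAILQ_REMOVE(src, it, entries);
		TAILQ_INSERT_TAIL(dst, it, entries);
	}
}

static int run_pending(void)
{
	struct ilist local = TAILQ_HEAD_INITIALIZER(local);
	struct item *it;

	pthread_mutex_lock(&lock);
	splice_lists(&local, &pending);          /* list_splice_init() */
	while ((it = TAILQ_FIRST(&local)) != NULL) {
		TAILQ_REMOVE(&local, it, entries);
		pthread_mutex_unlock(&lock);

		if (queue_async_flush(it)) {
			/* put the current item back, then return the
			 * remainder to the shared list so nothing is lost */
			TAILQ_INSERT_TAIL(&local, it, entries);
			pthread_mutex_lock(&lock);
			splice_lists(&pending, &local);
			pthread_mutex_unlock(&lock);
			return -1;               /* -ENOMEM in the patch */
		}
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	struct item a = { 1 }, b = { 2 }, c = { 3 };

	TAILQ_INSERT_TAIL(&pending, &a, entries);
	TAILQ_INSERT_TAIL(&pending, &b, entries);
	TAILQ_INSERT_TAIL(&pending, &c, entries);
	printf("run_pending: %d\n", run_pending());   /* fails on id 2 */
	return 0;
}
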
572/* 619/*
@@ -606,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
606 u64 end; 653 u64 end;
607 u64 orig_end; 654 u64 orig_end;
608 struct btrfs_ordered_extent *ordered; 655 struct btrfs_ordered_extent *ordered;
609 int found;
610 656
611 if (start + len < start) { 657 if (start + len < start) {
612 orig_end = INT_LIMIT(loff_t); 658 orig_end = INT_LIMIT(loff_t);
@@ -642,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
642 filemap_fdatawait_range(inode->i_mapping, start, orig_end); 688 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
643 689
644 end = orig_end; 690 end = orig_end;
645 found = 0;
646 while (1) { 691 while (1) {
647 ordered = btrfs_lookup_first_ordered_extent(inode, end); 692 ordered = btrfs_lookup_first_ordered_extent(inode, end);
648 if (!ordered) 693 if (!ordered)
@@ -655,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
655 btrfs_put_ordered_extent(ordered); 700 btrfs_put_ordered_extent(ordered);
656 break; 701 break;
657 } 702 }
658 found++;
659 btrfs_start_ordered_extent(inode, ordered, 1); 703 btrfs_start_ordered_extent(inode, ordered, 1);
660 end = ordered->file_offset; 704 end = ordered->file_offset;
661 btrfs_put_ordered_extent(ordered); 705 btrfs_put_ordered_extent(ordered);
@@ -934,15 +978,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
934 if (last_mod < root->fs_info->last_trans_committed) 978 if (last_mod < root->fs_info->last_trans_committed)
935 return; 979 return;
936 980
937 /*
938 * the transaction is already committing. Just start the IO and
939 * don't bother with all of this list nonsense
940 */
941 if (trans && root->fs_info->running_transaction->blocked) {
942 btrfs_wait_ordered_range(inode, 0, (u64)-1);
943 return;
944 }
945
946 spin_lock(&root->fs_info->ordered_extent_lock); 981 spin_lock(&root->fs_info->ordered_extent_lock);
947 if (list_empty(&BTRFS_I(inode)->ordered_operations)) { 982 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
948 list_add_tail(&BTRFS_I(inode)->ordered_operations, 983 list_add_tail(&BTRFS_I(inode)->ordered_operations,
@@ -959,6 +994,7 @@ int __init ordered_data_init(void)
959 NULL); 994 NULL);
960 if (!btrfs_ordered_extent_cache) 995 if (!btrfs_ordered_extent_cache)
961 return -ENOMEM; 996 return -ENOMEM;
997
962 return 0; 998 return 0;
963} 999}
964 1000
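The btrfs_wait_ordered_range() hunks above drop the write-only found counter, leaving the core loop untouched: look up the last ordered extent at or below end, stop once it falls outside the requested window, otherwise wait on it and move end to just before its file_offset. A toy model of that backwards scan, with the caveat that a sorted array and a linear lookup only approximate the real ordered-extent tree:

/* Toy model of the backwards scan ("waiting" is just a printf here). */
#include <stdio.h>

struct range { unsigned long start, end; };

/* last range starting at or below pos, roughly what
 * btrfs_lookup_first_ordered_extent(inode, end) provides */
static struct range *lookup_le(struct range *r, int n, unsigned long pos)
{
	struct range *best = NULL;
	int i;

	for (i = 0; i < n; i++)
		if (r[i].start <= pos)
			best = &r[i];
	return best;
}

int main(void)
{
	struct range ranges[] = { { 0, 99 }, { 100, 199 }, { 300, 399 } };
	unsigned long start = 50, end = 350;

	while (1) {
		struct range *o = lookup_le(ranges, 3, end);

		if (!o || o->end <= start)
			break;                  /* walked past the window */
		printf("waiting on [%lu, %lu]\n", o->start, o->end);
		if (o->start == 0)
			break;
		end = o->start - 1;             /* step back and rescan */
	}
	return 0;
}
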
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index dd27a0b46a37..f29d4bf5fbe7 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -76,7 +76,7 @@ struct btrfs_ordered_sum {
76 76
77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ 77#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
78 78
79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent 79#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
80 * has done its due diligence in updating 80 * has done its due diligence in updating
81 * the isize. */ 81 * the isize. */
82 82
@@ -128,8 +128,11 @@ struct btrfs_ordered_extent {
128 struct list_head root_extent_list; 128 struct list_head root_extent_list;
129 129
130 struct btrfs_work work; 130 struct btrfs_work work;
131};
132 131
132 struct completion completion;
133 struct btrfs_work flush_work;
134 struct list_head work_list;
135};
133 136
134/* 137/*
135 * calculates the total size you need to allocate for an ordered sum 138 * calculates the total size you need to allocate for an ordered sum
@@ -186,7 +189,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
186int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 189int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
187 struct btrfs_ordered_extent *ordered); 190 struct btrfs_ordered_extent *ordered);
188int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
189void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 192int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
190void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 193void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
191 struct btrfs_root *root, 194 struct btrfs_root *root,
192 struct inode *inode); 195 struct inode *inode);
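
struct btrfs_ordered_extent now embeds its own completion, flush work item, and list linkage, so the object can be queued, tracked, and waited on without any side allocation; the worker recovers the containing extent from the work pointer via container_of(). A self-contained userspace illustration of that embedding trick (the struct layout here is invented for the demo):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work { void (*func)(struct work *); };

struct ordered_extent {
	unsigned long file_offset;
	struct work flush_work;         /* embedded, like in the patch */
};

static void run(struct work *w)
{
	/* recover the object the work item lives inside */
	struct ordered_extent *o =
		container_of(w, struct ordered_extent, flush_work);

	printf("flushing extent at %lu\n", o->file_offset);
}

int main(void)
{
	struct ordered_extent o = { .file_offset = 4096,
				    .flush_work = { .func = run } };

	o.flush_work.func(&o.flush_work);   /* what the worker pool does */
	return 0;
}
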
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5e23684887eb..50d95fd190a5 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
297 case BTRFS_DEV_STATS_KEY: 297 case BTRFS_DEV_STATS_KEY:
298 printk(KERN_INFO "\t\tdevice stats\n"); 298 printk(KERN_INFO "\t\tdevice stats\n");
299 break; 299 break;
300 case BTRFS_DEV_REPLACE_KEY:
301 printk(KERN_INFO "\t\tdev replace\n");
302 break;
300 }; 303 };
301 } 304 }
302} 305}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fe9d02c45f8e..a5c856234323 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -379,6 +379,13 @@ next1:
379 379
380 ret = add_relation_rb(fs_info, found_key.objectid, 380 ret = add_relation_rb(fs_info, found_key.objectid,
381 found_key.offset); 381 found_key.offset);
382 if (ret == -ENOENT) {
383 printk(KERN_WARNING
384 "btrfs: orphan qgroup relation 0x%llx->0x%llx\n",
385 (unsigned long long)found_key.objectid,
386 (unsigned long long)found_key.offset);
387 ret = 0; /* ignore the error */
388 }
382 if (ret) 389 if (ret)
383 goto out; 390 goto out;
384next2: 391next2:
@@ -956,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
956 struct btrfs_fs_info *fs_info, u64 qgroupid) 963 struct btrfs_fs_info *fs_info, u64 qgroupid)
957{ 964{
958 struct btrfs_root *quota_root; 965 struct btrfs_root *quota_root;
966 struct btrfs_qgroup *qgroup;
959 int ret = 0; 967 int ret = 0;
960 968
961 quota_root = fs_info->quota_root; 969 quota_root = fs_info->quota_root;
962 if (!quota_root) 970 if (!quota_root)
963 return -EINVAL; 971 return -EINVAL;
964 972
973 /* check if there are no relations to this qgroup */
974 spin_lock(&fs_info->qgroup_lock);
975 qgroup = find_qgroup_rb(fs_info, qgroupid);
976 if (qgroup) {
977 if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) {
978 spin_unlock(&fs_info->qgroup_lock);
979 return -EBUSY;
980 }
981 }
982 spin_unlock(&fs_info->qgroup_lock);
983
965 ret = del_qgroup_item(trans, quota_root, qgroupid); 984 ret = del_qgroup_item(trans, quota_root, qgroupid);
966 985
967 spin_lock(&fs_info->qgroup_lock); 986 spin_lock(&fs_info->qgroup_lock);
968 del_qgroup_rb(quota_root->fs_info, qgroupid); 987 del_qgroup_rb(quota_root->fs_info, qgroupid);
969
970 spin_unlock(&fs_info->qgroup_lock); 988 spin_unlock(&fs_info->qgroup_lock);
971 989
972 return ret; 990 return ret;
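
The qgroup hunk above closes a correctness hole: a qgroup may only be deleted once it has no remaining parent or member relations, checked under qgroup_lock before del_qgroup_item() runs, otherwise -EBUSY is returned. A compilable miniature of that guard, with counters standing in for the kernel's list_empty() checks and a pthread mutex for the spinlock:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct qgroup {
	int nr_groups;    /* stands in for list_empty(&qgroup->groups) */
	int nr_members;   /* stands in for list_empty(&qgroup->members) */
};

static pthread_mutex_t qgroup_lock = PTHREAD_MUTEX_INITIALIZER;

static int remove_qgroup(struct qgroup *q)
{
	pthread_mutex_lock(&qgroup_lock);
	if (q && (q->nr_groups || q->nr_members)) {
		pthread_mutex_unlock(&qgroup_lock);
		return -EBUSY;    /* still related to other groups */
	}
	pthread_mutex_unlock(&qgroup_lock);
	/* ... proceed with del_qgroup_item() / del_qgroup_rb() ... */
	return 0;
}

int main(void)
{
	struct qgroup q = { .nr_groups = 1 };

	printf("remove: %d\n", remove_qgroup(&q));   /* -EBUSY */
	q.nr_groups = 0;
	printf("remove: %d\n", remove_qgroup(&q));   /* 0 */
	return 0;
}
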
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index a955669519a2..96b93daa0bbb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -27,6 +27,7 @@
27#include "volumes.h" 27#include "volumes.h"
28#include "disk-io.h" 28#include "disk-io.h"
29#include "transaction.h" 29#include "transaction.h"
30#include "dev-replace.h"
30 31
31#undef DEBUG 32#undef DEBUG
32 33
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
323 struct reada_extent *re = NULL; 324 struct reada_extent *re = NULL;
324 struct reada_extent *re_exist = NULL; 325 struct reada_extent *re_exist = NULL;
325 struct btrfs_fs_info *fs_info = root->fs_info; 326 struct btrfs_fs_info *fs_info = root->fs_info;
326 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
327 struct btrfs_bio *bbio = NULL; 327 struct btrfs_bio *bbio = NULL;
328 struct btrfs_device *dev; 328 struct btrfs_device *dev;
329 struct btrfs_device *prev_dev; 329 struct btrfs_device *prev_dev;
@@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
332 int nzones = 0; 332 int nzones = 0;
333 int i; 333 int i;
334 unsigned long index = logical >> PAGE_CACHE_SHIFT; 334 unsigned long index = logical >> PAGE_CACHE_SHIFT;
335 int dev_replace_is_ongoing;
335 336
336 spin_lock(&fs_info->reada_lock); 337 spin_lock(&fs_info->reada_lock);
337 re = radix_tree_lookup(&fs_info->reada_tree, index); 338 re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
358 * map block 359 * map block
359 */ 360 */
360 length = blocksize; 361 length = blocksize;
361 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); 362 ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
363 &bbio, 0);
362 if (ret || !bbio || length < blocksize) 364 if (ret || !bbio || length < blocksize)
363 goto error; 365 goto error;
364 366
@@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
393 } 395 }
394 396
395 /* insert extent in reada_tree + all per-device trees, all or nothing */ 397 /* insert extent in reada_tree + all per-device trees, all or nothing */
398 btrfs_dev_replace_lock(&fs_info->dev_replace);
396 spin_lock(&fs_info->reada_lock); 399 spin_lock(&fs_info->reada_lock);
397 ret = radix_tree_insert(&fs_info->reada_tree, index, re); 400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
398 if (ret == -EEXIST) { 401 if (ret == -EEXIST) {
@@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
400 BUG_ON(!re_exist); 403 BUG_ON(!re_exist);
401 re_exist->refcnt++; 404 re_exist->refcnt++;
402 spin_unlock(&fs_info->reada_lock); 405 spin_unlock(&fs_info->reada_lock);
406 btrfs_dev_replace_unlock(&fs_info->dev_replace);
403 goto error; 407 goto error;
404 } 408 }
405 if (ret) { 409 if (ret) {
406 spin_unlock(&fs_info->reada_lock); 410 spin_unlock(&fs_info->reada_lock);
411 btrfs_dev_replace_unlock(&fs_info->dev_replace);
407 goto error; 412 goto error;
408 } 413 }
409 prev_dev = NULL; 414 prev_dev = NULL;
415 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
416 &fs_info->dev_replace);
410 for (i = 0; i < nzones; ++i) { 417 for (i = 0; i < nzones; ++i) {
411 dev = bbio->stripes[i].dev; 418 dev = bbio->stripes[i].dev;
412 if (dev == prev_dev) { 419 if (dev == prev_dev) {
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
419 */ 426 */
420 continue; 427 continue;
421 } 428 }
429 if (!dev->bdev) {
430 /* cannot read ahead on missing device */
431 continue;
432 }
433 if (dev_replace_is_ongoing &&
434 dev == fs_info->dev_replace.tgtdev) {
435 /*
436 * as this device is selected for reading only as
437 * a last resort, skip it for read ahead.
438 */
439 continue;
440 }
422 prev_dev = dev; 441 prev_dev = dev;
423 ret = radix_tree_insert(&dev->reada_extents, index, re); 442 ret = radix_tree_insert(&dev->reada_extents, index, re);
424 if (ret) { 443 if (ret) {
425 while (--i >= 0) { 444 while (--i >= 0) {
426 dev = bbio->stripes[i].dev; 445 dev = bbio->stripes[i].dev;
427 BUG_ON(dev == NULL); 446 BUG_ON(dev == NULL);
447 /* ignore whether the entry was inserted */
428 radix_tree_delete(&dev->reada_extents, index); 448 radix_tree_delete(&dev->reada_extents, index);
429 } 449 }
430 BUG_ON(fs_info == NULL); 450 BUG_ON(fs_info == NULL);
431 radix_tree_delete(&fs_info->reada_tree, index); 451 radix_tree_delete(&fs_info->reada_tree, index);
432 spin_unlock(&fs_info->reada_lock); 452 spin_unlock(&fs_info->reada_lock);
453 btrfs_dev_replace_unlock(&fs_info->dev_replace);
433 goto error; 454 goto error;
434 } 455 }
435 } 456 }
436 spin_unlock(&fs_info->reada_lock); 457 spin_unlock(&fs_info->reada_lock);
458 btrfs_dev_replace_unlock(&fs_info->dev_replace);
437 459
438 kfree(bbio); 460 kfree(bbio);
439 return re; 461 return re;
@@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
915 generation = btrfs_header_generation(node); 937 generation = btrfs_header_generation(node);
916 free_extent_buffer(node); 938 free_extent_buffer(node);
917 939
918 reada_add_block(rc, start, &max_key, level, generation); 940 if (reada_add_block(rc, start, &max_key, level, generation)) {
941 kfree(rc);
942 return ERR_PTR(-ENOMEM);
943 }
919 944
920 reada_start_machine(root->fs_info); 945 reada_start_machine(root->fs_info);
921 946
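The reada_find_extent() changes add a per-stripe filter: duplicate zones on one device, devices with no block device attached, and the dev-replace target (which is only read from as a last resort) are all skipped when registering readahead. The standalone loop below mirrors just that filtering logic; struct device here is a stand-in, not the kernel's btrfs_device:

#include <stdbool.h>
#include <stdio.h>

struct device { int id; bool has_bdev; };

int main(void)
{
	struct device d0 = { 0, true }, d1 = { 1, false }, d2 = { 2, true };
	struct device *stripes[] = { &d0, &d0, &d1, &d2 };
	struct device *replace_tgt = &d2;   /* dev_replace.tgtdev */
	bool replace_ongoing = true;
	struct device *prev = NULL;
	int i;

	for (i = 0; i < 4; i++) {
		struct device *dev = stripes[i];

		if (dev == prev)
			continue;   /* duplicate zone on one device */
		if (!dev->has_bdev)
			continue;   /* cannot read ahead on a missing device */
		if (replace_ongoing && dev == replace_tgt)
			continue;   /* target is a last-resort read source */
		prev = dev;
		printf("readahead registered on device %d\n", dev->id);
	}
	return 0;               /* only device 0 qualifies here */
}
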
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 776f0aa128fc..300e09ac3659 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2025 struct btrfs_root_item *root_item; 2025 struct btrfs_root_item *root_item;
2026 struct btrfs_path *path; 2026 struct btrfs_path *path;
2027 struct extent_buffer *leaf; 2027 struct extent_buffer *leaf;
2028 unsigned long nr;
2029 int level; 2028 int level;
2030 int max_level; 2029 int max_level;
2031 int replaced = 0; 2030 int replaced = 0;
@@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2074 BUG_ON(IS_ERR(trans)); 2073 BUG_ON(IS_ERR(trans));
2075 trans->block_rsv = rc->block_rsv; 2074 trans->block_rsv = rc->block_rsv;
2076 2075
2077 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); 2076 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2077 BTRFS_RESERVE_FLUSH_ALL);
2078 if (ret) { 2078 if (ret) {
2079 BUG_ON(ret != -EAGAIN); 2079 BUG_ON(ret != -EAGAIN);
2080 ret = btrfs_commit_transaction(trans, root); 2080 ret = btrfs_commit_transaction(trans, root);
@@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2125 path->slots[level]); 2125 path->slots[level]);
2126 root_item->drop_level = level; 2126 root_item->drop_level = level;
2127 2127
2128 nr = trans->blocks_used;
2129 btrfs_end_transaction_throttle(trans, root); 2128 btrfs_end_transaction_throttle(trans, root);
2130 2129
2131 btrfs_btree_balance_dirty(root, nr); 2130 btrfs_btree_balance_dirty(root);
2132 2131
2133 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2132 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2134 invalidate_extent_cache(root, &key, &next_key); 2133 invalidate_extent_cache(root, &key, &next_key);
@@ -2155,10 +2154,9 @@ out:
2155 btrfs_update_reloc_root(trans, root); 2154 btrfs_update_reloc_root(trans, root);
2156 } 2155 }
2157 2156
2158 nr = trans->blocks_used;
2159 btrfs_end_transaction_throttle(trans, root); 2157 btrfs_end_transaction_throttle(trans, root);
2160 2158
2161 btrfs_btree_balance_dirty(root, nr); 2159 btrfs_btree_balance_dirty(root);
2162 2160
2163 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2161 if (replaced && rc->stage == UPDATE_DATA_PTRS)
2164 invalidate_extent_cache(root, &key, &next_key); 2162 invalidate_extent_cache(root, &key, &next_key);
@@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2184again: 2182again:
2185 if (!err) { 2183 if (!err) {
2186 num_bytes = rc->merging_rsv_size; 2184 num_bytes = rc->merging_rsv_size;
2187 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2185 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2186 BTRFS_RESERVE_FLUSH_ALL);
2188 if (ret) 2187 if (ret)
2189 err = ret; 2188 err = ret;
2190 } 2189 }
@@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2459 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2458 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2460 2459
2461 trans->block_rsv = rc->block_rsv; 2460 trans->block_rsv = rc->block_rsv;
2462 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); 2461 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2462 BTRFS_RESERVE_FLUSH_ALL);
2463 if (ret) { 2463 if (ret) {
2464 if (ret == -EAGAIN) 2464 if (ret == -EAGAIN)
2465 rc->commit_transaction = 1; 2465 rc->commit_transaction = 1;
@@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3259 struct btrfs_path *path; 3259 struct btrfs_path *path;
3260 struct btrfs_root *root = fs_info->tree_root; 3260 struct btrfs_root *root = fs_info->tree_root;
3261 struct btrfs_trans_handle *trans; 3261 struct btrfs_trans_handle *trans;
3262 unsigned long nr;
3263 int ret = 0; 3262 int ret = 0;
3264 3263
3265 if (inode) 3264 if (inode)
@@ -3293,9 +3292,8 @@ truncate:
3293 ret = btrfs_truncate_free_space_cache(root, trans, path, inode); 3292 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3294 3293
3295 btrfs_free_path(path); 3294 btrfs_free_path(path);
3296 nr = trans->blocks_used;
3297 btrfs_end_transaction(trans, root); 3295 btrfs_end_transaction(trans, root);
3298 btrfs_btree_balance_dirty(root, nr); 3296 btrfs_btree_balance_dirty(root);
3299out: 3297out:
3300 iput(inode); 3298 iput(inode);
3301 return ret; 3299 return ret;
@@ -3685,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)
3685 * is no reservation in transaction handle. 3683 * is no reservation in transaction handle.
3686 */ 3684 */
3687 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, 3685 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3688 rc->extent_root->nodesize * 256); 3686 rc->extent_root->nodesize * 256,
3687 BTRFS_RESERVE_FLUSH_ALL);
3689 if (ret) 3688 if (ret)
3690 return ret; 3689 return ret;
3691 3690
@@ -3711,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3711 struct btrfs_trans_handle *trans = NULL; 3710 struct btrfs_trans_handle *trans = NULL;
3712 struct btrfs_path *path; 3711 struct btrfs_path *path;
3713 struct btrfs_extent_item *ei; 3712 struct btrfs_extent_item *ei;
3714 unsigned long nr;
3715 u64 flags; 3713 u64 flags;
3716 u32 item_size; 3714 u32 item_size;
3717 int ret; 3715 int ret;
@@ -3828,9 +3826,8 @@ restart:
3828 ret = btrfs_commit_transaction(trans, rc->extent_root); 3826 ret = btrfs_commit_transaction(trans, rc->extent_root);
3829 BUG_ON(ret); 3827 BUG_ON(ret);
3830 } else { 3828 } else {
3831 nr = trans->blocks_used;
3832 btrfs_end_transaction_throttle(trans, rc->extent_root); 3829 btrfs_end_transaction_throttle(trans, rc->extent_root);
3833 btrfs_btree_balance_dirty(rc->extent_root, nr); 3830 btrfs_btree_balance_dirty(rc->extent_root);
3834 } 3831 }
3835 trans = NULL; 3832 trans = NULL;
3836 3833
@@ -3860,9 +3857,8 @@ restart:
3860 GFP_NOFS); 3857 GFP_NOFS);
3861 3858
3862 if (trans) { 3859 if (trans) {
3863 nr = trans->blocks_used;
3864 btrfs_end_transaction_throttle(trans, rc->extent_root); 3860 btrfs_end_transaction_throttle(trans, rc->extent_root);
3865 btrfs_btree_balance_dirty(rc->extent_root, nr); 3861 btrfs_btree_balance_dirty(rc->extent_root);
3866 } 3862 }
3867 3863
3868 if (!err) { 3864 if (!err) {
@@ -3941,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3941 struct btrfs_trans_handle *trans; 3937 struct btrfs_trans_handle *trans;
3942 struct btrfs_root *root; 3938 struct btrfs_root *root;
3943 struct btrfs_key key; 3939 struct btrfs_key key;
3944 unsigned long nr;
3945 u64 objectid = BTRFS_FIRST_FREE_OBJECTID; 3940 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3946 int err = 0; 3941 int err = 0;
3947 3942
@@ -3969,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3969 3964
3970 err = btrfs_orphan_add(trans, inode); 3965 err = btrfs_orphan_add(trans, inode);
3971out: 3966out:
3972 nr = trans->blocks_used;
3973 btrfs_end_transaction(trans, root); 3967 btrfs_end_transaction(trans, root);
3974 btrfs_btree_balance_dirty(root, nr); 3968 btrfs_btree_balance_dirty(root);
3975 if (err) { 3969 if (err) {
3976 if (inode) 3970 if (inode)
3977 iput(inode); 3971 iput(inode);
@@ -4057,7 +4051,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4057 (unsigned long long)rc->block_group->key.objectid, 4051 (unsigned long long)rc->block_group->key.objectid,
4058 (unsigned long long)rc->block_group->flags); 4052 (unsigned long long)rc->block_group->flags);
4059 4053
4060 btrfs_start_delalloc_inodes(fs_info->tree_root, 0); 4054 ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4055 if (ret < 0) {
4056 err = ret;
4057 goto out;
4058 }
4061 btrfs_wait_ordered_extents(fs_info->tree_root, 0); 4059 btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4062 4060
4063 while (1) { 4061 while (1) {
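
Every btrfs_block_rsv_add()/btrfs_block_rsv_refill() call site in the relocation hunks gains an explicit BTRFS_RESERVE_FLUSH_ALL argument, making the flush policy a per-caller decision instead of an implicit default. The stub below only assumes the rough shape of that enum (the real definition lives in ctree.h); it exists to show why widening the signature is the whole change at these call sites:

#include <stdio.h>

/* assumed shape of the flush-policy enum, names illustrative */
enum flush_mode {
	RESERVE_NO_FLUSH,      /* fail fast, never write anything back */
	RESERVE_FLUSH_LIMIT,   /* flush a bounded amount */
	RESERVE_FLUSH_ALL,     /* flush as much as needed */
};

/* stub for the widened reservation call */
static int block_rsv_add(unsigned long bytes, enum flush_mode flush)
{
	printf("reserve %lu bytes, policy %d\n", bytes, flush);
	return 0;   /* -ENOSPC / -EAGAIN in the real code */
}

int main(void)
{
	/* relocation call sites now pass the FLUSH_ALL policy explicitly */
	return block_rsv_add(16384, RESERVE_FLUSH_ALL);
}
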
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index eb923d087da7..668af537a3ea 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
548 struct btrfs_root_item *item = &root->root_item; 548 struct btrfs_root_item *item = &root->root_item;
549 struct timespec ct = CURRENT_TIME; 549 struct timespec ct = CURRENT_TIME;
550 550
551 spin_lock(&root->root_times_lock); 551 spin_lock(&root->root_item_lock);
552 item->ctransid = cpu_to_le64(trans->transid); 552 item->ctransid = cpu_to_le64(trans->transid);
553 item->ctime.sec = cpu_to_le64(ct.tv_sec); 553 item->ctime.sec = cpu_to_le64(ct.tv_sec);
554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec); 554 item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
555 spin_unlock(&root->root_times_lock); 555 spin_unlock(&root->root_item_lock);
556} 556}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 27892f67e69b..bdbb94f245c9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2011 STRATO. All rights reserved. 2 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
3 * 3 *
4 * This program is free software; you can redistribute it and/or 4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 5 * modify it under the terms of the GNU General Public
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "backref.h" 26#include "backref.h"
27#include "extent_io.h" 27#include "extent_io.h"
28#include "dev-replace.h"
28#include "check-integrity.h" 29#include "check-integrity.h"
29#include "rcu-string.h" 30#include "rcu-string.h"
30 31
@@ -42,10 +43,23 @@
42 */ 43 */
43 44
44struct scrub_block; 45struct scrub_block;
45struct scrub_dev; 46struct scrub_ctx;
46 47
47#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */ 48/*
48#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */ 49 * the following three values only influence the performance.
50 * The last one configures the number of parallel and outstanding I/O
51 * operations. The first two values configure an upper limit for the number
52 * of (dynamically allocated) pages that are added to a bio.
53 */
54#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
55#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
56#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
57
58/*
59 * the following value times PAGE_SIZE needs to be large enough to match the
60 * largest node/leaf/sector size that shall be supported.
61 * Values larger than BTRFS_STRIPE_LEN are not supported.
62 */
49#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 63#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
50 64
51struct scrub_page { 65struct scrub_page {
@@ -56,6 +70,8 @@ struct scrub_page {
56 u64 generation; 70 u64 generation;
57 u64 logical; 71 u64 logical;
58 u64 physical; 72 u64 physical;
73 u64 physical_for_dev_replace;
74 atomic_t ref_count;
59 struct { 75 struct {
60 unsigned int mirror_num:8; 76 unsigned int mirror_num:8;
61 unsigned int have_csum:1; 77 unsigned int have_csum:1;
@@ -66,23 +82,28 @@ struct scrub_page {
66 82
67struct scrub_bio { 83struct scrub_bio {
68 int index; 84 int index;
69 struct scrub_dev *sdev; 85 struct scrub_ctx *sctx;
86 struct btrfs_device *dev;
70 struct bio *bio; 87 struct bio *bio;
71 int err; 88 int err;
72 u64 logical; 89 u64 logical;
73 u64 physical; 90 u64 physical;
74 struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; 91#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
92 struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
93#else
94 struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
95#endif
75 int page_count; 96 int page_count;
76 int next_free; 97 int next_free;
77 struct btrfs_work work; 98 struct btrfs_work work;
78}; 99};
79 100
80struct scrub_block { 101struct scrub_block {
81 struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 102 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
82 int page_count; 103 int page_count;
83 atomic_t outstanding_pages; 104 atomic_t outstanding_pages;
84 atomic_t ref_count; /* free mem on transition to zero */ 105 atomic_t ref_count; /* free mem on transition to zero */
85 struct scrub_dev *sdev; 106 struct scrub_ctx *sctx;
86 struct { 107 struct {
87 unsigned int header_error:1; 108 unsigned int header_error:1;
88 unsigned int checksum_error:1; 109 unsigned int checksum_error:1;
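
The scrub_bio hunk above sizes the single pagev[] array at compile time to the larger of the read and write limits, so one structure serves both paths without wasting slots. The same preprocessor pattern in isolation:

#include <stdio.h>

#define PAGES_PER_RD_BIO 32
#define PAGES_PER_WR_BIO 32

struct bio_sketch {
/* pick whichever limit is larger, resolved by the preprocessor */
#if PAGES_PER_WR_BIO >= PAGES_PER_RD_BIO
	void *pagev[PAGES_PER_WR_BIO];
#else
	void *pagev[PAGES_PER_RD_BIO];
#endif
};

int main(void)
{
	printf("pagev slots: %zu\n",
	       sizeof(((struct bio_sketch *)0)->pagev) / sizeof(void *));
	return 0;
}
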
@@ -91,23 +112,35 @@ struct scrub_block {
91 }; 112 };
92}; 113};
93 114
94struct scrub_dev { 115struct scrub_wr_ctx {
95 struct scrub_bio *bios[SCRUB_BIOS_PER_DEV]; 116 struct scrub_bio *wr_curr_bio;
96 struct btrfs_device *dev; 117 struct btrfs_device *tgtdev;
118 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
119 atomic_t flush_all_writes;
120 struct mutex wr_lock;
121};
122
123struct scrub_ctx {
124 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
125 struct btrfs_root *dev_root;
97 int first_free; 126 int first_free;
98 int curr; 127 int curr;
99 atomic_t in_flight; 128 atomic_t bios_in_flight;
100 atomic_t fixup_cnt; 129 atomic_t workers_pending;
101 spinlock_t list_lock; 130 spinlock_t list_lock;
102 wait_queue_head_t list_wait; 131 wait_queue_head_t list_wait;
103 u16 csum_size; 132 u16 csum_size;
104 struct list_head csum_list; 133 struct list_head csum_list;
105 atomic_t cancel_req; 134 atomic_t cancel_req;
106 int readonly; 135 int readonly;
107 int pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */ 136 int pages_per_rd_bio;
108 u32 sectorsize; 137 u32 sectorsize;
109 u32 nodesize; 138 u32 nodesize;
110 u32 leafsize; 139 u32 leafsize;
140
141 int is_dev_replace;
142 struct scrub_wr_ctx wr_ctx;
143
111 /* 144 /*
112 * statistics 145 * statistics
113 */ 146 */
@@ -116,13 +149,23 @@ struct scrub_dev {
116}; 149};
117 150
118struct scrub_fixup_nodatasum { 151struct scrub_fixup_nodatasum {
119 struct scrub_dev *sdev; 152 struct scrub_ctx *sctx;
153 struct btrfs_device *dev;
120 u64 logical; 154 u64 logical;
121 struct btrfs_root *root; 155 struct btrfs_root *root;
122 struct btrfs_work work; 156 struct btrfs_work work;
123 int mirror_num; 157 int mirror_num;
124}; 158};
125 159
160struct scrub_copy_nocow_ctx {
161 struct scrub_ctx *sctx;
162 u64 logical;
163 u64 len;
164 int mirror_num;
165 u64 physical_for_dev_replace;
166 struct btrfs_work work;
167};
168
126struct scrub_warning { 169struct scrub_warning {
127 struct btrfs_path *path; 170 struct btrfs_path *path;
128 u64 extent_item_size; 171 u64 extent_item_size;
@@ -137,15 +180,20 @@ struct scrub_warning {
137}; 180};
138 181
139 182
183static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
184static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
185static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
186static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
140static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); 187static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
141static int scrub_setup_recheck_block(struct scrub_dev *sdev, 188static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
142 struct btrfs_mapping_tree *map_tree, 189 struct btrfs_fs_info *fs_info,
190 struct scrub_block *original_sblock,
143 u64 length, u64 logical, 191 u64 length, u64 logical,
144 struct scrub_block *sblock); 192 struct scrub_block *sblocks_for_recheck);
145static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 193static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
146 struct scrub_block *sblock, int is_metadata, 194 struct scrub_block *sblock, int is_metadata,
147 int have_csum, u8 *csum, u64 generation, 195 int have_csum, u8 *csum, u64 generation,
148 u16 csum_size); 196 u16 csum_size);
149static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 197static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
150 struct scrub_block *sblock, 198 struct scrub_block *sblock,
151 int is_metadata, int have_csum, 199 int is_metadata, int have_csum,
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
158static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 206static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
159 struct scrub_block *sblock_good, 207 struct scrub_block *sblock_good,
160 int page_num, int force_write); 208 int page_num, int force_write);
209static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
210static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
211 int page_num);
161static int scrub_checksum_data(struct scrub_block *sblock); 212static int scrub_checksum_data(struct scrub_block *sblock);
162static int scrub_checksum_tree_block(struct scrub_block *sblock); 213static int scrub_checksum_tree_block(struct scrub_block *sblock);
163static int scrub_checksum_super(struct scrub_block *sblock); 214static int scrub_checksum_super(struct scrub_block *sblock);
164static void scrub_block_get(struct scrub_block *sblock); 215static void scrub_block_get(struct scrub_block *sblock);
165static void scrub_block_put(struct scrub_block *sblock); 216static void scrub_block_put(struct scrub_block *sblock);
166static int scrub_add_page_to_bio(struct scrub_dev *sdev, 217static void scrub_page_get(struct scrub_page *spage);
167 struct scrub_page *spage); 218static void scrub_page_put(struct scrub_page *spage);
168static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 219static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
169 u64 physical, u64 flags, u64 gen, int mirror_num, 220 struct scrub_page *spage);
170 u8 *csum, int force); 221static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
222 u64 physical, struct btrfs_device *dev, u64 flags,
223 u64 gen, int mirror_num, u8 *csum, int force,
224 u64 physical_for_dev_replace);
171static void scrub_bio_end_io(struct bio *bio, int err); 225static void scrub_bio_end_io(struct bio *bio, int err);
172static void scrub_bio_end_io_worker(struct btrfs_work *work); 226static void scrub_bio_end_io_worker(struct btrfs_work *work);
173static void scrub_block_complete(struct scrub_block *sblock); 227static void scrub_block_complete(struct scrub_block *sblock);
228static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
229 u64 extent_logical, u64 extent_len,
230 u64 *extent_physical,
231 struct btrfs_device **extent_dev,
232 int *extent_mirror_num);
233static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
234 struct scrub_wr_ctx *wr_ctx,
235 struct btrfs_fs_info *fs_info,
236 struct btrfs_device *dev,
237 int is_dev_replace);
238static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
239static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
240 struct scrub_page *spage);
241static void scrub_wr_submit(struct scrub_ctx *sctx);
242static void scrub_wr_bio_end_io(struct bio *bio, int err);
243static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
244static int write_page_nocow(struct scrub_ctx *sctx,
245 u64 physical_for_dev_replace, struct page *page);
246static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
247 void *ctx);
248static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
249 int mirror_num, u64 physical_for_dev_replace);
250static void copy_nocow_pages_worker(struct btrfs_work *work);
251
252
253static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
254{
255 atomic_inc(&sctx->bios_in_flight);
256}
257
258static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
259{
260 atomic_dec(&sctx->bios_in_flight);
261 wake_up(&sctx->list_wait);
262}
263
264/*
265 * used for workers that require transaction commits (i.e., for the
266 * NOCOW case)
267 */
268static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
269{
270 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
271
272 /*
273 * increment scrubs_running to prevent cancel requests from
274 * completing as long as a worker is running. we must also
275 * increment scrubs_paused to prevent deadlocking on pause
276 * requests used for transactions commits (as the worker uses a
277 * transaction context). it is safe to regard the worker
278 * as paused for all matters practical. effectively, we only
279 * avoid cancellation requests from completing.
280 */
281 mutex_lock(&fs_info->scrub_lock);
282 atomic_inc(&fs_info->scrubs_running);
283 atomic_inc(&fs_info->scrubs_paused);
284 mutex_unlock(&fs_info->scrub_lock);
285 atomic_inc(&sctx->workers_pending);
286}
174 287
288/* used for workers that require transaction commits */
289static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
290{
291 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
175 292
176static void scrub_free_csums(struct scrub_dev *sdev) 293 /*
294 * see scrub_pending_trans_workers_inc() why we're pretending
295 * to be paused in the scrub counters
296 */
297 mutex_lock(&fs_info->scrub_lock);
298 atomic_dec(&fs_info->scrubs_running);
299 atomic_dec(&fs_info->scrubs_paused);
300 mutex_unlock(&fs_info->scrub_lock);
301 atomic_dec(&sctx->workers_pending);
302 wake_up(&fs_info->scrub_pause_wait);
303 wake_up(&sctx->list_wait);
304}
305
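scrub_pending_trans_workers_inc()/dec() factor out the counter choreography that scrub_fixup_nodatasum() used to open-code: bump scrubs_running so cancellation cannot complete under a live worker, bump scrubs_paused so transaction commits cannot deadlock on it, and wake any waiters on the way down. A userspace analogue of those paired helpers, with one condition variable standing in for the two kernel wait queues:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t idle = PTHREAD_COND_INITIALIZER;
static int running, paused;

static void workers_inc(void)
{
	pthread_mutex_lock(&lock);
	running++;   /* block cancellation while a worker runs */
	paused++;    /* count as paused so commits don't deadlock */
	pthread_mutex_unlock(&lock);
}

static void workers_dec(void)
{
	pthread_mutex_lock(&lock);
	running--;
	paused--;
	pthread_cond_broadcast(&idle);   /* the wake_up() calls above */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	workers_inc();
	/* ... worker body that may join a transaction commit ... */
	workers_dec();
	return 0;
}
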
306static void scrub_free_csums(struct scrub_ctx *sctx)
177{ 307{
178 while (!list_empty(&sdev->csum_list)) { 308 while (!list_empty(&sctx->csum_list)) {
179 struct btrfs_ordered_sum *sum; 309 struct btrfs_ordered_sum *sum;
180 sum = list_first_entry(&sdev->csum_list, 310 sum = list_first_entry(&sctx->csum_list,
181 struct btrfs_ordered_sum, list); 311 struct btrfs_ordered_sum, list);
182 list_del(&sum->list); 312 list_del(&sum->list);
183 kfree(sum); 313 kfree(sum);
184 } 314 }
185} 315}
186 316
187static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) 317static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
188{ 318{
189 int i; 319 int i;
190 320
191 if (!sdev) 321 if (!sctx)
192 return; 322 return;
193 323
324 scrub_free_wr_ctx(&sctx->wr_ctx);
325
194 /* this can happen when scrub is cancelled */ 326 /* this can happen when scrub is cancelled */
195 if (sdev->curr != -1) { 327 if (sctx->curr != -1) {
196 struct scrub_bio *sbio = sdev->bios[sdev->curr]; 328 struct scrub_bio *sbio = sctx->bios[sctx->curr];
197 329
198 for (i = 0; i < sbio->page_count; i++) { 330 for (i = 0; i < sbio->page_count; i++) {
199 BUG_ON(!sbio->pagev[i]); 331 WARN_ON(!sbio->pagev[i]->page);
200 BUG_ON(!sbio->pagev[i]->page);
201 scrub_block_put(sbio->pagev[i]->sblock); 332 scrub_block_put(sbio->pagev[i]->sblock);
202 } 333 }
203 bio_put(sbio->bio); 334 bio_put(sbio->bio);
204 } 335 }
205 336
206 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 337 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
207 struct scrub_bio *sbio = sdev->bios[i]; 338 struct scrub_bio *sbio = sctx->bios[i];
208 339
209 if (!sbio) 340 if (!sbio)
210 break; 341 break;
211 kfree(sbio); 342 kfree(sbio);
212 } 343 }
213 344
214 scrub_free_csums(sdev); 345 scrub_free_csums(sctx);
215 kfree(sdev); 346 kfree(sctx);
216} 347}
217 348
218static noinline_for_stack 349static noinline_for_stack
219struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) 350struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
220{ 351{
221 struct scrub_dev *sdev; 352 struct scrub_ctx *sctx;
222 int i; 353 int i;
223 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 354 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
224 int pages_per_bio; 355 int pages_per_rd_bio;
356 int ret;
225 357
226 pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO, 358 /*
227 bio_get_nr_vecs(dev->bdev)); 359 * the setting of pages_per_rd_bio is correct for scrub but might
228 sdev = kzalloc(sizeof(*sdev), GFP_NOFS); 360 * be wrong for the dev_replace code where we might read from
229 if (!sdev) 361 * different devices in the initial huge bios. However, that
362 * code is able to correctly handle the case when adding a page
363 * to a bio fails.
364 */
365 if (dev->bdev)
366 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
367 bio_get_nr_vecs(dev->bdev));
368 else
369 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
370 sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
371 if (!sctx)
230 goto nomem; 372 goto nomem;
231 sdev->dev = dev; 373 sctx->is_dev_replace = is_dev_replace;
232 sdev->pages_per_bio = pages_per_bio; 374 sctx->pages_per_rd_bio = pages_per_rd_bio;
233 sdev->curr = -1; 375 sctx->curr = -1;
234 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 376 sctx->dev_root = dev->dev_root;
377 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
235 struct scrub_bio *sbio; 378 struct scrub_bio *sbio;
236 379
237 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 380 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
238 if (!sbio) 381 if (!sbio)
239 goto nomem; 382 goto nomem;
240 sdev->bios[i] = sbio; 383 sctx->bios[i] = sbio;
241 384
242 sbio->index = i; 385 sbio->index = i;
243 sbio->sdev = sdev; 386 sbio->sctx = sctx;
244 sbio->page_count = 0; 387 sbio->page_count = 0;
245 sbio->work.func = scrub_bio_end_io_worker; 388 sbio->work.func = scrub_bio_end_io_worker;
246 389
247 if (i != SCRUB_BIOS_PER_DEV-1) 390 if (i != SCRUB_BIOS_PER_SCTX - 1)
248 sdev->bios[i]->next_free = i + 1; 391 sctx->bios[i]->next_free = i + 1;
249 else 392 else
250 sdev->bios[i]->next_free = -1; 393 sctx->bios[i]->next_free = -1;
251 } 394 }
252 sdev->first_free = 0; 395 sctx->first_free = 0;
253 sdev->nodesize = dev->dev_root->nodesize; 396 sctx->nodesize = dev->dev_root->nodesize;
254 sdev->leafsize = dev->dev_root->leafsize; 397 sctx->leafsize = dev->dev_root->leafsize;
255 sdev->sectorsize = dev->dev_root->sectorsize; 398 sctx->sectorsize = dev->dev_root->sectorsize;
256 atomic_set(&sdev->in_flight, 0); 399 atomic_set(&sctx->bios_in_flight, 0);
257 atomic_set(&sdev->fixup_cnt, 0); 400 atomic_set(&sctx->workers_pending, 0);
258 atomic_set(&sdev->cancel_req, 0); 401 atomic_set(&sctx->cancel_req, 0);
259 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); 402 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
260 INIT_LIST_HEAD(&sdev->csum_list); 403 INIT_LIST_HEAD(&sctx->csum_list);
261 404
262 spin_lock_init(&sdev->list_lock); 405 spin_lock_init(&sctx->list_lock);
263 spin_lock_init(&sdev->stat_lock); 406 spin_lock_init(&sctx->stat_lock);
264 init_waitqueue_head(&sdev->list_wait); 407 init_waitqueue_head(&sctx->list_wait);
265 return sdev; 408
409 ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
410 fs_info->dev_replace.tgtdev, is_dev_replace);
411 if (ret) {
412 scrub_free_ctx(sctx);
413 return ERR_PTR(ret);
414 }
415 return sctx;
266 416
267nomem: 417nomem:
268 scrub_free_dev(sdev); 418 scrub_free_ctx(sctx);
269 return ERR_PTR(-ENOMEM); 419 return ERR_PTR(-ENOMEM);
270} 420}
271 421
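scrub_setup_ctx() keeps the classic all-or-nothing allocation shape: any failure jumps to the nomem path, scrub_free_ctx() tears down whatever subset of the 64 bios was already allocated, and the caller only ever sees a valid context or ERR_PTR(). A minimal sketch of that unwind pattern, assuming plain malloc/free in place of the kernel allocators:

#include <stdlib.h>

#define NBIOS 64   /* mirrors SCRUB_BIOS_PER_SCTX */

struct ctx { void *bios[NBIOS]; };

static struct ctx *setup_ctx(void)
{
	struct ctx *c = calloc(1, sizeof(*c));
	int i;

	if (!c)
		return NULL;
	for (i = 0; i < NBIOS; i++) {
		c->bios[i] = malloc(128);
		if (!c->bios[i])
			goto nomem;   /* partial state reaches one exit */
	}
	return c;

nomem:
	for (i = 0; i < NBIOS; i++)
		free(c->bios[i]);     /* free(NULL) is a no-op */
	free(c);
	return NULL;                  /* ERR_PTR(-ENOMEM) in the kernel */
}

int main(void)
{
	struct ctx *c = setup_ctx();
	int i;

	if (!c)
		return 1;
	for (i = 0; i < NBIOS; i++)
		free(c->bios[i]);
	free(c);
	return 0;
}
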
272static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) 422static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
423 void *warn_ctx)
273{ 424{
274 u64 isize; 425 u64 isize;
275 u32 nlink; 426 u32 nlink;
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
277 int i; 428 int i;
278 struct extent_buffer *eb; 429 struct extent_buffer *eb;
279 struct btrfs_inode_item *inode_item; 430 struct btrfs_inode_item *inode_item;
280 struct scrub_warning *swarn = ctx; 431 struct scrub_warning *swarn = warn_ctx;
281 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; 432 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
282 struct inode_fs_paths *ipath = NULL; 433 struct inode_fs_paths *ipath = NULL;
283 struct btrfs_root *local_root; 434 struct btrfs_root *local_root;
@@ -345,8 +496,8 @@ err:
345 496
346static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) 497static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
347{ 498{
348 struct btrfs_device *dev = sblock->sdev->dev; 499 struct btrfs_device *dev;
349 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 500 struct btrfs_fs_info *fs_info;
350 struct btrfs_path *path; 501 struct btrfs_path *path;
351 struct btrfs_key found_key; 502 struct btrfs_key found_key;
352 struct extent_buffer *eb; 503 struct extent_buffer *eb;
@@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
361 const int bufsize = 4096; 512 const int bufsize = 4096;
362 int ret; 513 int ret;
363 514
515 WARN_ON(sblock->page_count < 1);
516 dev = sblock->pagev[0]->dev;
517 fs_info = sblock->sctx->dev_root->fs_info;
518
364 path = btrfs_alloc_path(); 519 path = btrfs_alloc_path();
365 520
366 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); 521 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
367 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); 522 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
368 BUG_ON(sblock->page_count < 1); 523 swarn.sector = (sblock->pagev[0]->physical) >> 9;
369 swarn.sector = (sblock->pagev[0].physical) >> 9; 524 swarn.logical = sblock->pagev[0]->logical;
370 swarn.logical = sblock->pagev[0].logical;
371 swarn.errstr = errstr; 525 swarn.errstr = errstr;
372 swarn.dev = dev; 526 swarn.dev = NULL;
373 swarn.msg_bufsize = bufsize; 527 swarn.msg_bufsize = bufsize;
374 swarn.scratch_bufsize = bufsize; 528 swarn.scratch_bufsize = bufsize;
375 529
@@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
405 } while (ret != 1); 559 } while (ret != 1);
406 } else { 560 } else {
407 swarn.path = path; 561 swarn.path = path;
562 swarn.dev = dev;
408 iterate_extent_inodes(fs_info, found_key.objectid, 563 iterate_extent_inodes(fs_info, found_key.objectid,
409 extent_item_pos, 1, 564 extent_item_pos, 1,
410 scrub_print_warning_inode, &swarn); 565 scrub_print_warning_inode, &swarn);
@@ -416,11 +571,11 @@ out:
416 kfree(swarn.msg_buf); 571 kfree(swarn.msg_buf);
417} 572}
418 573
419static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) 574static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
420{ 575{
421 struct page *page = NULL; 576 struct page *page = NULL;
422 unsigned long index; 577 unsigned long index;
423 struct scrub_fixup_nodatasum *fixup = ctx; 578 struct scrub_fixup_nodatasum *fixup = fixup_ctx;
424 int ret; 579 int ret;
425 int corrected = 0; 580 int corrected = 0;
426 struct btrfs_key key; 581 struct btrfs_key key;
@@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
451 } 606 }
452 607
453 if (PageUptodate(page)) { 608 if (PageUptodate(page)) {
454 struct btrfs_mapping_tree *map_tree; 609 struct btrfs_fs_info *fs_info;
455 if (PageDirty(page)) { 610 if (PageDirty(page)) {
456 /* 611 /*
457 * we need to write the data to the defect sector. the 612 * we need to write the data to the defect sector. the
@@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
472 ret = -EIO; 627 ret = -EIO;
473 goto out; 628 goto out;
474 } 629 }
475 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 630 fs_info = BTRFS_I(inode)->root->fs_info;
476 ret = repair_io_failure(map_tree, offset, PAGE_SIZE, 631 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
477 fixup->logical, page, 632 fixup->logical, page,
478 fixup->mirror_num); 633 fixup->mirror_num);
479 unlock_page(page); 634 unlock_page(page);
@@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
530{ 685{
531 int ret; 686 int ret;
532 struct scrub_fixup_nodatasum *fixup; 687 struct scrub_fixup_nodatasum *fixup;
533 struct scrub_dev *sdev; 688 struct scrub_ctx *sctx;
534 struct btrfs_trans_handle *trans = NULL; 689 struct btrfs_trans_handle *trans = NULL;
535 struct btrfs_fs_info *fs_info; 690 struct btrfs_fs_info *fs_info;
536 struct btrfs_path *path; 691 struct btrfs_path *path;
537 int uncorrectable = 0; 692 int uncorrectable = 0;
538 693
539 fixup = container_of(work, struct scrub_fixup_nodatasum, work); 694 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
540 sdev = fixup->sdev; 695 sctx = fixup->sctx;
541 fs_info = fixup->root->fs_info; 696 fs_info = fixup->root->fs_info;
542 697
543 path = btrfs_alloc_path(); 698 path = btrfs_alloc_path();
544 if (!path) { 699 if (!path) {
545 spin_lock(&sdev->stat_lock); 700 spin_lock(&sctx->stat_lock);
546 ++sdev->stat.malloc_errors; 701 ++sctx->stat.malloc_errors;
547 spin_unlock(&sdev->stat_lock); 702 spin_unlock(&sctx->stat_lock);
548 uncorrectable = 1; 703 uncorrectable = 1;
549 goto out; 704 goto out;
550 } 705 }
@@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
573 } 728 }
574 WARN_ON(ret != 1); 729 WARN_ON(ret != 1);
575 730
576 spin_lock(&sdev->stat_lock); 731 spin_lock(&sctx->stat_lock);
577 ++sdev->stat.corrected_errors; 732 ++sctx->stat.corrected_errors;
578 spin_unlock(&sdev->stat_lock); 733 spin_unlock(&sctx->stat_lock);
579 734
580out: 735out:
581 if (trans && !IS_ERR(trans)) 736 if (trans && !IS_ERR(trans))
582 btrfs_end_transaction(trans, fixup->root); 737 btrfs_end_transaction(trans, fixup->root);
583 if (uncorrectable) { 738 if (uncorrectable) {
584 spin_lock(&sdev->stat_lock); 739 spin_lock(&sctx->stat_lock);
585 ++sdev->stat.uncorrectable_errors; 740 ++sctx->stat.uncorrectable_errors;
586 spin_unlock(&sdev->stat_lock); 741 spin_unlock(&sctx->stat_lock);
587 742 btrfs_dev_replace_stats_inc(
743 &sctx->dev_root->fs_info->dev_replace.
744 num_uncorrectable_read_errors);
588 printk_ratelimited_in_rcu(KERN_ERR 745 printk_ratelimited_in_rcu(KERN_ERR
589 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", 746 "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
590 (unsigned long long)fixup->logical, 747 (unsigned long long)fixup->logical,
591 rcu_str_deref(sdev->dev->name)); 748 rcu_str_deref(fixup->dev->name));
592 } 749 }
593 750
594 btrfs_free_path(path); 751 btrfs_free_path(path);
595 kfree(fixup); 752 kfree(fixup);
596 753
597 /* see caller why we're pretending to be paused in the scrub counters */ 754 scrub_pending_trans_workers_dec(sctx);
598 mutex_lock(&fs_info->scrub_lock);
599 atomic_dec(&fs_info->scrubs_running);
600 atomic_dec(&fs_info->scrubs_paused);
601 mutex_unlock(&fs_info->scrub_lock);
602 atomic_dec(&sdev->fixup_cnt);
603 wake_up(&fs_info->scrub_pause_wait);
604 wake_up(&sdev->list_wait);
605} 755}
606 756
607/* 757/*
@@ -614,7 +764,8 @@ out:
614 */ 764 */
615static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) 765static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
616{ 766{
617 struct scrub_dev *sdev = sblock_to_check->sdev; 767 struct scrub_ctx *sctx = sblock_to_check->sctx;
768 struct btrfs_device *dev;
618 struct btrfs_fs_info *fs_info; 769 struct btrfs_fs_info *fs_info;
619 u64 length; 770 u64 length;
620 u64 logical; 771 u64 logical;
@@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
633 DEFAULT_RATELIMIT_BURST); 784 DEFAULT_RATELIMIT_BURST);
634 785
635 BUG_ON(sblock_to_check->page_count < 1); 786 BUG_ON(sblock_to_check->page_count < 1);
636 fs_info = sdev->dev->dev_root->fs_info; 787 fs_info = sctx->dev_root->fs_info;
788 if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
789 /*
790 * if we find an error in a super block, we just report it.
791 * They will get written with the next transaction commit
792 * anyway
793 */
794 spin_lock(&sctx->stat_lock);
795 ++sctx->stat.super_errors;
796 spin_unlock(&sctx->stat_lock);
797 return 0;
798 }
637 length = sblock_to_check->page_count * PAGE_SIZE; 799 length = sblock_to_check->page_count * PAGE_SIZE;
638 logical = sblock_to_check->pagev[0].logical; 800 logical = sblock_to_check->pagev[0]->logical;
639 generation = sblock_to_check->pagev[0].generation; 801 generation = sblock_to_check->pagev[0]->generation;
640 BUG_ON(sblock_to_check->pagev[0].mirror_num < 1); 802 BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
641 failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1; 803 failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
642 is_metadata = !(sblock_to_check->pagev[0].flags & 804 is_metadata = !(sblock_to_check->pagev[0]->flags &
643 BTRFS_EXTENT_FLAG_DATA); 805 BTRFS_EXTENT_FLAG_DATA);
644 have_csum = sblock_to_check->pagev[0].have_csum; 806 have_csum = sblock_to_check->pagev[0]->have_csum;
645 csum = sblock_to_check->pagev[0].csum; 807 csum = sblock_to_check->pagev[0]->csum;
808 dev = sblock_to_check->pagev[0]->dev;
809
810 if (sctx->is_dev_replace && !is_metadata && !have_csum) {
811 sblocks_for_recheck = NULL;
812 goto nodatasum_case;
813 }
646 814
647 /* 815 /*
648 * read all mirrors one after the other. This includes to 816 * read all mirrors one after the other. This includes to
@@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
677 sizeof(*sblocks_for_recheck), 845 sizeof(*sblocks_for_recheck),
678 GFP_NOFS); 846 GFP_NOFS);
679 if (!sblocks_for_recheck) { 847 if (!sblocks_for_recheck) {
680 spin_lock(&sdev->stat_lock); 848 spin_lock(&sctx->stat_lock);
681 sdev->stat.malloc_errors++; 849 sctx->stat.malloc_errors++;
682 sdev->stat.read_errors++; 850 sctx->stat.read_errors++;
683 sdev->stat.uncorrectable_errors++; 851 sctx->stat.uncorrectable_errors++;
684 spin_unlock(&sdev->stat_lock); 852 spin_unlock(&sctx->stat_lock);
685 btrfs_dev_stat_inc_and_print(sdev->dev, 853 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
686 BTRFS_DEV_STAT_READ_ERRS);
687 goto out; 854 goto out;
688 } 855 }
689 856
690 /* setup the context, map the logical blocks and alloc the pages */ 857 /* setup the context, map the logical blocks and alloc the pages */
691 ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length, 858 ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
692 logical, sblocks_for_recheck); 859 logical, sblocks_for_recheck);
693 if (ret) { 860 if (ret) {
694 spin_lock(&sdev->stat_lock); 861 spin_lock(&sctx->stat_lock);
695 sdev->stat.read_errors++; 862 sctx->stat.read_errors++;
696 sdev->stat.uncorrectable_errors++; 863 sctx->stat.uncorrectable_errors++;
697 spin_unlock(&sdev->stat_lock); 864 spin_unlock(&sctx->stat_lock);
698 btrfs_dev_stat_inc_and_print(sdev->dev, 865 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
699 BTRFS_DEV_STAT_READ_ERRS);
700 goto out; 866 goto out;
701 } 867 }
702 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 868 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
703 sblock_bad = sblocks_for_recheck + failed_mirror_index; 869 sblock_bad = sblocks_for_recheck + failed_mirror_index;
704 870
705 /* build and submit the bios for the failed mirror, check checksums */ 871 /* build and submit the bios for the failed mirror, check checksums */
706 ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, 872 scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
707 csum, generation, sdev->csum_size); 873 csum, generation, sctx->csum_size);
708 if (ret) {
709 spin_lock(&sdev->stat_lock);
710 sdev->stat.read_errors++;
711 sdev->stat.uncorrectable_errors++;
712 spin_unlock(&sdev->stat_lock);
713 btrfs_dev_stat_inc_and_print(sdev->dev,
714 BTRFS_DEV_STAT_READ_ERRS);
715 goto out;
716 }
717 874
718 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 875 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
719 sblock_bad->no_io_error_seen) { 876 sblock_bad->no_io_error_seen) {
@@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
725 * different bio (usually one of the two latter cases is 882 * different bio (usually one of the two latter cases is
726 * the cause) 883 * the cause)
727 */ 884 */
728 spin_lock(&sdev->stat_lock); 885 spin_lock(&sctx->stat_lock);
729 sdev->stat.unverified_errors++; 886 sctx->stat.unverified_errors++;
730 spin_unlock(&sdev->stat_lock); 887 spin_unlock(&sctx->stat_lock);
731 888
889 if (sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock_bad);
 		goto out;
 	}
 
 	if (!sblock_bad->no_io_error_seen) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.read_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("i/o error", sblock_to_check);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
-				BTRFS_DEV_STAT_READ_ERRS);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 	} else if (sblock_bad->checksum_error) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.csum_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.csum_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("checksum error", sblock_to_check);
-		btrfs_dev_stat_inc_and_print(sdev->dev,
+		btrfs_dev_stat_inc_and_print(dev,
 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	} else if (sblock_bad->header_error) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.verify_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.verify_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("checksum/header error",
 					    sblock_to_check);
 		if (sblock_bad->generation_error)
-			btrfs_dev_stat_inc_and_print(sdev->dev,
+			btrfs_dev_stat_inc_and_print(dev,
 				BTRFS_DEV_STAT_GENERATION_ERRS);
 		else
-			btrfs_dev_stat_inc_and_print(sdev->dev,
+			btrfs_dev_stat_inc_and_print(dev,
 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	}
 
-	if (sdev->readonly)
+	if (sctx->readonly && !sctx->is_dev_replace)
 		goto did_not_correct_error;
 
 	if (!is_metadata && !have_csum) {
 		struct scrub_fixup_nodatasum *fixup_nodatasum;
 
+nodatasum_case:
+		WARN_ON(sctx->is_dev_replace);
+
 		/*
 		 * !is_metadata and !have_csum, this means that the data
 		 * might not be COW'ed, that it might be modified
@@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
 		if (!fixup_nodatasum)
 			goto did_not_correct_error;
-		fixup_nodatasum->sdev = sdev;
+		fixup_nodatasum->sctx = sctx;
+		fixup_nodatasum->dev = dev;
 		fixup_nodatasum->logical = logical;
 		fixup_nodatasum->root = fs_info->extent_root;
 		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
-		/*
-		 * increment scrubs_running to prevent cancel requests from
-		 * completing as long as a fixup worker is running. we must also
-		 * increment scrubs_paused to prevent deadlocking on pause
-		 * requests used for transactions commits (as the worker uses a
-		 * transaction context). it is safe to regard the fixup worker
-		 * as paused for all matters practical. effectively, we only
-		 * avoid cancellation requests from completing.
-		 */
-		mutex_lock(&fs_info->scrub_lock);
-		atomic_inc(&fs_info->scrubs_running);
-		atomic_inc(&fs_info->scrubs_paused);
-		mutex_unlock(&fs_info->scrub_lock);
-		atomic_inc(&sdev->fixup_cnt);
+		scrub_pending_trans_workers_inc(sctx);
 		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
 		btrfs_queue_worker(&fs_info->scrub_workers,
 				   &fixup_nodatasum->work);
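/*
 * Sketch only: the open-coded counter updates deleted above are folded
 * into scrub_pending_trans_workers_inc(). Reassembled from the deleted
 * lines, such a helper treats a queued fixup worker as "running" (so
 * cancel requests cannot complete under it) and as "paused" (so pause
 * requests issued for transaction commits cannot deadlock against the
 * worker's transaction context). The workers_pending counter name is an
 * assumption; the real body lives elsewhere in this patch.
 */
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrubs_running);
	atomic_inc(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_inc(&sctx->workers_pending);	/* assumed field name */
}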
@@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
 	/*
 	 * now build and submit the bios for the other mirrors, check
-	 * checksums
-	 */
-	for (mirror_index = 0;
-	     mirror_index < BTRFS_MAX_MIRRORS &&
-	     sblocks_for_recheck[mirror_index].page_count > 0;
-	     mirror_index++) {
-		if (mirror_index == failed_mirror_index)
-			continue;
-
-		/* build and submit the bios, check checksums */
-		ret = scrub_recheck_block(fs_info,
-					  sblocks_for_recheck + mirror_index,
-					  is_metadata, have_csum, csum,
-					  generation, sdev->csum_size);
-		if (ret)
-			goto did_not_correct_error;
-	}
-
-	/*
-	 * first try to pick the mirror which is completely without I/O
+	 * checksums.
+	 * First try to pick the mirror which is completely without I/O
 	 * errors and also does not have a checksum error.
 	 * If one is found, and if a checksum is present, the full block
 	 * that is known to contain an error is rewritten. Afterwards
@@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 	     mirror_index < BTRFS_MAX_MIRRORS &&
 	     sblocks_for_recheck[mirror_index].page_count > 0;
 	     mirror_index++) {
-		struct scrub_block *sblock_other = sblocks_for_recheck +
-						   mirror_index;
+		struct scrub_block *sblock_other;
+
+		if (mirror_index == failed_mirror_index)
+			continue;
+		sblock_other = sblocks_for_recheck + mirror_index;
+
+		/* build and submit the bios, check checksums */
+		scrub_recheck_block(fs_info, sblock_other, is_metadata,
+				    have_csum, csum, generation,
+				    sctx->csum_size);
 
 		if (!sblock_other->header_error &&
 		    !sblock_other->checksum_error &&
 		    sblock_other->no_io_error_seen) {
-			int force_write = is_metadata || have_csum;
-
-			ret = scrub_repair_block_from_good_copy(sblock_bad,
-								sblock_other,
-								force_write);
+			if (sctx->is_dev_replace) {
+				scrub_write_block_to_dev_replace(sblock_other);
+			} else {
+				int force_write = is_metadata || have_csum;
+
+				ret = scrub_repair_block_from_good_copy(
+						sblock_bad, sblock_other,
+						force_write);
+			}
 			if (0 == ret)
 				goto corrected_error;
 		}
 	}
 
 	/*
-	 * in case of I/O errors in the area that is supposed to be
+	 * for dev_replace, pick good pages and write to the target device.
+	 */
+	if (sctx->is_dev_replace) {
+		success = 1;
+		for (page_num = 0; page_num < sblock_bad->page_count;
+		     page_num++) {
+			int sub_success;
+
+			sub_success = 0;
+			for (mirror_index = 0;
+			     mirror_index < BTRFS_MAX_MIRRORS &&
+			     sblocks_for_recheck[mirror_index].page_count > 0;
+			     mirror_index++) {
+				struct scrub_block *sblock_other =
+					sblocks_for_recheck + mirror_index;
+				struct scrub_page *page_other =
+					sblock_other->pagev[page_num];
+
+				if (!page_other->io_error) {
+					ret = scrub_write_page_to_dev_replace(
+							sblock_other, page_num);
+					if (ret == 0) {
+						/* succeeded for this page */
+						sub_success = 1;
+						break;
+					} else {
+						btrfs_dev_replace_stats_inc(
+							&sctx->dev_root->
+							fs_info->dev_replace.
+							num_write_errors);
+					}
+				}
+			}
+
+			if (!sub_success) {
+				/*
+				 * did not find a mirror to fetch the page
+				 * from. scrub_write_page_to_dev_replace()
+				 * handles this case (page->io_error), by
+				 * filling the block with zeros before
+				 * submitting the write request
+				 */
+				success = 0;
+				ret = scrub_write_page_to_dev_replace(
+						sblock_bad, page_num);
+				if (ret)
+					btrfs_dev_replace_stats_inc(
+						&sctx->dev_root->fs_info->
+						dev_replace.num_write_errors);
+			}
+		}
+
+		goto out;
+	}
+
+	/*
+	 * for regular scrub, repair those pages that are errored.
+	 * In case of I/O errors in the area that is supposed to be
 	 * repaired, continue by picking good copies of those pages.
 	 * Select the good pages from mirrors to rewrite bad pages from
 	 * the area to fix. Afterwards verify the checksum of the block
@@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
 	success = 1;
 	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
-		struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
 
 		if (!page_bad->io_error)
 			continue;
@@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		     mirror_index++) {
 			struct scrub_block *sblock_other = sblocks_for_recheck +
 							   mirror_index;
-			struct scrub_page *page_other = sblock_other->pagev +
-							page_num;
+			struct scrub_page *page_other = sblock_other->pagev[
+							page_num];
 
 			if (!page_other->io_error) {
 				ret = scrub_repair_page_from_good_copy(
@@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 		 * is verified, but most likely the data comes out
 		 * of the page cache.
 		 */
-		ret = scrub_recheck_block(fs_info, sblock_bad,
-					  is_metadata, have_csum, csum,
-					  generation, sdev->csum_size);
-		if (!ret && !sblock_bad->header_error &&
+		scrub_recheck_block(fs_info, sblock_bad,
+				    is_metadata, have_csum, csum,
+				    generation, sctx->csum_size);
+		if (!sblock_bad->header_error &&
 		    !sblock_bad->checksum_error &&
 		    sblock_bad->no_io_error_seen)
 			goto corrected_error;
@@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 			goto did_not_correct_error;
 	} else {
corrected_error:
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.corrected_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.corrected_errors++;
+		spin_unlock(&sctx->stat_lock);
 		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: fixed up error at logical %llu on dev %s\n",
 			(unsigned long long)logical,
-			rcu_str_deref(sdev->dev->name));
+			rcu_str_deref(dev->name));
 		}
 	} else {
did_not_correct_error:
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.uncorrectable_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
 		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
 			(unsigned long long)logical,
-			rcu_str_deref(sdev->dev->name));
+			rcu_str_deref(dev->name));
 	}
 
out:
@@ -966,11 +1166,11 @@ out:
 				mirror_index;
 			int page_index;
 
-			for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
-			     page_index++)
-				if (sblock->pagev[page_index].page)
-					__free_page(
-						sblock->pagev[page_index].page);
+			for (page_index = 0; page_index < sblock->page_count;
+			     page_index++) {
+				sblock->pagev[page_index]->sblock = NULL;
+				scrub_page_put(sblock->pagev[page_index]);
+			}
 		}
 		kfree(sblocks_for_recheck);
 	}
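/*
 * Sketch only: the cleanup above no longer frees pages directly because
 * a recheck page may still be referenced by a dev-replace write bio.
 * This hypothetical wrapper restates the rule using the
 * scrub_page_get()/scrub_page_put() helpers introduced later in this
 * patch; the wrapper name itself does not exist in the patch.
 */
static void scrub_recheck_block_free(struct scrub_block *sblock)
{
	int page_index;

	for (page_index = 0; page_index < sblock->page_count; page_index++) {
		/* detach first: a late bio completion must not follow
		 * spage->sblock into memory that is about to go away */
		sblock->pagev[page_index]->sblock = NULL;
		scrub_page_put(sblock->pagev[page_index]);	/* last put frees */
	}
}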
@@ -978,8 +1178,9 @@ out:
 	return 0;
 }
 
-static int scrub_setup_recheck_block(struct scrub_dev *sdev,
-				     struct btrfs_mapping_tree *map_tree,
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+				     struct btrfs_fs_info *fs_info,
+				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
 				     struct scrub_block *sblocks_for_recheck)
 {
@@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 	int ret;
 
 	/*
-	 * note: the three members sdev, ref_count and outstanding_pages
+	 * note: the two members ref_count and outstanding_pages
 	 * are not used (and not set) in the blocks that are used for
 	 * the recheck procedure
 	 */
@@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 		 * with a length of PAGE_SIZE, each returned stripe
 		 * represents one mirror
 		 */
-		ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
-				      &bbio, 0);
+		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
+				      &mapped_length, &bbio, 0);
 		if (ret || !bbio || mapped_length < sublen) {
 			kfree(bbio);
 			return -EIO;
 		}
 
-		BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
+		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
 		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
 		     mirror_index++) {
 			struct scrub_block *sblock;
@@ -1020,21 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 				continue;
 
 			sblock = sblocks_for_recheck + mirror_index;
-			page = sblock->pagev + page_index;
+			sblock->sctx = sctx;
+			page = kzalloc(sizeof(*page), GFP_NOFS);
+			if (!page) {
+leave_nomem:
+				spin_lock(&sctx->stat_lock);
+				sctx->stat.malloc_errors++;
+				spin_unlock(&sctx->stat_lock);
+				kfree(bbio);
+				return -ENOMEM;
+			}
+			scrub_page_get(page);
+			sblock->pagev[page_index] = page;
 			page->logical = logical;
 			page->physical = bbio->stripes[mirror_index].physical;
+			BUG_ON(page_index >= original_sblock->page_count);
+			page->physical_for_dev_replace =
+				original_sblock->pagev[page_index]->
+				physical_for_dev_replace;
 			/* for missing devices, dev->bdev is NULL */
 			page->dev = bbio->stripes[mirror_index].dev;
 			page->mirror_num = mirror_index + 1;
-			page->page = alloc_page(GFP_NOFS);
-			if (!page->page) {
-				spin_lock(&sdev->stat_lock);
-				sdev->stat.malloc_errors++;
-				spin_unlock(&sdev->stat_lock);
-				kfree(bbio);
-				return -ENOMEM;
-			}
 			sblock->page_count++;
+			page->page = alloc_page(GFP_NOFS);
+			if (!page->page)
+				goto leave_nomem;
 		}
 		kfree(bbio);
 		length -= sublen;
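/*
 * Sketch only: REQ_GET_READ_MIRRORS asks btrfs_map_block() for every
 * readable copy of a slice, one stripe per mirror, instead of the single
 * target that the old WRITE mapping returned. The helper name and the
 * pr_debug() output below are illustrative, not part of the patch.
 */
static void scrub_list_mirrors(struct btrfs_fs_info *fs_info, u64 logical)
{
	u64 mapped_length = PAGE_SIZE;
	struct btrfs_bio *bbio = NULL;
	int mirror_index;

	if (btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
			    &mapped_length, &bbio, 0) || !bbio)
		return;
	for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
	     mirror_index++)
		pr_debug("btrfs: mirror %d of %llu at physical %llu\n",
			 mirror_index + 1, (unsigned long long)logical,
			 (unsigned long long)bbio->stripes[mirror_index].physical);
	kfree(bbio);
}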
@@ -1052,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
  * to take those pages that are not errored from all the mirrors so that
  * the pages that are errored in the just handled mirror can be repaired.
  */
-static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
-			       struct scrub_block *sblock, int is_metadata,
-			       int have_csum, u8 *csum, u64 generation,
-			       u16 csum_size)
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+				struct scrub_block *sblock, int is_metadata,
+				int have_csum, u8 *csum, u64 generation,
+				u16 csum_size)
 {
 	int page_num;
 
@@ -1065,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
 		struct bio *bio;
-		int ret;
-		struct scrub_page *page = sblock->pagev + page_num;
+		struct scrub_page *page = sblock->pagev[page_num];
 		DECLARE_COMPLETION_ONSTACK(complete);
 
 		if (page->dev->bdev == NULL) {
@@ -1075,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 			continue;
 		}
 
-		BUG_ON(!page->page);
+		WARN_ON(!page->page);
 		bio = bio_alloc(GFP_NOFS, 1);
-		if (!bio)
-			return -EIO;
+		if (!bio) {
+			page->io_error = 1;
+			sblock->no_io_error_seen = 0;
+			continue;
+		}
 		bio->bi_bdev = page->dev->bdev;
 		bio->bi_sector = page->physical >> 9;
 		bio->bi_end_io = scrub_complete_bio_end_io;
 		bio->bi_private = &complete;
 
-		ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
-		if (PAGE_SIZE != ret) {
-			bio_put(bio);
-			return -EIO;
-		}
+		bio_add_page(bio, page->page, PAGE_SIZE, 0);
 		btrfsic_submit_bio(READ, bio);
 
 		/* this will also unplug the queue */
@@ -1105,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 					     have_csum, csum, generation,
 					     csum_size);
 
-	return 0;
+	return;
 }
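/*
 * Sketch only: the recheck path reads one page at a time, synchronously,
 * so that an error can be attributed to a single page on a single
 * mirror. The helper name is hypothetical; scrub_complete_bio_end_io()
 * is the completion callback this file already uses.
 */
static int scrub_read_page_sync(struct block_device *bdev, u64 physical,
				struct page *page)
{
	struct bio *bio;
	int ret;
	DECLARE_COMPLETION_ONSTACK(complete);

	bio = bio_alloc(GFP_NOFS, 1);
	if (!bio)
		return -ENOMEM;
	bio->bi_bdev = bdev;
	bio->bi_sector = physical >> 9;	/* byte offset to 512-byte sector */
	bio->bi_end_io = scrub_complete_bio_end_io;
	bio->bi_private = &complete;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	btrfsic_submit_bio(READ, bio);

	/* this will also unplug the queue */
	wait_for_completion(&complete);
	ret = bio_flagged(bio, BIO_UPTODATE) ? 0 : -EIO;
	bio_put(bio);
	return ret;
}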
 
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
@@ -1120,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 	struct btrfs_root *root = fs_info->extent_root;
 	void *mapped_buffer;
 
-	BUG_ON(!sblock->pagev[0].page);
+	WARN_ON(!sblock->pagev[0]->page);
 	if (is_metadata) {
 		struct btrfs_header *h;
 
-		mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
 		h = (struct btrfs_header *)mapped_buffer;
 
-		if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
+		if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
 		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
 		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
 			   BTRFS_UUID_SIZE)) {
@@ -1141,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 		if (!have_csum)
 			return;
 
-		mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
 	}
 
 	for (page_num = 0;;) {
@@ -1157,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 		page_num++;
 		if (page_num >= sblock->page_count)
 			break;
-		BUG_ON(!sblock->pagev[page_num].page);
+		WARN_ON(!sblock->pagev[page_num]->page);
 
-		mapped_buffer = kmap_atomic(sblock->pagev[page_num].page);
+		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
 	}
 
 	btrfs_csum_final(crc, calculated_csum);
@@ -1197,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 					    struct scrub_block *sblock_good,
 					    int page_num, int force_write)
 {
-	struct scrub_page *page_bad = sblock_bad->pagev + page_num;
-	struct scrub_page *page_good = sblock_good->pagev + page_num;
+	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+	struct scrub_page *page_good = sblock_good->pagev[page_num];
 
-	BUG_ON(sblock_bad->pagev[page_num].page == NULL);
-	BUG_ON(sblock_good->pagev[page_num].page == NULL);
+	BUG_ON(page_bad->page == NULL);
+	BUG_ON(page_good->page == NULL);
 	if (force_write || sblock_bad->header_error ||
 	    sblock_bad->checksum_error || page_bad->io_error) {
 		struct bio *bio;
 		int ret;
 		DECLARE_COMPLETION_ONSTACK(complete);
 
+		if (!page_bad->dev->bdev) {
+			printk_ratelimited(KERN_WARNING
+				"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
+			return -EIO;
+		}
+
 		bio = bio_alloc(GFP_NOFS, 1);
 		if (!bio)
 			return -EIO;
@@ -1228,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		if (!bio_flagged(bio, BIO_UPTODATE)) {
 			btrfs_dev_stat_inc_and_print(page_bad->dev,
 				BTRFS_DEV_STAT_WRITE_ERRS);
+			btrfs_dev_replace_stats_inc(
+				&sblock_bad->sctx->dev_root->fs_info->
+				dev_replace.num_write_errors);
 			bio_put(bio);
 			return -EIO;
 		}
@@ -1237,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 	return 0;
 }
 
-static void scrub_checksum(struct scrub_block *sblock)
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
+{
+	int page_num;
+
+	for (page_num = 0; page_num < sblock->page_count; page_num++) {
+		int ret;
+
+		ret = scrub_write_page_to_dev_replace(sblock, page_num);
+		if (ret)
+			btrfs_dev_replace_stats_inc(
+				&sblock->sctx->dev_root->fs_info->dev_replace.
+				num_write_errors);
+	}
+}
+
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+					   int page_num)
+{
+	struct scrub_page *spage = sblock->pagev[page_num];
+
+	BUG_ON(spage->page == NULL);
+	if (spage->io_error) {
+		void *mapped_buffer = kmap_atomic(spage->page);
+
+		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
+		flush_dcache_page(spage->page);
+		kunmap_atomic(mapped_buffer);
+	}
+	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+}
+
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage)
+{
+	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+	struct scrub_bio *sbio;
+	int ret;
+
+	mutex_lock(&wr_ctx->wr_lock);
+again:
+	if (!wr_ctx->wr_curr_bio) {
+		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
+					      GFP_NOFS);
+		if (!wr_ctx->wr_curr_bio) {
+			mutex_unlock(&wr_ctx->wr_lock);
+			return -ENOMEM;
+		}
+		wr_ctx->wr_curr_bio->sctx = sctx;
+		wr_ctx->wr_curr_bio->page_count = 0;
+	}
+	sbio = wr_ctx->wr_curr_bio;
+	if (sbio->page_count == 0) {
+		struct bio *bio;
+
+		sbio->physical = spage->physical_for_dev_replace;
+		sbio->logical = spage->logical;
+		sbio->dev = wr_ctx->tgtdev;
+		bio = sbio->bio;
+		if (!bio) {
+			bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+			if (!bio) {
+				mutex_unlock(&wr_ctx->wr_lock);
+				return -ENOMEM;
+			}
+			sbio->bio = bio;
+		}
+
+		bio->bi_private = sbio;
+		bio->bi_end_io = scrub_wr_bio_end_io;
+		bio->bi_bdev = sbio->dev->bdev;
+		bio->bi_sector = sbio->physical >> 9;
+		sbio->err = 0;
+	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+		   spage->physical_for_dev_replace ||
+		   sbio->logical + sbio->page_count * PAGE_SIZE !=
+		   spage->logical) {
+		scrub_wr_submit(sctx);
+		goto again;
+	}
+
+	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+	if (ret != PAGE_SIZE) {
+		if (sbio->page_count < 1) {
+			bio_put(sbio->bio);
+			sbio->bio = NULL;
+			mutex_unlock(&wr_ctx->wr_lock);
+			return -EIO;
+		}
+		scrub_wr_submit(sctx);
+		goto again;
+	}
+
+	sbio->pagev[sbio->page_count] = spage;
+	scrub_page_get(spage);
+	sbio->page_count++;
+	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
+		scrub_wr_submit(sctx);
+	mutex_unlock(&wr_ctx->wr_lock);
+
+	return 0;
+}
+
+static void scrub_wr_submit(struct scrub_ctx *sctx)
+{
+	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+	struct scrub_bio *sbio;
+
+	if (!wr_ctx->wr_curr_bio)
+		return;
+
+	sbio = wr_ctx->wr_curr_bio;
+	wr_ctx->wr_curr_bio = NULL;
+	WARN_ON(!sbio->bio->bi_bdev);
+	scrub_pending_bio_inc(sctx);
+	/* process all writes in a single worker thread. Then the block layer
+	 * orders the requests before sending them to the driver which
+	 * doubled the write performance on spinning disks when measured
+	 * with Linux 3.5 */
+	btrfsic_submit_bio(WRITE, sbio->bio);
+}
+
+static void scrub_wr_bio_end_io(struct bio *bio, int err)
+{
+	struct scrub_bio *sbio = bio->bi_private;
+	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+
+	sbio->err = err;
+	sbio->bio = bio;
+
+	sbio->work.func = scrub_wr_bio_end_io_worker;
+	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
+}
+
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
+{
+	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+	struct scrub_ctx *sctx = sbio->sctx;
+	int i;
+
+	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
+	if (sbio->err) {
+		struct btrfs_dev_replace *dev_replace =
+			&sbio->sctx->dev_root->fs_info->dev_replace;
+
+		for (i = 0; i < sbio->page_count; i++) {
+			struct scrub_page *spage = sbio->pagev[i];
+
+			spage->io_error = 1;
+			btrfs_dev_replace_stats_inc(&dev_replace->
+						    num_write_errors);
+		}
+	}
+
+	for (i = 0; i < sbio->page_count; i++)
+		scrub_page_put(sbio->pagev[i]);
+
+	bio_put(sbio->bio);
+	kfree(sbio);
+	scrub_pending_bio_dec(sctx);
+}
+
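/*
 * Sketch only: scrub_add_page_to_wr_bio() above appends a page to the
 * open write bio only while the bio stays contiguous both on the target
 * device and in logical address space; any discontiguity forces a
 * submit first. The predicate below restates that invariant and does
 * not exist as a function in the patch.
 */
static bool scrub_page_extends_wr_bio(struct scrub_bio *sbio,
				      struct scrub_page *spage)
{
	u64 next_physical = sbio->physical + sbio->page_count * PAGE_SIZE;
	u64 next_logical = sbio->logical + sbio->page_count * PAGE_SIZE;

	return next_physical == spage->physical_for_dev_replace &&
	       next_logical == spage->logical;
}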
+static int scrub_checksum(struct scrub_block *sblock)
 {
 	u64 flags;
 	int ret;
 
-	BUG_ON(sblock->page_count < 1);
-	flags = sblock->pagev[0].flags;
+	WARN_ON(sblock->page_count < 1);
+	flags = sblock->pagev[0]->flags;
 	ret = 0;
 	if (flags & BTRFS_EXTENT_FLAG_DATA)
 		ret = scrub_checksum_data(sblock);
@@ -1255,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock)
 		WARN_ON(1);
 	if (ret)
 		scrub_handle_errored_block(sblock);
+
+	return ret;
 }
 
 static int scrub_checksum_data(struct scrub_block *sblock)
 {
-	struct scrub_dev *sdev = sblock->sdev;
+	struct scrub_ctx *sctx = sblock->sctx;
 	u8 csum[BTRFS_CSUM_SIZE];
 	u8 *on_disk_csum;
 	struct page *page;
 	void *buffer;
 	u32 crc = ~(u32)0;
 	int fail = 0;
-	struct btrfs_root *root = sdev->dev->dev_root;
+	struct btrfs_root *root = sctx->dev_root;
 	u64 len;
 	int index;
 
 	BUG_ON(sblock->page_count < 1);
-	if (!sblock->pagev[0].have_csum)
+	if (!sblock->pagev[0]->have_csum)
 		return 0;
 
-	on_disk_csum = sblock->pagev[0].csum;
-	page = sblock->pagev[0].page;
+	on_disk_csum = sblock->pagev[0]->csum;
+	page = sblock->pagev[0]->page;
 	buffer = kmap_atomic(page);
 
-	len = sdev->sectorsize;
+	len = sctx->sectorsize;
 	index = 0;
 	for (;;) {
 		u64 l = min_t(u64, len, PAGE_SIZE);
@@ -1290,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 			break;
 		index++;
 		BUG_ON(index >= sblock->page_count);
-		BUG_ON(!sblock->pagev[index].page);
-		page = sblock->pagev[index].page;
+		BUG_ON(!sblock->pagev[index]->page);
+		page = sblock->pagev[index]->page;
 		buffer = kmap_atomic(page);
 	}
 
 	btrfs_csum_final(crc, csum);
-	if (memcmp(csum, on_disk_csum, sdev->csum_size))
+	if (memcmp(csum, on_disk_csum, sctx->csum_size))
 		fail = 1;
 
 	return fail;
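/*
 * Sketch only: the loop above checksums one sector that may straddle
 * page boundaries by hashing PAGE_SIZE-bounded chunks and remapping the
 * block's next page whenever a chunk is exhausted. The same walk,
 * isolated; btrfs_csum_data()'s root argument follows the signature
 * used by this file, and the helper name is hypothetical.
 */
static u32 scrub_csum_one_sector(struct btrfs_root *root,
				 struct scrub_block *sblock, u32 sectorsize)
{
	u32 crc = ~(u32)0;
	u64 len = sectorsize;
	int index = 0;
	void *buffer = kmap_atomic(sblock->pagev[0]->page);

	for (;;) {
		u64 l = min_t(u64, len, PAGE_SIZE);

		crc = btrfs_csum_data(root, buffer, crc, l);
		kunmap_atomic(buffer);
		len -= l;
		if (len == 0)
			break;
		index++;	/* the sector continues on the next page */
		buffer = kmap_atomic(sblock->pagev[index]->page);
	}
	return crc;		/* caller finishes with btrfs_csum_final() */
}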
@@ -1304,9 +1685,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 
 static int scrub_checksum_tree_block(struct scrub_block *sblock)
 {
-	struct scrub_dev *sdev = sblock->sdev;
+	struct scrub_ctx *sctx = sblock->sctx;
 	struct btrfs_header *h;
-	struct btrfs_root *root = sdev->dev->dev_root;
+	struct btrfs_root *root = sctx->dev_root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	u8 calculated_csum[BTRFS_CSUM_SIZE];
 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1321,10 +1702,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 	int index;
 
 	BUG_ON(sblock->page_count < 1);
-	page = sblock->pagev[0].page;
+	page = sblock->pagev[0]->page;
 	mapped_buffer = kmap_atomic(page);
 	h = (struct btrfs_header *)mapped_buffer;
-	memcpy(on_disk_csum, h->csum, sdev->csum_size);
+	memcpy(on_disk_csum, h->csum, sctx->csum_size);
 
 	/*
 	 * we don't use the getter functions here, as we
@@ -1332,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 	 * b) the page is already kmapped
 	 */
 
-	if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
+	if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
 		++fail;
 
-	if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
+	if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
 		++fail;
 
 	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1345,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 		   BTRFS_UUID_SIZE))
 		++fail;
 
-	BUG_ON(sdev->nodesize != sdev->leafsize);
-	len = sdev->nodesize - BTRFS_CSUM_SIZE;
+	WARN_ON(sctx->nodesize != sctx->leafsize);
+	len = sctx->nodesize - BTRFS_CSUM_SIZE;
 	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
 	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
 	index = 0;
@@ -1360,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 			break;
 		index++;
 		BUG_ON(index >= sblock->page_count);
-		BUG_ON(!sblock->pagev[index].page);
-		page = sblock->pagev[index].page;
+		BUG_ON(!sblock->pagev[index]->page);
+		page = sblock->pagev[index]->page;
 		mapped_buffer = kmap_atomic(page);
 		mapped_size = PAGE_SIZE;
 		p = mapped_buffer;
 	}
 
 	btrfs_csum_final(crc, calculated_csum);
-	if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
+	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
 		++crc_fail;
 
 	return fail || crc_fail;
@@ -1377,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 static int scrub_checksum_super(struct scrub_block *sblock)
 {
 	struct btrfs_super_block *s;
-	struct scrub_dev *sdev = sblock->sdev;
-	struct btrfs_root *root = sdev->dev->dev_root;
+	struct scrub_ctx *sctx = sblock->sctx;
+	struct btrfs_root *root = sctx->dev_root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	u8 calculated_csum[BTRFS_CSUM_SIZE];
 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1393,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 	int index;
 
 	BUG_ON(sblock->page_count < 1);
-	page = sblock->pagev[0].page;
+	page = sblock->pagev[0]->page;
 	mapped_buffer = kmap_atomic(page);
 	s = (struct btrfs_super_block *)mapped_buffer;
-	memcpy(on_disk_csum, s->csum, sdev->csum_size);
+	memcpy(on_disk_csum, s->csum, sctx->csum_size);
 
-	if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
+	if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
 		++fail_cor;
 
-	if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
+	if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
 		++fail_gen;
 
 	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1421,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 			break;
 		index++;
 		BUG_ON(index >= sblock->page_count);
-		BUG_ON(!sblock->pagev[index].page);
-		page = sblock->pagev[index].page;
+		BUG_ON(!sblock->pagev[index]->page);
+		page = sblock->pagev[index]->page;
 		mapped_buffer = kmap_atomic(page);
 		mapped_size = PAGE_SIZE;
 		p = mapped_buffer;
 	}
 
 	btrfs_csum_final(crc, calculated_csum);
-	if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
+	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
 		++fail_cor;
 
 	if (fail_cor + fail_gen) {
@@ -1438,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 		 * They will get written with the next transaction commit
 		 * anyway
 		 */
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.super_errors;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.super_errors;
+		spin_unlock(&sctx->stat_lock);
 		if (fail_cor)
-			btrfs_dev_stat_inc_and_print(sdev->dev,
+			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 		else
-			btrfs_dev_stat_inc_and_print(sdev->dev,
+			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
 				BTRFS_DEV_STAT_GENERATION_ERRS);
 	}
 
@@ -1463,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock)
 		int i;
 
 		for (i = 0; i < sblock->page_count; i++)
-			if (sblock->pagev[i].page)
-				__free_page(sblock->pagev[i].page);
+			scrub_page_put(sblock->pagev[i]);
 		kfree(sblock);
 	}
 }
 
-static void scrub_submit(struct scrub_dev *sdev)
+static void scrub_page_get(struct scrub_page *spage)
+{
+	atomic_inc(&spage->ref_count);
+}
+
+static void scrub_page_put(struct scrub_page *spage)
+{
+	if (atomic_dec_and_test(&spage->ref_count)) {
+		if (spage->page)
+			__free_page(spage->page);
+		kfree(spage);
+	}
+}
+
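/*
 * Sketch only: with pagev[] now holding pointers, page lifetime follows
 * one rule: every holder of a struct scrub_page pointer owns one
 * reference. The hypothetical helper below shows the attach side; the
 * real code takes these references inline.
 */
static void scrub_block_attach_page(struct scrub_block *sblock, int index,
				    struct scrub_page *spage)
{
	scrub_page_get(spage);		/* reference owned by the block */
	sblock->pagev[index] = spage;
	spage->sblock = sblock;		/* back pointer for completion paths */
}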
+static void scrub_submit(struct scrub_ctx *sctx)
 {
 	struct scrub_bio *sbio;
 
-	if (sdev->curr == -1)
+	if (sctx->curr == -1)
 		return;
 
-	sbio = sdev->bios[sdev->curr];
-	sdev->curr = -1;
-	atomic_inc(&sdev->in_flight);
+	sbio = sctx->bios[sctx->curr];
+	sctx->curr = -1;
+	scrub_pending_bio_inc(sctx);
 
-	btrfsic_submit_bio(READ, sbio->bio);
+	if (!sbio->bio->bi_bdev) {
+		/*
+		 * this case should not happen. If btrfs_map_block() is
+		 * wrong, it could happen for dev-replace operations on
+		 * missing devices when no mirrors are available, but in
+		 * this case it should already fail the mount.
+		 * This case is handled correctly (but _very_ slowly).
+		 */
+		printk_ratelimited(KERN_WARNING
+			"btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
+		bio_endio(sbio->bio, -EIO);
+	} else {
+		btrfsic_submit_bio(READ, sbio->bio);
+	}
 }
 
-static int scrub_add_page_to_bio(struct scrub_dev *sdev,
-				 struct scrub_page *spage)
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage)
 {
 	struct scrub_block *sblock = spage->sblock;
 	struct scrub_bio *sbio;
@@ -1494,28 +1901,29 @@ again:
 	/*
 	 * grab a fresh bio or wait for one to become available
 	 */
-	while (sdev->curr == -1) {
-		spin_lock(&sdev->list_lock);
-		sdev->curr = sdev->first_free;
-		if (sdev->curr != -1) {
-			sdev->first_free = sdev->bios[sdev->curr]->next_free;
-			sdev->bios[sdev->curr]->next_free = -1;
-			sdev->bios[sdev->curr]->page_count = 0;
-			spin_unlock(&sdev->list_lock);
+	while (sctx->curr == -1) {
+		spin_lock(&sctx->list_lock);
+		sctx->curr = sctx->first_free;
+		if (sctx->curr != -1) {
+			sctx->first_free = sctx->bios[sctx->curr]->next_free;
+			sctx->bios[sctx->curr]->next_free = -1;
+			sctx->bios[sctx->curr]->page_count = 0;
+			spin_unlock(&sctx->list_lock);
 		} else {
-			spin_unlock(&sdev->list_lock);
-			wait_event(sdev->list_wait, sdev->first_free != -1);
+			spin_unlock(&sctx->list_lock);
+			wait_event(sctx->list_wait, sctx->first_free != -1);
 		}
 	}
-	sbio = sdev->bios[sdev->curr];
+	sbio = sctx->bios[sctx->curr];
 	if (sbio->page_count == 0) {
 		struct bio *bio;
 
 		sbio->physical = spage->physical;
 		sbio->logical = spage->logical;
+		sbio->dev = spage->dev;
 		bio = sbio->bio;
 		if (!bio) {
-			bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio);
+			bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
 			if (!bio)
 				return -ENOMEM;
 			sbio->bio = bio;
@@ -1523,14 +1931,15 @@ again:
 
 		bio->bi_private = sbio;
 		bio->bi_end_io = scrub_bio_end_io;
-		bio->bi_bdev = sdev->dev->bdev;
-		bio->bi_sector = spage->physical >> 9;
+		bio->bi_bdev = sbio->dev->bdev;
+		bio->bi_sector = sbio->physical >> 9;
 		sbio->err = 0;
 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
 		   spage->physical ||
 		   sbio->logical + sbio->page_count * PAGE_SIZE !=
-		   spage->logical) {
-		scrub_submit(sdev);
+		   spage->logical ||
+		   sbio->dev != spage->dev) {
+		scrub_submit(sctx);
 		goto again;
 	}
 
@@ -1542,81 +1951,87 @@ again:
 			sbio->bio = NULL;
 			return -EIO;
 		}
-		scrub_submit(sdev);
+		scrub_submit(sctx);
 		goto again;
 	}
 
-	scrub_block_get(sblock); /* one for the added page */
+	scrub_block_get(sblock); /* one for the page added to the bio */
 	atomic_inc(&sblock->outstanding_pages);
 	sbio->page_count++;
-	if (sbio->page_count == sdev->pages_per_bio)
-		scrub_submit(sdev);
+	if (sbio->page_count == sctx->pages_per_rd_bio)
+		scrub_submit(sctx);
 
 	return 0;
 }
 
-static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
-		       u64 physical, u64 flags, u64 gen, int mirror_num,
-		       u8 *csum, int force)
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+		       u64 physical, struct btrfs_device *dev, u64 flags,
+		       u64 gen, int mirror_num, u8 *csum, int force,
+		       u64 physical_for_dev_replace)
 {
 	struct scrub_block *sblock;
 	int index;
 
 	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
 	if (!sblock) {
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.malloc_errors++;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
 		return -ENOMEM;
 	}
 
-	/* one ref inside this function, plus one for each page later on */
+	/* one ref inside this function, plus one for each page added to
+	 * a bio later on */
 	atomic_set(&sblock->ref_count, 1);
-	sblock->sdev = sdev;
+	sblock->sctx = sctx;
 	sblock->no_io_error_seen = 1;
 
 	for (index = 0; len > 0; index++) {
-		struct scrub_page *spage = sblock->pagev + index;
+		struct scrub_page *spage;
 		u64 l = min_t(u64, len, PAGE_SIZE);
 
-		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
-		spage->page = alloc_page(GFP_NOFS);
-		if (!spage->page) {
-			spin_lock(&sdev->stat_lock);
-			sdev->stat.malloc_errors++;
-			spin_unlock(&sdev->stat_lock);
-			while (index > 0) {
-				index--;
-				__free_page(sblock->pagev[index].page);
-			}
-			kfree(sblock);
+		spage = kzalloc(sizeof(*spage), GFP_NOFS);
+		if (!spage) {
+leave_nomem:
+			spin_lock(&sctx->stat_lock);
+			sctx->stat.malloc_errors++;
+			spin_unlock(&sctx->stat_lock);
+			scrub_block_put(sblock);
 			return -ENOMEM;
 		}
+		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+		scrub_page_get(spage);
+		sblock->pagev[index] = spage;
 		spage->sblock = sblock;
-		spage->dev = sdev->dev;
+		spage->dev = dev;
 		spage->flags = flags;
 		spage->generation = gen;
 		spage->logical = logical;
 		spage->physical = physical;
+		spage->physical_for_dev_replace = physical_for_dev_replace;
 		spage->mirror_num = mirror_num;
 		if (csum) {
 			spage->have_csum = 1;
-			memcpy(spage->csum, csum, sdev->csum_size);
+			memcpy(spage->csum, csum, sctx->csum_size);
 		} else {
 			spage->have_csum = 0;
 		}
 		sblock->page_count++;
+		spage->page = alloc_page(GFP_NOFS);
+		if (!spage->page)
+			goto leave_nomem;
 		len -= l;
 		logical += l;
 		physical += l;
+		physical_for_dev_replace += l;
 	}
 
-	BUG_ON(sblock->page_count == 0);
+	WARN_ON(sblock->page_count == 0);
 	for (index = 0; index < sblock->page_count; index++) {
-		struct scrub_page *spage = sblock->pagev + index;
+		struct scrub_page *spage = sblock->pagev[index];
 		int ret;
 
-		ret = scrub_add_page_to_bio(sdev, spage);
+		ret = scrub_add_page_to_rd_bio(sctx, spage);
 		if (ret) {
 			scrub_block_put(sblock);
 			return ret;
@@ -1624,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
 	}
 
 	if (force)
-		scrub_submit(sdev);
+		scrub_submit(sctx);
 
 	/* last one frees, either here or in bio completion for last page */
 	scrub_block_put(sblock);
@@ -1634,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
 static void scrub_bio_end_io(struct bio *bio, int err)
 {
 	struct scrub_bio *sbio = bio->bi_private;
-	struct scrub_dev *sdev = sbio->sdev;
-	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
 
 	sbio->err = err;
 	sbio->bio = bio;
@@ -1646,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)
 static void scrub_bio_end_io_worker(struct btrfs_work *work)
 {
 	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
-	struct scrub_dev *sdev = sbio->sdev;
+	struct scrub_ctx *sctx = sbio->sctx;
 	int i;
 
-	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
+	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
 	if (sbio->err) {
 		for (i = 0; i < sbio->page_count; i++) {
 			struct scrub_page *spage = sbio->pagev[i];
@@ -1671,23 +2085,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
 
 	bio_put(sbio->bio);
 	sbio->bio = NULL;
-	spin_lock(&sdev->list_lock);
-	sbio->next_free = sdev->first_free;
-	sdev->first_free = sbio->index;
-	spin_unlock(&sdev->list_lock);
-	atomic_dec(&sdev->in_flight);
-	wake_up(&sdev->list_wait);
+	spin_lock(&sctx->list_lock);
+	sbio->next_free = sctx->first_free;
+	sctx->first_free = sbio->index;
+	spin_unlock(&sctx->list_lock);
+
+	if (sctx->is_dev_replace &&
+	    atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+		mutex_lock(&sctx->wr_ctx.wr_lock);
+		scrub_wr_submit(sctx);
+		mutex_unlock(&sctx->wr_ctx.wr_lock);
+	}
+
+	scrub_pending_bio_dec(sctx);
 }
 
 static void scrub_block_complete(struct scrub_block *sblock)
 {
-	if (!sblock->no_io_error_seen)
+	if (!sblock->no_io_error_seen) {
 		scrub_handle_errored_block(sblock);
-	else
-		scrub_checksum(sblock);
+	} else {
+		/*
+		 * in the dev-replace case, a block with a checksum error
+		 * is written to the target through the repair machinery;
+		 * a clean block is written to the target right here.
+		 */
+		if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock);
+	}
 }
 
-static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
 			   u8 *csum)
 {
 	struct btrfs_ordered_sum *sum = NULL;
@@ -1695,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
 	unsigned long i;
 	unsigned long num_sectors;
 
-	while (!list_empty(&sdev->csum_list)) {
-		sum = list_first_entry(&sdev->csum_list,
+	while (!list_empty(&sctx->csum_list)) {
+		sum = list_first_entry(&sctx->csum_list,
 				       struct btrfs_ordered_sum, list);
 		if (sum->bytenr > logical)
 			return 0;
 		if (sum->bytenr + sum->len > logical)
 			break;
 
-		++sdev->stat.csum_discards;
+		++sctx->stat.csum_discards;
 		list_del(&sum->list);
 		kfree(sum);
 		sum = NULL;
@@ -1711,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
 	if (!sum)
 		return 0;
 
-	num_sectors = sum->len / sdev->sectorsize;
+	num_sectors = sum->len / sctx->sectorsize;
 	for (i = 0; i < num_sectors; ++i) {
 		if (sum->sums[i].bytenr == logical) {
-			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
+			memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
 			ret = 1;
 			break;
 		}
@@ -1727,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
 }
 
 /* scrub extent tries to collect up to 64 kB for each bio */
-static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
-			u64 physical, u64 flags, u64 gen, int mirror_num)
+static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
+			u64 physical, struct btrfs_device *dev, u64 flags,
+			u64 gen, int mirror_num, u64 physical_for_dev_replace)
 {
 	int ret;
 	u8 csum[BTRFS_CSUM_SIZE];
 	u32 blocksize;
 
 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
-		blocksize = sdev->sectorsize;
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.data_extents_scrubbed++;
-		sdev->stat.data_bytes_scrubbed += len;
-		spin_unlock(&sdev->stat_lock);
+		blocksize = sctx->sectorsize;
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.data_extents_scrubbed++;
+		sctx->stat.data_bytes_scrubbed += len;
+		spin_unlock(&sctx->stat_lock);
 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		BUG_ON(sdev->nodesize != sdev->leafsize);
-		blocksize = sdev->nodesize;
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.tree_extents_scrubbed++;
-		sdev->stat.tree_bytes_scrubbed += len;
-		spin_unlock(&sdev->stat_lock);
+		WARN_ON(sctx->nodesize != sctx->leafsize);
+		blocksize = sctx->nodesize;
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.tree_extents_scrubbed++;
+		sctx->stat.tree_bytes_scrubbed += len;
+		spin_unlock(&sctx->stat_lock);
 	} else {
-		blocksize = sdev->sectorsize;
-		BUG_ON(1);
+		blocksize = sctx->sectorsize;
+		WARN_ON(1);
 	}
 
 	while (len) {
@@ -1758,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
 
 		if (flags & BTRFS_EXTENT_FLAG_DATA) {
 			/* push csums to sbio */
-			have_csum = scrub_find_csum(sdev, logical, l, csum);
+			have_csum = scrub_find_csum(sctx, logical, l, csum);
 			if (have_csum == 0)
-				++sdev->stat.no_csum;
+				++sctx->stat.no_csum;
+			if (sctx->is_dev_replace && !have_csum) {
+				ret = copy_nocow_pages(sctx, logical, l,
+						       mirror_num,
+						      physical_for_dev_replace);
+				goto behind_scrub_pages;
+			}
 		}
-		ret = scrub_pages(sdev, logical, l, physical, flags, gen,
-				  mirror_num, have_csum ? csum : NULL, 0);
+		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
+				  mirror_num, have_csum ? csum : NULL, 0,
				  physical_for_dev_replace);
+behind_scrub_pages:
 		if (ret)
 			return ret;
 		len -= l;
 		logical += l;
 		physical += l;
+		physical_for_dev_replace += l;
 	}
 	return 0;
 }
 
-static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
-	struct map_lookup *map, int num, u64 base, u64 length)
+static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
+					   struct map_lookup *map,
+					   struct btrfs_device *scrub_dev,
+					   int num, u64 base, u64 length,
+					   int is_dev_replace)
 {
 	struct btrfs_path *path;
-	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 	struct btrfs_root *root = fs_info->extent_root;
 	struct btrfs_root *csum_root = fs_info->csum_root;
 	struct btrfs_extent_item *extent;
@@ -1797,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 	struct reada_control *reada2;
 	struct btrfs_key key_start;
 	struct btrfs_key key_end;
-
 	u64 increment = map->stripe_len;
 	u64 offset;
+	u64 extent_logical;
+	u64 extent_physical;
+	u64 extent_len;
+	struct btrfs_device *extent_dev;
+	int extent_mirror_num;
 
 	nstripes = length;
 	offset = 0;
@@ -1843,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 	 */
 	logical = base + offset;
 
-	wait_event(sdev->list_wait,
-		   atomic_read(&sdev->in_flight) == 0);
+	wait_event(sctx->list_wait,
+		   atomic_read(&sctx->bios_in_flight) == 0);
 	atomic_inc(&fs_info->scrubs_paused);
 	wake_up(&fs_info->scrub_pause_wait);
 
@@ -1898,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 		 * canceled?
 		 */
 		if (atomic_read(&fs_info->scrub_cancel_req) ||
-		    atomic_read(&sdev->cancel_req)) {
+		    atomic_read(&sctx->cancel_req)) {
 			ret = -ECANCELED;
 			goto out;
 		}
@@ -1907,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 		 */
 		if (atomic_read(&fs_info->scrub_pause_req)) {
 			/* push queued extents */
-			scrub_submit(sdev);
-			wait_event(sdev->list_wait,
-				   atomic_read(&sdev->in_flight) == 0);
+			atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+			scrub_submit(sctx);
+			mutex_lock(&sctx->wr_ctx.wr_lock);
+			scrub_wr_submit(sctx);
+			mutex_unlock(&sctx->wr_ctx.wr_lock);
+			wait_event(sctx->list_wait,
+				   atomic_read(&sctx->bios_in_flight) == 0);
+			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
 			atomic_inc(&fs_info->scrubs_paused);
 			wake_up(&fs_info->scrub_pause_wait);
 			mutex_lock(&fs_info->scrub_lock);
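/*
 * Sketch only: with a write target in play, honoring a pause request
 * means draining two queues instead of one: the open read bio and any
 * half-filled write bio, then waiting until nothing is in flight. The
 * sequence above, gathered into one hypothetical helper.
 */
static void scrub_quiesce_for_pause(struct scrub_ctx *sctx)
{
	atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
	scrub_submit(sctx);			/* push the open read bio */
	mutex_lock(&sctx->wr_ctx.wr_lock);
	scrub_wr_submit(sctx);			/* push the open write bio */
	mutex_unlock(&sctx->wr_ctx.wr_lock);
	wait_event(sctx->list_wait,
		   atomic_read(&sctx->bios_in_flight) == 0);
	atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
}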
@@ -1926,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 
 		ret = btrfs_lookup_csums_range(csum_root, logical,
 					       logical + map->stripe_len - 1,
-					       &sdev->csum_list, 1);
+					       &sctx->csum_list, 1);
 		if (ret)
 			goto out;
 
@@ -2004,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 					key.objectid;
 			}
 
-			ret = scrub_extent(sdev, key.objectid, key.offset,
-					   key.objectid - logical + physical,
-					   flags, generation, mirror_num);
+			extent_logical = key.objectid;
+			extent_physical = key.objectid - logical + physical;
+			extent_len = key.offset;
+			extent_dev = scrub_dev;
+			extent_mirror_num = mirror_num;
+			if (is_dev_replace)
+				scrub_remap_extent(fs_info, extent_logical,
+						   extent_len, &extent_physical,
+						   &extent_dev,
+						   &extent_mirror_num);
+			ret = scrub_extent(sctx, extent_logical, extent_len,
+					   extent_physical, extent_dev, flags,
+					   generation, extent_mirror_num,
+					   key.objectid - logical + physical);
 			if (ret)
 				goto out;
 
@@ -2016,29 +2477,34 @@ next:
 		btrfs_release_path(path);
 		logical += increment;
 		physical += map->stripe_len;
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.last_physical = physical;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.last_physical = physical;
+		spin_unlock(&sctx->stat_lock);
 	}
+out:
 	/* push queued extents */
-	scrub_submit(sdev);
+	scrub_submit(sctx);
+	mutex_lock(&sctx->wr_ctx.wr_lock);
+	scrub_wr_submit(sctx);
+	mutex_unlock(&sctx->wr_ctx.wr_lock);
 
-out:
 	blk_finish_plug(&plug);
 	btrfs_free_path(path);
 	return ret < 0 ? ret : 0;
 }
 
-static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
-	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length,
-	u64 dev_offset)
+static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
+					  struct btrfs_device *scrub_dev,
+					  u64 chunk_tree, u64 chunk_objectid,
+					  u64 chunk_offset, u64 length,
+					  u64 dev_offset, int is_dev_replace)
 {
 	struct btrfs_mapping_tree *map_tree =
-		&sdev->dev->dev_root->fs_info->mapping_tree;
+		&sctx->dev_root->fs_info->mapping_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
 	int i;
-	int ret = -EINVAL;
+	int ret = 0;
 
 	read_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2055,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
 		goto out;
 
 	for (i = 0; i < map->num_stripes; ++i) {
-		if (map->stripes[i].dev == sdev->dev &&
+		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
 		    map->stripes[i].physical == dev_offset) {
-			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
+			ret = scrub_stripe(sctx, map, scrub_dev, i,
+					   chunk_offset, length,
+					   is_dev_replace);
 			if (ret)
 				goto out;
 		}
@@ -2069,11 +2537,13 @@ out:
 }
 
 static noinline_for_stack
-int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
+int scrub_enumerate_chunks(struct scrub_ctx *sctx,
+			   struct btrfs_device *scrub_dev, u64 start, u64 end,
+			   int is_dev_replace)
 {
 	struct btrfs_dev_extent *dev_extent = NULL;
 	struct btrfs_path *path;
-	struct btrfs_root *root = sdev->dev->dev_root;
+	struct btrfs_root *root = sctx->dev_root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 length;
 	u64 chunk_tree;
@@ -2085,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2085 struct btrfs_key key; 2555 struct btrfs_key key;
2086 struct btrfs_key found_key; 2556 struct btrfs_key found_key;
2087 struct btrfs_block_group_cache *cache; 2557 struct btrfs_block_group_cache *cache;
2558 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2088 2559
2089 path = btrfs_alloc_path(); 2560 path = btrfs_alloc_path();
2090 if (!path) 2561 if (!path)
@@ -2094,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2094 path->search_commit_root = 1; 2565 path->search_commit_root = 1;
2095 path->skip_locking = 1; 2566 path->skip_locking = 1;
2096 2567
2097 key.objectid = sdev->dev->devid; 2568 key.objectid = scrub_dev->devid;
2098 key.offset = 0ull; 2569 key.offset = 0ull;
2099 key.type = BTRFS_DEV_EXTENT_KEY; 2570 key.type = BTRFS_DEV_EXTENT_KEY;
2100 2571
2101
2102 while (1) { 2572 while (1) {
2103 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2573 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2104 if (ret < 0) 2574 if (ret < 0)
@@ -2117,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2117 2587
2118 btrfs_item_key_to_cpu(l, &found_key, slot); 2588 btrfs_item_key_to_cpu(l, &found_key, slot);
2119 2589
2120 if (found_key.objectid != sdev->dev->devid) 2590 if (found_key.objectid != scrub_dev->devid)
2121 break; 2591 break;
2122 2592
2123 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2593 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
@@ -2151,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2151 ret = -ENOENT; 2621 ret = -ENOENT;
2152 break; 2622 break;
2153 } 2623 }
2154 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, 2624 dev_replace->cursor_right = found_key.offset + length;
2155 chunk_offset, length, found_key.offset); 2625 dev_replace->cursor_left = found_key.offset;
2626 dev_replace->item_needs_writeback = 1;
2627 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2628 chunk_offset, length, found_key.offset,
2629 is_dev_replace);
2630
2631 /*
2632 * flush, submit all pending read and write bios, afterwards
2633 * wait for them.
2634 * Note that in the dev replace case, a read request causes
2635 * write requests that are submitted in the read completion
2636 * worker. Therefore in the current situation, it is required
2637 * that all write requests are flushed, so that all read and
2638 * write requests are really completed when bios_in_flight
2639 * changes to 0.
2640 */
2641 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2642 scrub_submit(sctx);
2643 mutex_lock(&sctx->wr_ctx.wr_lock);
2644 scrub_wr_submit(sctx);
2645 mutex_unlock(&sctx->wr_ctx.wr_lock);
2646
2647 wait_event(sctx->list_wait,
2648 atomic_read(&sctx->bios_in_flight) == 0);
2649 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2650 atomic_inc(&fs_info->scrubs_paused);
2651 wake_up(&fs_info->scrub_pause_wait);
2652 wait_event(sctx->list_wait,
2653 atomic_read(&sctx->workers_pending) == 0);
2654
2655 mutex_lock(&fs_info->scrub_lock);
2656 while (atomic_read(&fs_info->scrub_pause_req)) {
2657 mutex_unlock(&fs_info->scrub_lock);
2658 wait_event(fs_info->scrub_pause_wait,
2659 atomic_read(&fs_info->scrub_pause_req) == 0);
2660 mutex_lock(&fs_info->scrub_lock);
2661 }
2662 atomic_dec(&fs_info->scrubs_paused);
2663 mutex_unlock(&fs_info->scrub_lock);
2664 wake_up(&fs_info->scrub_pause_wait);
2665
2666 dev_replace->cursor_left = dev_replace->cursor_right;
2667 dev_replace->item_needs_writeback = 1;
2156 btrfs_put_block_group(cache); 2668 btrfs_put_block_group(cache);
2157 if (ret) 2669 if (ret)
2158 break; 2670 break;
2671 if (is_dev_replace &&
2672 atomic64_read(&dev_replace->num_write_errors) > 0) {
2673 ret = -EIO;
2674 break;
2675 }
2676 if (sctx->stat.malloc_errors > 0) {
2677 ret = -ENOMEM;
2678 break;
2679 }
2159 2680
2160 key.offset = found_key.offset + length; 2681 key.offset = found_key.offset + length;
2161 btrfs_release_path(path); 2682 btrfs_release_path(path);
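
The long comment in this hunk states the central ordering rule of the dev-replace series: a read completion may itself submit a write, so bios_in_flight can only reach zero after the write side has been force-flushed too, which is what the flush_all_writes window around scrub_submit()/scrub_wr_submit() arranges. A sketch of that drain barrier, with stub submit functions in place of the real ones (illustrative only, not the kernel implementation):

    #include <pthread.h>
    #include <stdatomic.h>

    static pthread_mutex_t wr_lock = PTHREAD_MUTEX_INITIALIZER;
    static atomic_int bios_in_flight;
    static atomic_int flush_all_writes;

    static void submit_queued_reads(void)  { /* kick pending read bios  */ }
    static void submit_queued_writes(void) { /* kick pending write bios */ }

    static void drain_all_io(void)
    {
            /* read completions must now submit their writes immediately */
            atomic_store(&flush_all_writes, 1);
            submit_queued_reads();
            pthread_mutex_lock(&wr_lock);
            submit_queued_writes();
            pthread_mutex_unlock(&wr_lock);
            /* models wait_event(sctx->list_wait, bios_in_flight == 0) */
            while (atomic_load(&bios_in_flight) != 0)
                    ;       /* the real code sleeps on a wait queue here */
            atomic_store(&flush_all_writes, 0);
    }
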
@@ -2170,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
2170 return ret < 0 ? ret : 0; 2691 return ret < 0 ? ret : 0;
2171} 2692}
2172 2693
2173static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) 2694static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2695 struct btrfs_device *scrub_dev)
2174{ 2696{
2175 int i; 2697 int i;
2176 u64 bytenr; 2698 u64 bytenr;
2177 u64 gen; 2699 u64 gen;
2178 int ret; 2700 int ret;
2179 struct btrfs_device *device = sdev->dev; 2701 struct btrfs_root *root = sctx->dev_root;
2180 struct btrfs_root *root = device->dev_root;
2181 2702
2182 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 2703 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2183 return -EIO; 2704 return -EIO;
@@ -2186,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2186 2707
2187 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2708 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2188 bytenr = btrfs_sb_offset(i); 2709 bytenr = btrfs_sb_offset(i);
2189 if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) 2710 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2190 break; 2711 break;
2191 2712
2192 ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2713 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2193 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1); 2714 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2715 NULL, 1, bytenr);
2194 if (ret) 2716 if (ret)
2195 return ret; 2717 return ret;
2196 } 2718 }
2197 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2719 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2198 2720
2199 return 0; 2721 return 0;
2200} 2722}
@@ -2202,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
2202/* 2724/*
2203 * get a reference count on fs_info->scrub_workers. start worker if necessary 2725 * get a reference count on fs_info->scrub_workers. start worker if necessary
2204 */ 2726 */
2205static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) 2727static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2728 int is_dev_replace)
2206{ 2729{
2207 struct btrfs_fs_info *fs_info = root->fs_info;
2208 int ret = 0; 2730 int ret = 0;
2209 2731
2210 mutex_lock(&fs_info->scrub_lock); 2732 mutex_lock(&fs_info->scrub_lock);
2211 if (fs_info->scrub_workers_refcnt == 0) { 2733 if (fs_info->scrub_workers_refcnt == 0) {
2212 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2734 if (is_dev_replace)
2213 fs_info->thread_pool_size, &fs_info->generic_worker); 2735 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2736 &fs_info->generic_worker);
2737 else
2738 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2739 fs_info->thread_pool_size,
2740 &fs_info->generic_worker);
2214 fs_info->scrub_workers.idle_thresh = 4; 2741 fs_info->scrub_workers.idle_thresh = 4;
2215 ret = btrfs_start_workers(&fs_info->scrub_workers); 2742 ret = btrfs_start_workers(&fs_info->scrub_workers);
2216 if (ret) 2743 if (ret)
2217 goto out; 2744 goto out;
2745 btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2746 "scrubwrc",
2747 fs_info->thread_pool_size,
2748 &fs_info->generic_worker);
2749 fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2750 ret = btrfs_start_workers(
2751 &fs_info->scrub_wr_completion_workers);
2752 if (ret)
2753 goto out;
2754 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2755 &fs_info->generic_worker);
2756 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2757 if (ret)
2758 goto out;
2218 } 2759 }
2219 ++fs_info->scrub_workers_refcnt; 2760 ++fs_info->scrub_workers_refcnt;
2220out: 2761out:
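
scrub_workers_get()/scrub_workers_put() now manage three pools ("scrub", "scrubwrc", "scrubnc") behind a single refcount: the first caller brings them all up, the last caller tears them all down, and fs_info->scrub_lock serializes both transitions. The pattern in plain C, with stub start/stop helpers (illustrative names):

    #include <pthread.h>

    static pthread_mutex_t scrub_lock = PTHREAD_MUTEX_INITIALIZER;
    static int workers_refcnt;

    static void start_pools(void) { /* init + start all three pools */ }
    static void stop_pools(void)  { /* stop all three pools         */ }

    static void workers_get(void)
    {
            pthread_mutex_lock(&scrub_lock);
            if (workers_refcnt++ == 0)
                    start_pools();  /* first user brings the pools up */
            pthread_mutex_unlock(&scrub_lock);
    }

    static void workers_put(void)
    {
            pthread_mutex_lock(&scrub_lock);
            if (--workers_refcnt == 0)
                    stop_pools();   /* last user tears them down */
            pthread_mutex_unlock(&scrub_lock);
    }
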
@@ -2223,40 +2764,41 @@ out:
2223 return ret; 2764 return ret;
2224} 2765}
2225 2766
2226static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) 2767static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2227{ 2768{
2228 struct btrfs_fs_info *fs_info = root->fs_info;
2229
2230 mutex_lock(&fs_info->scrub_lock); 2769 mutex_lock(&fs_info->scrub_lock);
2231 if (--fs_info->scrub_workers_refcnt == 0) 2770 if (--fs_info->scrub_workers_refcnt == 0) {
2232 btrfs_stop_workers(&fs_info->scrub_workers); 2771 btrfs_stop_workers(&fs_info->scrub_workers);
2772 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2773 btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2774 }
2233 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2775 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2234 mutex_unlock(&fs_info->scrub_lock); 2776 mutex_unlock(&fs_info->scrub_lock);
2235} 2777}
2236 2778
2237 2779int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2238int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, 2780 u64 end, struct btrfs_scrub_progress *progress,
2239 struct btrfs_scrub_progress *progress, int readonly) 2781 int readonly, int is_dev_replace)
2240{ 2782{
2241 struct scrub_dev *sdev; 2783 struct scrub_ctx *sctx;
2242 struct btrfs_fs_info *fs_info = root->fs_info;
2243 int ret; 2784 int ret;
2244 struct btrfs_device *dev; 2785 struct btrfs_device *dev;
2245 2786
2246 if (btrfs_fs_closing(root->fs_info)) 2787 if (btrfs_fs_closing(fs_info))
2247 return -EINVAL; 2788 return -EINVAL;
2248 2789
2249 /* 2790 /*
2250 * check some assumptions 2791 * check some assumptions
2251 */ 2792 */
2252 if (root->nodesize != root->leafsize) { 2793 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2253 printk(KERN_ERR 2794 printk(KERN_ERR
2254 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n", 2795 "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2255 root->nodesize, root->leafsize); 2796 fs_info->chunk_root->nodesize,
2797 fs_info->chunk_root->leafsize);
2256 return -EINVAL; 2798 return -EINVAL;
2257 } 2799 }
2258 2800
2259 if (root->nodesize > BTRFS_STRIPE_LEN) { 2801 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2260 /* 2802 /*
2261 * in this case scrub is unable to calculate the checksum 2803 * in this case scrub is unable to calculate the checksum
2262 * the way scrub is implemented. Do not handle this 2804 * the way scrub is implemented. Do not handle this
@@ -2264,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2264 */ 2806 */
2265 printk(KERN_ERR 2807 printk(KERN_ERR
2266 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n", 2808 "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2267 root->nodesize, BTRFS_STRIPE_LEN); 2809 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2268 return -EINVAL; 2810 return -EINVAL;
2269 } 2811 }
2270 2812
2271 if (root->sectorsize != PAGE_SIZE) { 2813 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2272 /* not supported for data w/o checksums */ 2814 /* not supported for data w/o checksums */
2273 printk(KERN_ERR 2815 printk(KERN_ERR
2274 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", 2816 "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2275 root->sectorsize, (unsigned long long)PAGE_SIZE); 2817 fs_info->chunk_root->sectorsize,
2818 (unsigned long long)PAGE_SIZE);
2276 return -EINVAL; 2819 return -EINVAL;
2277 } 2820 }
2278 2821
2279 ret = scrub_workers_get(root); 2822 if (fs_info->chunk_root->nodesize >
2823 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2824 fs_info->chunk_root->sectorsize >
2825 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2826 /*
2827 * would exhaust the array bounds of pagev member in
2828 * struct scrub_block
2829 */
2830 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2831 fs_info->chunk_root->nodesize,
2832 SCRUB_MAX_PAGES_PER_BLOCK,
2833 fs_info->chunk_root->sectorsize,
2834 SCRUB_MAX_PAGES_PER_BLOCK);
2835 return -EINVAL;
2836 }
2837
2838 ret = scrub_workers_get(fs_info, is_dev_replace);
2280 if (ret) 2839 if (ret)
2281 return ret; 2840 return ret;
2282 2841
2283 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2842 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2284 dev = btrfs_find_device(root, devid, NULL, NULL); 2843 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2285 if (!dev || dev->missing) { 2844 if (!dev || (dev->missing && !is_dev_replace)) {
2286 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2845 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2287 scrub_workers_put(root); 2846 scrub_workers_put(fs_info);
2288 return -ENODEV; 2847 return -ENODEV;
2289 } 2848 }
2290 mutex_lock(&fs_info->scrub_lock); 2849 mutex_lock(&fs_info->scrub_lock);
2291 2850
2292 if (!dev->in_fs_metadata) { 2851 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2293 mutex_unlock(&fs_info->scrub_lock); 2852 mutex_unlock(&fs_info->scrub_lock);
2294 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2853 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2295 scrub_workers_put(root); 2854 scrub_workers_put(fs_info);
2296 return -ENODEV; 2855 return -EIO;
2297 } 2856 }
2298 2857
2299 if (dev->scrub_device) { 2858 btrfs_dev_replace_lock(&fs_info->dev_replace);
2859 if (dev->scrub_device ||
2860 (!is_dev_replace &&
2861 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2862 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2300 mutex_unlock(&fs_info->scrub_lock); 2863 mutex_unlock(&fs_info->scrub_lock);
2301 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2864 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2302 scrub_workers_put(root); 2865 scrub_workers_put(fs_info);
2303 return -EINPROGRESS; 2866 return -EINPROGRESS;
2304 } 2867 }
2305 sdev = scrub_setup_dev(dev); 2868 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2306 if (IS_ERR(sdev)) { 2869 sctx = scrub_setup_ctx(dev, is_dev_replace);
2870 if (IS_ERR(sctx)) {
2307 mutex_unlock(&fs_info->scrub_lock); 2871 mutex_unlock(&fs_info->scrub_lock);
2308 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2872 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2309 scrub_workers_put(root); 2873 scrub_workers_put(fs_info);
2310 return PTR_ERR(sdev); 2874 return PTR_ERR(sctx);
2311 } 2875 }
2312 sdev->readonly = readonly; 2876 sctx->readonly = readonly;
2313 dev->scrub_device = sdev; 2877 dev->scrub_device = sctx;
2314 2878
2315 atomic_inc(&fs_info->scrubs_running); 2879 atomic_inc(&fs_info->scrubs_running);
2316 mutex_unlock(&fs_info->scrub_lock); 2880 mutex_unlock(&fs_info->scrub_lock);
2317 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2881 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2318 2882
2319 down_read(&fs_info->scrub_super_lock); 2883 if (!is_dev_replace) {
2320 ret = scrub_supers(sdev); 2884 down_read(&fs_info->scrub_super_lock);
2321 up_read(&fs_info->scrub_super_lock); 2885 ret = scrub_supers(sctx, dev);
2886 up_read(&fs_info->scrub_super_lock);
2887 }
2322 2888
2323 if (!ret) 2889 if (!ret)
2324 ret = scrub_enumerate_chunks(sdev, start, end); 2890 ret = scrub_enumerate_chunks(sctx, dev, start, end,
2891 is_dev_replace);
2325 2892
2326 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 2893 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2327 atomic_dec(&fs_info->scrubs_running); 2894 atomic_dec(&fs_info->scrubs_running);
2328 wake_up(&fs_info->scrub_pause_wait); 2895 wake_up(&fs_info->scrub_pause_wait);
2329 2896
2330 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); 2897 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2331 2898
2332 if (progress) 2899 if (progress)
2333 memcpy(progress, &sdev->stat, sizeof(*progress)); 2900 memcpy(progress, &sctx->stat, sizeof(*progress));
2334 2901
2335 mutex_lock(&fs_info->scrub_lock); 2902 mutex_lock(&fs_info->scrub_lock);
2336 dev->scrub_device = NULL; 2903 dev->scrub_device = NULL;
2337 mutex_unlock(&fs_info->scrub_lock); 2904 mutex_unlock(&fs_info->scrub_lock);
2338 2905
2339 scrub_free_dev(sdev); 2906 scrub_free_ctx(sctx);
2340 scrub_workers_put(root); 2907 scrub_workers_put(fs_info);
2341 2908
2342 return ret; 2909 return ret;
2343} 2910}
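
The new size check added to btrfs_scrub_dev() guards the pagev array in struct scrub_block: a tree block of nodesize bytes occupies nodesize / PAGE_SIZE page slots, and that quotient must not exceed the fixed array length. Worked numbers (the SCRUB_MAX_PAGES_PER_BLOCK value below is assumed for illustration only):

    #include <stdio.h>

    #define PAGE_SZ 4096
    #define MAX_PAGES_PER_BLOCK 16  /* assumed stand-in for SCRUB_MAX_PAGES_PER_BLOCK */

    int main(void)
    {
            int nodesize = 16384;   /* hypothetical tree block size */
            int pages = nodesize / PAGE_SZ;

            /* scrub_block keeps one page pointer per page of the block */
            if (pages > MAX_PAGES_PER_BLOCK)
                    printf("nodesize %d would overflow pagev[]\n", nodesize);
            else
                    printf("block uses %d of %d page slots\n",
                           pages, MAX_PAGES_PER_BLOCK);
            return 0;
    }
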
@@ -2377,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)
2377 up_write(&root->fs_info->scrub_super_lock); 2944 up_write(&root->fs_info->scrub_super_lock);
2378} 2945}
2379 2946
2380int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 2947int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2381{ 2948{
2382
2383 mutex_lock(&fs_info->scrub_lock); 2949 mutex_lock(&fs_info->scrub_lock);
2384 if (!atomic_read(&fs_info->scrubs_running)) { 2950 if (!atomic_read(&fs_info->scrubs_running)) {
2385 mutex_unlock(&fs_info->scrub_lock); 2951 mutex_unlock(&fs_info->scrub_lock);
@@ -2399,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2399 return 0; 2965 return 0;
2400} 2966}
2401 2967
2402int btrfs_scrub_cancel(struct btrfs_root *root) 2968int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2969 struct btrfs_device *dev)
2403{ 2970{
2404 return __btrfs_scrub_cancel(root->fs_info); 2971 struct scrub_ctx *sctx;
2405}
2406
2407int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
2408{
2409 struct btrfs_fs_info *fs_info = root->fs_info;
2410 struct scrub_dev *sdev;
2411 2972
2412 mutex_lock(&fs_info->scrub_lock); 2973 mutex_lock(&fs_info->scrub_lock);
2413 sdev = dev->scrub_device; 2974 sctx = dev->scrub_device;
2414 if (!sdev) { 2975 if (!sctx) {
2415 mutex_unlock(&fs_info->scrub_lock); 2976 mutex_unlock(&fs_info->scrub_lock);
2416 return -ENOTCONN; 2977 return -ENOTCONN;
2417 } 2978 }
2418 atomic_inc(&sdev->cancel_req); 2979 atomic_inc(&sctx->cancel_req);
2419 while (dev->scrub_device) { 2980 while (dev->scrub_device) {
2420 mutex_unlock(&fs_info->scrub_lock); 2981 mutex_unlock(&fs_info->scrub_lock);
2421 wait_event(fs_info->scrub_pause_wait, 2982 wait_event(fs_info->scrub_pause_wait,
@@ -2438,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2438 * does not go away in cancel_dev. FIXME: find a better solution 2999 * does not go away in cancel_dev. FIXME: find a better solution
2439 */ 3000 */
2440 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3001 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2441 dev = btrfs_find_device(root, devid, NULL, NULL); 3002 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2442 if (!dev) { 3003 if (!dev) {
2443 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3004 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2444 return -ENODEV; 3005 return -ENODEV;
2445 } 3006 }
2446 ret = btrfs_scrub_cancel_dev(root, dev); 3007 ret = btrfs_scrub_cancel_dev(fs_info, dev);
2447 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3008 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2448 3009
2449 return ret; 3010 return ret;
@@ -2453,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2453 struct btrfs_scrub_progress *progress) 3014 struct btrfs_scrub_progress *progress)
2454{ 3015{
2455 struct btrfs_device *dev; 3016 struct btrfs_device *dev;
2456 struct scrub_dev *sdev = NULL; 3017 struct scrub_ctx *sctx = NULL;
2457 3018
2458 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3019 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2459 dev = btrfs_find_device(root, devid, NULL, NULL); 3020 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
2460 if (dev) 3021 if (dev)
2461 sdev = dev->scrub_device; 3022 sctx = dev->scrub_device;
2462 if (sdev) 3023 if (sctx)
2463 memcpy(progress, &sdev->stat, sizeof(*progress)); 3024 memcpy(progress, &sctx->stat, sizeof(*progress));
2464 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3025 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2465 3026
2466 return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV; 3027 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3028}
3029
3030static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3031 u64 extent_logical, u64 extent_len,
3032 u64 *extent_physical,
3033 struct btrfs_device **extent_dev,
3034 int *extent_mirror_num)
3035{
3036 u64 mapped_length;
3037 struct btrfs_bio *bbio = NULL;
3038 int ret;
3039
3040 mapped_length = extent_len;
3041 ret = btrfs_map_block(fs_info, READ, extent_logical,
3042 &mapped_length, &bbio, 0);
3043 if (ret || !bbio || mapped_length < extent_len ||
3044 !bbio->stripes[0].dev->bdev) {
3045 kfree(bbio);
3046 return;
3047 }
3048
3049 *extent_physical = bbio->stripes[0].physical;
3050 *extent_mirror_num = bbio->mirror_num;
3051 *extent_dev = bbio->stripes[0].dev;
3052 kfree(bbio);
3053}
3054
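
scrub_remap_extent() uses a fill-defaults-then-override convention: its caller in scrub_stripe() preloads extent_physical, extent_dev and extent_mirror_num with the plain scrub values, and the remap overwrites them only when btrfs_map_block() returns a usable first stripe; on any failure it returns silently and the defaults stand. The out-parameter discipline in miniature (hypothetical names throughout):

    #include <stdint.h>

    struct mapping { uint64_t physical; int mirror; };

    /* toy stand-in for btrfs_map_block(): a 1:1 map on mirror 1 */
    static int map_first_stripe(uint64_t logical, struct mapping *m)
    {
            m->physical = logical;
            m->mirror = 1;
            return 0;
    }

    static void remap(uint64_t logical, uint64_t *physical, int *mirror)
    {
            struct mapping m;

            if (map_first_stripe(logical, &m))
                    return;          /* failure: caller's defaults survive */
            *physical = m.physical;  /* success: override in place */
            *mirror = m.mirror;
    }
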
3055static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3056 struct scrub_wr_ctx *wr_ctx,
3057 struct btrfs_fs_info *fs_info,
3058 struct btrfs_device *dev,
3059 int is_dev_replace)
3060{
3061 WARN_ON(wr_ctx->wr_curr_bio != NULL);
3062
3063 mutex_init(&wr_ctx->wr_lock);
3064 wr_ctx->wr_curr_bio = NULL;
3065 if (!is_dev_replace)
3066 return 0;
3067
3068 WARN_ON(!dev->bdev);
3069 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3070 bio_get_nr_vecs(dev->bdev));
3071 wr_ctx->tgtdev = dev;
3072 atomic_set(&wr_ctx->flush_all_writes, 0);
3073 return 0;
3074}
3075
3076static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3077{
3078 mutex_lock(&wr_ctx->wr_lock);
3079 kfree(wr_ctx->wr_curr_bio);
3080 wr_ctx->wr_curr_bio = NULL;
3081 mutex_unlock(&wr_ctx->wr_lock);
3082}
3083
3084static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3085 int mirror_num, u64 physical_for_dev_replace)
3086{
3087 struct scrub_copy_nocow_ctx *nocow_ctx;
3088 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3089
3090 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3091 if (!nocow_ctx) {
3092 spin_lock(&sctx->stat_lock);
3093 sctx->stat.malloc_errors++;
3094 spin_unlock(&sctx->stat_lock);
3095 return -ENOMEM;
3096 }
3097
3098 scrub_pending_trans_workers_inc(sctx);
3099
3100 nocow_ctx->sctx = sctx;
3101 nocow_ctx->logical = logical;
3102 nocow_ctx->len = len;
3103 nocow_ctx->mirror_num = mirror_num;
3104 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3105 nocow_ctx->work.func = copy_nocow_pages_worker;
3106 btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3107 &nocow_ctx->work);
3108
3109 return 0;
3110}
3111
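
copy_nocow_pages() only packages the request: it takes a workers_pending reference through scrub_pending_trans_workers_inc(), fills the context struct, and queues copy_nocow_pages_worker() on the scrubnc pool, so the expensive inode iteration never runs in the I/O completion path. The queue-and-count shape, modeled with a plain atomic counter (illustrative):

    #include <stdatomic.h>
    #include <stdlib.h>

    static atomic_int workers_pending;

    struct nocow_ctx { long logical; long len; };

    /* hand off to a worker pool; the worker frees ctx and drops the pin */
    static void queue_work(struct nocow_ctx *ctx) { (void)ctx; }

    static int defer_copy(long logical, long len)
    {
            struct nocow_ctx *ctx = calloc(1, sizeof(*ctx));

            if (!ctx)
                    return -1;                      /* counted as a malloc error */
            atomic_fetch_add(&workers_pending, 1);  /* scrub must wait for us */
            ctx->logical = logical;
            ctx->len = len;
            queue_work(ctx);
            return 0;
    }
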
3112static void copy_nocow_pages_worker(struct btrfs_work *work)
3113{
3114 struct scrub_copy_nocow_ctx *nocow_ctx =
3115 container_of(work, struct scrub_copy_nocow_ctx, work);
3116 struct scrub_ctx *sctx = nocow_ctx->sctx;
3117 u64 logical = nocow_ctx->logical;
3118 u64 len = nocow_ctx->len;
3119 int mirror_num = nocow_ctx->mirror_num;
3120 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3121 int ret;
3122 struct btrfs_trans_handle *trans = NULL;
3123 struct btrfs_fs_info *fs_info;
3124 struct btrfs_path *path;
3125 struct btrfs_root *root;
3126 int not_written = 0;
3127
3128 fs_info = sctx->dev_root->fs_info;
3129 root = fs_info->extent_root;
3130
3131 path = btrfs_alloc_path();
3132 if (!path) {
3133 spin_lock(&sctx->stat_lock);
3134 sctx->stat.malloc_errors++;
3135 spin_unlock(&sctx->stat_lock);
3136 not_written = 1;
3137 goto out;
3138 }
3139
3140 trans = btrfs_join_transaction(root);
3141 if (IS_ERR(trans)) {
3142 not_written = 1;
3143 goto out;
3144 }
3145
3146 ret = iterate_inodes_from_logical(logical, fs_info, path,
3147 copy_nocow_pages_for_inode,
3148 nocow_ctx);
3149 if (ret != 0 && ret != -ENOENT) {
3150 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3151 (unsigned long long)logical,
3152 (unsigned long long)physical_for_dev_replace,
3153 (unsigned long long)len,
3154 (unsigned long long)mirror_num, ret);
3155 not_written = 1;
3156 goto out;
3157 }
3158
3159out:
3160 if (trans && !IS_ERR(trans))
3161 btrfs_end_transaction(trans, root);
3162 if (not_written)
3163 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3164 num_uncorrectable_read_errors);
3165
3166 btrfs_free_path(path);
3167 kfree(nocow_ctx);
3168
3169 scrub_pending_trans_workers_dec(sctx);
3170}
3171
3172static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3173{
3174 unsigned long index;
3175 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3176 int ret = 0;
3177 struct btrfs_key key;
3178 struct inode *inode = NULL;
3179 struct btrfs_root *local_root;
3180 u64 physical_for_dev_replace;
3181 u64 len;
3182 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3183
3184 key.objectid = root;
3185 key.type = BTRFS_ROOT_ITEM_KEY;
3186 key.offset = (u64)-1;
3187 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3188 if (IS_ERR(local_root))
3189 return PTR_ERR(local_root);
3190
3191 key.type = BTRFS_INODE_ITEM_KEY;
3192 key.objectid = inum;
3193 key.offset = 0;
3194 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3195 if (IS_ERR(inode))
3196 return PTR_ERR(inode);
3197
3198 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3199 len = nocow_ctx->len;
3200 while (len >= PAGE_CACHE_SIZE) {
3201 struct page *page = NULL;
3202 int ret_sub;
3203
3204 index = offset >> PAGE_CACHE_SHIFT;
3205
3206 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3207 if (!page) {
3208 pr_err("find_or_create_page() failed\n");
3209 ret = -ENOMEM;
3210 goto next_page;
3211 }
3212
3213 if (PageUptodate(page)) {
3214 if (PageDirty(page))
3215 goto next_page;
3216 } else {
3217 ClearPageError(page);
3218 ret_sub = extent_read_full_page(&BTRFS_I(inode)->
3219 io_tree,
3220 page, btrfs_get_extent,
3221 nocow_ctx->mirror_num);
3222 if (ret_sub) {
3223 ret = ret_sub;
3224 goto next_page;
3225 }
3226 wait_on_page_locked(page);
3227 if (!PageUptodate(page)) {
3228 ret = -EIO;
3229 goto next_page;
3230 }
3231 }
3232 ret_sub = write_page_nocow(nocow_ctx->sctx,
3233 physical_for_dev_replace, page);
3234 if (ret_sub) {
3235 ret = ret_sub;
3236 goto next_page;
3237 }
3238
3239next_page:
3240 if (page) {
3241 unlock_page(page);
3242 put_page(page);
3243 }
3244 offset += PAGE_CACHE_SIZE;
3245 physical_for_dev_replace += PAGE_CACHE_SIZE;
3246 len -= PAGE_CACHE_SIZE;
3247 }
3248
3249 if (inode)
3250 iput(inode);
3251 return ret;
3252}
3253
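
copy_nocow_pages_for_inode() walks the range page by page through the page cache: a page that is already uptodate and dirty is skipped (writeback will reach the new disk anyway), a stale page is read first via extent_read_full_page(), and a per-page failure is recorded in ret while the loop presses on to the next page. A userspace analogue of that keep-going copy loop, with stdio standing in for the page cache:

    #include <stdio.h>

    #define PAGE 4096

    static int copy_range(FILE *src, FILE *dst, long off, long len)
    {
            char buf[PAGE];
            int ret = 0;

            while (len >= PAGE) {
                    if (fseek(src, off, SEEK_SET) != 0 ||
                        fread(buf, 1, PAGE, src) != (size_t)PAGE) {
                            ret = -1;       /* remember the error ... */
                            goto next_page; /* ... but keep copying   */
                    }
                    if (fseek(dst, off, SEEK_SET) != 0 ||
                        fwrite(buf, 1, PAGE, dst) != (size_t)PAGE)
                            ret = -1;
    next_page:
                    off += PAGE;
                    len -= PAGE;
            }
            return ret;
    }
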
3254static int write_page_nocow(struct scrub_ctx *sctx,
3255 u64 physical_for_dev_replace, struct page *page)
3256{
3257 struct bio *bio;
3258 struct btrfs_device *dev;
3259 int ret;
3260 DECLARE_COMPLETION_ONSTACK(compl);
3261
3262 dev = sctx->wr_ctx.tgtdev;
3263 if (!dev)
3264 return -EIO;
3265 if (!dev->bdev) {
3266 printk_ratelimited(KERN_WARNING
3267 "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3268 return -EIO;
3269 }
3270 bio = bio_alloc(GFP_NOFS, 1);
3271 if (!bio) {
3272 spin_lock(&sctx->stat_lock);
3273 sctx->stat.malloc_errors++;
3274 spin_unlock(&sctx->stat_lock);
3275 return -ENOMEM;
3276 }
3277 bio->bi_private = &compl;
3278 bio->bi_end_io = scrub_complete_bio_end_io;
3279 bio->bi_size = 0;
3280 bio->bi_sector = physical_for_dev_replace >> 9;
3281 bio->bi_bdev = dev->bdev;
3282 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3283 if (ret != PAGE_CACHE_SIZE) {
3284leave_with_eio:
3285 bio_put(bio);
3286 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3287 return -EIO;
3288 }
3289 btrfsic_submit_bio(WRITE_SYNC, bio);
3290 wait_for_completion(&compl);
3291
3292 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3293 goto leave_with_eio;
3294
3295 bio_put(bio);
3296 return 0;
2467} 3297}
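
write_page_nocow() is deliberately synchronous: it builds a one-page bio, points bi_private at a completion declared on the caller's stack, submits with WRITE_SYNC and sleeps in wait_for_completion() until the end_io callback signals. The completion primitive itself reduces to a flag behind a mutex/condvar pair, roughly (userspace model, not the kernel's struct completion):

    #include <pthread.h>

    struct completion {
            pthread_mutex_t lock;   /* init with PTHREAD_MUTEX_INITIALIZER */
            pthread_cond_t  cond;   /* init with PTHREAD_COND_INITIALIZER  */
            int done;
    };

    /* runs in the I/O completion path (models bio->bi_end_io) */
    static void complete_io(struct completion *c)
    {
            pthread_mutex_lock(&c->lock);
            c->done = 1;
            pthread_cond_signal(&c->cond);
            pthread_mutex_unlock(&c->lock);
    }

    /* runs in the submitter; returns only after complete_io() */
    static void wait_for_done(struct completion *c)
    {
            pthread_mutex_lock(&c->lock);
            while (!c->done)
                    pthread_cond_wait(&c->cond, &c->lock);
            pthread_mutex_unlock(&c->lock);
    }
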
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e78b297b0b00..321b7fb4e441 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx,
1814 (unsigned long)nce->ino); 1814 (unsigned long)nce->ino);
1815 if (!nce_head) { 1815 if (!nce_head) {
1816 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); 1816 nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
1817 if (!nce_head) 1817 if (!nce_head) {
1818 kfree(nce);
1818 return -ENOMEM; 1819 return -ENOMEM;
1820 }
1819 INIT_LIST_HEAD(nce_head); 1821 INIT_LIST_HEAD(nce_head);
1820 1822
1821 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); 1823 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
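
This send.c hunk is a classic error-path ownership fix: name_cache_insert() already owns nce at this point, so returning -ENOMEM on a failed nce_head allocation without freeing it leaked one entry per failure. Reduced to its minimal shape:

    #include <stdlib.h>

    static int insert_pair(void)
    {
            void *nce = malloc(32);  /* owned here until handed off */
            void *head;

            if (!nce)
                    return -1;
            head = malloc(32);
            if (!head) {
                    free(nce);       /* the one line the patch adds */
                    return -1;
            }
            /* ... link nce into head and hand both off ... */
            free(head);
            free(nce);
            return 0;
    }
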
@@ -4397,9 +4399,9 @@ static int full_send_tree(struct send_ctx *sctx)
4397 if (!path) 4399 if (!path)
4398 return -ENOMEM; 4400 return -ENOMEM;
4399 4401
4400 spin_lock(&send_root->root_times_lock); 4402 spin_lock(&send_root->root_item_lock);
4401 start_ctransid = btrfs_root_ctransid(&send_root->root_item); 4403 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
4402 spin_unlock(&send_root->root_times_lock); 4404 spin_unlock(&send_root->root_item_lock);
4403 4405
4404 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 4406 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
4405 key.type = BTRFS_INODE_ITEM_KEY; 4407 key.type = BTRFS_INODE_ITEM_KEY;
@@ -4422,9 +4424,9 @@ join_trans:
4422 * Make sure the tree has not changed after re-joining. We detect this 4424 * Make sure the tree has not changed after re-joining. We detect this
4423 * by comparing start_ctransid and ctransid. They should always match. 4425 * by comparing start_ctransid and ctransid. They should always match.
4424 */ 4426 */
4425 spin_lock(&send_root->root_times_lock); 4427 spin_lock(&send_root->root_item_lock);
4426 ctransid = btrfs_root_ctransid(&send_root->root_item); 4428 ctransid = btrfs_root_ctransid(&send_root->root_item);
4427 spin_unlock(&send_root->root_times_lock); 4429 spin_unlock(&send_root->root_item_lock);
4428 4430
4429 if (ctransid != start_ctransid) { 4431 if (ctransid != start_ctransid) {
4430 WARN(1, KERN_WARNING "btrfs: the root that you're trying to " 4432 WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 915ac14c2064..d8982e9601d3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,6 +55,7 @@
55#include "export.h" 55#include "export.h"
56#include "compression.h" 56#include "compression.h"
57#include "rcu-string.h" 57#include "rcu-string.h"
58#include "dev-replace.h"
58 59
59#define CREATE_TRACE_POINTS 60#define CREATE_TRACE_POINTS
60#include <trace/events/btrfs.h> 61#include <trace/events/btrfs.h>
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
116 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { 117 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
117 sb->s_flags |= MS_RDONLY; 118 sb->s_flags |= MS_RDONLY;
118 printk(KERN_INFO "btrfs is forced readonly\n"); 119 printk(KERN_INFO "btrfs is forced readonly\n");
119 __btrfs_scrub_cancel(fs_info); 120 /*
121 * Note that a running device replace operation is not
122 * canceled here although there is no way to update
123 * the progress. It would add the risk of a deadlock,
124 * therefore the canceling is omitted. The only penalty
125 * is that some I/O remains active until the procedure
126 * completes. The next time when the filesystem is
127 * mounted writeable again, the device replace
128 * operation continues.
129 */
120// WARN_ON(1); 130// WARN_ON(1);
121 } 131 }
122} 132}
@@ -257,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
257 function, line, errstr); 267 function, line, errstr);
258 return; 268 return;
259 } 269 }
260 trans->transaction->aborted = errno; 270 ACCESS_ONCE(trans->transaction->aborted) = errno;
261 __btrfs_std_error(root->fs_info, function, line, errno, NULL); 271 __btrfs_std_error(root->fs_info, function, line, errno, NULL);
262} 272}
263/* 273/*
@@ -1186,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1186 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1196 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
1187 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1197 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
1188 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1198 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
1189 btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); 1199 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
1200 new_pool_size);
1190} 1201}
1191 1202
1192static int btrfs_remount(struct super_block *sb, int *flags, char *data) 1203static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1215,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1215 return 0; 1226 return 0;
1216 1227
1217 if (*flags & MS_RDONLY) { 1228 if (*flags & MS_RDONLY) {
1229 /*
1230 * this also happens on 'umount -rf' or on shutdown, when
1231 * the filesystem is busy.
1232 */
1218 sb->s_flags |= MS_RDONLY; 1233 sb->s_flags |= MS_RDONLY;
1219 1234
1235 btrfs_dev_replace_suspend_for_unmount(fs_info);
1236 btrfs_scrub_cancel(fs_info);
1237
1220 ret = btrfs_commit_super(root); 1238 ret = btrfs_commit_super(root);
1221 if (ret) 1239 if (ret)
1222 goto restore; 1240 goto restore;
@@ -1226,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1226 goto restore; 1244 goto restore;
1227 } 1245 }
1228 1246
1247 if (fs_info->fs_devices->missing_devices >
1248 fs_info->num_tolerated_disk_barrier_failures &&
1249 !(*flags & MS_RDONLY)) {
1250 printk(KERN_WARNING
1251 "Btrfs: too many missing devices, writeable remount is not allowed\n");
1252 ret = -EACCES;
1253 goto restore;
1254 }
1255
1229 if (btrfs_super_log_root(fs_info->super_copy) != 0) { 1256 if (btrfs_super_log_root(fs_info->super_copy) != 0) {
1230 ret = -EINVAL; 1257 ret = -EINVAL;
1231 goto restore; 1258 goto restore;
@@ -1244,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1244 if (ret) 1271 if (ret)
1245 goto restore; 1272 goto restore;
1246 1273
1274 ret = btrfs_resume_dev_replace_async(fs_info);
1275 if (ret) {
1276 pr_warn("btrfs: failed to resume dev_replace\n");
1277 goto restore;
1278 }
1247 sb->s_flags &= ~MS_RDONLY; 1279 sb->s_flags &= ~MS_RDONLY;
1248 } 1280 }
1249 1281
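
The remount hunks order the read-only transition as suspend dev-replace, cancel scrub, then commit, and gate the way back to read-write on device health: if more devices are missing than num_tolerated_disk_barrier_failures allows, the writable remount is refused with -EACCES. The gate itself is a single comparison; with hypothetical numbers:

    #include <stdio.h>

    int main(void)
    {
            /* hypothetical two-device RAID1 with one device gone */
            int missing_devices = 1;
            int tolerated_failures = 1;
            int want_rw = 1;

            if (want_rw && missing_devices > tolerated_failures)
                    puts("too many missing devices, writeable remount refused");
            else
                    puts("remount may proceed");
            return 0;
    }
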
@@ -1336,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1336 min_stripe_size = BTRFS_STRIPE_LEN; 1368 min_stripe_size = BTRFS_STRIPE_LEN;
1337 1369
1338 list_for_each_entry(device, &fs_devices->devices, dev_list) { 1370 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1339 if (!device->in_fs_metadata || !device->bdev) 1371 if (!device->in_fs_metadata || !device->bdev ||
1372 device->is_tgtdev_for_dev_replace)
1340 continue; 1373 continue;
1341 1374
1342 avail_space = device->total_bytes - device->bytes_used; 1375 avail_space = device->total_bytes - device->bytes_used;
@@ -1647,10 +1680,14 @@ static int __init init_btrfs_fs(void)
1647 if (err) 1680 if (err)
1648 goto free_ordered_data; 1681 goto free_ordered_data;
1649 1682
1650 err = btrfs_interface_init(); 1683 err = btrfs_auto_defrag_init();
1651 if (err) 1684 if (err)
1652 goto free_delayed_inode; 1685 goto free_delayed_inode;
1653 1686
1687 err = btrfs_interface_init();
1688 if (err)
1689 goto free_auto_defrag;
1690
1654 err = register_filesystem(&btrfs_fs_type); 1691 err = register_filesystem(&btrfs_fs_type);
1655 if (err) 1692 if (err)
1656 goto unregister_ioctl; 1693 goto unregister_ioctl;
@@ -1662,6 +1699,8 @@ static int __init init_btrfs_fs(void)
1662 1699
1663unregister_ioctl: 1700unregister_ioctl:
1664 btrfs_interface_exit(); 1701 btrfs_interface_exit();
1702free_auto_defrag:
1703 btrfs_auto_defrag_exit();
1665free_delayed_inode: 1704free_delayed_inode:
1666 btrfs_delayed_inode_exit(); 1705 btrfs_delayed_inode_exit();
1667free_ordered_data: 1706free_ordered_data:
@@ -1681,6 +1720,7 @@ free_compress:
1681static void __exit exit_btrfs_fs(void) 1720static void __exit exit_btrfs_fs(void)
1682{ 1721{
1683 btrfs_destroy_cachep(); 1722 btrfs_destroy_cachep();
1723 btrfs_auto_defrag_exit();
1684 btrfs_delayed_inode_exit(); 1724 btrfs_delayed_inode_exit();
1685 ordered_data_exit(); 1725 ordered_data_exit();
1686 extent_map_exit(); 1726 extent_map_exit();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04bbfb1052eb..f15494699f3b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -30,6 +30,7 @@
30#include "tree-log.h" 30#include "tree-log.h"
31#include "inode-map.h" 31#include "inode-map.h"
32#include "volumes.h" 32#include "volumes.h"
33#include "dev-replace.h"
33 34
34#define BTRFS_ROOT_TRANS_TAG 0 35#define BTRFS_ROOT_TRANS_TAG 0
35 36
@@ -145,16 +146,12 @@ loop:
145 * the log must never go across transaction boundaries. 146 * the log must never go across transaction boundaries.
146 */ 147 */
147 smp_mb(); 148 smp_mb();
148 if (!list_empty(&fs_info->tree_mod_seq_list)) { 149 if (!list_empty(&fs_info->tree_mod_seq_list))
149 printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when " 150 WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
150 "creating a fresh transaction\n"); 151 "creating a fresh transaction\n");
151 WARN_ON(1); 152 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
152 } 153 WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
153 if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
154 printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
155 "creating a fresh transaction\n"); 154 "creating a fresh transaction\n");
156 WARN_ON(1);
157 }
158 atomic_set(&fs_info->tree_mod_seq, 0); 155 atomic_set(&fs_info->tree_mod_seq, 0);
159 156
160 spin_lock_init(&cur_trans->commit_lock); 157 spin_lock_init(&cur_trans->commit_lock);
@@ -295,9 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
295 return 0; 292 return 0;
296} 293}
297 294
298static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 295static struct btrfs_trans_handle *
299 u64 num_items, int type, 296start_transaction(struct btrfs_root *root, u64 num_items, int type,
300 int noflush) 297 enum btrfs_reserve_flush_enum flush)
301{ 298{
302 struct btrfs_trans_handle *h; 299 struct btrfs_trans_handle *h;
303 struct btrfs_transaction *cur_trans; 300 struct btrfs_transaction *cur_trans;
@@ -312,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
312 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); 309 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
313 h = current->journal_info; 310 h = current->journal_info;
314 h->use_count++; 311 h->use_count++;
312 WARN_ON(h->use_count > 2);
315 h->orig_rsv = h->block_rsv; 313 h->orig_rsv = h->block_rsv;
316 h->block_rsv = NULL; 314 h->block_rsv = NULL;
317 goto got_it; 315 goto got_it;
@@ -331,14 +329,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
331 } 329 }
332 330
333 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 331 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
334 if (noflush) 332 ret = btrfs_block_rsv_add(root,
335 ret = btrfs_block_rsv_add_noflush(root, 333 &root->fs_info->trans_block_rsv,
336 &root->fs_info->trans_block_rsv, 334 num_bytes, flush);
337 num_bytes);
338 else
339 ret = btrfs_block_rsv_add(root,
340 &root->fs_info->trans_block_rsv,
341 num_bytes);
342 if (ret) 335 if (ret)
343 return ERR_PTR(ret); 336 return ERR_PTR(ret);
344 } 337 }
@@ -422,13 +415,15 @@ got_it:
422struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 415struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
423 int num_items) 416 int num_items)
424{ 417{
425 return start_transaction(root, num_items, TRANS_START, 0); 418 return start_transaction(root, num_items, TRANS_START,
419 BTRFS_RESERVE_FLUSH_ALL);
426} 420}
427 421
428struct btrfs_trans_handle *btrfs_start_transaction_noflush( 422struct btrfs_trans_handle *btrfs_start_transaction_lflush(
429 struct btrfs_root *root, int num_items) 423 struct btrfs_root *root, int num_items)
430{ 424{
431 return start_transaction(root, num_items, TRANS_START, 1); 425 return start_transaction(root, num_items, TRANS_START,
426 BTRFS_RESERVE_FLUSH_LIMIT);
432} 427}
433 428
434struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) 429struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
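
The old noflush int becomes a three-state policy, enum btrfs_reserve_flush_enum, telling the reservation code how aggressively it may reclaim space: btrfs_start_transaction() asks for FLUSH_ALL, the renamed btrfs_start_transaction_lflush() for FLUSH_LIMIT, and the snapshot path further down for NO_FLUSH. A toy version of the three policies (names follow the patch, the arithmetic is invented for illustration):

    enum reserve_flush {
            RESERVE_NO_FLUSH,    /* fail fast if space is short   */
            RESERVE_FLUSH_LIMIT, /* reclaim a bounded amount      */
            RESERVE_FLUSH_ALL,   /* reclaim whatever it takes     */
    };

    /* free space plus whatever the policy lets us reclaim must
     * cover the request, otherwise the reservation fails */
    static int block_rsv_add(long need, long free_bytes, long reclaimable,
                             enum reserve_flush flush)
    {
            long limit_chunk = reclaimable / 4;  /* arbitrary bound */

            switch (flush) {
            case RESERVE_NO_FLUSH:
                    return free_bytes >= need ? 0 : -1;
            case RESERVE_FLUSH_LIMIT:
                    return free_bytes + limit_chunk >= need ? 0 : -1;
            case RESERVE_FLUSH_ALL:
                    return free_bytes + reclaimable >= need ? 0 : -1;
            }
            return -1;
    }
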
@@ -461,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root,
461int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 456int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
462{ 457{
463 struct btrfs_transaction *cur_trans = NULL, *t; 458 struct btrfs_transaction *cur_trans = NULL, *t;
464 int ret; 459 int ret = 0;
465 460
466 ret = 0;
467 if (transid) { 461 if (transid) {
468 if (transid <= root->fs_info->last_trans_committed) 462 if (transid <= root->fs_info->last_trans_committed)
469 goto out; 463 goto out;
470 464
465 ret = -EINVAL;
471 /* find specified transaction */ 466 /* find specified transaction */
472 spin_lock(&root->fs_info->trans_lock); 467 spin_lock(&root->fs_info->trans_lock);
473 list_for_each_entry(t, &root->fs_info->trans_list, list) { 468 list_for_each_entry(t, &root->fs_info->trans_list, list) {
474 if (t->transid == transid) { 469 if (t->transid == transid) {
475 cur_trans = t; 470 cur_trans = t;
476 atomic_inc(&cur_trans->use_count); 471 atomic_inc(&cur_trans->use_count);
472 ret = 0;
477 break; 473 break;
478 } 474 }
479 if (t->transid > transid) 475 if (t->transid > transid) {
476 ret = 0;
480 break; 477 break;
478 }
481 } 479 }
482 spin_unlock(&root->fs_info->trans_lock); 480 spin_unlock(&root->fs_info->trans_lock);
483 ret = -EINVAL; 481 /* The specified transaction doesn't exist */
484 if (!cur_trans) 482 if (!cur_trans)
485 goto out; /* bad transid */ 483 goto out;
486 } else { 484 } else {
487 /* find newest transaction that is committing | committed */ 485 /* find newest transaction that is committing | committed */
488 spin_lock(&root->fs_info->trans_lock); 486 spin_lock(&root->fs_info->trans_lock);
@@ -502,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
502 } 500 }
503 501
504 wait_for_commit(root, cur_trans); 502 wait_for_commit(root, cur_trans);
505
506 put_transaction(cur_trans); 503 put_transaction(cur_trans);
507 ret = 0;
508out: 504out:
509 return ret; 505 return ret;
510} 506}
@@ -851,7 +847,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
851 return ret; 847 return ret;
852 848
853 ret = btrfs_run_dev_stats(trans, root->fs_info); 849 ret = btrfs_run_dev_stats(trans, root->fs_info);
854 BUG_ON(ret); 850 WARN_ON(ret);
851 ret = btrfs_run_dev_replace(trans, root->fs_info);
852 WARN_ON(ret);
855 853
856 ret = btrfs_run_qgroups(trans, root->fs_info); 854 ret = btrfs_run_qgroups(trans, root->fs_info);
857 BUG_ON(ret); 855 BUG_ON(ret);
@@ -874,6 +872,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
874 switch_commit_root(fs_info->extent_root); 872 switch_commit_root(fs_info->extent_root);
875 up_write(&fs_info->extent_commit_sem); 873 up_write(&fs_info->extent_commit_sem);
876 874
875 btrfs_after_dev_replace_commit(fs_info);
876
877 return 0; 877 return 0;
878} 878}
879 879
@@ -958,7 +958,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
958 struct btrfs_fs_info *info = root->fs_info; 958 struct btrfs_fs_info *info = root->fs_info;
959 struct btrfs_trans_handle *trans; 959 struct btrfs_trans_handle *trans;
960 int ret; 960 int ret;
961 unsigned long nr;
962 961
963 if (xchg(&root->defrag_running, 1)) 962 if (xchg(&root->defrag_running, 1))
964 return 0; 963 return 0;
@@ -970,9 +969,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
970 969
971 ret = btrfs_defrag_leaves(trans, root, cacheonly); 970 ret = btrfs_defrag_leaves(trans, root, cacheonly);
972 971
973 nr = trans->blocks_used;
974 btrfs_end_transaction(trans, root); 972 btrfs_end_transaction(trans, root);
975 btrfs_btree_balance_dirty(info->tree_root, nr); 973 btrfs_btree_balance_dirty(info->tree_root);
976 cond_resched(); 974 cond_resched();
977 975
978 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) 976 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
@@ -1032,8 +1030,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1032 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 1030 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
1033 1031
1034 if (to_reserve > 0) { 1032 if (to_reserve > 0) {
1035 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, 1033 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
1036 to_reserve); 1034 to_reserve,
1035 BTRFS_RESERVE_NO_FLUSH);
1037 if (ret) { 1036 if (ret) {
1038 pending->error = ret; 1037 pending->error = ret;
1039 goto no_free_objectid; 1038 goto no_free_objectid;
@@ -1191,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1191 parent_inode, &key, 1190 parent_inode, &key,
1192 BTRFS_FT_DIR, index); 1191 BTRFS_FT_DIR, index);
1193 /* We have checked the name at the beginning, so it is impossible. */ 1192 /* We have checked the name at the beginning, so it is impossible. */
1194 BUG_ON(ret == -EEXIST); 1193 BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
1195 if (ret) { 1194 if (ret) {
1196 btrfs_abort_transaction(trans, root, ret); 1195 btrfs_abort_transaction(trans, root, ret);
1197 goto fail; 1196 goto fail;
@@ -1309,9 +1308,10 @@ static void do_async_commit(struct work_struct *work)
1309 * We've got freeze protection passed with the transaction. 1308 * We've got freeze protection passed with the transaction.
1310 * Tell lockdep about it. 1309 * Tell lockdep about it.
1311 */ 1310 */
1312 rwsem_acquire_read( 1311 if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1312 rwsem_acquire_read(
1314 0, 1, _THIS_IP_); 1313 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1314 0, 1, _THIS_IP_);
1315 1315
1316 current->journal_info = ac->newtrans; 1316 current->journal_info = ac->newtrans;
1317 1317
@@ -1349,8 +1349,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1349 * Tell lockdep we've released the freeze rwsem, since the 1349 * Tell lockdep we've released the freeze rwsem, since the
1350 * async commit thread will be the one to unlock it. 1350 * async commit thread will be the one to unlock it.
1351 */ 1351 */
1352 rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1352 if (trans->type < TRANS_JOIN_NOLOCK)
1353 1, _THIS_IP_); 1353 rwsem_release(
1354 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1355 1, _THIS_IP_);
1354 1356
1355 schedule_delayed_work(&ac->work, 0); 1357 schedule_delayed_work(&ac->work, 0);
1356 1358
@@ -1400,6 +1402,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1400 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1402 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1401} 1403}
1402 1404
1405static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1406 struct btrfs_root *root)
1407{
1408 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1409 int snap_pending = 0;
1410 int ret;
1411
1412 if (!flush_on_commit) {
1413 spin_lock(&root->fs_info->trans_lock);
1414 if (!list_empty(&trans->transaction->pending_snapshots))
1415 snap_pending = 1;
1416 spin_unlock(&root->fs_info->trans_lock);
1417 }
1418
1419 if (flush_on_commit || snap_pending) {
1420 btrfs_start_delalloc_inodes(root, 1);
1421 btrfs_wait_ordered_extents(root, 1);
1422 }
1423
1424 ret = btrfs_run_delayed_items(trans, root);
1425 if (ret)
1426 return ret;
1427
1428 /*
1429 * running the delayed items may have added new refs. account
1430 * them now so that they hinder processing of more delayed refs
1431 * as little as possible.
1432 */
1433 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1434
1435 /*
1436 * rename doesn't use btrfs_join_transaction, so, once we
1437 * set the transaction to blocked above, we aren't going
1438 * to get any new ordered operations. We can safely run
1439 * it here and know for sure that nothing new will be added
1440 * to the list
1441 */
1442 btrfs_run_ordered_operations(root, 1);
1443
1444 return 0;
1445}
1446
1403/* 1447/*
1404 * btrfs_transaction state sequence: 1448 * btrfs_transaction state sequence:
1405 * in_commit = 0, blocked = 0 (initial) 1449 * in_commit = 0, blocked = 0 (initial)
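
btrfs_flush_all_pending_stuffs() exists because the same flush sequence is now needed twice: once per iteration of the writer-draining loop, and once more after the loop exits, since a writer that joined late may have queued additional delayed items. The control flow reduces to (stub helpers, illustrative):

    static int writers_remaining(void) { return 0; /* models num_writers > 1   */ }
    static int flush_pending(void)     { return 0; /* delalloc/ordered/delayed */ }

    static int commit_transaction(void)
    {
            int ret;

            do {
                    ret = flush_pending();  /* was open-coded in the loop */
                    if (ret)
                            return ret;
            } while (writers_remaining());

            return flush_pending();         /* new: final pass after the loop */
    }
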
@@ -1414,15 +1458,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1414 struct btrfs_transaction *cur_trans = trans->transaction; 1458 struct btrfs_transaction *cur_trans = trans->transaction;
1415 struct btrfs_transaction *prev_trans = NULL; 1459 struct btrfs_transaction *prev_trans = NULL;
1416 DEFINE_WAIT(wait); 1460 DEFINE_WAIT(wait);
1417 int ret = -EIO; 1461 int ret;
1418 int should_grow = 0; 1462 int should_grow = 0;
1419 unsigned long now = get_seconds(); 1463 unsigned long now = get_seconds();
1420 int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
1421 1464
1422 btrfs_run_ordered_operations(root, 0); 1465 ret = btrfs_run_ordered_operations(root, 0);
1466 if (ret) {
1467 btrfs_abort_transaction(trans, root, ret);
1468 goto cleanup_transaction;
1469 }
1423 1470
1424 if (cur_trans->aborted) 1471 /* Stop the commit early if ->aborted is set */
1472 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1473 ret = cur_trans->aborted;
1425 goto cleanup_transaction; 1474 goto cleanup_transaction;
1475 }
1426 1476
1427 /* make a pass through all the delayed refs we have so far 1477 /* make a pass through all the delayed refs we have so far
1428 * any running procs may add more while we are here 1478 * any running procs may add more while we are here
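
From here on the commit path re-reads cur_trans->aborted at every point of no return, and each read goes through ACCESS_ONCE() so the compiler cannot cache an earlier load while another task (for example the space cache or inode cache writeout) sets the field. The same forced-reload idiom expressed with C11 atomics, for illustration:

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic int aborted;     /* models cur_trans->aborted */

    static int commit_step(const char *where)
    {
            /* every call re-reads memory; nothing stays cached in a register */
            int err = atomic_load_explicit(&aborted, memory_order_relaxed);

            if (err) {
                    fprintf(stderr, "abort %d seen at %s, bailing out\n",
                            err, where);
                    return err;
            }
            return 0;       /* safe to proceed to the next phase */
    }
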
@@ -1490,39 +1540,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1490 should_grow = 1; 1540 should_grow = 1;
1491 1541
1492 do { 1542 do {
1493 int snap_pending = 0;
1494
1495 joined = cur_trans->num_joined; 1543 joined = cur_trans->num_joined;
1496 if (!list_empty(&trans->transaction->pending_snapshots))
1497 snap_pending = 1;
1498 1544
1499 WARN_ON(cur_trans != trans->transaction); 1545 WARN_ON(cur_trans != trans->transaction);
1500 1546
1501 if (flush_on_commit || snap_pending) { 1547 ret = btrfs_flush_all_pending_stuffs(trans, root);
1502 btrfs_start_delalloc_inodes(root, 1);
1503 btrfs_wait_ordered_extents(root, 1);
1504 }
1505
1506 ret = btrfs_run_delayed_items(trans, root);
1507 if (ret) 1548 if (ret)
1508 goto cleanup_transaction; 1549 goto cleanup_transaction;
1509 1550
1510 /*
1511 * running the delayed items may have added new refs. account
1512 * them now so that they hinder processing of more delayed refs
1513 * as little as possible.
1514 */
1515 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
1516
1517 /*
1518 * rename don't use btrfs_join_transaction, so, once we
1519 * set the transaction to blocked above, we aren't going
1520 * to get any new ordered operations. We can safely run
1521 * it here and no for sure that nothing new will be added
1522 * to the list
1523 */
1524 btrfs_run_ordered_operations(root, 1);
1525
1526 prepare_to_wait(&cur_trans->writer_wait, &wait, 1551 prepare_to_wait(&cur_trans->writer_wait, &wait,
1527 TASK_UNINTERRUPTIBLE); 1552 TASK_UNINTERRUPTIBLE);
1528 1553
@@ -1535,6 +1560,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1535 } while (atomic_read(&cur_trans->num_writers) > 1 || 1560 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1536 (should_grow && cur_trans->num_joined != joined)); 1561 (should_grow && cur_trans->num_joined != joined));
1537 1562
1563 ret = btrfs_flush_all_pending_stuffs(trans, root);
1564 if (ret)
1565 goto cleanup_transaction;
1566
1538 /* 1567 /*
1539 * Ok now we need to make sure to block out any other joins while we 1568 * Ok now we need to make sure to block out any other joins while we
1540 * commit the transaction. We could have started a join before setting 1569 * commit the transaction. We could have started a join before setting
@@ -1546,6 +1575,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1546 wait_event(cur_trans->writer_wait, 1575 wait_event(cur_trans->writer_wait,
1547 atomic_read(&cur_trans->num_writers) == 1); 1576 atomic_read(&cur_trans->num_writers) == 1);
1548 1577
1578 /* ->aborted might be set after the previous check, so check it */
1579 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1580 ret = cur_trans->aborted;
1581 goto cleanup_transaction;
1582 }
1549 /* 1583 /*
1550 * the reloc mutex makes sure that we stop 1584 * the reloc mutex makes sure that we stop
1551 * the balancing code from coming in and moving 1585 * the balancing code from coming in and moving
@@ -1629,6 +1663,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1629 goto cleanup_transaction; 1663 goto cleanup_transaction;
1630 } 1664 }
1631 1665
1666 /*
1667 * The tasks which save the space cache and inode cache may also
1668 * update ->aborted, check it.
1669 */
1670 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1671 ret = cur_trans->aborted;
1672 mutex_unlock(&root->fs_info->tree_log_mutex);
1673 mutex_unlock(&root->fs_info->reloc_mutex);
1674 goto cleanup_transaction;
1675 }
1676
1632 btrfs_prepare_extent_commit(trans, root); 1677 btrfs_prepare_extent_commit(trans, root);
1633 1678
1634 cur_trans = root->fs_info->running_transaction; 1679 cur_trans = root->fs_info->running_transaction;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 80961947a6b2..0e8aa1e6c287 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 105 struct btrfs_root *root);
106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 106struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
107 int num_items); 107 int num_items);
108struct btrfs_trans_handle *btrfs_start_transaction_noflush( 108struct btrfs_trans_handle *btrfs_start_transaction_lflush(
109 struct btrfs_root *root, int num_items); 109 struct btrfs_root *root, int num_items);
110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 110struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 111struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 81e407d9677a..9027bb1e7466 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2952 struct btrfs_inode_item *item, 2952 struct btrfs_inode_item *item,
2953 struct inode *inode, int log_inode_only) 2953 struct inode *inode, int log_inode_only)
2954{ 2954{
2955 btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 2955 struct btrfs_map_token token;
2956 btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 2956
2957 btrfs_set_inode_mode(leaf, item, inode->i_mode); 2957 btrfs_init_map_token(&token);
2958 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2959
2960 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2961 inode->i_atime.tv_sec);
2962 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2963 inode->i_atime.tv_nsec);
2964
2965 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2966 inode->i_mtime.tv_sec);
2967 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2968 inode->i_mtime.tv_nsec);
2969
2970 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2971 inode->i_ctime.tv_sec);
2972 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2973 inode->i_ctime.tv_nsec);
2974
2975 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2976
2977 btrfs_set_inode_sequence(leaf, item, inode->i_version);
2978 btrfs_set_inode_transid(leaf, item, trans->transid);
2979 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2980 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2981 btrfs_set_inode_block_group(leaf, item, 0);
2982 2958
2983 if (log_inode_only) { 2959 if (log_inode_only) {
2984 /* set the generation to zero so the recover code 2960 /* set the generation to zero so the recover code
@@ -2986,14 +2962,63 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2986 * just to say 'this inode exists' and a logging 2962 * just to say 'this inode exists' and a logging
2987 * to say 'update this inode with these values' 2963 * to say 'update this inode with these values'
2988 */ 2964 */
2989 btrfs_set_inode_generation(leaf, item, 0); 2965 btrfs_set_token_inode_generation(leaf, item, 0, &token);
2990 btrfs_set_inode_size(leaf, item, 0); 2966 btrfs_set_token_inode_size(leaf, item, 0, &token);
2991 } else { 2967 } else {
2992 btrfs_set_inode_generation(leaf, item, 2968 btrfs_set_token_inode_generation(leaf, item,
2993 BTRFS_I(inode)->generation); 2969 BTRFS_I(inode)->generation,
2994 btrfs_set_inode_size(leaf, item, inode->i_size); 2970 &token);
2995 } 2971 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
2972 }
2973
2974 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
2975 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
2976 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
2977 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
2978
2979 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
2980 inode->i_atime.tv_sec, &token);
2981 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
2982 inode->i_atime.tv_nsec, &token);
2983
2984 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
2985 inode->i_mtime.tv_sec, &token);
2986 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
2987 inode->i_mtime.tv_nsec, &token);
2988
2989 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
2990 inode->i_ctime.tv_sec, &token);
2991 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
2992 inode->i_ctime.tv_nsec, &token);
2993
2994 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
2995 &token);
2996
2997 btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
2998 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
2999 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3000 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3001 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3002}
2996 3003
3004static int log_inode_item(struct btrfs_trans_handle *trans,
3005 struct btrfs_root *log, struct btrfs_path *path,
3006 struct inode *inode)
3007{
3008 struct btrfs_inode_item *inode_item;
3009 struct btrfs_key key;
3010 int ret;
3011
3012 memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
3013 ret = btrfs_insert_empty_item(trans, log, path, &key,
3014 sizeof(*inode_item));
3015 if (ret && ret != -EEXIST)
3016 return ret;
3017 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3018 struct btrfs_inode_item);
3019 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
3020 btrfs_release_path(path);
3021 return 0;
2997} 3022}
2998 3023
2999static noinline int copy_items(struct btrfs_trans_handle *trans, 3024static noinline int copy_items(struct btrfs_trans_handle *trans,
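The switch to btrfs_set_token_* in fill_inode_item() above is a write-batching optimization: a btrfs_map_token caches the mapping of the extent-buffer page that the last field landed in, so a run of adjacent setters resolves the page once instead of per call. A standalone model of the idea (struct layout and helper names here are illustrative, not the kernel's):

#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096

struct token {                  /* illustrative, not the kernel struct */
        char *buf;              /* backing buffer */
        size_t page_start;      /* offset of the cached page */
        char *kaddr;            /* cached "mapped" address */
};

static char *map_page(char *buf, size_t off)    /* stands in for kmap */
{
        return buf + (off & ~(size_t)(PAGE_SZ - 1));
}

static void set_u32(struct token *tok, size_t off, unsigned int val)
{
        if (!tok->kaddr || off < tok->page_start ||
            off >= tok->page_start + PAGE_SZ) {
                tok->page_start = off & ~(size_t)(PAGE_SZ - 1);
                tok->kaddr = map_page(tok->buf, off);   /* slow path */
        }
        memcpy(tok->kaddr + (off - tok->page_start), &val, sizeof(val));
}

int main(void)
{
        static char eb[2 * PAGE_SZ];
        struct token tok = { .buf = eb };
        unsigned int a, b;

        set_u32(&tok, 100, 1);  /* resolves the page */
        set_u32(&tok, 104, 2);  /* reuses the cached mapping */
        memcpy(&a, eb + 100, sizeof(a));
        memcpy(&b, eb + 104, sizeof(b));
        printf("%u %u\n", a, b);
        return 0;
}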
@@ -3130,151 +3155,239 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3130 return 0; 3155 return 0;
3131} 3156}
3132 3157
3133struct log_args { 3158static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
3134 struct extent_buffer *src; 3159 struct btrfs_root *root, struct inode *inode,
3135 u64 next_offset; 3160 struct extent_map *em,
3136 int start_slot; 3161 struct btrfs_path *path)
3137 int nr; 3162{
3138}; 3163 struct btrfs_file_extent_item *fi;
3164 struct extent_buffer *leaf;
3165 struct btrfs_key key, new_key;
3166 struct btrfs_map_token token;
3167 u64 extent_end;
3168 u64 extent_offset = 0;
3169 int extent_type;
3170 int del_slot = 0;
3171 int del_nr = 0;
3172 int ret = 0;
3173
3174 while (1) {
3175 btrfs_init_map_token(&token);
3176 leaf = path->nodes[0];
3177 path->slots[0]++;
3178 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3179 if (del_nr) {
3180 ret = btrfs_del_items(trans, root, path,
3181 del_slot, del_nr);
3182 if (ret)
3183 return ret;
3184 del_nr = 0;
3185 }
3186
3187 ret = btrfs_next_leaf_write(trans, root, path, 1);
3188 if (ret < 0)
3189 return ret;
3190 if (ret > 0)
3191 return 0;
3192 leaf = path->nodes[0];
3193 }
3194
3195 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3196 if (key.objectid != btrfs_ino(inode) ||
3197 key.type != BTRFS_EXTENT_DATA_KEY ||
3198 key.offset >= em->start + em->len)
3199 break;
3200
3201 fi = btrfs_item_ptr(leaf, path->slots[0],
3202 struct btrfs_file_extent_item);
3203 extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
3204 if (extent_type == BTRFS_FILE_EXTENT_REG ||
3205 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
3206 extent_offset = btrfs_token_file_extent_offset(leaf,
3207 fi, &token);
3208 extent_end = key.offset +
3209 btrfs_token_file_extent_num_bytes(leaf, fi,
3210 &token);
3211 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3212 extent_end = key.offset +
3213 btrfs_file_extent_inline_len(leaf, fi);
3214 } else {
3215 BUG();
3216 }
3217
3218 if (extent_end <= em->len + em->start) {
3219 if (!del_nr) {
3220 del_slot = path->slots[0];
3221 }
3222 del_nr++;
3223 continue;
3224 }
3225
3226 /*
3227 * Ok so we'll ignore previous items if we log a new extent,
3228 * which can lead to overlapping extents. So if we have an
3229 * existing extent we want to adjust, we _have_ to check the next
3230 * item to make sure we even need this extent anymore; this keeps
3231 * us from panicking in set_item_key_safe.
3232 */
3233 if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
3234 struct btrfs_key tmp_key;
3235
3236 btrfs_item_key_to_cpu(leaf, &tmp_key,
3237 path->slots[0] + 1);
3238 if (tmp_key.objectid == btrfs_ino(inode) &&
3239 tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
3240 tmp_key.offset <= em->start + em->len) {
3241 if (!del_nr)
3242 del_slot = path->slots[0];
3243 del_nr++;
3244 continue;
3245 }
3246 }
3247
3248 BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
3249 memcpy(&new_key, &key, sizeof(new_key));
3250 new_key.offset = em->start + em->len;
3251 btrfs_set_item_key_safe(trans, root, path, &new_key);
3252 extent_offset += em->start + em->len - key.offset;
3253 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
3254 &token);
3255 btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
3256 (em->start + em->len),
3257 &token);
3258 btrfs_mark_buffer_dirty(leaf);
3259 }
3260
3261 if (del_nr)
3262 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
3263
3264 return ret;
3265}
3139 3266
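The key/offset rewrite at the bottom of drop_adjacent_extents() trims an overlapped file extent item so that it begins right after the newly logged range. A standalone check of that arithmetic with hypothetical values (4 KiB units; variable names mirror the hunk):

#include <assert.h>
#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
        /* hypothetical survivor: item at offset 0 covering 16 KiB */
        u64 key_offset = 0, extent_offset = 0, extent_end = 16384;
        /* hypothetical new extent being logged: [0, 12 KiB) */
        u64 em_start = 0, em_len = 12288;

        u64 new_key_offset = em_start + em_len;         /* 12 KiB */
        extent_offset += new_key_offset - key_offset;   /* 12 KiB */
        u64 num_bytes = extent_end - new_key_offset;    /*  4 KiB */

        assert(new_key_offset == 12288);
        assert(extent_offset == 12288);
        assert(num_bytes == 4096);
        printf("survivor now covers [%llu, %llu)\n",
               new_key_offset, new_key_offset + num_bytes);
        return 0;
}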
3140static int log_one_extent(struct btrfs_trans_handle *trans, 3267static int log_one_extent(struct btrfs_trans_handle *trans,
3141 struct inode *inode, struct btrfs_root *root, 3268 struct inode *inode, struct btrfs_root *root,
3142 struct extent_map *em, struct btrfs_path *path, 3269 struct extent_map *em, struct btrfs_path *path)
3143 struct btrfs_path *dst_path, struct log_args *args)
3144{ 3270{
3145 struct btrfs_root *log = root->log_root; 3271 struct btrfs_root *log = root->log_root;
3146 struct btrfs_file_extent_item *fi; 3272 struct btrfs_file_extent_item *fi;
3273 struct extent_buffer *leaf;
3274 struct list_head ordered_sums;
3275 struct btrfs_map_token token;
3147 struct btrfs_key key; 3276 struct btrfs_key key;
3148 u64 start = em->mod_start; 3277 u64 csum_offset = em->mod_start - em->start;
3149 u64 search_start = start; 3278 u64 csum_len = em->mod_len;
3150 u64 len = em->mod_len; 3279 u64 extent_offset = em->start - em->orig_start;
3151 u64 num_bytes; 3280 u64 block_len;
3152 int nritems;
3153 int ret; 3281 int ret;
3282 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3154 3283
3155 if (BTRFS_I(inode)->logged_trans == trans->transid) { 3284 INIT_LIST_HEAD(&ordered_sums);
3156 ret = __btrfs_drop_extents(trans, log, inode, dst_path, start, 3285 btrfs_init_map_token(&token);
3157 start + len, NULL, 0); 3286 key.objectid = btrfs_ino(inode);
3158 if (ret) 3287 key.type = BTRFS_EXTENT_DATA_KEY;
3159 return ret; 3288 key.offset = em->start;
3289 path->really_keep_locks = 1;
3290
3291 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
3292 if (ret && ret != -EEXIST) {
3293 path->really_keep_locks = 0;
3294 return ret;
3160 } 3295 }
3296 leaf = path->nodes[0];
3297 fi = btrfs_item_ptr(leaf, path->slots[0],
3298 struct btrfs_file_extent_item);
3299 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3300 &token);
3301 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3302 skip_csum = true;
3303 btrfs_set_token_file_extent_type(leaf, fi,
3304 BTRFS_FILE_EXTENT_PREALLOC,
3305 &token);
3306 } else {
3307 btrfs_set_token_file_extent_type(leaf, fi,
3308 BTRFS_FILE_EXTENT_REG,
3309 &token);
3310 if (em->block_start == 0)
3311 skip_csum = true;
3312 }
3313
3314 block_len = max(em->block_len, em->orig_block_len);
3315 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3316 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3317 em->block_start,
3318 &token);
3319 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3320 &token);
3321 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3322 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3323 em->block_start -
3324 extent_offset, &token);
3325 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3326 &token);
3327 } else {
3328 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3329 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3330 &token);
3331 }
3332
3333 btrfs_set_token_file_extent_offset(leaf, fi,
3334 em->start - em->orig_start,
3335 &token);
3336 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3337 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
3338 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3339 &token);
3340 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3341 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3342 btrfs_mark_buffer_dirty(leaf);
3161 3343
3162 while (len) { 3344 /*
3163 if (args->nr) 3345 * Have to check the extent to the right of us to make sure it doesn't
3164 goto next_slot; 3346 * fall in our current range. We're ok if the previous extent is in our
3165again: 3347 * range since the recovery stuff will run us in key order and thus just
3166 key.objectid = btrfs_ino(inode); 3348 * drop the part we overwrote.
3167 key.type = BTRFS_EXTENT_DATA_KEY; 3349 */
3168 key.offset = search_start; 3350 ret = drop_adjacent_extents(trans, log, inode, em, path);
3169 3351 btrfs_release_path(path);
3170 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3352 path->really_keep_locks = 0;
3171 if (ret < 0) 3353 if (ret) {
3172 return ret; 3354 return ret;
3173 3355 }
3174 if (ret) {
3175 /*
3176 * A rare case were we can have an em for a section of a
3177 * larger extent so we need to make sure that this em
3178 * falls within the extent we've found. If not we just
3179 * bail and go back to ye-olde way of doing things but
3180 * it happens often enough in testing that we need to do
3181 * this dance to make sure.
3182 */
3183 do {
3184 if (path->slots[0] == 0) {
3185 btrfs_release_path(path);
3186 if (search_start == 0)
3187 return -ENOENT;
3188 search_start--;
3189 goto again;
3190 }
3191 3356
3192 path->slots[0]--; 3357 if (skip_csum)
3193 btrfs_item_key_to_cpu(path->nodes[0], &key, 3358 return 0;
3194 path->slots[0]);
3195 if (key.objectid != btrfs_ino(inode) ||
3196 key.type != BTRFS_EXTENT_DATA_KEY) {
3197 btrfs_release_path(path);
3198 return -ENOENT;
3199 }
3200 } while (key.offset > start);
3201 3359
3202 fi = btrfs_item_ptr(path->nodes[0], path->slots[0], 3360 if (em->compress_type) {
3203 struct btrfs_file_extent_item); 3361 csum_offset = 0;
3204 num_bytes = btrfs_file_extent_num_bytes(path->nodes[0], 3362 csum_len = block_len;
3205 fi); 3363 }
3206 if (key.offset + num_bytes <= start) {
3207 btrfs_release_path(path);
3208 return -ENOENT;
3209 }
3210 }
3211 args->src = path->nodes[0];
3212next_slot:
3213 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3214 fi = btrfs_item_ptr(args->src, path->slots[0],
3215 struct btrfs_file_extent_item);
3216 if (args->nr &&
3217 args->start_slot + args->nr == path->slots[0]) {
3218 args->nr++;
3219 } else if (args->nr) {
3220 ret = copy_items(trans, inode, dst_path, args->src,
3221 args->start_slot, args->nr,
3222 LOG_INODE_ALL);
3223 if (ret)
3224 return ret;
3225 args->nr = 1;
3226 args->start_slot = path->slots[0];
3227 } else if (!args->nr) {
3228 args->nr = 1;
3229 args->start_slot = path->slots[0];
3230 }
3231 nritems = btrfs_header_nritems(path->nodes[0]);
3232 path->slots[0]++;
3233 num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
3234 if (len < num_bytes) {
3235 /* I _think_ this is ok: envision we write to a
3236 * preallocated space that is adjacent to a previously
3237 * written preallocated space that gets merged when we
3238 * mark this preallocated space written. If we do not
3239 * have the adjacent extent in cache then when we copy
3240 * this extent it could end up being larger than our EM
3241 * thinks it is, which is a-ok, so just set len to 0.
3242 */
3243 len = 0;
3244 } else {
3245 len -= num_bytes;
3246 }
3247 start = key.offset + num_bytes;
3248 args->next_offset = start;
3249 search_start = start;
3250 3364
3251 if (path->slots[0] < nritems) { 3365 /* block start is already adjusted for the file extent offset. */
3252 if (len) 3366 ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
3253 goto next_slot; 3367 em->block_start + csum_offset,
3254 break; 3368 em->block_start + csum_offset +
3255 } 3369 csum_len - 1, &ordered_sums, 0);
3370 if (ret)
3371 return ret;
3256 3372
3257 if (args->nr) { 3373 while (!list_empty(&ordered_sums)) {
3258 ret = copy_items(trans, inode, dst_path, args->src, 3374 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3259 args->start_slot, args->nr, 3375 struct btrfs_ordered_sum,
3260 LOG_INODE_ALL); 3376 list);
3261 if (ret) 3377 if (!ret)
3262 return ret; 3378 ret = btrfs_csum_file_blocks(trans, log, sums);
3263 args->nr = 0; 3379 list_del(&sums->list);
3264 btrfs_release_path(path); 3380 kfree(sums);
3265 }
3266 } 3381 }
3267 3382
3268 return 0; 3383 return ret;
3269} 3384}
3270 3385
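log_one_extent() above narrows the checksum lookup to the modified part of the extent (mod_start/mod_len relative to em->start), except for compressed extents, which are checksummed as one on-disk unit, so the range resets to the full block. A standalone rendering of that selection (block_len here stands in for max(block_len, orig_block_len) from the hunk):

#include <stdio.h>

typedef unsigned long long u64;

struct em_sketch {              /* minimal stand-in for extent_map */
        u64 start, mod_start, mod_len, block_len;
        int compressed;         /* compress_type != NONE */
};

static void csum_range(const struct em_sketch *em, u64 *off, u64 *len)
{
        *off = em->mod_start - em->start;   /* only the dirty part */
        *len = em->mod_len;
        if (em->compressed) {
                *off = 0;                   /* whole compressed unit */
                *len = em->block_len;
        }
}

int main(void)
{
        struct em_sketch em = { 4096, 8192, 4096, 16384, 0 };
        u64 off, len;

        csum_range(&em, &off, &len);
        printf("csum at disk offset +%llu, %llu bytes\n", off, len);
        return 0;
}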
3271static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3386static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3272 struct btrfs_root *root, 3387 struct btrfs_root *root,
3273 struct inode *inode, 3388 struct inode *inode,
3274 struct btrfs_path *path, 3389 struct btrfs_path *path)
3275 struct btrfs_path *dst_path)
3276{ 3390{
3277 struct log_args args;
3278 struct extent_map *em, *n; 3391 struct extent_map *em, *n;
3279 struct list_head extents; 3392 struct list_head extents;
3280 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3393 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
@@ -3283,8 +3396,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3283 3396
3284 INIT_LIST_HEAD(&extents); 3397 INIT_LIST_HEAD(&extents);
3285 3398
3286 memset(&args, 0, sizeof(args));
3287
3288 write_lock(&tree->lock); 3399 write_lock(&tree->lock);
3289 test_gen = root->fs_info->last_trans_committed; 3400 test_gen = root->fs_info->last_trans_committed;
3290 3401
@@ -3304,47 +3415,27 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3304 em = list_entry(extents.next, struct extent_map, list); 3415 em = list_entry(extents.next, struct extent_map, list);
3305 3416
3306 list_del_init(&em->list); 3417 list_del_init(&em->list);
3307 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
3308 3418
3309 /* 3419 /*
3310 * If we had an error we just need to delete everybody from our 3420 * If we had an error we just need to delete everybody from our
3311 * private list. 3421 * private list.
3312 */ 3422 */
3313 if (ret) { 3423 if (ret) {
3424 clear_em_logging(tree, em);
3314 free_extent_map(em); 3425 free_extent_map(em);
3315 continue; 3426 continue;
3316 } 3427 }
3317 3428
3318 write_unlock(&tree->lock); 3429 write_unlock(&tree->lock);
3319 3430
3320 /* 3431 ret = log_one_extent(trans, inode, root, em, path);
3321 * If the previous EM and the last extent we left off on aren't
3322 * sequential then we need to copy the items we have and redo
3323 * our search
3324 */
3325 if (args.nr && em->mod_start != args.next_offset) {
3326 ret = copy_items(trans, inode, dst_path, args.src,
3327 args.start_slot, args.nr,
3328 LOG_INODE_ALL);
3329 if (ret) {
3330 free_extent_map(em);
3331 write_lock(&tree->lock);
3332 continue;
3333 }
3334 btrfs_release_path(path);
3335 args.nr = 0;
3336 }
3337
3338 ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
3339 free_extent_map(em);
3340 write_lock(&tree->lock); 3432 write_lock(&tree->lock);
3433 clear_em_logging(tree, em);
3434 free_extent_map(em);
3341 } 3435 }
3342 WARN_ON(!list_empty(&extents)); 3436 WARN_ON(!list_empty(&extents));
3343 write_unlock(&tree->lock); 3437 write_unlock(&tree->lock);
3344 3438
3345 if (!ret && args.nr)
3346 ret = copy_items(trans, inode, dst_path, args.src,
3347 args.start_slot, args.nr, LOG_INODE_ALL);
3348 btrfs_release_path(path); 3439 btrfs_release_path(path);
3349 return ret; 3440 return ret;
3350} 3441}
@@ -3400,7 +3491,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3400 3491
3401 3492
3402 /* today the code can only do partial logging of directories */ 3493 /* today the code can only do partial logging of directories */
3403 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) 3494 if (S_ISDIR(inode->i_mode) ||
3495 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3496 &BTRFS_I(inode)->runtime_flags) &&
3497 inode_only == LOG_INODE_EXISTS))
3404 max_key.type = BTRFS_XATTR_ITEM_KEY; 3498 max_key.type = BTRFS_XATTR_ITEM_KEY;
3405 else 3499 else
3406 max_key.type = (u8)-1; 3500 max_key.type = (u8)-1;
@@ -3432,14 +3526,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3432 } else { 3526 } else {
3433 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3527 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3434 &BTRFS_I(inode)->runtime_flags)) { 3528 &BTRFS_I(inode)->runtime_flags)) {
3529 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3530 &BTRFS_I(inode)->runtime_flags);
3435 ret = btrfs_truncate_inode_items(trans, log, 3531 ret = btrfs_truncate_inode_items(trans, log,
3436 inode, 0, 0); 3532 inode, 0, 0);
3437 } else { 3533 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3438 fast_search = true; 3534 &BTRFS_I(inode)->runtime_flags)) {
3535 if (inode_only == LOG_INODE_ALL)
3536 fast_search = true;
3439 max_key.type = BTRFS_XATTR_ITEM_KEY; 3537 max_key.type = BTRFS_XATTR_ITEM_KEY;
3440 ret = drop_objectid_items(trans, log, path, ino, 3538 ret = drop_objectid_items(trans, log, path, ino,
3441 BTRFS_XATTR_ITEM_KEY); 3539 max_key.type);
3540 } else {
3541 if (inode_only == LOG_INODE_ALL)
3542 fast_search = true;
3543 ret = log_inode_item(trans, log, dst_path, inode);
3544 if (ret) {
3545 err = ret;
3546 goto out_unlock;
3547 }
3548 goto log_extents;
3442 } 3549 }
3550
3443 } 3551 }
3444 if (ret) { 3552 if (ret) {
3445 err = ret; 3553 err = ret;
@@ -3518,11 +3626,10 @@ next_slot:
3518 ins_nr = 0; 3626 ins_nr = 0;
3519 } 3627 }
3520 3628
3629log_extents:
3521 if (fast_search) { 3630 if (fast_search) {
3522 btrfs_release_path(path);
3523 btrfs_release_path(dst_path); 3631 btrfs_release_path(dst_path);
3524 ret = btrfs_log_changed_extents(trans, root, inode, path, 3632 ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
3525 dst_path);
3526 if (ret) { 3633 if (ret) {
3527 err = ret; 3634 err = ret;
3528 goto out_unlock; 3635 goto out_unlock;
@@ -3531,8 +3638,10 @@ next_slot:
3531 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3638 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3532 struct extent_map *em, *n; 3639 struct extent_map *em, *n;
3533 3640
3641 write_lock(&tree->lock);
3534 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 3642 list_for_each_entry_safe(em, n, &tree->modified_extents, list)
3535 list_del_init(&em->list); 3643 list_del_init(&em->list);
3644 write_unlock(&tree->lock);
3536 } 3645 }
3537 3646
3538 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 3647 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0f5ebb72a5ea..15f6efdf6463 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,7 +25,6 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <asm/div64.h>
29#include "compat.h" 28#include "compat.h"
30#include "ctree.h" 29#include "ctree.h"
31#include "extent_map.h" 30#include "extent_map.h"
@@ -36,6 +35,8 @@
36#include "async-thread.h" 35#include "async-thread.h"
37#include "check-integrity.h" 36#include "check-integrity.h"
38#include "rcu-string.h" 37#include "rcu-string.h"
38#include "math.h"
39#include "dev-replace.h"
39 40
40static int init_first_rw_device(struct btrfs_trans_handle *trans, 41static int init_first_rw_device(struct btrfs_trans_handle *trans,
41 struct btrfs_root *root, 42 struct btrfs_root *root,
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
71 kfree(fs_devices); 72 kfree(fs_devices);
72} 73}
73 74
75static void btrfs_kobject_uevent(struct block_device *bdev,
76 enum kobject_action action)
77{
78 int ret;
79
80 ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
81 if (ret)
82 pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
83 action,
84 kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
85 &disk_to_dev(bdev->bd_disk)->kobj);
86}
87
74void btrfs_cleanup_fs_uuids(void) 88void btrfs_cleanup_fs_uuids(void)
75{ 89{
76 struct btrfs_fs_devices *fs_devices; 90 struct btrfs_fs_devices *fs_devices;
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
108 return NULL; 122 return NULL;
109} 123}
110 124
125static int
126btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
127 int flush, struct block_device **bdev,
128 struct buffer_head **bh)
129{
130 int ret;
131
132 *bdev = blkdev_get_by_path(device_path, flags, holder);
133
134 if (IS_ERR(*bdev)) {
135 ret = PTR_ERR(*bdev);
136 printk(KERN_INFO "btrfs: open %s failed\n", device_path);
137 goto error;
138 }
139
140 if (flush)
141 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
142 ret = set_blocksize(*bdev, 4096);
143 if (ret) {
144 blkdev_put(*bdev, flags);
145 goto error;
146 }
147 invalidate_bdev(*bdev);
148 *bh = btrfs_read_dev_super(*bdev);
149 if (!*bh) {
150 ret = -EINVAL;
151 blkdev_put(*bdev, flags);
152 goto error;
153 }
154
155 return 0;
156
157error:
158 *bdev = NULL;
159 *bh = NULL;
160 return ret;
161}
162
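btrfs_get_bdev_and_sb() folds the open/flush/set-blocksize/read-super sequence that used to be duplicated in the callers below into one helper; note that on any failure it NULLs both out-parameters, so callers never see a half-initialized pair. A standalone sketch of that contract (get_pair and its resources are hypothetical):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int get_pair(char **a, char **b)
{
        int ret;

        *a = malloc(16);
        if (!*a) {
                ret = -ENOMEM;
                goto error;
        }
        *b = malloc(16);
        if (!*b) {
                ret = -ENOMEM;
                free(*a);       /* unwind the earlier step */
                goto error;
        }
        return 0;

error:
        *a = NULL;              /* never hand back a partial result */
        *b = NULL;
        return ret;
}

int main(void)
{
        char *a, *b;

        if (get_pair(&a, &b) == 0) {
                puts("both resources acquired");
                free(a);
                free(b);
        }
        return 0;
}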
111static void requeue_list(struct btrfs_pending_bios *pending_bios, 163static void requeue_list(struct btrfs_pending_bios *pending_bios,
112 struct bio *head, struct bio *tail) 164 struct bio *head, struct bio *tail)
113{ 165{
@@ -467,7 +519,8 @@ error:
467 return ERR_PTR(-ENOMEM); 519 return ERR_PTR(-ENOMEM);
468} 520}
469 521
470void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) 522void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
523 struct btrfs_fs_devices *fs_devices, int step)
471{ 524{
472 struct btrfs_device *device, *next; 525 struct btrfs_device *device, *next;
473 526
@@ -480,8 +533,9 @@ again:
480 /* This is the initialized path, it is safe to release the devices. */ 533 /* This is the initialized path, it is safe to release the devices. */
481 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 534 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
482 if (device->in_fs_metadata) { 535 if (device->in_fs_metadata) {
483 if (!latest_transid || 536 if (!device->is_tgtdev_for_dev_replace &&
484 device->generation > latest_transid) { 537 (!latest_transid ||
538 device->generation > latest_transid)) {
485 latest_devid = device->devid; 539 latest_devid = device->devid;
486 latest_transid = device->generation; 540 latest_transid = device->generation;
487 latest_bdev = device->bdev; 541 latest_bdev = device->bdev;
@@ -489,6 +543,21 @@ again:
489 continue; 543 continue;
490 } 544 }
491 545
546 if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
547 /*
548 * In the first step, keep the device which has
549 * the correct fsid and the devid that is used
550 * for the dev_replace procedure.
551 * In the second step, the dev_replace state is
552 * read from the device tree, so it is known
553 * whether the procedure is really active, and
554 * therefore whether this device is still
555 * used or should be removed.
556 */
557 if (step == 0 || device->is_tgtdev_for_dev_replace) {
558 continue;
559 }
560 }
492 if (device->bdev) { 561 if (device->bdev) {
493 blkdev_put(device->bdev, device->mode); 562 blkdev_put(device->bdev, device->mode);
494 device->bdev = NULL; 563 device->bdev = NULL;
@@ -497,7 +566,8 @@ again:
497 if (device->writeable) { 566 if (device->writeable) {
498 list_del_init(&device->dev_alloc_list); 567 list_del_init(&device->dev_alloc_list);
499 device->writeable = 0; 568 device->writeable = 0;
500 fs_devices->rw_devices--; 569 if (!device->is_tgtdev_for_dev_replace)
570 fs_devices->rw_devices--;
501 } 571 }
502 list_del_init(&device->dev_list); 572 list_del_init(&device->dev_list);
503 fs_devices->num_devices--; 573 fs_devices->num_devices--;
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
555 if (device->bdev) 625 if (device->bdev)
556 fs_devices->open_devices--; 626 fs_devices->open_devices--;
557 627
558 if (device->writeable) { 628 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
559 list_del_init(&device->dev_alloc_list); 629 list_del_init(&device->dev_alloc_list);
560 fs_devices->rw_devices--; 630 fs_devices->rw_devices--;
561 } 631 }
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
637 if (!device->name) 707 if (!device->name)
638 continue; 708 continue;
639 709
640 bdev = blkdev_get_by_path(device->name->str, flags, holder); 710 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
641 if (IS_ERR(bdev)) { 711 &bdev, &bh);
642 printk(KERN_INFO "btrfs: open %s failed\n", device->name->str); 712 if (ret)
643 goto error; 713 continue;
644 }
645 filemap_write_and_wait(bdev->bd_inode->i_mapping);
646 invalidate_bdev(bdev);
647 set_blocksize(bdev, 4096);
648
649 bh = btrfs_read_dev_super(bdev);
650 if (!bh)
651 goto error_close;
652 714
653 disk_super = (struct btrfs_super_block *)bh->b_data; 715 disk_super = (struct btrfs_super_block *)bh->b_data;
654 devid = btrfs_stack_device_id(&disk_super->dev_item); 716 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
687 fs_devices->rotating = 1; 749 fs_devices->rotating = 1;
688 750
689 fs_devices->open_devices++; 751 fs_devices->open_devices++;
690 if (device->writeable) { 752 if (device->writeable && !device->is_tgtdev_for_dev_replace) {
691 fs_devices->rw_devices++; 753 fs_devices->rw_devices++;
692 list_add(&device->dev_alloc_list, 754 list_add(&device->dev_alloc_list,
693 &fs_devices->alloc_list); 755 &fs_devices->alloc_list);
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
697 759
698error_brelse: 760error_brelse:
699 brelse(bh); 761 brelse(bh);
700error_close:
701 blkdev_put(bdev, flags); 762 blkdev_put(bdev, flags);
702error:
703 continue; 763 continue;
704 } 764 }
705 if (fs_devices->open_devices == 0) { 765 if (fs_devices->open_devices == 0) {
@@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
744 u64 total_devices; 804 u64 total_devices;
745 805
746 flags |= FMODE_EXCL; 806 flags |= FMODE_EXCL;
747 bdev = blkdev_get_by_path(path, flags, holder);
748
749 if (IS_ERR(bdev)) {
750 ret = PTR_ERR(bdev);
751 goto error;
752 }
753
754 mutex_lock(&uuid_mutex); 807 mutex_lock(&uuid_mutex);
755 ret = set_blocksize(bdev, 4096); 808 ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
756 if (ret) 809 if (ret)
757 goto error_close; 810 goto error;
758 bh = btrfs_read_dev_super(bdev);
759 if (!bh) {
760 ret = -EINVAL;
761 goto error_close;
762 }
763 disk_super = (struct btrfs_super_block *)bh->b_data; 811 disk_super = (struct btrfs_super_block *)bh->b_data;
764 devid = btrfs_stack_device_id(&disk_super->dev_item); 812 devid = btrfs_stack_device_id(&disk_super->dev_item);
765 transid = btrfs_super_generation(disk_super); 813 transid = btrfs_super_generation(disk_super);
766 total_devices = btrfs_super_num_devices(disk_super); 814 total_devices = btrfs_super_num_devices(disk_super);
767 if (disk_super->label[0]) 815 if (disk_super->label[0]) {
816 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
817 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
768 printk(KERN_INFO "device label %s ", disk_super->label); 818 printk(KERN_INFO "device label %s ", disk_super->label);
769 else 819 } else {
770 printk(KERN_INFO "device fsid %pU ", disk_super->fsid); 820 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
821 }
771 printk(KERN_CONT "devid %llu transid %llu %s\n", 822 printk(KERN_CONT "devid %llu transid %llu %s\n",
772 (unsigned long long)devid, (unsigned long long)transid, path); 823 (unsigned long long)devid, (unsigned long long)transid, path);
773 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 824 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
774 if (!ret && fs_devices_ret) 825 if (!ret && fs_devices_ret)
775 (*fs_devices_ret)->total_devices = total_devices; 826 (*fs_devices_ret)->total_devices = total_devices;
776 brelse(bh); 827 brelse(bh);
777error_close:
778 mutex_unlock(&uuid_mutex);
779 blkdev_put(bdev, flags); 828 blkdev_put(bdev, flags);
780error: 829error:
830 mutex_unlock(&uuid_mutex);
781 return ret; 831 return ret;
782} 832}
783 833
@@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
796 846
797 *length = 0; 847 *length = 0;
798 848
799 if (start >= device->total_bytes) 849 if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
800 return 0; 850 return 0;
801 851
802 path = btrfs_alloc_path(); 852 path = btrfs_alloc_path();
@@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
913 max_hole_size = 0; 963 max_hole_size = 0;
914 hole_size = 0; 964 hole_size = 0;
915 965
916 if (search_start >= search_end) { 966 if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
917 ret = -ENOSPC; 967 ret = -ENOSPC;
918 goto error; 968 goto error;
919 } 969 }
@@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1096 struct btrfs_key key; 1146 struct btrfs_key key;
1097 1147
1098 WARN_ON(!device->in_fs_metadata); 1148 WARN_ON(!device->in_fs_metadata);
1149 WARN_ON(device->is_tgtdev_for_dev_replace);
1099 path = btrfs_alloc_path(); 1150 path = btrfs_alloc_path();
1100 if (!path) 1151 if (!path)
1101 return -ENOMEM; 1152 return -ENOMEM;
@@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1330 root->fs_info->avail_system_alloc_bits | 1381 root->fs_info->avail_system_alloc_bits |
1331 root->fs_info->avail_metadata_alloc_bits; 1382 root->fs_info->avail_metadata_alloc_bits;
1332 1383
1333 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && 1384 num_devices = root->fs_info->fs_devices->num_devices;
1334 root->fs_info->fs_devices->num_devices <= 4) { 1385 btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1386 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1387 WARN_ON(num_devices < 1);
1388 num_devices--;
1389 }
1390 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1391
1392 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1335 printk(KERN_ERR "btrfs: unable to go below four devices " 1393 printk(KERN_ERR "btrfs: unable to go below four devices "
1336 "on raid10\n"); 1394 "on raid10\n");
1337 ret = -EINVAL; 1395 ret = -EINVAL;
1338 goto out; 1396 goto out;
1339 } 1397 }
1340 1398
1341 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && 1399 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1342 root->fs_info->fs_devices->num_devices <= 2) {
1343 printk(KERN_ERR "btrfs: unable to go below two " 1400 printk(KERN_ERR "btrfs: unable to go below two "
1344 "devices on raid1\n"); 1401 "devices on raid1\n");
1345 ret = -EINVAL; 1402 ret = -EINVAL;
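The RAID-minimum checks in btrfs_rm_device() now use a local num_devices that is decremented while a device replace runs: the replace target appears in the device list but mirrors an existing device rather than adding redundancy, so it must not count toward the raid1/raid10 minimums. The rule in isolation (a sketch, not the kernel helper):

#include <assert.h>

/* the replace target carries no independent copy of the data */
static unsigned long long
effective_num_devices(unsigned long long num, int replace_ongoing)
{
        return replace_ongoing ? num - 1 : num;
}

int main(void)
{
        /* 5 listed devices during a replace still means only 4
         * independent ones, so raid10's minimum of 4 just holds */
        assert(effective_num_devices(5, 1) == 4);
        assert(effective_num_devices(4, 0) == 4);
        return 0;
}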
@@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1357 * is held. 1414 * is held.
1358 */ 1415 */
1359 list_for_each_entry(tmp, devices, dev_list) { 1416 list_for_each_entry(tmp, devices, dev_list) {
1360 if (tmp->in_fs_metadata && !tmp->bdev) { 1417 if (tmp->in_fs_metadata &&
1418 !tmp->is_tgtdev_for_dev_replace &&
1419 !tmp->bdev) {
1361 device = tmp; 1420 device = tmp;
1362 break; 1421 break;
1363 } 1422 }
@@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1371 goto out; 1430 goto out;
1372 } 1431 }
1373 } else { 1432 } else {
1374 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, 1433 ret = btrfs_get_bdev_and_sb(device_path,
1375 root->fs_info->bdev_holder); 1434 FMODE_WRITE | FMODE_EXCL,
1376 if (IS_ERR(bdev)) { 1435 root->fs_info->bdev_holder, 0,
1377 ret = PTR_ERR(bdev); 1436 &bdev, &bh);
1437 if (ret)
1378 goto out; 1438 goto out;
1379 }
1380
1381 set_blocksize(bdev, 4096);
1382 invalidate_bdev(bdev);
1383 bh = btrfs_read_dev_super(bdev);
1384 if (!bh) {
1385 ret = -EINVAL;
1386 goto error_close;
1387 }
1388 disk_super = (struct btrfs_super_block *)bh->b_data; 1439 disk_super = (struct btrfs_super_block *)bh->b_data;
1389 devid = btrfs_stack_device_id(&disk_super->dev_item); 1440 devid = btrfs_stack_device_id(&disk_super->dev_item);
1390 dev_uuid = disk_super->dev_item.uuid; 1441 dev_uuid = disk_super->dev_item.uuid;
1391 device = btrfs_find_device(root, devid, dev_uuid, 1442 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1392 disk_super->fsid); 1443 disk_super->fsid);
1393 if (!device) { 1444 if (!device) {
1394 ret = -ENOENT; 1445 ret = -ENOENT;
@@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1396 } 1447 }
1397 } 1448 }
1398 1449
1450 if (device->is_tgtdev_for_dev_replace) {
1451 pr_err("btrfs: unable to remove the dev_replace target dev\n");
1452 ret = -EINVAL;
1453 goto error_brelse;
1454 }
1455
1399 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1456 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1400 printk(KERN_ERR "btrfs: unable to remove the only writeable " 1457 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1401 "device\n"); 1458 "device\n");
@@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1415 if (ret) 1472 if (ret)
1416 goto error_undo; 1473 goto error_undo;
1417 1474
1475 /*
1476 * TODO: the superblock still includes this device in its num_devices
1477 * counter although write_all_supers() is not locked out. This
1478 * could give a filesystem state which requires a degraded mount.
1479 */
1418 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1480 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1419 if (ret) 1481 if (ret)
1420 goto error_undo; 1482 goto error_undo;
@@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1425 spin_unlock(&root->fs_info->free_chunk_lock); 1487 spin_unlock(&root->fs_info->free_chunk_lock);
1426 1488
1427 device->in_fs_metadata = 0; 1489 device->in_fs_metadata = 0;
1428 btrfs_scrub_cancel_dev(root, device); 1490 btrfs_scrub_cancel_dev(root->fs_info, device);
1429 1491
1430 /* 1492 /*
1431 * the device list mutex makes sure that we don't change 1493 * the device list mutex makes sure that we don't change
@@ -1482,7 +1544,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1482 * at this point, the device is zero sized. We want to 1544 * at this point, the device is zero sized. We want to
1483 * remove it from the devices list and zero out the old super 1545 * remove it from the devices list and zero out the old super
1484 */ 1546 */
1485 if (clear_super) { 1547 if (clear_super && disk_super) {
1486 /* make sure this device isn't detected as part of 1548 /* make sure this device isn't detected as part of
1487 * the FS anymore 1549 * the FS anymore
1488 */ 1550 */
@@ -1493,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1493 1555
1494 ret = 0; 1556 ret = 0;
1495 1557
1558 /* Notify udev that device has changed */
1559 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1560
1496error_brelse: 1561error_brelse:
1497 brelse(bh); 1562 brelse(bh);
1498error_close:
1499 if (bdev) 1563 if (bdev)
1500 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1564 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1501out: 1565out:
@@ -1512,6 +1576,112 @@ error_undo:
1512 goto error_brelse; 1576 goto error_brelse;
1513} 1577}
1514 1578
1579void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1580 struct btrfs_device *srcdev)
1581{
1582 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1583 list_del_rcu(&srcdev->dev_list);
1584 list_del_rcu(&srcdev->dev_alloc_list);
1585 fs_info->fs_devices->num_devices--;
1586 if (srcdev->missing) {
1587 fs_info->fs_devices->missing_devices--;
1588 fs_info->fs_devices->rw_devices++;
1589 }
1590 if (srcdev->can_discard)
1591 fs_info->fs_devices->num_can_discard--;
1592 if (srcdev->bdev)
1593 fs_info->fs_devices->open_devices--;
1594
1595 call_rcu(&srcdev->rcu, free_device);
1596}
1597
1598void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1599 struct btrfs_device *tgtdev)
1600{
1601 struct btrfs_device *next_device;
1602
1603 WARN_ON(!tgtdev);
1604 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1605 if (tgtdev->bdev) {
1606 btrfs_scratch_superblock(tgtdev);
1607 fs_info->fs_devices->open_devices--;
1608 }
1609 fs_info->fs_devices->num_devices--;
1610 if (tgtdev->can_discard)
1611 fs_info->fs_devices->num_can_discard++;
1612
1613 next_device = list_entry(fs_info->fs_devices->devices.next,
1614 struct btrfs_device, dev_list);
1615 if (tgtdev->bdev == fs_info->sb->s_bdev)
1616 fs_info->sb->s_bdev = next_device->bdev;
1617 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1618 fs_info->fs_devices->latest_bdev = next_device->bdev;
1619 list_del_rcu(&tgtdev->dev_list);
1620
1621 call_rcu(&tgtdev->rcu, free_device);
1622
1623 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1624}
1625
1626int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1627 struct btrfs_device **device)
1628{
1629 int ret = 0;
1630 struct btrfs_super_block *disk_super;
1631 u64 devid;
1632 u8 *dev_uuid;
1633 struct block_device *bdev;
1634 struct buffer_head *bh;
1635
1636 *device = NULL;
1637 ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1638 root->fs_info->bdev_holder, 0, &bdev, &bh);
1639 if (ret)
1640 return ret;
1641 disk_super = (struct btrfs_super_block *)bh->b_data;
1642 devid = btrfs_stack_device_id(&disk_super->dev_item);
1643 dev_uuid = disk_super->dev_item.uuid;
1644 *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1645 disk_super->fsid);
1646 brelse(bh);
1647 if (!*device)
1648 ret = -ENOENT;
1649 blkdev_put(bdev, FMODE_READ);
1650 return ret;
1651}
1652
1653int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1654 char *device_path,
1655 struct btrfs_device **device)
1656{
1657 *device = NULL;
1658 if (strcmp(device_path, "missing") == 0) {
1659 struct list_head *devices;
1660 struct btrfs_device *tmp;
1661
1662 devices = &root->fs_info->fs_devices->devices;
1663 /*
1664 * It is safe to read the devices since the volume_mutex
1665 * is held by the caller.
1666 */
1667 list_for_each_entry(tmp, devices, dev_list) {
1668 if (tmp->in_fs_metadata && !tmp->bdev) {
1669 *device = tmp;
1670 break;
1671 }
1672 }
1673
1674 if (!*device) {
1675 pr_err("btrfs: no missing device found\n");
1676 return -ENOENT;
1677 }
1678
1679 return 0;
1680 } else {
1681 return btrfs_find_device_by_path(root, device_path, device);
1682 }
1683}
1684
1515/* 1685/*
1516 * does all the dirty work required for changing the file system's UUID. 1686 * does all the dirty work required for changing the file system's UUID.
1517 */ 1687 */
@@ -1630,7 +1800,8 @@ next_slot:
1630 read_extent_buffer(leaf, fs_uuid, 1800 read_extent_buffer(leaf, fs_uuid,
1631 (unsigned long)btrfs_device_fsid(dev_item), 1801 (unsigned long)btrfs_device_fsid(dev_item),
1632 BTRFS_UUID_SIZE); 1802 BTRFS_UUID_SIZE);
1633 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); 1803 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1804 fs_uuid);
1634 BUG_ON(!device); /* Logic error */ 1805 BUG_ON(!device); /* Logic error */
1635 1806
1636 if (device->fs_devices->seeding) { 1807 if (device->fs_devices->seeding) {
@@ -1678,16 +1849,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1678 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1849 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1679 1850
1680 devices = &root->fs_info->fs_devices->devices; 1851 devices = &root->fs_info->fs_devices->devices;
1681 /* 1852
1682 * we have the volume lock, so we don't need the extra 1853 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1683 * device list mutex while reading the list here.
1684 */
1685 list_for_each_entry(device, devices, dev_list) { 1854 list_for_each_entry(device, devices, dev_list) {
1686 if (device->bdev == bdev) { 1855 if (device->bdev == bdev) {
1687 ret = -EEXIST; 1856 ret = -EEXIST;
1857 mutex_unlock(
1858 &root->fs_info->fs_devices->device_list_mutex);
1688 goto error; 1859 goto error;
1689 } 1860 }
1690 } 1861 }
1862 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1691 1863
1692 device = kzalloc(sizeof(*device), GFP_NOFS); 1864 device = kzalloc(sizeof(*device), GFP_NOFS);
1693 if (!device) { 1865 if (!device) {
@@ -1737,6 +1909,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1737 device->dev_root = root->fs_info->dev_root; 1909 device->dev_root = root->fs_info->dev_root;
1738 device->bdev = bdev; 1910 device->bdev = bdev;
1739 device->in_fs_metadata = 1; 1911 device->in_fs_metadata = 1;
1912 device->is_tgtdev_for_dev_replace = 0;
1740 device->mode = FMODE_EXCL; 1913 device->mode = FMODE_EXCL;
1741 set_blocksize(device->bdev, 4096); 1914 set_blocksize(device->bdev, 4096);
1742 1915
@@ -1844,6 +2017,98 @@ error:
1844 return ret; 2017 return ret;
1845} 2018}
1846 2019
2020int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2021 struct btrfs_device **device_out)
2022{
2023 struct request_queue *q;
2024 struct btrfs_device *device;
2025 struct block_device *bdev;
2026 struct btrfs_fs_info *fs_info = root->fs_info;
2027 struct list_head *devices;
2028 struct rcu_string *name;
2029 int ret = 0;
2030
2031 *device_out = NULL;
2032 if (fs_info->fs_devices->seeding)
2033 return -EINVAL;
2034
2035 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2036 fs_info->bdev_holder);
2037 if (IS_ERR(bdev))
2038 return PTR_ERR(bdev);
2039
2040 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2041
2042 devices = &fs_info->fs_devices->devices;
2043 list_for_each_entry(device, devices, dev_list) {
2044 if (device->bdev == bdev) {
2045 ret = -EEXIST;
2046 goto error;
2047 }
2048 }
2049
2050 device = kzalloc(sizeof(*device), GFP_NOFS);
2051 if (!device) {
2052 ret = -ENOMEM;
2053 goto error;
2054 }
2055
2056 name = rcu_string_strdup(device_path, GFP_NOFS);
2057 if (!name) {
2058 kfree(device);
2059 ret = -ENOMEM;
2060 goto error;
2061 }
2062 rcu_assign_pointer(device->name, name);
2063
2064 q = bdev_get_queue(bdev);
2065 if (blk_queue_discard(q))
2066 device->can_discard = 1;
2067 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2068 device->writeable = 1;
2069 device->work.func = pending_bios_fn;
2070 generate_random_uuid(device->uuid);
2071 device->devid = BTRFS_DEV_REPLACE_DEVID;
2072 spin_lock_init(&device->io_lock);
2073 device->generation = 0;
2074 device->io_width = root->sectorsize;
2075 device->io_align = root->sectorsize;
2076 device->sector_size = root->sectorsize;
2077 device->total_bytes = i_size_read(bdev->bd_inode);
2078 device->disk_total_bytes = device->total_bytes;
2079 device->dev_root = fs_info->dev_root;
2080 device->bdev = bdev;
2081 device->in_fs_metadata = 1;
2082 device->is_tgtdev_for_dev_replace = 1;
2083 device->mode = FMODE_EXCL;
2084 set_blocksize(device->bdev, 4096);
2085 device->fs_devices = fs_info->fs_devices;
2086 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2087 fs_info->fs_devices->num_devices++;
2088 fs_info->fs_devices->open_devices++;
2089 if (device->can_discard)
2090 fs_info->fs_devices->num_can_discard++;
2091 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2092
2093 *device_out = device;
2094 return ret;
2095
2096error:
2097 blkdev_put(bdev, FMODE_EXCL);
2098 return ret;
2099}
2100
2101void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2102 struct btrfs_device *tgtdev)
2103{
2104 WARN_ON(fs_info->fs_devices->rw_devices == 0);
2105 tgtdev->io_width = fs_info->dev_root->sectorsize;
2106 tgtdev->io_align = fs_info->dev_root->sectorsize;
2107 tgtdev->sector_size = fs_info->dev_root->sectorsize;
2108 tgtdev->dev_root = fs_info->dev_root;
2109 tgtdev->in_fs_metadata = 1;
2110}
2111
1847static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2112static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1848 struct btrfs_device *device) 2113 struct btrfs_device *device)
1849{ 2114{
@@ -1900,7 +2165,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1900 2165
1901 if (!device->writeable) 2166 if (!device->writeable)
1902 return -EACCES; 2167 return -EACCES;
1903 if (new_size <= device->total_bytes) 2168 if (new_size <= device->total_bytes ||
2169 device->is_tgtdev_for_dev_replace)
1904 return -EINVAL; 2170 return -EINVAL;
1905 2171
1906 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2172 btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -2338,18 +2604,6 @@ static int chunk_profiles_filter(u64 chunk_type,
2338 return 1; 2604 return 1;
2339} 2605}
2340 2606
2341static u64 div_factor_fine(u64 num, int factor)
2342{
2343 if (factor <= 0)
2344 return 0;
2345 if (factor >= 100)
2346 return num;
2347
2348 num *= factor;
2349 do_div(num, 100);
2350 return num;
2351}
2352
2353static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 2607static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2354 struct btrfs_balance_args *bargs) 2608 struct btrfs_balance_args *bargs)
2355{ 2609{
@@ -2360,7 +2614,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2360 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 2614 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2361 chunk_used = btrfs_block_group_used(&cache->item); 2615 chunk_used = btrfs_block_group_used(&cache->item);
2362 2616
2363 user_thresh = div_factor_fine(cache->key.offset, bargs->usage); 2617 if (bargs->usage == 0)
2618 user_thresh = 0;
2619 else if (bargs->usage > 100)
2620 user_thresh = cache->key.offset;
2621 else
2622 user_thresh = div_factor_fine(cache->key.offset,
2623 bargs->usage);
2624
2364 if (chunk_used < user_thresh) 2625 if (chunk_used < user_thresh)
2365 ret = 0; 2626 ret = 0;
2366 2627
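div_factor_fine() and div_factor() are removed here because they move into the new fs/btrfs/math.h (see the diffstat), and chunk_usage_filter() now spells out the 0 and >100 boundary cases of the balance 'usage' argument instead of relying on the helper's clamping. The removed helper, reproduced standalone with the clamps exercised (the kernel version used do_div() for 64-bit division on 32-bit targets; plain division suffices here):

#include <assert.h>

typedef unsigned long long u64;

static u64 div_factor_fine(u64 num, int factor)
{
        if (factor <= 0)
                return 0;
        if (factor >= 100)
                return num;
        return num * factor / 100;
}

int main(void)
{
        u64 chunk_size = 1024 * 1024 * 1024ULL;

        assert(div_factor_fine(chunk_size, 0) == 0);
        assert(div_factor_fine(chunk_size, 150) == chunk_size);
        assert(div_factor_fine(chunk_size, 50) == chunk_size / 2);
        return 0;
}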
@@ -2514,15 +2775,6 @@ static int should_balance_chunk(struct btrfs_root *root,
2514 return 1; 2775 return 1;
2515} 2776}
2516 2777
2517static u64 div_factor(u64 num, int factor)
2518{
2519 if (factor == 10)
2520 return num;
2521 num *= factor;
2522 do_div(num, 10);
2523 return num;
2524}
2525
2526static int __btrfs_balance(struct btrfs_fs_info *fs_info) 2778static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2527{ 2779{
2528 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 2780 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -2550,7 +2802,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2550 size_to_free = div_factor(old_size, 1); 2802 size_to_free = div_factor(old_size, 1);
2551 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 2803 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2552 if (!device->writeable || 2804 if (!device->writeable ||
2553 device->total_bytes - device->bytes_used > size_to_free) 2805 device->total_bytes - device->bytes_used > size_to_free ||
2806 device->is_tgtdev_for_dev_replace)
2554 continue; 2807 continue;
2555 2808
2556 ret = btrfs_shrink_device(device, old_size - size_to_free); 2809 ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2713,6 +2966,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
2713 unset_balance_control(fs_info); 2966 unset_balance_control(fs_info);
2714 ret = del_balance_item(fs_info->tree_root); 2967 ret = del_balance_item(fs_info->tree_root);
2715 BUG_ON(ret); 2968 BUG_ON(ret);
2969
2970 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
2716} 2971}
2717 2972
2718void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, 2973void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
@@ -2728,6 +2983,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2728 u64 allowed; 2983 u64 allowed;
2729 int mixed = 0; 2984 int mixed = 0;
2730 int ret; 2985 int ret;
2986 u64 num_devices;
2731 2987
2732 if (btrfs_fs_closing(fs_info) || 2988 if (btrfs_fs_closing(fs_info) ||
2733 atomic_read(&fs_info->balance_pause_req) || 2989 atomic_read(&fs_info->balance_pause_req) ||
@@ -2756,10 +3012,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2756 } 3012 }
2757 } 3013 }
2758 3014
3015 num_devices = fs_info->fs_devices->num_devices;
3016 btrfs_dev_replace_lock(&fs_info->dev_replace);
3017 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3018 BUG_ON(num_devices < 1);
3019 num_devices--;
3020 }
3021 btrfs_dev_replace_unlock(&fs_info->dev_replace);
2759 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3022 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2760 if (fs_info->fs_devices->num_devices == 1) 3023 if (num_devices == 1)
2761 allowed |= BTRFS_BLOCK_GROUP_DUP; 3024 allowed |= BTRFS_BLOCK_GROUP_DUP;
2762 else if (fs_info->fs_devices->num_devices < 4) 3025 else if (num_devices < 4)
2763 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3026 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2764 else 3027 else
2765 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3028 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -2884,8 +3147,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2884out: 3147out:
2885 if (bctl->flags & BTRFS_BALANCE_RESUME) 3148 if (bctl->flags & BTRFS_BALANCE_RESUME)
2886 __cancel_balance(fs_info); 3149 __cancel_balance(fs_info);
2887 else 3150 else {
2888 kfree(bctl); 3151 kfree(bctl);
3152 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3153 }
2889 return ret; 3154 return ret;
2890} 3155}
2891 3156
@@ -2977,6 +3242,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
2977 btrfs_balance_sys(leaf, item, &disk_bargs); 3242 btrfs_balance_sys(leaf, item, &disk_bargs);
2978 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 3243 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2979 3244
3245 WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
3246
2980 mutex_lock(&fs_info->volume_mutex); 3247 mutex_lock(&fs_info->volume_mutex);
2981 mutex_lock(&fs_info->balance_mutex); 3248 mutex_lock(&fs_info->balance_mutex);
2982 3249
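mutually_exclusive_operation_running is a single flag shared by balance, device add/remove and device replace; btrfs_recover_balance() claims it with atomic_xchg() and WARNs if it was already set, while the __cancel_balance()/btrfs_balance() changes above release it on every exit path. The claim-and-assert idiom, standalone in C11 atomics:

#include <assert.h>
#include <stdatomic.h>

static atomic_int op_running;

int main(void)
{
        /* exchange returns the old value: non-zero would mean some
         * other exclusive operation had already claimed the flag */
        assert(atomic_exchange(&op_running, 1) == 0);

        /* ... the exclusive operation runs here ... */

        atomic_store(&op_running, 0);   /* release on every exit path */
        return 0;
}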
@@ -3080,7 +3347,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3080 u64 old_size = device->total_bytes; 3347 u64 old_size = device->total_bytes;
3081 u64 diff = device->total_bytes - new_size; 3348 u64 diff = device->total_bytes - new_size;
3082 3349
3083 if (new_size >= device->total_bytes) 3350 if (device->is_tgtdev_for_dev_replace)
3084 return -EINVAL; 3351 return -EINVAL;
3085 3352
3086 path = btrfs_alloc_path(); 3353 path = btrfs_alloc_path();
@@ -3235,6 +3502,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3235 return 0; 3502 return 0;
3236} 3503}
3237 3504
3505struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3506 { 2, 1, 0, 4, 2, 2 /* raid10 */ },
3507 { 1, 1, 2, 2, 2, 2 /* raid1 */ },
3508 { 1, 2, 1, 1, 1, 2 /* dup */ },
3509 { 1, 1, 0, 2, 1, 1 /* raid0 */ },
3510 { 1, 1, 1, 1, 1, 1 /* single */ },
3511};
3512
3238static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3513static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3239 struct btrfs_root *extent_root, 3514 struct btrfs_root *extent_root,
3240 struct map_lookup **map_ret, 3515 struct map_lookup **map_ret,
@@ -3264,43 +3539,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int ndevs;
 	int i;
 	int j;
+	int index;
 
 	BUG_ON(!alloc_profile_is_valid(type, 0));
 
 	if (list_empty(&fs_devices->alloc_list))
 		return -ENOSPC;
 
-	sub_stripes = 1;
-	dev_stripes = 1;
-	devs_increment = 1;
-	ncopies = 1;
-	devs_max = 0;	/* 0 == as many as possible */
-	devs_min = 1;
+	index = __get_raid_index(type);
 
-	/*
-	 * define the properties of each RAID type.
-	 * FIXME: move this to a global table and use it in all RAID
-	 * calculation code
-	 */
-	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
-		dev_stripes = 2;
-		ncopies = 2;
-		devs_max = 1;
-	} else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-		devs_min = 2;
-	} else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
-		devs_increment = 2;
-		ncopies = 2;
-		devs_max = 2;
-		devs_min = 2;
-	} else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-		sub_stripes = 2;
-		devs_increment = 2;
-		ncopies = 2;
-		devs_min = 4;
-	} else {
-		devs_max = 1;
-	}
+	sub_stripes = btrfs_raid_array[index].sub_stripes;
+	dev_stripes = btrfs_raid_array[index].dev_stripes;
+	devs_max = btrfs_raid_array[index].devs_max;
+	devs_min = btrfs_raid_array[index].devs_min;
+	devs_increment = btrfs_raid_array[index].devs_increment;
+	ncopies = btrfs_raid_array[index].ncopies;
 
 	if (type & BTRFS_BLOCK_GROUP_DATA) {
 		max_stripe_size = 1024 * 1024 * 1024;
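The per-RAID-type parameters previously hand-rolled in an if/else chain now come from a single btrfs_raid_array lookup, which is what the removed FIXME comment had asked for. As a rough user-space sketch of the same table-driven idea (the block-group bit values and the __get_raid_index() mapping below are assumptions reconstructed from the removed branch logic, not part of this hunk):

#include <stdio.h>

#define BLOCK_GROUP_RAID0  (1 << 3)
#define BLOCK_GROUP_RAID1  (1 << 4)
#define BLOCK_GROUP_DUP    (1 << 5)
#define BLOCK_GROUP_RAID10 (1 << 6)

enum { RAID10, RAID1, DUP, RAID0, SINGLE, NR_RAID_TYPES };

/* same field order and values as btrfs_raid_array in the hunk above */
static const struct raid_attr {
    int sub_stripes, dev_stripes, devs_max, devs_min, devs_increment, ncopies;
} raid_array[NR_RAID_TYPES] = {
    [RAID10] = { 2, 1, 0, 4, 2, 2 },
    [RAID1]  = { 1, 1, 2, 2, 2, 2 },
    [DUP]    = { 1, 2, 1, 1, 1, 2 },
    [RAID0]  = { 1, 1, 0, 2, 1, 1 },
    [SINGLE] = { 1, 1, 1, 1, 1, 1 },
};

/* assumed mapping from type bits to table index, per the old branches */
static int get_raid_index(unsigned long type)
{
    if (type & BLOCK_GROUP_RAID10) return RAID10;
    if (type & BLOCK_GROUP_RAID1)  return RAID1;
    if (type & BLOCK_GROUP_DUP)    return DUP;
    if (type & BLOCK_GROUP_RAID0)  return RAID0;
    return SINGLE;
}

int main(void)
{
    const struct raid_attr *a = &raid_array[get_raid_index(BLOCK_GROUP_RAID10)];

    /* RAID10 needs at least 4 devices and keeps 2 copies of the data */
    printf("devs_min=%d ncopies=%d\n", a->devs_min, a->ncopies);
    return 0;
}

Keeping the policy in one table lets the other RAID calculation paths share it instead of repeating the branches.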
@@ -3347,13 +3600,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		cur = cur->next;
 
 		if (!device->writeable) {
-			printk(KERN_ERR
+			WARN(1, KERN_ERR
 			       "btrfs: read-only device in alloc_list\n");
-			WARN_ON(1);
 			continue;
 		}
 
-		if (!device->in_fs_metadata)
+		if (!device->in_fs_metadata ||
+		    device->is_tgtdev_for_dev_replace)
 			continue;
 
 		if (device->total_bytes > device->bytes_used)
@@ -3382,6 +3635,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		devices_info[ndevs].total_avail = total_avail;
 		devices_info[ndevs].dev = device;
 		++ndevs;
+		WARN_ON(ndevs > fs_devices->rw_devices);
 	}
 
 	/*
@@ -3740,8 +3994,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 	}
 }
 
-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 {
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct extent_map *em;
 	struct map_lookup *map;
 	struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -3761,32 +4016,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	else
 		ret = 1;
 	free_extent_map(em);
+
+	btrfs_dev_replace_lock(&fs_info->dev_replace);
+	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+		ret++;
+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
 	return ret;
 }
 
-static int find_live_mirror(struct map_lookup *map, int first, int num,
-			    int optimal)
+static int find_live_mirror(struct btrfs_fs_info *fs_info,
+			    struct map_lookup *map, int first, int num,
+			    int optimal, int dev_replace_is_ongoing)
 {
 	int i;
-	if (map->stripes[optimal].dev->bdev)
-		return optimal;
-	for (i = first; i < first + num; i++) {
-		if (map->stripes[i].dev->bdev)
-			return i;
+	int tolerance;
+	struct btrfs_device *srcdev;
+
+	if (dev_replace_is_ongoing &&
+	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+		BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
+		srcdev = fs_info->dev_replace.srcdev;
+	else
+		srcdev = NULL;
+
+	/*
+	 * try to avoid the drive that is the source drive for a
+	 * dev-replace procedure, only choose it if no other non-missing
+	 * mirror is available
+	 */
+	for (tolerance = 0; tolerance < 2; tolerance++) {
+		if (map->stripes[optimal].dev->bdev &&
+		    (tolerance || map->stripes[optimal].dev != srcdev))
+			return optimal;
+		for (i = first; i < first + num; i++) {
+			if (map->stripes[i].dev->bdev &&
+			    (tolerance || map->stripes[i].dev != srcdev))
+				return i;
+		}
 	}
+
 	/* we couldn't find one that doesn't fail. Just return something
 	 * and the io error handling code will clean up eventually
 	 */
 	return optimal;
 }
 
-static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_bio **bbio_ret,
 			     int mirror_num)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct extent_map_tree *em_tree = &map_tree->map_tree;
 	u64 offset;
 	u64 stripe_offset;
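The new find_live_mirror() makes two passes: the first pass refuses the dev-replace source drive, the second accepts it as a last resort. A minimal user-space sketch of that tolerance loop (the mirror struct and "present" flag are simplified stand-ins for stripe->dev->bdev being non-NULL):

#include <stdio.h>

struct mirror { int present; int is_replace_src; };

static int pick_mirror(const struct mirror *m, int first, int num, int optimal)
{
    int tolerance, i;

    /* pass 0: skip the replace source; pass 1: take anything present */
    for (tolerance = 0; tolerance < 2; tolerance++) {
        if (m[optimal].present &&
            (tolerance || !m[optimal].is_replace_src))
            return optimal;
        for (i = first; i < first + num; i++)
            if (m[i].present &&
                (tolerance || !m[i].is_replace_src))
                return i;
    }
    /* nothing usable; let the I/O error path sort it out */
    return optimal;
}

int main(void)
{
    struct mirror m[2] = { { 1, 1 }, { 1, 0 } };

    /* mirror 0 is the replace source, so mirror 1 is preferred */
    printf("picked mirror %d\n", pick_mirror(m, 0, 2, 0));
    return 0;
}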
@@ -3800,6 +4083,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int num_stripes;
 	int max_errors = 0;
 	struct btrfs_bio *bbio = NULL;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	int dev_replace_is_ongoing = 0;
+	int num_alloc_stripes;
+	int patch_the_first_stripe_for_dev_replace = 0;
+	u64 physical_to_patch_in_first_stripe = 0;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
@@ -3816,9 +4104,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
-	if (mirror_num > map->num_stripes)
-		mirror_num = 0;
-
 	stripe_nr = offset;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
@@ -3845,6 +4130,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	if (!bbio_ret)
 		goto out;
 
+	btrfs_dev_replace_lock(dev_replace);
+	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+	if (!dev_replace_is_ongoing)
+		btrfs_dev_replace_unlock(dev_replace);
+
+	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
+	    !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
+	    dev_replace->tgtdev != NULL) {
+		/*
+		 * in dev-replace case, for repair case (that's the only
+		 * case where the mirror is selected explicitly when
+		 * calling btrfs_map_block), blocks left of the left cursor
+		 * can also be read from the target drive.
+		 * For REQ_GET_READ_MIRRORS, the target drive is added as
+		 * the last one to the array of stripes. For READ, it also
+		 * needs to be supported using the same mirror number.
+		 * If the requested block is not left of the left cursor,
+		 * EIO is returned. This can happen because btrfs_num_copies()
+		 * returns one more in the dev-replace case.
+		 */
+		u64 tmp_length = *length;
+		struct btrfs_bio *tmp_bbio = NULL;
+		int tmp_num_stripes;
+		u64 srcdev_devid = dev_replace->srcdev->devid;
+		int index_srcdev = 0;
+		int found = 0;
+		u64 physical_of_found = 0;
+
+		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
+					logical, &tmp_length, &tmp_bbio, 0);
+		if (ret) {
+			WARN_ON(tmp_bbio != NULL);
+			goto out;
+		}
+
+		tmp_num_stripes = tmp_bbio->num_stripes;
+		if (mirror_num > tmp_num_stripes) {
+			/*
+			 * REQ_GET_READ_MIRRORS does not contain this
+			 * mirror, that means that the requested area
+			 * is not left of the left cursor
+			 */
+			ret = -EIO;
+			kfree(tmp_bbio);
+			goto out;
+		}
+
+		/*
+		 * process the rest of the function using the mirror_num
+		 * of the source drive. Therefore look it up first.
+		 * At the end, patch the device pointer to the one of the
+		 * target drive.
+		 */
+		for (i = 0; i < tmp_num_stripes; i++) {
+			if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
+				/*
+				 * In case of DUP, in order to keep it
+				 * simple, only add the mirror with the
+				 * lowest physical address
+				 */
+				if (found &&
+				    physical_of_found <=
+				     tmp_bbio->stripes[i].physical)
+					continue;
+				index_srcdev = i;
+				found = 1;
+				physical_of_found =
+					tmp_bbio->stripes[i].physical;
+			}
+		}
+
+		if (found) {
+			mirror_num = index_srcdev + 1;
+			patch_the_first_stripe_for_dev_replace = 1;
+			physical_to_patch_in_first_stripe = physical_of_found;
+		} else {
+			WARN_ON(1);
+			ret = -EIO;
+			kfree(tmp_bbio);
+			goto out;
+		}
+
+		kfree(tmp_bbio);
+	} else if (mirror_num > map->num_stripes) {
+		mirror_num = 0;
+	}
+
 	num_stripes = 1;
 	stripe_index = 0;
 	stripe_nr_orig = stripe_nr;
@@ -3859,19 +4231,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 					   stripe_nr_end - stripe_nr_orig);
 		stripe_index = do_div(stripe_nr, map->num_stripes);
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		if (rw & (REQ_WRITE | REQ_DISCARD))
+		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
 		else {
-			stripe_index = find_live_mirror(map, 0,
+			stripe_index = find_live_mirror(fs_info, map, 0,
 					    map->num_stripes,
-					    current->pid % map->num_stripes);
+					    current->pid % map->num_stripes,
+					    dev_replace_is_ongoing);
 			mirror_num = stripe_index + 1;
 		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		if (rw & (REQ_WRITE | REQ_DISCARD)) {
+		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
 			num_stripes = map->num_stripes;
 		} else if (mirror_num) {
 			stripe_index = mirror_num - 1;
@@ -3885,7 +4258,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		stripe_index = do_div(stripe_nr, factor);
 		stripe_index *= map->sub_stripes;
 
-		if (rw & REQ_WRITE)
+		if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
 			num_stripes = map->sub_stripes;
 		else if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->sub_stripes *
@@ -3895,9 +4268,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			stripe_index += mirror_num - 1;
 		else {
 			int old_stripe_index = stripe_index;
-			stripe_index = find_live_mirror(map, stripe_index,
+			stripe_index = find_live_mirror(fs_info, map,
+					      stripe_index,
 					      map->sub_stripes, stripe_index +
-					      current->pid % map->sub_stripes);
+					      current->pid % map->sub_stripes,
+					      dev_replace_is_ongoing);
 			mirror_num = stripe_index - old_stripe_index + 1;
 		}
 	} else {
@@ -3911,7 +4286,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
 
-	bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+	num_alloc_stripes = num_stripes;
+	if (dev_replace_is_ongoing) {
+		if (rw & (REQ_WRITE | REQ_DISCARD))
+			num_alloc_stripes <<= 1;
+		if (rw & REQ_GET_READ_MIRRORS)
+			num_alloc_stripes++;
+	}
+	bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
 	if (!bbio) {
 		ret = -ENOMEM;
 		goto out;
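The allocation is sized for the worst case up front: while a replace is running, a write may need a duplicate stripe per original (hence the shift), and REQ_GET_READ_MIRRORS may report the target device as one extra mirror (hence the increment). A tiny sketch of that sizing rule, with the request flags reduced to an enum:

#include <stdio.h>

enum { OP_READ, OP_WRITE, OP_GET_READ_MIRRORS };

static int alloc_stripes(int num_stripes, int op, int replace_running)
{
    int n = num_stripes;

    if (replace_running) {
        if (op == OP_WRITE)
            n <<= 1;    /* every stripe may be duplicated */
        else if (op == OP_GET_READ_MIRRORS)
            n += 1;     /* target device as an extra mirror */
    }
    return n;
}

int main(void)
{
    printf("%d\n", alloc_stripes(2, OP_WRITE, 1)); /* prints 4 */
    return 0;
}

Over-allocating here avoids having to grow the btrfs_bio after the stripe array has been filled in.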
@@ -3998,7 +4380,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		}
 	}
 
-	if (rw & REQ_WRITE) {
+	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_RAID10 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
@@ -4006,20 +4388,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		}
 	}
 
+	if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+	    dev_replace->tgtdev != NULL) {
+		int index_where_to_add;
+		u64 srcdev_devid = dev_replace->srcdev->devid;
+
+		/*
+		 * duplicate the write operations while the dev replace
+		 * procedure is running. Since the copying of the old disk
+		 * to the new disk takes place at run time while the
+		 * filesystem is mounted writable, the regular write
+		 * operations to the old disk have to be duplicated to go
+		 * to the new disk as well.
+		 * Note that device->missing is handled by the caller, and
+		 * that the write to the old disk is already set up in the
+		 * stripes array.
+		 */
+		index_where_to_add = num_stripes;
+		for (i = 0; i < num_stripes; i++) {
+			if (bbio->stripes[i].dev->devid == srcdev_devid) {
+				/* write to new disk, too */
+				struct btrfs_bio_stripe *new =
+					bbio->stripes + index_where_to_add;
+				struct btrfs_bio_stripe *old =
+					bbio->stripes + i;
+
+				new->physical = old->physical;
+				new->length = old->length;
+				new->dev = dev_replace->tgtdev;
+				index_where_to_add++;
+				max_errors++;
+			}
+		}
+		num_stripes = index_where_to_add;
+	} else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
+		   dev_replace->tgtdev != NULL) {
+		u64 srcdev_devid = dev_replace->srcdev->devid;
+		int index_srcdev = 0;
+		int found = 0;
+		u64 physical_of_found = 0;
+
+		/*
+		 * During the dev-replace procedure, the target drive can
+		 * also be used to read data in case it is needed to repair
+		 * a corrupt block elsewhere. This is possible if the
+		 * requested area is left of the left cursor. In this area,
+		 * the target drive is a full copy of the source drive.
+		 */
+		for (i = 0; i < num_stripes; i++) {
+			if (bbio->stripes[i].dev->devid == srcdev_devid) {
+				/*
+				 * In case of DUP, in order to keep it
+				 * simple, only add the mirror with the
+				 * lowest physical address
+				 */
+				if (found &&
+				    physical_of_found <=
+				     bbio->stripes[i].physical)
+					continue;
+				index_srcdev = i;
+				found = 1;
+				physical_of_found = bbio->stripes[i].physical;
+			}
+		}
+		if (found) {
+			u64 length = map->stripe_len;
+
+			if (physical_of_found + length <=
+			    dev_replace->cursor_left) {
+				struct btrfs_bio_stripe *tgtdev_stripe =
+					bbio->stripes + num_stripes;
+
+				tgtdev_stripe->physical = physical_of_found;
+				tgtdev_stripe->length =
+					bbio->stripes[index_srcdev].length;
+				tgtdev_stripe->dev = dev_replace->tgtdev;
+
+				num_stripes++;
+			}
+		}
+	}
+
 	*bbio_ret = bbio;
 	bbio->num_stripes = num_stripes;
 	bbio->max_errors = max_errors;
 	bbio->mirror_num = mirror_num;
+
+	/*
+	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
+	 * mirror_num == num_stripes + 1 && dev_replace target drive is
+	 * available as a mirror
+	 */
+	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
+		WARN_ON(num_stripes > 1);
+		bbio->stripes[0].dev = dev_replace->tgtdev;
+		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
+		bbio->mirror_num = map->num_stripes + 1;
+	}
 out:
+	if (dev_replace_is_ongoing)
+		btrfs_dev_replace_unlock(dev_replace);
 	free_extent_map(em);
 	return ret;
 }
 
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_bio **bbio_ret, int mirror_num)
 {
-	return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
+	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
 				 mirror_num);
 }
 
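The write-duplication loop above is the heart of live dev-replace: every write stripe aimed at the source device is appended a second time, retargeted at the replacement device, and max_errors grows by one per duplicate. A user-space sketch of that copying logic (the stripe struct with integer device ids is a simplified stand-in for btrfs_bio_stripe with its device pointers):

#include <stdio.h>

struct stripe { int devid; unsigned long long physical, length; };

static int dup_for_replace(struct stripe *s, int num, int cap,
                           int src_devid, int tgt_devid, int *max_errors)
{
    int i, add = num;

    for (i = 0; i < num && add < cap; i++) {
        if (s[i].devid != src_devid)
            continue;
        s[add] = s[i];           /* same physical offset and length */
        s[add].devid = tgt_devid;
        add++;
        (*max_errors)++;         /* one extra copy may fail safely */
    }
    return add;                  /* new stripe count */
}

int main(void)
{
    struct stripe s[4] = { { 1, 4096, 4096 }, { 2, 8192, 4096 } };
    int max_errors = 0;
    int n = dup_for_replace(s, 2, 4, 1, 99, &max_errors);

    printf("stripes=%d max_errors=%d tgt devid=%d\n",
           n, max_errors, s[2].devid);
    return 0;
}

This is why num_alloc_stripes was doubled earlier: the array must have room for one duplicate per original stripe in the worst case.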
@@ -4238,10 +4715,116 @@ static noinline void schedule_bio(struct btrfs_root *root,
 				   &device->work);
 }
 
+static int bio_size_ok(struct block_device *bdev, struct bio *bio,
+		       sector_t sector)
+{
+	struct bio_vec *prev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	unsigned short max_sectors = queue_max_sectors(q);
+	struct bvec_merge_data bvm = {
+		.bi_bdev = bdev,
+		.bi_sector = sector,
+		.bi_rw = bio->bi_rw,
+	};
+
+	if (bio->bi_vcnt == 0) {
+		WARN_ON(1);
+		return 1;
+	}
+
+	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
+	if ((bio->bi_size >> 9) > max_sectors)
+		return 0;
+
+	if (!q->merge_bvec_fn)
+		return 1;
+
+	bvm.bi_size = bio->bi_size - prev->bv_len;
+	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
+		return 0;
+	return 1;
+}
+
+static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+			      struct bio *bio, u64 physical, int dev_nr,
+			      int rw, int async)
+{
+	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
+
+	bio->bi_private = bbio;
+	bio->bi_private = merge_stripe_index_into_bio_private(
+			bio->bi_private, (unsigned int)dev_nr);
+	bio->bi_end_io = btrfs_end_bio;
+	bio->bi_sector = physical >> 9;
+#ifdef DEBUG
+	{
+		struct rcu_string *name;
+
+		rcu_read_lock();
+		name = rcu_dereference(dev->name);
+		pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
+			 "(%s id %llu), size=%u\n", rw,
+			 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
+			 name->str, dev->devid, bio->bi_size);
+		rcu_read_unlock();
+	}
+#endif
+	bio->bi_bdev = dev->bdev;
+	if (async)
+		schedule_bio(root, dev, rw, bio);
+	else
+		btrfsic_submit_bio(rw, bio);
+}
+
+static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+			      struct bio *first_bio, struct btrfs_device *dev,
+			      int dev_nr, int rw, int async)
+{
+	struct bio_vec *bvec = first_bio->bi_io_vec;
+	struct bio *bio;
+	int nr_vecs = bio_get_nr_vecs(dev->bdev);
+	u64 physical = bbio->stripes[dev_nr].physical;
+
+again:
+	bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
+	if (!bio)
+		return -ENOMEM;
+
+	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
+		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+				 bvec->bv_offset) < bvec->bv_len) {
+			u64 len = bio->bi_size;
+
+			atomic_inc(&bbio->stripes_pending);
+			submit_stripe_bio(root, bbio, bio, physical, dev_nr,
+					  rw, async);
+			physical += len;
+			goto again;
+		}
+		bvec++;
+	}
+
+	submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
+	return 0;
+}
+
+static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+{
+	atomic_inc(&bbio->error);
+	if (atomic_dec_and_test(&bbio->stripes_pending)) {
+		bio->bi_private = bbio->private;
+		bio->bi_end_io = bbio->end_io;
+		bio->bi_bdev = (struct block_device *)
+				(unsigned long)bbio->mirror_num;
+		bio->bi_sector = logical >> 9;
+		kfree(bbio);
+		bio_endio(bio, -EIO);
+	}
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		  int mirror_num, int async_submit)
 {
-	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
 	struct bio *first_bio = bio;
 	u64 logical = (u64)bio->bi_sector << 9;
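breakup_stripe_bio() handles the case where one stripe's worth of pages no longer fits in a single bio for the target device: it keeps adding pages until the device refuses one, submits what it has, advances the physical offset by the bytes just queued, and starts a fresh bio. A heavily simplified user-space sketch of that idea, with fixed-size chunks standing in for the bio_add_page() failure point:

#include <stdio.h>

/* stand-in for submit_stripe_bio() */
static void submit(unsigned long long physical, unsigned int len)
{
    printf("submit %u bytes at %llu\n", len, physical);
}

static void breakup(unsigned long long physical, unsigned int total,
                    unsigned int max_per_bio)
{
    /* submit full-sized pieces, advancing the offset each time */
    while (total > max_per_bio) {
        submit(physical, max_per_bio);
        physical += max_per_bio;
        total -= max_per_bio;
    }
    submit(physical, total);    /* final, possibly short, piece */
}

int main(void)
{
    /* a 1 MiB write against a device accepting at most 256 KiB per bio */
    breakup(0, 1024 * 1024, 256 * 1024);
    return 0;
}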
@@ -4253,12 +4836,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	struct btrfs_bio *bbio = NULL;
 
 	length = bio->bi_size;
-	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 
-	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
+	ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
 			      mirror_num);
-	if (ret) /* -ENOMEM */
+	if (ret)
 		return ret;
 
 	total_devs = bbio->num_stripes;
@@ -4276,52 +4858,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
 	while (dev_nr < total_devs) {
+		dev = bbio->stripes[dev_nr].dev;
+		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
+			bbio_error(bbio, first_bio, logical);
+			dev_nr++;
+			continue;
+		}
+
+		/*
+		 * Check and see if we're ok with this bio based on its size
+		 * and offset with the given device.
+		 */
+		if (!bio_size_ok(dev->bdev, first_bio,
+				 bbio->stripes[dev_nr].physical >> 9)) {
+			ret = breakup_stripe_bio(root, bbio, first_bio, dev,
+						 dev_nr, rw, async_submit);
+			BUG_ON(ret);
+			dev_nr++;
+			continue;
+		}
+
 		if (dev_nr < total_devs - 1) {
 			bio = bio_clone(first_bio, GFP_NOFS);
 			BUG_ON(!bio); /* -ENOMEM */
 		} else {
 			bio = first_bio;
 		}
-		bio->bi_private = bbio;
-		bio->bi_private = merge_stripe_index_into_bio_private(
-			bio->bi_private, (unsigned int)dev_nr);
-		bio->bi_end_io = btrfs_end_bio;
-		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
-		dev = bbio->stripes[dev_nr].dev;
-		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
-#ifdef DEBUG
-			struct rcu_string *name;
-
-			rcu_read_lock();
-			name = rcu_dereference(dev->name);
-			pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
-				 "(%s id %llu), size=%u\n", rw,
-				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
-				 name->str, dev->devid, bio->bi_size);
-			rcu_read_unlock();
-#endif
-			bio->bi_bdev = dev->bdev;
-			if (async_submit)
-				schedule_bio(root, dev, rw, bio);
-			else
-				btrfsic_submit_bio(rw, bio);
-		} else {
-			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
-			bio->bi_sector = logical >> 9;
-			bio_endio(bio, -EIO);
-		}
+
+		submit_stripe_bio(root, bbio, bio,
+				  bbio->stripes[dev_nr].physical, dev_nr, rw,
+				  async_submit);
 		dev_nr++;
 	}
 	return 0;
 }
 
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
 				       u8 *uuid, u8 *fsid)
 {
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *cur_devices;
 
-	cur_devices = root->fs_info->fs_devices;
+	cur_devices = fs_info->fs_devices;
 	while (cur_devices) {
 		if (!fsid ||
 		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -4402,6 +4980,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	em->bdev = (struct block_device *)map;
 	em->start = logical;
 	em->len = length;
+	em->orig_start = 0;
 	em->block_start = 0;
 	em->block_len = em->len;
 
@@ -4419,8 +4998,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		read_extent_buffer(leaf, uuid, (unsigned long)
 				   btrfs_stripe_dev_uuid_nr(chunk, i),
 				   BTRFS_UUID_SIZE);
-		map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
-							NULL);
+		map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
+							uuid, NULL);
 		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
 			kfree(map);
 			free_extent_map(em);
@@ -4461,6 +5040,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
 	device->io_align = btrfs_device_io_align(leaf, dev_item);
 	device->io_width = btrfs_device_io_width(leaf, dev_item);
 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
+	device->is_tgtdev_for_dev_replace = 0;
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -4538,7 +5119,7 @@ static int read_one_dev(struct btrfs_root *root,
 		return ret;
 	}
 
-	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+	device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
 	if (!device || !device->bdev) {
 		if (!btrfs_test_opt(root, DEGRADED))
 			return -EIO;
@@ -4571,7 +5152,7 @@ static int read_one_dev(struct btrfs_root *root,
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
 	device->in_fs_metadata = 1;
-	if (device->writeable) {
+	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
 		device->fs_devices->total_rw_bytes += device->total_bytes;
 		spin_lock(&root->fs_info->free_chunk_lock);
 		root->fs_info->free_chunk_space += device->total_bytes -
@@ -4930,7 +5511,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
 	int i;
 
 	mutex_lock(&fs_devices->device_list_mutex);
-	dev = btrfs_find_device(root, stats->devid, NULL, NULL);
+	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
 	mutex_unlock(&fs_devices->device_list_mutex);
 
 	if (!dev) {
@@ -4958,3 +5539,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
 	stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
 	return 0;
 }
+
+int btrfs_scratch_superblock(struct btrfs_device *device)
+{
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+
+	bh = btrfs_read_dev_super(device->bdev);
+	if (!bh)
+		return -EINVAL;
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+
+	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+	set_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+	brelse(bh);
+
+	return 0;
+}
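btrfs_scratch_superblock() invalidates a device by zeroing just the magic field of the superblock and writing the buffer back, so a later scan no longer recognizes the device as btrfs. A user-space sketch of the same operation against a raw device node; the 64 KiB superblock offset and the 8-byte magic at offset 0x40 inside it are assumptions taken from btrfs's on-disk layout, not from this hunk:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define SB_OFFSET 0x10000   /* assumed: primary btrfs superblock */
#define MAGIC_OFF 0x40      /* assumed: magic field inside the superblock */

static int scratch_superblock(const char *dev)
{
    char zeros[8];
    int fd = open(dev, O_WRONLY);

    if (fd < 0)
        return -1;
    memset(zeros, 0, sizeof(zeros));
    /* zero only the magic; the rest of the superblock stays intact */
    if (pwrite(fd, zeros, sizeof(zeros), SB_OFFSET + MAGIC_OFF) < 0 ||
        fsync(fd) < 0) {
        close(fd);
        return -1;
    }
    return close(fd);
}

int main(int argc, char **argv)
{
    if (argc == 2 && scratch_superblock(argv[1]) == 0)
        printf("scratched %s\n", argv[1]);
    return 0;
}

Leaving the rest of the superblock intact is deliberate: only the recognition test needs to fail.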
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53c06af92e8d..d3c3939ac751 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -50,6 +50,7 @@ struct btrfs_device {
 	int in_fs_metadata;
 	int missing;
 	int can_discard;
+	int is_tgtdev_for_dev_replace;
 
 	spinlock_t io_lock;
 
@@ -88,7 +89,7 @@ struct btrfs_device {
 	u8 uuid[BTRFS_UUID_SIZE];
 
 	/* per-device scrub information */
-	struct scrub_dev *scrub_device;
+	struct scrub_ctx *scrub_device;
 
 	struct btrfs_work work;
 	struct rcu_head rcu;
@@ -179,6 +180,15 @@ struct btrfs_device_info {
 	u64 total_avail;
 };
 
+struct btrfs_raid_attr {
+	int sub_stripes;	/* sub_stripes info for map */
+	int dev_stripes;	/* stripes per dev */
+	int devs_max;		/* max devs to use */
+	int devs_min;		/* min devs needed */
+	int devs_increment;	/* ndevs has to be a multiple of this */
+	int ncopies;		/* how many copies of the data there are */
+};
+
 struct map_lookup {
 	u64 type;
 	int io_align;
@@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 chunk_tree, u64 chunk_objectid,
 			   u64 chunk_offset, u64 start, u64 num_bytes);
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_bio **bbio_ret, int mirror_num);
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+			       struct btrfs_fs_devices *fs_devices, int step);
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+					 char *device_path,
+					 struct btrfs_device **device);
+int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+			      struct btrfs_device **device);
 int btrfs_add_device(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root,
 		     struct btrfs_device *device);
 int btrfs_rm_device(struct btrfs_root *root, char *device_path);
 void btrfs_cleanup_fs_uuids(void);
-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
 int btrfs_grow_device(struct btrfs_trans_handle *trans,
 		      struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
 				       u8 *uuid, u8 *fsid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+				  struct btrfs_device **device_out);
 int btrfs_balance(struct btrfs_balance_control *bctl,
 		  struct btrfs_ioctl_balance_args *bargs);
 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
@@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
 			struct btrfs_fs_info *fs_info);
+void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
+				 struct btrfs_device *srcdev);
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+				      struct btrfs_device *tgtdev);
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+					      struct btrfs_device *tgtdev);
+int btrfs_scratch_superblock(struct btrfs_device *device);
 
 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
 				      int index)
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3f4e2d69e83a..446a6848c554 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 		 */
 		if (!value)
 			goto out;
+	} else {
+		di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+					name, name_len, 0);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		}
+		if (!di && !value)
+			goto out;
+		btrfs_release_path(path);
 	}
 
 again:
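The new else-branch checks whether the xattr exists before entering the replace/remove path: deleting an xattr that was never set becomes a no-op instead of touching the tree. A small sketch of that early-out, with find_xattr() as a hypothetical stand-in for btrfs_lookup_xattr() over a flat array:

#include <stdio.h>
#include <string.h>

struct xattr { const char *name; const char *value; };

static struct xattr *find_xattr(struct xattr *tab, int n, const char *name)
{
    int i;

    for (i = 0; i < n; i++)
        if (strcmp(tab[i].name, name) == 0)
            return &tab[i];
    return NULL;
}

static int set_xattr(struct xattr *tab, int n, const char *name,
                     const char *value)
{
    struct xattr *di = find_xattr(tab, n, name);

    if (!di && !value)
        return 0;           /* removing a missing xattr: nothing to do */
    if (di && !value)
        printf("remove %s\n", di->name);
    else
        printf("set %s\n", name);
    return 0;
}

int main(void)
{
    struct xattr tab[] = { { "user.a", "1" } };

    set_xattr(tab, 1, "user.b", NULL);  /* prints nothing: no-op */
    set_xattr(tab, 1, "user.a", NULL);  /* prints "remove user.a" */
    return 0;
}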
@@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 
 	inode_inc_iversion(inode);
 	inode->i_ctime = CURRENT_TIME;
+	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
 out:
@@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 		if (verify_dir_item(root, leaf, di))
-			continue;
+			goto next;
 
 		name_len = btrfs_dir_name_len(leaf, di);
 		total_size += name_len + 1;
diff --git a/fs/buffer.c b/fs/buffer.c
index ec0aca8ba6bf..7a75c3e0fd58 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -46,8 +46,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
-inline void
-init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 {
 	bh->b_end_io = handler;
 	bh->b_private = private;
@@ -555,7 +554,7 @@ void emergency_thaw_all(void)
  */
 int sync_mapping_buffers(struct address_space *mapping)
 {
-	struct address_space *buffer_mapping = mapping->assoc_mapping;
+	struct address_space *buffer_mapping = mapping->private_data;
 
 	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 		return 0;
@@ -588,10 +587,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 		struct address_space *buffer_mapping = bh->b_page->mapping;
 
 		mark_buffer_dirty(bh);
-		if (!mapping->assoc_mapping) {
-			mapping->assoc_mapping = buffer_mapping;
+		if (!mapping->private_data) {
+			mapping->private_data = buffer_mapping;
 		} else {
-			BUG_ON(mapping->assoc_mapping != buffer_mapping);
+			BUG_ON(mapping->private_data != buffer_mapping);
 		}
 		if (!bh->b_assoc_map) {
 			spin_lock(&buffer_mapping->private_lock);
@@ -788,7 +787,7 @@ void invalidate_inode_buffers(struct inode *inode)
 	if (inode_has_buffers(inode)) {
 		struct address_space *mapping = &inode->i_data;
 		struct list_head *list = &mapping->private_list;
-		struct address_space *buffer_mapping = mapping->assoc_mapping;
+		struct address_space *buffer_mapping = mapping->private_data;
 
 		spin_lock(&buffer_mapping->private_lock);
 		while (!list_empty(list))
@@ -811,7 +810,7 @@ int remove_inode_buffers(struct inode *inode)
 	if (inode_has_buffers(inode)) {
 		struct address_space *mapping = &inode->i_data;
 		struct list_head *list = &mapping->private_list;
-		struct address_space *buffer_mapping = mapping->assoc_mapping;
+		struct address_space *buffer_mapping = mapping->private_data;
 
 		spin_lock(&buffer_mapping->private_lock);
 		while (!list_empty(list)) {
@@ -850,13 +849,10 @@ try_again:
 		if (!bh)
 			goto no_grow;
 
-		bh->b_bdev = NULL;
 		bh->b_this_page = head;
 		bh->b_blocknr = -1;
 		head = bh;
 
-		bh->b_state = 0;
-		atomic_set(&bh->b_count, 0);
 		bh->b_size = size;
 
 		/* Link the buffer to its page */
@@ -2939,6 +2935,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
 		void *kaddr = kmap_atomic(bh->b_page);
 		memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
 		kunmap_atomic(kaddr);
+		flush_dcache_page(bh->b_page);
 	}
 }
 
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 67bef6d01484..746ce532e130 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -41,12 +41,12 @@ static struct fscache_object *cachefiles_alloc_object(
 
 	_enter("{%s},%p,", cache->cache.identifier, cookie);
 
-	lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
+	lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp);
 	if (!lookup_data)
 		goto nomem_lookup_data;
 
 	/* create a new object record and a temporary leaf image */
-	object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
+	object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp);
 	if (!object)
 		goto nomem_object;
 
@@ -63,7 +63,7 @@ static struct fscache_object *cachefiles_alloc_object(
 	 * - stick the length on the front and leave space on the back for the
 	 *   encoder
 	 */
-	buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
+	buffer = kmalloc((2 + 512) + 3, cachefiles_gfp);
 	if (!buffer)
 		goto nomem_buffer;
 
@@ -219,7 +219,7 @@ static void cachefiles_update_object(struct fscache_object *_object)
 		return;
 	}
 
-	auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
+	auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
 	if (!auxdata) {
 		_leave(" [nomem]");
 		return;
@@ -441,6 +441,54 @@ truncate_failed:
 }
 
 /*
+ * Invalidate an object
+ */
+static void cachefiles_invalidate_object(struct fscache_operation *op)
+{
+	struct cachefiles_object *object;
+	struct cachefiles_cache *cache;
+	const struct cred *saved_cred;
+	struct path path;
+	uint64_t ni_size;
+	int ret;
+
+	object = container_of(op->object, struct cachefiles_object, fscache);
+	cache = container_of(object->fscache.cache,
+			     struct cachefiles_cache, cache);
+
+	op->object->cookie->def->get_attr(op->object->cookie->netfs_data,
+					  &ni_size);
+
+	_enter("{OBJ%x},[%llu]",
+	       op->object->debug_id, (unsigned long long)ni_size);
+
+	if (object->backer) {
+		ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+
+		fscache_set_store_limit(&object->fscache, ni_size);
+
+		path.dentry = object->backer;
+		path.mnt = cache->mnt;
+
+		cachefiles_begin_secure(cache, &saved_cred);
+		ret = vfs_truncate(&path, 0);
+		if (ret == 0)
+			ret = vfs_truncate(&path, ni_size);
+		cachefiles_end_secure(cache, saved_cred);
+
+		if (ret != 0) {
+			fscache_set_store_limit(&object->fscache, 0);
+			if (ret == -EIO)
+				cachefiles_io_error_obj(object,
+							"Invalidate failed");
+		}
+	}
+
+	fscache_op_complete(op, true);
+	_leave("");
+}
+
+/*
  * dissociate a cache from all the pages it was backing
  */
 static void cachefiles_dissociate_pages(struct fscache_cache *cache)
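The invalidation above discards the backing file's contents by truncating it to zero and then re-extending it to the netfs's attribute size, leaving a sparse file of the right logical length for future writes. A user-space sketch of that two-step truncate, with plain POSIX truncate() standing in for vfs_truncate():

#include <stdio.h>
#include <unistd.h>

static int invalidate_backing_file(const char *path, off_t object_size)
{
    if (truncate(path, 0) != 0)          /* drop all cached data */
        return -1;
    return truncate(path, object_size);  /* restore the logical size */
}

int main(void)
{
    /* "/tmp/cachefile" is a placeholder path for illustration */
    if (invalidate_backing_file("/tmp/cachefile", 4096) != 0)
        perror("invalidate");
    return 0;
}

Truncating down and back up, rather than unlinking, keeps the backing file and its security context in place while guaranteeing that no stale data survives.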
@@ -455,6 +503,7 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
 	.lookup_complete	= cachefiles_lookup_complete,
 	.grab_object		= cachefiles_grab_object,
 	.update_object		= cachefiles_update_object,
+	.invalidate_object	= cachefiles_invalidate_object,
 	.drop_object		= cachefiles_drop_object,
 	.put_object		= cachefiles_put_object,
 	.sync_cache		= cachefiles_sync_cache,
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index bd6bc1bde2d7..49382519907a 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -23,6 +23,8 @@ extern unsigned cachefiles_debug;
 #define CACHEFILES_DEBUG_KLEAVE	2
 #define CACHEFILES_DEBUG_KDEBUG	4
 
+#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
+
 /*
  * node records
  */
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index 81b8b2b3a674..33b58c60f2d1 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -78,7 +78,7 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
 
 	_debug("max: %d", max);
 
-	key = kmalloc(max, GFP_KERNEL);
+	key = kmalloc(max, cachefiles_gfp);
 	if (!key)
 		return NULL;
 
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index b0b5f7cdfffa..8c01c5fcdf75 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -40,8 +40,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
 	printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
 	       prefix, fscache_object_states[object->fscache.state],
 	       object->fscache.flags, work_busy(&object->fscache.work),
-	       object->fscache.events,
-	       object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
+	       object->fscache.events, object->fscache.event_mask);
 	printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
 	       prefix, object->fscache.n_ops, object->fscache.n_in_progress,
 	       object->fscache.n_exclusive);
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index c994691d9445..480992259707 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -77,25 +77,25 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
 	struct page *backpage = monitor->back_page, *backpage2;
 	int ret;
 
-	kenter("{ino=%lx},{%lx,%lx}",
+	_enter("{ino=%lx},{%lx,%lx}",
 	       object->backer->d_inode->i_ino,
 	       backpage->index, backpage->flags);
 
 	/* skip if the page was truncated away completely */
 	if (backpage->mapping != bmapping) {
-		kleave(" = -ENODATA [mapping]");
+		_leave(" = -ENODATA [mapping]");
 		return -ENODATA;
 	}
 
 	backpage2 = find_get_page(bmapping, backpage->index);
 	if (!backpage2) {
-		kleave(" = -ENODATA [gone]");
+		_leave(" = -ENODATA [gone]");
 		return -ENODATA;
 	}
 
 	if (backpage != backpage2) {
 		put_page(backpage2);
-		kleave(" = -ENODATA [different]");
+		_leave(" = -ENODATA [different]");
 		return -ENODATA;
 	}
 
@@ -114,7 +114,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
 	if (PageUptodate(backpage))
 		goto unlock_discard;
 
-	kdebug("reissue read");
+	_debug("reissue read");
 	ret = bmapping->a_ops->readpage(NULL, backpage);
 	if (ret < 0)
 		goto unlock_discard;
@@ -129,7 +129,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
 	}
 
 	/* it'll reappear on the todo list */
-	kleave(" = -EINPROGRESS");
+	_leave(" = -EINPROGRESS");
 	return -EINPROGRESS;
 
 unlock_discard:
@@ -137,7 +137,7 @@ unlock_discard:
 	spin_lock_irq(&object->work_lock);
 	list_del(&monitor->op_link);
 	spin_unlock_irq(&object->work_lock);
-	kleave(" = %d", ret);
+	_leave(" = %d", ret);
 	return ret;
 }
 
@@ -174,11 +174,13 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
174 _debug("- copy {%lu}", monitor->back_page->index); 174 _debug("- copy {%lu}", monitor->back_page->index);
175 175
176 recheck: 176 recheck:
177 if (PageUptodate(monitor->back_page)) { 177 if (test_bit(FSCACHE_COOKIE_INVALIDATING,
178 &object->fscache.cookie->flags)) {
179 error = -ESTALE;
180 } else if (PageUptodate(monitor->back_page)) {
178 copy_highpage(monitor->netfs_page, monitor->back_page); 181 copy_highpage(monitor->netfs_page, monitor->back_page);
179 182 fscache_mark_page_cached(monitor->op,
180 pagevec_add(&pagevec, monitor->netfs_page); 183 monitor->netfs_page);
181 fscache_mark_pages_cached(monitor->op, &pagevec);
182 error = 0; 184 error = 0;
183 } else if (!PageError(monitor->back_page)) { 185 } else if (!PageError(monitor->back_page)) {
184 /* the page has probably been truncated */ 186 /* the page has probably been truncated */
@@ -198,6 +200,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
 
 		fscache_end_io(op, monitor->netfs_page, error);
 		page_cache_release(monitor->netfs_page);
+		fscache_retrieval_complete(op, 1);
 		fscache_put_retrieval(op);
 		kfree(monitor);
 
@@ -239,7 +242,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
239 _debug("read back %p{%lu,%d}", 242 _debug("read back %p{%lu,%d}",
240 netpage, netpage->index, page_count(netpage)); 243 netpage, netpage->index, page_count(netpage));
241 244
242 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); 245 monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
243 if (!monitor) 246 if (!monitor)
244 goto nomem; 247 goto nomem;
245 248
@@ -258,13 +261,14 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 		goto backing_page_already_present;
 
 	if (!newpage) {
-		newpage = page_cache_alloc_cold(bmapping);
+		newpage = __page_cache_alloc(cachefiles_gfp |
+					     __GFP_COLD);
 		if (!newpage)
 			goto nomem_monitor;
 	}
 
 	ret = add_to_page_cache(newpage, bmapping,
-				netpage->index, GFP_KERNEL);
+				netpage->index, cachefiles_gfp);
 	if (ret == 0)
 		goto installed_new_backing_page;
 	if (ret != -EEXIST)
@@ -335,11 +339,11 @@ backing_page_already_present:
 backing_page_already_uptodate:
 	_debug("- uptodate");
 
-	pagevec_add(pagevec, netpage);
-	fscache_mark_pages_cached(op, pagevec);
+	fscache_mark_page_cached(op, netpage);
 
 	copy_highpage(netpage, backpage);
 	fscache_end_io(op, netpage, 0);
+	fscache_retrieval_complete(op, 1);
 
 success:
 	_debug("success");
@@ -357,10 +361,13 @@ out:
 
 read_error:
 	_debug("read error %d", ret);
-	if (ret == -ENOMEM)
+	if (ret == -ENOMEM) {
+		fscache_retrieval_complete(op, 1);
 		goto out;
+	}
 io_error:
 	cachefiles_io_error_obj(object, "Page read error on backing file");
+	fscache_retrieval_complete(op, 1);
 	ret = -ENOBUFS;
 	goto out;
 
@@ -370,6 +377,7 @@ nomem_monitor:
 	fscache_put_retrieval(monitor->op);
 	kfree(monitor);
 nomem:
+	fscache_retrieval_complete(op, 1);
 	_leave(" = -ENOMEM");
 	return -ENOMEM;
 }
@@ -408,7 +416,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	_enter("{%p},{%lx},,,", object, page->index);
 
 	if (!object->backer)
-		return -ENOBUFS;
+		goto enobufs;
 
 	inode = object->backer->d_inode;
 	ASSERT(S_ISREG(inode->i_mode));
@@ -417,7 +425,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 
 	/* calculate the shift required to use bmap */
 	if (inode->i_sb->s_blocksize > PAGE_SIZE)
-		return -ENOBUFS;
+		goto enobufs;
 
 	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
 
@@ -448,15 +456,20 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 					  &pagevec);
 	} else if (cachefiles_has_space(cache, 0, 1) == 0) {
 		/* there's space in the cache we can use */
-		pagevec_add(&pagevec, page);
-		fscache_mark_pages_cached(op, &pagevec);
+		fscache_mark_page_cached(op, page);
+		fscache_retrieval_complete(op, 1);
 		ret = -ENODATA;
 	} else {
-		ret = -ENOBUFS;
+		goto enobufs;
 	}
 
 	_leave(" = %d", ret);
 	return ret;
+
+enobufs:
+	fscache_retrieval_complete(op, 1);
+	_leave(" = -ENOBUFS");
+	return -ENOBUFS;
 }
 
 /*
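A pattern repeated throughout these cachefiles changes: every exit path from a retrieval routine, including each early error return, must now account for the pages it will no longer process, so the returns are funneled through a label that records completion first. A user-space sketch of that accounting discipline, with complete() as a stand-in for fscache_retrieval_complete():

#include <stdio.h>

static void complete(int *outstanding, int n)
{
    *outstanding -= n;
}

static int read_one_page(int have_backing_file, int *outstanding)
{
    if (!have_backing_file)
        goto enobufs;
    /* ... normal retrieval work would go here ... */
    complete(outstanding, 1);
    return 0;

enobufs:
    complete(outstanding, 1);   /* still consume the page's slot */
    return -1;
}

int main(void)
{
    int outstanding = 1;

    read_one_page(0, &outstanding);
    printf("outstanding=%d\n", outstanding); /* prints 0 */
    return 0;
}

If any path skipped the completion call, the operation's outstanding-page count would never reach zero and the retrieval would hang, which is why even the -EEXIST and -ENOMEM branches above gained a call.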
@@ -465,8 +478,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
  */
 static int cachefiles_read_backing_file(struct cachefiles_object *object,
 					struct fscache_retrieval *op,
-					struct list_head *list,
-					struct pagevec *mark_pvec)
+					struct list_head *list)
 {
 	struct cachefiles_one_read *monitor = NULL;
 	struct address_space *bmapping = object->backer->d_inode->i_mapping;
@@ -485,7 +497,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 		       netpage, netpage->index, page_count(netpage));
 
 		if (!monitor) {
-			monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+			monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
 			if (!monitor)
 				goto nomem;
 
@@ -500,13 +512,14 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 			goto backing_page_already_present;
 
 		if (!newpage) {
-			newpage = page_cache_alloc_cold(bmapping);
+			newpage = __page_cache_alloc(cachefiles_gfp |
+						     __GFP_COLD);
 			if (!newpage)
 				goto nomem;
 		}
 
 		ret = add_to_page_cache(newpage, bmapping,
-					netpage->index, GFP_KERNEL);
+					netpage->index, cachefiles_gfp);
 		if (ret == 0)
 			goto installed_new_backing_page;
 		if (ret != -EEXIST)
@@ -536,10 +549,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
536 _debug("- monitor add"); 549 _debug("- monitor add");
537 550
538 ret = add_to_page_cache(netpage, op->mapping, netpage->index, 551 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
539 GFP_KERNEL); 552 cachefiles_gfp);
540 if (ret < 0) { 553 if (ret < 0) {
541 if (ret == -EEXIST) { 554 if (ret == -EEXIST) {
542 page_cache_release(netpage); 555 page_cache_release(netpage);
556 fscache_retrieval_complete(op, 1);
543 continue; 557 continue;
544 } 558 }
545 goto nomem; 559 goto nomem;
@@ -612,10 +626,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
612 _debug("- uptodate"); 626 _debug("- uptodate");
613 627
614 ret = add_to_page_cache(netpage, op->mapping, netpage->index, 628 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
615 GFP_KERNEL); 629 cachefiles_gfp);
616 if (ret < 0) { 630 if (ret < 0) {
617 if (ret == -EEXIST) { 631 if (ret == -EEXIST) {
618 page_cache_release(netpage); 632 page_cache_release(netpage);
633 fscache_retrieval_complete(op, 1);
619 continue; 634 continue;
620 } 635 }
621 goto nomem; 636 goto nomem;
@@ -626,16 +641,17 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
626 page_cache_release(backpage); 641 page_cache_release(backpage);
627 backpage = NULL; 642 backpage = NULL;
628 643
629 if (!pagevec_add(mark_pvec, netpage)) 644 fscache_mark_page_cached(op, netpage);
630 fscache_mark_pages_cached(op, mark_pvec);
631 645
632 page_cache_get(netpage); 646 page_cache_get(netpage);
633 if (!pagevec_add(&lru_pvec, netpage)) 647 if (!pagevec_add(&lru_pvec, netpage))
634 __pagevec_lru_add_file(&lru_pvec); 648 __pagevec_lru_add_file(&lru_pvec);
635 649
650 /* the netpage is unlocked and marked up to date here */
636 fscache_end_io(op, netpage, 0); 651 fscache_end_io(op, netpage, 0);
637 page_cache_release(netpage); 652 page_cache_release(netpage);
638 netpage = NULL; 653 netpage = NULL;
654 fscache_retrieval_complete(op, 1);
639 continue; 655 continue;
640 } 656 }
641 657
@@ -661,6 +677,7 @@ out:
661 list_for_each_entry_safe(netpage, _n, list, lru) { 677 list_for_each_entry_safe(netpage, _n, list, lru) {
662 list_del(&netpage->lru); 678 list_del(&netpage->lru);
663 page_cache_release(netpage); 679 page_cache_release(netpage);
680 fscache_retrieval_complete(op, 1);
664 } 681 }
665 682
666 _leave(" = %d", ret); 683 _leave(" = %d", ret);
@@ -669,15 +686,17 @@ out:
669nomem: 686nomem:
670 _debug("nomem"); 687 _debug("nomem");
671 ret = -ENOMEM; 688 ret = -ENOMEM;
672 goto out; 689 goto record_page_complete;
673 690
674read_error: 691read_error:
675 _debug("read error %d", ret); 692 _debug("read error %d", ret);
676 if (ret == -ENOMEM) 693 if (ret == -ENOMEM)
677 goto out; 694 goto record_page_complete;
678io_error: 695io_error:
679 cachefiles_io_error_obj(object, "Page read error on backing file"); 696 cachefiles_io_error_obj(object, "Page read error on backing file");
680 ret = -ENOBUFS; 697 ret = -ENOBUFS;
698record_page_complete:
699 fscache_retrieval_complete(op, 1);
681 goto out; 700 goto out;
682} 701}
683 702
@@ -709,7 +728,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
709 *nr_pages); 728 *nr_pages);
710 729
711 if (!object->backer) 730 if (!object->backer)
712 return -ENOBUFS; 731 goto all_enobufs;
713 732
714 space = 1; 733 space = 1;
715 if (cachefiles_has_space(cache, 0, *nr_pages) < 0) 734 if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
@@ -722,7 +741,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
722 741
723 /* calculate the shift required to use bmap */ 742 /* calculate the shift required to use bmap */
724 if (inode->i_sb->s_blocksize > PAGE_SIZE) 743 if (inode->i_sb->s_blocksize > PAGE_SIZE)
725 return -ENOBUFS; 744 goto all_enobufs;
726 745
727 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; 746 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
728 747
@@ -762,7 +781,10 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
762 nrbackpages++; 781 nrbackpages++;
763 } else if (space && pagevec_add(&pagevec, page) == 0) { 782 } else if (space && pagevec_add(&pagevec, page) == 0) {
764 fscache_mark_pages_cached(op, &pagevec); 783 fscache_mark_pages_cached(op, &pagevec);
784 fscache_retrieval_complete(op, 1);
765 ret = -ENODATA; 785 ret = -ENODATA;
786 } else {
787 fscache_retrieval_complete(op, 1);
766 } 788 }
767 } 789 }
768 790
@@ -775,18 +797,18 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
775 /* submit the apparently valid pages to the backing fs to be read from 797 /* submit the apparently valid pages to the backing fs to be read from
776 * disk */ 798 * disk */
777 if (nrbackpages > 0) { 799 if (nrbackpages > 0) {
778 ret2 = cachefiles_read_backing_file(object, op, &backpages, 800 ret2 = cachefiles_read_backing_file(object, op, &backpages);
779 &pagevec);
780 if (ret2 == -ENOMEM || ret2 == -EINTR) 801 if (ret2 == -ENOMEM || ret2 == -EINTR)
781 ret = ret2; 802 ret = ret2;
782 } 803 }
783 804
784 if (pagevec_count(&pagevec) > 0)
785 fscache_mark_pages_cached(op, &pagevec);
786
787 _leave(" = %d [nr=%u%s]", 805 _leave(" = %d [nr=%u%s]",
788 ret, *nr_pages, list_empty(pages) ? " empty" : ""); 806 ret, *nr_pages, list_empty(pages) ? " empty" : "");
789 return ret; 807 return ret;
808
809all_enobufs:
810 fscache_retrieval_complete(op, *nr_pages);
811 return -ENOBUFS;
790} 812}
791 813
792/* 814/*
@@ -806,7 +828,6 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,
806{ 828{
807 struct cachefiles_object *object; 829 struct cachefiles_object *object;
808 struct cachefiles_cache *cache; 830 struct cachefiles_cache *cache;
809 struct pagevec pagevec;
810 int ret; 831 int ret;
811 832
812 object = container_of(op->op.object, 833 object = container_of(op->op.object,
@@ -817,14 +838,12 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,
817 _enter("%p,{%lx},", object, page->index); 838 _enter("%p,{%lx},", object, page->index);
818 839
819 ret = cachefiles_has_space(cache, 0, 1); 840 ret = cachefiles_has_space(cache, 0, 1);
820 if (ret == 0) { 841 if (ret == 0)
821 pagevec_init(&pagevec, 0); 842 fscache_mark_page_cached(op, page);
822 pagevec_add(&pagevec, page); 843 else
823 fscache_mark_pages_cached(op, &pagevec);
824 } else {
825 ret = -ENOBUFS; 844 ret = -ENOBUFS;
826 }
827 845
846 fscache_retrieval_complete(op, 1);
828 _leave(" = %d", ret); 847 _leave(" = %d", ret);
829 return ret; 848 return ret;
830} 849}
@@ -874,6 +893,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op,
874 ret = -ENOBUFS; 893 ret = -ENOBUFS;
875 } 894 }
876 895
896 fscache_retrieval_complete(op, *nr_pages);
877 _leave(" = %d", ret); 897 _leave(" = %d", ret);
878 return ret; 898 return ret;
879} 899}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index e18b183b47e1..73b46288b54b 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -174,7 +174,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
174 ASSERT(dentry); 174 ASSERT(dentry);
175 ASSERT(dentry->d_inode); 175 ASSERT(dentry->d_inode);
176 176
177 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); 177 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp);
178 if (!auxbuf) { 178 if (!auxbuf) {
179 _leave(" = -ENOMEM"); 179 _leave(" = -ENOMEM");
180 return -ENOMEM; 180 return -ENOMEM;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6690269f5dde..064d1a68d2c1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -267,6 +267,14 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
267 kfree(req->r_pages); 267 kfree(req->r_pages);
268} 268}
269 269
270static void ceph_unlock_page_vector(struct page **pages, int num_pages)
271{
272 int i;
273
274 for (i = 0; i < num_pages; i++)
275 unlock_page(pages[i]);
276}
277
270/* 278/*
271 * start an async read(ahead) operation. return nr_pages we submitted 279 * start an async read(ahead) operation. return nr_pages we submitted
272 * a read for on success, or negative error code. 280 * a read for on success, or negative error code.
@@ -347,6 +355,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
347 return nr_pages; 355 return nr_pages;
348 356
349out_pages: 357out_pages:
358 ceph_unlock_page_vector(pages, nr_pages);
350 ceph_release_page_vector(pages, nr_pages); 359 ceph_release_page_vector(pages, nr_pages);
351out: 360out:
352 ceph_osdc_put_request(req); 361 ceph_osdc_put_request(req);
@@ -1078,23 +1087,51 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
1078 struct page **pagep, void **fsdata) 1087 struct page **pagep, void **fsdata)
1079{ 1088{
1080 struct inode *inode = file->f_dentry->d_inode; 1089 struct inode *inode = file->f_dentry->d_inode;
1090 struct ceph_inode_info *ci = ceph_inode(inode);
1091 struct ceph_file_info *fi = file->private_data;
1081 struct page *page; 1092 struct page *page;
1082 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1093 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1083 int r; 1094 int r, want, got = 0;
1095
1096 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1097 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
1098 else
1099 want = CEPH_CAP_FILE_BUFFER;
1100
1101 dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
1102 inode, ceph_vinop(inode), pos, len, inode->i_size);
1103 r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
1104 if (r < 0)
1105 return r;
1106 dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n",
1107 inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
1108 if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
1109 ceph_put_cap_refs(ci, got);
1110 return -EAGAIN;
1111 }
1084 1112
1085 do { 1113 do {
1086 /* get a page */ 1114 /* get a page */
1087 page = grab_cache_page_write_begin(mapping, index, 0); 1115 page = grab_cache_page_write_begin(mapping, index, 0);
1088 if (!page) 1116 if (!page) {
1089 return -ENOMEM; 1117 r = -ENOMEM;
1090 *pagep = page; 1118 break;
1119 }
1091 1120
1092 dout("write_begin file %p inode %p page %p %d~%d\n", file, 1121 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1093 inode, page, (int)pos, (int)len); 1122 inode, page, (int)pos, (int)len);
1094 1123
1095 r = ceph_update_writeable_page(file, pos, len, page); 1124 r = ceph_update_writeable_page(file, pos, len, page);
1125 if (r)
1126 page_cache_release(page);
1096 } while (r == -EAGAIN); 1127 } while (r == -EAGAIN);
1097 1128
1129 if (r) {
1130 ceph_put_cap_refs(ci, got);
1131 } else {
1132 *pagep = page;
1133 *(int *)fsdata = got;
1134 }
1098 return r; 1135 return r;
1099} 1136}
1100 1137
@@ -1108,10 +1145,12 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1108 struct page *page, void *fsdata) 1145 struct page *page, void *fsdata)
1109{ 1146{
1110 struct inode *inode = file->f_dentry->d_inode; 1147 struct inode *inode = file->f_dentry->d_inode;
1148 struct ceph_inode_info *ci = ceph_inode(inode);
1111 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1149 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1112 struct ceph_mds_client *mdsc = fsc->mdsc; 1150 struct ceph_mds_client *mdsc = fsc->mdsc;
1113 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1151 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1114 int check_cap = 0; 1152 int check_cap = 0;
1153 int got = (unsigned long)fsdata;
1115 1154
1116 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, 1155 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1117 inode, page, (int)pos, (int)copied, (int)len); 1156 inode, page, (int)pos, (int)copied, (int)len);
@@ -1134,6 +1173,19 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1134 up_read(&mdsc->snap_rwsem); 1173 up_read(&mdsc->snap_rwsem);
1135 page_cache_release(page); 1174 page_cache_release(page);
1136 1175
1176 if (copied > 0) {
1177 int dirty;
1178 spin_lock(&ci->i_ceph_lock);
1179 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
1180 spin_unlock(&ci->i_ceph_lock);
1181 if (dirty)
1182 __mark_inode_dirty(inode, dirty);
1183 }
1184
1185 dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n",
1186 inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
1187 ceph_put_cap_refs(ci, got);
1188
1137 if (check_cap) 1189 if (check_cap)
1138 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); 1190 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1139 1191
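
The write_begin/write_end changes above pair cap acquisition with release across the two address_space hooks, smuggling the granted mask through the opaque fsdata cookie. A toy userspace model of that hand-off (names invented; the real calls are ceph_get_caps()/ceph_put_cap_refs()):

/* The reference taken in write_begin travels to write_end inside the
 * opaque fsdata cookie, so acquire and release stay balanced. */
#include <stdio.h>

static int caps_held;

static int model_write_begin(void **fsdata)
{
	int got = 0x100;		/* stand-in for CEPH_CAP_FILE_BUFFER */

	caps_held++;			/* ceph_get_caps(..., &got, ...) */
	*fsdata = (void *)(long)got;	/* kernel: *(int *)fsdata = got */
	return 0;
}

static void model_write_end(void *fsdata)
{
	int got = (int)(long)fsdata;	/* kernel: (unsigned long)fsdata */

	printf("dropping cap refs on %#x\n", (unsigned)got);
	caps_held--;			/* ceph_put_cap_refs(ci, got) */
}

int main(void)
{
	void *cookie = NULL;

	if (model_write_begin(&cookie) == 0)
		model_write_end(cookie);
	return caps_held;		/* 0 == balanced */
}
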
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3251e9cc6401..a1d9bb30c1bf 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -236,8 +236,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
236 if (!ctx) { 236 if (!ctx) {
237 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 237 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
238 if (cap) { 238 if (cap) {
239 spin_lock(&mdsc->caps_list_lock);
239 mdsc->caps_use_count++; 240 mdsc->caps_use_count++;
240 mdsc->caps_total_count++; 241 mdsc->caps_total_count++;
242 spin_unlock(&mdsc->caps_list_lock);
241 } 243 }
242 return cap; 244 return cap;
243 } 245 }
@@ -1349,11 +1351,15 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1349 if (!ci->i_head_snapc) 1351 if (!ci->i_head_snapc)
1350 ci->i_head_snapc = ceph_get_snap_context( 1352 ci->i_head_snapc = ceph_get_snap_context(
1351 ci->i_snap_realm->cached_context); 1353 ci->i_snap_realm->cached_context);
1352 dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, 1354 dout(" inode %p now dirty snapc %p auth cap %p\n",
1353 ci->i_head_snapc); 1355 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
1354 BUG_ON(!list_empty(&ci->i_dirty_item)); 1356 BUG_ON(!list_empty(&ci->i_dirty_item));
1355 spin_lock(&mdsc->cap_dirty_lock); 1357 spin_lock(&mdsc->cap_dirty_lock);
1356 list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1358 if (ci->i_auth_cap)
1359 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1360 else
1361 list_add(&ci->i_dirty_item,
1362 &mdsc->cap_dirty_migrating);
1357 spin_unlock(&mdsc->cap_dirty_lock); 1363 spin_unlock(&mdsc->cap_dirty_lock);
1358 if (ci->i_flushing_caps == 0) { 1364 if (ci->i_flushing_caps == 0) {
1359 ihold(inode); 1365 ihold(inode);
@@ -2388,7 +2394,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2388 &atime); 2394 &atime);
2389 2395
2390 /* max size increase? */ 2396 /* max size increase? */
2391 if (max_size != ci->i_max_size) { 2397 if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
2392 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); 2398 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2393 ci->i_max_size = max_size; 2399 ci->i_max_size = max_size;
2394 if (max_size >= ci->i_wanted_max_size) { 2400 if (max_size >= ci->i_wanted_max_size) {
@@ -2745,6 +2751,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2745 2751
2746 /* make sure we re-request max_size, if necessary */ 2752 /* make sure we re-request max_size, if necessary */
2747 spin_lock(&ci->i_ceph_lock); 2753 spin_lock(&ci->i_ceph_lock);
2754 ci->i_wanted_max_size = 0; /* reset */
2748 ci->i_requested_max_size = 0; 2755 ci->i_requested_max_size = 0;
2749 spin_unlock(&ci->i_ceph_lock); 2756 spin_unlock(&ci->i_ceph_lock);
2750} 2757}
@@ -2840,8 +2847,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2840 case CEPH_CAP_OP_IMPORT: 2847 case CEPH_CAP_OP_IMPORT:
2841 handle_cap_import(mdsc, inode, h, session, 2848 handle_cap_import(mdsc, inode, h, session,
2842 snaptrace, snaptrace_len); 2849 snaptrace, snaptrace_len);
2843 ceph_check_caps(ceph_inode(inode), 0, session);
2844 goto done_unlocked;
2845 } 2850 }
2846 2851
2847 /* the rest require a cap */ 2852 /* the rest require a cap */
@@ -2858,6 +2863,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2858 switch (op) { 2863 switch (op) {
2859 case CEPH_CAP_OP_REVOKE: 2864 case CEPH_CAP_OP_REVOKE:
2860 case CEPH_CAP_OP_GRANT: 2865 case CEPH_CAP_OP_GRANT:
2866 case CEPH_CAP_OP_IMPORT:
2861 handle_cap_grant(inode, h, session, cap, msg->middle); 2867 handle_cap_grant(inode, h, session, cap, msg->middle);
2862 goto done_unlocked; 2868 goto done_unlocked;
2863 2869
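
Several races are closed in caps.c above: the cap counters are now updated under caps_list_lock, dirty inodes without an authoritative cap are parked on cap_dirty_migrating, max_size grants are honoured only from the auth cap, and imports funnel through handle_cap_grant. The first fix is the easiest to model in isolation (pthreads stand in for the kernel spinlock; a sketch, not kernel code):

/* Unsynchronised ++ on shared counters loses updates under concurrency;
 * the patch brackets them with mdsc->caps_list_lock. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t caps_list_lock = PTHREAD_MUTEX_INITIALIZER;
static int caps_use_count, caps_total_count;

static void *account_new_cap(void *unused)
{
	(void)unused;
	pthread_mutex_lock(&caps_list_lock);
	caps_use_count++;
	caps_total_count++;
	pthread_mutex_unlock(&caps_list_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, account_new_cap, NULL);
	pthread_create(&b, NULL, account_new_cap, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("use=%d total=%d\n", caps_use_count, caps_total_count);
	return 0;
}

Built with cc -pthread, this always prints use=2 total=2; without the lock the counters can be torn.
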
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e5b77319c97b..8c1aabe93b67 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -454,7 +454,7 @@ static void reset_readdir(struct ceph_file_info *fi)
454 fi->flags &= ~CEPH_F_ATEND; 454 fi->flags &= ~CEPH_F_ATEND;
455} 455}
456 456
457static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) 457static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
458{ 458{
459 struct ceph_file_info *fi = file->private_data; 459 struct ceph_file_info *fi = file->private_data;
460 struct inode *inode = file->f_mapping->host; 460 struct inode *inode = file->f_mapping->host;
@@ -463,7 +463,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
463 463
464 mutex_lock(&inode->i_mutex); 464 mutex_lock(&inode->i_mutex);
465 retval = -EINVAL; 465 retval = -EINVAL;
466 switch (origin) { 466 switch (whence) {
467 case SEEK_END: 467 case SEEK_END:
468 offset += inode->i_size + 2; /* FIXME */ 468 offset += inode->i_size + 2; /* FIXME */
469 break; 469 break;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9349bb37a2fe..ca3ab3f9ca70 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -56,13 +56,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
56 struct ceph_nfs_confh *cfh = (void *)rawfh; 56 struct ceph_nfs_confh *cfh = (void *)rawfh;
57 int connected_handle_length = sizeof(*cfh)/4; 57 int connected_handle_length = sizeof(*cfh)/4;
58 int handle_length = sizeof(*fh)/4; 58 int handle_length = sizeof(*fh)/4;
59 struct dentry *dentry = d_find_alias(inode); 59 struct dentry *dentry;
60 struct dentry *parent; 60 struct dentry *parent;
61 61
62 /* don't re-export snaps */ 62 /* don't re-export snaps */
63 if (ceph_snap(inode) != CEPH_NOSNAP) 63 if (ceph_snap(inode) != CEPH_NOSNAP)
64 return -EINVAL; 64 return -EINVAL;
65 65
66 dentry = d_find_alias(inode);
67
66 /* if we found an alias, generate a connectable fh */ 68 /* if we found an alias, generate a connectable fh */
67 if (*max_len >= connected_handle_length && dentry) { 69 if (*max_len >= connected_handle_length && dentry) {
68 dout("encode_fh %p connectable\n", dentry); 70 dout("encode_fh %p connectable\n", dentry);
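
The export.c hunk is a reference-leak fix: d_find_alias() returns a dentry with an elevated refcount, so taking it before the snapshot check leaked one on the -EINVAL return. A generic model of the acquire-after-early-checks ordering (invented helpers):

#include <stdio.h>
#include <stdlib.h>

static int refs;

static void *take_ref(void)    { refs++; return malloc(1); }
static void  drop_ref(void *p) { refs--; free(p); }

/* Before the fix the reference was taken before the early check, so the
 * error return leaked it. The fixed ordering acquires late instead. */
static int encode_fh(int is_snap)
{
	void *dentry;

	if (is_snap)
		return -22;		/* -EINVAL: nothing held yet */

	dentry = take_ref();		/* like d_find_alias(inode) */
	/* ... build the file handle ... */
	drop_ref(dentry);
	return 0;
}

int main(void)
{
	encode_fh(1);
	encode_fh(0);
	printf("leaked refs: %d\n", refs);	/* 0 with the new ordering */
	return 0;
}
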
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5840d2aaed15..e51558fca3a3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -712,63 +712,53 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
712 struct ceph_osd_client *osdc = 712 struct ceph_osd_client *osdc =
713 &ceph_sb_to_client(inode->i_sb)->client->osdc; 713 &ceph_sb_to_client(inode->i_sb)->client->osdc;
714 loff_t endoff = pos + iov->iov_len; 714 loff_t endoff = pos + iov->iov_len;
715 int want, got = 0; 715 int got = 0;
716 int ret, err; 716 int ret, err, written;
717 717
718 if (ceph_snap(inode) != CEPH_NOSNAP) 718 if (ceph_snap(inode) != CEPH_NOSNAP)
719 return -EROFS; 719 return -EROFS;
720 720
721retry_snap: 721retry_snap:
722 written = 0;
722 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) 723 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
723 return -ENOSPC; 724 return -ENOSPC;
724 __ceph_do_pending_vmtruncate(inode); 725 __ceph_do_pending_vmtruncate(inode);
725 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
726 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
727 inode->i_size);
728 if (fi->fmode & CEPH_FILE_MODE_LAZY)
729 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
730 else
731 want = CEPH_CAP_FILE_BUFFER;
732 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
733 if (ret < 0)
734 goto out_put;
735
736 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
737 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
738 ceph_cap_string(got));
739
740 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
741 (iocb->ki_filp->f_flags & O_DIRECT) ||
742 (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
743 (fi->flags & CEPH_F_SYNC)) {
744 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
745 &iocb->ki_pos);
746 } else {
747 /*
748 * buffered write; drop Fw early to avoid slow
749 * revocation if we get stuck on balance_dirty_pages
750 */
751 int dirty;
752
753 spin_lock(&ci->i_ceph_lock);
754 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
755 spin_unlock(&ci->i_ceph_lock);
756 ceph_put_cap_refs(ci, got);
757 726
727 /*
728 * try to do a buffered write. if we don't have sufficient
729 * caps, we'll get -EAGAIN from generic_file_aio_write, or a
730 * short write if we only get caps for some pages.
731 */
732 if (!(iocb->ki_filp->f_flags & O_DIRECT) &&
733 !(inode->i_sb->s_flags & MS_SYNCHRONOUS) &&
734 !(fi->flags & CEPH_F_SYNC)) {
758 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 735 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
736 if (ret >= 0)
737 written = ret;
738
759 if ((ret >= 0 || ret == -EIOCBQUEUED) && 739 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
760 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 740 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
761 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 741 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
762 err = vfs_fsync_range(file, pos, pos + ret - 1, 1); 742 err = vfs_fsync_range(file, pos, pos + written - 1, 1);
763 if (err < 0) 743 if (err < 0)
764 ret = err; 744 ret = err;
765 } 745 }
746 if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff)
747 goto out;
748 }
766 749
767 if (dirty) 750 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
768 __mark_inode_dirty(inode, dirty); 751 inode, ceph_vinop(inode), pos + written,
752 (unsigned)iov->iov_len - written, inode->i_size);
753 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff);
754 if (ret < 0)
769 goto out; 755 goto out;
770 }
771 756
757 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
758 inode, ceph_vinop(inode), pos + written,
759 (unsigned)iov->iov_len - written, ceph_cap_string(got));
760 ret = ceph_sync_write(file, iov->iov_base + written,
761 iov->iov_len - written, &iocb->ki_pos);
772 if (ret >= 0) { 762 if (ret >= 0) {
773 int dirty; 763 int dirty;
774 spin_lock(&ci->i_ceph_lock); 764 spin_lock(&ci->i_ceph_lock);
@@ -777,13 +767,10 @@ retry_snap:
777 if (dirty) 767 if (dirty)
778 __mark_inode_dirty(inode, dirty); 768 __mark_inode_dirty(inode, dirty);
779 } 769 }
780
781out_put:
782 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", 770 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
783 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 771 inode, ceph_vinop(inode), pos + written,
784 ceph_cap_string(got)); 772 (unsigned)iov->iov_len - written, ceph_cap_string(got));
785 ceph_put_cap_refs(ci, got); 773 ceph_put_cap_refs(ci, got);
786
787out: 774out:
788 if (ret == -EOLDSNAPC) { 775 if (ret == -EOLDSNAPC) {
789 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", 776 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
@@ -797,7 +784,7 @@ out:
797/* 784/*
798 * llseek. be sure to verify file size on SEEK_END. 785 * llseek. be sure to verify file size on SEEK_END.
799 */ 786 */
800static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) 787static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
801{ 788{
802 struct inode *inode = file->f_mapping->host; 789 struct inode *inode = file->f_mapping->host;
803 int ret; 790 int ret;
@@ -805,7 +792,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
805 mutex_lock(&inode->i_mutex); 792 mutex_lock(&inode->i_mutex);
806 __ceph_do_pending_vmtruncate(inode); 793 __ceph_do_pending_vmtruncate(inode);
807 794
808 if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) { 795 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
809 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 796 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
810 if (ret < 0) { 797 if (ret < 0) {
811 offset = ret; 798 offset = ret;
@@ -813,7 +800,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
813 } 800 }
814 } 801 }
815 802
816 switch (origin) { 803 switch (whence) {
817 case SEEK_END: 804 case SEEK_END:
818 offset += inode->i_size; 805 offset += inode->i_size;
819 break; 806 break;
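
The aio_write rewrite above inverts the old flow: try the buffered path first and fall back to a synchronous write of the remainder, tracking `written` across both phases. A self-contained sketch of that shape with stubbed-out back ends (the stubs are invented; only the control flow mirrors the patch):

#include <stdio.h>
#include <string.h>

#define EAGAIN 11

/* pretend the page cache only accepts the first half of the buffer */
static long buffered_write(const char *buf, size_t len)
{
	(void)buf;
	return (long)(len / 2);
}

static long sync_write(const char *buf, size_t len)
{
	(void)buf;
	return (long)len;		/* the slow path writes it all */
}

static long do_write(const char *buf, size_t len)
{
	long ret, written = 0;

	ret = buffered_write(buf, len);	/* generic_file_aio_write() */
	if (ret >= 0)
		written = ret;
	if (ret == -EAGAIN || (size_t)written < len)
		ret = sync_write(buf + written, len - written);
	if (ret < 0)
		return ret;
	return written + ret;		/* total bytes, both phases */
}

int main(void)
{
	const char msg[] = "0123456789";

	printf("wrote %ld of %zu bytes\n", do_write(msg, strlen(msg)),
	       strlen(msg));
	return 0;
}
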
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ba95eea201bf..2971eaa65cdc 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1466,7 +1466,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
1466{ 1466{
1467 struct ceph_inode_info *ci = ceph_inode(inode); 1467 struct ceph_inode_info *ci = ceph_inode(inode);
1468 u64 to; 1468 u64 to;
1469 int wrbuffer_refs, wake = 0; 1469 int wrbuffer_refs, finish = 0;
1470 1470
1471retry: 1471retry:
1472 spin_lock(&ci->i_ceph_lock); 1472 spin_lock(&ci->i_ceph_lock);
@@ -1498,15 +1498,18 @@ retry:
1498 truncate_inode_pages(inode->i_mapping, to); 1498 truncate_inode_pages(inode->i_mapping, to);
1499 1499
1500 spin_lock(&ci->i_ceph_lock); 1500 spin_lock(&ci->i_ceph_lock);
1501 ci->i_truncate_pending--; 1501 if (to == ci->i_truncate_size) {
1502 if (ci->i_truncate_pending == 0) 1502 ci->i_truncate_pending = 0;
1503 wake = 1; 1503 finish = 1;
1504 }
1504 spin_unlock(&ci->i_ceph_lock); 1505 spin_unlock(&ci->i_ceph_lock);
1506 if (!finish)
1507 goto retry;
1505 1508
1506 if (wrbuffer_refs == 0) 1509 if (wrbuffer_refs == 0)
1507 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 1510 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1508 if (wake) 1511
1509 wake_up_all(&ci->i_cap_wq); 1512 wake_up_all(&ci->i_cap_wq);
1510} 1513}
1511 1514
1512 1515
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1bcf712655d9..9165eb8309eb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1590,7 +1590,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1590 } else if (rpath || rino) { 1590 } else if (rpath || rino) {
1591 *ino = rino; 1591 *ino = rino;
1592 *ppath = rpath; 1592 *ppath = rpath;
1593 *pathlen = strlen(rpath); 1593 *pathlen = rpath ? strlen(rpath) : 0;
1594 dout(" path %.*s\n", *pathlen, rpath); 1594 dout(" path %.*s\n", *pathlen, rpath);
1595 } 1595 }
1596 1596
@@ -1876,9 +1876,14 @@ finish:
1876static void __wake_requests(struct ceph_mds_client *mdsc, 1876static void __wake_requests(struct ceph_mds_client *mdsc,
1877 struct list_head *head) 1877 struct list_head *head)
1878{ 1878{
1879 struct ceph_mds_request *req, *nreq; 1879 struct ceph_mds_request *req;
1880 LIST_HEAD(tmp_list);
1881
1882 list_splice_init(head, &tmp_list);
1880 1883
1881 list_for_each_entry_safe(req, nreq, head, r_wait) { 1884 while (!list_empty(&tmp_list)) {
1885 req = list_entry(tmp_list.next,
1886 struct ceph_mds_request, r_wait);
1882 list_del_init(&req->r_wait); 1887 list_del_init(&req->r_wait);
1883 __do_request(mdsc, req); 1888 __do_request(mdsc, req);
1884 } 1889 }
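
__wake_requests() now splices the waiters onto a private list before draining it, because __do_request() may re-queue a request onto the original head — something even list_for_each_entry_safe() does not tolerate. A minimal singly-linked model of the same idea:

#include <stdio.h>
#include <stddef.h>

struct req { int id; struct req *next; };

static struct req *head;

static void do_request(struct req *r)
{
	printf("handling request %d\n", r->id);
	/* may push new work onto `head` here without disturbing the drain */
}

static void wake_requests(void)
{
	struct req *tmp = head;		/* list_splice_init(head, &tmp) */
	head = NULL;

	while (tmp) {
		struct req *r = tmp;
		tmp = r->next;
		do_request(r);
	}
}

int main(void)
{
	struct req b = { 2, NULL }, a = { 1, &b };

	head = &a;
	wake_requests();
	return 0;
}
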
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 2eb43f211325..e86aa9948124 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -403,8 +403,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
403 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); 403 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
404 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) 404 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
405 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); 405 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
406 if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
407 seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
408 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) 406 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
409 seq_printf(m, ",osdkeepalivetimeout=%d", 407 seq_printf(m, ",osdkeepalivetimeout=%d",
410 opt->osd_keepalive_timeout); 408 opt->osd_keepalive_timeout);
@@ -849,7 +847,7 @@ static int ceph_register_bdi(struct super_block *sb,
849 fsc->backing_dev_info.ra_pages = 847 fsc->backing_dev_info.ra_pages =
850 default_backing_dev_info.ra_pages; 848 default_backing_dev_info.ra_pages;
851 849
852 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", 850 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
853 atomic_long_inc_return(&bdi_seq)); 851 atomic_long_inc_return(&bdi_seq));
854 if (!err) 852 if (!err)
855 sb->s_bdi = &fsc->backing_dev_info; 853 sb->s_bdi = &fsc->backing_dev_info;
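
The bdi_register() format change above is a correctness fix rather than cosmetics: atomic_long_inc_return() yields a long, and printing a long through "%d" is undefined behaviour on LP64 targets:

#include <stdio.h>

int main(void)
{
	long bdi_seq = 42;

	printf("ceph-%ld\n", bdi_seq);	/* was "ceph-%d" */
	return 0;
}
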
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2075ddfffa73..21ff76c22a17 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -122,9 +122,17 @@ config CIFS_ACL
122 Allows fetching CIFS/NTFS ACL from the server. The DACL blob 122 Allows fetching CIFS/NTFS ACL from the server. The DACL blob
123 is handed over to the application/caller. 123 is handed over to the application/caller.
124 124
125config CIFS_DEBUG
126 bool "Enable CIFS debugging routines"
127 default y
128 depends on CIFS
129 help
130 Enabling this option adds helpful debugging messages to
131 the cifs code, which increases the size of the cifs module.
132 If unsure, say Y.
125config CIFS_DEBUG2 133config CIFS_DEBUG2
126 bool "Enable additional CIFS debugging routines" 134 bool "Enable additional CIFS debugging routines"
127 depends on CIFS 135 depends on CIFS_DEBUG
128 help 136 help
129 Enabling this option adds a few more debugging routines 137 Enabling this option adds a few more debugging routines
130 to the cifs code, which slightly increases the size of 138 to the cifs code, which slightly increases the size of
diff --git a/fs/cifs/README b/fs/cifs/README
index 22ab7b5b8da7..2d5622f60e11 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -480,7 +480,7 @@ A partial list of the supported mount options follows:
480 Unicode on the wire. 480 Unicode on the wire.
481 nomapchars Do not translate any of these seven characters (default). 481 nomapchars Do not translate any of these seven characters (default).
482 nocase Request case insensitive path name matching (case 482 nocase Request case insensitive path name matching (case
483 sensitive is the default if the server suports it). 483 sensitive is the default if the server supports it).
484 (mount option "ignorecase" is identical to "nocase") 484 (mount option "ignorecase" is identical to "nocase")
485 posixpaths If CIFS Unix extensions are supported, attempt to 485 posixpaths If CIFS Unix extensions are supported, attempt to
486 negotiate posix path name support which allows certain 486 negotiate posix path name support which allows certain
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index c0c68bb492d7..69ae3d3c3b31 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -18,7 +18,6 @@
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * 19 *
20*/ 20*/
21#define CIFS_DEBUG /* BB temporary */
22 21
23#ifndef _H_CIFS_DEBUG 22#ifndef _H_CIFS_DEBUG
24#define _H_CIFS_DEBUG 23#define _H_CIFS_DEBUG
@@ -37,49 +36,39 @@ void dump_smb(void *, int);
37#define CIFS_RC 0x02 36#define CIFS_RC 0x02
38#define CIFS_TIMER 0x04 37#define CIFS_TIMER 0x04
39 38
39extern int cifsFYI;
40
40/* 41/*
41 * debug ON 42 * debug ON
42 * -------- 43 * --------
43 */ 44 */
44#ifdef CIFS_DEBUG 45#ifdef CONFIG_CIFS_DEBUG
45 46
46/* information message: e.g., configuration, major event */ 47/* information message: e.g., configuration, major event */
47extern int cifsFYI; 48#define cifsfyi(fmt, ...) \
48#define cifsfyi(fmt, arg...) \
49do { \ 49do { \
50 if (cifsFYI & CIFS_INFO) \ 50 if (cifsFYI & CIFS_INFO) \
51 printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \ 51 printk(KERN_DEBUG "%s: " fmt "\n", \
52} while (0) 52 __FILE__, ##__VA_ARGS__); \
53
54#define cFYI(set, fmt, arg...) \
55do { \
56 if (set) \
57 cifsfyi(fmt, ##arg); \
58} while (0) 53} while (0)
59 54
60#define cifswarn(fmt, arg...) \ 55#define cFYI(set, fmt, ...) \
61 printk(KERN_WARNING fmt "\n", ##arg)
62
63/* debug event message: */
64extern int cifsERROR;
65
66#define cEVENT(fmt, arg...) \
67do { \ 56do { \
68 if (cifsERROR) \ 57 if (set) \
69 printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \ 58 cifsfyi(fmt, ##__VA_ARGS__); \
70} while (0) 59} while (0)
71 60
61#define cifswarn(fmt, ...) \
62 printk(KERN_WARNING fmt "\n", ##__VA_ARGS__)
63
72/* error event message: e.g., i/o error */ 64/* error event message: e.g., i/o error */
73#define cifserror(fmt, arg...) \ 65#define cifserror(fmt, ...) \
74do { \ 66 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
75 if (cifsERROR) \
76 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \
77} while (0)
78 67
79#define cERROR(set, fmt, arg...) \ 68#define cERROR(set, fmt, ...) \
80do { \ 69do { \
81 if (set) \ 70 if (set) \
82 cifserror(fmt, ##arg); \ 71 cifserror(fmt, ##__VA_ARGS__); \
83} while (0) 72} while (0)
84 73
85/* 74/*
@@ -87,10 +76,27 @@ do { \
87 * --------- 76 * ---------
88 */ 77 */
89#else /* _CIFS_DEBUG */ 78#else /* _CIFS_DEBUG */
90#define cERROR(set, fmt, arg...) 79#define cifsfyi(fmt, ...) \
91#define cEVENT(fmt, arg...) 80do { \
92#define cFYI(set, fmt, arg...) 81 if (0) \
93#define cifserror(fmt, arg...) 82 printk(KERN_DEBUG "%s: " fmt "\n", \
83 __FILE__, ##__VA_ARGS__); \
84} while (0)
85#define cFYI(set, fmt, ...) \
86do { \
87 if (0 && set) \
88 cifsfyi(fmt, ##__VA_ARGS__); \
89} while (0)
90#define cifserror(fmt, ...) \
91do { \
92 if (0) \
93 printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
94} while (0)
95#define cERROR(set, fmt, ...) \
96do { \
97 if (0 && set) \
98 cifserror(fmt, ##__VA_ARGS__); \
99} while (0)
94#endif /* _CIFS_DEBUG */ 100#endif /* _CIFS_DEBUG */
95 101
96#endif /* _H_CIFS_DEBUG */ 102#endif /* _H_CIFS_DEBUG */
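
Worth noting in the rewritten debug-off branch: the disabled macros are deliberately `do { if (0) printk(...); } while (0)` rather than empty, so the compiler still type-checks format strings and arguments before discarding the dead call. The same trick in standalone form:

#include <stdio.h>

#ifdef DEBUG
#define dbg(fmt, ...) printf(fmt "\n", ##__VA_ARGS__)
#else
#define dbg(fmt, ...)					\
do {							\
	if (0)						\
		printf(fmt "\n", ##__VA_ARGS__);	\
} while (0)
#endif

int main(void)
{
	dbg("value=%d", 42);	/* arguments stay checked either way */
	return 0;
}

Compiled without -DDEBUG, the call emits no code, yet a bad format string still produces a warning.
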
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ce5cbd717bfc..210fce2df308 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -226,6 +226,8 @@ compose_mount_options_out:
226compose_mount_options_err: 226compose_mount_options_err:
227 kfree(mountdata); 227 kfree(mountdata);
228 mountdata = ERR_PTR(rc); 228 mountdata = ERR_PTR(rc);
229 kfree(*devname);
230 *devname = NULL;
229 goto compose_mount_options_out; 231 goto compose_mount_options_out;
230} 232}
231 233
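
The cifs_dfs_ref.c error path now frees the caller-supplied devname and clears the pointer, so a later free by the caller cannot hit the same allocation twice. The generic shape of that defence (userspace sketch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void compose_failed(char **devname)
{
	free(*devname);
	*devname = NULL;	/* defuse the caller's own free() */
}

int main(void)
{
	char *devname = strdup("//server/share");

	compose_failed(&devname);
	free(devname);		/* free(NULL) is a safe no-op */
	printf("no double free\n");
	return 0;
}
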
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 0fb15bbbe43c..5cbd00e74067 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -42,135 +42,27 @@ static const struct cifs_sid sid_authusers = {
42/* group users */ 42/* group users */
43static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; 43static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
44 44
45const struct cred *root_cred; 45static const struct cred *root_cred;
46
47static void
48shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
49 int *nr_del)
50{
51 struct rb_node *node;
52 struct rb_node *tmp;
53 struct cifs_sid_id *psidid;
54
55 node = rb_first(root);
56 while (node) {
57 tmp = node;
58 node = rb_next(tmp);
59 psidid = rb_entry(tmp, struct cifs_sid_id, rbnode);
60 if (nr_to_scan == 0 || *nr_del == nr_to_scan)
61 ++(*nr_rem);
62 else {
63 if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE)
64 && psidid->refcount == 0) {
65 rb_erase(tmp, root);
66 ++(*nr_del);
67 } else
68 ++(*nr_rem);
69 }
70 }
71}
72
73/*
74 * Run idmap cache shrinker.
75 */
76static int
77cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
78{
79 int nr_to_scan = sc->nr_to_scan;
80 int nr_del = 0;
81 int nr_rem = 0;
82 struct rb_root *root;
83
84 root = &uidtree;
85 spin_lock(&siduidlock);
86 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
87 spin_unlock(&siduidlock);
88
89 root = &gidtree;
90 spin_lock(&sidgidlock);
91 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
92 spin_unlock(&sidgidlock);
93
94 root = &siduidtree;
95 spin_lock(&uidsidlock);
96 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
97 spin_unlock(&uidsidlock);
98
99 root = &sidgidtree;
100 spin_lock(&gidsidlock);
101 shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
102 spin_unlock(&gidsidlock);
103
104 return nr_rem;
105}
106
107static void
108sid_rb_insert(struct rb_root *root, unsigned long cid,
109 struct cifs_sid_id **psidid, char *typestr)
110{
111 char *strptr;
112 struct rb_node *node = root->rb_node;
113 struct rb_node *parent = NULL;
114 struct rb_node **linkto = &(root->rb_node);
115 struct cifs_sid_id *lsidid;
116
117 while (node) {
118 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
119 parent = node;
120 if (cid > lsidid->id) {
121 linkto = &(node->rb_left);
122 node = node->rb_left;
123 }
124 if (cid < lsidid->id) {
125 linkto = &(node->rb_right);
126 node = node->rb_right;
127 }
128 }
129
130 (*psidid)->id = cid;
131 (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
132 (*psidid)->refcount = 0;
133
134 sprintf((*psidid)->sidstr, "%s", typestr);
135 strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
136 sprintf(strptr, "%ld", cid);
137
138 clear_bit(SID_ID_PENDING, &(*psidid)->state);
139 clear_bit(SID_ID_MAPPED, &(*psidid)->state);
140
141 rb_link_node(&(*psidid)->rbnode, parent, linkto);
142 rb_insert_color(&(*psidid)->rbnode, root);
143}
144
145static struct cifs_sid_id *
146sid_rb_search(struct rb_root *root, unsigned long cid)
147{
148 struct rb_node *node = root->rb_node;
149 struct cifs_sid_id *lsidid;
150
151 while (node) {
152 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
153 if (cid > lsidid->id)
154 node = node->rb_left;
155 else if (cid < lsidid->id)
156 node = node->rb_right;
157 else /* node found */
158 return lsidid;
159 }
160
161 return NULL;
162}
163
164static struct shrinker cifs_shrinker = {
165 .shrink = cifs_idmap_shrinker,
166 .seeks = DEFAULT_SEEKS,
167};
168 46
169static int 47static int
170cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) 48cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
171{ 49{
172 char *payload; 50 char *payload;
173 51
52 /*
53 * If the payload is less than or equal to the size of a pointer, then
54 * an allocation here is wasteful. Just copy the data directly to the
55 * payload.value union member instead.
56 *
57 * With this however, you must check the datalen before trying to
58 * dereference payload.data!
59 */
60 if (prep->datalen <= sizeof(key->payload)) {
61 key->payload.value = 0;
62 memcpy(&key->payload.value, prep->data, prep->datalen);
63 key->datalen = prep->datalen;
64 return 0;
65 }
174 payload = kmalloc(prep->datalen, GFP_KERNEL); 66 payload = kmalloc(prep->datalen, GFP_KERNEL);
175 if (!payload) 67 if (!payload)
176 return -ENOMEM; 68 return -ENOMEM;
@@ -184,10 +76,11 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
184static inline void 76static inline void
185cifs_idmap_key_destroy(struct key *key) 77cifs_idmap_key_destroy(struct key *key)
186{ 78{
187 kfree(key->payload.data); 79 if (key->datalen > sizeof(key->payload))
80 kfree(key->payload.data);
188} 81}
189 82
190struct key_type cifs_idmap_key_type = { 83static struct key_type cifs_idmap_key_type = {
191 .name = "cifs.idmap", 84 .name = "cifs.idmap",
192 .instantiate = cifs_idmap_key_instantiate, 85 .instantiate = cifs_idmap_key_instantiate,
193 .destroy = cifs_idmap_key_destroy, 86 .destroy = cifs_idmap_key_destroy,
@@ -195,221 +88,174 @@ struct key_type cifs_idmap_key_type = {
195 .match = user_match, 88 .match = user_match,
196}; 89};
197 90
198static void 91static char *
199sid_to_str(struct cifs_sid *sidptr, char *sidstr) 92sid_to_key_str(struct cifs_sid *sidptr, unsigned int type)
200{ 93{
201 int i; 94 int i, len;
202 unsigned long saval; 95 unsigned int saval;
203 char *strptr; 96 char *sidstr, *strptr;
97 unsigned long long id_auth_val;
98
99 /* 3 bytes for prefix */
100 sidstr = kmalloc(3 + SID_STRING_BASE_SIZE +
101 (SID_STRING_SUBAUTH_SIZE * sidptr->num_subauth),
102 GFP_KERNEL);
103 if (!sidstr)
104 return sidstr;
204 105
205 strptr = sidstr; 106 strptr = sidstr;
107 len = sprintf(strptr, "%cs:S-%hhu", type == SIDOWNER ? 'o' : 'g',
108 sidptr->revision);
109 strptr += len;
110
111 /* The authority field is a single 48-bit number */
112 id_auth_val = (unsigned long long)sidptr->authority[5];
113 id_auth_val |= (unsigned long long)sidptr->authority[4] << 8;
114 id_auth_val |= (unsigned long long)sidptr->authority[3] << 16;
115 id_auth_val |= (unsigned long long)sidptr->authority[2] << 24;
116 id_auth_val |= (unsigned long long)sidptr->authority[1] << 32;
117 id_auth_val |= (unsigned long long)sidptr->authority[0] << 48;
206 118
207 sprintf(strptr, "%s", "S"); 119 /*
208 strptr = sidstr + strlen(sidstr); 120 * MS-DTYP states that if the authority is >= 2^32, then it should be
209 121 * expressed as a hex value.
210 sprintf(strptr, "-%d", sidptr->revision); 122 */
211 strptr = sidstr + strlen(sidstr); 123 if (id_auth_val <= UINT_MAX)
124 len = sprintf(strptr, "-%llu", id_auth_val);
125 else
126 len = sprintf(strptr, "-0x%llx", id_auth_val);
212 127
213 for (i = 0; i < 6; ++i) { 128 strptr += len;
214 if (sidptr->authority[i]) {
215 sprintf(strptr, "-%d", sidptr->authority[i]);
216 strptr = sidstr + strlen(sidstr);
217 }
218 }
219 129
220 for (i = 0; i < sidptr->num_subauth; ++i) { 130 for (i = 0; i < sidptr->num_subauth; ++i) {
221 saval = le32_to_cpu(sidptr->sub_auth[i]); 131 saval = le32_to_cpu(sidptr->sub_auth[i]);
222 sprintf(strptr, "-%ld", saval); 132 len = sprintf(strptr, "-%u", saval);
223 strptr = sidstr + strlen(sidstr); 133 strptr += len;
224 } 134 }
225}
226 135
227static void 136 return sidstr;
228cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
229{
230 memcpy(dst, src, sizeof(*dst));
231 dst->num_subauth = min_t(u8, src->num_subauth, NUM_SUBAUTHS);
232} 137}
233 138
234static void 139/*
235id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, 140 * if the two SIDs (roughly equivalent to a UUID for a user or group) are
236 struct cifs_sid_id **psidid, char *typestr) 141 * the same returns zero, if they do not match returns non-zero.
142 */
143static int
144compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
237{ 145{
238 int rc; 146 int i;
239 char *strptr; 147 int num_subauth, num_sat, num_saw;
240 struct rb_node *node = root->rb_node;
241 struct rb_node *parent = NULL;
242 struct rb_node **linkto = &(root->rb_node);
243 struct cifs_sid_id *lsidid;
244
245 while (node) {
246 lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
247 parent = node;
248 rc = compare_sids(sidptr, &((lsidid)->sid));
249 if (rc > 0) {
250 linkto = &(node->rb_left);
251 node = node->rb_left;
252 } else if (rc < 0) {
253 linkto = &(node->rb_right);
254 node = node->rb_right;
255 }
256 }
257
258 cifs_copy_sid(&(*psidid)->sid, sidptr);
259 (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
260 (*psidid)->refcount = 0;
261 148
262 sprintf((*psidid)->sidstr, "%s", typestr); 149 if ((!ctsid) || (!cwsid))
263 strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); 150 return 1;
264 sid_to_str(&(*psidid)->sid, strptr);
265 151
266 clear_bit(SID_ID_PENDING, &(*psidid)->state); 152 /* compare the revision */
267 clear_bit(SID_ID_MAPPED, &(*psidid)->state); 153 if (ctsid->revision != cwsid->revision) {
154 if (ctsid->revision > cwsid->revision)
155 return 1;
156 else
157 return -1;
158 }
268 159
269 rb_link_node(&(*psidid)->rbnode, parent, linkto); 160 /* compare all of the six auth values */
270 rb_insert_color(&(*psidid)->rbnode, root); 161 for (i = 0; i < NUM_AUTHS; ++i) {
271} 162 if (ctsid->authority[i] != cwsid->authority[i]) {
163 if (ctsid->authority[i] > cwsid->authority[i])
164 return 1;
165 else
166 return -1;
167 }
168 }
272 169
273static struct cifs_sid_id * 170 /* compare all of the subauth values if any */
274id_rb_search(struct rb_root *root, struct cifs_sid *sidptr) 171 num_sat = ctsid->num_subauth;
275{ 172 num_saw = cwsid->num_subauth;
276 int rc; 173 num_subauth = num_sat < num_saw ? num_sat : num_saw;
277 struct rb_node *node = root->rb_node; 174 if (num_subauth) {
278 struct cifs_sid_id *lsidid; 175 for (i = 0; i < num_subauth; ++i) {
279 176 if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
280 while (node) { 177 if (le32_to_cpu(ctsid->sub_auth[i]) >
281 lsidid = rb_entry(node, struct cifs_sid_id, rbnode); 178 le32_to_cpu(cwsid->sub_auth[i]))
282 rc = compare_sids(sidptr, &((lsidid)->sid)); 179 return 1;
283 if (rc > 0) { 180 else
284 node = node->rb_left; 181 return -1;
285 } else if (rc < 0) { 182 }
286 node = node->rb_right; 183 }
287 } else /* node found */
288 return lsidid;
289 } 184 }
290 185
291 return NULL; 186 return 0; /* sids compare/match */
292} 187}
293 188
294static int 189static void
295sidid_pending_wait(void *unused) 190cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
296{ 191{
297 schedule(); 192 int i;
298 return signal_pending(current) ? -ERESTARTSYS : 0; 193
194 dst->revision = src->revision;
195 dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES);
196 for (i = 0; i < NUM_AUTHS; ++i)
197 dst->authority[i] = src->authority[i];
198 for (i = 0; i < dst->num_subauth; ++i)
199 dst->sub_auth[i] = src->sub_auth[i];
299} 200}
300 201
301static int 202static int
302id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) 203id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
303{ 204{
304 int rc = 0; 205 int rc;
305 struct key *sidkey; 206 struct key *sidkey;
207 struct cifs_sid *ksid;
208 unsigned int ksid_size;
209 char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */
306 const struct cred *saved_cred; 210 const struct cred *saved_cred;
307 struct cifs_sid *lsid;
308 struct cifs_sid_id *psidid, *npsidid;
309 struct rb_root *cidtree;
310 spinlock_t *cidlock;
311
312 if (sidtype == SIDOWNER) {
313 cidlock = &siduidlock;
314 cidtree = &uidtree;
315 } else if (sidtype == SIDGROUP) {
316 cidlock = &sidgidlock;
317 cidtree = &gidtree;
318 } else
319 return -EINVAL;
320 211
321 spin_lock(cidlock); 212 rc = snprintf(desc, sizeof(desc), "%ci:%u",
322 psidid = sid_rb_search(cidtree, cid); 213 sidtype == SIDOWNER ? 'o' : 'g', cid);
323 214 if (rc >= sizeof(desc))
324 if (!psidid) { /* node does not exist, allocate one & attempt adding */ 215 return -EINVAL;
325 spin_unlock(cidlock);
326 npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
327 if (!npsidid)
328 return -ENOMEM;
329
330 npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
331 if (!npsidid->sidstr) {
332 kfree(npsidid);
333 return -ENOMEM;
334 }
335 216
336 spin_lock(cidlock); 217 rc = 0;
337 psidid = sid_rb_search(cidtree, cid); 218 saved_cred = override_creds(root_cred);
338 if (psidid) { /* node happened to get inserted meanwhile */ 219 sidkey = request_key(&cifs_idmap_key_type, desc, "");
339 ++psidid->refcount; 220 if (IS_ERR(sidkey)) {
340 spin_unlock(cidlock); 221 rc = -EINVAL;
341 kfree(npsidid->sidstr); 222 cFYI(1, "%s: Can't map %cid %u to a SID", __func__,
342 kfree(npsidid); 223 sidtype == SIDOWNER ? 'u' : 'g', cid);
343 } else { 224 goto out_revert_creds;
344 psidid = npsidid; 225 } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) {
345 sid_rb_insert(cidtree, cid, &psidid, 226 rc = -EIO;
346 sidtype == SIDOWNER ? "oi:" : "gi:"); 227 cFYI(1, "%s: Downcall contained malformed key "
347 ++psidid->refcount; 228 "(datalen=%hu)", __func__, sidkey->datalen);
348 spin_unlock(cidlock); 229 goto invalidate_key;
349 }
350 } else {
351 ++psidid->refcount;
352 spin_unlock(cidlock);
353 } 230 }
354 231
355 /* 232 /*
356 * If we are here, it is safe to access psidid and its fields 233 * A sid is usually too large to be embedded in payload.value, but if
357 * since a reference was taken earlier while holding the spinlock. 234 * there are no subauthorities and the host has 8-byte pointers, then
358 * A reference on the node is put without holding the spinlock 235 * it could be.
359 * and it is OK to do so in this case, shrinker will not erase
360 * this node until all references are put and we do not access
361 * any fields of the node after a reference is put .
362 */ 236 */
363 if (test_bit(SID_ID_MAPPED, &psidid->state)) { 237 ksid = sidkey->datalen <= sizeof(sidkey->payload) ?
364 cifs_copy_sid(ssid, &psidid->sid); 238 (struct cifs_sid *)&sidkey->payload.value :
365 psidid->time = jiffies; /* update ts for accessing */ 239 (struct cifs_sid *)sidkey->payload.data;
366 goto id_sid_out; 240
367 } 241 ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
368 242 if (ksid_size > sidkey->datalen) {
369 if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { 243 rc = -EIO;
370 rc = -EINVAL; 244 cFYI(1, "%s: Downcall contained malformed key (datalen=%hu, "
371 goto id_sid_out; 245 "ksid_size=%u)", __func__, sidkey->datalen, ksid_size);
246 goto invalidate_key;
372 } 247 }
373 248
374 if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { 249 cifs_copy_sid(ssid, ksid);
375 saved_cred = override_creds(root_cred); 250out_key_put:
376 sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); 251 key_put(sidkey);
377 if (IS_ERR(sidkey)) { 252out_revert_creds:
378 rc = -EINVAL; 253 revert_creds(saved_cred);
379 cFYI(1, "%s: Can't map and id to a SID", __func__);
380 } else if (sidkey->datalen < sizeof(struct cifs_sid)) {
381 rc = -EIO;
382 cFYI(1, "%s: Downcall contained malformed key "
383 "(datalen=%hu)", __func__, sidkey->datalen);
384 } else {
385 lsid = (struct cifs_sid *)sidkey->payload.data;
386 cifs_copy_sid(&psidid->sid, lsid);
387 cifs_copy_sid(ssid, &psidid->sid);
388 set_bit(SID_ID_MAPPED, &psidid->state);
389 key_put(sidkey);
390 kfree(psidid->sidstr);
391 }
392 psidid->time = jiffies; /* update ts for accessing */
393 revert_creds(saved_cred);
394 clear_bit(SID_ID_PENDING, &psidid->state);
395 wake_up_bit(&psidid->state, SID_ID_PENDING);
396 } else {
397 rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
398 sidid_pending_wait, TASK_INTERRUPTIBLE);
399 if (rc) {
400 cFYI(1, "%s: sidid_pending_wait interrupted %d",
401 __func__, rc);
402 --psidid->refcount;
403 return rc;
404 }
405 if (test_bit(SID_ID_MAPPED, &psidid->state))
406 cifs_copy_sid(ssid, &psidid->sid);
407 else
408 rc = -EINVAL;
409 }
410id_sid_out:
411 --psidid->refcount;
412 return rc; 254 return rc;
255
256invalidate_key:
257 key_invalidate(sidkey);
258 goto out_key_put;
413} 259}
414 260
415static int 261static int
@@ -417,111 +263,67 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
417 struct cifs_fattr *fattr, uint sidtype) 263 struct cifs_fattr *fattr, uint sidtype)
418{ 264{
419 int rc; 265 int rc;
420 unsigned long cid; 266 struct key *sidkey;
421 struct key *idkey; 267 char *sidstr;
422 const struct cred *saved_cred; 268 const struct cred *saved_cred;
423 struct cifs_sid_id *psidid, *npsidid; 269 uid_t fuid = cifs_sb->mnt_uid;
424 struct rb_root *cidtree; 270 gid_t fgid = cifs_sb->mnt_gid;
425 spinlock_t *cidlock;
426
427 if (sidtype == SIDOWNER) {
428 cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */
429 cidlock = &siduidlock;
430 cidtree = &uidtree;
431 } else if (sidtype == SIDGROUP) {
432 cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */
433 cidlock = &sidgidlock;
434 cidtree = &gidtree;
435 } else
436 return -ENOENT;
437
438 spin_lock(cidlock);
439 psidid = id_rb_search(cidtree, psid);
440
441 if (!psidid) { /* node does not exist, allocate one & attempt adding */
442 spin_unlock(cidlock);
443 npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
444 if (!npsidid)
445 return -ENOMEM;
446
447 npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
448 if (!npsidid->sidstr) {
449 kfree(npsidid);
450 return -ENOMEM;
451 }
452
453 spin_lock(cidlock);
454 psidid = id_rb_search(cidtree, psid);
455 if (psidid) { /* node happened to get inserted meanwhile */
456 ++psidid->refcount;
457 spin_unlock(cidlock);
458 kfree(npsidid->sidstr);
459 kfree(npsidid);
460 } else {
461 psidid = npsidid;
462 id_rb_insert(cidtree, psid, &psidid,
463 sidtype == SIDOWNER ? "os:" : "gs:");
464 ++psidid->refcount;
465 spin_unlock(cidlock);
466 }
467 } else {
468 ++psidid->refcount;
469 spin_unlock(cidlock);
470 }
471 271
472 /* 272 /*
473 * If we are here, it is safe to access psidid and its fields 273 * If we have too many subauthorities, then something is really wrong.
474 * since a reference was taken earlier while holding the spinlock. 274 * Just return an error.
475 * A reference on the node is put without holding the spinlock
476 * and it is OK to do so in this case, shrinker will not erase
477 * this node until all references are put and we do not access
478 * any fields of the node after a reference is put .
479 */ 275 */
480 if (test_bit(SID_ID_MAPPED, &psidid->state)) { 276 if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) {
481 cid = psidid->id; 277 cFYI(1, "%s: %u subauthorities is too many!", __func__,
482 psidid->time = jiffies; /* update ts for accessing */ 278 psid->num_subauth);
483 goto sid_to_id_out; 279 return -EIO;
484 } 280 }
485 281
486 if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) 282 sidstr = sid_to_key_str(psid, sidtype);
487 goto sid_to_id_out; 283 if (!sidstr)
488 284 return -ENOMEM;
489 if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { 285
490 saved_cred = override_creds(root_cred); 286 saved_cred = override_creds(root_cred);
491 idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); 287 sidkey = request_key(&cifs_idmap_key_type, sidstr, "");
492 if (IS_ERR(idkey)) 288 if (IS_ERR(sidkey)) {
493 cFYI(1, "%s: Can't map SID to an id", __func__); 289 rc = -EINVAL;
494 else { 290 cFYI(1, "%s: Can't map SID %s to a %cid", __func__, sidstr,
495 cid = *(unsigned long *)idkey->payload.value; 291 sidtype == SIDOWNER ? 'u' : 'g');
496 psidid->id = cid; 292 goto out_revert_creds;
497 set_bit(SID_ID_MAPPED, &psidid->state); 293 }
498 key_put(idkey); 294
499 kfree(psidid->sidstr); 295 /*
500 } 296 * FIXME: Here we assume that uid_t and gid_t are same size. It's
501 revert_creds(saved_cred); 297 * probably a safe assumption but might be better to check based on
502 psidid->time = jiffies; /* update ts for accessing */ 298 * sidtype.
503 clear_bit(SID_ID_PENDING, &psidid->state); 299 */
504 wake_up_bit(&psidid->state, SID_ID_PENDING); 300 if (sidkey->datalen != sizeof(uid_t)) {
505 } else { 301 rc = -EIO;
506 rc = wait_on_bit(&psidid->state, SID_ID_PENDING, 302 cFYI(1, "%s: Downcall contained malformed key "
507 sidid_pending_wait, TASK_INTERRUPTIBLE); 303 "(datalen=%hu)", __func__, sidkey->datalen);
508 if (rc) { 304 key_invalidate(sidkey);
509 cFYI(1, "%s: sidid_pending_wait interrupted %d", 305 goto out_key_put;
510 __func__, rc);
511 --psidid->refcount; /* decremented without spinlock */
512 return rc;
513 }
514 if (test_bit(SID_ID_MAPPED, &psidid->state))
515 cid = psidid->id;
516 } 306 }
517 307
518sid_to_id_out:
519 --psidid->refcount; /* decremented without spinlock */
520 if (sidtype == SIDOWNER) 308 if (sidtype == SIDOWNER)
521 fattr->cf_uid = cid; 309 memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t));
522 else 310 else
523 fattr->cf_gid = cid; 311 memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t));
312
313out_key_put:
314 key_put(sidkey);
315out_revert_creds:
316 revert_creds(saved_cred);
317 kfree(sidstr);
524 318
319 /*
320 * Note that we return 0 here unconditionally. If the mapping
321 * fails then we just fall back to using the mnt_uid/mnt_gid.
322 */
323 if (sidtype == SIDOWNER)
324 fattr->cf_uid = fuid;
325 else
326 fattr->cf_gid = fgid;
525 return 0; 327 return 0;
526} 328}
527 329
@@ -544,19 +346,15 @@ init_cifs_idmap(void)
544 if (!cred) 346 if (!cred)
545 return -ENOMEM; 347 return -ENOMEM;
546 348
547 keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred, 349 keyring = keyring_alloc(".cifs_idmap", 0, 0, cred,
548 (KEY_POS_ALL & ~KEY_POS_SETATTR) | 350 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
549 KEY_USR_VIEW | KEY_USR_READ, 351 KEY_USR_VIEW | KEY_USR_READ,
550 KEY_ALLOC_NOT_IN_QUOTA); 352 KEY_ALLOC_NOT_IN_QUOTA, NULL);
551 if (IS_ERR(keyring)) { 353 if (IS_ERR(keyring)) {
552 ret = PTR_ERR(keyring); 354 ret = PTR_ERR(keyring);
553 goto failed_put_cred; 355 goto failed_put_cred;
554 } 356 }
555 357
556 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
557 if (ret < 0)
558 goto failed_put_key;
559
560 ret = register_key_type(&cifs_idmap_key_type); 358 ret = register_key_type(&cifs_idmap_key_type);
561 if (ret < 0) 359 if (ret < 0)
562 goto failed_put_key; 360 goto failed_put_key;
@@ -568,17 +366,6 @@ init_cifs_idmap(void)
568 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 366 cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
569 root_cred = cred; 367 root_cred = cred;
570 368
571 spin_lock_init(&siduidlock);
572 uidtree = RB_ROOT;
573 spin_lock_init(&sidgidlock);
574 gidtree = RB_ROOT;
575
576 spin_lock_init(&uidsidlock);
577 siduidtree = RB_ROOT;
578 spin_lock_init(&gidsidlock);
579 sidgidtree = RB_ROOT;
580 register_shrinker(&cifs_shrinker);
581
582 cFYI(1, "cifs idmap keyring: %d", key_serial(keyring)); 369 cFYI(1, "cifs idmap keyring: %d", key_serial(keyring));
583 return 0; 370 return 0;
584 371
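init_cifs_idmap() shrinks because keyring_alloc() both allocates and instantiates the keyring, so the separate key_instantiate_and_link() step is gone. A condensed before/after sketch using only the calls visible in this hunk (kernel-side code, not compilable standalone; perms abbreviates the permission mask above):

/* before: two steps, with a failure path between them */
keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred,
                    perms, KEY_ALLOC_NOT_IN_QUOTA);
ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);

/* after: one call allocates and instantiates the keyring */
keyring = keyring_alloc(".cifs_idmap", 0, 0, cred,
                        perms, KEY_ALLOC_NOT_IN_QUOTA, NULL);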
@@ -595,89 +382,9 @@ exit_cifs_idmap(void)
595 key_revoke(root_cred->thread_keyring); 382 key_revoke(root_cred->thread_keyring);
596 unregister_key_type(&cifs_idmap_key_type); 383 unregister_key_type(&cifs_idmap_key_type);
597 put_cred(root_cred); 384 put_cred(root_cred);
598 unregister_shrinker(&cifs_shrinker);
599 cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name); 385 cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name);
600} 386}
601 387
602void
603cifs_destroy_idmaptrees(void)
604{
605 struct rb_root *root;
606 struct rb_node *node;
607
608 root = &uidtree;
609 spin_lock(&siduidlock);
610 while ((node = rb_first(root)))
611 rb_erase(node, root);
612 spin_unlock(&siduidlock);
613
614 root = &gidtree;
615 spin_lock(&sidgidlock);
616 while ((node = rb_first(root)))
617 rb_erase(node, root);
618 spin_unlock(&sidgidlock);
619
620 root = &siduidtree;
621 spin_lock(&uidsidlock);
622 while ((node = rb_first(root)))
623 rb_erase(node, root);
624 spin_unlock(&uidsidlock);
625
626 root = &sidgidtree;
627 spin_lock(&gidsidlock);
628 while ((node = rb_first(root)))
629 rb_erase(node, root);
630 spin_unlock(&gidsidlock);
631}
632
633/* Compare two SIDs (roughly equivalent to a UUID for a user or group):
634 returns 0 if they match, else 1 or -1 depending on which compares greater */
635int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
636{
637 int i;
638 int num_subauth, num_sat, num_saw;
639
640 if ((!ctsid) || (!cwsid))
641 return 1;
642
643 /* compare the revision */
644 if (ctsid->revision != cwsid->revision) {
645 if (ctsid->revision > cwsid->revision)
646 return 1;
647 else
648 return -1;
649 }
650
651 /* compare all of the six auth values */
652 for (i = 0; i < 6; ++i) {
653 if (ctsid->authority[i] != cwsid->authority[i]) {
654 if (ctsid->authority[i] > cwsid->authority[i])
655 return 1;
656 else
657 return -1;
658 }
659 }
660
661 /* compare all of the subauth values if any */
662 num_sat = ctsid->num_subauth;
663 num_saw = cwsid->num_subauth;
664 num_subauth = num_sat < num_saw ? num_sat : num_saw;
665 if (num_subauth) {
666 for (i = 0; i < num_subauth; ++i) {
667 if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
668 if (le32_to_cpu(ctsid->sub_auth[i]) >
669 le32_to_cpu(cwsid->sub_auth[i]))
670 return 1;
671 else
672 return -1;
673 }
674 }
675 }
676
677 return 0; /* sids compare/match */
678}
679
680
681/* copy ntsd, owner sid, and group sid from one security descriptor to another */ 388/* copy ntsd, owner sid, and group sid from one security descriptor to another */
682static void copy_sec_desc(const struct cifs_ntsd *pntsd, 389static void copy_sec_desc(const struct cifs_ntsd *pntsd,
683 struct cifs_ntsd *pnntsd, __u32 sidsoffset) 390 struct cifs_ntsd *pnntsd, __u32 sidsoffset)
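compare_sids() disappears from this spot (its extern also goes away in cifsacl.h below). Its three-way ordering, revision first, then the six authority bytes, then the common prefix of sub-authorities, is easy to restate as a runnable userspace analogue (simplified struct, host-endian sub_auths, no NULL checks; the kernel compares le32 values):

#include <stdio.h>

#define NUM_AUTHS    6
#define MAX_SUBAUTHS 15

struct sid {
        unsigned char revision;
        unsigned char num_subauth;
        unsigned char authority[NUM_AUTHS];
        unsigned int sub_auth[MAX_SUBAUTHS];
};

/* returns 0 on match, 1/-1 depending on which SID compares greater */
static int compare_sids(const struct sid *a, const struct sid *b)
{
        int i, n;

        if (a->revision != b->revision)
                return a->revision > b->revision ? 1 : -1;
        for (i = 0; i < NUM_AUTHS; i++)
                if (a->authority[i] != b->authority[i])
                        return a->authority[i] > b->authority[i] ? 1 : -1;
        /* compare only the common prefix of sub-authorities */
        n = a->num_subauth < b->num_subauth ? a->num_subauth
                                            : b->num_subauth;
        for (i = 0; i < n; i++)
                if (a->sub_auth[i] != b->sub_auth[i])
                        return a->sub_auth[i] > b->sub_auth[i] ? 1 : -1;
        return 0;
}

int main(void)
{
        struct sid a = { 1, 2, {0, 0, 0, 0, 0, 5}, {21, 500} };
        struct sid b = { 1, 2, {0, 0, 0, 0, 0, 5}, {21, 501} };

        printf("%d\n", compare_sids(&a, &b));   /* -1 */
        return 0;
}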
@@ -811,7 +518,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
811 518
812 pntace->sid.revision = psid->revision; 519 pntace->sid.revision = psid->revision;
813 pntace->sid.num_subauth = psid->num_subauth; 520 pntace->sid.num_subauth = psid->num_subauth;
814 for (i = 0; i < 6; i++) 521 for (i = 0; i < NUM_AUTHS; i++)
815 pntace->sid.authority[i] = psid->authority[i]; 522 pntace->sid.authority[i] = psid->authority[i];
816 for (i = 0; i < psid->num_subauth; i++) 523 for (i = 0; i < psid->num_subauth; i++)
817 pntace->sid.sub_auth[i] = psid->sub_auth[i]; 524 pntace->sid.sub_auth[i] = psid->sub_auth[i];
@@ -987,8 +694,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
987 return -EINVAL; 694 return -EINVAL;
988 } 695 }
989 696
990 if (psid->num_subauth) {
991#ifdef CONFIG_CIFS_DEBUG2 697#ifdef CONFIG_CIFS_DEBUG2
698 if (psid->num_subauth) {
992 int i; 699 int i;
993 cFYI(1, "SID revision %d num_auth %d", 700 cFYI(1, "SID revision %d num_auth %d",
994 psid->revision, psid->num_subauth); 701 psid->revision, psid->num_subauth);
@@ -1002,8 +709,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
1002 num auths and therefore go off the end */ 709 num auths and therefore go off the end */
1003 cFYI(1, "RID 0x%x", 710 cFYI(1, "RID 0x%x",
1004 le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); 711 le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
1005#endif
1006 } 712 }
713#endif
1007 714
1008 return 0; 715 return 0;
1009} 716}
@@ -1307,42 +1014,39 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
1307 1014
1308 /* Get the security descriptor */ 1015 /* Get the security descriptor */
1309 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); 1016 pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
1310
1311 /* Add three ACEs for owner, group, everyone getting rid of
1312 other ACEs as chmod disables ACEs and set the security descriptor */
1313
1314 if (IS_ERR(pntsd)) { 1017 if (IS_ERR(pntsd)) {
1315 rc = PTR_ERR(pntsd); 1018 rc = PTR_ERR(pntsd);
1316 cERROR(1, "%s: error %d getting sec desc", __func__, rc); 1019 cERROR(1, "%s: error %d getting sec desc", __func__, rc);
1317 } else { 1020 goto out;
1318 /* allocate memory for the smb header, 1021 }
1319 set security descriptor request security descriptor
1320 parameters, and security descriptor itself */
1321
1322 secdesclen = secdesclen < DEFSECDESCLEN ?
1323 DEFSECDESCLEN : secdesclen;
1324 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
1325 if (!pnntsd) {
1326 cERROR(1, "Unable to allocate security descriptor");
1327 kfree(pntsd);
1328 return -ENOMEM;
1329 }
1330 1022
1331 rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, 1023 /*
1332 &aclflag); 1024 * Add three ACEs for owner, group, everyone getting rid of other ACEs
1025 * as chmod disables ACEs and set the security descriptor. Allocate
1026 * memory for the smb header, set security descriptor request security
1027 * descriptor parameters, and security descriptor itself
1028 */
1029 secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN);
1030 pnntsd = kmalloc(secdesclen, GFP_KERNEL);
1031 if (!pnntsd) {
1032 cERROR(1, "Unable to allocate security descriptor");
1033 kfree(pntsd);
1034 return -ENOMEM;
1035 }
1333 1036
1334 cFYI(DBG2, "build_sec_desc rc: %d", rc); 1037 rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
1038 &aclflag);
1335 1039
1336 if (!rc) { 1040 cFYI(DBG2, "build_sec_desc rc: %d", rc);
1337 /* Set the security descriptor */
1338 rc = set_cifs_acl(pnntsd, secdesclen, inode,
1339 path, aclflag);
1340 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
1341 }
1342 1041
1343 kfree(pnntsd); 1042 if (!rc) {
1344 kfree(pntsd); 1043 /* Set the security descriptor */
1044 rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag);
1045 cFYI(DBG2, "set_cifs_acl rc: %d", rc);
1345 } 1046 }
1346 1047
1048 kfree(pnntsd);
1049 kfree(pntsd);
1050out:
1347 return rc; 1051 return rc;
1348} 1052}
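The id_mode_to_cifs_acl() rework replaces the nested else branch with the kernel's usual early-exit style: bail to an out label on error and keep the allocate/free pairs linear. A tiny standalone illustration of the same shape (the helpers are hypothetical stand-ins, not the CIFS functions):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int process(const char *src)
{
        int rc = 0;
        char *old, *new;

        old = strdup(src);              /* stands in for get_cifs_acl() */
        if (!old) {
                rc = -1;
                goto out;               /* nothing allocated yet */
        }

        new = malloc(strlen(old) + 1);  /* stands in for the new desc */
        if (!new) {
                rc = -1;
                goto out_free_old;
        }

        strcpy(new, old);               /* "build" + "set" steps */
        printf("built: %s\n", new);

        free(new);
out_free_old:
        free(old);
out:
        return rc;
}

int main(void)
{
        return process("descriptor") ? 1 : 0;
}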
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 5c902c7ce524..4f3884835267 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -23,11 +23,8 @@
23#define _CIFSACL_H 23#define _CIFSACL_H
24 24
25 25
26#define NUM_AUTHS 6 /* number of authority fields */ 26#define NUM_AUTHS (6) /* number of authority fields */
27#define NUM_SUBAUTHS 5 /* number of sub authority fields */ 27#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */
28#define NUM_WK_SIDS 7 /* number of well known sids */
29#define SIDNAMELENGTH 20 /* long enough for the ones we care about */
30#define DEFSECDESCLEN 192 /* sec desc len containing a dacl with three aces */
31 28
32#define READ_BIT 0x4 29#define READ_BIT 0x4
33#define WRITE_BIT 0x2 30#define WRITE_BIT 0x2
@@ -41,12 +38,32 @@
41 38
42#define SIDOWNER 1 39#define SIDOWNER 1
43#define SIDGROUP 2 40#define SIDGROUP 2
44#define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */
45 41
46#define SID_ID_MAPPED 0 42/*
47#define SID_ID_PENDING 1 43 * Security Descriptor length containing DACL with 3 ACEs (one each for
48#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */ 44 * owner, group and world).
49#define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */ 45 */
46#define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \
47 sizeof(struct cifs_acl) + \
48 (sizeof(struct cifs_ace) * 3))
49
50/*
51 * Maximum size of a string representation of a SID:
52 *
53 * The fields are unsigned values in decimal. So:
54 *
55 * u8: max 3 bytes in decimal
56 * u32: max 10 bytes in decimal
57 *
58 * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator
59 *
60 * For authority field, max is when all 6 values are non-zero and it must be
61 * represented in hex. So "-0x" + 12 hex digits.
62 *
63 * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-')
64 */
65#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1)
66#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */
50 67
51struct cifs_ntsd { 68struct cifs_ntsd {
52 __le16 revision; /* revision level */ 69 __le16 revision; /* revision level */
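Given the constants above, the worst-case buffer for a SID's string form is SID_STRING_BASE_SIZE plus SID_STRING_SUBAUTH_SIZE per sub-authority. A quick userspace check of the arithmetic (the 186-byte total is derived here, not quoted from the source):

#include <stdio.h>

#define SID_MAX_SUB_AUTHORITIES 15
#define SID_STRING_BASE_SIZE    (2 + 3 + 15 + 1) /* "S-", u8 rev, auth, NUL */
#define SID_STRING_SUBAUTH_SIZE 11               /* '-' + u32 in decimal */

int main(void)
{
        /* maximum bytes needed for any legal SID string */
        int max = SID_STRING_BASE_SIZE +
                  SID_MAX_SUB_AUTHORITIES * SID_STRING_SUBAUTH_SIZE;
        char buf[SID_STRING_BASE_SIZE +
                 SID_MAX_SUB_AUTHORITIES * SID_STRING_SUBAUTH_SIZE];
        int n;

        printf("max SID string buffer: %d bytes\n", max);       /* 186 */

        /* a typical SID fits with plenty of room to spare */
        n = snprintf(buf, sizeof(buf), "S-%u-%u-%u-%u-%u-%u",
                     1u, 5u, 21u, 2447346297u, 999999999u, 1000u);
        printf("%s (%d chars)\n", buf, n);
        return 0;
}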
@@ -60,10 +77,13 @@ struct cifs_ntsd {
60struct cifs_sid { 77struct cifs_sid {
61 __u8 revision; /* revision level */ 78 __u8 revision; /* revision level */
62 __u8 num_subauth; 79 __u8 num_subauth;
63 __u8 authority[6]; 80 __u8 authority[NUM_AUTHS];
64 __le32 sub_auth[5]; /* sub_auth[num_subauth] */ 81 __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */
65} __attribute__((packed)); 82} __attribute__((packed));
66 83
84/* size of a struct cifs_sid, sans sub_auth array */
85#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS)
86
67struct cifs_acl { 87struct cifs_acl {
68 __le16 revision; /* revision level */ 88 __le16 revision; /* revision level */
69 __le16 size; 89 __le16 size;
@@ -78,26 +98,4 @@ struct cifs_ace {
78 struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ 98 struct cifs_sid sid; /* ie UUID of user or group who gets these perms */
79} __attribute__((packed)); 99} __attribute__((packed));
80 100
81struct cifs_wksid {
82 struct cifs_sid cifssid;
83 char sidname[SIDNAMELENGTH];
84} __attribute__((packed));
85
86struct cifs_sid_id {
87 unsigned int refcount; /* increment with spinlock, decrement without */
88 unsigned long id;
89 unsigned long time;
90 unsigned long state;
91 char *sidstr;
92 struct rb_node rbnode;
93 struct cifs_sid sid;
94};
95
96#ifdef __KERNEL__
97extern struct key_type cifs_idmap_key_type;
98extern const struct cred *root_cred;
99#endif /* KERNEL */
100
101extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
102
103#endif /* _CIFSACL_H */ 101#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e7931cc55d0c..de7f9168a118 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -54,7 +54,6 @@
54#endif 54#endif
55 55
56int cifsFYI = 0; 56int cifsFYI = 0;
57int cifsERROR = 1;
58int traceSMB = 0; 57int traceSMB = 0;
59bool enable_oplocks = true; 58bool enable_oplocks = true;
60unsigned int linuxExtEnabled = 1; 59unsigned int linuxExtEnabled = 1;
@@ -64,24 +63,23 @@ unsigned int global_secflags = CIFSSEC_DEF;
64unsigned int sign_CIFS_PDUs = 1; 63unsigned int sign_CIFS_PDUs = 1;
65static const struct super_operations cifs_super_ops; 64static const struct super_operations cifs_super_ops;
66unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; 65unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
67module_param(CIFSMaxBufSize, int, 0); 66module_param(CIFSMaxBufSize, uint, 0);
68MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " 67MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
69 "Default: 16384 Range: 8192 to 130048"); 68 "Default: 16384 Range: 8192 to 130048");
70unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; 69unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
71module_param(cifs_min_rcv, int, 0); 70module_param(cifs_min_rcv, uint, 0);
72MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " 71MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: "
73 "1 to 64"); 72 "1 to 64");
74unsigned int cifs_min_small = 30; 73unsigned int cifs_min_small = 30;
75module_param(cifs_min_small, int, 0); 74module_param(cifs_min_small, uint, 0);
76MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " 75MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
77 "Range: 2 to 256"); 76 "Range: 2 to 256");
78unsigned int cifs_max_pending = CIFS_MAX_REQ; 77unsigned int cifs_max_pending = CIFS_MAX_REQ;
79module_param(cifs_max_pending, int, 0444); 78module_param(cifs_max_pending, uint, 0444);
80MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " 79MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
81 "Default: 32767 Range: 2 to 32767."); 80 "Default: 32767 Range: 2 to 32767.");
82module_param(enable_oplocks, bool, 0644); 81module_param(enable_oplocks, bool, 0644);
83MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" 82MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
84 "y/Y/1");
85 83
86extern mempool_t *cifs_sm_req_poolp; 84extern mempool_t *cifs_sm_req_poolp;
87extern mempool_t *cifs_req_poolp; 85extern mempool_t *cifs_req_poolp;
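The module_param() type token is supposed to match the variable's C type, and these globals are unsigned int, hence the int to uint change. A minimal sketch of the corrected form (kernel module code, builds only against a kernel tree; the parameter name is made up):

#include <linux/module.h>
#include <linux/moduleparam.h>

static unsigned int demo_bufsize = 16384;
module_param(demo_bufsize, uint, 0444);  /* uint matches unsigned int */
MODULE_PARM_DESC(demo_bufsize, "Buffer size. Default: 16384");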
@@ -540,8 +538,8 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
540 char *s, *p; 538 char *s, *p;
541 char sep; 539 char sep;
542 540
543 full_path = build_path_to_root(vol, cifs_sb, 541 full_path = cifs_build_path_to_root(vol, cifs_sb,
544 cifs_sb_master_tcon(cifs_sb)); 542 cifs_sb_master_tcon(cifs_sb));
545 if (full_path == NULL) 543 if (full_path == NULL)
546 return ERR_PTR(-ENOMEM); 544 return ERR_PTR(-ENOMEM);
547 545
@@ -695,13 +693,13 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
695 return written; 693 return written;
696} 694}
697 695
698static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) 696static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
699{ 697{
700 /* 698 /*
701 * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate 699 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
702 * the cached file length 700 * the cached file length
703 */ 701 */
704 if (origin != SEEK_SET && origin != SEEK_CUR) { 702 if (whence != SEEK_SET && whence != SEEK_CUR) {
705 int rc; 703 int rc;
706 struct inode *inode = file->f_path.dentry->d_inode; 704 struct inode *inode = file->f_path.dentry->d_inode;
707 705
@@ -728,7 +726,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
728 if (rc < 0) 726 if (rc < 0)
729 return (loff_t)rc; 727 return (loff_t)rc;
730 } 728 }
731 return generic_file_llseek(file, offset, origin); 729 return generic_file_llseek(file, offset, whence);
732} 730}
733 731
734static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 732static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
@@ -1205,7 +1203,6 @@ exit_cifs(void)
1205 unregister_filesystem(&cifs_fs_type); 1203 unregister_filesystem(&cifs_fs_type);
1206 cifs_dfs_release_automount_timer(); 1204 cifs_dfs_release_automount_timer();
1207#ifdef CONFIG_CIFS_ACL 1205#ifdef CONFIG_CIFS_ACL
1208 cifs_destroy_idmaptrees();
1209 exit_cifs_idmap(); 1206 exit_cifs_idmap();
1210#endif 1207#endif
1211#ifdef CONFIG_CIFS_UPCALL 1208#ifdef CONFIG_CIFS_UPCALL
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f5af2527fc69..e6899cea1c35 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -178,6 +178,7 @@ struct smb_rqst {
178 178
179enum smb_version { 179enum smb_version {
180 Smb_1 = 1, 180 Smb_1 = 1,
181 Smb_20,
181 Smb_21, 182 Smb_21,
182 Smb_30, 183 Smb_30,
183}; 184};
@@ -280,9 +281,6 @@ struct smb_version_operations {
280 /* set attributes */ 281 /* set attributes */
281 int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *, 282 int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *,
282 const unsigned int); 283 const unsigned int);
283 /* build a full path to the root of the mount */
284 char * (*build_path_to_root)(struct smb_vol *, struct cifs_sb_info *,
285 struct cifs_tcon *);
286 /* check if we can send an echo or not */ 284 /* check if we can send an echo or not */
287 bool (*can_echo)(struct TCP_Server_Info *); 285 bool (*can_echo)(struct TCP_Server_Info *);
288 /* send echo request */ 286 /* send echo request */
@@ -369,6 +367,8 @@ struct smb_version_operations {
369 void (*set_lease_key)(struct inode *, struct cifs_fid *fid); 367 void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
370 /* generate new lease key */ 368 /* generate new lease key */
371 void (*new_lease_key)(struct cifs_fid *fid); 369 void (*new_lease_key)(struct cifs_fid *fid);
370 int (*calc_signature)(struct smb_rqst *rqst,
371 struct TCP_Server_Info *server);
372}; 372};
373 373
374struct smb_version_values { 374struct smb_version_values {
@@ -386,6 +386,7 @@ struct smb_version_values {
386 unsigned int cap_unix; 386 unsigned int cap_unix;
387 unsigned int cap_nt_find; 387 unsigned int cap_nt_find;
388 unsigned int cap_large_files; 388 unsigned int cap_large_files;
389 unsigned int oplock_read;
389}; 390};
390 391
391#define HEADER_SIZE(server) (server->vals->header_size) 392#define HEADER_SIZE(server) (server->vals->header_size)
@@ -396,7 +397,6 @@ struct smb_vol {
396 char *password; 397 char *password;
397 char *domainname; 398 char *domainname;
398 char *UNC; 399 char *UNC;
399 char *UNCip;
400 char *iocharset; /* local code page for mapping to and from Unicode */ 400 char *iocharset; /* local code page for mapping to and from Unicode */
401 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ 401 char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
402 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ 402 char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
@@ -444,11 +444,11 @@ struct smb_vol {
444 unsigned int rsize; 444 unsigned int rsize;
445 unsigned int wsize; 445 unsigned int wsize;
446 bool sockopt_tcp_nodelay:1; 446 bool sockopt_tcp_nodelay:1;
447 unsigned short int port;
448 unsigned long actimeo; /* attribute cache timeout (jiffies) */ 447 unsigned long actimeo; /* attribute cache timeout (jiffies) */
449 struct smb_version_operations *ops; 448 struct smb_version_operations *ops;
450 struct smb_version_values *vals; 449 struct smb_version_values *vals;
451 char *prepath; 450 char *prepath;
451 struct sockaddr_storage dstaddr; /* destination address */
452 struct sockaddr_storage srcaddr; /* allow binding to a local IP */ 452 struct sockaddr_storage srcaddr; /* allow binding to a local IP */
453 struct nls_table *local_nls; 453 struct nls_table *local_nls;
454}; 454};
@@ -1067,30 +1067,16 @@ static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
1067static inline void 1067static inline void
1068convert_delimiter(char *path, char delim) 1068convert_delimiter(char *path, char delim)
1069{ 1069{
1070 int i; 1070 char old_delim, *pos;
1071 char old_delim;
1072
1073 if (path == NULL)
1074 return;
1075 1071
1076 if (delim == '/') 1072 if (delim == '/')
1077 old_delim = '\\'; 1073 old_delim = '\\';
1078 else 1074 else
1079 old_delim = '/'; 1075 old_delim = '/';
1080 1076
1081 for (i = 0; path[i] != '\0'; i++) { 1077 pos = path;
1082 if (path[i] == old_delim) 1078 while ((pos = strchr(pos, old_delim)))
1083 path[i] = delim; 1079 *pos = delim;
1084 }
1085}
1086
1087static inline char *
1088build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
1089 struct cifs_tcon *tcon)
1090{
1091 if (!vol->ops->build_path_to_root)
1092 return NULL;
1093 return vol->ops->build_path_to_root(vol, cifs_sb, tcon);
1094} 1080}
1095 1081
1096#ifdef CONFIG_CIFS_STATS 1082#ifdef CONFIG_CIFS_STATS
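The new convert_delimiter() swaps the indexed scan for a strchr() loop and drops the NULL check, since callers always pass a valid path. The same loop runs verbatim in userspace:

#include <stdio.h>
#include <string.h>

static void convert_delimiter(char *path, char delim)
{
        char old_delim, *pos;

        old_delim = (delim == '/') ? '\\' : '/';

        pos = path;
        while ((pos = strchr(pos, old_delim)))
                *pos = delim;   /* strchr resumes from the replaced char */
}

int main(void)
{
        char path[] = "//srv/share/dir";

        convert_delimiter(path, '\\');
        printf("%s\n", path);   /* \\srv\share\dir */
        return 0;
}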
@@ -1362,7 +1348,7 @@ require use of the stronger protocol */
1362#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ 1348#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
1363#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ 1349#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
1364 1350
1365#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) 1351#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP)
1366#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) 1352#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
1367#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) 1353#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
1368/* 1354/*
@@ -1506,6 +1492,6 @@ extern struct smb_version_values smb20_values;
1506extern struct smb_version_operations smb21_operations; 1492extern struct smb_version_operations smb21_operations;
1507extern struct smb_version_values smb21_values; 1493extern struct smb_version_values smb21_values;
1508#define SMB30_VERSION_STRING "3.0" 1494#define SMB30_VERSION_STRING "3.0"
1509/*extern struct smb_version_operations smb30_operations; */ /* not needed yet */ 1495extern struct smb_version_operations smb30_operations;
1510extern struct smb_version_values smb30_values; 1496extern struct smb_version_values smb30_values;
1511#endif /* _CIFS_GLOB_H */ 1497#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5144e9fbeb8c..1988c1baa224 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -58,8 +58,10 @@ do { \
58} while (0) 58} while (0)
59extern int init_cifs_idmap(void); 59extern int init_cifs_idmap(void);
60extern void exit_cifs_idmap(void); 60extern void exit_cifs_idmap(void);
61extern void cifs_destroy_idmaptrees(void);
62extern char *build_path_from_dentry(struct dentry *); 61extern char *build_path_from_dentry(struct dentry *);
62extern char *cifs_build_path_to_root(struct smb_vol *vol,
63 struct cifs_sb_info *cifs_sb,
64 struct cifs_tcon *tcon);
63extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 65extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
64extern char *cifs_compose_mount_options(const char *sb_mountdata, 66extern char *cifs_compose_mount_options(const char *sb_mountdata,
65 const char *fullpath, const struct dfs_info3_param *ref, 67 const char *fullpath, const struct dfs_info3_param *ref,
@@ -107,9 +109,7 @@ extern unsigned int smbCalcSize(void *buf);
107extern int decode_negTokenInit(unsigned char *security_blob, int length, 109extern int decode_negTokenInit(unsigned char *security_blob, int length,
108 struct TCP_Server_Info *server); 110 struct TCP_Server_Info *server);
109extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); 111extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
110extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); 112extern void cifs_set_port(struct sockaddr *addr, const unsigned short int port);
111extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
112 const unsigned short int port);
113extern int map_smb_to_linux_error(char *buf, bool logErr); 113extern int map_smb_to_linux_error(char *buf, bool logErr);
114extern void header_assemble(struct smb_hdr *, char /* command */ , 114extern void header_assemble(struct smb_hdr *, char /* command */ ,
115 const struct cifs_tcon *, int /* length of 115 const struct cifs_tcon *, int /* length of
@@ -185,7 +185,7 @@ extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
185extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, 185extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
186 __u64 length, __u8 type, 186 __u64 length, __u8 type,
187 struct cifsLockInfo **conf_lock, 187 struct cifsLockInfo **conf_lock,
188 bool rw_check); 188 int rw_check);
189extern void cifs_add_pending_open(struct cifs_fid *fid, 189extern void cifs_add_pending_open(struct cifs_fid *fid,
190 struct tcon_link *tlink, 190 struct tcon_link *tlink,
191 struct cifs_pending_open *open); 191 struct cifs_pending_open *open);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 5c670b998ffb..12b3da39733b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -186,6 +186,7 @@ static const match_table_t cifs_mount_option_tokens = {
186 { Opt_user, "user=%s" }, 186 { Opt_user, "user=%s" },
187 { Opt_user, "username=%s" }, 187 { Opt_user, "username=%s" },
188 { Opt_blank_pass, "pass=" }, 188 { Opt_blank_pass, "pass=" },
189 { Opt_blank_pass, "password=" },
189 { Opt_pass, "pass=%s" }, 190 { Opt_pass, "pass=%s" },
190 { Opt_pass, "password=%s" }, 191 { Opt_pass, "password=%s" },
191 { Opt_blank_ip, "ip=" }, 192 { Opt_blank_ip, "ip=" },
@@ -274,6 +275,7 @@ static const match_table_t cifs_cacheflavor_tokens = {
274 275
275static const match_table_t cifs_smb_version_tokens = { 276static const match_table_t cifs_smb_version_tokens = {
276 { Smb_1, SMB1_VERSION_STRING }, 277 { Smb_1, SMB1_VERSION_STRING },
278 { Smb_20, SMB20_VERSION_STRING},
277 { Smb_21, SMB21_VERSION_STRING }, 279 { Smb_21, SMB21_VERSION_STRING },
278 { Smb_30, SMB30_VERSION_STRING }, 280 { Smb_30, SMB30_VERSION_STRING },
279}; 281};
@@ -1074,12 +1076,16 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
1074 vol->vals = &smb1_values; 1076 vol->vals = &smb1_values;
1075 break; 1077 break;
1076#ifdef CONFIG_CIFS_SMB2 1078#ifdef CONFIG_CIFS_SMB2
1079 case Smb_20:
1080 vol->ops = &smb21_operations; /* currently identical with 2.1 */
1081 vol->vals = &smb20_values;
1082 break;
1077 case Smb_21: 1083 case Smb_21:
1078 vol->ops = &smb21_operations; 1084 vol->ops = &smb21_operations;
1079 vol->vals = &smb21_values; 1085 vol->vals = &smb21_values;
1080 break; 1086 break;
1081 case Smb_30: 1087 case Smb_30:
1082 vol->ops = &smb21_operations; /* currently identical with 2.1 */ 1088 vol->ops = &smb30_operations;
1083 vol->vals = &smb30_values; 1089 vol->vals = &smb30_values;
1084 break; 1090 break;
1085#endif 1091#endif
@@ -1090,6 +1096,52 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
1090 return 0; 1096 return 0;
1091} 1097}
1092 1098
1099/*
1100 * Parse a devname into substrings and populate the vol->UNC and vol->prepath
1101 * fields with the result. Returns 0 on success and an error otherwise.
1102 */
1103static int
1104cifs_parse_devname(const char *devname, struct smb_vol *vol)
1105{
1106 char *pos;
1107 const char *delims = "/\\";
1108 size_t len;
1109
1110 /* make sure we have a valid UNC double delimiter prefix */
1111 len = strspn(devname, delims);
1112 if (len != 2)
1113 return -EINVAL;
1114
1115 /* find delimiter between host and sharename */
1116 pos = strpbrk(devname + 2, delims);
1117 if (!pos)
1118 return -EINVAL;
1119
1120 /* skip past delimiter */
1121 ++pos;
1122
1123 /* now go until next delimiter or end of string */
1124 len = strcspn(pos, delims);
1125
1126 /* move "pos" up to delimiter or NULL */
1127 pos += len;
1128 vol->UNC = kstrndup(devname, pos - devname, GFP_KERNEL);
1129 if (!vol->UNC)
1130 return -ENOMEM;
1131
1132 convert_delimiter(vol->UNC, '\\');
1133
1134 /* If pos is NULL, or is a bogus trailing delimiter then no prepath */
1135 if (!*pos++ || !*pos)
1136 return 0;
1137
1138 vol->prepath = kstrdup(pos, GFP_KERNEL);
1139 if (!vol->prepath)
1140 return -ENOMEM;
1141
1142 return 0;
1143}
1144
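cifs_parse_devname() carves the device string with three libc string primitives: strspn() to insist on exactly two leading delimiters, strpbrk() to find the host/share boundary, and strcspn() to measure the share name. A runnable userspace restatement (strndup/strdup replace kstrndup/kstrdup, and the UNC delimiter conversion is left out):

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* split "//host/share[/prepath]" into UNC and prepath parts;
 * returns 0 on success, -1 on a malformed devname */
static int parse_devname(const char *devname, char **unc, char **prepath)
{
        const char *delims = "/\\";
        const char *pos;
        size_t len;

        *unc = *prepath = NULL;

        len = strspn(devname, delims);          /* "//" or "\\\\" prefix */
        if (len != 2)
                return -1;

        pos = strpbrk(devname + 2, delims);     /* host/share boundary */
        if (!pos)
                return -1;
        ++pos;

        len = strcspn(pos, delims);             /* share name length */
        pos += len;
        *unc = strndup(devname, pos - devname);
        if (!*unc)
                return -1;

        /* anything past one more delimiter is the prefix path */
        if (!*pos++ || !*pos)
                return 0;
        *prepath = strdup(pos);
        return *prepath ? 0 : -1;
}

int main(void)
{
        char *unc, *prepath;

        if (parse_devname("//server/share/dir/sub", &unc, &prepath) == 0)
                printf("UNC=%s prepath=%s\n", unc, prepath ? prepath : "");
        free(unc);
        free(prepath);
        return 0;
}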
1093static int 1145static int
1094cifs_parse_mount_options(const char *mountdata, const char *devname, 1146cifs_parse_mount_options(const char *mountdata, const char *devname,
1095 struct smb_vol *vol) 1147 struct smb_vol *vol)
@@ -1108,11 +1160,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1108 char *string = NULL; 1160 char *string = NULL;
1109 char *tmp_end, *value; 1161 char *tmp_end, *value;
1110 char delim; 1162 char delim;
1163 bool got_ip = false;
1164 unsigned short port = 0;
1165 struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr;
1111 1166
1112 separator[0] = ','; 1167 separator[0] = ',';
1113 separator[1] = 0; 1168 separator[1] = 0;
1114 delim = separator[0]; 1169 delim = separator[0];
1115 1170
1171 /* ensure we always start with zeroed-out smb_vol */
1172 memset(vol, 0, sizeof(*vol));
1173
1116 /* 1174 /*
1117 * does not have to be perfect mapping since field is 1175 * does not have to be perfect mapping since field is
1118 * informational, only used for servers that do not support 1176 * informational, only used for servers that do not support
@@ -1169,6 +1227,16 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1169 vol->backupuid_specified = false; /* no backup intent for a user */ 1227 vol->backupuid_specified = false; /* no backup intent for a user */
1170 vol->backupgid_specified = false; /* no backup intent for a group */ 1228 vol->backupgid_specified = false; /* no backup intent for a group */
1171 1229
1230 /*
1231 * For now, we ignore -EINVAL errors under the assumption that the
1232 * unc= and prefixpath= options will be usable.
1233 */
1234 if (cifs_parse_devname(devname, vol) == -ENOMEM) {
1235 printk(KERN_ERR "CIFS: Unable to allocate memory to parse "
1236 "device string.\n");
1237 goto out_nomem;
1238 }
1239
1172 while ((data = strsep(&options, separator)) != NULL) { 1240 while ((data = strsep(&options, separator)) != NULL) {
1173 substring_t args[MAX_OPT_ARGS]; 1241 substring_t args[MAX_OPT_ARGS];
1174 unsigned long option; 1242 unsigned long option;
@@ -1416,12 +1484,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1416 vol->dir_mode = option; 1484 vol->dir_mode = option;
1417 break; 1485 break;
1418 case Opt_port: 1486 case Opt_port:
1419 if (get_option_ul(args, &option)) { 1487 if (get_option_ul(args, &option) ||
1420 cERROR(1, "%s: Invalid port value", 1488 option > USHRT_MAX) {
1421 __func__); 1489 cERROR(1, "%s: Invalid port value", __func__);
1422 goto cifs_parse_mount_err; 1490 goto cifs_parse_mount_err;
1423 } 1491 }
1424 vol->port = option; 1492 port = (unsigned short)option;
1425 break; 1493 break;
1426 case Opt_rsize: 1494 case Opt_rsize:
1427 if (get_option_ul(args, &option)) { 1495 if (get_option_ul(args, &option)) {
@@ -1537,53 +1605,45 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1537 vol->password[j] = '\0'; 1605 vol->password[j] = '\0';
1538 break; 1606 break;
1539 case Opt_blank_ip: 1607 case Opt_blank_ip:
1540 vol->UNCip = NULL; 1608 /* FIXME: should this be an error instead? */
1609 got_ip = false;
1541 break; 1610 break;
1542 case Opt_ip: 1611 case Opt_ip:
1543 string = match_strdup(args); 1612 string = match_strdup(args);
1544 if (string == NULL) 1613 if (string == NULL)
1545 goto out_nomem; 1614 goto out_nomem;
1546 1615
1547 if (strnlen(string, INET6_ADDRSTRLEN) > 1616 if (!cifs_convert_address(dstaddr, string,
1548 INET6_ADDRSTRLEN) { 1617 strlen(string))) {
1549 printk(KERN_WARNING "CIFS: ip address " 1618 printk(KERN_ERR "CIFS: bad ip= option (%s).\n",
1550 "too long\n"); 1619 string);
1551 goto cifs_parse_mount_err;
1552 }
1553 vol->UNCip = kstrdup(string, GFP_KERNEL);
1554 if (!vol->UNCip) {
1555 printk(KERN_WARNING "CIFS: no memory "
1556 "for UNC IP\n");
1557 goto cifs_parse_mount_err; 1620 goto cifs_parse_mount_err;
1558 } 1621 }
1622 got_ip = true;
1559 break; 1623 break;
1560 case Opt_unc: 1624 case Opt_unc:
1561 string = match_strdup(args); 1625 string = vol->UNC;
1562 if (string == NULL) 1626 vol->UNC = match_strdup(args);
1627 if (vol->UNC == NULL)
1563 goto out_nomem; 1628 goto out_nomem;
1564 1629
1565 temp_len = strnlen(string, 300); 1630 convert_delimiter(vol->UNC, '\\');
1566 if (temp_len == 300) { 1631 if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') {
1567 printk(KERN_WARNING "CIFS: UNC name too long\n"); 1632 printk(KERN_ERR "CIFS: UNC Path does not "
1568 goto cifs_parse_mount_err; 1633 "begin with // or \\\\\n");
1569 }
1570
1571 vol->UNC = kmalloc(temp_len+1, GFP_KERNEL);
1572 if (vol->UNC == NULL) {
1573 printk(KERN_WARNING "CIFS: no memory for UNC\n");
1574 goto cifs_parse_mount_err;
1575 }
1576 strcpy(vol->UNC, string);
1577
1578 if (strncmp(string, "//", 2) == 0) {
1579 vol->UNC[0] = '\\';
1580 vol->UNC[1] = '\\';
1581 } else if (strncmp(string, "\\\\", 2) != 0) {
1582 printk(KERN_WARNING "CIFS: UNC Path does not "
1583 "begin with // or \\\\\n");
1584 goto cifs_parse_mount_err; 1634 goto cifs_parse_mount_err;
1585 } 1635 }
1586 1636
1637 /* Compare old unc= option to new one */
1638 if (!string || strcmp(string, vol->UNC))
1639 printk(KERN_WARNING "CIFS: the value of the "
1640 "unc= mount option does not match the "
1641 "device string. Using the unc= option "
1642 "for now. In 3.10, that option will "
1643 "be ignored and the contents of the "
1644 "device string will be used "
1645 "instead. (%s != %s)\n", string,
1646 vol->UNC);
1587 break; 1647 break;
1588 case Opt_domain: 1648 case Opt_domain:
1589 string = match_strdup(args); 1649 string = match_strdup(args);
@@ -1618,31 +1678,24 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1618 } 1678 }
1619 break; 1679 break;
1620 case Opt_prefixpath: 1680 case Opt_prefixpath:
1621 string = match_strdup(args); 1681 /* skip over any leading delimiter */
1622 if (string == NULL) 1682 if (*args[0].from == '/' || *args[0].from == '\\')
1623 goto out_nomem; 1683 args[0].from++;
1624
1625 temp_len = strnlen(string, 1024);
1626 if (string[0] != '/')
1627 temp_len++; /* missing leading slash */
1628 if (temp_len > 1024) {
1629 printk(KERN_WARNING "CIFS: prefix too long\n");
1630 goto cifs_parse_mount_err;
1631 }
1632
1633 vol->prepath = kmalloc(temp_len+1, GFP_KERNEL);
1634 if (vol->prepath == NULL) {
1635 printk(KERN_WARNING "CIFS: no memory "
1636 "for path prefix\n");
1637 goto cifs_parse_mount_err;
1638 }
1639
1640 if (string[0] != '/') {
1641 vol->prepath[0] = '/';
1642 strcpy(vol->prepath+1, string);
1643 } else
1644 strcpy(vol->prepath, string);
1645 1684
1685 string = vol->prepath;
1686 vol->prepath = match_strdup(args);
1687 if (vol->prepath == NULL)
1688 goto out_nomem;
1689 /* Compare old prefixpath= option to new one */
1690 if (!string || strcmp(string, vol->prepath))
1691 printk(KERN_WARNING "CIFS: the value of the "
1692 "prefixpath= mount option does not "
1693 "match the device string. Using the "
1694 "prefixpath= option for now. In 3.10, "
1695 "that option will be ignored and the "
1696 "contents of the device string will be "
1697 "used instead.(%s != %s)\n", string,
1698 vol->prepath);
1646 break; 1699 break;
1647 case Opt_iocharset: 1700 case Opt_iocharset:
1648 string = match_strdup(args); 1701 string = match_strdup(args);
@@ -1799,9 +1852,30 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1799 goto cifs_parse_mount_err; 1852 goto cifs_parse_mount_err;
1800 } 1853 }
1801#endif 1854#endif
1855 if (!vol->UNC) {
1856 cERROR(1, "CIFS mount error: No usable UNC path provided in "
1857 "device string or in unc= option!");
1858 goto cifs_parse_mount_err;
1859 }
1802 1860
1803 if (vol->UNCip == NULL) 1861 /* make sure UNC has a share name */
1804 vol->UNCip = &vol->UNC[2]; 1862 if (!strchr(vol->UNC + 3, '\\')) {
1863 cERROR(1, "Malformed UNC. Unable to find share name.");
1864 goto cifs_parse_mount_err;
1865 }
1866
1867 if (!got_ip) {
1868 /* No ip= option specified? Try to get it from UNC */
1869 if (!cifs_convert_address(dstaddr, &vol->UNC[2],
1870 strlen(&vol->UNC[2]))) {
1871 printk(KERN_ERR "Unable to determine destination "
1872 "address.\n");
1873 goto cifs_parse_mount_err;
1874 }
1875 }
1876
1877 /* set the port that we got earlier */
1878 cifs_set_port(dstaddr, port);
1805 1879
1806 if (uid_specified) 1880 if (uid_specified)
1807 vol->override_uid = override_uid; 1881 vol->override_uid = override_uid;
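Because the port now lives in a local unsigned short that is applied once at the end via cifs_set_port(), the parser must range-check the raw unsigned long before narrowing. A self-contained sketch of that validation (strtoul stands in for the kernel's get_option_ul):

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <errno.h>

/* returns 0 and stores the port, or -1 for a bad value */
static int parse_port(const char *s, unsigned short *port)
{
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(s, &end, 10);
        if (errno || *end != '\0' || val > USHRT_MAX)
                return -1;      /* reject overflow before narrowing */
        *port = (unsigned short)val;
        return 0;
}

int main(void)
{
        unsigned short port;

        printf("%d\n", parse_port("445", &port));       /* 0, port = 445 */
        printf("%d\n", parse_port("70000", &port));     /* -1, > USHRT_MAX */
        return 0;
}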
@@ -1843,7 +1917,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
1843 } 1917 }
1844 case AF_INET6: { 1918 case AF_INET6: {
1845 struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; 1919 struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
1846 struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs; 1920 struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
1847 return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr); 1921 return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
1848 } 1922 }
1849 default: 1923 default:
@@ -1972,9 +2046,10 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
1972 return true; 2046 return true;
1973} 2047}
1974 2048
1975static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr, 2049static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
1976 struct smb_vol *vol)
1977{ 2050{
2051 struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
2052
1978 if ((server->vals != vol->vals) || (server->ops != vol->ops)) 2053 if ((server->vals != vol->vals) || (server->ops != vol->ops))
1979 return 0; 2054 return 0;
1980 2055
@@ -1995,13 +2070,13 @@ static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr,
1995} 2070}
1996 2071
1997static struct TCP_Server_Info * 2072static struct TCP_Server_Info *
1998cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) 2073cifs_find_tcp_session(struct smb_vol *vol)
1999{ 2074{
2000 struct TCP_Server_Info *server; 2075 struct TCP_Server_Info *server;
2001 2076
2002 spin_lock(&cifs_tcp_ses_lock); 2077 spin_lock(&cifs_tcp_ses_lock);
2003 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { 2078 list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
2004 if (!match_server(server, addr, vol)) 2079 if (!match_server(server, vol))
2005 continue; 2080 continue;
2006 2081
2007 ++server->srv_count; 2082 ++server->srv_count;
@@ -2051,40 +2126,12 @@ static struct TCP_Server_Info *
2051cifs_get_tcp_session(struct smb_vol *volume_info) 2126cifs_get_tcp_session(struct smb_vol *volume_info)
2052{ 2127{
2053 struct TCP_Server_Info *tcp_ses = NULL; 2128 struct TCP_Server_Info *tcp_ses = NULL;
2054 struct sockaddr_storage addr;
2055 struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
2056 struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
2057 int rc; 2129 int rc;
2058 2130
2059 memset(&addr, 0, sizeof(struct sockaddr_storage)); 2131 cFYI(1, "UNC: %s", volume_info->UNC);
2060
2061 cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
2062
2063 if (volume_info->UNCip && volume_info->UNC) {
2064 rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
2065 volume_info->UNCip,
2066 strlen(volume_info->UNCip),
2067 volume_info->port);
2068 if (!rc) {
2069 /* we failed translating address */
2070 rc = -EINVAL;
2071 goto out_err;
2072 }
2073 } else if (volume_info->UNCip) {
2074 /* BB using ip addr as tcp_ses name to connect to the
2075 DFS root below */
2076 cERROR(1, "Connecting to DFS root not implemented yet");
2077 rc = -EINVAL;
2078 goto out_err;
2079 } else /* which tcp_sess DFS root would we connect to */ {
2080 cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
2081 "unc=//192.168.1.100/public) specified");
2082 rc = -EINVAL;
2083 goto out_err;
2084 }
2085 2132
2086 /* see if we already have a matching tcp_ses */ 2133 /* see if we already have a matching tcp_ses */
2087 tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info); 2134 tcp_ses = cifs_find_tcp_session(volume_info);
2088 if (tcp_ses) 2135 if (tcp_ses)
2089 return tcp_ses; 2136 return tcp_ses;
2090 2137
@@ -2129,27 +2176,18 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
2129 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); 2176 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
2130 INIT_LIST_HEAD(&tcp_ses->smb_ses_list); 2177 INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
2131 INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); 2178 INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
2132 2179 memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
2180 sizeof(tcp_ses->srcaddr));
2181 memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
2182 sizeof(tcp_ses->dstaddr));
2133 /* 2183 /*
2134 * at this point we are the only ones with the pointer 2184 * at this point we are the only ones with the pointer
2135 * to the struct since the kernel thread is not created yet 2185 * to the struct since the kernel thread is not created yet
2136 * no need to spinlock this init of tcpStatus or srv_count 2186 * no need to spinlock this init of tcpStatus or srv_count
2137 */ 2187 */
2138 tcp_ses->tcpStatus = CifsNew; 2188 tcp_ses->tcpStatus = CifsNew;
2139 memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
2140 sizeof(tcp_ses->srcaddr));
2141 ++tcp_ses->srv_count; 2189 ++tcp_ses->srv_count;
2142 2190
2143 if (addr.ss_family == AF_INET6) {
2144 cFYI(1, "attempting ipv6 connect");
2145 /* BB should we allow ipv6 on port 139? */
2146 /* other OS never observed in Wild doing 139 with v6 */
2147 memcpy(&tcp_ses->dstaddr, sin_server6,
2148 sizeof(struct sockaddr_in6));
2149 } else
2150 memcpy(&tcp_ses->dstaddr, sin_server,
2151 sizeof(struct sockaddr_in));
2152
2153 rc = ip_connect(tcp_ses); 2191 rc = ip_connect(tcp_ses);
2154 if (rc < 0) { 2192 if (rc < 0) {
2155 cERROR(1, "Error connecting to socket. Aborting operation"); 2193 cERROR(1, "Error connecting to socket. Aborting operation");
@@ -2397,8 +2435,6 @@ cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
2397} 2435}
2398#endif /* CONFIG_KEYS */ 2436#endif /* CONFIG_KEYS */
2399 2437
2400static bool warned_on_ntlm; /* globals init to false automatically */
2401
2402static struct cifs_ses * 2438static struct cifs_ses *
2403cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) 2439cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2404{ 2440{
@@ -2475,14 +2511,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2475 ses->cred_uid = volume_info->cred_uid; 2511 ses->cred_uid = volume_info->cred_uid;
2476 ses->linux_uid = volume_info->linux_uid; 2512 ses->linux_uid = volume_info->linux_uid;
2477 2513
2478 /* ntlmv2 is much stronger than ntlm security, and has been broadly
2479 supported for many years, time to update default security mechanism */
2480 if ((volume_info->secFlg == 0) && warned_on_ntlm == false) {
2481 warned_on_ntlm = true;
2482 cERROR(1, "default security mechanism requested. The default "
2483 "security mechanism will be upgraded from ntlm to "
2484 "ntlmv2 in kernel release 3.3");
2485 }
2486 ses->overrideSecFlg = volume_info->secFlg; 2514 ses->overrideSecFlg = volume_info->secFlg;
2487 2515
2488 mutex_lock(&ses->session_mutex); 2516 mutex_lock(&ses->session_mutex);
@@ -2598,13 +2626,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
2598 } 2626 }
2599 } 2627 }
2600 2628
2601 if (strchr(volume_info->UNC + 3, '\\') == NULL
2602 && strchr(volume_info->UNC + 3, '/') == NULL) {
2603 cERROR(1, "Missing share name");
2604 rc = -ENODEV;
2605 goto out_fail;
2606 }
2607
2608 /* 2629 /*
2609 * BB Do we need to wrap session_mutex around this TCon call and Unix 2630 * BB Do we need to wrap session_mutex around this TCon call and Unix
2610 * SetFS as we do on SessSetup and reconnect? 2631 * SetFS as we do on SessSetup and reconnect?
@@ -2718,11 +2739,8 @@ cifs_match_super(struct super_block *sb, void *data)
2718 struct cifs_ses *ses; 2739 struct cifs_ses *ses;
2719 struct cifs_tcon *tcon; 2740 struct cifs_tcon *tcon;
2720 struct tcon_link *tlink; 2741 struct tcon_link *tlink;
2721 struct sockaddr_storage addr;
2722 int rc = 0; 2742 int rc = 0;
2723 2743
2724 memset(&addr, 0, sizeof(struct sockaddr_storage));
2725
2726 spin_lock(&cifs_tcp_ses_lock); 2744 spin_lock(&cifs_tcp_ses_lock);
2727 cifs_sb = CIFS_SB(sb); 2745 cifs_sb = CIFS_SB(sb);
2728 tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); 2746 tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
@@ -2736,17 +2754,7 @@ cifs_match_super(struct super_block *sb, void *data)
2736 2754
2737 volume_info = mnt_data->vol; 2755 volume_info = mnt_data->vol;
2738 2756
2739 if (!volume_info->UNCip || !volume_info->UNC) 2757 if (!match_server(tcp_srv, volume_info) ||
2740 goto out;
2741
2742 rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
2743 volume_info->UNCip,
2744 strlen(volume_info->UNCip),
2745 volume_info->port);
2746 if (!rc)
2747 goto out;
2748
2749 if (!match_server(tcp_srv, (struct sockaddr *)&addr, volume_info) ||
2750 !match_session(ses, volume_info) || 2758 !match_session(ses, volume_info) ||
2751 !match_tcon(tcon, volume_info->UNC)) { 2759 !match_tcon(tcon, volume_info->UNC)) {
2752 rc = 0; 2760 rc = 0;
@@ -3261,8 +3269,6 @@ cleanup_volume_info_contents(struct smb_vol *volume_info)
3261{ 3269{
3262 kfree(volume_info->username); 3270 kfree(volume_info->username);
3263 kzfree(volume_info->password); 3271 kzfree(volume_info->password);
3264 if (volume_info->UNCip != volume_info->UNC + 2)
3265 kfree(volume_info->UNCip);
3266 kfree(volume_info->UNC); 3272 kfree(volume_info->UNC);
3267 kfree(volume_info->domainname); 3273 kfree(volume_info->domainname);
3268 kfree(volume_info->iocharset); 3274 kfree(volume_info->iocharset);
@@ -3280,14 +3286,16 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info)
3280 3286
3281 3287
3282#ifdef CONFIG_CIFS_DFS_UPCALL 3288#ifdef CONFIG_CIFS_DFS_UPCALL
3283/* build_path_to_root returns full path to root when 3289/*
3284 * we do not have an existing connection (tcon) */ 3290 * cifs_build_path_to_root returns full path to root when we do not have an
3291 * existing connection (tcon)
3292 */
3285static char * 3293static char *
3286build_unc_path_to_root(const struct smb_vol *vol, 3294build_unc_path_to_root(const struct smb_vol *vol,
3287 const struct cifs_sb_info *cifs_sb) 3295 const struct cifs_sb_info *cifs_sb)
3288{ 3296{
3289 char *full_path, *pos; 3297 char *full_path, *pos;
3290 unsigned int pplen = vol->prepath ? strlen(vol->prepath) : 0; 3298 unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
3291 unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); 3299 unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1);
3292 3300
3293 full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); 3301 full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
@@ -3298,6 +3306,7 @@ build_unc_path_to_root(const struct smb_vol *vol,
3298 pos = full_path + unc_len; 3306 pos = full_path + unc_len;
3299 3307
3300 if (pplen) { 3308 if (pplen) {
3309 *pos++ = CIFS_DIR_SEP(cifs_sb);
3301 strncpy(pos, vol->prepath, pplen); 3310 strncpy(pos, vol->prepath, pplen);
3302 pos += pplen; 3311 pos += pplen;
3303 } 3312 }
@@ -3353,7 +3362,6 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
3353 mdata = NULL; 3362 mdata = NULL;
3354 } else { 3363 } else {
3355 cleanup_volume_info_contents(volume_info); 3364 cleanup_volume_info_contents(volume_info);
3356 memset(volume_info, '\0', sizeof(*volume_info));
3357 rc = cifs_setup_volume_info(volume_info, mdata, 3365 rc = cifs_setup_volume_info(volume_info, mdata,
3358 fake_devname); 3366 fake_devname);
3359 } 3367 }
@@ -3375,7 +3383,6 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
3375 if (cifs_parse_mount_options(mount_data, devname, volume_info)) 3383 if (cifs_parse_mount_options(mount_data, devname, volume_info))
3376 return -EINVAL; 3384 return -EINVAL;
3377 3385
3378
3379 if (volume_info->nullauth) { 3386 if (volume_info->nullauth) {
3380 cFYI(1, "Anonymous login"); 3387 cFYI(1, "Anonymous login");
3381 kfree(volume_info->username); 3388 kfree(volume_info->username);
@@ -3412,7 +3419,7 @@ cifs_get_volume_info(char *mount_data, const char *devname)
3412 int rc; 3419 int rc;
3413 struct smb_vol *volume_info; 3420 struct smb_vol *volume_info;
3414 3421
3415 volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); 3422 volume_info = kmalloc(sizeof(struct smb_vol), GFP_KERNEL);
3416 if (!volume_info) 3423 if (!volume_info)
3417 return ERR_PTR(-ENOMEM); 3424 return ERR_PTR(-ENOMEM);
3418 3425
@@ -3537,8 +3544,10 @@ remote_path_check:
3537 rc = -ENOSYS; 3544 rc = -ENOSYS;
3538 goto mount_fail_check; 3545 goto mount_fail_check;
3539 } 3546 }
3540 /* build_path_to_root works only when we have a valid tcon */ 3547 /*
3541 full_path = build_path_to_root(volume_info, cifs_sb, tcon); 3548 * cifs_build_path_to_root works only when we have a valid tcon
3549 */
3550 full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon);
3542 if (full_path == NULL) { 3551 if (full_path == NULL) {
3543 rc = -ENOMEM; 3552 rc = -ENOMEM;
3544 goto mount_fail_check; 3553 goto mount_fail_check;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index d3671f2acb29..8719bbe0dcc3 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -44,6 +44,38 @@ renew_parental_timestamps(struct dentry *direntry)
44 } while (!IS_ROOT(direntry)); 44 } while (!IS_ROOT(direntry));
45} 45}
46 46
47char *
48cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
49 struct cifs_tcon *tcon)
50{
51 int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
52 int dfsplen;
53 char *full_path = NULL;
54
55 /* if no prefix path, simply set path to the root of share to "" */
56 if (pplen == 0) {
57 full_path = kzalloc(1, GFP_KERNEL);
58 return full_path;
59 }
60
61 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
62 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
63 else
64 dfsplen = 0;
65
66 full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
67 if (full_path == NULL)
68 return full_path;
69
70 if (dfsplen)
71 strncpy(full_path, tcon->treeName, dfsplen);
72 full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb);
73 strncpy(full_path + dfsplen + 1, vol->prepath, pplen);
74 convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
75 full_path[dfsplen + pplen] = 0; /* add trailing null */
76 return full_path;
77}
78
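cifs_build_path_to_root() concatenates an optional DFS tree name, one directory separator, and the prepath, then normalizes delimiters; note pplen already counts the separator byte. A userspace walk-through of the same buffer arithmetic (simplified arguments, not the kernel signature):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* build "<treename><sep><prepath>" with all delimiters normalized;
 * treename may be "" when the share is not in DFS */
static char *build_path_to_root(const char *treename, const char *prepath,
                                char sep)
{
        size_t dfsplen = strlen(treename);
        size_t pplen = strlen(prepath) + 1;     /* + separator byte */
        char *full_path, *p;

        if (prepath[0] == '\0')                 /* root of the share */
                return calloc(1, 1);

        full_path = malloc(dfsplen + pplen + 1);
        if (!full_path)
                return NULL;

        memcpy(full_path, treename, dfsplen);
        full_path[dfsplen] = sep;
        memcpy(full_path + dfsplen + 1, prepath, pplen - 1);
        full_path[dfsplen + pplen] = '\0';

        for (p = full_path; *p; p++)            /* normalize delimiters */
                if (*p == '/' || *p == '\\')
                        *p = sep;
        return full_path;
}

int main(void)
{
        char *path = build_path_to_root("\\\\srv\\share", "dir/sub", '\\');

        printf("%s\n", path ? path : "(oom)");  /* \\srv\share\dir\sub */
        free(path);
        return 0;
}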
47/* Note: caller must free return buffer */ 79/* Note: caller must free return buffer */
48char * 80char *
49build_path_from_dentry(struct dentry *direntry) 81build_path_from_dentry(struct dentry *direntry)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 70b6f4c3a0c1..8ea6ca50a665 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -238,6 +238,23 @@ out:
238 return rc; 238 return rc;
239} 239}
240 240
241static bool
242cifs_has_mand_locks(struct cifsInodeInfo *cinode)
243{
244 struct cifs_fid_locks *cur;
245 bool has_locks = false;
246
247 down_read(&cinode->lock_sem);
248 list_for_each_entry(cur, &cinode->llist, llist) {
249 if (!list_empty(&cur->locks)) {
250 has_locks = true;
251 break;
252 }
253 }
254 up_read(&cinode->lock_sem);
255 return has_locks;
256}
257
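cifs_has_mand_locks() only reads the per-inode lock lists, so it takes lock_sem for reading and stops at the first non-empty list. A pthread analogue of that reader-side scan (simplified types; the kernel uses an rw_semaphore and list_for_each_entry):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fid_locks {
        int nlocks;                     /* stand-in for a lock list */
        struct fid_locks *next;
};

struct inode_info {
        pthread_rwlock_t lock_sem;
        struct fid_locks *llist;        /* per-fid lock lists */
};

static bool has_mand_locks(struct inode_info *cinode)
{
        struct fid_locks *cur;
        bool has_locks = false;

        pthread_rwlock_rdlock(&cinode->lock_sem);       /* readers only */
        for (cur = cinode->llist; cur; cur = cur->next) {
                if (cur->nlocks) {
                        has_locks = true;
                        break;          /* first hit is enough */
                }
        }
        pthread_rwlock_unlock(&cinode->lock_sem);
        return has_locks;
}

int main(void)
{
        struct fid_locks a = { 0, NULL }, b = { 2, &a };
        struct inode_info ci = { PTHREAD_RWLOCK_INITIALIZER, &b };

        printf("%d\n", has_mand_locks(&ci));    /* 1 */
        return 0;
}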
241struct cifsFileInfo * 258struct cifsFileInfo *
242cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, 259cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
243 struct tcon_link *tlink, __u32 oplock) 260 struct tcon_link *tlink, __u32 oplock)
@@ -248,6 +265,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
248 struct cifsFileInfo *cfile; 265 struct cifsFileInfo *cfile;
249 struct cifs_fid_locks *fdlocks; 266 struct cifs_fid_locks *fdlocks;
250 struct cifs_tcon *tcon = tlink_tcon(tlink); 267 struct cifs_tcon *tcon = tlink_tcon(tlink);
268 struct TCP_Server_Info *server = tcon->ses->server;
251 269
252 cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); 270 cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
253 if (cfile == NULL) 271 if (cfile == NULL)
@@ -276,12 +294,22 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
276 INIT_WORK(&cfile->oplock_break, cifs_oplock_break); 294 INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
277 mutex_init(&cfile->fh_mutex); 295 mutex_init(&cfile->fh_mutex);
278 296
297 /*
298 * If the server returned a read oplock and we have mandatory brlocks,
299 * set oplock level to None.
300 */
301 if (oplock == server->vals->oplock_read &&
302 cifs_has_mand_locks(cinode)) {
303 cFYI(1, "Reset oplock val from read to None due to mand locks");
304 oplock = 0;
305 }
306
279 spin_lock(&cifs_file_list_lock); 307 spin_lock(&cifs_file_list_lock);
280 if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE) 308 if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock)
281 oplock = fid->pending_open->oplock; 309 oplock = fid->pending_open->oplock;
282 list_del(&fid->pending_open->olist); 310 list_del(&fid->pending_open->olist);
283 311
284 tlink_tcon(tlink)->ses->server->ops->set_fid(cfile, fid, oplock); 312 server->ops->set_fid(cfile, fid, oplock);
285 313
286 list_add(&cfile->tlist, &tcon->openFileList); 314 list_add(&cfile->tlist, &tcon->openFileList);
287 /* if readable file instance, put it first in list */ 315 /* if readable file instance, put it first in list */
@@ -505,16 +533,36 @@ out:
505 return rc; 533 return rc;
506} 534}
507 535
536static int cifs_push_posix_locks(struct cifsFileInfo *cfile);
537
508/* 538/*
509 * Try to reacquire byte range locks that were released when session 539 * Try to reacquire byte range locks that were released when session
510 * to server was lost 540 * to server was lost.
511 */ 541 */
512static int cifs_relock_file(struct cifsFileInfo *cifsFile) 542static int
543cifs_relock_file(struct cifsFileInfo *cfile)
513{ 544{
545 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
546 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
547 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
514 int rc = 0; 548 int rc = 0;
515 549
516 /* BB list all locks open on this file and relock */ 550 /* we are going to update can_cache_brlcks here - need a write access */
551 down_write(&cinode->lock_sem);
552 if (cinode->can_cache_brlcks) {
553 /* can cache locks - no need to push them */
554 up_write(&cinode->lock_sem);
555 return rc;
556 }
557
558 if (cap_unix(tcon->ses) &&
559 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
560 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
561 rc = cifs_push_posix_locks(cfile);
562 else
563 rc = tcon->ses->server->ops->push_mand_locks(cfile);
517 564
565 up_write(&cinode->lock_sem);
518 return rc; 566 return rc;
519} 567}
520 568
@@ -739,10 +787,15 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
739 } 787 }
740} 788}
741 789
790#define CIFS_LOCK_OP 0
791#define CIFS_READ_OP 1
792#define CIFS_WRITE_OP 2
793
794/* @rw_check : 0 - no op, 1 - read, 2 - write */
742static bool 795static bool
743cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, 796cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
744 __u64 length, __u8 type, struct cifsFileInfo *cfile, 797 __u64 length, __u8 type, struct cifsFileInfo *cfile,
745 struct cifsLockInfo **conf_lock, bool rw_check) 798 struct cifsLockInfo **conf_lock, int rw_check)
746{ 799{
747 struct cifsLockInfo *li; 800 struct cifsLockInfo *li;
748 struct cifsFileInfo *cur_cfile = fdlocks->cfile; 801 struct cifsFileInfo *cur_cfile = fdlocks->cfile;
@@ -752,9 +805,13 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
752 if (offset + length <= li->offset || 805 if (offset + length <= li->offset ||
753 offset >= li->offset + li->length) 806 offset >= li->offset + li->length)
754 continue; 807 continue;
755 if (rw_check && server->ops->compare_fids(cfile, cur_cfile) && 808 if (rw_check != CIFS_LOCK_OP && current->tgid == li->pid &&
756 current->tgid == li->pid) 809 server->ops->compare_fids(cfile, cur_cfile)) {
757 continue; 810 /* shared lock prevents write op through the same fid */
811 if (!(li->type & server->vals->shared_lock_type) ||
812 rw_check != CIFS_WRITE_OP)
813 continue;
814 }
758 if ((type & server->vals->shared_lock_type) && 815 if ((type & server->vals->shared_lock_type) &&
759 ((server->ops->compare_fids(cfile, cur_cfile) && 816 ((server->ops->compare_fids(cfile, cur_cfile) &&
760 current->tgid == li->pid) || type == li->type)) 817 current->tgid == li->pid) || type == li->type))
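The rw_check flag becomes a tri-state so that a write through a fid now conflicts with a shared (read) lock held on that same fid, while reads and plain lock operations keep the old skip-own-locks behavior. A compact userspace restatement of the new skip rule (booleans stand in for the pid and fid comparisons):

#include <stdbool.h>
#include <stdio.h>

enum { CIFS_LOCK_OP, CIFS_READ_OP, CIFS_WRITE_OP };

/* should an existing lock held by (same_fid, same_pid) be skipped
 * when checking an operation of kind rw_check? */
static bool skip_own_lock(bool same_fid, bool same_pid, bool shared_lock,
                          int rw_check)
{
        if (rw_check == CIFS_LOCK_OP || !same_fid || !same_pid)
                return false;   /* not our lock: must be checked */
        /* a shared lock still blocks a write through the same fid */
        if (shared_lock && rw_check == CIFS_WRITE_OP)
                return false;
        return true;
}

int main(void)
{
        /* read through own fid over own shared lock: skipped */
        printf("%d\n", skip_own_lock(true, true, true, CIFS_READ_OP));  /* 1 */
        /* write through own fid over own shared lock: checked */
        printf("%d\n", skip_own_lock(true, true, true, CIFS_WRITE_OP)); /* 0 */
        return 0;
}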
@@ -769,7 +826,7 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
769bool 826bool
770cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, 827cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
771 __u8 type, struct cifsLockInfo **conf_lock, 828 __u8 type, struct cifsLockInfo **conf_lock,
772 bool rw_check) 829 int rw_check)
773{ 830{
774 bool rc = false; 831 bool rc = false;
775 struct cifs_fid_locks *cur; 832 struct cifs_fid_locks *cur;
@@ -805,7 +862,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
805 down_read(&cinode->lock_sem); 862 down_read(&cinode->lock_sem);
806 863
807 exist = cifs_find_lock_conflict(cfile, offset, length, type, 864 exist = cifs_find_lock_conflict(cfile, offset, length, type,
808 &conf_lock, false); 865 &conf_lock, CIFS_LOCK_OP);
809 if (exist) { 866 if (exist) {
810 flock->fl_start = conf_lock->offset; 867 flock->fl_start = conf_lock->offset;
811 flock->fl_end = conf_lock->offset + conf_lock->length - 1; 868 flock->fl_end = conf_lock->offset + conf_lock->length - 1;
@@ -852,7 +909,7 @@ try_again:
852 down_write(&cinode->lock_sem); 909 down_write(&cinode->lock_sem);
853 910
854 exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, 911 exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
855 lock->type, &conf_lock, false); 912 lock->type, &conf_lock, CIFS_LOCK_OP);
856 if (!exist && cinode->can_cache_brlcks) { 913 if (!exist && cinode->can_cache_brlcks) {
857 list_add_tail(&lock->llist, &cfile->llist->locks); 914 list_add_tail(&lock->llist, &cfile->llist->locks);
858 up_write(&cinode->lock_sem); 915 up_write(&cinode->lock_sem);
@@ -948,7 +1005,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
948 int rc = 0, stored_rc; 1005 int rc = 0, stored_rc;
949 struct cifsLockInfo *li, *tmp; 1006 struct cifsLockInfo *li, *tmp;
950 struct cifs_tcon *tcon; 1007 struct cifs_tcon *tcon;
951 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
952 unsigned int num, max_num, max_buf; 1008 unsigned int num, max_num, max_buf;
953 LOCKING_ANDX_RANGE *buf, *cur; 1009 LOCKING_ANDX_RANGE *buf, *cur;
954 int types[] = {LOCKING_ANDX_LARGE_FILES, 1010 int types[] = {LOCKING_ANDX_LARGE_FILES,
@@ -958,21 +1014,12 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
958 xid = get_xid(); 1014 xid = get_xid();
959 tcon = tlink_tcon(cfile->tlink); 1015 tcon = tlink_tcon(cfile->tlink);
960 1016
961 /* we are going to update can_cache_brlcks here - need a write access */
962 down_write(&cinode->lock_sem);
963 if (!cinode->can_cache_brlcks) {
964 up_write(&cinode->lock_sem);
965 free_xid(xid);
966 return rc;
967 }
968
969 /* 1017 /*
970 * Accessing maxBuf is racy with cifs_reconnect - need to store value 1018 * Accessing maxBuf is racy with cifs_reconnect - need to store value
971 * and check it for zero before using. 1019 * and check it for zero before using.
972 */ 1020 */
973 max_buf = tcon->ses->server->maxBuf; 1021 max_buf = tcon->ses->server->maxBuf;
974 if (!max_buf) { 1022 if (!max_buf) {
975 up_write(&cinode->lock_sem);
976 free_xid(xid); 1023 free_xid(xid);
977 return -EINVAL; 1024 return -EINVAL;
978 } 1025 }
@@ -981,7 +1028,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
981 sizeof(LOCKING_ANDX_RANGE); 1028 sizeof(LOCKING_ANDX_RANGE);
982 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); 1029 buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
983 if (!buf) { 1030 if (!buf) {
984 up_write(&cinode->lock_sem);
985 free_xid(xid); 1031 free_xid(xid);
986 return -ENOMEM; 1032 return -ENOMEM;
987 } 1033 }
@@ -1018,9 +1064,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
1018 } 1064 }
1019 } 1065 }
1020 1066
1021 cinode->can_cache_brlcks = false;
1022 up_write(&cinode->lock_sem);
1023
1024 kfree(buf); 1067 kfree(buf);
1025 free_xid(xid); 1068 free_xid(xid);
1026 return rc; 1069 return rc;
@@ -1043,7 +1086,6 @@ struct lock_to_push {
1043static int 1086static int
1044cifs_push_posix_locks(struct cifsFileInfo *cfile) 1087cifs_push_posix_locks(struct cifsFileInfo *cfile)
1045{ 1088{
1046 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
1047 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1089 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1048 struct file_lock *flock, **before; 1090 struct file_lock *flock, **before;
1049 unsigned int count = 0, i = 0; 1091 unsigned int count = 0, i = 0;
@@ -1054,14 +1096,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1054 1096
1055 xid = get_xid(); 1097 xid = get_xid();
1056 1098
1057 /* we are going to update can_cache_brlcks here - need a write access */
1058 down_write(&cinode->lock_sem);
1059 if (!cinode->can_cache_brlcks) {
1060 up_write(&cinode->lock_sem);
1061 free_xid(xid);
1062 return rc;
1063 }
1064
1065 lock_flocks(); 1099 lock_flocks();
1066 cifs_for_each_lock(cfile->dentry->d_inode, before) { 1100 cifs_for_each_lock(cfile->dentry->d_inode, before) {
1067 if ((*before)->fl_flags & FL_POSIX) 1101 if ((*before)->fl_flags & FL_POSIX)
@@ -1127,9 +1161,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1127 } 1161 }
1128 1162
1129out: 1163out:
1130 cinode->can_cache_brlcks = false;
1131 up_write(&cinode->lock_sem);
1132
1133 free_xid(xid); 1164 free_xid(xid);
1134 return rc; 1165 return rc;
1135err_out: 1166err_out:
@@ -1144,14 +1175,27 @@ static int
1144cifs_push_locks(struct cifsFileInfo *cfile) 1175cifs_push_locks(struct cifsFileInfo *cfile)
1145{ 1176{
1146 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); 1177 struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
1178 struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
1147 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1179 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1180 int rc = 0;
1181
1182 /* we are going to update can_cache_brlcks here - need write access */
1183 down_write(&cinode->lock_sem);
1184 if (!cinode->can_cache_brlcks) {
1185 up_write(&cinode->lock_sem);
1186 return rc;
1187 }
1148 1188
1149 if (cap_unix(tcon->ses) && 1189 if (cap_unix(tcon->ses) &&
1150 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && 1190 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
1151 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) 1191 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
1152 return cifs_push_posix_locks(cfile); 1192 rc = cifs_push_posix_locks(cfile);
1193 else
1194 rc = tcon->ses->server->ops->push_mand_locks(cfile);
1153 1195
1154 return tcon->ses->server->ops->push_mand_locks(cfile); 1196 cinode->can_cache_brlcks = false;
1197 up_write(&cinode->lock_sem);
1198 return rc;
1155} 1199}
1156 1200
1157static void 1201static void
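
The hunk above moves the lock_sem write lock and the can_cache_brlcks test out of the protocol-specific push helpers and into their single caller, so the flag is cleared exactly once whichever path runs. A userspace sketch of that shape, with a pthreads rwlock standing in for the kernel rwsem and stub helpers in place of the real ones:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct cinode {
	pthread_rwlock_t lock_sem;
	bool can_cache_brlcks;
};

/* protocol helpers: called with lock_sem held for write */
static int push_posix_locks(void) { puts("posix push"); return 0; }
static int push_mand_locks(void)  { puts("mand push");  return 0; }

static int push_locks(struct cinode *ci, bool unix_caps)
{
	int rc = 0;

	/* we are going to update can_cache_brlcks - need write access */
	pthread_rwlock_wrlock(&ci->lock_sem);
	if (!ci->can_cache_brlcks) {
		pthread_rwlock_unlock(&ci->lock_sem);
		return rc;	/* already pushed, nothing to do */
	}

	rc = unix_caps ? push_posix_locks() : push_mand_locks();

	ci->can_cache_brlcks = false;	/* cleared exactly once, here */
	pthread_rwlock_unlock(&ci->lock_sem);
	return rc;
}

int main(void)
{
	struct cinode ci = { .can_cache_brlcks = true };

	pthread_rwlock_init(&ci.lock_sem, NULL);
	push_locks(&ci, true);
	push_locks(&ci, true);	/* second call is a no-op */
	pthread_rwlock_destroy(&ci.lock_sem);
	return 0;
}
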
@@ -1406,6 +1450,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
1406 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; 1450 struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
1407 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1451 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1408 struct TCP_Server_Info *server = tcon->ses->server; 1452 struct TCP_Server_Info *server = tcon->ses->server;
1453 struct inode *inode = cfile->dentry->d_inode;
1409 1454
1410 if (posix_lck) { 1455 if (posix_lck) {
1411 int posix_lock_type; 1456 int posix_lock_type;
@@ -1436,16 +1481,33 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
1436 return -ENOMEM; 1481 return -ENOMEM;
1437 1482
1438 rc = cifs_lock_add_if(cfile, lock, wait_flag); 1483 rc = cifs_lock_add_if(cfile, lock, wait_flag);
1439 if (rc < 0) 1484 if (rc < 0) {
1440 kfree(lock); 1485 kfree(lock);
1441 if (rc <= 0) 1486 return rc;
1487 }
1488 if (!rc)
1442 goto out; 1489 goto out;
1443 1490
1491 /*
1492 * Windows 7 server can delay breaking lease from read to None
1493 * if we set a byte-range lock on a file - break it explicitly
1494 * before sending the lock to the server to be sure the next
1495 * read won't conflict with non-overlapped locks due to
1496 * page reading.
1497 */
1498 if (!CIFS_I(inode)->clientCanCacheAll &&
1499 CIFS_I(inode)->clientCanCacheRead) {
1500 cifs_invalidate_mapping(inode);
1501 cFYI(1, "Set no oplock for inode=%p due to mand locks",
1502 inode);
1503 CIFS_I(inode)->clientCanCacheRead = false;
1504 }
1505
1444 rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, 1506 rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
1445 type, 1, 0, wait_flag); 1507 type, 1, 0, wait_flag);
1446 if (rc) { 1508 if (rc) {
1447 kfree(lock); 1509 kfree(lock);
1448 goto out; 1510 return rc;
1449 } 1511 }
1450 1512
1451 cifs_lock_add(cfile, lock); 1513 cifs_lock_add(cfile, lock);
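
The comment above describes a client-side workaround; its shape, reduced to two booleans and stub helpers (all names assumed for illustration), is roughly:

#include <stdbool.h>
#include <stdio.h>

struct cached_inode {
	bool can_cache_all;	/* exclusive/batch oplock held */
	bool can_cache_read;	/* level2 (read) oplock held */
};

static void invalidate_mapping(void) { puts("drop cached pages"); }
static int send_brlock(void)         { return 0; }	/* the SMB lock call */

static int set_lock(struct cached_inode *ci)
{
	if (!ci->can_cache_all && ci->can_cache_read) {
		/* break the read oplock locally before the lock goes out */
		invalidate_mapping();
		ci->can_cache_read = false;
	}
	return send_brlock();
}

int main(void)
{
	struct cached_inode ci = { .can_cache_all = false,
				   .can_cache_read = true };

	set_lock(&ci);
	printf("can_cache_read = %d\n", ci.can_cache_read);
	return 0;
}
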
@@ -2457,7 +2519,7 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
2457 down_read(&cinode->lock_sem); 2519 down_read(&cinode->lock_sem);
2458 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), 2520 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
2459 server->vals->exclusive_lock_type, NULL, 2521 server->vals->exclusive_lock_type, NULL,
2460 true)) { 2522 CIFS_WRITE_OP)) {
2461 mutex_lock(&inode->i_mutex); 2523 mutex_lock(&inode->i_mutex);
2462 rc = __generic_file_aio_write(iocb, iov, nr_segs, 2524 rc = __generic_file_aio_write(iocb, iov, nr_segs,
2463 &iocb->ki_pos); 2525 &iocb->ki_pos);
@@ -2487,42 +2549,34 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
2487 struct cifsFileInfo *cfile = (struct cifsFileInfo *) 2549 struct cifsFileInfo *cfile = (struct cifsFileInfo *)
2488 iocb->ki_filp->private_data; 2550 iocb->ki_filp->private_data;
2489 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 2551 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
2552 ssize_t written;
2490 2553
2491#ifdef CONFIG_CIFS_SMB2 2554 if (cinode->clientCanCacheAll) {
2492 /* 2555 if (cap_unix(tcon->ses) &&
2493 * If we have an oplock for read and want to write a data to the file 2556 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
2494 * we need to store it in the page cache and then push it to the server 2557 && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
2495 * to be sure the next read will get a valid data. 2558 return generic_file_aio_write(iocb, iov, nr_segs, pos);
2496 */ 2559 return cifs_writev(iocb, iov, nr_segs, pos);
2497 if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead) {
2498 ssize_t written;
2499 int rc;
2500
2501 written = generic_file_aio_write(iocb, iov, nr_segs, pos);
2502 rc = filemap_fdatawrite(inode->i_mapping);
2503 if (rc)
2504 return (ssize_t)rc;
2505
2506 return written;
2507 } 2560 }
2508#endif
2509
2510 /* 2561 /*
2511 * For non-oplocked files in strict cache mode we need to write the data 2562 * For non-oplocked files in strict cache mode we need to write the data
2512 * to the server exactly from pos to pos+len-1 rather than flush all 2563 * to the server exactly from pos to pos+len-1 rather than flush all
2513 * affected pages because it may cause an error with mandatory locks on 2564 * affected pages because it may cause an error with mandatory locks on
2514 * these pages but not on the region from pos to pos+len-1. 2565 * these pages but not on the region from pos to pos+len-1.
2515 */ 2566 */
2516 2567 written = cifs_user_writev(iocb, iov, nr_segs, pos);
2517 if (!cinode->clientCanCacheAll) 2568 if (written > 0 && cinode->clientCanCacheRead) {
2518 return cifs_user_writev(iocb, iov, nr_segs, pos); 2569 /*
2519 2570 * Windows 7 server can delay breaking level2 oplock if a write
2520 if (cap_unix(tcon->ses) && 2571 * request comes - break it on the client to prevent reading
2521 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && 2572 * stale data.
2522 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) 2573 */
2523 return generic_file_aio_write(iocb, iov, nr_segs, pos); 2574 cifs_invalidate_mapping(inode);
2524 2575 cFYI(1, "Set no oplock for inode=%p after a write operation",
2525 return cifs_writev(iocb, iov, nr_segs, pos); 2576 inode);
2577 cinode->clientCanCacheRead = false;
2578 }
2579 return written;
2526} 2580}
2527 2581
2528static struct cifs_readdata * 2582static struct cifs_readdata *
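
The rewritten cifs_strict_writev() above collapses to a small decision: fully oplocked files may use the page cache, everything else writes through and then drops any still-held read oplock. A simplified model (the two cached paths are collapsed into one stub, helper names are invented):

#include <stdbool.h>
#include <stdio.h>

struct strict_inode {
	bool can_cache_all;	/* exclusive oplock: page cache is safe */
	bool can_cache_read;	/* level2 oplock: reads may be cached */
};

static long cached_write(void)  { puts("cached write");  return 4; }
static long through_write(void) { puts("write-through"); return 4; }

static long strict_write(struct strict_inode *ci)
{
	long written;

	if (ci->can_cache_all)
		return cached_write();	/* exclusive oplock held */

	/* no exclusive oplock: send exactly [pos, pos+len) to the server */
	written = through_write();
	if (written > 0 && ci->can_cache_read) {
		/*
		 * The server may delay breaking the level2 oplock on a
		 * write, so drop it locally to avoid stale cached reads.
		 */
		puts("invalidate mapping, drop read oplock");
		ci->can_cache_read = false;
	}
	return written;
}

int main(void)
{
	struct strict_inode ci = { .can_cache_all = false,
				   .can_cache_read = true };

	strict_write(&ci);
	printf("can_cache_read = %d\n", ci.can_cache_read);
	return 0;
}
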
@@ -2892,7 +2946,7 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
2892 down_read(&cinode->lock_sem); 2946 down_read(&cinode->lock_sem);
2893 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), 2947 if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
2894 tcon->ses->server->vals->shared_lock_type, 2948 tcon->ses->server->vals->shared_lock_type,
2895 NULL, true)) 2949 NULL, CIFS_READ_OP))
2896 rc = generic_file_aio_read(iocb, iov, nr_segs, pos); 2950 rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
2897 up_read(&cinode->lock_sem); 2951 up_read(&cinode->lock_sem);
2898 return rc; 2952 return rc;
@@ -3527,6 +3581,13 @@ void cifs_oplock_break(struct work_struct *work)
3527 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 3581 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
3528 int rc = 0; 3582 int rc = 0;
3529 3583
3584 if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead &&
3585 cifs_has_mand_locks(cinode)) {
3586 cFYI(1, "Reset oplock to None for inode=%p due to mand locks",
3587 inode);
3588 cinode->clientCanCacheRead = false;
3589 }
3590
3530 if (inode && S_ISREG(inode->i_mode)) { 3591 if (inode && S_ISREG(inode->i_mode)) {
3531 if (cinode->clientCanCacheRead) 3592 if (cinode->clientCanCacheRead)
3532 break_lease(inode, O_RDONLY); 3593 break_lease(inode, O_RDONLY);
@@ -3536,7 +3597,7 @@ void cifs_oplock_break(struct work_struct *work)
3536 if (cinode->clientCanCacheRead == 0) { 3597 if (cinode->clientCanCacheRead == 0) {
3537 rc = filemap_fdatawait(inode->i_mapping); 3598 rc = filemap_fdatawait(inode->i_mapping);
3538 mapping_set_error(inode->i_mapping, rc); 3599 mapping_set_error(inode->i_mapping, rc);
3539 invalidate_remote_inode(inode); 3600 cifs_invalidate_mapping(inode);
3540 } 3601 }
3541 cFYI(1, "Oplock flush inode %p rc %d", inode, rc); 3602 cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
3542 } 3603 }
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index afdff79651f1..ed6208ff85a7 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1791,11 +1791,12 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1791 stat->ino = CIFS_I(inode)->uniqueid; 1791 stat->ino = CIFS_I(inode)->uniqueid;
1792 1792
1793 /* 1793 /*
1794 * If on a multiuser mount without unix extensions, and the admin hasn't 1794 * If on a multiuser mount without unix extensions or cifsacl being
1795 * overridden them, set the ownership to the fsuid/fsgid of the current 1795 * enabled, and the admin hasn't overridden them, set the ownership
1796 * process. 1796 * to the fsuid/fsgid of the current process.
1797 */ 1797 */
1798 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && 1798 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
1799 !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
1799 !tcon->unix_ext) { 1800 !tcon->unix_ext) {
1800 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) 1801 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
1801 stat->uid = current_fsuid(); 1802 stat->uid = current_fsuid();
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index d5ce9e26696c..a82bc51fdc82 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -204,7 +204,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
204 return rc; 204 return rc;
205} 205}
206 206
207int 207void
208cifs_set_port(struct sockaddr *addr, const unsigned short int port) 208cifs_set_port(struct sockaddr *addr, const unsigned short int port)
209{ 209{
210 switch (addr->sa_family) { 210 switch (addr->sa_family) {
@@ -214,19 +214,7 @@ cifs_set_port(struct sockaddr *addr, const unsigned short int port)
214 case AF_INET6: 214 case AF_INET6:
215 ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); 215 ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
216 break; 216 break;
217 default:
218 return 0;
219 } 217 }
220 return 1;
221}
222
223int
224cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
225 const unsigned short int port)
226{
227 if (!cifs_convert_address(dst, src, len))
228 return 0;
229 return cifs_set_port(dst, port);
230} 218}
231 219
232/***************************************************************************** 220/*****************************************************************************
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 1c576e871366..cdd6ff48246b 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -66,18 +66,21 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
66#endif /* DEBUG2 */ 66#endif /* DEBUG2 */
67 67
68/* 68/*
69 * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
70 *
69 * Find the dentry that matches "name". If there isn't one, create one. If it's 71 * Find the dentry that matches "name". If there isn't one, create one. If it's
70 * a negative dentry or the uniqueid changed, then drop it and recreate it. 72 * a negative dentry or the uniqueid changed, then drop it and recreate it.
71 */ 73 */
72static struct dentry * 74static void
73cifs_readdir_lookup(struct dentry *parent, struct qstr *name, 75cifs_prime_dcache(struct dentry *parent, struct qstr *name,
74 struct cifs_fattr *fattr) 76 struct cifs_fattr *fattr)
75{ 77{
76 struct dentry *dentry, *alias; 78 struct dentry *dentry, *alias;
77 struct inode *inode; 79 struct inode *inode;
78 struct super_block *sb = parent->d_inode->i_sb; 80 struct super_block *sb = parent->d_inode->i_sb;
81 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
79 82
80 cFYI(1, "For %s", name->name); 83 cFYI(1, "%s: for %s", __func__, name->name);
81 84
82 if (parent->d_op && parent->d_op->d_hash) 85 if (parent->d_op && parent->d_op->d_hash)
83 parent->d_op->d_hash(parent, parent->d_inode, name); 86 parent->d_op->d_hash(parent, parent->d_inode, name);
@@ -87,37 +90,42 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
87 dentry = d_lookup(parent, name); 90 dentry = d_lookup(parent, name);
88 if (dentry) { 91 if (dentry) {
89 int err; 92 int err;
93
90 inode = dentry->d_inode; 94 inode = dentry->d_inode;
91 /* update inode in place if i_ino didn't change */ 95 if (inode) {
92 if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { 96 /*
93 cifs_fattr_to_inode(inode, fattr); 97 * If we're generating inode numbers, then we don't
94 return dentry; 98 * want to clobber the existing one with the one that
99 * the readdir code created.
100 */
101 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
102 fattr->cf_uniqueid = CIFS_I(inode)->uniqueid;
103
104 /* update inode in place if i_ino didn't change */
105 if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
106 cifs_fattr_to_inode(inode, fattr);
107 goto out;
108 }
95 } 109 }
96 err = d_invalidate(dentry); 110 err = d_invalidate(dentry);
97 dput(dentry); 111 dput(dentry);
98 if (err) 112 if (err)
99 return NULL; 113 return;
100 } 114 }
101 115
102 dentry = d_alloc(parent, name); 116 dentry = d_alloc(parent, name);
103 if (dentry == NULL) 117 if (!dentry)
104 return NULL; 118 return;
105 119
106 inode = cifs_iget(sb, fattr); 120 inode = cifs_iget(sb, fattr);
107 if (!inode) { 121 if (!inode)
108 dput(dentry); 122 goto out;
109 return NULL;
110 }
111 123
112 alias = d_materialise_unique(dentry, inode); 124 alias = d_materialise_unique(dentry, inode);
113 if (alias != NULL) { 125 if (alias && !IS_ERR(alias))
114 dput(dentry); 126 dput(alias);
115 if (IS_ERR(alias)) 127out:
116 return NULL; 128 dput(dentry);
117 dentry = alias;
118 }
119
120 return dentry;
121} 129}
122 130
123static void 131static void
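
The priming rule above (update in place only when the unique id still matches, otherwise drop and recreate; never clobber a locally generated id with the one readdir computed) can be modeled with a flat array in place of the dcache. A toy sketch, all names invented:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct dentry_ent {
	char name[32];
	unsigned long long uniqueid;
	int attrs;
	bool used;
};

static struct dentry_ent cache[8];

static void prime(const char *name, unsigned long long id, int attrs,
		  bool server_inum)
{
	for (int i = 0; i < 8; i++) {
		if (!cache[i].used || strcmp(cache[i].name, name))
			continue;
		if (!server_inum)
			id = cache[i].uniqueid;	/* keep the local id */
		if (cache[i].uniqueid == id) {
			cache[i].attrs = attrs;	/* update in place */
			return;
		}
		cache[i].used = false;	/* id changed: drop and recreate */
		break;
	}
	for (int i = 0; i < 8; i++) {
		if (!cache[i].used) {
			snprintf(cache[i].name, sizeof(cache[i].name),
				 "%s", name);
			cache[i].uniqueid = id;
			cache[i].attrs = attrs;
			cache[i].used = true;
			return;
		}
	}
}

int main(void)
{
	prime("a.txt", 100, 1, true);
	prime("a.txt", 100, 2, true);	/* same id: updated in place */
	prime("a.txt", 200, 3, true);	/* new id: entry recreated */
	printf("uniqueid=%llu attrs=%d\n", cache[0].uniqueid, cache[0].attrs);
	return 0;
}
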
@@ -137,6 +145,16 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
137 if (fattr->cf_cifsattrs & ATTR_READONLY) 145 if (fattr->cf_cifsattrs & ATTR_READONLY)
138 fattr->cf_mode &= ~S_IWUGO; 146 fattr->cf_mode &= ~S_IWUGO;
139 147
148 /*
149 * We of course don't get ACL info in FIND_FIRST/NEXT results, so
150 * mark it for revalidation so that "ls -l" will look right. It might
151 * be super-slow, but if we don't do this then the ownership of files
152 * may look wrong since the inodes may not have timed out by the time
153 * "ls" does a stat() call on them.
154 */
155 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
156 fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
157
140 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && 158 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL &&
141 fattr->cf_cifsattrs & ATTR_SYSTEM) { 159 fattr->cf_cifsattrs & ATTR_SYSTEM) {
142 if (fattr->cf_eof == 0) { 160 if (fattr->cf_eof == 0) {
@@ -652,7 +670,6 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
652 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 670 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
653 struct cifs_dirent de = { NULL, }; 671 struct cifs_dirent de = { NULL, };
654 struct cifs_fattr fattr; 672 struct cifs_fattr fattr;
655 struct dentry *dentry;
656 struct qstr name; 673 struct qstr name;
657 int rc = 0; 674 int rc = 0;
658 ino_t ino; 675 ino_t ino;
@@ -723,13 +740,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
723 */ 740 */
724 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; 741 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
725 742
726 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); 743 cifs_prime_dcache(file->f_dentry, &name, &fattr);
727 dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr);
728 744
745 ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
729 rc = filldir(dirent, name.name, name.len, file->f_pos, ino, 746 rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
730 fattr.cf_dtype); 747 fattr.cf_dtype);
731
732 dput(dentry);
733 return rc; 748 return rc;
734} 749}
735 750
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 34cea2798333..47bc5a87f94e 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -53,6 +53,13 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf,
53 mutex_unlock(&server->srv_mutex); 53 mutex_unlock(&server->srv_mutex);
54 return rc; 54 return rc;
55 } 55 }
56
57 /*
58 * The response to this call was already factored into the sequence
59 * number when the call went out, so we must adjust it back downward
60 * after signing here.
61 */
62 --server->sequence_number;
56 rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); 63 rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
57 mutex_unlock(&server->srv_mutex); 64 mutex_unlock(&server->srv_mutex);
58 65
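
The decrement above is a bookkeeping fix: with signing enabled, each signed send normally consumes two sequence numbers, one for the request and one for the expected response, but NT_CANCEL never receives a response. A counter-only model of the accounting (simplified from the signing code, not the kernel functions):

#include <stdio.h>

static unsigned int seq;

static void sign_request(void)
{
	seq += 2;	/* request + anticipated response */
}

static void send_nt_cancel(void)
{
	sign_request();
	--seq;		/* no response will arrive for a cancel */
	printf("after cancel: seq=%u\n", seq);
}

int main(void)
{
	sign_request();		/* ordinary signed call: seq=2 */
	send_nt_cancel();	/* cancel has a net effect of +1: seq=3 */
	return 0;
}
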
@@ -575,37 +582,6 @@ cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
575 return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); 582 return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data);
576} 583}
577 584
578static char *
579cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
580 struct cifs_tcon *tcon)
581{
582 int pplen = vol->prepath ? strlen(vol->prepath) : 0;
583 int dfsplen;
584 char *full_path = NULL;
585
586 /* if no prefix path, simply set path to the root of share to "" */
587 if (pplen == 0) {
588 full_path = kzalloc(1, GFP_KERNEL);
589 return full_path;
590 }
591
592 if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
593 dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
594 else
595 dfsplen = 0;
596
597 full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
598 if (full_path == NULL)
599 return full_path;
600
601 if (dfsplen)
602 strncpy(full_path, tcon->treeName, dfsplen);
603 strncpy(full_path + dfsplen, vol->prepath, pplen);
604 convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
605 full_path[dfsplen + pplen] = 0; /* add trailing null */
606 return full_path;
607}
608
609static void 585static void
610cifs_clear_stats(struct cifs_tcon *tcon) 586cifs_clear_stats(struct cifs_tcon *tcon)
611{ 587{
@@ -943,7 +919,6 @@ struct smb_version_operations smb1_operations = {
943 .set_path_size = CIFSSMBSetEOF, 919 .set_path_size = CIFSSMBSetEOF,
944 .set_file_size = CIFSSMBSetFileSize, 920 .set_file_size = CIFSSMBSetFileSize,
945 .set_file_info = smb_set_file_info, 921 .set_file_info = smb_set_file_info,
946 .build_path_to_root = cifs_build_path_to_root,
947 .echo = CIFSSMBEcho, 922 .echo = CIFSSMBEcho,
948 .mkdir = CIFSSMBMkDir, 923 .mkdir = CIFSSMBMkDir,
949 .mkdir_setinfo = cifs_mkdir_setinfo, 924 .mkdir_setinfo = cifs_mkdir_setinfo,
@@ -984,4 +959,5 @@ struct smb_version_values smb1_values = {
984 .cap_unix = CAP_UNIX, 959 .cap_unix = CAP_UNIX,
985 .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, 960 .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
986 .cap_large_files = CAP_LARGE_FILES, 961 .cap_large_files = CAP_LARGE_FILES,
962 .oplock_read = OPLOCK_READ,
987}; 963};
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index a93eec30a50d..71e6aed4b382 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -260,13 +260,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
260 struct cifs_fid_locks *fdlocks; 260 struct cifs_fid_locks *fdlocks;
261 261
262 xid = get_xid(); 262 xid = get_xid();
263 /* we are going to update can_cache_brlcks here - need a write access */
264 down_write(&cinode->lock_sem);
265 if (!cinode->can_cache_brlcks) {
266 up_write(&cinode->lock_sem);
267 free_xid(xid);
268 return rc;
269 }
270 263
271 /* 264 /*
272 * Accessing maxBuf is racy with cifs_reconnect - need to store value 265 * Accessing maxBuf is racy with cifs_reconnect - need to store value
@@ -274,7 +267,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
274 */ 267 */
275 max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; 268 max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf;
276 if (!max_buf) { 269 if (!max_buf) {
277 up_write(&cinode->lock_sem);
278 free_xid(xid); 270 free_xid(xid);
279 return -EINVAL; 271 return -EINVAL;
280 } 272 }
@@ -282,7 +274,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
282 max_num = max_buf / sizeof(struct smb2_lock_element); 274 max_num = max_buf / sizeof(struct smb2_lock_element);
283 buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); 275 buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
284 if (!buf) { 276 if (!buf) {
285 up_write(&cinode->lock_sem);
286 free_xid(xid); 277 free_xid(xid);
287 return -ENOMEM; 278 return -ENOMEM;
288 } 279 }
@@ -293,10 +284,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
293 rc = stored_rc; 284 rc = stored_rc;
294 } 285 }
295 286
296 cinode->can_cache_brlcks = false;
297 kfree(buf); 287 kfree(buf);
298
299 up_write(&cinode->lock_sem);
300 free_xid(xid); 288 free_xid(xid);
301 return rc; 289 return rc;
302} 290}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 4d9dbe0b7385..c9c7aa7ed966 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -262,23 +262,6 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
262 return rc; 262 return rc;
263} 263}
264 264
265static char *
266smb2_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
267 struct cifs_tcon *tcon)
268{
269 int pplen = vol->prepath ? strlen(vol->prepath) : 0;
270 char *full_path = NULL;
271
272 /* if no prefix path, simply set path to the root of share to "" */
273 if (pplen == 0) {
274 full_path = kzalloc(2, GFP_KERNEL);
275 return full_path;
276 }
277
278 cERROR(1, "prefixpath is not supported for SMB2 now");
279 return NULL;
280}
281
282static bool 265static bool
283smb2_can_echo(struct TCP_Server_Info *server) 266smb2_can_echo(struct TCP_Server_Info *server)
284{ 267{
@@ -613,7 +596,6 @@ struct smb_version_operations smb21_operations = {
613 .set_path_size = smb2_set_path_size, 596 .set_path_size = smb2_set_path_size,
614 .set_file_size = smb2_set_file_size, 597 .set_file_size = smb2_set_file_size,
615 .set_file_info = smb2_set_file_info, 598 .set_file_info = smb2_set_file_info,
616 .build_path_to_root = smb2_build_path_to_root,
617 .mkdir = smb2_mkdir, 599 .mkdir = smb2_mkdir,
618 .mkdir_setinfo = smb2_mkdir_setinfo, 600 .mkdir_setinfo = smb2_mkdir_setinfo,
619 .rmdir = smb2_rmdir, 601 .rmdir = smb2_rmdir,
@@ -641,6 +623,92 @@ struct smb_version_operations smb21_operations = {
641 .get_lease_key = smb2_get_lease_key, 623 .get_lease_key = smb2_get_lease_key,
642 .set_lease_key = smb2_set_lease_key, 624 .set_lease_key = smb2_set_lease_key,
643 .new_lease_key = smb2_new_lease_key, 625 .new_lease_key = smb2_new_lease_key,
626 .calc_signature = smb2_calc_signature,
627};
628
629
630struct smb_version_operations smb30_operations = {
631 .compare_fids = smb2_compare_fids,
632 .setup_request = smb2_setup_request,
633 .setup_async_request = smb2_setup_async_request,
634 .check_receive = smb2_check_receive,
635 .add_credits = smb2_add_credits,
636 .set_credits = smb2_set_credits,
637 .get_credits_field = smb2_get_credits_field,
638 .get_credits = smb2_get_credits,
639 .get_next_mid = smb2_get_next_mid,
640 .read_data_offset = smb2_read_data_offset,
641 .read_data_length = smb2_read_data_length,
642 .map_error = map_smb2_to_linux_error,
643 .find_mid = smb2_find_mid,
644 .check_message = smb2_check_message,
645 .dump_detail = smb2_dump_detail,
646 .clear_stats = smb2_clear_stats,
647 .print_stats = smb2_print_stats,
648 .is_oplock_break = smb2_is_valid_oplock_break,
649 .need_neg = smb2_need_neg,
650 .negotiate = smb2_negotiate,
651 .negotiate_wsize = smb2_negotiate_wsize,
652 .negotiate_rsize = smb2_negotiate_rsize,
653 .sess_setup = SMB2_sess_setup,
654 .logoff = SMB2_logoff,
655 .tree_connect = SMB2_tcon,
656 .tree_disconnect = SMB2_tdis,
657 .is_path_accessible = smb2_is_path_accessible,
658 .can_echo = smb2_can_echo,
659 .echo = SMB2_echo,
660 .query_path_info = smb2_query_path_info,
661 .get_srv_inum = smb2_get_srv_inum,
662 .query_file_info = smb2_query_file_info,
663 .set_path_size = smb2_set_path_size,
664 .set_file_size = smb2_set_file_size,
665 .set_file_info = smb2_set_file_info,
666 .mkdir = smb2_mkdir,
667 .mkdir_setinfo = smb2_mkdir_setinfo,
668 .rmdir = smb2_rmdir,
669 .unlink = smb2_unlink,
670 .rename = smb2_rename_path,
671 .create_hardlink = smb2_create_hardlink,
672 .open = smb2_open_file,
673 .set_fid = smb2_set_fid,
674 .close = smb2_close_file,
675 .flush = smb2_flush_file,
676 .async_readv = smb2_async_readv,
677 .async_writev = smb2_async_writev,
678 .sync_read = smb2_sync_read,
679 .sync_write = smb2_sync_write,
680 .query_dir_first = smb2_query_dir_first,
681 .query_dir_next = smb2_query_dir_next,
682 .close_dir = smb2_close_dir,
683 .calc_smb_size = smb2_calc_size,
684 .is_status_pending = smb2_is_status_pending,
685 .oplock_response = smb2_oplock_response,
686 .queryfs = smb2_queryfs,
687 .mand_lock = smb2_mand_lock,
688 .mand_unlock_range = smb2_unlock_range,
689 .push_mand_locks = smb2_push_mandatory_locks,
690 .get_lease_key = smb2_get_lease_key,
691 .set_lease_key = smb2_set_lease_key,
692 .new_lease_key = smb2_new_lease_key,
693 .calc_signature = smb3_calc_signature,
694};
695
696struct smb_version_values smb20_values = {
697 .version_string = SMB20_VERSION_STRING,
698 .protocol_id = SMB20_PROT_ID,
699 .req_capabilities = 0, /* MBZ */
700 .large_lock_type = 0,
701 .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
702 .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
703 .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
704 .header_size = sizeof(struct smb2_hdr),
705 .max_header_size = MAX_SMB2_HDR_SIZE,
706 .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
707 .lock_cmd = SMB2_LOCK,
708 .cap_unix = 0,
709 .cap_nt_find = SMB2_NT_FIND,
710 .cap_large_files = SMB2_LARGE_FILES,
711 .oplock_read = SMB2_OPLOCK_LEVEL_II,
644}; 712};
645 713
646struct smb_version_values smb21_values = { 714struct smb_version_values smb21_values = {
@@ -658,6 +726,7 @@ struct smb_version_values smb21_values = {
658 .cap_unix = 0, 726 .cap_unix = 0,
659 .cap_nt_find = SMB2_NT_FIND, 727 .cap_nt_find = SMB2_NT_FIND,
660 .cap_large_files = SMB2_LARGE_FILES, 728 .cap_large_files = SMB2_LARGE_FILES,
729 .oplock_read = SMB2_OPLOCK_LEVEL_II,
661}; 730};
662 731
663struct smb_version_values smb30_values = { 732struct smb_version_values smb30_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index cf33622cdac8..41d9d0725f0f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -425,7 +425,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
425 } 425 }
426 426
427 cFYI(1, "sec_flags 0x%x", sec_flags); 427 cFYI(1, "sec_flags 0x%x", sec_flags);
428 if (sec_flags & CIFSSEC_MUST_SIGN) { 428 if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
429 cFYI(1, "Signing required"); 429 cFYI(1, "Signing required");
430 if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED | 430 if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED |
431 SMB2_NEGOTIATE_SIGNING_ENABLED))) { 431 SMB2_NEGOTIATE_SIGNING_ENABLED))) {
@@ -612,7 +612,8 @@ ssetup_ntlmssp_authenticate:
612 612
613 /* BB add code to build os and lm fields */ 613 /* BB add code to build os and lm fields */
614 614
615 rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, CIFS_LOG_ERROR); 615 rc = SendReceive2(xid, ses, iov, 2, &resp_buftype,
616 CIFS_LOG_ERROR | CIFS_NEG_OP);
616 617
617 kfree(security_blob); 618 kfree(security_blob);
618 rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; 619 rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 7d25f8b14f93..2aa3535e38ce 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -47,6 +47,10 @@ extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses,
47 struct smb_rqst *rqst); 47 struct smb_rqst *rqst);
48extern struct mid_q_entry *smb2_setup_async_request( 48extern struct mid_q_entry *smb2_setup_async_request(
49 struct TCP_Server_Info *server, struct smb_rqst *rqst); 49 struct TCP_Server_Info *server, struct smb_rqst *rqst);
50extern int smb2_calc_signature(struct smb_rqst *rqst,
51 struct TCP_Server_Info *server);
52extern int smb3_calc_signature(struct smb_rqst *rqst,
53 struct TCP_Server_Info *server);
50extern void smb2_echo_request(struct work_struct *work); 54extern void smb2_echo_request(struct work_struct *work);
51extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); 55extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
52extern __u8 smb2_map_lease_to_oplock(__le32 lease_state); 56extern __u8 smb2_map_lease_to_oplock(__le32 lease_state);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 2a5fdf26f79f..8dd73e61d762 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -39,7 +39,7 @@
39#include "smb2status.h" 39#include "smb2status.h"
40#include "smb2glob.h" 40#include "smb2glob.h"
41 41
42static int 42int
43smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) 43smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
44{ 44{
45 int i, rc; 45 int i, rc;
@@ -116,6 +116,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
116 return rc; 116 return rc;
117} 117}
118 118
119int
120smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
121{
122 cFYI(1, "smb3 signatures not supported yet");
123 return -EOPNOTSUPP;
124}
125
119/* must be called with server->srv_mutex held */ 126/* must be called with server->srv_mutex held */
120static int 127static int
121smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) 128smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
@@ -132,7 +139,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
132 return rc; 139 return rc;
133 } 140 }
134 141
135 rc = smb2_calc_signature(rqst, server); 142 rc = server->ops->calc_signature(rqst, server);
136 143
137 return rc; 144 return rc;
138} 145}
@@ -168,7 +175,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
168 memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE); 175 memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE);
169 176
170 mutex_lock(&server->srv_mutex); 177 mutex_lock(&server->srv_mutex);
171 rc = smb2_calc_signature(rqst, server); 178 rc = server->ops->calc_signature(rqst, server);
172 mutex_unlock(&server->srv_mutex); 179 mutex_unlock(&server->srv_mutex);
173 180
174 if (rc) 181 if (rc)
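
The two hunks above replace a direct call with a per-dialect function pointer, so SMB3 can plug in its own signing later while returning -EOPNOTSUPP for now. The dispatch shape, as a standalone sketch with invented buffer handling:

#include <errno.h>
#include <stdio.h>

struct server_ops {
	int (*calc_signature)(const char *buf);
};

static int smb2_calc_signature(const char *buf)
{
	printf("HMAC-SHA256 over %s\n", buf);
	return 0;
}

static int smb3_calc_signature(const char *buf)
{
	(void)buf;
	fprintf(stderr, "smb3 signatures not supported yet\n");
	return -EOPNOTSUPP;
}

static const struct server_ops smb21_ops = {
	.calc_signature = smb2_calc_signature,
};
static const struct server_ops smb30_ops = {
	.calc_signature = smb3_calc_signature,
};

static int sign_rqst(const struct server_ops *ops, const char *buf)
{
	return ops->calc_signature(buf);	/* dialect picked at mount */
}

int main(void)
{
	printf("smb2.1: %d\n", sign_rqst(&smb21_ops, "request"));
	printf("smb3.0: %d\n", sign_rqst(&smb30_ops, "request"));
	return 0;
}
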
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 76d974c952fe..1a528680ec5a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -144,9 +144,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec,
144 144
145 *sent = 0; 145 *sent = 0;
146 146
147 if (ssocket == NULL)
148 return -ENOTSOCK; /* BB eventually add reconnect code here */
149
150 smb_msg.msg_name = (struct sockaddr *) &server->dstaddr; 147 smb_msg.msg_name = (struct sockaddr *) &server->dstaddr;
151 smb_msg.msg_namelen = sizeof(struct sockaddr); 148 smb_msg.msg_namelen = sizeof(struct sockaddr);
152 smb_msg.msg_control = NULL; 149 smb_msg.msg_control = NULL;
@@ -291,6 +288,9 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
291 struct socket *ssocket = server->ssocket; 288 struct socket *ssocket = server->ssocket;
292 int val = 1; 289 int val = 1;
293 290
291 if (ssocket == NULL)
292 return -ENOTSOCK;
293
294 cFYI(1, "Sending smb: smb_len=%u", smb_buf_length); 294 cFYI(1, "Sending smb: smb_len=%u", smb_buf_length);
295 dump_smb(iov[0].iov_base, iov[0].iov_len); 295 dump_smb(iov[0].iov_base, iov[0].iov_len);
296 296
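
The relocated NULL-socket check above validates connection state once per request instead of once per iovec chunk. A stub model of the two layers (names illustrative, not the transport code):

#include <errno.h>
#include <stdio.h>

struct server { void *ssocket; };

static int send_kvec(const char *chunk)
{
	/* hot path: caller has already validated the socket */
	printf("send: %s\n", chunk);
	return 0;
}

static int send_rqst(struct server *srv, const char **chunks, int n)
{
	if (srv->ssocket == NULL)
		return -ENOTSOCK;	/* checked once, up front */

	for (int i = 0; i < n; i++) {
		int rc = send_kvec(chunks[i]);

		if (rc)
			return rc;
	}
	return 0;
}

int main(void)
{
	struct server down = { .ssocket = NULL };
	const char *chunks[] = { "header", "body" };

	printf("rc=%d\n", send_rqst(&down, chunks, 2));	/* -ENOTSOCK */
	return 0;
}
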
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 4c6285fff598..e2f57a007029 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -844,6 +844,9 @@ COMPATIBLE_IOCTL(TIOCGDEV)
844COMPATIBLE_IOCTL(TIOCCBRK) 844COMPATIBLE_IOCTL(TIOCCBRK)
845COMPATIBLE_IOCTL(TIOCGSID) 845COMPATIBLE_IOCTL(TIOCGSID)
846COMPATIBLE_IOCTL(TIOCGICOUNT) 846COMPATIBLE_IOCTL(TIOCGICOUNT)
847COMPATIBLE_IOCTL(TIOCGPKT)
848COMPATIBLE_IOCTL(TIOCGPTLCK)
849COMPATIBLE_IOCTL(TIOCGEXCL)
847/* Little t */ 850/* Little t */
848COMPATIBLE_IOCTL(TIOCGETD) 851COMPATIBLE_IOCTL(TIOCGETD)
849COMPATIBLE_IOCTL(TIOCSETD) 852COMPATIBLE_IOCTL(TIOCSETD)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7414ae24a79b..712b10f64c70 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1613,12 +1613,12 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
1613 return 0; 1613 return 0;
1614} 1614}
1615 1615
1616static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) 1616static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
1617{ 1617{
1618 struct dentry * dentry = file->f_path.dentry; 1618 struct dentry * dentry = file->f_path.dentry;
1619 1619
1620 mutex_lock(&dentry->d_inode->i_mutex); 1620 mutex_lock(&dentry->d_inode->i_mutex);
1621 switch (origin) { 1621 switch (whence) {
1622 case 1: 1622 case 1:
1623 offset += file->f_pos; 1623 offset += file->f_pos;
1624 case 0: 1624 case 0:
diff --git a/fs/coredump.c b/fs/coredump.c
index ce47379bfa61..177493272a61 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -458,7 +458,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
458 return err; 458 return err;
459} 459}
460 460
461void do_coredump(siginfo_t *siginfo, struct pt_regs *regs) 461void do_coredump(siginfo_t *siginfo)
462{ 462{
463 struct core_state core_state; 463 struct core_state core_state;
464 struct core_name cn; 464 struct core_name cn;
@@ -474,7 +474,7 @@ void do_coredump(siginfo_t *siginfo, struct pt_regs *regs)
474 static atomic_t core_dump_count = ATOMIC_INIT(0); 474 static atomic_t core_dump_count = ATOMIC_INIT(0);
475 struct coredump_params cprm = { 475 struct coredump_params cprm = {
476 .siginfo = siginfo, 476 .siginfo = siginfo,
477 .regs = regs, 477 .regs = signal_pt_regs(),
478 .limit = rlimit(RLIMIT_CORE), 478 .limit = rlimit(RLIMIT_CORE),
479 /* 479 /*
480 * We must use the same mm->flags while dumping core to avoid 480 * We must use the same mm->flags while dumping core to avoid
diff --git a/fs/dcache.c b/fs/dcache.c
index 3a463d0c4fe8..19153a0a810c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -455,24 +455,6 @@ void d_drop(struct dentry *dentry)
455EXPORT_SYMBOL(d_drop); 455EXPORT_SYMBOL(d_drop);
456 456
457/* 457/*
458 * d_clear_need_lookup - drop a dentry from cache and clear the need lookup flag
459 * @dentry: dentry to drop
460 *
461 * This is called when we do a lookup on a placeholder dentry that needed to be
462 * looked up. The dentry should have been hashed in order for it to be found by
463 * the lookup code, but now needs to be unhashed while we do the actual lookup
464 * and clear the DCACHE_NEED_LOOKUP flag.
465 */
466void d_clear_need_lookup(struct dentry *dentry)
467{
468 spin_lock(&dentry->d_lock);
469 __d_drop(dentry);
470 dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
471 spin_unlock(&dentry->d_lock);
472}
473EXPORT_SYMBOL(d_clear_need_lookup);
474
475/*
476 * Finish off a dentry we've decided to kill. 458 * Finish off a dentry we've decided to kill.
477 * dentry->d_lock must be held, returns with it unlocked. 459 * dentry->d_lock must be held, returns with it unlocked.
478 * If ref is non-zero, then decrement the refcount too. 460 * If ref is non-zero, then decrement the refcount too.
@@ -565,13 +547,7 @@ repeat:
565 if (d_unhashed(dentry)) 547 if (d_unhashed(dentry))
566 goto kill_it; 548 goto kill_it;
567 549
568 /* 550 dentry->d_flags |= DCACHE_REFERENCED;
569 * If this dentry needs lookup, don't set the referenced flag so that it
570 * is more likely to be cleaned up by the dcache shrinker in case of
571 * memory pressure.
572 */
573 if (!d_need_lookup(dentry))
574 dentry->d_flags |= DCACHE_REFERENCED;
575 dentry_lru_add(dentry); 551 dentry_lru_add(dentry);
576 552
577 dentry->d_count--; 553 dentry->d_count--;
@@ -1583,7 +1559,7 @@ EXPORT_SYMBOL(d_find_any_alias);
1583 */ 1559 */
1584struct dentry *d_obtain_alias(struct inode *inode) 1560struct dentry *d_obtain_alias(struct inode *inode)
1585{ 1561{
1586 static const struct qstr anonstring = { .name = "" }; 1562 static const struct qstr anonstring = QSTR_INIT("/", 1);
1587 struct dentry *tmp; 1563 struct dentry *tmp;
1588 struct dentry *res; 1564 struct dentry *res;
1589 1565
@@ -1737,13 +1713,6 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1737 } 1713 }
1738 1714
1739 /* 1715 /*
1740 * We are going to instantiate this dentry, unhash it and clear the
1741 * lookup flag so we can do that.
1742 */
1743 if (unlikely(d_need_lookup(found)))
1744 d_clear_need_lookup(found);
1745
1746 /*
1747 * Negative dentry: instantiate it unless the inode is a directory and 1716 * Negative dentry: instantiate it unless the inode is a directory and
1748 * already has a dentry. 1717 * already has a dentry.
1749 */ 1718 */
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b607d92cdf24..a5f12b7e228d 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -59,7 +59,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev
59 case S_IFDIR: 59 case S_IFDIR:
60 inode->i_op = &simple_dir_inode_operations; 60 inode->i_op = &simple_dir_inode_operations;
61 inode->i_fop = &simple_dir_operations; 61 inode->i_fop = &simple_dir_operations;
62 inode->i_private = NULL;
63 62
64 /* directory inodes start off with i_nlink == 2 63 /* directory inodes start off with i_nlink == 2
65 * (for "." entry) */ 64 * (for "." entry) */
@@ -177,7 +176,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts)
177 opts->uid = uid; 176 opts->uid = uid;
178 break; 177 break;
179 case Opt_gid: 178 case Opt_gid:
180 if (match_octal(&args[0], &option)) 179 if (match_int(&args[0], &option))
181 return -EINVAL; 180 return -EINVAL;
182 gid = make_kgid(current_user_ns(), option); 181 gid = make_kgid(current_user_ns(), option);
183 if (!gid_valid(gid)) 182 if (!gid_valid(gid))
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 14afbabe6546..472e6befc54d 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -545,37 +545,38 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
545 mutex_unlock(&allocated_ptys_lock); 545 mutex_unlock(&allocated_ptys_lock);
546} 546}
547 547
548int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) 548/**
549 * devpts_pty_new -- create a new inode in /dev/pts/
550 * @ptmx_inode: inode of the master
551 * @device: major+minor of the node to be created
552 * @index: used as a name of the node
553 * @priv: what's given back by devpts_get_priv
554 *
555 * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill.
556 */
557struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
558 void *priv)
549{ 559{
550 /* tty layer puts index from devpts_new_index() in here */
551 int number = tty->index;
552 struct tty_driver *driver = tty->driver;
553 dev_t device = MKDEV(driver->major, driver->minor_start+number);
554 struct dentry *dentry; 560 struct dentry *dentry;
555 struct super_block *sb = pts_sb_from_inode(ptmx_inode); 561 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
556 struct inode *inode = new_inode(sb); 562 struct inode *inode;
557 struct dentry *root = sb->s_root; 563 struct dentry *root = sb->s_root;
558 struct pts_fs_info *fsi = DEVPTS_SB(sb); 564 struct pts_fs_info *fsi = DEVPTS_SB(sb);
559 struct pts_mount_opts *opts = &fsi->mount_opts; 565 struct pts_mount_opts *opts = &fsi->mount_opts;
560 int ret = 0;
561 char s[12]; 566 char s[12];
562 567
563 /* We're supposed to be given the slave end of a pty */ 568 inode = new_inode(sb);
564 BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY);
565 BUG_ON(driver->subtype != PTY_TYPE_SLAVE);
566
567 if (!inode) 569 if (!inode)
568 return -ENOMEM; 570 return ERR_PTR(-ENOMEM);
569 571
570 inode->i_ino = number + 3; 572 inode->i_ino = index + 3;
571 inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); 573 inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
572 inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); 574 inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
573 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 575 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
574 init_special_inode(inode, S_IFCHR|opts->mode, device); 576 init_special_inode(inode, S_IFCHR|opts->mode, device);
575 inode->i_private = tty; 577 inode->i_private = priv;
576 tty->driver_data = inode;
577 578
578 sprintf(s, "%d", number); 579 sprintf(s, "%d", index);
579 580
580 mutex_lock(&root->d_inode->i_mutex); 581 mutex_lock(&root->d_inode->i_mutex);
581 582
@@ -585,18 +586,24 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
585 fsnotify_create(root->d_inode, dentry); 586 fsnotify_create(root->d_inode, dentry);
586 } else { 587 } else {
587 iput(inode); 588 iput(inode);
588 ret = -ENOMEM; 589 inode = ERR_PTR(-ENOMEM);
589 } 590 }
590 591
591 mutex_unlock(&root->d_inode->i_mutex); 592 mutex_unlock(&root->d_inode->i_mutex);
592 593
593 return ret; 594 return inode;
594} 595}
595 596
596struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) 597/**
598 * devpts_get_priv -- get private data for a slave
599 * @pts_inode: inode of the slave
600 *
601 * Returns whatever was passed as priv in devpts_pty_new for a given inode.
602 */
603void *devpts_get_priv(struct inode *pts_inode)
597{ 604{
598 struct dentry *dentry; 605 struct dentry *dentry;
599 struct tty_struct *tty; 606 void *priv = NULL;
600 607
601 BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); 608 BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
602 609
@@ -605,18 +612,22 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
605 if (!dentry) 612 if (!dentry)
606 return NULL; 613 return NULL;
607 614
608 tty = NULL;
609 if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) 615 if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
610 tty = (struct tty_struct *)pts_inode->i_private; 616 priv = pts_inode->i_private;
611 617
612 dput(dentry); 618 dput(dentry);
613 619
614 return tty; 620 return priv;
615} 621}
616 622
617void devpts_pty_kill(struct tty_struct *tty) 623/**
624 * devpts_pty_kill -- remove inode from /dev/pts/
625 * @inode: inode of the slave to be removed
626 *
627 * This is the inverse of devpts_pty_new.
628 */
629void devpts_pty_kill(struct inode *inode)
618{ 630{
619 struct inode *inode = tty->driver_data;
620 struct super_block *sb = pts_sb_from_inode(inode); 631 struct super_block *sb = pts_sb_from_inode(inode);
621 struct dentry *root = sb->s_root; 632 struct dentry *root = sb->s_root;
622 struct dentry *dentry; 633 struct dentry *dentry;
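
The devpts conversion above changes the contract from "return an int, smuggle state through tty->driver_data" to "return the inode, with failure encoded in the pointer". A freestanding sketch of that ERR_PTR/IS_ERR convention, simplified from the kernel's <linux/err.h> versions:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static void *ERR_PTR(long err)      { return (void *)err; }
static long PTR_ERR(const void *p)  { return (long)p; }
static int IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

struct inode { int index; void *priv; };

static struct inode *pty_new(int index, void *priv)
{
	struct inode *inode = malloc(sizeof(*inode));

	if (!inode)
		return ERR_PTR(-ENOMEM);	/* error rides in the pointer */
	inode->index = index;
	inode->priv = priv;
	return inode;
}

int main(void)
{
	struct inode *inode = pty_new(0, "slave tty");

	if (IS_ERR(inode)) {
		printf("failed: %ld\n", PTR_ERR(inode));
		return 1;
	}
	printf("inode for pts/%d\n", inode->index);
	free(inode);
	return 0;
}
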
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 1897eb1b4b6a..e4242c3f8486 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,6 +1,6 @@
1menuconfig DLM 1menuconfig DLM
2 tristate "Distributed Lock Manager (DLM)" 2 tristate "Distributed Lock Manager (DLM)"
3 depends on EXPERIMENTAL && INET 3 depends on INET
4 depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) 4 depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
5 select IP_SCTP 5 select IP_SCTP
6 help 6 help
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 871c1abf6029..77c0f70f8fe8 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -337,6 +337,7 @@ enum rsb_flags {
337 RSB_NEW_MASTER2, 337 RSB_NEW_MASTER2,
338 RSB_RECOVER_CONVERT, 338 RSB_RECOVER_CONVERT,
339 RSB_RECOVER_GRANT, 339 RSB_RECOVER_GRANT,
340 RSB_RECOVER_LVB_INVAL,
340}; 341};
341 342
342static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) 343static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index b56950758188..a579f30f237d 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5393,6 +5393,13 @@ static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r,
5393 if ((lkb->lkb_nodeid == nodeid_gone) || 5393 if ((lkb->lkb_nodeid == nodeid_gone) ||
5394 dlm_is_removed(ls, lkb->lkb_nodeid)) { 5394 dlm_is_removed(ls, lkb->lkb_nodeid)) {
5395 5395
5396 /* tell recover_lvb to invalidate the lvb
5397 because a node holding EX/PW failed */
5398 if ((lkb->lkb_exflags & DLM_LKF_VALBLK) &&
5399 (lkb->lkb_grmode >= DLM_LOCK_PW)) {
5400 rsb_set_flag(r, RSB_RECOVER_LVB_INVAL);
5401 }
5402
5396 del_lkb(r, lkb); 5403 del_lkb(r, lkb);
5397 5404
5398 /* this put should free the lkb */ 5405 /* this put should free the lkb */
@@ -6025,15 +6032,18 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
6025 return error; 6032 return error;
6026} 6033}
6027 6034
6028/* The force flag allows the unlock to go ahead even if the lkb isn't granted. 6035/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't
6029 Regardless of what rsb queue the lock is on, it's removed and freed. */ 6036 granted. Regardless of what rsb queue the lock is on, it's removed and
6037 freed. The IVVALBLK flag causes the lvb on the resource to be invalidated
6038 if our lock is PW/EX (it's ignored if our granted mode is smaller.) */
6030 6039
6031static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) 6040static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
6032{ 6041{
6033 struct dlm_args args; 6042 struct dlm_args args;
6034 int error; 6043 int error;
6035 6044
6036 set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args); 6045 set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK,
6046 lkb->lkb_ua, &args);
6037 6047
6038 error = unlock_lock(ls, lkb, &args); 6048 error = unlock_lock(ls, lkb, &args);
6039 if (error == -DLM_EUNLOCK) 6049 if (error == -DLM_EUNLOCK)
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 331ea4f94efd..dd87a31bcc21 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1385,7 +1385,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1385 struct connection *con; 1385 struct connection *con;
1386 struct writequeue_entry *e; 1386 struct writequeue_entry *e;
1387 int offset = 0; 1387 int offset = 0;
1388 int users = 0;
1389 1388
1390 con = nodeid2con(nodeid, allocation); 1389 con = nodeid2con(nodeid, allocation);
1391 if (!con) 1390 if (!con)
@@ -1399,7 +1398,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1399 } else { 1398 } else {
1400 offset = e->end; 1399 offset = e->end;
1401 e->end += len; 1400 e->end += len;
1402 users = e->users++; 1401 e->users++;
1403 } 1402 }
1404 spin_unlock(&con->writequeue_lock); 1403 spin_unlock(&con->writequeue_lock);
1405 1404
@@ -1414,7 +1413,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1414 spin_lock(&con->writequeue_lock); 1413 spin_lock(&con->writequeue_lock);
1415 offset = e->end; 1414 offset = e->end;
1416 e->end += len; 1415 e->end += len;
1417 users = e->users++; 1416 e->users++;
1418 list_add_tail(&e->list, &con->writequeue); 1417 list_add_tail(&e->list, &con->writequeue);
1419 spin_unlock(&con->writequeue_lock); 1418 spin_unlock(&con->writequeue_lock);
1420 goto got_one; 1419 goto got_one;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 4a7a76e42fc3..aedea28a86a1 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -717,8 +717,14 @@ void dlm_recovered_lock(struct dlm_rsb *r)
717 * the VALNOTVALID flag if necessary, and determining the correct lvb contents 717 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
718 * based on the lvb's of the locks held on the rsb. 718 * based on the lvb's of the locks held on the rsb.
719 * 719 *
720 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it 720 * RSB_VALNOTVALID is set in two cases:
721 * was already set prior to recovery, it's not cleared, regardless of locks. 721 *
722 * 1. we are master, but not new, and we purged an EX/PW lock held by a
723 * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL)
724 *
725 * 2. we are a new master, and there are only NL/CR locks left.
726 * (We could probably improve this by only invalidating in this way when
727 * the previous master left uncleanly. VMS docs mention that.)
722 * 728 *
723 * The LVB contents are only considered for changing when this is a new master 729 * The LVB contents are only considered for changing when this is a new master
724 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with 730 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
@@ -734,6 +740,19 @@ static void recover_lvb(struct dlm_rsb *r)
734 int big_lock_exists = 0; 740 int big_lock_exists = 0;
735 int lvblen = r->res_ls->ls_lvblen; 741 int lvblen = r->res_ls->ls_lvblen;
736 742
743 if (!rsb_flag(r, RSB_NEW_MASTER2) &&
744 rsb_flag(r, RSB_RECOVER_LVB_INVAL)) {
745 /* case 1 above */
746 rsb_set_flag(r, RSB_VALNOTVALID);
747 return;
748 }
749
750 if (!rsb_flag(r, RSB_NEW_MASTER2))
751 return;
752
753 /* we are the new master, so figure out if VALNOTVALID should
754 be set, and set the rsb lvb from the best lkb available. */
755
737 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { 756 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
738 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) 757 if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
739 continue; 758 continue;
@@ -772,13 +791,10 @@ static void recover_lvb(struct dlm_rsb *r)
772 if (!lock_lvb_exists) 791 if (!lock_lvb_exists)
773 goto out; 792 goto out;
774 793
794 /* lvb is invalidated if only NL/CR locks remain */
775 if (!big_lock_exists) 795 if (!big_lock_exists)
776 rsb_set_flag(r, RSB_VALNOTVALID); 796 rsb_set_flag(r, RSB_VALNOTVALID);
777 797
778 /* don't mess with the lvb unless we're the new master */
779 if (!rsb_flag(r, RSB_NEW_MASTER2))
780 goto out;
781
782 if (!r->res_lvbptr) { 798 if (!r->res_lvbptr) {
783 r->res_lvbptr = dlm_allocate_lvb(r->res_ls); 799 r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
784 if (!r->res_lvbptr) 800 if (!r->res_lvbptr)
@@ -852,12 +868,19 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
852 if (is_master(r)) { 868 if (is_master(r)) {
853 if (rsb_flag(r, RSB_RECOVER_CONVERT)) 869 if (rsb_flag(r, RSB_RECOVER_CONVERT))
854 recover_conversion(r); 870 recover_conversion(r);
871
872 /* recover lvb before granting locks so the updated
873 lvb/VALNOTVALID is presented in the completion */
874 recover_lvb(r);
875
855 if (rsb_flag(r, RSB_NEW_MASTER2)) 876 if (rsb_flag(r, RSB_NEW_MASTER2))
856 recover_grant(r); 877 recover_grant(r);
857 recover_lvb(r);
858 count++; 878 count++;
879 } else {
880 rsb_clear_flag(r, RSB_VALNOTVALID);
859 } 881 }
860 rsb_clear_flag(r, RSB_RECOVER_CONVERT); 882 rsb_clear_flag(r, RSB_RECOVER_CONVERT);
883 rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL);
861 rsb_clear_flag(r, RSB_NEW_MASTER2); 884 rsb_clear_flag(r, RSB_NEW_MASTER2);
862 unlock_rsb(r); 885 unlock_rsb(r);
863 } 886 }
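
The recovery rework above splits lvb invalidation into two cases that are easy to lose in the diff. Reduced to pure flags (field names invented, not the DLM structures), the decision is:

#include <stdbool.h>
#include <stdio.h>

struct rsb {
	bool new_master;	/* RSB_NEW_MASTER2 */
	bool purged_big_lock;	/* RSB_RECOVER_LVB_INVAL */
	bool big_lock_exists;	/* a PW/EX lock survived recovery */
	bool valnotvalid;	/* RSB_VALNOTVALID */
};

static void recover_lvb(struct rsb *r)
{
	if (!r->new_master && r->purged_big_lock) {
		r->valnotvalid = true;	/* case 1: EX/PW holder failed */
		return;
	}
	if (!r->new_master)
		return;			/* old master, nothing purged */

	/* case 2: new master; invalid if only NL/CR locks remain */
	if (!r->big_lock_exists)
		r->valnotvalid = true;
	/* ...a real implementation would now rebuild the lvb contents */
}

int main(void)
{
	struct rsb a = { .new_master = false, .purged_big_lock = true };
	struct rsb b = { .new_master = true,  .big_lock_exists = false };

	recover_lvb(&a);
	recover_lvb(&b);
	printf("a: %d  b: %d\n", a.valnotvalid, b.valnotvalid);
	return 0;
}
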
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 7ff49852b0cb..911649a47dd5 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -503,11 +503,11 @@ static ssize_t device_write(struct file *file, const char __user *buf,
503#endif 503#endif
504 return -EINVAL; 504 return -EINVAL;
505 505
506#ifdef CONFIG_COMPAT 506 /*
507 if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN) 507 * can't compare against COMPAT/dlm_write_request32 because
508#else 508 * we don't yet know if is64bit is zero
509 */
509 if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) 510 if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN)
510#endif
511 return -EINVAL; 511 return -EINVAL;
512 512
513 kbuf = kzalloc(count + 1, GFP_NOFS); 513 kbuf = kzalloc(count + 1, GFP_NOFS);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index ea9931281557..a7b0c2dfb3db 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1935,7 +1935,7 @@ static const unsigned char filename_rev_map[256] = {
1935 * @src: Source location for the filename to encode 1935 * @src: Source location for the filename to encode
1936 * @src_size: Size of the source in bytes 1936 * @src_size: Size of the source in bytes
1937 */ 1937 */
1938void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, 1938static void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
1939 unsigned char *src, size_t src_size) 1939 unsigned char *src, size_t src_size)
1940{ 1940{
1941 size_t num_blocks; 1941 size_t num_blocks;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index 809e67d05ca3..f1ea610362c6 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -102,12 +102,12 @@ int __init ecryptfs_init_kthread(void)
102 102
103void ecryptfs_destroy_kthread(void) 103void ecryptfs_destroy_kthread(void)
104{ 104{
105 struct ecryptfs_open_req *req; 105 struct ecryptfs_open_req *req, *tmp;
106 106
107 mutex_lock(&ecryptfs_kthread_ctl.mux); 107 mutex_lock(&ecryptfs_kthread_ctl.mux);
108 ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE; 108 ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE;
109 list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list, 109 list_for_each_entry_safe(req, tmp, &ecryptfs_kthread_ctl.req_list,
110 kthread_ctl_list) { 110 kthread_ctl_list) {
111 list_del(&req->kthread_ctl_list); 111 list_del(&req->kthread_ctl_list);
112 *req->lower_file = ERR_PTR(-EIO); 112 *req->lower_file = ERR_PTR(-EIO);
113 complete(&req->done); 113 complete(&req->done);
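
Switching to list_for_each_entry_safe() matters because the loop body calls list_del() on the current entry; the plain iterator would then advance through a node that is no longer on the list. A small standalone C analogue of the pattern:

    #include <stdio.h>
    #include <stdlib.h>

    struct req { struct req *next; };

    /* Grab the successor before unlinking/freeing the current node,
     * which is exactly what the _safe iterator's extra cursor does. */
    static void drain(struct req **head)
    {
        struct req *req = *head, *tmp;

        while (req) {
            tmp = req->next;    /* the "safe" cursor */
            free(req);          /* list_del() + complete() here */
            req = tmp;
        }
        *head = NULL;
    }

    int main(void)
    {
        struct req *head = NULL;
        for (int i = 0; i < 3; i++) {
            struct req *r = malloc(sizeof(*r));
            r->next = head;
            head = r;
        }
        drain(&head);
        printf("drained\n");
        return 0;
    }
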
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index bd1d57f98f74..564a1fa34b99 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -338,7 +338,8 @@ static int ecryptfs_write_begin(struct file *file,
338 if (prev_page_end_size 338 if (prev_page_end_size
339 >= i_size_read(page->mapping->host)) { 339 >= i_size_read(page->mapping->host)) {
340 zero_user(page, 0, PAGE_CACHE_SIZE); 340 zero_user(page, 0, PAGE_CACHE_SIZE);
341 } else { 341 SetPageUptodate(page);
342 } else if (len < PAGE_CACHE_SIZE) {
342 rc = ecryptfs_decrypt_page(page); 343 rc = ecryptfs_decrypt_page(page);
343 if (rc) { 344 if (rc) {
344 printk(KERN_ERR "%s: Error decrypting " 345 printk(KERN_ERR "%s: Error decrypting "
@@ -348,8 +349,8 @@ static int ecryptfs_write_begin(struct file *file,
348 ClearPageUptodate(page); 349 ClearPageUptodate(page);
349 goto out; 350 goto out;
350 } 351 }
352 SetPageUptodate(page);
351 } 353 }
352 SetPageUptodate(page);
353 } 354 }
354 } 355 }
355 /* If creating a page or more of holes, zero them out via truncate. 356 /* If creating a page or more of holes, zero them out via truncate.
@@ -499,6 +500,13 @@ static int ecryptfs_write_end(struct file *file,
499 } 500 }
500 goto out; 501 goto out;
501 } 502 }
503 if (!PageUptodate(page)) {
504 if (copied < PAGE_CACHE_SIZE) {
505 rc = 0;
506 goto out;
507 }
508 SetPageUptodate(page);
509 }
502 /* Fills in zeros if 'to' goes beyond inode size */ 510 /* Fills in zeros if 'to' goes beyond inode size */
503 rc = fill_zeros_to_end_of_page(page, to); 511 rc = fill_zeros_to_end_of_page(page, to);
504 if (rc) { 512 if (rc) {
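
The mmap.c hunks tighten when a page may be marked up to date: write_begin only decrypts when the write will not cover the whole page, and write_end refuses to mark a partially copied, not-up-to-date page. A compact sketch of the two predicates (sizes and names simplified):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #define PAGE_SIZE_DEMO 4096

    /* write_begin: a not-uptodate page only needs the expensive
     * decrypt when the incoming write leaves part of it intact. */
    static bool needs_decrypt(bool uptodate, size_t len)
    {
        return !uptodate && len < PAGE_SIZE_DEMO;
    }

    /* write_end: a short copy into a not-uptodate page cannot be
     * accepted; returning 0 copied bytes makes the caller retry. */
    static size_t accept_copy(bool uptodate, size_t copied)
    {
        if (!uptodate && copied < PAGE_SIZE_DEMO)
            return 0;
        return copied;
    }

    int main(void)
    {
        printf("%d %zu\n", needs_decrypt(false, 100),
               accept_copy(false, 100));
        return 0;
    }
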
diff --git a/fs/eventfd.c b/fs/eventfd.c
index d81b9f654086..35470d9b96e6 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -19,6 +19,8 @@
19#include <linux/export.h> 19#include <linux/export.h>
20#include <linux/kref.h> 20#include <linux/kref.h>
21#include <linux/eventfd.h> 21#include <linux/eventfd.h>
22#include <linux/proc_fs.h>
23#include <linux/seq_file.h>
22 24
23struct eventfd_ctx { 25struct eventfd_ctx {
24 struct kref kref; 26 struct kref kref;
@@ -284,7 +286,25 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
284 return res; 286 return res;
285} 287}
286 288
289#ifdef CONFIG_PROC_FS
290static int eventfd_show_fdinfo(struct seq_file *m, struct file *f)
291{
292 struct eventfd_ctx *ctx = f->private_data;
293 int ret;
294
295 spin_lock_irq(&ctx->wqh.lock);
296 ret = seq_printf(m, "eventfd-count: %16llx\n",
297 (unsigned long long)ctx->count);
298 spin_unlock_irq(&ctx->wqh.lock);
299
300 return ret;
301}
302#endif
303
287static const struct file_operations eventfd_fops = { 304static const struct file_operations eventfd_fops = {
305#ifdef CONFIG_PROC_FS
306 .show_fdinfo = eventfd_show_fdinfo,
307#endif
288 .release = eventfd_release, 308 .release = eventfd_release,
289 .poll = eventfd_poll, 309 .poll = eventfd_poll,
290 .read = eventfd_read, 310 .read = eventfd_read,
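
With show_fdinfo wired up, the eventfd counter becomes visible from userspace. A short program that should show the new line on a kernel carrying this patch:

    #include <stdio.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = eventfd(42, 0);
        char path[64], line[128];
        FILE *f;

        if (fd < 0)
            return 1;
        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
        f = fopen(path, "r");
        while (f && fgets(line, sizeof(line), f))
            fputs(line, stdout);    /* includes an "eventfd-count:" line */
        if (f)
            fclose(f);
        close(fd);
        return 0;
    }
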
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cd96649bfe62..9fec1836057a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -38,6 +38,8 @@
38#include <asm/io.h> 38#include <asm/io.h>
39#include <asm/mman.h> 39#include <asm/mman.h>
40#include <linux/atomic.h> 40#include <linux/atomic.h>
41#include <linux/proc_fs.h>
42#include <linux/seq_file.h>
41 43
42/* 44/*
43 * LOCKING: 45 * LOCKING:
@@ -783,8 +785,34 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
783 return pollflags != -1 ? pollflags : 0; 785 return pollflags != -1 ? pollflags : 0;
784} 786}
785 787
788#ifdef CONFIG_PROC_FS
789static int ep_show_fdinfo(struct seq_file *m, struct file *f)
790{
791 struct eventpoll *ep = f->private_data;
792 struct rb_node *rbp;
793 int ret = 0;
794
795 mutex_lock(&ep->mtx);
796 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
797 struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
798
799 ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
800 epi->ffd.fd, epi->event.events,
801 (long long)epi->event.data);
802 if (ret)
803 break;
804 }
805 mutex_unlock(&ep->mtx);
806
807 return ret;
808}
809#endif
810
786/* File callbacks that implement the eventpoll file behaviour */ 811/* File callbacks that implement the eventpoll file behaviour */
787static const struct file_operations eventpoll_fops = { 812static const struct file_operations eventpoll_fops = {
813#ifdef CONFIG_PROC_FS
814 .show_fdinfo = ep_show_fdinfo,
815#endif
788 .release = ep_eventpoll_release, 816 .release = ep_eventpoll_release,
789 .poll = ep_eventpoll_poll, 817 .poll = ep_eventpoll_poll,
790 .llseek = noop_llseek, 818 .llseek = noop_llseek,
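
The epoll counterpart dumps one tfd/events/data row per registered descriptor. A similar quick test (again, the extra output lines depend on running a patched kernel):

    #include <stdio.h>
    #include <sys/epoll.h>
    #include <unistd.h>

    int main(void)
    {
        int epfd = epoll_create1(0);
        int pfd[2];
        struct epoll_event ev = { .events = EPOLLIN, .data.u64 = 0xabcd };
        char path[64], line[256];
        FILE *f;

        if (epfd < 0 || pipe(pfd))
            return 1;
        epoll_ctl(epfd, EPOLL_CTL_ADD, pfd[0], &ev);
        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", epfd);
        f = fopen(path, "r");
        while (f && fgets(line, sizeof(line), f))
            fputs(line, stdout);    /* expect a "tfd: ... events: ... data: ..." row */
        if (f)
            fclose(f);
        return 0;
    }
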
@@ -1285,7 +1313,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
1285 * otherwise we might miss an event that happens between the 1313 * otherwise we might miss an event that happens between the
1286 * f_op->poll() call and the new event set registering. 1314 * f_op->poll() call and the new event set registering.
1287 */ 1315 */
1288 epi->event.events = event->events; 1316 epi->event.events = event->events; /* need barrier below */
1289 pt._key = event->events; 1317 pt._key = event->events;
1290 epi->event.data = event->data; /* protected by mtx */ 1318 epi->event.data = event->data; /* protected by mtx */
1291 if (epi->event.events & EPOLLWAKEUP) { 1319 if (epi->event.events & EPOLLWAKEUP) {
@@ -1296,6 +1324,26 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
1296 } 1324 }
1297 1325
1298 /* 1326 /*
1327 * The following barrier has two effects:
1328 *
1329 * 1) Flush epi changes above to other CPUs. This ensures
1330 * we do not miss events from ep_poll_callback if an
1331 * event occurs immediately after we call f_op->poll().
1332 * We need this because we did not take ep->lock while
1333 * changing epi above (but ep_poll_callback does take
1334 * ep->lock).
1335 *
1336 * 2) We also need to ensure we do not miss _past_ events
1337 * when calling f_op->poll(). This barrier also
1338 * pairs with the barrier in wq_has_sleeper (see
1339 * comments for wq_has_sleeper).
1340 *
1341 * This barrier will now guarantee ep_poll_callback or f_op->poll
1342 * (or both) will notice the readiness of an item.
1343 */
1344 smp_mb();
1345
1346 /*
1299 * Get current event bits. We can safely use the file* here because 1347 * Get current event bits. We can safely use the file* here because
1300 * its usage count has been increased by the caller of this function. 1348 * its usage count has been increased by the caller of this function.
1301 */ 1349 */
diff --git a/fs/exec.c b/fs/exec.c
index 0039055b1fc6..20df02c1cc70 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -434,8 +434,9 @@ static int count(struct user_arg_ptr argv, int max)
434 if (IS_ERR(p)) 434 if (IS_ERR(p))
435 return -EFAULT; 435 return -EFAULT;
436 436
437 if (i++ >= max) 437 if (i >= max)
438 return -E2BIG; 438 return -E2BIG;
439 ++i;
439 440
440 if (fatal_signal_pending(current)) 441 if (fatal_signal_pending(current))
441 return -ERESTARTNOHAND; 442 return -ERESTARTNOHAND;
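
The count() change is small but deliberate: with the increment inside the comparison, i was bumped even by the test that fails, so it could step past max. A toy version showing the corrected bound:

    #include <stdio.h>

    #define E2BIG_DEMO 7

    /* Check first, then increment, so i stays within [0, max]. */
    static int count_sketch(int nargs, int max)
    {
        int i = 0;

        for (int n = 0; n < nargs; n++) {
            if (i >= max)
                return -E2BIG_DEMO;
            ++i;
        }
        return i;
    }

    int main(void)
    {
        printf("%d %d\n", count_sketch(3, 8), count_sketch(9, 8));
        return 0;
    }
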
@@ -1175,9 +1176,24 @@ void free_bprm(struct linux_binprm *bprm)
1175 mutex_unlock(&current->signal->cred_guard_mutex); 1176 mutex_unlock(&current->signal->cred_guard_mutex);
1176 abort_creds(bprm->cred); 1177 abort_creds(bprm->cred);
1177 } 1178 }
1179 /* If a binfmt changed the interp, free it. */
1180 if (bprm->interp != bprm->filename)
1181 kfree(bprm->interp);
1178 kfree(bprm); 1182 kfree(bprm);
1179} 1183}
1180 1184
1185int bprm_change_interp(char *interp, struct linux_binprm *bprm)
1186{
1187 /* If a binfmt changed the interp, free it first. */
1188 if (bprm->interp != bprm->filename)
1189 kfree(bprm->interp);
1190 bprm->interp = kstrdup(interp, GFP_KERNEL);
1191 if (!bprm->interp)
1192 return -ENOMEM;
1193 return 0;
1194}
1195EXPORT_SYMBOL(bprm_change_interp);
1196
1181/* 1197/*
1182 * install the new credentials for this executable 1198 * install the new credentials for this executable
1183 */ 1199 */
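
bprm_change_interp() centralizes ownership of bprm->interp: it points at bprm->filename until a binfmt rewrites it, after which the struct owns a kstrdup()ed copy that both the setter and free_bprm() know to free. A userspace analogue of the same pattern:

    #include <stdlib.h>
    #include <string.h>

    struct binprm { const char *filename; char *interp; };

    /* interp is owned only when it no longer aliases filename, so the
     * setter is safe to call repeatedly (binfmt_script and binfmt_misc
     * may each rewrite it), and teardown uses the same test. */
    static int change_interp(struct binprm *b, const char *interp)
    {
        if (b->interp != b->filename)
            free(b->interp);
        b->interp = strdup(interp);    /* kstrdup() in the kernel */
        return b->interp ? 0 : -1;
    }

    int main(void)
    {
        const char *name = "/tmp/script";
        struct binprm b = { name, (char *)name };

        change_interp(&b, "/bin/sh");
        change_interp(&b, "/usr/bin/env");
        if (b.interp != b.filename)    /* free_bprm() equivalent */
            free(b.interp);
        return 0;
    }
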
@@ -1266,14 +1282,13 @@ int prepare_binprm(struct linux_binprm *bprm)
1266 bprm->cred->egid = current_egid(); 1282 bprm->cred->egid = current_egid();
1267 1283
1268 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && 1284 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
1269 !current->no_new_privs) { 1285 !current->no_new_privs &&
1286 kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
1287 kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
1270 /* Set-uid? */ 1288 /* Set-uid? */
1271 if (mode & S_ISUID) { 1289 if (mode & S_ISUID) {
1272 if (!kuid_has_mapping(bprm->cred->user_ns, inode->i_uid))
1273 return -EPERM;
1274 bprm->per_clear |= PER_CLEAR_ON_SETID; 1290 bprm->per_clear |= PER_CLEAR_ON_SETID;
1275 bprm->cred->euid = inode->i_uid; 1291 bprm->cred->euid = inode->i_uid;
1276
1277 } 1292 }
1278 1293
1279 /* Set-gid? */ 1294 /* Set-gid? */
@@ -1283,8 +1298,6 @@ int prepare_binprm(struct linux_binprm *bprm)
1283 * executable. 1298 * executable.
1284 */ 1299 */
1285 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { 1300 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1286 if (!kgid_has_mapping(bprm->cred->user_ns, inode->i_gid))
1287 return -EPERM;
1288 bprm->per_clear |= PER_CLEAR_ON_SETID; 1301 bprm->per_clear |= PER_CLEAR_ON_SETID;
1289 bprm->cred->egid = inode->i_gid; 1302 bprm->cred->egid = inode->i_gid;
1290 } 1303 }
@@ -1349,13 +1362,17 @@ EXPORT_SYMBOL(remove_arg_zero);
1349/* 1362/*
1350 * cycle the list of binary formats handler, until one recognizes the image 1363 * cycle the list of binary formats handler, until one recognizes the image
1351 */ 1364 */
1352int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) 1365int search_binary_handler(struct linux_binprm *bprm)
1353{ 1366{
1354 unsigned int depth = bprm->recursion_depth; 1367 unsigned int depth = bprm->recursion_depth;
1355 int try,retval; 1368 int try,retval;
1356 struct linux_binfmt *fmt; 1369 struct linux_binfmt *fmt;
1357 pid_t old_pid, old_vpid; 1370 pid_t old_pid, old_vpid;
1358 1371
1372 /* This allows 4 levels of binfmt rewrites before failing hard. */
1373 if (depth > 5)
1374 return -ELOOP;
1375
1359 retval = security_bprm_check(bprm); 1376 retval = security_bprm_check(bprm);
1360 if (retval) 1377 if (retval)
1361 return retval; 1378 return retval;
@@ -1374,18 +1391,14 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1374 for (try=0; try<2; try++) { 1391 for (try=0; try<2; try++) {
1375 read_lock(&binfmt_lock); 1392 read_lock(&binfmt_lock);
1376 list_for_each_entry(fmt, &formats, lh) { 1393 list_for_each_entry(fmt, &formats, lh) {
1377 int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary; 1394 int (*fn)(struct linux_binprm *) = fmt->load_binary;
1378 if (!fn) 1395 if (!fn)
1379 continue; 1396 continue;
1380 if (!try_module_get(fmt->module)) 1397 if (!try_module_get(fmt->module))
1381 continue; 1398 continue;
1382 read_unlock(&binfmt_lock); 1399 read_unlock(&binfmt_lock);
1383 retval = fn(bprm, regs); 1400 bprm->recursion_depth = depth + 1;
1384 /* 1401 retval = fn(bprm);
1385 * Restore the depth counter to its starting value
1386 * in this call, so we don't have to rely on every
1387 * load_binary function to restore it on return.
1388 */
1389 bprm->recursion_depth = depth; 1402 bprm->recursion_depth = depth;
1390 if (retval >= 0) { 1403 if (retval >= 0) {
1391 if (depth == 0) { 1404 if (depth == 0) {
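
Moving the depth check into search_binary_handler() means the depth is bumped across each recursive load and restored afterwards, with one central -ELOOP cap instead of per-binfmt bookkeeping. A trivial recursion sketch of the cap:

    #include <stdio.h>

    #define ELOOP_DEMO 40

    /* Each nested load sees depth + 1; one check bounds the chain. */
    static int load_sketch(int depth)
    {
        if (depth > 5)
            return -ELOOP_DEMO;
        /* a script handler rewrites the interp and recurses: */
        return load_sketch(depth + 1);
    }

    int main(void)
    {
        printf("%d\n", load_sketch(0));  /* -40 once the chain exceeds the cap */
        return 0;
    }
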
@@ -1439,8 +1452,7 @@ EXPORT_SYMBOL(search_binary_handler);
1439 */ 1452 */
1440static int do_execve_common(const char *filename, 1453static int do_execve_common(const char *filename,
1441 struct user_arg_ptr argv, 1454 struct user_arg_ptr argv,
1442 struct user_arg_ptr envp, 1455 struct user_arg_ptr envp)
1443 struct pt_regs *regs)
1444{ 1456{
1445 struct linux_binprm *bprm; 1457 struct linux_binprm *bprm;
1446 struct file *file; 1458 struct file *file;
@@ -1524,7 +1536,7 @@ static int do_execve_common(const char *filename,
1524 if (retval < 0) 1536 if (retval < 0)
1525 goto out; 1537 goto out;
1526 1538
1527 retval = search_binary_handler(bprm,regs); 1539 retval = search_binary_handler(bprm);
1528 if (retval < 0) 1540 if (retval < 0)
1529 goto out; 1541 goto out;
1530 1542
@@ -1566,19 +1578,17 @@ out_ret:
1566 1578
1567int do_execve(const char *filename, 1579int do_execve(const char *filename,
1568 const char __user *const __user *__argv, 1580 const char __user *const __user *__argv,
1569 const char __user *const __user *__envp, 1581 const char __user *const __user *__envp)
1570 struct pt_regs *regs)
1571{ 1582{
1572 struct user_arg_ptr argv = { .ptr.native = __argv }; 1583 struct user_arg_ptr argv = { .ptr.native = __argv };
1573 struct user_arg_ptr envp = { .ptr.native = __envp }; 1584 struct user_arg_ptr envp = { .ptr.native = __envp };
1574 return do_execve_common(filename, argv, envp, regs); 1585 return do_execve_common(filename, argv, envp);
1575} 1586}
1576 1587
1577#ifdef CONFIG_COMPAT 1588#ifdef CONFIG_COMPAT
1578int compat_do_execve(const char *filename, 1589static int compat_do_execve(const char *filename,
1579 const compat_uptr_t __user *__argv, 1590 const compat_uptr_t __user *__argv,
1580 const compat_uptr_t __user *__envp, 1591 const compat_uptr_t __user *__envp)
1581 struct pt_regs *regs)
1582{ 1592{
1583 struct user_arg_ptr argv = { 1593 struct user_arg_ptr argv = {
1584 .is_compat = true, 1594 .is_compat = true,
@@ -1588,7 +1598,7 @@ int compat_do_execve(const char *filename,
1588 .is_compat = true, 1598 .is_compat = true,
1589 .ptr.compat = __envp, 1599 .ptr.compat = __envp,
1590 }; 1600 };
1591 return do_execve_common(filename, argv, envp, regs); 1601 return do_execve_common(filename, argv, envp);
1592} 1602}
1593#endif 1603#endif
1594 1604
@@ -1660,7 +1670,6 @@ int get_dumpable(struct mm_struct *mm)
1660 return __get_dumpable(mm->flags); 1670 return __get_dumpable(mm->flags);
1661} 1671}
1662 1672
1663#ifdef __ARCH_WANT_SYS_EXECVE
1664SYSCALL_DEFINE3(execve, 1673SYSCALL_DEFINE3(execve,
1665 const char __user *, filename, 1674 const char __user *, filename,
1666 const char __user *const __user *, argv, 1675 const char __user *const __user *, argv,
@@ -1669,7 +1678,7 @@ SYSCALL_DEFINE3(execve,
1669 struct filename *path = getname(filename); 1678 struct filename *path = getname(filename);
1670 int error = PTR_ERR(path); 1679 int error = PTR_ERR(path);
1671 if (!IS_ERR(path)) { 1680 if (!IS_ERR(path)) {
1672 error = do_execve(path->name, argv, envp, current_pt_regs()); 1681 error = do_execve(path->name, argv, envp);
1673 putname(path); 1682 putname(path);
1674 } 1683 }
1675 return error; 1684 return error;
@@ -1682,33 +1691,9 @@ asmlinkage long compat_sys_execve(const char __user * filename,
1682 struct filename *path = getname(filename); 1691 struct filename *path = getname(filename);
1683 int error = PTR_ERR(path); 1692 int error = PTR_ERR(path);
1684 if (!IS_ERR(path)) { 1693 if (!IS_ERR(path)) {
1685 error = compat_do_execve(path->name, argv, envp, 1694 error = compat_do_execve(path->name, argv, envp);
1686 current_pt_regs());
1687 putname(path); 1695 putname(path);
1688 } 1696 }
1689 return error; 1697 return error;
1690} 1698}
1691#endif 1699#endif
1692#endif
1693
1694#ifdef __ARCH_WANT_KERNEL_EXECVE
1695int kernel_execve(const char *filename,
1696 const char *const argv[],
1697 const char *const envp[])
1698{
1699 struct pt_regs *p = current_pt_regs();
1700 int ret;
1701
1702 ret = do_execve(filename,
1703 (const char __user *const __user *)argv,
1704 (const char __user *const __user *)envp, p);
1705 if (ret < 0)
1706 return ret;
1707
1708 /*
1709 * We were successful. We won't be returning to our caller, but
1710 * instead to user space by manipulating the kernel stack.
1711 */
1712 ret_from_kernel_execve(p);
1713}
1714#endif
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index b56181047751..d1f80abd8828 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -361,12 +361,12 @@ static int read_exec(struct page_collect *pcol)
361 return 0; 361 return 0;
362 362
363err: 363err:
364 if (!pcol->read_4_write) 364 if (!pcol_copy) /* Failed before ownership transfer */
365 _unlock_pcol_pages(pcol, ret, READ); 365 pcol_copy = pcol;
366 366 _unlock_pcol_pages(pcol_copy, ret, READ);
367 pcol_free(pcol); 367 pcol_free(pcol_copy);
368
369 kfree(pcol_copy); 368 kfree(pcol_copy);
369
370 return ret; 370 return ret;
371} 371}
372 372
@@ -676,8 +676,10 @@ static int write_exec(struct page_collect *pcol)
676 return 0; 676 return 0;
677 677
678err: 678err:
679 _unlock_pcol_pages(pcol, ret, WRITE); 679 if (!pcol_copy) /* Failed before ownership transfer */
680 pcol_free(pcol); 680 pcol_copy = pcol;
681 _unlock_pcol_pages(pcol_copy, ret, WRITE);
682 pcol_free(pcol_copy);
681 kfree(pcol_copy); 683 kfree(pcol_copy);
682 684
683 return ret; 685 return ret;
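
Both exofs error paths now funnel cleanup through whichever pointer owns the pages: pcol before the copy is made, pcol_copy afterwards. A loose sketch of that convention (steps and types heavily simplified):

    #include <stdlib.h>

    struct pcol { int pages; };

    /* Once *pcol has been copied, pcol_copy owns the pages, so the
     * error path must unlock and free through the current owner. */
    static int exec_sketch(struct pcol *pcol, int fail_late)
    {
        struct pcol *pcol_copy = NULL;
        int ret;

        pcol_copy = malloc(sizeof(*pcol_copy));
        if (!pcol_copy) {
            ret = -12;               /* -ENOMEM */
            goto err;
        }
        *pcol_copy = *pcol;          /* ownership transfer */
        if (fail_late) {
            ret = -5;                /* -EIO from I/O submission */
            goto err;
        }
        return 0;
    err:
        if (!pcol_copy)              /* failed before the transfer */
            pcol_copy = pcol;
        /* unlock pages and release resources via pcol_copy here */
        if (pcol_copy != pcol)
            free(pcol_copy);
        return ret;
    }

    int main(void)
    {
        struct pcol p = { 4 };
        return exec_sketch(&p, 1) == -5 ? 0 : 1;
    }
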
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 29ab099e3e08..5df4bb4aab14 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -322,10 +322,10 @@ static int export_encode_fh(struct inode *inode, struct fid *fid,
322 322
323 if (parent && (len < 4)) { 323 if (parent && (len < 4)) {
324 *max_len = 4; 324 *max_len = 4;
325 return 255; 325 return FILEID_INVALID;
326 } else if (len < 2) { 326 } else if (len < 2) {
327 *max_len = 2; 327 *max_len = 2;
328 return 255; 328 return FILEID_INVALID;
329 } 329 }
330 330
331 len = 2; 331 len = 2;
@@ -341,10 +341,21 @@ static int export_encode_fh(struct inode *inode, struct fid *fid,
341 return type; 341 return type;
342} 342}
343 343
344int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
345 int *max_len, struct inode *parent)
346{
347 const struct export_operations *nop = inode->i_sb->s_export_op;
348
349 if (nop && nop->encode_fh)
350 return nop->encode_fh(inode, fid->raw, max_len, parent);
351
352 return export_encode_fh(inode, fid, max_len, parent);
353}
354EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh);
355
344int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, 356int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
345 int connectable) 357 int connectable)
346{ 358{
347 const struct export_operations *nop = dentry->d_sb->s_export_op;
348 int error; 359 int error;
349 struct dentry *p = NULL; 360 struct dentry *p = NULL;
350 struct inode *inode = dentry->d_inode, *parent = NULL; 361 struct inode *inode = dentry->d_inode, *parent = NULL;
@@ -357,10 +368,8 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
357 */ 368 */
358 parent = p->d_inode; 369 parent = p->d_inode;
359 } 370 }
360 if (nop->encode_fh) 371
361 error = nop->encode_fh(inode, fid->raw, max_len, parent); 372 error = exportfs_encode_inode_fh(inode, fid, max_len, parent);
362 else
363 error = export_encode_fh(inode, fid, max_len, parent);
364 dput(p); 373 dput(p);
365 374
366 return error; 375 return error;
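
The exportfs change extracts an inode-only core so callers without a dentry can encode file handles; the dentry entry point becomes parent lookup plus a call into it. An illustrative reduction of the wrapper/core split (dummy types, not the kernel API):

    #include <stdio.h>

    #define FILEID_INVALID_DEMO 0xff

    struct inode { unsigned long ino; };
    struct fid   { unsigned int raw[4]; };

    /* The new inode-only core (exportfs_encode_inode_fh in the patch). */
    static int encode_inode_fh(struct inode *inode, struct fid *fid, int *max_len)
    {
        if (*max_len < 2) {
            *max_len = 2;
            return FILEID_INVALID_DEMO;
        }
        fid->raw[0] = (unsigned int)inode->ino;
        fid->raw[1] = 0;    /* generation would go here */
        *max_len = 2;
        return 1;
    }

    /* The dentry entry point shrinks to a thin wrapper; here the
     * dentry and parent lookup are elided entirely. */
    static int encode_fh(struct inode *inode, struct fid *fid, int *max_len)
    {
        return encode_inode_fh(inode, fid, max_len);
    }

    int main(void)
    {
        struct inode i = { 12 };
        struct fid f;
        int len = 4;

        printf("type=%d len=%d\n", encode_fh(&i, &f, &len), len);
        return 0;
    }
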
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index c8fff930790d..dd91264ba94f 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -296,17 +296,17 @@ static inline loff_t ext3_get_htree_eof(struct file *filp)
296 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) 296 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
297 * will be invalid once the directory was converted into a dx directory 297 * will be invalid once the directory was converted into a dx directory
298 */ 298 */
299loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin) 299loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
300{ 300{
301 struct inode *inode = file->f_mapping->host; 301 struct inode *inode = file->f_mapping->host;
302 int dx_dir = is_dx_dir(inode); 302 int dx_dir = is_dx_dir(inode);
303 loff_t htree_max = ext3_get_htree_eof(file); 303 loff_t htree_max = ext3_get_htree_eof(file);
304 304
305 if (likely(dx_dir)) 305 if (likely(dx_dir))
306 return generic_file_llseek_size(file, offset, origin, 306 return generic_file_llseek_size(file, offset, whence,
307 htree_max, htree_max); 307 htree_max, htree_max);
308 else 308 else
309 return generic_file_llseek(file, offset, origin); 309 return generic_file_llseek(file, offset, whence);
310} 310}
311 311
312/* 312/*
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 7e87e37a372a..b176d4253544 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1071,8 +1071,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
1071 * mapped. 0 in case of a HOLE. 1071 * mapped. 0 in case of a HOLE.
1072 */ 1072 */
1073 if (err > 0) { 1073 if (err > 0) {
1074 if (err > 1) 1074 WARN_ON(err > 1);
1075 WARN_ON(1);
1076 err = 0; 1075 err = 0;
1077 } 1076 }
1078 *errp = err; 1077 *errp = err;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5366393528df..6e50223b3299 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1661,9 +1661,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1661 return -ENOMEM; 1661 return -ENOMEM;
1662 } 1662 }
1663 sb->s_fs_info = sbi; 1663 sb->s_fs_info = sbi;
1664 sbi->s_mount_opt = 0;
1665 sbi->s_resuid = make_kuid(&init_user_ns, EXT3_DEF_RESUID);
1666 sbi->s_resgid = make_kgid(&init_user_ns, EXT3_DEF_RESGID);
1667 sbi->s_sb_block = sb_block; 1664 sbi->s_sb_block = sb_block;
1668 1665
1669 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); 1666 blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index c22f17021b6e..987358740cb9 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -39,22 +39,9 @@ config EXT4_USE_FOR_EXT23
39 compiled kernel size by using one file system driver for 39 compiled kernel size by using one file system driver for
40 ext2, ext3, and ext4 file systems. 40 ext2, ext3, and ext4 file systems.
41 41
42config EXT4_FS_XATTR
43 bool "Ext4 extended attributes"
44 depends on EXT4_FS
45 default y
46 help
47 Extended attributes are name:value pairs associated with inodes by
48 the kernel or by users (see the attr(5) manual page, or visit
49 <http://acl.bestbits.at/> for details).
50
51 If unsure, say N.
52
53 You need this for POSIX ACL support on ext4.
54
55config EXT4_FS_POSIX_ACL 42config EXT4_FS_POSIX_ACL
56 bool "Ext4 POSIX Access Control Lists" 43 bool "Ext4 POSIX Access Control Lists"
57 depends on EXT4_FS_XATTR 44 depends on EXT4_FS
58 select FS_POSIX_ACL 45 select FS_POSIX_ACL
59 help 46 help
60 POSIX Access Control Lists (ACLs) support permissions for users and 47 POSIX Access Control Lists (ACLs) support permissions for users and
@@ -67,7 +54,7 @@ config EXT4_FS_POSIX_ACL
67 54
68config EXT4_FS_SECURITY 55config EXT4_FS_SECURITY
69 bool "Ext4 Security Labels" 56 bool "Ext4 Security Labels"
70 depends on EXT4_FS_XATTR 57 depends on EXT4_FS
71 help 58 help
72 Security labels support alternative access control models 59 Security labels support alternative access control models
73 implemented by security modules like SELinux. This option 60 implemented by security modules like SELinux. This option
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 56fd8f865930..0310fec2ee3d 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -7,8 +7,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ 7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
10 mmp.o indirect.o 10 mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
11 xattr_trusted.o inline.o
11 12
12ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
13ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o 13ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
14ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o 14ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index d3c5b88fd89f..e6e0d988439b 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -423,8 +423,10 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
423 423
424retry: 424retry:
425 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); 425 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
426 if (IS_ERR(handle)) 426 if (IS_ERR(handle)) {
427 return PTR_ERR(handle); 427 error = PTR_ERR(handle);
428 goto release_and_out;
429 }
428 error = ext4_set_acl(handle, inode, type, acl); 430 error = ext4_set_acl(handle, inode, type, acl);
429 ext4_journal_stop(handle); 431 ext4_journal_stop(handle);
430 if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 432 if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
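
The acl.c fix routes the journal-start failure through release_and_out so the posix_acl reference taken earlier is dropped; returning PTR_ERR(handle) directly leaked it. The shape of the fix, in miniature:

    struct acl_demo { int refcount; };

    /* Every failure after the reference is taken must flow through the
     * common release label. Illustrative shape only. */
    static int set_acl_sketch(struct acl_demo *acl, int journal_fails)
    {
        int error = 0;

        acl->refcount++;          /* reference taken before the journal */
        if (journal_fails) {
            error = -30;          /* e.g. PTR_ERR(handle) == -EROFS */
            goto release_and_out;
        }
        /* ... ext4_set_acl() under the handle ... */
    release_and_out:
        acl->refcount--;          /* posix_acl_release() */
        return error;
    }

    int main(void)
    {
        struct acl_demo a = { 1 };
        return set_acl_sketch(&a, 1) == -30 && a.refcount == 1 ? 0 : 1;
    }
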
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 8e07d2a5a139..80a28b297279 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -27,23 +27,11 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/rbtree.h> 28#include <linux/rbtree.h>
29#include "ext4.h" 29#include "ext4.h"
30 30#include "xattr.h"
31static unsigned char ext4_filetype_table[] = {
32 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
33};
34 31
35static int ext4_dx_readdir(struct file *filp, 32static int ext4_dx_readdir(struct file *filp,
36 void *dirent, filldir_t filldir); 33 void *dirent, filldir_t filldir);
37 34
38static unsigned char get_dtype(struct super_block *sb, int filetype)
39{
40 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
41 (filetype >= EXT4_FT_MAX))
42 return DT_UNKNOWN;
43
44 return (ext4_filetype_table[filetype]);
45}
46
47/** 35/**
48 * Check if the given dir-inode refers to an htree-indexed directory 36 * Check if the given dir-inode refers to an htree-indexed directory
49 * (or a directory which could potentially get converted to use htree 37 * (or a directory which could potentially get converted to use htree
@@ -68,11 +56,14 @@ static int is_dx_dir(struct inode *inode)
68 * Return 0 if the directory entry is OK, and 1 if there is a problem 56 * Return 0 if the directory entry is OK, and 1 if there is a problem
69 * 57 *
70 * Note: this is the opposite of what ext2 and ext3 historically returned... 58 * Note: this is the opposite of what ext2 and ext3 historically returned...
59 *
60 * bh passed here can be an inode block or a dir data block, depending
61 * on the inode inline data flag.
71 */ 62 */
72int __ext4_check_dir_entry(const char *function, unsigned int line, 63int __ext4_check_dir_entry(const char *function, unsigned int line,
73 struct inode *dir, struct file *filp, 64 struct inode *dir, struct file *filp,
74 struct ext4_dir_entry_2 *de, 65 struct ext4_dir_entry_2 *de,
75 struct buffer_head *bh, 66 struct buffer_head *bh, char *buf, int size,
76 unsigned int offset) 67 unsigned int offset)
77{ 68{
78 const char *error_msg = NULL; 69 const char *error_msg = NULL;
@@ -85,9 +76,8 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
85 error_msg = "rec_len % 4 != 0"; 76 error_msg = "rec_len % 4 != 0";
86 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) 77 else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
87 error_msg = "rec_len is too small for name_len"; 78 error_msg = "rec_len is too small for name_len";
88 else if (unlikely(((char *) de - bh->b_data) + rlen > 79 else if (unlikely(((char *) de - buf) + rlen > size))
89 dir->i_sb->s_blocksize)) 80 error_msg = "directory entry across range";
90 error_msg = "directory entry across blocks";
91 else if (unlikely(le32_to_cpu(de->inode) > 81 else if (unlikely(le32_to_cpu(de->inode) >
92 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) 82 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
93 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
@@ -98,14 +88,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
98 ext4_error_file(filp, function, line, bh->b_blocknr, 88 ext4_error_file(filp, function, line, bh->b_blocknr,
99 "bad entry in directory: %s - offset=%u(%u), " 89 "bad entry in directory: %s - offset=%u(%u), "
100 "inode=%u, rec_len=%d, name_len=%d", 90 "inode=%u, rec_len=%d, name_len=%d",
101 error_msg, (unsigned) (offset % bh->b_size), 91 error_msg, (unsigned) (offset % size),
102 offset, le32_to_cpu(de->inode), 92 offset, le32_to_cpu(de->inode),
103 rlen, de->name_len); 93 rlen, de->name_len);
104 else 94 else
105 ext4_error_inode(dir, function, line, bh->b_blocknr, 95 ext4_error_inode(dir, function, line, bh->b_blocknr,
106 "bad entry in directory: %s - offset=%u(%u), " 96 "bad entry in directory: %s - offset=%u(%u), "
107 "inode=%u, rec_len=%d, name_len=%d", 97 "inode=%u, rec_len=%d, name_len=%d",
108 error_msg, (unsigned) (offset % bh->b_size), 98 error_msg, (unsigned) (offset % size),
109 offset, le32_to_cpu(de->inode), 99 offset, le32_to_cpu(de->inode),
110 rlen, de->name_len); 100 rlen, de->name_len);
111 101
@@ -125,6 +115,14 @@ static int ext4_readdir(struct file *filp,
125 int ret = 0; 115 int ret = 0;
126 int dir_has_error = 0; 116 int dir_has_error = 0;
127 117
118 if (ext4_has_inline_data(inode)) {
119 int has_inline_data = 1;
120 ret = ext4_read_inline_dir(filp, dirent, filldir,
121 &has_inline_data);
122 if (has_inline_data)
123 return ret;
124 }
125
128 if (is_dx_dir(inode)) { 126 if (is_dx_dir(inode)) {
129 err = ext4_dx_readdir(filp, dirent, filldir); 127 err = ext4_dx_readdir(filp, dirent, filldir);
130 if (err != ERR_BAD_DX_DIR) { 128 if (err != ERR_BAD_DX_DIR) {
@@ -221,8 +219,9 @@ revalidate:
221 while (!error && filp->f_pos < inode->i_size 219 while (!error && filp->f_pos < inode->i_size
222 && offset < sb->s_blocksize) { 220 && offset < sb->s_blocksize) {
223 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 221 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
224 if (ext4_check_dir_entry(inode, filp, de, 222 if (ext4_check_dir_entry(inode, filp, de, bh,
225 bh, offset)) { 223 bh->b_data, bh->b_size,
224 offset)) {
226 /* 225 /*
227 * On error, skip the f_pos to the next block 226 * On error, skip the f_pos to the next block
228 */ 227 */
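
Passing buf and size through __ext4_check_dir_entry() lets the same validation cover both a disk block and an inline-data window, rather than hardcoding bh->b_data and the block size. A sketch of the generalized range check (illustrative types):

    #include <stddef.h>

    struct dirent_demo { unsigned short rec_len; };

    /* Validate against an arbitrary (buf, size) window: a directory
     * block or the in-inode inline-data area both work. */
    static int entry_in_range(const char *buf, int size,
                              const struct dirent_demo *de)
    {
        return ((const char *)de - buf) + de->rec_len <= size;
    }

    int main(void)
    {
        char block[64] = {0};
        struct dirent_demo *de = (struct dirent_demo *)(block + 56);

        de->rec_len = 8;
        return entry_in_range(block, sizeof(block), de) ? 0 : 1;
    }
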
@@ -334,17 +333,17 @@ static inline loff_t ext4_get_htree_eof(struct file *filp)
334 * 333 *
335 * For non-htree, ext4_llseek already chooses the proper max offset. 334 * For non-htree, ext4_llseek already chooses the proper max offset.
336 */ 335 */
337loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin) 336loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence)
338{ 337{
339 struct inode *inode = file->f_mapping->host; 338 struct inode *inode = file->f_mapping->host;
340 int dx_dir = is_dx_dir(inode); 339 int dx_dir = is_dx_dir(inode);
341 loff_t htree_max = ext4_get_htree_eof(file); 340 loff_t htree_max = ext4_get_htree_eof(file);
342 341
343 if (likely(dx_dir)) 342 if (likely(dx_dir))
344 return generic_file_llseek_size(file, offset, origin, 343 return generic_file_llseek_size(file, offset, whence,
345 htree_max, htree_max); 344 htree_max, htree_max);
346 else 345 else
347 return ext4_llseek(file, offset, origin); 346 return ext4_llseek(file, offset, whence);
348} 347}
349 348
350/* 349/*
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3c20de1d59d0..8462eb3c33aa 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -57,6 +57,16 @@
57#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) 57#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
58#endif 58#endif
59 59
60/*
61 * Turn on EXT_DEBUG to get lots of info about extents operations.
62 */
63#define EXT_DEBUG__
64#ifdef EXT_DEBUG
65#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
66#else
67#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
68#endif
69
60#define EXT4_ERROR_INODE(inode, fmt, a...) \ 70#define EXT4_ERROR_INODE(inode, fmt, a...) \
61 ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) 71 ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
62 72
@@ -392,6 +402,7 @@ struct flex_groups {
392#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ 402#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
393#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ 403#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
394#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ 404#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
405#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
395#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 406#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
396 407
397#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ 408#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
@@ -448,28 +459,26 @@ enum {
448 EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ 459 EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
449 EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ 460 EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
450 EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ 461 EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
462 EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
451 EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ 463 EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
452}; 464};
453 465
454#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) 466/*
455#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ 467 * Since it's pretty easy to mix up bit numbers and hex values, we use a
456 printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ 468 * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
457 EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } 469 * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
458 470 * any extra space in the compiled kernel image, otherwise, the build will fail.
459/* 471 * It's important that these values are the same, since we are using
460 * Since it's pretty easy to mix up bit numbers and hex values, and we 472 * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
461 * can't do a compile-time test for ENUM values, we use a run-time 473 * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
462 * test to make sure that EXT4_XXX_FL is consistent with respect to 474 * values found in ext2, ext3 and ext4 filesystems, and of course the values
463 * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop 475 * defined in e2fsprogs.
464 * out so it won't cost any extra space in the compiled kernel image.
465 * But it's important that these values are the same, since we are
466 * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
467 * must be consistent with the values of FS_XXX_FL defined in
468 * include/linux/fs.h and the on-disk values found in ext2, ext3, and
469 * ext4 filesystems, and of course the values defined in e2fsprogs.
470 * 476 *
471 * It's not paranoia if the Murphy's Law really *is* out to get you. :-) 477 * It's not paranoia if the Murphy's Law really *is* out to get you. :-)
472 */ 478 */
479#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
480#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
481
473static inline void ext4_check_flag_values(void) 482static inline void ext4_check_flag_values(void)
474{ 483{
475 CHECK_FLAG_VALUE(SECRM); 484 CHECK_FLAG_VALUE(SECRM);
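
Replacing the printk/BUG_ON pair with BUILD_BUG_ON turns the consistency check into a compile-time failure. The same idea in plain C11, with made-up names:

    /* A mismatch between a flag's hex value and its bit number now
     * stops the build instead of BUG()ing at run time. */
    #define DEMO_SECRM_FL     0x00000001
    #define DEMO_INODE_SECRM  0
    #define DEMO_TEST_FLAG(F) (DEMO_##F##_FL == (1u << DEMO_INODE_##F))

    _Static_assert(DEMO_TEST_FLAG(SECRM), "flag value/bit mismatch");

    int main(void) { return 0; }
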
@@ -494,6 +503,7 @@ static inline void ext4_check_flag_values(void)
494 CHECK_FLAG_VALUE(EXTENTS); 503 CHECK_FLAG_VALUE(EXTENTS);
495 CHECK_FLAG_VALUE(EA_INODE); 504 CHECK_FLAG_VALUE(EA_INODE);
496 CHECK_FLAG_VALUE(EOFBLOCKS); 505 CHECK_FLAG_VALUE(EOFBLOCKS);
506 CHECK_FLAG_VALUE(INLINE_DATA);
497 CHECK_FLAG_VALUE(RESERVED); 507 CHECK_FLAG_VALUE(RESERVED);
498} 508}
499 509
@@ -811,6 +821,8 @@ struct ext4_ext_cache {
811 __u32 ec_len; /* must be 32bit to return holes */ 821 __u32 ec_len; /* must be 32bit to return holes */
812}; 822};
813 823
824#include "extents_status.h"
825
814/* 826/*
815 * fourth extended file system inode data in memory 827 * fourth extended file system inode data in memory
816 */ 828 */
@@ -833,7 +845,6 @@ struct ext4_inode_info {
833#endif 845#endif
834 unsigned long i_flags; 846 unsigned long i_flags;
835 847
836#ifdef CONFIG_EXT4_FS_XATTR
837 /* 848 /*
838 * Extended attributes can be read independently of the main file 849 * Extended attributes can be read independently of the main file
839 * data. Taking i_mutex even when reading would cause contention 850 * data. Taking i_mutex even when reading would cause contention
@@ -842,7 +853,6 @@ struct ext4_inode_info {
842 * EAs. 853 * EAs.
843 */ 854 */
844 struct rw_semaphore xattr_sem; 855 struct rw_semaphore xattr_sem;
845#endif
846 856
847 struct list_head i_orphan; /* unlinked but open inodes */ 857 struct list_head i_orphan; /* unlinked but open inodes */
848 858
@@ -888,6 +898,10 @@ struct ext4_inode_info {
888 struct list_head i_prealloc_list; 898 struct list_head i_prealloc_list;
889 spinlock_t i_prealloc_lock; 899 spinlock_t i_prealloc_lock;
890 900
901 /* extents status tree */
902 struct ext4_es_tree i_es_tree;
903 rwlock_t i_es_lock;
904
891 /* ialloc */ 905 /* ialloc */
892 ext4_group_t i_last_alloc_group; 906 ext4_group_t i_last_alloc_group;
893 907
@@ -902,6 +916,10 @@ struct ext4_inode_info {
902 /* on-disk additional length */ 916 /* on-disk additional length */
903 __u16 i_extra_isize; 917 __u16 i_extra_isize;
904 918
919 /* Indicate the inline data space. */
920 u16 i_inline_off;
921 u16 i_inline_size;
922
905#ifdef CONFIG_QUOTA 923#ifdef CONFIG_QUOTA
906 /* quota space reservation, managed internally by quota code */ 924 /* quota space reservation, managed internally by quota code */
907 qsize_t i_reserved_quota; 925 qsize_t i_reserved_quota;
@@ -1360,6 +1378,7 @@ enum {
1360 EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ 1378 EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
1361 EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read 1379 EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
1362 nolocking */ 1380 nolocking */
1381 EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
1363}; 1382};
1364 1383
1365#define EXT4_INODE_BIT_FNS(name, field, offset) \ 1384#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1481,7 +1500,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1481#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ 1500#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
1482#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ 1501#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
1483#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ 1502#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
1484#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ 1503#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
1485 1504
1486#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR 1505#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
1487#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ 1506#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1505,7 +1524,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1505 EXT4_FEATURE_INCOMPAT_EXTENTS| \ 1524 EXT4_FEATURE_INCOMPAT_EXTENTS| \
1506 EXT4_FEATURE_INCOMPAT_64BIT| \ 1525 EXT4_FEATURE_INCOMPAT_64BIT| \
1507 EXT4_FEATURE_INCOMPAT_FLEX_BG| \ 1526 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
1508 EXT4_FEATURE_INCOMPAT_MMP) 1527 EXT4_FEATURE_INCOMPAT_MMP | \
1528 EXT4_FEATURE_INCOMPAT_INLINE_DATA)
1509#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ 1529#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
1510 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ 1530 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
1511 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ 1531 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1592,6 +1612,11 @@ struct ext4_dir_entry_tail {
1592 __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ 1612 __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */
1593}; 1613};
1594 1614
1615#define EXT4_DIRENT_TAIL(block, blocksize) \
1616 ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
1617 ((blocksize) - \
1618 sizeof(struct ext4_dir_entry_tail))))
1619
1595/* 1620/*
1596 * Ext4 directory file types. Only the low 3 bits are used. The 1621 * Ext4 directory file types. Only the low 3 bits are used. The
1597 * other bits are reserved for now. 1622 * other bits are reserved for now.
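
EXT4_DIRENT_TAIL is plain pointer arithmetic: the checksum tail always occupies the last bytes of a directory block. A standalone demo of the same macro shape (the tail struct here is simplified; the real one carries rec_len and name_len fields too):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct tail_demo { uint32_t reserved_zero; uint32_t checksum; };

    /* The tail lives in the final sizeof(tail) bytes of the block,
     * wherever the block happens to be in memory. */
    #define DIRENT_TAIL_DEMO(block, blocksize) \
        ((struct tail_demo *)((char *)(block) + \
            (blocksize) - sizeof(struct tail_demo)))

    int main(void)
    {
        const size_t blocksize = 4096;
        void *block = calloc(1, blocksize);
        struct tail_demo *t = DIRENT_TAIL_DEMO(block, blocksize);

        t->checksum = 0xdeadbeef;
        printf("tail at offset %zu\n", (size_t)((char *)t - (char *)block));
        free(block);
        return 0;
    }
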
@@ -1936,14 +1961,42 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
1936extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, 1961extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
1937 struct file *, 1962 struct file *,
1938 struct ext4_dir_entry_2 *, 1963 struct ext4_dir_entry_2 *,
1939 struct buffer_head *, unsigned int); 1964 struct buffer_head *, char *, int,
1940#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ 1965 unsigned int);
1966#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
1941 unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ 1967 unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
1942 (de), (bh), (offset))) 1968 (de), (bh), (buf), (size), (offset)))
1943extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1969extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1944 __u32 minor_hash, 1970 __u32 minor_hash,
1945 struct ext4_dir_entry_2 *dirent); 1971 struct ext4_dir_entry_2 *dirent);
1946extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1972extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1973extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
1974 struct buffer_head *bh,
1975 void *buf, int buf_size,
1976 const char *name, int namelen,
1977 struct ext4_dir_entry_2 **dest_de);
1978void ext4_insert_dentry(struct inode *inode,
1979 struct ext4_dir_entry_2 *de,
1980 int buf_size,
1981 const char *name, int namelen);
1982static inline void ext4_update_dx_flag(struct inode *inode)
1983{
1984 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
1985 EXT4_FEATURE_COMPAT_DIR_INDEX))
1986 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
1987}
1988static unsigned char ext4_filetype_table[] = {
1989 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
1990};
1991
1992static inline unsigned char get_dtype(struct super_block *sb, int filetype)
1993{
1994 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
1995 (filetype >= EXT4_FT_MAX))
1996 return DT_UNKNOWN;
1997
1998 return ext4_filetype_table[filetype];
1999}
1947 2000
1948/* fsync.c */ 2001/* fsync.c */
1949extern int ext4_sync_file(struct file *, loff_t, loff_t, int); 2002extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
@@ -1994,8 +2047,23 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
1994 ext4_lblk_t, int, int *); 2047 ext4_lblk_t, int, int *);
1995struct buffer_head *ext4_bread(handle_t *, struct inode *, 2048struct buffer_head *ext4_bread(handle_t *, struct inode *,
1996 ext4_lblk_t, int, int *); 2049 ext4_lblk_t, int, int *);
2050int ext4_get_block_write(struct inode *inode, sector_t iblock,
2051 struct buffer_head *bh_result, int create);
1997int ext4_get_block(struct inode *inode, sector_t iblock, 2052int ext4_get_block(struct inode *inode, sector_t iblock,
1998 struct buffer_head *bh_result, int create); 2053 struct buffer_head *bh_result, int create);
2054int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2055 struct buffer_head *bh, int create);
2056int ext4_walk_page_buffers(handle_t *handle,
2057 struct buffer_head *head,
2058 unsigned from,
2059 unsigned to,
2060 int *partial,
2061 int (*fn)(handle_t *handle,
2062 struct buffer_head *bh));
2063int do_journal_get_write_access(handle_t *handle,
2064 struct buffer_head *bh);
2065#define FALL_BACK_TO_NONDELALLOC 1
2066#define CONVERT_INLINE_DATA 2
1999 2067
2000extern struct inode *ext4_iget(struct super_block *, unsigned long); 2068extern struct inode *ext4_iget(struct super_block *, unsigned long);
2001extern int ext4_write_inode(struct inode *, struct writeback_control *); 2069extern int ext4_write_inode(struct inode *, struct writeback_control *);
@@ -2050,6 +2118,20 @@ extern int ext4_orphan_add(handle_t *, struct inode *);
2050extern int ext4_orphan_del(handle_t *, struct inode *); 2118extern int ext4_orphan_del(handle_t *, struct inode *);
2051extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, 2119extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
2052 __u32 start_minor_hash, __u32 *next_hash); 2120 __u32 start_minor_hash, __u32 *next_hash);
2121extern int search_dir(struct buffer_head *bh,
2122 char *search_buf,
2123 int buf_size,
2124 struct inode *dir,
2125 const struct qstr *d_name,
2126 unsigned int offset,
2127 struct ext4_dir_entry_2 **res_dir);
2128extern int ext4_generic_delete_entry(handle_t *handle,
2129 struct inode *dir,
2130 struct ext4_dir_entry_2 *de_del,
2131 struct buffer_head *bh,
2132 void *entry_buf,
2133 int buf_size,
2134 int csum_size);
2053 2135
2054/* resize.c */ 2136/* resize.c */
2055extern int ext4_group_add(struct super_block *sb, 2137extern int ext4_group_add(struct super_block *sb,
@@ -2376,6 +2458,15 @@ extern void ext4_unwritten_wait(struct inode *inode);
2376extern const struct inode_operations ext4_dir_inode_operations; 2458extern const struct inode_operations ext4_dir_inode_operations;
2377extern const struct inode_operations ext4_special_inode_operations; 2459extern const struct inode_operations ext4_special_inode_operations;
2378extern struct dentry *ext4_get_parent(struct dentry *child); 2460extern struct dentry *ext4_get_parent(struct dentry *child);
2461extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
2462 struct ext4_dir_entry_2 *de,
2463 int blocksize, int csum_size,
2464 unsigned int parent_ino, int dotdot_real_len);
2465extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
2466 unsigned int blocksize);
2467extern int ext4_handle_dirty_dirent_node(handle_t *handle,
2468 struct inode *inode,
2469 struct buffer_head *bh);
2379 2470
2380/* symlink.c */ 2471/* symlink.c */
2381extern const struct inode_operations ext4_symlink_inode_operations; 2472extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2393,6 +2484,9 @@ extern int ext4_check_blockref(const char *, unsigned int,
2393 struct inode *, __le32 *, unsigned int); 2484 struct inode *, __le32 *, unsigned int);
2394 2485
2395/* extents.c */ 2486/* extents.c */
2487struct ext4_ext_path;
2488struct ext4_extent;
2489
2396extern int ext4_ext_tree_init(handle_t *handle, struct inode *); 2490extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
2397extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 2491extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
2398extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 2492extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
@@ -2410,8 +2504,27 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
2410 ssize_t len); 2504 ssize_t len);
2411extern int ext4_map_blocks(handle_t *handle, struct inode *inode, 2505extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
2412 struct ext4_map_blocks *map, int flags); 2506 struct ext4_map_blocks *map, int flags);
2507extern int ext4_ext_calc_metadata_amount(struct inode *inode,
2508 ext4_lblk_t lblocks);
2509extern int ext4_extent_tree_init(handle_t *, struct inode *);
2510extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
2511 int num,
2512 struct ext4_ext_path *path);
2513extern int ext4_can_extents_be_merged(struct inode *inode,
2514 struct ext4_extent *ex1,
2515 struct ext4_extent *ex2);
2516extern int ext4_ext_insert_extent(handle_t *, struct inode *,
2517 struct ext4_ext_path *,
2518 struct ext4_extent *, int);
2519extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
2520 struct ext4_ext_path *);
2521extern void ext4_ext_drop_refs(struct ext4_ext_path *);
2522extern int ext4_ext_check_inode(struct inode *inode);
2523extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
2413extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2524extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2414 __u64 start, __u64 len); 2525 __u64 start, __u64 len);
2526
2527
2415/* move_extent.c */ 2528/* move_extent.c */
2416extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, 2529extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2417 __u64 start_orig, __u64 start_donor, 2530 __u64 start_orig, __u64 start_donor,
@@ -2445,17 +2558,13 @@ enum ext4_state_bits {
2445 * never, ever appear in a buffer_head's state 2558 * never, ever appear in a buffer_head's state
2446 * flag. See EXT4_MAP_FROM_CLUSTER to see where 2559 * flag. See EXT4_MAP_FROM_CLUSTER to see where
2447 * this is used. */ 2560 * this is used. */
2448 BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This
2449 * flag is set when ext4_map_blocks is called on a
2450 * delayed allocated block to get its real mapping. */
2451}; 2561};
2452 2562
2453BUFFER_FNS(Uninit, uninit) 2563BUFFER_FNS(Uninit, uninit)
2454TAS_BUFFER_FNS(Uninit, uninit) 2564TAS_BUFFER_FNS(Uninit, uninit)
2455BUFFER_FNS(Da_Mapped, da_mapped)
2456 2565
2457/* 2566/*
2458 * Add new method to test wether block and inode bitmaps are properly 2567 * Add new method to test whether block and inode bitmaps are properly
2459 * initialized. With uninit_bg reading the block from disk is not enough 2568 * initialized. With uninit_bg reading the block from disk is not enough
2460 * to mark the bitmap uptodate. We need to also zero-out the bitmap 2569 * to mark the bitmap uptodate. We need to also zero-out the bitmap
2461 */ 2570 */
@@ -2503,6 +2612,4 @@ extern void ext4_resize_end(struct super_block *sb);
2503 2612
2504#endif /* __KERNEL__ */ 2613#endif /* __KERNEL__ */
2505 2614
2506#include "ext4_extents.h"
2507
2508#endif /* _EXT4_H */ 2615#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index cb1b2c919963..487fda12bc00 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,16 +43,6 @@
43#define CHECK_BINSEARCH__ 43#define CHECK_BINSEARCH__
44 44
45/* 45/*
46 * Turn on EXT_DEBUG to get lots of info about extents operations.
47 */
48#define EXT_DEBUG__
49#ifdef EXT_DEBUG
50#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
51#else
52#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
53#endif
54
55/*
56 * If EXT_STATS is defined then stats numbers are collected. 46 * If EXT_STATS is defined then stats numbers are collected.
57 * These number will be displayed at umount time. 47 * These number will be displayed at umount time.
58 */ 48 */
@@ -144,20 +134,6 @@ struct ext4_ext_path {
144 */ 134 */
145 135
146/* 136/*
147 * to be called by ext4_ext_walk_space()
148 * negative retcode - error
149 * positive retcode - signal for ext4_ext_walk_space(), see below
150 * callback must return valid extent (passed or newly created)
151 */
152typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
153 struct ext4_ext_cache *,
154 struct ext4_extent *, void *);
155
156#define EXT_CONTINUE 0
157#define EXT_BREAK 1
158#define EXT_REPEAT 2
159
160/*
161 * Maximum number of logical blocks in a file; ext4_extent's ee_block is 137 * Maximum number of logical blocks in a file; ext4_extent's ee_block is
162 * __le32. 138 * __le32.
163 */ 139 */
@@ -300,21 +276,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
300 0xffff); 276 0xffff);
301} 277}
302 278
303extern int ext4_ext_calc_metadata_amount(struct inode *inode,
304 ext4_lblk_t lblocks);
305extern int ext4_extent_tree_init(handle_t *, struct inode *);
306extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
307 int num,
308 struct ext4_ext_path *path);
309extern int ext4_can_extents_be_merged(struct inode *inode,
310 struct ext4_extent *ex1,
311 struct ext4_extent *ex2);
312extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
313extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
314 struct ext4_ext_path *);
315extern void ext4_ext_drop_refs(struct ext4_ext_path *);
316extern int ext4_ext_check_inode(struct inode *inode);
317extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
318 int search_hint_reverse);
319#endif /* _EXT4_EXTENTS */ 279#endif /* _EXT4_EXTENTS */
320 280
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 56d258c18303..7177f9b21cb2 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -254,13 +254,6 @@ static inline void ext4_handle_sync(handle_t *handle)
254 handle->h_sync = 1; 254 handle->h_sync = 1;
255} 255}
256 256
257static inline void ext4_handle_release_buffer(handle_t *handle,
258 struct buffer_head *bh)
259{
260 if (ext4_handle_valid(handle))
261 jbd2_journal_release_buffer(handle, bh);
262}
263
264static inline int ext4_handle_is_aborted(handle_t *handle) 257static inline int ext4_handle_is_aborted(handle_t *handle)
265{ 258{
266 if (ext4_handle_valid(handle)) 259 if (ext4_handle_valid(handle))
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7011ac967208..5ae1674ec12f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -41,6 +41,8 @@
41#include <asm/uaccess.h> 41#include <asm/uaccess.h>
42#include <linux/fiemap.h> 42#include <linux/fiemap.h>
43#include "ext4_jbd2.h" 43#include "ext4_jbd2.h"
44#include "ext4_extents.h"
45#include "xattr.h"
44 46
45#include <trace/events/ext4.h> 47#include <trace/events/ext4.h>
46 48
@@ -109,6 +111,9 @@ static int ext4_split_extent_at(handle_t *handle,
109 int split_flag, 111 int split_flag,
110 int flags); 112 int flags);
111 113
114static int ext4_find_delayed_extent(struct inode *inode,
115 struct ext4_ext_cache *newex);
116
112static int ext4_ext_truncate_extend_restart(handle_t *handle, 117static int ext4_ext_truncate_extend_restart(handle_t *handle,
113 struct inode *inode, 118 struct inode *inode,
114 int needed) 119 int needed)
@@ -1959,27 +1964,33 @@ cleanup:
1959 return err; 1964 return err;
1960} 1965}
1961 1966
1962static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, 1967static int ext4_fill_fiemap_extents(struct inode *inode,
1963 ext4_lblk_t num, ext_prepare_callback func, 1968 ext4_lblk_t block, ext4_lblk_t num,
1964 void *cbdata) 1969 struct fiemap_extent_info *fieinfo)
1965{ 1970{
1966 struct ext4_ext_path *path = NULL; 1971 struct ext4_ext_path *path = NULL;
1967 struct ext4_ext_cache cbex; 1972 struct ext4_ext_cache newex;
1968 struct ext4_extent *ex; 1973 struct ext4_extent *ex;
1969 ext4_lblk_t next, start = 0, end = 0; 1974 ext4_lblk_t next, next_del, start = 0, end = 0;
1970 ext4_lblk_t last = block + num; 1975 ext4_lblk_t last = block + num;
1971 int depth, exists, err = 0; 1976 int exists, depth = 0, err = 0;
1972 1977 unsigned int flags = 0;
1973 BUG_ON(func == NULL); 1978 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
1974 BUG_ON(inode == NULL);
1975 1979
1976 while (block < last && block != EXT_MAX_BLOCKS) { 1980 while (block < last && block != EXT_MAX_BLOCKS) {
1977 num = last - block; 1981 num = last - block;
1978 /* find extent for this block */ 1982 /* find extent for this block */
1979 down_read(&EXT4_I(inode)->i_data_sem); 1983 down_read(&EXT4_I(inode)->i_data_sem);
1984
1985 if (path && ext_depth(inode) != depth) {
1986 /* depth was changed. we have to realloc path */
1987 kfree(path);
1988 path = NULL;
1989 }
1990
1980 path = ext4_ext_find_extent(inode, block, path); 1991 path = ext4_ext_find_extent(inode, block, path);
1981 up_read(&EXT4_I(inode)->i_data_sem);
1982 if (IS_ERR(path)) { 1992 if (IS_ERR(path)) {
1993 up_read(&EXT4_I(inode)->i_data_sem);
1983 err = PTR_ERR(path); 1994 err = PTR_ERR(path);
1984 path = NULL; 1995 path = NULL;
1985 break; 1996 break;
@@ -1987,13 +1998,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1987 1998
1988 depth = ext_depth(inode); 1999 depth = ext_depth(inode);
1989 if (unlikely(path[depth].p_hdr == NULL)) { 2000 if (unlikely(path[depth].p_hdr == NULL)) {
2001 up_read(&EXT4_I(inode)->i_data_sem);
1990 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2002 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1991 err = -EIO; 2003 err = -EIO;
1992 break; 2004 break;
1993 } 2005 }
1994 ex = path[depth].p_ext; 2006 ex = path[depth].p_ext;
1995 next = ext4_ext_next_allocated_block(path); 2007 next = ext4_ext_next_allocated_block(path);
2008 ext4_ext_drop_refs(path);
1996 2009
2010 flags = 0;
1997 exists = 0; 2011 exists = 0;
1998 if (!ex) { 2012 if (!ex) {
1999 /* there is no extent yet, so try to allocate 2013 /* there is no extent yet, so try to allocate
@@ -2030,40 +2044,64 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
2030 BUG_ON(end <= start); 2044 BUG_ON(end <= start);
2031 2045
2032 if (!exists) { 2046 if (!exists) {
2033 cbex.ec_block = start; 2047 newex.ec_block = start;
2034 cbex.ec_len = end - start; 2048 newex.ec_len = end - start;
2035 cbex.ec_start = 0; 2049 newex.ec_start = 0;
2036 } else { 2050 } else {
2037 cbex.ec_block = le32_to_cpu(ex->ee_block); 2051 newex.ec_block = le32_to_cpu(ex->ee_block);
2038 cbex.ec_len = ext4_ext_get_actual_len(ex); 2052 newex.ec_len = ext4_ext_get_actual_len(ex);
2039 cbex.ec_start = ext4_ext_pblock(ex); 2053 newex.ec_start = ext4_ext_pblock(ex);
2054 if (ext4_ext_is_uninitialized(ex))
2055 flags |= FIEMAP_EXTENT_UNWRITTEN;
2040 } 2056 }
2041 2057
2042 if (unlikely(cbex.ec_len == 0)) { 2058 /*
2043 EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); 2059 * Find delayed extent and update newex accordingly. We call
2044 err = -EIO; 2060 * it even in !exists case to find out whether newex is the
2045 break; 2061 * last existing extent or not.
2062 */
2063 next_del = ext4_find_delayed_extent(inode, &newex);
2064 if (!exists && next_del) {
2065 exists = 1;
2066 flags |= FIEMAP_EXTENT_DELALLOC;
2046 } 2067 }
2047 err = func(inode, next, &cbex, ex, cbdata); 2068 up_read(&EXT4_I(inode)->i_data_sem);
2048 ext4_ext_drop_refs(path);
2049 2069
2050 if (err < 0) 2070 if (unlikely(newex.ec_len == 0)) {
2071 EXT4_ERROR_INODE(inode, "newex.ec_len == 0");
2072 err = -EIO;
2051 break; 2073 break;
2074 }
2052 2075
2053 if (err == EXT_REPEAT) 2076 /* This is possible iff next == next_del == EXT_MAX_BLOCKS */
2054 continue; 2077 if (next == next_del) {
2055 else if (err == EXT_BREAK) { 2078 flags |= FIEMAP_EXTENT_LAST;
2056 err = 0; 2079 if (unlikely(next_del != EXT_MAX_BLOCKS ||
2057 break; 2080 next != EXT_MAX_BLOCKS)) {
2081 EXT4_ERROR_INODE(inode,
2082 "next extent == %u, next "
2083 "delalloc extent = %u",
2084 next, next_del);
2085 err = -EIO;
2086 break;
2087 }
2058 } 2088 }
2059 2089
2060 if (ext_depth(inode) != depth) { 2090 if (exists) {
2061 /* depth was changed. we have to realloc path */ 2091 err = fiemap_fill_next_extent(fieinfo,
2062 kfree(path); 2092 (__u64)newex.ec_block << blksize_bits,
2063 path = NULL; 2093 (__u64)newex.ec_start << blksize_bits,
2094 (__u64)newex.ec_len << blksize_bits,
2095 flags);
2096 if (err < 0)
2097 break;
2098 if (err == 1) {
2099 err = 0;
2100 break;
2101 }
2064 } 2102 }
2065 2103
2066 block = cbex.ec_block + cbex.ec_len; 2104 block = newex.ec_block + newex.ec_len;
2067 } 2105 }
2068 2106
2069 if (path) { 2107 if (path) {
@@ -2156,7 +2194,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2156 struct ext4_extent *ex) 2194 struct ext4_extent *ex)
2157{ 2195{
2158 struct ext4_ext_cache *cex; 2196 struct ext4_ext_cache *cex;
2159 struct ext4_sb_info *sbi;
2160 int ret = 0; 2197 int ret = 0;
2161 2198
2162 /* 2199 /*
@@ -2164,7 +2201,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2164 */ 2201 */
2165 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2202 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2166 cex = &EXT4_I(inode)->i_cached_extent; 2203 cex = &EXT4_I(inode)->i_cached_extent;
2167 sbi = EXT4_SB(inode->i_sb);
2168 2204
2169 /* has cache valid data? */ 2205 /* has cache valid data? */
2170 if (cex->ec_len == 0) 2206 if (cex->ec_len == 0)
@@ -2190,13 +2226,14 @@ errout:
2190 * removes index from the index block. 2226 * removes index from the index block.
2191 */ 2227 */
2192static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2228static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2193 struct ext4_ext_path *path) 2229 struct ext4_ext_path *path, int depth)
2194{ 2230{
2195 int err; 2231 int err;
2196 ext4_fsblk_t leaf; 2232 ext4_fsblk_t leaf;
2197 2233
2198 /* free index block */ 2234 /* free index block */
2199 path--; 2235 depth--;
2236 path = path + depth;
2200 leaf = ext4_idx_pblock(path->p_idx); 2237 leaf = ext4_idx_pblock(path->p_idx);
2201 if (unlikely(path->p_hdr->eh_entries == 0)) { 2238 if (unlikely(path->p_hdr->eh_entries == 0)) {
2202 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2239 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
@@ -2221,6 +2258,19 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2221 2258
2222 ext4_free_blocks(handle, inode, NULL, leaf, 1, 2259 ext4_free_blocks(handle, inode, NULL, leaf, 1,
2223 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2260 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2261
2262 while (--depth >= 0) {
2263 if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2264 break;
2265 path--;
2266 err = ext4_ext_get_access(handle, inode, path);
2267 if (err)
2268 break;
2269 path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2270 err = ext4_ext_dirty(handle, inode, path);
2271 if (err)
2272 break;
2273 }
2224 return err; 2274 return err;
2225} 2275}
2226 2276
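The loop added above fixes a subtlety in ext4_ext_rm_idx(): when the freed index block was the leftmost entry at its level, the ancestors on the path still record the old starting logical block, so the child's first key must be copied upward through every leftmost parent. A minimal standalone sketch of that invariant, with illustrative names (not the kernel's):

    /* Sketch only: after the leftmost entry at one level changes,
     * every ancestor on the path that is itself a leftmost entry
     * must copy the new first key upward. */
    struct lvl {
            int first_key;      /* key the parent's index entry stores */
            int is_leftmost;    /* is this entry the first in its node? */
    };

    static void propagate_first_key(struct lvl *path, int depth, int new_key)
    {
            while (--depth >= 0) {
                    if (!path[depth].is_leftmost)
                            break;  /* higher levels are unaffected */
                    path[depth].first_key = new_key;
            }
    }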
@@ -2273,7 +2323,13 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2273int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 2323int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2274{ 2324{
2275 int index; 2325 int index;
2276 int depth = ext_depth(inode); 2326 int depth;
2327
2328 /* If we are converting the inline data, only one is needed here. */
2329 if (ext4_has_inline_data(inode))
2330 return 1;
2331
2332 depth = ext_depth(inode);
2277 2333
2278 if (chunk) 2334 if (chunk)
2279 index = depth * 2; 2335 index = depth * 2;
@@ -2557,7 +2613,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2557 /* if this leaf is free, then we should 2613 /* if this leaf is free, then we should
2558 * remove it from index block above */ 2614 * remove it from index block above */
2559 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2615 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2560 err = ext4_ext_rm_idx(handle, inode, path + depth); 2616 err = ext4_ext_rm_idx(handle, inode, path, depth);
2561 2617
2562out: 2618out:
2563 return err; 2619 return err;
@@ -2760,7 +2816,7 @@ again:
2760 /* index is empty, remove it; 2816 /* index is empty, remove it;
2761 * handle must be already prepared by the 2817 * handle must be already prepared by the
2762 * truncatei_leaf() */ 2818 * truncatei_leaf() */
2763 err = ext4_ext_rm_idx(handle, inode, path + i); 2819 err = ext4_ext_rm_idx(handle, inode, path, i);
2764 } 2820 }
2765 /* root level has p_bh == NULL, brelse() eats this */ 2821 /* root level has p_bh == NULL, brelse() eats this */
2766 brelse(path[i].p_bh); 2822 brelse(path[i].p_bh);
@@ -3461,115 +3517,34 @@ out:
3461/** 3517/**
3462 * ext4_find_delalloc_range: find delayed allocated block in the given range. 3518 * ext4_find_delalloc_range: find delayed allocated block in the given range.
3463 * 3519 *
3464 * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns 3520 * Return 1 if there is a delalloc block in the range, otherwise 0.
3465 * whether there are any buffers marked for delayed allocation. It returns '1'
3466 * on the first delalloc'ed buffer head found. If no buffer head in the given
3467 * range is marked for delalloc, it returns 0.
3468 * lblk_start should always be <= lblk_end.
3469 * search_hint_reverse is to indicate that searching in reverse from lblk_end to
3470 * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
3471 * block sooner). This is useful when blocks are truncated sequentially from
3472 * lblk_start towards lblk_end.
3473 */ 3521 */
3474static int ext4_find_delalloc_range(struct inode *inode, 3522static int ext4_find_delalloc_range(struct inode *inode,
3475 ext4_lblk_t lblk_start, 3523 ext4_lblk_t lblk_start,
3476 ext4_lblk_t lblk_end, 3524 ext4_lblk_t lblk_end)
3477 int search_hint_reverse)
3478{ 3525{
3479 struct address_space *mapping = inode->i_mapping; 3526 struct extent_status es;
3480 struct buffer_head *head, *bh = NULL;
3481 struct page *page;
3482 ext4_lblk_t i, pg_lblk;
3483 pgoff_t index;
3484 3527
3485 if (!test_opt(inode->i_sb, DELALLOC)) 3528 es.start = lblk_start;
3486 return 0; 3529 ext4_es_find_extent(inode, &es);
3487 3530 if (es.len == 0)
3488 /* reverse search wont work if fs block size is less than page size */ 3531 return 0; /* there is no delay extent in this tree */
3489 if (inode->i_blkbits < PAGE_CACHE_SHIFT) 3532 else if (es.start <= lblk_start && lblk_start < es.start + es.len)
3490 search_hint_reverse = 0; 3533 return 1;
3491 3534 else if (lblk_start <= es.start && es.start <= lblk_end)
3492 if (search_hint_reverse) 3535 return 1;
3493 i = lblk_end;
3494 else 3536 else
3495 i = lblk_start; 3537 return 0;
3496
3497 index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3498
3499 while ((i >= lblk_start) && (i <= lblk_end)) {
3500 page = find_get_page(mapping, index);
3501 if (!page)
3502 goto nextpage;
3503
3504 if (!page_has_buffers(page))
3505 goto nextpage;
3506
3507 head = page_buffers(page);
3508 if (!head)
3509 goto nextpage;
3510
3511 bh = head;
3512 pg_lblk = index << (PAGE_CACHE_SHIFT -
3513 inode->i_blkbits);
3514 do {
3515 if (unlikely(pg_lblk < lblk_start)) {
3516 /*
3517 * This is possible when fs block size is less
3518 * than page size and our cluster starts/ends in
3519 * middle of the page. So we need to skip the
3520 * initial few blocks till we reach the 'lblk'
3521 */
3522 pg_lblk++;
3523 continue;
3524 }
3525
3526 /* Check if the buffer is delayed allocated and that it
3527 * is not yet mapped. (when da-buffers are mapped during
3528 * their writeout, their da_mapped bit is set.)
3529 */
3530 if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
3531 page_cache_release(page);
3532 trace_ext4_find_delalloc_range(inode,
3533 lblk_start, lblk_end,
3534 search_hint_reverse,
3535 1, i);
3536 return 1;
3537 }
3538 if (search_hint_reverse)
3539 i--;
3540 else
3541 i++;
3542 } while ((i >= lblk_start) && (i <= lblk_end) &&
3543 ((bh = bh->b_this_page) != head));
3544nextpage:
3545 if (page)
3546 page_cache_release(page);
3547 /*
3548 * Move to next page. 'i' will be the first lblk in the next
3549 * page.
3550 */
3551 if (search_hint_reverse)
3552 index--;
3553 else
3554 index++;
3555 i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3556 }
3557
3558 trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
3559 search_hint_reverse, 0, 0);
3560 return 0;
3561} 3538}
3562 3539
3563int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, 3540int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
3564 int search_hint_reverse)
3565{ 3541{
3566 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3542 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3567 ext4_lblk_t lblk_start, lblk_end; 3543 ext4_lblk_t lblk_start, lblk_end;
3568 lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); 3544 lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
3569 lblk_end = lblk_start + sbi->s_cluster_ratio - 1; 3545 lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
3570 3546
3571 return ext4_find_delalloc_range(inode, lblk_start, lblk_end, 3547 return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
3572 search_hint_reverse);
3573} 3548}
3574 3549
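The rewritten ext4_find_delalloc_range() above replaces a page-by-page, buffer-by-buffer scan with a single extent status tree lookup followed by an interval-overlap check, which is also what makes the search_hint_reverse parameter obsolete. The overlap test, isolated as a hedged standalone sketch with illustrative names:

    /* The query range [start, end] (inclusive) overlaps a status extent
     * [es_start, es_start + es_len) iff neither lies wholly before the
     * other; es_len == 0 means no delayed extent exists at all. */
    static int delalloc_overlaps(unsigned int start, unsigned int end,
                                 unsigned int es_start, unsigned int es_len)
    {
            if (es_len == 0)
                    return 0;
            return start <= es_start + es_len - 1 && es_start <= end;
    }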
3575/** 3550/**
@@ -3630,7 +3605,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3630 lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); 3605 lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
3631 lblk_to = lblk_from + c_offset - 1; 3606 lblk_to = lblk_from + c_offset - 1;
3632 3607
3633 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) 3608 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3634 allocated_clusters--; 3609 allocated_clusters--;
3635 } 3610 }
3636 3611
@@ -3640,7 +3615,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3640 lblk_from = lblk_start + num_blks; 3615 lblk_from = lblk_start + num_blks;
3641 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; 3616 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
3642 3617
3643 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) 3618 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
3644 allocated_clusters--; 3619 allocated_clusters--;
3645 } 3620 }
3646 3621
@@ -3663,8 +3638,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3663 flags, allocated); 3638 flags, allocated);
3664 ext4_ext_show_leaf(inode, path); 3639 ext4_ext_show_leaf(inode, path);
3665 3640
3666 trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, 3641 trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
3667 newblock); 3642 allocated, newblock);
3668 3643
3669 /* get_block() before submit the IO, split the extent */ 3644 /* get_block() before submit the IO, split the extent */
3670 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3645 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
@@ -3911,7 +3886,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3911 struct ext4_extent newex, *ex, *ex2; 3886 struct ext4_extent newex, *ex, *ex2;
3912 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3887 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3913 ext4_fsblk_t newblock = 0; 3888 ext4_fsblk_t newblock = 0;
3914 int free_on_err = 0, err = 0, depth, ret; 3889 int free_on_err = 0, err = 0, depth;
3915 unsigned int allocated = 0, offset = 0; 3890 unsigned int allocated = 0, offset = 0;
3916 unsigned int allocated_clusters = 0; 3891 unsigned int allocated_clusters = 0;
3917 struct ext4_allocation_request ar; 3892 struct ext4_allocation_request ar;
@@ -3927,7 +3902,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3927 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3902 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3928 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3903 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3929 if ((sbi->s_cluster_ratio > 1) && 3904 if ((sbi->s_cluster_ratio > 1) &&
3930 ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) 3905 ext4_find_delalloc_cluster(inode, map->m_lblk))
3931 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 3906 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
3932 3907
3933 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3908 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -4007,15 +3982,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4007 ee_len, ee_start); 3982 ee_len, ee_start);
4008 goto out; 3983 goto out;
4009 } 3984 }
4010 ret = ext4_ext_handle_uninitialized_extents( 3985 allocated = ext4_ext_handle_uninitialized_extents(
4011 handle, inode, map, path, flags, 3986 handle, inode, map, path, flags,
4012 allocated, newblock); 3987 allocated, newblock);
4013 return ret; 3988 goto out3;
4014 } 3989 }
4015 } 3990 }
4016 3991
4017 if ((sbi->s_cluster_ratio > 1) && 3992 if ((sbi->s_cluster_ratio > 1) &&
4018 ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) 3993 ext4_find_delalloc_cluster(inode, map->m_lblk))
4019 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 3994 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
4020 3995
4021 /* 3996 /*
@@ -4284,8 +4259,8 @@ out2:
4284 kfree(path); 4259 kfree(path);
4285 } 4260 }
4286 4261
4287 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 4262out3:
4288 newblock, map->m_len, err ? err : allocated); 4263 trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
4289 4264
4290 return err ? err : allocated; 4265 return err ? err : allocated;
4291} 4266}
@@ -4344,6 +4319,8 @@ void ext4_ext_truncate(struct inode *inode)
4344 4319
4345 last_block = (inode->i_size + sb->s_blocksize - 1) 4320 last_block = (inode->i_size + sb->s_blocksize - 1)
4346 >> EXT4_BLOCK_SIZE_BITS(sb); 4321 >> EXT4_BLOCK_SIZE_BITS(sb);
4322 err = ext4_es_remove_extent(inode, last_block,
4323 EXT_MAX_BLOCKS - last_block);
4347 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 4324 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4348 4325
4349 /* In a multi-transaction truncate, we only make the final 4326 /* In a multi-transaction truncate, we only make the final
@@ -4434,6 +4411,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4434 if (mode & FALLOC_FL_PUNCH_HOLE) 4411 if (mode & FALLOC_FL_PUNCH_HOLE)
4435 return ext4_punch_hole(file, offset, len); 4412 return ext4_punch_hole(file, offset, len);
4436 4413
4414 ret = ext4_convert_inline_data(inode);
4415 if (ret)
4416 return ret;
4417
4437 trace_ext4_fallocate_enter(inode, offset, len, mode); 4418 trace_ext4_fallocate_enter(inode, offset, len, mode);
4438 map.m_lblk = offset >> blkbits; 4419 map.m_lblk = offset >> blkbits;
4439 /* 4420 /*
@@ -4572,206 +4553,43 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4572} 4553}
4573 4554
4574/* 4555/*
4575 * Callback function called for each extent to gather FIEMAP information. 4556 * If newex is not an existing extent (newex->ec_start equals zero), find
4557 * the delayed extent at the start of newex, update newex accordingly, and
4558 * return the start of the next delayed extent.
4559 *
4560 * If newex is an existing extent (newex->ec_start is not zero), return
4561 * the start of the next delayed extent, or EXT_MAX_BLOCKS if no delayed
4562 * extent is found. Leave newex unmodified.
4576 */ 4563 */
4577static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, 4564static int ext4_find_delayed_extent(struct inode *inode,
4578 struct ext4_ext_cache *newex, struct ext4_extent *ex, 4565 struct ext4_ext_cache *newex)
4579 void *data)
4580{ 4566{
4581 __u64 logical; 4567 struct extent_status es;
4582 __u64 physical; 4568 ext4_lblk_t next_del;
4583 __u64 length;
4584 __u32 flags = 0;
4585 int ret = 0;
4586 struct fiemap_extent_info *fieinfo = data;
4587 unsigned char blksize_bits;
4588 4569
4589 blksize_bits = inode->i_sb->s_blocksize_bits; 4570 es.start = newex->ec_block;
4590 logical = (__u64)newex->ec_block << blksize_bits; 4571 next_del = ext4_es_find_extent(inode, &es);
4591 4572
4592 if (newex->ec_start == 0) { 4573 if (newex->ec_start == 0) {
4593 /* 4574 /*
4594 * No extent in the extent tree contains block @newex->ec_start; 4575 * No extent in the extent tree contains block @newex->ec_start;
4595 * the block may therefore lie in 1) a hole or 2) a delayed extent. 4576 * the block may therefore lie in 1) a hole or 2) a delayed extent.
4596 *
4597 * Holes or delayed-extents are processed as follows.
4598 * 1. lookup dirty pages with specified range in pagecache.
4599 * If no page is got, then there is no delayed-extent and
4600 * return with EXT_CONTINUE.
4601 * 2. find the 1st mapped buffer,
4602 * 3. check if the mapped buffer is both in the request range
4603 * and a delayed buffer. If not, there is no delayed-extent,
4604 * then return.
4605 * 4. a delayed-extent is found, the extent will be collected.
4606 */ 4577 */
4607 ext4_lblk_t end = 0; 4578 if (es.len == 0)
4608 pgoff_t last_offset; 4579 /* A hole found. */
4609 pgoff_t offset; 4580 return 0;
4610 pgoff_t index;
4611 pgoff_t start_index = 0;
4612 struct page **pages = NULL;
4613 struct buffer_head *bh = NULL;
4614 struct buffer_head *head = NULL;
4615 unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
4616
4617 pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
4618 if (pages == NULL)
4619 return -ENOMEM;
4620
4621 offset = logical >> PAGE_SHIFT;
4622repeat:
4623 last_offset = offset;
4624 head = NULL;
4625 ret = find_get_pages_tag(inode->i_mapping, &offset,
4626 PAGECACHE_TAG_DIRTY, nr_pages, pages);
4627
4628 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4629 /* First time, try to find a mapped buffer. */
4630 if (ret == 0) {
4631out:
4632 for (index = 0; index < ret; index++)
4633 page_cache_release(pages[index]);
4634 /* just a hole. */
4635 kfree(pages);
4636 return EXT_CONTINUE;
4637 }
4638 index = 0;
4639
4640next_page:
4641 /* Try to find the 1st mapped buffer. */
4642 end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
4643 blksize_bits;
4644 if (!page_has_buffers(pages[index]))
4645 goto out;
4646 head = page_buffers(pages[index]);
4647 if (!head)
4648 goto out;
4649
4650 index++;
4651 bh = head;
4652 do {
4653 if (end >= newex->ec_block +
4654 newex->ec_len)
4655 /* The buffer is out of
4656 * the request range.
4657 */
4658 goto out;
4659
4660 if (buffer_mapped(bh) &&
4661 end >= newex->ec_block) {
4662 start_index = index - 1;
4663 /* get the 1st mapped buffer. */
4664 goto found_mapped_buffer;
4665 }
4666
4667 bh = bh->b_this_page;
4668 end++;
4669 } while (bh != head);
4670
4671 /* No mapped buffer in the range found in this page,
4672 * We need to look up next page.
4673 */
4674 if (index >= ret) {
4675 /* There is no page left, but we need to limit
4676 * newex->ec_len.
4677 */
4678 newex->ec_len = end - newex->ec_block;
4679 goto out;
4680 }
4681 goto next_page;
4682 } else {
4683 /*Find contiguous delayed buffers. */
4684 if (ret > 0 && pages[0]->index == last_offset)
4685 head = page_buffers(pages[0]);
4686 bh = head;
4687 index = 1;
4688 start_index = 0;
4689 }
4690
4691found_mapped_buffer:
4692 if (bh != NULL && buffer_delay(bh)) {
4693 /* 1st or contiguous delayed buffer found. */
4694 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4695 /*
4696 * 1st delayed buffer found, record
4697 * the start of extent.
4698 */
4699 flags |= FIEMAP_EXTENT_DELALLOC;
4700 newex->ec_block = end;
4701 logical = (__u64)end << blksize_bits;
4702 }
4703 /* Find contiguous delayed buffers. */
4704 do {
4705 if (!buffer_delay(bh))
4706 goto found_delayed_extent;
4707 bh = bh->b_this_page;
4708 end++;
4709 } while (bh != head);
4710
4711 for (; index < ret; index++) {
4712 if (!page_has_buffers(pages[index])) {
4713 bh = NULL;
4714 break;
4715 }
4716 head = page_buffers(pages[index]);
4717 if (!head) {
4718 bh = NULL;
4719 break;
4720 }
4721
4722 if (pages[index]->index !=
4723 pages[start_index]->index + index
4724 - start_index) {
4725 /* Blocks are not contiguous. */
4726 bh = NULL;
4727 break;
4728 }
4729 bh = head;
4730 do {
4731 if (!buffer_delay(bh))
4732 /* Delayed-extent ends. */
4733 goto found_delayed_extent;
4734 bh = bh->b_this_page;
4735 end++;
4736 } while (bh != head);
4737 }
4738 } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4739 /* a hole found. */
4740 goto out;
4741 4581
4742found_delayed_extent: 4582 if (es.start > newex->ec_block) {
4743 newex->ec_len = min(end - newex->ec_block, 4583 /* A hole found. */
4744 (ext4_lblk_t)EXT_INIT_MAX_LEN); 4584 newex->ec_len = min(es.start - newex->ec_block,
4745 if (ret == nr_pages && bh != NULL && 4585 newex->ec_len);
4746 newex->ec_len < EXT_INIT_MAX_LEN && 4586 return 0;
4747 buffer_delay(bh)) {
4748 /* Have not collected an extent and continue. */
4749 for (index = 0; index < ret; index++)
4750 page_cache_release(pages[index]);
4751 goto repeat;
4752 } 4587 }
4753 4588
4754 for (index = 0; index < ret; index++) 4589 newex->ec_len = es.start + es.len - newex->ec_block;
4755 page_cache_release(pages[index]);
4756 kfree(pages);
4757 } 4590 }
4758 4591
4759 physical = (__u64)newex->ec_start << blksize_bits; 4592 return next_del;
4760 length = (__u64)newex->ec_len << blksize_bits;
4761
4762 if (ex && ext4_ext_is_uninitialized(ex))
4763 flags |= FIEMAP_EXTENT_UNWRITTEN;
4764
4765 if (next == EXT_MAX_BLOCKS)
4766 flags |= FIEMAP_EXTENT_LAST;
4767
4768 ret = fiemap_fill_next_extent(fieinfo, logical, physical,
4769 length, flags);
4770 if (ret < 0)
4771 return ret;
4772 if (ret == 1)
4773 return EXT_BREAK;
4774 return EXT_CONTINUE;
4775} 4593}
4776/* fiemap flags we can handle specified here */ 4594/* fiemap flags we can handle specified here */
4777#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 4595#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
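With ext4_fill_fiemap_extents() and ext4_find_delayed_extent() in place, delayed-allocation ranges are reported through FIEMAP_EXTENT_DELALLOC straight from the status tree instead of the page cache. A minimal userspace caller exercising the ioctl might look like this (a sketch, not part of the patch; error handling is minimal, and FIEMAP_FLAG_SYNC is deliberately left out so delayed extents stay delayed):

    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    int main(int argc, char **argv)
    {
            unsigned int i, n = 32;
            int fd = open(argv[1], O_RDONLY);
            struct fiemap *fm = calloc(1, sizeof(*fm) +
                                       n * sizeof(struct fiemap_extent));

            fm->fm_start = 0;
            fm->fm_length = FIEMAP_MAX_OFFSET;  /* map the whole file */
            fm->fm_extent_count = n;

            if (fd >= 0 && fm && ioctl(fd, FS_IOC_FIEMAP, fm) == 0) {
                    for (i = 0; i < fm->fm_mapped_extents; i++) {
                            struct fiemap_extent *fe = &fm->fm_extents[i];

                            printf("%llu..%llu%s%s\n",
                                   (unsigned long long)fe->fe_logical,
                                   (unsigned long long)(fe->fe_logical +
                                                        fe->fe_length),
                                   (fe->fe_flags & FIEMAP_EXTENT_DELALLOC) ?
                                           " delalloc" : "",
                                   (fe->fe_flags & FIEMAP_EXTENT_LAST) ?
                                           " last" : "");
                    }
            }
            free(fm);
            if (fd >= 0)
                    close(fd);
            return 0;
    }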
@@ -4971,6 +4789,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4971 ext4_ext_invalidate_cache(inode); 4789 ext4_ext_invalidate_cache(inode);
4972 ext4_discard_preallocations(inode); 4790 ext4_discard_preallocations(inode);
4973 4791
4792 err = ext4_es_remove_extent(inode, first_block,
4793 stop_block - first_block);
4974 err = ext4_ext_remove_space(inode, first_block, stop_block - 1); 4794 err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
4975 4795
4976 ext4_ext_invalidate_cache(inode); 4796 ext4_ext_invalidate_cache(inode);
@@ -4991,12 +4811,22 @@ out_mutex:
4991 mutex_unlock(&inode->i_mutex); 4811 mutex_unlock(&inode->i_mutex);
4992 return err; 4812 return err;
4993} 4813}
4814
4994int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4815int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4995 __u64 start, __u64 len) 4816 __u64 start, __u64 len)
4996{ 4817{
4997 ext4_lblk_t start_blk; 4818 ext4_lblk_t start_blk;
4998 int error = 0; 4819 int error = 0;
4999 4820
4821 if (ext4_has_inline_data(inode)) {
4822 int has_inline = 1;
4823
4824 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
4825
4826 if (has_inline)
4827 return error;
4828 }
4829
5000 /* fallback to generic here if not in extents fmt */ 4830 /* fallback to generic here if not in extents fmt */
5001 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4831 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5002 return generic_block_fiemap(inode, fieinfo, start, len, 4832 return generic_block_fiemap(inode, fieinfo, start, len,
@@ -5018,11 +4848,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5018 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; 4848 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5019 4849
5020 /* 4850 /*
5021 * Walk the extent tree gathering extent information. 4851 * Walk the extent tree gathering extent information
5022 * ext4_ext_fiemap_cb will push extents back to user. 4852 * and pushing extents back to the user.
5023 */ 4853 */
5024 error = ext4_ext_walk_space(inode, start_blk, len_blks, 4854 error = ext4_fill_fiemap_extents(inode, start_blk,
5025 ext4_ext_fiemap_cb, fieinfo); 4855 len_blks, fieinfo);
5026 } 4856 }
5027 4857
5028 return error; 4858 return error;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
new file mode 100644
index 000000000000..564d981a2fcc
--- /dev/null
+++ b/fs/ext4/extents_status.c
@@ -0,0 +1,500 @@
1/*
2 * fs/ext4/extents_status.c
3 *
4 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
5 * Modified by
6 * Allison Henderson <achender@linux.vnet.ibm.com>
7 * Hugh Dickins <hughd@google.com>
8 * Zheng Liu <wenqing.lz@taobao.com>
9 *
10 * Ext4 extents status tree core functions.
11 */
12#include <linux/rbtree.h>
13#include "ext4.h"
14#include "extents_status.h"
15#include "ext4_extents.h"
16
17#include <trace/events/ext4.h>
18
19/*
20 * According to previous discussion at the Ext4 Developer Workshop, we
21 * will introduce a new structure called an io tree to track all extent
22 * status in order to solve some problems that we have met
23 * (e.g. the reservation space warning), and to provide extent-level locking.
24 * The delay extent tree is the first step to achieve this goal. It was
25 * originally built by Yongqiang Yang. At that time it was called the
26 * delay extent tree, whose only goal was to track delay extents in
27 * memory to simplify the implementation of fiemap and bigalloc, and to
28 * introduce lseek SEEK_DATA/SEEK_HOLE support. That is why it is still
29 * called the delay extent tree in the comments below. But to better
30 * convey what it does, it has been renamed the extent status tree.
31 *
32 * The first step has now been done. All delay extents are
33 * tracked in the tree. The tree is updated when a delayed
34 * allocation is issued and when a delay extent is written out or
35 * invalidated. Therefore the implementations of fiemap and bigalloc
36 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
37 *
38 * The comment below describes the implementation of the extent
39 * status tree and future work.
40 */
41
42/*
43 * extents status tree implementation for ext4.
44 *
45 *
46 * ==========================================================================
47 * Extents status encompass delayed extents and extent locks
48 *
49 * 1. Why delayed extent implementation ?
50 *
51 * Without delayed extents, ext4 identifies a delayed extent by looking
52 * up the page cache; this has several deficiencies - complicated, buggy,
53 * and inefficient code.
54 *
55 * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
56 * to know if a block or a range of blocks belongs to a delayed
57 * extent.
58 *
59 * Let us have a look at how they work without a delayed extents implementation.
60 * -- FIEMAP
61 * FIEMAP looks up page cache to identify delayed allocations from holes.
62 *
63 * -- SEEK_HOLE/DATA
64 * SEEK_HOLE/DATA has the same problem as FIEMAP.
65 *
66 * -- bigalloc
67 * bigalloc looks up the page cache to figure out whether a block
68 * is already under delayed allocation, in order to determine whether
69 * a quota reservation is needed for the cluster.
70 *
71 * -- punch hole
72 * punch hole looks up page cache to identify a delayed extent.
73 *
74 * -- writeout
75 * Writeout looks up the whole page cache to see if a buffer is
76 * mapped; if there are not very many delayed buffers, this is
77 * time consuming.
78 *
79 * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
80 * bigalloc and writeout can figure out if a block or a range of
81 * blocks is under delayed allocation (i.e. belongs to a delayed extent) or
82 * not by searching the delayed extent tree.
83 *
84 *
85 * ==========================================================================
86 * 2. ext4 delayed extents implementation
87 *
88 * -- delayed extent
89 * A delayed extent is a range of blocks which are contiguous
90 * logically and under delayed allocation. Unlike an extent in
91 * ext4, a delayed extent is an in-memory struct; there is
92 * no corresponding on-disk data. There is no limit on the length of
93 * a delayed extent, so a delayed extent can contain as many blocks
94 * as are logically contiguous.
95 *
96 * -- delayed extent tree
97 * Every inode has a delayed extent tree and all under delayed
98 * allocation blocks are added to the tree as delayed extents.
99 * Delayed extents in the tree are ordered by logical block no.
100 *
101 * -- operations on a delayed extent tree
102 * There are three operations on a delayed extent tree: finding the
103 * next delayed extent, adding a space (a range of blocks), and removing
104 * a space.
105 *
106 * -- race on a delayed extent tree
107 * A delayed extent tree is protected by inode->i_es_lock.
108 *
109 *
110 * ==========================================================================
111 * 3. performance analysis
112 * -- overhead
113 * 1. There is a cached extent for write access, so if writes are
114 * not very random, adding space operations take O(1) time.
115 *
116 * -- gain
117 * 2. Code is much simpler, more readable, more maintainable and
118 * more efficient.
119 *
120 *
121 * ==========================================================================
122 * 4. TODO list
123 * -- Track all extent status
124 *
125 * -- Improve get block process
126 *
127 * -- Extent-level locking
128 */
129
130static struct kmem_cache *ext4_es_cachep;
131
132int __init ext4_init_es(void)
133{
134 ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
135 if (ext4_es_cachep == NULL)
136 return -ENOMEM;
137 return 0;
138}
139
140void ext4_exit_es(void)
141{
142 if (ext4_es_cachep)
143 kmem_cache_destroy(ext4_es_cachep);
144}
145
146void ext4_es_init_tree(struct ext4_es_tree *tree)
147{
148 tree->root = RB_ROOT;
149 tree->cache_es = NULL;
150}
151
152#ifdef ES_DEBUG__
153static void ext4_es_print_tree(struct inode *inode)
154{
155 struct ext4_es_tree *tree;
156 struct rb_node *node;
157
158 printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
159 tree = &EXT4_I(inode)->i_es_tree;
160 node = rb_first(&tree->root);
161 while (node) {
162 struct extent_status *es;
163 es = rb_entry(node, struct extent_status, rb_node);
164 printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
165 node = rb_next(node);
166 }
167 printk(KERN_DEBUG "\n");
168}
169#else
170#define ext4_es_print_tree(inode)
171#endif
172
173static inline ext4_lblk_t extent_status_end(struct extent_status *es)
174{
175 BUG_ON(es->start + es->len < es->start);
176 return es->start + es->len - 1;
177}
178
179/*
180 * search through the tree for a delayed extent with a given offset. If
181 * it can't be found, try to find the next extent.
182 */
183static struct extent_status *__es_tree_search(struct rb_root *root,
184 ext4_lblk_t offset)
185{
186 struct rb_node *node = root->rb_node;
187 struct extent_status *es = NULL;
188
189 while (node) {
190 es = rb_entry(node, struct extent_status, rb_node);
191 if (offset < es->start)
192 node = node->rb_left;
193 else if (offset > extent_status_end(es))
194 node = node->rb_right;
195 else
196 return es;
197 }
198
199 if (es && offset < es->start)
200 return es;
201
202 if (es && offset > extent_status_end(es)) {
203 node = rb_next(&es->rb_node);
204 return node ? rb_entry(node, struct extent_status, rb_node) :
205 NULL;
206 }
207
208 return NULL;
209}
210
211/*
212 * ext4_es_find_extent: find the 1st delayed extent covering @es->start
213 * if it exists, otherwise, the next extent after @es->start.
214 *
215 * @inode: the inode which owns delayed extents
216 * @es: delayed extent that we found
217 *
218 * Returns the first block of the next extent after es, or
219 * EXT_MAX_BLOCKS if no delayed extent is found.
220 * The delayed extent is returned via @es.
221 */
222ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
223{
224 struct ext4_es_tree *tree = NULL;
225 struct extent_status *es1 = NULL;
226 struct rb_node *node;
227 ext4_lblk_t ret = EXT_MAX_BLOCKS;
228
229 trace_ext4_es_find_extent_enter(inode, es->start);
230
231 read_lock(&EXT4_I(inode)->i_es_lock);
232 tree = &EXT4_I(inode)->i_es_tree;
233
234 /* first try to find the delay extent in the cache */
235 if (tree->cache_es) {
236 es1 = tree->cache_es;
237 if (in_range(es->start, es1->start, es1->len)) {
238 es_debug("%u cached by [%u/%u)\n",
239 es->start, es1->start, es1->len);
240 goto out;
241 }
242 }
243
244 es->len = 0;
245 es1 = __es_tree_search(&tree->root, es->start);
246
247out:
248 if (es1) {
249 tree->cache_es = es1;
250 es->start = es1->start;
251 es->len = es1->len;
252 node = rb_next(&es1->rb_node);
253 if (node) {
254 es1 = rb_entry(node, struct extent_status, rb_node);
255 ret = es1->start;
256 }
257 }
258
259 read_unlock(&EXT4_I(inode)->i_es_lock);
260
261 trace_ext4_es_find_extent_exit(inode, es, ret);
262 return ret;
263}
264
265static struct extent_status *
266ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
267{
268 struct extent_status *es;
269 es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
270 if (es == NULL)
271 return NULL;
272 es->start = start;
273 es->len = len;
274 return es;
275}
276
277static void ext4_es_free_extent(struct extent_status *es)
278{
279 kmem_cache_free(ext4_es_cachep, es);
280}
281
282static struct extent_status *
283ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
284{
285 struct extent_status *es1;
286 struct rb_node *node;
287
288 node = rb_prev(&es->rb_node);
289 if (!node)
290 return es;
291
292 es1 = rb_entry(node, struct extent_status, rb_node);
293 if (es->start == extent_status_end(es1) + 1) {
294 es1->len += es->len;
295 rb_erase(&es->rb_node, &tree->root);
296 ext4_es_free_extent(es);
297 es = es1;
298 }
299
300 return es;
301}
302
303static struct extent_status *
304ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
305{
306 struct extent_status *es1;
307 struct rb_node *node;
308
309 node = rb_next(&es->rb_node);
310 if (!node)
311 return es;
312
313 es1 = rb_entry(node, struct extent_status, rb_node);
314 if (es1->start == extent_status_end(es) + 1) {
315 es->len += es1->len;
316 rb_erase(node, &tree->root);
317 ext4_es_free_extent(es1);
318 }
319
320 return es;
321}
322
323static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
324 ext4_lblk_t len)
325{
326 struct rb_node **p = &tree->root.rb_node;
327 struct rb_node *parent = NULL;
328 struct extent_status *es;
329 ext4_lblk_t end = offset + len - 1;
330
331 BUG_ON(end < offset);
332 es = tree->cache_es;
333 if (es && offset == (extent_status_end(es) + 1)) {
334 es_debug("cached by [%u/%u)\n", es->start, es->len);
335 es->len += len;
336 es = ext4_es_try_to_merge_right(tree, es);
337 goto out;
338 } else if (es && es->start == end + 1) {
339 es_debug("cached by [%u/%u)\n", es->start, es->len);
340 es->start = offset;
341 es->len += len;
342 es = ext4_es_try_to_merge_left(tree, es);
343 goto out;
344 } else if (es && es->start <= offset &&
345 end <= extent_status_end(es)) {
346 es_debug("cached by [%u/%u)\n", es->start, es->len);
347 goto out;
348 }
349
350 while (*p) {
351 parent = *p;
352 es = rb_entry(parent, struct extent_status, rb_node);
353
354 if (offset < es->start) {
355 if (es->start == end + 1) {
356 es->start = offset;
357 es->len += len;
358 es = ext4_es_try_to_merge_left(tree, es);
359 goto out;
360 }
361 p = &(*p)->rb_left;
362 } else if (offset > extent_status_end(es)) {
363 if (offset == extent_status_end(es) + 1) {
364 es->len += len;
365 es = ext4_es_try_to_merge_right(tree, es);
366 goto out;
367 }
368 p = &(*p)->rb_right;
369 } else {
370 if (extent_status_end(es) <= end)
371 es->len = offset - es->start + len;
372 goto out;
373 }
374 }
375
376 es = ext4_es_alloc_extent(offset, len);
377 if (!es)
378 return -ENOMEM;
379 rb_link_node(&es->rb_node, parent, p);
380 rb_insert_color(&es->rb_node, &tree->root);
381
382out:
383 tree->cache_es = es;
384 return 0;
385}
386
387/*
388 * ext4_es_insert_extent() adds a space to a delayed extent tree.
389 * The tree is protected by inode->i_es_lock, which is taken internally.
390 *
391 * ext4_es_insert_extent is called by ext4_da_write_begin and
392 * ext4_es_remove_extent.
393 *
394 * Return 0 on success, error code on failure.
395 */
396int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
397 ext4_lblk_t len)
398{
399 struct ext4_es_tree *tree;
400 int err = 0;
401
402 trace_ext4_es_insert_extent(inode, offset, len);
403 es_debug("add [%u/%u) to extent status tree of inode %lu\n",
404 offset, len, inode->i_ino);
405
406 write_lock(&EXT4_I(inode)->i_es_lock);
407 tree = &EXT4_I(inode)->i_es_tree;
408 err = __es_insert_extent(tree, offset, len);
409 write_unlock(&EXT4_I(inode)->i_es_lock);
410
411 ext4_es_print_tree(inode);
412
413 return err;
414}
415
416/*
417 * ext4_es_remove_extent() removes a space from a delayed extent tree.
418 * The tree is protected by inode->i_es_lock, which is taken internally.
419 *
420 * Return 0 on success, error code on failure.
421 */
422int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
423 ext4_lblk_t len)
424{
425 struct rb_node *node;
426 struct ext4_es_tree *tree;
427 struct extent_status *es;
428 struct extent_status orig_es;
429 ext4_lblk_t len1, len2, end;
430 int err = 0;
431
432 trace_ext4_es_remove_extent(inode, offset, len);
433 es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
434 offset, len, inode->i_ino);
435
436 end = offset + len - 1;
437 BUG_ON(end < offset);
438 write_lock(&EXT4_I(inode)->i_es_lock);
439 tree = &EXT4_I(inode)->i_es_tree;
440 es = __es_tree_search(&tree->root, offset);
441 if (!es)
442 goto out;
443 if (es->start > end)
444 goto out;
445
446 /* Simply invalidate cache_es. */
447 tree->cache_es = NULL;
448
449 orig_es.start = es->start;
450 orig_es.len = es->len;
451 len1 = offset > es->start ? offset - es->start : 0;
452 len2 = extent_status_end(es) > end ?
453 extent_status_end(es) - end : 0;
454 if (len1 > 0)
455 es->len = len1;
456 if (len2 > 0) {
457 if (len1 > 0) {
458 err = __es_insert_extent(tree, end + 1, len2);
459 if (err) {
460 es->start = orig_es.start;
461 es->len = orig_es.len;
462 goto out;
463 }
464 } else {
465 es->start = end + 1;
466 es->len = len2;
467 }
468 goto out;
469 }
470
471 if (len1 > 0) {
472 node = rb_next(&es->rb_node);
473 if (node)
474 es = rb_entry(node, struct extent_status, rb_node);
475 else
476 es = NULL;
477 }
478
479 while (es && extent_status_end(es) <= end) {
480 node = rb_next(&es->rb_node);
481 rb_erase(&es->rb_node, &tree->root);
482 ext4_es_free_extent(es);
483 if (!node) {
484 es = NULL;
485 break;
486 }
487 es = rb_entry(node, struct extent_status, rb_node);
488 }
489
490 if (es && es->start < end + 1) {
491 len1 = extent_status_end(es) - end;
492 es->start = end + 1;
493 es->len = len1;
494 }
495
496out:
497 write_unlock(&EXT4_I(inode)->i_es_lock);
498 ext4_es_print_tree(inode);
499 return err;
500}
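Taken together, the three exported entry points form a small API for the rest of ext4. As a hedged sketch of how a caller would use it (the function below is illustrative, not part of the patch), delayed-allocation bookkeeping for a single block looks roughly like:

    /* Illustrative caller: record one delayed block at logical block
     * lblk, query it back, then drop it again at writeout time. */
    static int example_da_bookkeeping(struct inode *inode, ext4_lblk_t lblk)
    {
            struct extent_status es;
            int err;

            err = ext4_es_insert_extent(inode, lblk, 1);
            if (err)
                    return err;

            es.start = lblk;
            ext4_es_find_extent(inode, &es);
            if (es.len && in_range(lblk, es.start, es.len))
                    ;       /* lblk is under delayed allocation */

            return ext4_es_remove_extent(inode, lblk, 1);
    }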
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
new file mode 100644
index 000000000000..077f82db092a
--- /dev/null
+++ b/fs/ext4/extents_status.h
@@ -0,0 +1,45 @@
1/*
2 * fs/ext4/extents_status.h
3 *
4 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
5 * Modified by
6 * Allison Henderson <achender@linux.vnet.ibm.com>
7 * Zheng Liu <wenqing.lz@taobao.com>
8 *
9 */
10
11#ifndef _EXT4_EXTENTS_STATUS_H
12#define _EXT4_EXTENTS_STATUS_H
13
14/*
15 * Turn on ES_DEBUG__ to get lots of info about extent status operations.
16 */
17#ifdef ES_DEBUG__
18#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
19#else
20#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
21#endif
22
23struct extent_status {
24 struct rb_node rb_node;
25 ext4_lblk_t start; /* first block extent covers */
26 ext4_lblk_t len; /* length of extent in blocks */
27};
28
29struct ext4_es_tree {
30 struct rb_root root;
31 struct extent_status *cache_es; /* recently accessed extent */
32};
33
34extern int __init ext4_init_es(void);
35extern void ext4_exit_es(void);
36extern void ext4_es_init_tree(struct ext4_es_tree *tree);
37
38extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start,
39 ext4_lblk_t len);
40extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start,
41 ext4_lblk_t len);
42extern ext4_lblk_t ext4_es_find_extent(struct inode *inode,
43 struct extent_status *es);
44
45#endif /* _EXT4_EXTENTS_STATUS_H */
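Since extent_status embeds its rb_node, tree code recovers the containing struct with rb_entry(), i.e. container_of(). An in-order walk over a tree, mirroring what the ES_DEBUG__ helper in extents_status.c prints, would look like this sketch:

    #include <linux/rbtree.h>

    static void walk_es_tree(struct ext4_es_tree *tree)
    {
            struct rb_node *node;

            for (node = rb_first(&tree->root); node; node = rb_next(node)) {
                    struct extent_status *es;

                    es = rb_entry(node, struct extent_status, rb_node);
                    /* blocks es->start .. es->start + es->len - 1
                     * are under delayed allocation */
            }
    }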
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bf3966bccd34..405565a62277 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -24,6 +24,7 @@
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/path.h> 25#include <linux/path.h>
26#include <linux/quotaops.h> 26#include <linux/quotaops.h>
27#include <linux/pagevec.h>
27#include "ext4.h" 28#include "ext4.h"
28#include "ext4_jbd2.h" 29#include "ext4_jbd2.h"
29#include "xattr.h" 30#include "xattr.h"
@@ -107,14 +108,6 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
107 108
108 /* Unaligned direct AIO must be serialized; see comment above */ 109 /* Unaligned direct AIO must be serialized; see comment above */
109 if (unaligned_aio) { 110 if (unaligned_aio) {
110 static unsigned long unaligned_warn_time;
111
112 /* Warn about this once per day */
113 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
114 ext4_msg(inode->i_sb, KERN_WARNING,
115 "Unaligned AIO/DIO on inode %ld by %s; "
116 "performance will be poor.",
117 inode->i_ino, current->comm);
118 mutex_lock(ext4_aio_mutex(inode)); 111 mutex_lock(ext4_aio_mutex(inode));
119 ext4_unwritten_wait(inode); 112 ext4_unwritten_wait(inode);
120 } 113 }
@@ -286,11 +279,329 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
286} 279}
287 280
288/* 281/*
282 * Here we use ext4_map_blocks() to get a block mapping for an extent-based
283 * file rather than ext4_ext_walk_space() because this lets us handle
284 * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same
285 * function. When the extent status tree has been fully implemented, it will
286 * track all extent status for a file and we can use it directly to
287 * retrieve the offset for SEEK_DATA/SEEK_HOLE.
288 */
289
290/*
291 * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we need to
292 * look up the page cache to check whether there is any data between
293 * [startoff, endoff] because, if this range contains an unwritten extent,
294 * we treat the extent as data or as a hole according to whether the
295 * page cache has data or not.
296 */
297static int ext4_find_unwritten_pgoff(struct inode *inode,
298 int whence,
299 struct ext4_map_blocks *map,
300 loff_t *offset)
301{
302 struct pagevec pvec;
303 unsigned int blkbits;
304 pgoff_t index;
305 pgoff_t end;
306 loff_t endoff;
307 loff_t startoff;
308 loff_t lastoff;
309 int found = 0;
310
311 blkbits = inode->i_sb->s_blocksize_bits;
312 startoff = *offset;
313 lastoff = startoff;
314 endoff = (map->m_lblk + map->m_len) << blkbits;
315
316 index = startoff >> PAGE_CACHE_SHIFT;
317 end = endoff >> PAGE_CACHE_SHIFT;
318
319 pagevec_init(&pvec, 0);
320 do {
321 int i, num;
322 unsigned long nr_pages;
323
324 num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
325 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
326 (pgoff_t)num);
327 if (nr_pages == 0) {
328 if (whence == SEEK_DATA)
329 break;
330
331 BUG_ON(whence != SEEK_HOLE);
332 /*
333 * If this is the first pass through the loop and the
334 * offset is not beyond the end offset, there is a
335 * hole at this offset
336 */
337 if (lastoff == startoff || lastoff < endoff)
338 found = 1;
339 break;
340 }
341
342 /*
343 * If this is the first pass through the loop and the
344 * offset is smaller than the first page offset, there is a
345 * hole at this offset.
346 */
347 if (lastoff == startoff && whence == SEEK_HOLE &&
348 lastoff < page_offset(pvec.pages[0])) {
349 found = 1;
350 break;
351 }
352
353 for (i = 0; i < nr_pages; i++) {
354 struct page *page = pvec.pages[i];
355 struct buffer_head *bh, *head;
356
357 /*
358 * If the current offset is not beyond the end of the given
359 * range, this is a hole.
360 */
361 if (lastoff < endoff && whence == SEEK_HOLE &&
362 page->index > end) {
363 found = 1;
364 *offset = lastoff;
365 goto out;
366 }
367
368 lock_page(page);
369
370 if (unlikely(page->mapping != inode->i_mapping)) {
371 unlock_page(page);
372 continue;
373 }
374
375 if (!page_has_buffers(page)) {
376 unlock_page(page);
377 continue;
378 }
379
380 if (page_has_buffers(page)) {
381 lastoff = page_offset(page);
382 bh = head = page_buffers(page);
383 do {
384 if (buffer_uptodate(bh) ||
385 buffer_unwritten(bh)) {
386 if (whence == SEEK_DATA)
387 found = 1;
388 } else {
389 if (whence == SEEK_HOLE)
390 found = 1;
391 }
392 if (found) {
393 *offset = max_t(loff_t,
394 startoff, lastoff);
395 unlock_page(page);
396 goto out;
397 }
398 lastoff += bh->b_size;
399 bh = bh->b_this_page;
400 } while (bh != head);
401 }
402
403 lastoff = page_offset(page) + PAGE_SIZE;
404 unlock_page(page);
405 }
406
407 /*
408 * Fewer pages were returned than desired, so there is a
409 * hole in the range.
410 */
411 if (nr_pages < num && whence == SEEK_HOLE) {
412 found = 1;
413 *offset = lastoff;
414 break;
415 }
416
417 index = pvec.pages[i - 1]->index + 1;
418 pagevec_release(&pvec);
419 } while (index <= end);
420
421out:
422 pagevec_release(&pvec);
423 return found;
424}
425
426/*
427 * ext4_seek_data() retrieves the offset for SEEK_DATA.
428 */
429static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
430{
431 struct inode *inode = file->f_mapping->host;
432 struct ext4_map_blocks map;
433 struct extent_status es;
434 ext4_lblk_t start, last, end;
435 loff_t dataoff, isize;
436 int blkbits;
437 int ret = 0;
438
439 mutex_lock(&inode->i_mutex);
440
441 isize = i_size_read(inode);
442 if (offset >= isize) {
443 mutex_unlock(&inode->i_mutex);
444 return -ENXIO;
445 }
446
447 blkbits = inode->i_sb->s_blocksize_bits;
448 start = offset >> blkbits;
449 last = start;
450 end = isize >> blkbits;
451 dataoff = offset;
452
453 do {
454 map.m_lblk = last;
455 map.m_len = end - last + 1;
456 ret = ext4_map_blocks(NULL, inode, &map, 0);
457 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
458 if (last != start)
459 dataoff = last << blkbits;
460 break;
461 }
462
463 /*
464 * If there is a delay extent at this offset,
465 * it is treated as data.
466 */
467 es.start = last;
468 (void)ext4_es_find_extent(inode, &es);
469 if (last >= es.start &&
470 last < es.start + es.len) {
471 if (last != start)
472 dataoff = last << blkbits;
473 break;
474 }
475
476 /*
477 * If there is an unwritten extent at this offset,
478 * it is treated as data or as a hole according to
479 * whether the page cache has data or not.
480 */
481 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
482 int unwritten;
483 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
484 &map, &dataoff);
485 if (unwritten)
486 break;
487 }
488
489 last++;
490 dataoff = last << blkbits;
491 } while (last <= end);
492
493 mutex_unlock(&inode->i_mutex);
494
495 if (dataoff > isize)
496 return -ENXIO;
497
498 if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
499 return -EINVAL;
500 if (dataoff > maxsize)
501 return -EINVAL;
502
503 if (dataoff != file->f_pos) {
504 file->f_pos = dataoff;
505 file->f_version = 0;
506 }
507
508 return dataoff;
509}
510
511/*
512 * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
513 */
514static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
515{
516 struct inode *inode = file->f_mapping->host;
517 struct ext4_map_blocks map;
518 struct extent_status es;
519 ext4_lblk_t start, last, end;
520 loff_t holeoff, isize;
521 int blkbits;
522 int ret = 0;
523
524 mutex_lock(&inode->i_mutex);
525
526 isize = i_size_read(inode);
527 if (offset >= isize) {
528 mutex_unlock(&inode->i_mutex);
529 return -ENXIO;
530 }
531
532 blkbits = inode->i_sb->s_blocksize_bits;
533 start = offset >> blkbits;
534 last = start;
535 end = isize >> blkbits;
536 holeoff = offset;
537
538 do {
539 map.m_lblk = last;
540 map.m_len = end - last + 1;
541 ret = ext4_map_blocks(NULL, inode, &map, 0);
542 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
543 last += ret;
544 holeoff = last << blkbits;
545 continue;
546 }
547
548 /*
549 * If there is a delay extent at this offset,
550 * we will skip this extent.
551 */
552 es.start = last;
553 (void)ext4_es_find_extent(inode, &es);
554 if (last >= es.start &&
555 last < es.start + es.len) {
556 last = es.start + es.len;
557 holeoff = last << blkbits;
558 continue;
559 }
560
561 /*
562 * If there is an unwritten extent at this offset,
563 * it is treated as data or as a hole according to
564 * whether the page cache has data or not.
565 */
566 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
567 int unwritten;
568 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
569 &map, &holeoff);
570 if (!unwritten) {
571 last += ret;
572 holeoff = last << blkbits;
573 continue;
574 }
575 }
576
577 /* find a hole */
578 break;
579 } while (last <= end);
580
581 mutex_unlock(&inode->i_mutex);
582
583 if (holeoff > isize)
584 holeoff = isize;
585
586 if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
587 return -EINVAL;
588 if (holeoff > maxsize)
589 return -EINVAL;
590
591 if (holeoff != file->f_pos) {
592 file->f_pos = holeoff;
593 file->f_version = 0;
594 }
595
596 return holeoff;
597}
598
599/*
289 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values 600 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
290 * by calling generic_file_llseek_size() with the appropriate maxbytes 601 * by calling generic_file_llseek_size() with the appropriate maxbytes
291 * value for each. 602 * value for each.
292 */ 603 */
293loff_t ext4_llseek(struct file *file, loff_t offset, int origin) 604loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
294{ 605{
295 struct inode *inode = file->f_mapping->host; 606 struct inode *inode = file->f_mapping->host;
296 loff_t maxbytes; 607 loff_t maxbytes;
@@ -300,8 +611,19 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
300 else 611 else
301 maxbytes = inode->i_sb->s_maxbytes; 612 maxbytes = inode->i_sb->s_maxbytes;
302 613
303 return generic_file_llseek_size(file, offset, origin, 614 switch (whence) {
304 maxbytes, i_size_read(inode)); 615 case SEEK_SET:
616 case SEEK_CUR:
617 case SEEK_END:
618 return generic_file_llseek_size(file, offset, whence,
619 maxbytes, i_size_read(inode));
620 case SEEK_DATA:
621 return ext4_seek_data(file, offset, maxbytes);
622 case SEEK_HOLE:
623 return ext4_seek_hole(file, offset, maxbytes);
624 }
625
626 return -EINVAL;
305} 627}
306 628
307const struct file_operations ext4_file_operations = { 629const struct file_operations ext4_file_operations = {
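With the whence switch above, userspace can probe a sparse ext4 file using the new values. A hedged example (SEEK_DATA and SEEK_HOLE require _GNU_SOURCE with glibc; error handling is minimal):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            int fd = open(argv[1], O_RDONLY);
            off_t data = lseek(fd, 0, SEEK_DATA);   /* first data at/after 0 */

            if (fd >= 0 && data >= 0) {
                    off_t hole = lseek(fd, data, SEEK_HOLE);

                    printf("data at %lld, next hole at %lld\n",
                           (long long)data, (long long)hole);
            }
            if (fd >= 0)
                    close(fd);
            return 0;
    }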
@@ -326,12 +648,10 @@ const struct file_operations ext4_file_operations = {
326const struct inode_operations ext4_file_inode_operations = { 648const struct inode_operations ext4_file_inode_operations = {
327 .setattr = ext4_setattr, 649 .setattr = ext4_setattr,
328 .getattr = ext4_getattr, 650 .getattr = ext4_getattr,
329#ifdef CONFIG_EXT4_FS_XATTR
330 .setxattr = generic_setxattr, 651 .setxattr = generic_setxattr,
331 .getxattr = generic_getxattr, 652 .getxattr = generic_getxattr,
332 .listxattr = ext4_listxattr, 653 .listxattr = ext4_listxattr,
333 .removexattr = generic_removexattr, 654 .removexattr = generic_removexattr,
334#endif
335 .get_acl = ext4_get_acl, 655 .get_acl = ext4_get_acl,
336 .fiemap = ext4_fiemap, 656 .fiemap = ext4_fiemap,
337}; 657};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index be1d89f385b4..3278e64e57b6 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,7 +44,6 @@
44 */ 44 */
45static int ext4_sync_parent(struct inode *inode) 45static int ext4_sync_parent(struct inode *inode)
46{ 46{
47 struct writeback_control wbc;
48 struct dentry *dentry = NULL; 47 struct dentry *dentry = NULL;
49 struct inode *next; 48 struct inode *next;
50 int ret = 0; 49 int ret = 0;
@@ -66,10 +65,7 @@ static int ext4_sync_parent(struct inode *inode)
66 ret = sync_mapping_buffers(inode->i_mapping); 65 ret = sync_mapping_buffers(inode->i_mapping);
67 if (ret) 66 if (ret)
68 break; 67 break;
69 memset(&wbc, 0, sizeof(wbc)); 68 ret = sync_inode_metadata(inode, 1);
70 wbc.sync_mode = WB_SYNC_ALL;
71 wbc.nr_to_write = 0; /* only write out the inode */
72 ret = sync_inode(inode, &wbc);
73 if (ret) 69 if (ret)
74 break; 70 break;
75 } 71 }
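The hunk above is a pure simplification: sync_inode_metadata() wraps exactly the writeback_control setup that the deleted lines open-coded. Spelled out for reference (a sketch of the helper's effect, not new behavior):

    /* sync_inode_metadata(inode, 1) is equivalent to this WB_SYNC_ALL
     * writeback with zero data pages, i.e. inode metadata only. */
    static int sync_metadata_only(struct inode *inode)
    {
            struct writeback_control wbc = {
                    .sync_mode   = WB_SYNC_ALL,
                    .nr_to_write = 0,       /* only write out the inode */
            };

            return sync_inode(inode, &wbc);
    }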
@@ -113,8 +109,6 @@ static int __sync_inode(struct inode *inode, int datasync)
113 * 109 *
114 * What we do is just kick off a commit and wait on it. This will snapshot the 110 * What we do is just kick off a commit and wait on it. This will snapshot the
115 * inode to disk. 111 * inode to disk.
116 *
117 * i_mutex lock is held when entering and exiting this function
118 */ 112 */
119 113
120int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 114int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3a100e7a62a8..3f32c8012447 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -762,7 +762,6 @@ got:
762 762
763 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); 763 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
764 err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); 764 err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
765 brelse(block_bitmap_bh);
766 765
767 /* recheck and clear flag under lock if we still need to */ 766 /* recheck and clear flag under lock if we still need to */
768 ext4_lock_group(sb, group); 767 ext4_lock_group(sb, group);
@@ -775,6 +774,7 @@ got:
775 ext4_group_desc_csum_set(sb, group, gdp); 774 ext4_group_desc_csum_set(sb, group, gdp);
776 } 775 }
777 ext4_unlock_group(sb, group); 776 ext4_unlock_group(sb, group);
777 brelse(block_bitmap_bh);
778 778
779 if (err) 779 if (err)
780 goto fail; 780 goto fail;
@@ -902,6 +902,10 @@ got:
902 902
903 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 903 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
904 904
905 ei->i_inline_off = 0;
906 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
907 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
908
905 ret = inode; 909 ret = inode;
906 dquot_initialize(inode); 910 dquot_initialize(inode);
907 err = dquot_alloc_inode(inode); 911 err = dquot_alloc_inode(inode);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 792e388e7b44..20862f96e8ae 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
22 22
23#include "ext4_jbd2.h" 23#include "ext4_jbd2.h"
24#include "truncate.h" 24#include "truncate.h"
25#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
25 26
26#include <trace/events/ext4.h> 27#include <trace/events/ext4.h>
27 28
@@ -755,8 +756,7 @@ cleanup:
755 partial--; 756 partial--;
756 } 757 }
757out: 758out:
758 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, 759 trace_ext4_ind_map_blocks_exit(inode, map, err);
759 map->m_pblk, map->m_len, err);
760 return err; 760 return err;
761} 761}
762 762
@@ -1412,6 +1412,7 @@ void ext4_ind_truncate(struct inode *inode)
1412 down_write(&ei->i_data_sem); 1412 down_write(&ei->i_data_sem);
1413 1413
1414 ext4_discard_preallocations(inode); 1414 ext4_discard_preallocations(inode);
1415 ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
1415 1416
1416 /* 1417 /*
1417 * The orphan list entry will now protect us from any crash which 1418 * The orphan list entry will now protect us from any crash which
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
new file mode 100644
index 000000000000..387c47c6cda9
--- /dev/null
+++ b/fs/ext4/inline.c
@@ -0,0 +1,1884 @@
1/*
2 * Copyright (c) 2012 Taobao.
3 * Written by Tao Ma <boyu.mt@taobao.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of version 2.1 of the GNU Lesser General Public License
7 * as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 */
14#include "ext4_jbd2.h"
15#include "ext4.h"
16#include "xattr.h"
17#include "truncate.h"
18#include <linux/fiemap.h>
19
20#define EXT4_XATTR_SYSTEM_DATA "data"
21#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
22#define EXT4_INLINE_DOTDOT_SIZE 4
23
24int ext4_get_inline_size(struct inode *inode)
25{
26 if (EXT4_I(inode)->i_inline_off)
27 return EXT4_I(inode)->i_inline_size;
28
29 return 0;
30}
31
32static int get_max_inline_xattr_value_size(struct inode *inode,
33 struct ext4_iloc *iloc)
34{
35 struct ext4_xattr_ibody_header *header;
36 struct ext4_xattr_entry *entry;
37 struct ext4_inode *raw_inode;
38 int free, min_offs;
39
40 min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
41 EXT4_GOOD_OLD_INODE_SIZE -
42 EXT4_I(inode)->i_extra_isize -
43 sizeof(struct ext4_xattr_ibody_header);
44
45 /*
46 * We need to subtract another sizeof(__u32) since an in-inode xattr
47 * needs an empty 4 bytes to indicate the gap between the xattr entry
48 * and the name/value pair.
49 */
50 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
51 return EXT4_XATTR_SIZE(min_offs -
52 EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) -
53 EXT4_XATTR_ROUND - sizeof(__u32));
54
55 raw_inode = ext4_raw_inode(iloc);
56 header = IHDR(inode, raw_inode);
57 entry = IFIRST(header);
58
59 /* Compute min_offs. */
60 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
61 if (!entry->e_value_block && entry->e_value_size) {
62 size_t offs = le16_to_cpu(entry->e_value_offs);
63 if (offs < min_offs)
64 min_offs = offs;
65 }
66 }
67 free = min_offs -
68 ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
69
70 if (EXT4_I(inode)->i_inline_off) {
71 entry = (struct ext4_xattr_entry *)
72 ((void *)raw_inode + EXT4_I(inode)->i_inline_off);
73
74 free += le32_to_cpu(entry->e_value_size);
75 goto out;
76 }
77
78 free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));
79
80 if (free > EXT4_XATTR_ROUND)
81 free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND);
82 else
83 free = 0;
84
85out:
86 return free;
87}
88
89/*
90 * Get the maximum size we now can store in an inode.
 91 * If we can't find space for an xattr entry, don't use the i_block
 92 * space either, since without the entry we cannot mark the data inline.
93 */
94int ext4_get_max_inline_size(struct inode *inode)
95{
96 int error, max_inline_size;
97 struct ext4_iloc iloc;
98
99 if (EXT4_I(inode)->i_extra_isize == 0)
100 return 0;
101
102 error = ext4_get_inode_loc(inode, &iloc);
103 if (error) {
104 ext4_error_inode(inode, __func__, __LINE__, 0,
105 "can't get inode location %lu",
106 inode->i_ino);
107 return 0;
108 }
109
110 down_read(&EXT4_I(inode)->xattr_sem);
111 max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
112 up_read(&EXT4_I(inode)->xattr_sem);
113
114 brelse(iloc.bh);
115
116 if (!max_inline_size)
117 return 0;
118
119 return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
120}
121
122int ext4_has_inline_data(struct inode *inode)
123{
124 return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
125 EXT4_I(inode)->i_inline_off;
126}
127
128/*
129 * this function does not take xattr_sem, which is OK because it is
 130 * currently only used in a code path coming from ext4_iget, before
131 * the new inode has been unlocked
132 */
133int ext4_find_inline_data_nolock(struct inode *inode)
134{
135 struct ext4_xattr_ibody_find is = {
136 .s = { .not_found = -ENODATA, },
137 };
138 struct ext4_xattr_info i = {
139 .name_index = EXT4_XATTR_INDEX_SYSTEM,
140 .name = EXT4_XATTR_SYSTEM_DATA,
141 };
142 int error;
143
144 if (EXT4_I(inode)->i_extra_isize == 0)
145 return 0;
146
147 error = ext4_get_inode_loc(inode, &is.iloc);
148 if (error)
149 return error;
150
151 error = ext4_xattr_ibody_find(inode, &i, &is);
152 if (error)
153 goto out;
154
155 if (!is.s.not_found) {
156 EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
157 (void *)ext4_raw_inode(&is.iloc));
158 EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
159 le32_to_cpu(is.s.here->e_value_size);
160 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
161 }
162out:
163 brelse(is.iloc.bh);
164 return error;
165}
166
167static int ext4_read_inline_data(struct inode *inode, void *buffer,
168 unsigned int len,
169 struct ext4_iloc *iloc)
170{
171 struct ext4_xattr_entry *entry;
172 struct ext4_xattr_ibody_header *header;
173 int cp_len = 0;
174 struct ext4_inode *raw_inode;
175
176 if (!len)
177 return 0;
178
179 BUG_ON(len > EXT4_I(inode)->i_inline_size);
180
181 cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
182 len : EXT4_MIN_INLINE_DATA_SIZE;
183
184 raw_inode = ext4_raw_inode(iloc);
185 memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
186
187 len -= cp_len;
188 buffer += cp_len;
189
190 if (!len)
191 goto out;
192
193 header = IHDR(inode, raw_inode);
194 entry = (struct ext4_xattr_entry *)((void *)raw_inode +
195 EXT4_I(inode)->i_inline_off);
196 len = min_t(unsigned int, len,
197 (unsigned int)le32_to_cpu(entry->e_value_size));
198
199 memcpy(buffer,
200 (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
201 cp_len += len;
202
203out:
204 return cp_len;
205}
206
207/*
208 * write the buffer to the inline inode.
209 * If 'create' is set, we don't need to do the extra copy in the xattr
210 * value since it is already handled by ext4_xattr_ibody_inline_set.
211 * That saves us one memcpy.
212 */
213void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
214 void *buffer, loff_t pos, unsigned int len)
215{
216 struct ext4_xattr_entry *entry;
217 struct ext4_xattr_ibody_header *header;
218 struct ext4_inode *raw_inode;
219 int cp_len = 0;
220
221 BUG_ON(!EXT4_I(inode)->i_inline_off);
222 BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
223
224 raw_inode = ext4_raw_inode(iloc);
225 buffer += pos;
226
227 if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
228 cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ?
229 EXT4_MIN_INLINE_DATA_SIZE - pos : len;
230 memcpy((void *)raw_inode->i_block + pos, buffer, cp_len);
231
232 len -= cp_len;
233 buffer += cp_len;
234 pos += cp_len;
235 }
236
237 if (!len)
238 return;
239
240 pos -= EXT4_MIN_INLINE_DATA_SIZE;
241 header = IHDR(inode, raw_inode);
242 entry = (struct ext4_xattr_entry *)((void *)raw_inode +
243 EXT4_I(inode)->i_inline_off);
244
245 memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
246 buffer, len);
247}
248
249static int ext4_create_inline_data(handle_t *handle,
250 struct inode *inode, unsigned len)
251{
252 int error;
253 void *value = NULL;
254 struct ext4_xattr_ibody_find is = {
255 .s = { .not_found = -ENODATA, },
256 };
257 struct ext4_xattr_info i = {
258 .name_index = EXT4_XATTR_INDEX_SYSTEM,
259 .name = EXT4_XATTR_SYSTEM_DATA,
260 };
261
262 error = ext4_get_inode_loc(inode, &is.iloc);
263 if (error)
264 return error;
265
266 error = ext4_journal_get_write_access(handle, is.iloc.bh);
267 if (error)
268 goto out;
269
270 if (len > EXT4_MIN_INLINE_DATA_SIZE) {
271 value = EXT4_ZERO_XATTR_VALUE;
272 len -= EXT4_MIN_INLINE_DATA_SIZE;
273 } else {
274 value = "";
275 len = 0;
276 }
277
 278 /* Insert the xattr entry. */
279 i.value = value;
280 i.value_len = len;
281
282 error = ext4_xattr_ibody_find(inode, &i, &is);
283 if (error)
284 goto out;
285
286 BUG_ON(!is.s.not_found);
287
288 error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
289 if (error) {
290 if (error == -ENOSPC)
291 ext4_clear_inode_state(inode,
292 EXT4_STATE_MAY_INLINE_DATA);
293 goto out;
294 }
295
296 memset((void *)ext4_raw_inode(&is.iloc)->i_block,
297 0, EXT4_MIN_INLINE_DATA_SIZE);
298
299 EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
300 (void *)ext4_raw_inode(&is.iloc));
301 EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
302 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
303 ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
304 get_bh(is.iloc.bh);
305 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
306
307out:
308 brelse(is.iloc.bh);
309 return error;
310}
311
312static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
313 unsigned int len)
314{
315 int error;
316 void *value = NULL;
317 struct ext4_xattr_ibody_find is = {
318 .s = { .not_found = -ENODATA, },
319 };
320 struct ext4_xattr_info i = {
321 .name_index = EXT4_XATTR_INDEX_SYSTEM,
322 .name = EXT4_XATTR_SYSTEM_DATA,
323 };
324
325 /* If the old space is ok, write the data directly. */
326 if (len <= EXT4_I(inode)->i_inline_size)
327 return 0;
328
329 error = ext4_get_inode_loc(inode, &is.iloc);
330 if (error)
331 return error;
332
333 error = ext4_xattr_ibody_find(inode, &i, &is);
334 if (error)
335 goto out;
336
337 BUG_ON(is.s.not_found);
338
339 len -= EXT4_MIN_INLINE_DATA_SIZE;
340 value = kzalloc(len, GFP_NOFS);
 341 error = -ENOMEM;
 342 if (!value) goto out;
343
344 error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
345 value, len);
346 if (error == -ENODATA)
347 goto out;
348
349 error = ext4_journal_get_write_access(handle, is.iloc.bh);
350 if (error)
351 goto out;
352
 353 /* Update the xattr entry. */
354 i.value = value;
355 i.value_len = len;
356
357 error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
358 if (error)
359 goto out;
360
361 EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
362 (void *)ext4_raw_inode(&is.iloc));
363 EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
364 le32_to_cpu(is.s.here->e_value_size);
365 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
366 get_bh(is.iloc.bh);
367 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
368
369out:
370 kfree(value);
371 brelse(is.iloc.bh);
372 return error;
373}
374
375int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
376 unsigned int len)
377{
378 int ret, size;
379 struct ext4_inode_info *ei = EXT4_I(inode);
380
381 if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
382 return -ENOSPC;
383
384 size = ext4_get_max_inline_size(inode);
385 if (size < len)
386 return -ENOSPC;
387
388 down_write(&EXT4_I(inode)->xattr_sem);
389
390 if (ei->i_inline_off)
391 ret = ext4_update_inline_data(handle, inode, len);
392 else
393 ret = ext4_create_inline_data(handle, inode, len);
394
395 up_write(&EXT4_I(inode)->xattr_sem);
396
397 return ret;
398}
399
400static int ext4_destroy_inline_data_nolock(handle_t *handle,
401 struct inode *inode)
402{
403 struct ext4_inode_info *ei = EXT4_I(inode);
404 struct ext4_xattr_ibody_find is = {
405 .s = { .not_found = 0, },
406 };
407 struct ext4_xattr_info i = {
408 .name_index = EXT4_XATTR_INDEX_SYSTEM,
409 .name = EXT4_XATTR_SYSTEM_DATA,
410 .value = NULL,
411 .value_len = 0,
412 };
413 int error;
414
415 if (!ei->i_inline_off)
416 return 0;
417
418 error = ext4_get_inode_loc(inode, &is.iloc);
419 if (error)
420 return error;
421
422 error = ext4_xattr_ibody_find(inode, &i, &is);
423 if (error)
424 goto out;
425
426 error = ext4_journal_get_write_access(handle, is.iloc.bh);
427 if (error)
428 goto out;
429
430 error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
431 if (error)
432 goto out;
433
434 memset((void *)ext4_raw_inode(&is.iloc)->i_block,
435 0, EXT4_MIN_INLINE_DATA_SIZE);
436
437 if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
438 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
439 if (S_ISDIR(inode->i_mode) ||
440 S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
441 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
442 ext4_ext_tree_init(handle, inode);
443 }
444 }
445 ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
446
447 get_bh(is.iloc.bh);
448 error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
449
450 EXT4_I(inode)->i_inline_off = 0;
451 EXT4_I(inode)->i_inline_size = 0;
452 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
453out:
454 brelse(is.iloc.bh);
455 if (error == -ENODATA)
456 error = 0;
457 return error;
458}
459
460static int ext4_read_inline_page(struct inode *inode, struct page *page)
461{
462 void *kaddr;
463 int ret = 0;
464 size_t len;
465 struct ext4_iloc iloc;
466
467 BUG_ON(!PageLocked(page));
468 BUG_ON(!ext4_has_inline_data(inode));
469 BUG_ON(page->index);
470
471 if (!EXT4_I(inode)->i_inline_off) {
472 ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
473 inode->i_ino);
474 goto out;
475 }
476
477 ret = ext4_get_inode_loc(inode, &iloc);
478 if (ret)
479 goto out;
480
481 len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
482 kaddr = kmap_atomic(page);
483 ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
484 flush_dcache_page(page);
485 kunmap_atomic(kaddr);
486 zero_user_segment(page, len, PAGE_CACHE_SIZE);
487 SetPageUptodate(page);
488 brelse(iloc.bh);
489
490out:
491 return ret;
492}
493
494int ext4_readpage_inline(struct inode *inode, struct page *page)
495{
496 int ret = 0;
497
498 down_read(&EXT4_I(inode)->xattr_sem);
499 if (!ext4_has_inline_data(inode)) {
500 up_read(&EXT4_I(inode)->xattr_sem);
501 return -EAGAIN;
502 }
503
504 /*
505 * Current inline data can only exist in the 1st page,
 506 * so for all the other pages, just set them uptodate.
507 */
508 if (!page->index)
509 ret = ext4_read_inline_page(inode, page);
510 else if (!PageUptodate(page)) {
511 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
512 SetPageUptodate(page);
513 }
514
515 up_read(&EXT4_I(inode)->xattr_sem);
516
517 unlock_page(page);
518 return ret >= 0 ? 0 : ret;
519}
520
521static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
522 struct inode *inode,
523 unsigned flags)
524{
525 int ret, needed_blocks;
526 handle_t *handle = NULL;
527 int retries = 0, sem_held = 0;
528 struct page *page = NULL;
529 unsigned from, to;
530 struct ext4_iloc iloc;
531
532 if (!ext4_has_inline_data(inode)) {
533 /*
534 * clear the flag so that no new write
535 * will trap here again.
536 */
537 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
538 return 0;
539 }
540
541 needed_blocks = ext4_writepage_trans_blocks(inode);
542
543 ret = ext4_get_inode_loc(inode, &iloc);
544 if (ret)
545 return ret;
546
547retry:
548 handle = ext4_journal_start(inode, needed_blocks);
549 if (IS_ERR(handle)) {
550 ret = PTR_ERR(handle);
551 handle = NULL;
552 goto out;
553 }
554
555 /* We cannot recurse into the filesystem as the transaction is already
556 * started */
557 flags |= AOP_FLAG_NOFS;
558
559 page = grab_cache_page_write_begin(mapping, 0, flags);
560 if (!page) {
561 ret = -ENOMEM;
562 goto out;
563 }
564
565 down_write(&EXT4_I(inode)->xattr_sem);
566 sem_held = 1;
 567 /* If someone has already done this for us, just exit. */
568 if (!ext4_has_inline_data(inode)) {
569 ret = 0;
570 goto out;
571 }
572
573 from = 0;
574 to = ext4_get_inline_size(inode);
575 if (!PageUptodate(page)) {
576 ret = ext4_read_inline_page(inode, page);
577 if (ret < 0)
578 goto out;
579 }
580
581 ret = ext4_destroy_inline_data_nolock(handle, inode);
582 if (ret)
583 goto out;
584
585 if (ext4_should_dioread_nolock(inode))
586 ret = __block_write_begin(page, from, to, ext4_get_block_write);
587 else
588 ret = __block_write_begin(page, from, to, ext4_get_block);
589
590 if (!ret && ext4_should_journal_data(inode)) {
591 ret = ext4_walk_page_buffers(handle, page_buffers(page),
592 from, to, NULL,
593 do_journal_get_write_access);
594 }
595
596 if (ret) {
597 unlock_page(page);
 598 page_cache_release(page); page = NULL;
599 ext4_orphan_add(handle, inode);
600 up_write(&EXT4_I(inode)->xattr_sem);
601 sem_held = 0;
602 ext4_journal_stop(handle);
603 handle = NULL;
604 ext4_truncate_failed_write(inode);
605 /*
606 * If truncate failed early the inode might
607 * still be on the orphan list; we need to
608 * make sure the inode is removed from the
609 * orphan list in that case.
610 */
611 if (inode->i_nlink)
612 ext4_orphan_del(NULL, inode);
613 }
614
615 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
616 goto retry;
617
 618 if (page) block_commit_write(page, from, to);
619out:
620 if (page) {
621 unlock_page(page);
622 page_cache_release(page);
623 }
624 if (sem_held)
625 up_write(&EXT4_I(inode)->xattr_sem);
626 if (handle)
627 ext4_journal_stop(handle);
628 brelse(iloc.bh);
629 return ret;
630}
631
632/*
633 * Try to write data in the inode.
634 * If the inode has inline data, check whether the new write can be
 635 * in the inode too. If not, create the page and the handle, move the data
 636 * to the page, mark it uptodate, and let later code create an extent for it.
637 */
638int ext4_try_to_write_inline_data(struct address_space *mapping,
639 struct inode *inode,
640 loff_t pos, unsigned len,
641 unsigned flags,
642 struct page **pagep)
643{
644 int ret;
645 handle_t *handle;
646 struct page *page;
647 struct ext4_iloc iloc;
648
649 if (pos + len > ext4_get_max_inline_size(inode))
650 goto convert;
651
652 ret = ext4_get_inode_loc(inode, &iloc);
653 if (ret)
654 return ret;
655
656 /*
657 * The possible write could happen in the inode,
 658 * so try to reserve space in the inode first.
659 */
660 handle = ext4_journal_start(inode, 1);
661 if (IS_ERR(handle)) {
662 ret = PTR_ERR(handle);
663 handle = NULL;
664 goto out;
665 }
666
667 ret = ext4_prepare_inline_data(handle, inode, pos + len);
668 if (ret && ret != -ENOSPC)
669 goto out;
670
 671 /* We don't have space in the inline inode, so convert it to an extent. */
672 if (ret == -ENOSPC) {
673 ext4_journal_stop(handle);
674 brelse(iloc.bh);
675 goto convert;
676 }
677
678 flags |= AOP_FLAG_NOFS;
679
680 page = grab_cache_page_write_begin(mapping, 0, flags);
681 if (!page) {
682 ret = -ENOMEM;
683 goto out;
684 }
685
686 *pagep = page;
687 down_read(&EXT4_I(inode)->xattr_sem);
688 if (!ext4_has_inline_data(inode)) {
689 ret = 0;
690 unlock_page(page);
691 page_cache_release(page);
692 goto out_up_read;
693 }
694
695 if (!PageUptodate(page)) {
696 ret = ext4_read_inline_page(inode, page);
697 if (ret < 0)
698 goto out_up_read;
699 }
700
701 ret = 1;
702 handle = NULL;
703out_up_read:
704 up_read(&EXT4_I(inode)->xattr_sem);
705out:
706 if (handle)
707 ext4_journal_stop(handle);
708 brelse(iloc.bh);
709 return ret;
710convert:
711 return ext4_convert_inline_data_to_extent(mapping,
712 inode, flags);
713}
714
715int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
716 unsigned copied, struct page *page)
717{
718 int ret;
719 void *kaddr;
720 struct ext4_iloc iloc;
721
722 if (unlikely(copied < len)) {
723 if (!PageUptodate(page)) {
724 copied = 0;
725 goto out;
726 }
727 }
728
729 ret = ext4_get_inode_loc(inode, &iloc);
730 if (ret) {
731 ext4_std_error(inode->i_sb, ret);
732 copied = 0;
733 goto out;
734 }
735
736 down_write(&EXT4_I(inode)->xattr_sem);
737 BUG_ON(!ext4_has_inline_data(inode));
738
739 kaddr = kmap_atomic(page);
740 ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
741 kunmap_atomic(kaddr);
742 SetPageUptodate(page);
 743 /* Clear the dirty bit so that writepages won't try to write out the page for us. */
744 ClearPageDirty(page);
745
746 up_write(&EXT4_I(inode)->xattr_sem);
747 brelse(iloc.bh);
748out:
749 return copied;
750}
751
752struct buffer_head *
753ext4_journalled_write_inline_data(struct inode *inode,
754 unsigned len,
755 struct page *page)
756{
757 int ret;
758 void *kaddr;
759 struct ext4_iloc iloc;
760
761 ret = ext4_get_inode_loc(inode, &iloc);
762 if (ret) {
763 ext4_std_error(inode->i_sb, ret);
764 return NULL;
765 }
766
767 down_write(&EXT4_I(inode)->xattr_sem);
768 kaddr = kmap_atomic(page);
769 ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
770 kunmap_atomic(kaddr);
771 up_write(&EXT4_I(inode)->xattr_sem);
772
773 return iloc.bh;
774}
775
776/*
777 * Try to make the page cache and handle ready for the inline data case.
778 * We can call this function in 2 cases:
779 * 1. The inode is created and the first write exceeds inline size. We can
780 * clear the inode state safely.
781 * 2. The inode has inline data, then we need to read the data, make it
 782 * uptodate and dirty so that ext4_da_writepages can handle it. We don't
 783 * need to start the journal since the file's metadata isn't changed now.
784 */
785static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
786 struct inode *inode,
787 unsigned flags,
788 void **fsdata)
789{
790 int ret = 0, inline_size;
791 struct page *page;
792
793 page = grab_cache_page_write_begin(mapping, 0, flags);
794 if (!page)
795 return -ENOMEM;
796
797 down_read(&EXT4_I(inode)->xattr_sem);
798 if (!ext4_has_inline_data(inode)) {
799 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
800 goto out;
801 }
802
803 inline_size = ext4_get_inline_size(inode);
804
805 if (!PageUptodate(page)) {
806 ret = ext4_read_inline_page(inode, page);
807 if (ret < 0)
808 goto out;
809 }
810
811 ret = __block_write_begin(page, 0, inline_size,
812 ext4_da_get_block_prep);
813 if (ret) {
814 ext4_truncate_failed_write(inode);
815 goto out;
816 }
817
818 SetPageDirty(page);
819 SetPageUptodate(page);
820 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
821 *fsdata = (void *)CONVERT_INLINE_DATA;
822
823out:
824 up_read(&EXT4_I(inode)->xattr_sem);
825 if (page) {
826 unlock_page(page);
827 page_cache_release(page);
828 }
829 return ret;
830}
831
832/*
833 * Prepare the write for the inline data.
 834 * If the data can be written into the inode, we just read
 835 * the page, make it uptodate, and start the journal.
 836 * Otherwise read the page and mark it dirty so that it can be
 837 * handled in writepages (the i_disksize update is left to the
838 * normal ext4_da_write_end).
839 */
840int ext4_da_write_inline_data_begin(struct address_space *mapping,
841 struct inode *inode,
842 loff_t pos, unsigned len,
843 unsigned flags,
844 struct page **pagep,
845 void **fsdata)
846{
847 int ret, inline_size;
848 handle_t *handle;
849 struct page *page;
850 struct ext4_iloc iloc;
851
852 ret = ext4_get_inode_loc(inode, &iloc);
853 if (ret)
854 return ret;
855
856 handle = ext4_journal_start(inode, 1);
857 if (IS_ERR(handle)) {
858 ret = PTR_ERR(handle);
859 handle = NULL;
860 goto out;
861 }
862
863 inline_size = ext4_get_max_inline_size(inode);
864
865 ret = -ENOSPC;
866 if (inline_size >= pos + len) {
867 ret = ext4_prepare_inline_data(handle, inode, pos + len);
868 if (ret && ret != -ENOSPC)
869 goto out;
870 }
871
872 if (ret == -ENOSPC) {
873 ret = ext4_da_convert_inline_data_to_extent(mapping,
874 inode,
875 flags,
876 fsdata);
877 goto out;
878 }
879
880 /*
881 * We cannot recurse into the filesystem as the transaction
882 * is already started.
883 */
884 flags |= AOP_FLAG_NOFS;
885
886 page = grab_cache_page_write_begin(mapping, 0, flags);
887 if (!page) {
888 ret = -ENOMEM;
889 goto out;
890 }
891
892 down_read(&EXT4_I(inode)->xattr_sem);
893 if (!ext4_has_inline_data(inode)) {
894 ret = 0;
895 goto out_release_page;
896 }
897
898 if (!PageUptodate(page)) {
899 ret = ext4_read_inline_page(inode, page);
900 if (ret < 0)
901 goto out_release_page;
902 }
903
904 up_read(&EXT4_I(inode)->xattr_sem);
905 *pagep = page;
906 handle = NULL;
907 brelse(iloc.bh);
908 return 1;
909out_release_page:
910 up_read(&EXT4_I(inode)->xattr_sem);
911 unlock_page(page);
912 page_cache_release(page);
913out:
914 if (handle)
915 ext4_journal_stop(handle);
916 brelse(iloc.bh);
917 return ret;
918}
919
920int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
921 unsigned len, unsigned copied,
922 struct page *page)
923{
924 int i_size_changed = 0;
925
926 copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
927
928 /*
929 * No need to use i_size_read() here, the i_size
930 * cannot change under us because we hold i_mutex.
931 *
932 * But it's important to update i_size while still holding page lock:
933 * page writeout could otherwise come in and zero beyond i_size.
934 */
935 if (pos+copied > inode->i_size) {
936 i_size_write(inode, pos+copied);
937 i_size_changed = 1;
938 }
939 unlock_page(page);
940 page_cache_release(page);
941
942 /*
943 * Don't mark the inode dirty under page lock. First, it unnecessarily
944 * makes the holding time of page lock longer. Second, it forces lock
945 * ordering of page lock and transaction start for journaling
946 * filesystems.
947 */
948 if (i_size_changed)
949 mark_inode_dirty(inode);
950
951 return copied;
952}
953
954#ifdef INLINE_DIR_DEBUG
955void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
956 void *inline_start, int inline_size)
957{
958 int offset;
959 unsigned short de_len;
960 struct ext4_dir_entry_2 *de = inline_start;
961 void *dlimit = inline_start + inline_size;
962
963 trace_printk("inode %lu\n", dir->i_ino);
964 offset = 0;
965 while ((void *)de < dlimit) {
966 de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
 967 trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n",
968 offset, de_len, de->name_len, de->name,
969 de->name_len, le32_to_cpu(de->inode));
970 if (ext4_check_dir_entry(dir, NULL, de, bh,
971 inline_start, inline_size, offset))
972 BUG();
973
974 offset += de_len;
975 de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
976 }
977}
978#else
979#define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
980#endif
981
982/*
 983 * Add a new entry into an inline dir.
 984 * It will return -ENOSPC if no space is available, -EIO on a
 985 * corrupted entry, and -EEXIST if the directory entry already exists.
986 */
987static int ext4_add_dirent_to_inline(handle_t *handle,
988 struct dentry *dentry,
989 struct inode *inode,
990 struct ext4_iloc *iloc,
991 void *inline_start, int inline_size)
992{
993 struct inode *dir = dentry->d_parent->d_inode;
994 const char *name = dentry->d_name.name;
995 int namelen = dentry->d_name.len;
996 unsigned short reclen;
997 int err;
998 struct ext4_dir_entry_2 *de;
999
1000 reclen = EXT4_DIR_REC_LEN(namelen);
1001 err = ext4_find_dest_de(dir, inode, iloc->bh,
1002 inline_start, inline_size,
1003 name, namelen, &de);
1004 if (err)
1005 return err;
1006
1007 err = ext4_journal_get_write_access(handle, iloc->bh);
1008 if (err)
1009 return err;
1010 ext4_insert_dentry(inode, de, inline_size, name, namelen);
1011
1012 ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
1013
1014 /*
1015 * XXX shouldn't update any times until successful
1016 * completion of syscall, but too many callers depend
1017 * on this.
1018 *
1019 * XXX similarly, too many callers depend on
1020 * ext4_new_inode() setting the times, but error
1021 * recovery deletes the inode, so the worst that can
1022 * happen is that the times are slightly out of date
1023 * and/or different from the directory change time.
1024 */
1025 dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
1026 ext4_update_dx_flag(dir);
1027 dir->i_version++;
1028 ext4_mark_inode_dirty(handle, dir);
1029 return 1;
1030}
1031
1032static void *ext4_get_inline_xattr_pos(struct inode *inode,
1033 struct ext4_iloc *iloc)
1034{
1035 struct ext4_xattr_entry *entry;
1036 struct ext4_xattr_ibody_header *header;
1037
1038 BUG_ON(!EXT4_I(inode)->i_inline_off);
1039
1040 header = IHDR(inode, ext4_raw_inode(iloc));
1041 entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
1042 EXT4_I(inode)->i_inline_off);
1043
1044 return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
1045}
1046
1047/* Set the final de to cover the whole block. */
1048static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
1049{
1050 struct ext4_dir_entry_2 *de, *prev_de;
1051 void *limit;
1052 int de_len;
1053
1054 de = (struct ext4_dir_entry_2 *)de_buf;
1055 if (old_size) {
1056 limit = de_buf + old_size;
1057 do {
1058 prev_de = de;
1059 de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
1060 de_buf += de_len;
1061 de = (struct ext4_dir_entry_2 *)de_buf;
1062 } while (de_buf < limit);
1063
1064 prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
1065 old_size, new_size);
1066 } else {
 1067 /* This space was just created, so add a single empty entry covering it. */
1068 de->inode = 0;
1069 de->rec_len = ext4_rec_len_to_disk(new_size, new_size);
1070 }
1071}
1072
1073static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
1074 struct ext4_iloc *iloc)
1075{
1076 int ret;
1077 int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
1078 int new_size = get_max_inline_xattr_value_size(dir, iloc);
1079
1080 if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
1081 return -ENOSPC;
1082
1083 ret = ext4_update_inline_data(handle, dir,
1084 new_size + EXT4_MIN_INLINE_DATA_SIZE);
1085 if (ret)
1086 return ret;
1087
1088 ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size,
1089 EXT4_I(dir)->i_inline_size -
1090 EXT4_MIN_INLINE_DATA_SIZE);
1091 dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size;
1092 return 0;
1093}
1094
1095static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
1096 struct ext4_iloc *iloc,
1097 void *buf, int inline_size)
1098{
1099 ext4_create_inline_data(handle, inode, inline_size);
1100 ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
1101 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
1102}
1103
1104static int ext4_finish_convert_inline_dir(handle_t *handle,
1105 struct inode *inode,
1106 struct buffer_head *dir_block,
1107 void *buf,
1108 int inline_size)
1109{
1110 int err, csum_size = 0, header_size = 0;
1111 struct ext4_dir_entry_2 *de;
1112 struct ext4_dir_entry_tail *t;
1113 void *target = dir_block->b_data;
1114
1115 /*
1116 * First create "." and ".." and then copy the dir information
1117 * back to the block.
1118 */
1119 de = (struct ext4_dir_entry_2 *)target;
1120 de = ext4_init_dot_dotdot(inode, de,
1121 inode->i_sb->s_blocksize, csum_size,
1122 le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
1123 header_size = (void *)de - target;
1124
1125 memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
1126 inline_size - EXT4_INLINE_DOTDOT_SIZE);
1127
1128 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1129 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1130 csum_size = sizeof(struct ext4_dir_entry_tail);
1131
1132 inode->i_size = inode->i_sb->s_blocksize;
1133 i_size_write(inode, inode->i_sb->s_blocksize);
1134 EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1135 ext4_update_final_de(dir_block->b_data,
1136 inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
1137 inode->i_sb->s_blocksize - csum_size);
1138
1139 if (csum_size) {
1140 t = EXT4_DIRENT_TAIL(dir_block->b_data,
1141 inode->i_sb->s_blocksize);
1142 initialize_dirent_tail(t, inode->i_sb->s_blocksize);
1143 }
1144 set_buffer_uptodate(dir_block);
1145 err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
1146 if (err)
1147 goto out;
1148 set_buffer_verified(dir_block);
1149out:
1150 return err;
1151}
1152
1153static int ext4_convert_inline_data_nolock(handle_t *handle,
1154 struct inode *inode,
1155 struct ext4_iloc *iloc)
1156{
1157 int error;
1158 void *buf = NULL;
1159 struct buffer_head *data_bh = NULL;
1160 struct ext4_map_blocks map;
1161 int inline_size;
1162
1163 inline_size = ext4_get_inline_size(inode);
1164 buf = kmalloc(inline_size, GFP_NOFS);
1165 if (!buf) {
1166 error = -ENOMEM;
1167 goto out;
1168 }
1169
1170 error = ext4_read_inline_data(inode, buf, inline_size, iloc);
1171 if (error < 0)
1172 goto out;
1173
1174 error = ext4_destroy_inline_data_nolock(handle, inode);
1175 if (error)
1176 goto out;
1177
1178 map.m_lblk = 0;
1179 map.m_len = 1;
1180 map.m_flags = 0;
1181 error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
1182 if (error < 0)
1183 goto out_restore;
1184 if (!(map.m_flags & EXT4_MAP_MAPPED)) {
1185 error = -EIO;
1186 goto out_restore;
1187 }
1188
1189 data_bh = sb_getblk(inode->i_sb, map.m_pblk);
1190 if (!data_bh) {
1191 error = -EIO;
1192 goto out_restore;
1193 }
1194
1195 lock_buffer(data_bh);
1196 error = ext4_journal_get_create_access(handle, data_bh);
1197 if (error) {
1198 unlock_buffer(data_bh);
1199 error = -EIO;
1200 goto out_restore;
1201 }
1202 memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
1203
1204 if (!S_ISDIR(inode->i_mode)) {
1205 memcpy(data_bh->b_data, buf, inline_size);
1206 set_buffer_uptodate(data_bh);
1207 error = ext4_handle_dirty_metadata(handle,
1208 inode, data_bh);
1209 } else {
1210 error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
1211 buf, inline_size);
1212 }
1213
1214 unlock_buffer(data_bh);
1215out_restore:
1216 if (error)
1217 ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
1218
1219out:
1220 brelse(data_bh);
1221 kfree(buf);
1222 return error;
1223}
1224
1225/*
1226 * Try to add the new entry to the inline data.
 1227 * If it succeeds, return 0. If not, extend the inline dir and copy the
 1228 * data to a newly created block.
1229 */
1230int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
1231 struct inode *inode)
1232{
1233 int ret, inline_size;
1234 void *inline_start;
1235 struct ext4_iloc iloc;
1236 struct inode *dir = dentry->d_parent->d_inode;
1237
1238 ret = ext4_get_inode_loc(dir, &iloc);
1239 if (ret)
1240 return ret;
1241
1242 down_write(&EXT4_I(dir)->xattr_sem);
1243 if (!ext4_has_inline_data(dir))
1244 goto out;
1245
1246 inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
1247 EXT4_INLINE_DOTDOT_SIZE;
1248 inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
1249
1250 ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
1251 inline_start, inline_size);
1252 if (ret != -ENOSPC)
1253 goto out;
1254
 1255 /* Check whether it can be inserted into the inline xattr space. */
1256 inline_size = EXT4_I(dir)->i_inline_size -
1257 EXT4_MIN_INLINE_DATA_SIZE;
1258 if (!inline_size) {
 1259 /* Try to use the xattr space. */
1260 ret = ext4_update_inline_dir(handle, dir, &iloc);
1261 if (ret && ret != -ENOSPC)
1262 goto out;
1263
1264 inline_size = EXT4_I(dir)->i_inline_size -
1265 EXT4_MIN_INLINE_DATA_SIZE;
1266 }
1267
1268 if (inline_size) {
1269 inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
1270
1271 ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
1272 inline_start, inline_size);
1273
1274 if (ret != -ENOSPC)
1275 goto out;
1276 }
1277
1278 /*
1279 * The inline space is filled up, so create a new block for it.
1280 * As the extent tree will be created, we have to save the inline
1281 * dir first.
1282 */
1283 ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
1284
1285out:
1286 ext4_mark_inode_dirty(handle, dir);
1287 up_write(&EXT4_I(dir)->xattr_sem);
1288 brelse(iloc.bh);
1289 return ret;
1290}
1291
1292int ext4_read_inline_dir(struct file *filp,
1293 void *dirent, filldir_t filldir,
1294 int *has_inline_data)
1295{
1296 int error = 0;
1297 unsigned int offset, parent_ino;
1298 int i, stored;
1299 struct ext4_dir_entry_2 *de;
1300 struct super_block *sb;
1301 struct inode *inode = filp->f_path.dentry->d_inode;
1302 int ret, inline_size = 0;
1303 struct ext4_iloc iloc;
1304 void *dir_buf = NULL;
1305
1306 ret = ext4_get_inode_loc(inode, &iloc);
1307 if (ret)
1308 return ret;
1309
1310 down_read(&EXT4_I(inode)->xattr_sem);
1311 if (!ext4_has_inline_data(inode)) {
1312 up_read(&EXT4_I(inode)->xattr_sem);
1313 *has_inline_data = 0;
1314 goto out;
1315 }
1316
1317 inline_size = ext4_get_inline_size(inode);
1318 dir_buf = kmalloc(inline_size, GFP_NOFS);
1319 if (!dir_buf) {
1320 ret = -ENOMEM;
1321 up_read(&EXT4_I(inode)->xattr_sem);
1322 goto out;
1323 }
1324
1325 ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
1326 up_read(&EXT4_I(inode)->xattr_sem);
1327 if (ret < 0)
1328 goto out;
1329
1330 sb = inode->i_sb;
1331 stored = 0;
1332 parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
1333
1334 while (!error && !stored && filp->f_pos < inode->i_size) {
1335revalidate:
1336 /*
1337 * If the version has changed since the last call to
1338 * readdir(2), then we might be pointing to an invalid
1339 * dirent right now. Scan from the start of the inline
1340 * dir to make sure.
1341 */
1342 if (filp->f_version != inode->i_version) {
1343 for (i = 0;
1344 i < inode->i_size && i < offset;) {
1345 if (!i) {
1346 /* skip "." and ".." if needed. */
1347 i += EXT4_INLINE_DOTDOT_SIZE;
1348 continue;
1349 }
1350 de = (struct ext4_dir_entry_2 *)
1351 (dir_buf + i);
1352 /* It's too expensive to do a full
1353 * dirent test each time round this
1354 * loop, but we do have to test at
1355 * least that it is non-zero. A
1356 * failure will be detected in the
1357 * dirent test below. */
1358 if (ext4_rec_len_from_disk(de->rec_len,
1359 inline_size) < EXT4_DIR_REC_LEN(1))
1360 break;
1361 i += ext4_rec_len_from_disk(de->rec_len,
1362 inline_size);
1363 }
1364 offset = i;
1365 filp->f_pos = offset;
1366 filp->f_version = inode->i_version;
1367 }
1368
1369 while (!error && filp->f_pos < inode->i_size) {
1370 if (filp->f_pos == 0) {
1371 error = filldir(dirent, ".", 1, 0, inode->i_ino,
1372 DT_DIR);
1373 if (error)
1374 break;
1375 stored++;
1376
1377 error = filldir(dirent, "..", 2, 0, parent_ino,
1378 DT_DIR);
1379 if (error)
1380 break;
1381 stored++;
1382
1383 filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
1384 continue;
1385 }
1386
1387 de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
1388 if (ext4_check_dir_entry(inode, filp, de,
1389 iloc.bh, dir_buf,
1390 inline_size, offset)) {
1391 ret = stored;
1392 goto out;
1393 }
1394 offset += ext4_rec_len_from_disk(de->rec_len,
1395 inline_size);
1396 if (le32_to_cpu(de->inode)) {
1397 /* We might block in the next section
1398 * if the data destination is
1399 * currently swapped out. So, use a
1400 * version stamp to detect whether or
1401 * not the directory has been modified
1402 * during the copy operation.
1403 */
1404 u64 version = filp->f_version;
1405
1406 error = filldir(dirent, de->name,
1407 de->name_len,
1408 filp->f_pos,
1409 le32_to_cpu(de->inode),
1410 get_dtype(sb, de->file_type));
1411 if (error)
1412 break;
1413 if (version != filp->f_version)
1414 goto revalidate;
1415 stored++;
1416 }
1417 filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
1418 inline_size);
1419 }
1420 offset = 0;
1421 }
1422out:
1423 kfree(dir_buf);
1424 brelse(iloc.bh);
1425 return ret;
1426}
1427
1428struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
1429 struct ext4_dir_entry_2 **parent_de,
1430 int *retval)
1431{
1432 struct ext4_iloc iloc;
1433
1434 *retval = ext4_get_inode_loc(inode, &iloc);
1435 if (*retval)
1436 return NULL;
1437
1438 *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
1439
1440 return iloc.bh;
1441}
1442
1443/*
1444 * Try to create the inline data for the new dir.
1445 * If it succeeds, return 0, otherwise return the error.
1446 * In case of ENOSPC, the caller should create the normal disk layout dir.
1447 */
1448int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent,
1449 struct inode *inode)
1450{
1451 int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE;
1452 struct ext4_iloc iloc;
1453 struct ext4_dir_entry_2 *de;
1454
1455 ret = ext4_get_inode_loc(inode, &iloc);
1456 if (ret)
1457 return ret;
1458
1459 ret = ext4_prepare_inline_data(handle, inode, inline_size);
1460 if (ret)
1461 goto out;
1462
1463 /*
 1464 * For an inline dir, we only save the inode number of ".."
 1465 * and create a fake dentry to cover the remaining space.
1466 */
1467 de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
1468 de->inode = cpu_to_le32(parent->i_ino);
1469 de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE);
1470 de->inode = 0;
1471 de->rec_len = ext4_rec_len_to_disk(
1472 inline_size - EXT4_INLINE_DOTDOT_SIZE,
1473 inline_size);
1474 set_nlink(inode, 2);
1475 inode->i_size = EXT4_I(inode)->i_disksize = inline_size;
1476out:
1477 brelse(iloc.bh);
1478 return ret;
1479}
1480
1481struct buffer_head *ext4_find_inline_entry(struct inode *dir,
1482 const struct qstr *d_name,
1483 struct ext4_dir_entry_2 **res_dir,
1484 int *has_inline_data)
1485{
1486 int ret;
1487 struct ext4_iloc iloc;
1488 void *inline_start;
1489 int inline_size;
1490
1491 if (ext4_get_inode_loc(dir, &iloc))
1492 return NULL;
1493
1494 down_read(&EXT4_I(dir)->xattr_sem);
1495 if (!ext4_has_inline_data(dir)) {
1496 *has_inline_data = 0;
1497 goto out;
1498 }
1499
1500 inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
1501 EXT4_INLINE_DOTDOT_SIZE;
1502 inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
1503 ret = search_dir(iloc.bh, inline_start, inline_size,
1504 dir, d_name, 0, res_dir);
1505 if (ret == 1)
1506 goto out_find;
1507 if (ret < 0)
1508 goto out;
1509
1510 if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
1511 goto out;
1512
1513 inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
1514 inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
1515
1516 ret = search_dir(iloc.bh, inline_start, inline_size,
1517 dir, d_name, 0, res_dir);
1518 if (ret == 1)
1519 goto out_find;
1520
1521out:
1522 brelse(iloc.bh);
1523 iloc.bh = NULL;
1524out_find:
1525 up_read(&EXT4_I(dir)->xattr_sem);
1526 return iloc.bh;
1527}
1528
1529int ext4_delete_inline_entry(handle_t *handle,
1530 struct inode *dir,
1531 struct ext4_dir_entry_2 *de_del,
1532 struct buffer_head *bh,
1533 int *has_inline_data)
1534{
1535 int err, inline_size;
1536 struct ext4_iloc iloc;
1537 void *inline_start;
1538
1539 err = ext4_get_inode_loc(dir, &iloc);
1540 if (err)
1541 return err;
1542
1543 down_write(&EXT4_I(dir)->xattr_sem);
1544 if (!ext4_has_inline_data(dir)) {
1545 *has_inline_data = 0;
1546 goto out;
1547 }
1548
1549 if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) <
1550 EXT4_MIN_INLINE_DATA_SIZE) {
1551 inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
1552 EXT4_INLINE_DOTDOT_SIZE;
1553 inline_size = EXT4_MIN_INLINE_DATA_SIZE -
1554 EXT4_INLINE_DOTDOT_SIZE;
1555 } else {
1556 inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
1557 inline_size = ext4_get_inline_size(dir) -
1558 EXT4_MIN_INLINE_DATA_SIZE;
1559 }
1560
1561 err = ext4_journal_get_write_access(handle, bh);
1562 if (err)
1563 goto out;
1564
1565 err = ext4_generic_delete_entry(handle, dir, de_del, bh,
1566 inline_start, inline_size, 0);
1567 if (err)
1568 goto out;
1569
1570 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1571 err = ext4_mark_inode_dirty(handle, dir);
1572 if (unlikely(err))
1573 goto out;
1574
1575 ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
1576out:
1577 up_write(&EXT4_I(dir)->xattr_sem);
1578 brelse(iloc.bh);
1579 if (err != -ENOENT)
1580 ext4_std_error(dir->i_sb, err);
1581 return err;
1582}
1583
1584/*
1585 * Get the inline dentry at offset.
1586 */
1587static inline struct ext4_dir_entry_2 *
1588ext4_get_inline_entry(struct inode *inode,
1589 struct ext4_iloc *iloc,
1590 unsigned int offset,
1591 void **inline_start,
1592 int *inline_size)
1593{
1594 void *inline_pos;
1595
1596 BUG_ON(offset > ext4_get_inline_size(inode));
1597
1598 if (offset < EXT4_MIN_INLINE_DATA_SIZE) {
1599 inline_pos = (void *)ext4_raw_inode(iloc)->i_block;
1600 *inline_size = EXT4_MIN_INLINE_DATA_SIZE;
1601 } else {
1602 inline_pos = ext4_get_inline_xattr_pos(inode, iloc);
1603 offset -= EXT4_MIN_INLINE_DATA_SIZE;
1604 *inline_size = ext4_get_inline_size(inode) -
1605 EXT4_MIN_INLINE_DATA_SIZE;
1606 }
1607
1608 if (inline_start)
1609 *inline_start = inline_pos;
1610 return (struct ext4_dir_entry_2 *)(inline_pos + offset);
1611}
1612
1613int empty_inline_dir(struct inode *dir, int *has_inline_data)
1614{
1615 int err, inline_size;
1616 struct ext4_iloc iloc;
1617 void *inline_pos;
1618 unsigned int offset;
1619 struct ext4_dir_entry_2 *de;
1620 int ret = 1;
1621
1622 err = ext4_get_inode_loc(dir, &iloc);
1623 if (err) {
1624 EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
1625 err, dir->i_ino);
1626 return 1;
1627 }
1628
1629 down_read(&EXT4_I(dir)->xattr_sem);
1630 if (!ext4_has_inline_data(dir)) {
1631 *has_inline_data = 0;
1632 goto out;
1633 }
1634
1635 de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
1636 if (!le32_to_cpu(de->inode)) {
1637 ext4_warning(dir->i_sb,
1638 "bad inline directory (dir #%lu) - no `..'",
1639 dir->i_ino);
1640 ret = 1;
1641 goto out;
1642 }
1643
1644 offset = EXT4_INLINE_DOTDOT_SIZE;
1645 while (offset < dir->i_size) {
1646 de = ext4_get_inline_entry(dir, &iloc, offset,
1647 &inline_pos, &inline_size);
1648 if (ext4_check_dir_entry(dir, NULL, de,
1649 iloc.bh, inline_pos,
1650 inline_size, offset)) {
1651 ext4_warning(dir->i_sb,
1652 "bad inline directory (dir #%lu) - "
1653 "inode %u, rec_len %u, name_len %d"
1654 "inline size %d\n",
1655 dir->i_ino, le32_to_cpu(de->inode),
1656 le16_to_cpu(de->rec_len), de->name_len,
1657 inline_size);
1658 ret = 1;
1659 goto out;
1660 }
1661 if (le32_to_cpu(de->inode)) {
1662 ret = 0;
1663 goto out;
1664 }
1665 offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
1666 }
1667
1668out:
1669 up_read(&EXT4_I(dir)->xattr_sem);
1670 brelse(iloc.bh);
1671 return ret;
1672}
1673
1674int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
1675{
1676 int ret;
1677
1678 down_write(&EXT4_I(inode)->xattr_sem);
1679 ret = ext4_destroy_inline_data_nolock(handle, inode);
1680 up_write(&EXT4_I(inode)->xattr_sem);
1681
1682 return ret;
1683}
1684
1685int ext4_inline_data_fiemap(struct inode *inode,
1686 struct fiemap_extent_info *fieinfo,
1687 int *has_inline)
1688{
1689 __u64 physical = 0;
1690 __u64 length;
1691 __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST;
1692 int error = 0;
1693 struct ext4_iloc iloc;
1694
1695 down_read(&EXT4_I(inode)->xattr_sem);
1696 if (!ext4_has_inline_data(inode)) {
1697 *has_inline = 0;
1698 goto out;
1699 }
1700
1701 error = ext4_get_inode_loc(inode, &iloc);
1702 if (error)
1703 goto out;
1704
1705 physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
1706 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
1707 physical += offsetof(struct ext4_inode, i_block);
1708 length = i_size_read(inode);
1709
1710 if (physical)
1711 error = fiemap_fill_next_extent(fieinfo, 0, physical,
1712 length, flags);
1713 brelse(iloc.bh);
1714out:
1715 up_read(&EXT4_I(inode)->xattr_sem);
1716 return (error < 0 ? error : 0);
1717}
1718
1719/*
 1720 * Called during xattr set; if evicting the inline data would spare the
 1721 * space 'needed', create the extent tree and move the data to an external block.
1722 *
1723 * We use jbd2 instead of page cache to move data to the 1st block
1724 * so that the whole transaction can be committed as a whole and
1725 * the data isn't lost because of the delayed page cache write.
1726 */
1727int ext4_try_to_evict_inline_data(handle_t *handle,
1728 struct inode *inode,
1729 int needed)
1730{
1731 int error;
1732 struct ext4_xattr_entry *entry;
1733 struct ext4_xattr_ibody_header *header;
1734 struct ext4_inode *raw_inode;
1735 struct ext4_iloc iloc;
1736
1737 error = ext4_get_inode_loc(inode, &iloc);
1738 if (error)
1739 return error;
1740
1741 raw_inode = ext4_raw_inode(&iloc);
1742 header = IHDR(inode, raw_inode);
1743 entry = (struct ext4_xattr_entry *)((void *)raw_inode +
1744 EXT4_I(inode)->i_inline_off);
1745 if (EXT4_XATTR_LEN(entry->e_name_len) +
1746 EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) {
1747 error = -ENOSPC;
1748 goto out;
1749 }
1750
1751 error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
1752out:
1753 brelse(iloc.bh);
1754 return error;
1755}
1756
1757void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
1758{
1759 handle_t *handle;
1760 int inline_size, value_len, needed_blocks;
1761 size_t i_size;
1762 void *value = NULL;
1763 struct ext4_xattr_ibody_find is = {
1764 .s = { .not_found = -ENODATA, },
1765 };
1766 struct ext4_xattr_info i = {
1767 .name_index = EXT4_XATTR_INDEX_SYSTEM,
1768 .name = EXT4_XATTR_SYSTEM_DATA,
1769 };
1770
1771
1772 needed_blocks = ext4_writepage_trans_blocks(inode);
1773 handle = ext4_journal_start(inode, needed_blocks);
1774 if (IS_ERR(handle))
1775 return;
1776
1777 down_write(&EXT4_I(inode)->xattr_sem);
1778 if (!ext4_has_inline_data(inode)) {
1779 *has_inline = 0;
1780 ext4_journal_stop(handle);
1781 return;
1782 }
1783
1784 if (ext4_orphan_add(handle, inode))
1785 goto out;
1786
1787 if (ext4_get_inode_loc(inode, &is.iloc))
1788 goto out;
1789
1790 down_write(&EXT4_I(inode)->i_data_sem);
1791 i_size = inode->i_size;
1792 inline_size = ext4_get_inline_size(inode);
1793 EXT4_I(inode)->i_disksize = i_size;
1794
1795 if (i_size < inline_size) {
1796 /* Clear the content in the xattr space. */
1797 if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
1798 if (ext4_xattr_ibody_find(inode, &i, &is))
1799 goto out_error;
1800
1801 BUG_ON(is.s.not_found);
1802
1803 value_len = le32_to_cpu(is.s.here->e_value_size);
1804 value = kmalloc(value_len, GFP_NOFS);
1805 if (!value)
1806 goto out_error;
1807
1808 if (ext4_xattr_ibody_get(inode, i.name_index, i.name,
1809 value, value_len))
1810 goto out_error;
1811
1812 i.value = value;
1813 i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
1814 i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
1815 if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is))
1816 goto out_error;
1817 }
1818
1819 /* Clear the content within i_blocks. */
1820 if (i_size < EXT4_MIN_INLINE_DATA_SIZE)
1821 memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0,
1822 EXT4_MIN_INLINE_DATA_SIZE - i_size);
1823
1824 EXT4_I(inode)->i_inline_size = i_size <
1825 EXT4_MIN_INLINE_DATA_SIZE ?
1826 EXT4_MIN_INLINE_DATA_SIZE : i_size;
1827 }
1828
1829out_error:
1830 up_write(&EXT4_I(inode)->i_data_sem);
1831out:
1832 brelse(is.iloc.bh);
1833 up_write(&EXT4_I(inode)->xattr_sem);
1834 kfree(value);
1835 if (inode->i_nlink)
1836 ext4_orphan_del(handle, inode);
1837
1838 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1839 ext4_mark_inode_dirty(handle, inode);
1840 if (IS_SYNC(inode))
1841 ext4_handle_sync(handle);
1842
1843 ext4_journal_stop(handle);
1844 return;
1845}
1846
1847int ext4_convert_inline_data(struct inode *inode)
1848{
1849 int error, needed_blocks;
1850 handle_t *handle;
1851 struct ext4_iloc iloc;
1852
1853 if (!ext4_has_inline_data(inode)) {
1854 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
1855 return 0;
1856 }
1857
1858 needed_blocks = ext4_writepage_trans_blocks(inode);
1859
1860 iloc.bh = NULL;
1861 error = ext4_get_inode_loc(inode, &iloc);
1862 if (error)
1863 return error;
1864
1865 handle = ext4_journal_start(inode, needed_blocks);
1866 if (IS_ERR(handle)) {
1867 error = PTR_ERR(handle);
1868 goto out_free;
1869 }
1870
1871 down_write(&EXT4_I(inode)->xattr_sem);
1872 if (!ext4_has_inline_data(inode)) {
1873 up_write(&EXT4_I(inode)->xattr_sem);
1874 goto out;
1875 }
1876
1877 error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
1878 up_write(&EXT4_I(inode)->xattr_sem);
1879out:
1880 ext4_journal_stop(handle);
1881out_free:
1882 brelse(iloc.bh);
1883 return error;
1884}
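
The invariant behind ext4_read_inline_data() and ext4_write_inline_data() above is the storage split: the first EXT4_MIN_INLINE_DATA_SIZE (60) bytes of an inline file live in the inode's i_block array, and anything beyond that in the value of the "system.data" xattr. A standalone sketch of the write-side split-copy (the struct and helper below are hypothetical illustrations, not kernel API):

/* Illustrative only: mirrors the split-copy rule of ext4_write_inline_data().
 * Bytes [0, 60) go to i_block; the rest into the system.data xattr value. */
#include <string.h>

#define MIN_INLINE 60				/* sizeof(__le32) * EXT4_N_BLOCKS */

struct inline_layout {				/* hypothetical stand-in */
	char i_block[MIN_INLINE];		/* inode's i_block area */
	char *xattr_value;			/* "system.data" value, may be NULL */
};

static void inline_write(struct inline_layout *l, const char *buf,
			 unsigned int pos, unsigned int len)
{
	if (pos < MIN_INLINE) {
		unsigned int n = pos + len > MIN_INLINE ?
				 MIN_INLINE - pos : len;

		memcpy(l->i_block + pos, buf, n);
		buf += n;
		pos += n;
		len -= n;
	}
	if (len)	/* remainder lands in the xattr value */
		memcpy(l->xattr_value + (pos - MIN_INLINE), buf, len);
}
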
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b3c243b9afa5..cbfe13bf5b2a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -484,49 +484,6 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
484} 484}
485 485
486/* 486/*
487 * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
488 */
489static void set_buffers_da_mapped(struct inode *inode,
490 struct ext4_map_blocks *map)
491{
492 struct address_space *mapping = inode->i_mapping;
493 struct pagevec pvec;
494 int i, nr_pages;
495 pgoff_t index, end;
496
497 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
498 end = (map->m_lblk + map->m_len - 1) >>
499 (PAGE_CACHE_SHIFT - inode->i_blkbits);
500
501 pagevec_init(&pvec, 0);
502 while (index <= end) {
503 nr_pages = pagevec_lookup(&pvec, mapping, index,
504 min(end - index + 1,
505 (pgoff_t)PAGEVEC_SIZE));
506 if (nr_pages == 0)
507 break;
508 for (i = 0; i < nr_pages; i++) {
509 struct page *page = pvec.pages[i];
510 struct buffer_head *bh, *head;
511
512 if (unlikely(page->mapping != mapping) ||
513 !PageDirty(page))
514 break;
515
516 if (page_has_buffers(page)) {
517 bh = head = page_buffers(page);
518 do {
519 set_buffer_da_mapped(bh);
520 bh = bh->b_this_page;
521 } while (bh != head);
522 }
523 index++;
524 }
525 pagevec_release(&pvec);
526 }
527}
528
529/*
530 * The ext4_map_blocks() function tries to look up the requested blocks, 487 * The ext4_map_blocks() function tries to look up the requested blocks,
531 * and returns if the blocks are already mapped. 488 * and returns if the blocks are already mapped.
532 * 489 *
@@ -574,7 +531,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
574 up_read((&EXT4_I(inode)->i_data_sem)); 531 up_read((&EXT4_I(inode)->i_data_sem));
575 532
576 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 533 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
577 int ret = check_block_validity(inode, map); 534 int ret;
535 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
536 /* delayed alloc may be allocated by fallocate and
 537 * converted to initialized by direct IO.
 538 * We need to handle the delayed extent here.
539 */
540 down_write((&EXT4_I(inode)->i_data_sem));
541 goto delayed_mapped;
542 }
543 ret = check_block_validity(inode, map);
578 if (ret != 0) 544 if (ret != 0)
579 return ret; 545 return ret;
580 } 546 }
@@ -652,12 +618,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
652 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 618 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
653 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); 619 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
654 620
655 /* If we have successfully mapped the delayed allocated blocks, 621 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
656 * set the BH_Da_Mapped bit on them. Its important to do this 622 int ret;
657 * under the protection of i_data_sem. 623delayed_mapped:
 658 */ 624 /* delayed allocation blocks have been allocated */
659 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) 625 ret = ext4_es_remove_extent(inode, map->m_lblk,
660 set_buffers_da_mapped(inode, map); 626 map->m_len);
627 if (ret < 0)
628 retval = ret;
629 }
661 } 630 }
662 631
663 up_write((&EXT4_I(inode)->i_data_sem)); 632 up_write((&EXT4_I(inode)->i_data_sem));
@@ -680,10 +649,13 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
680 int ret = 0, started = 0; 649 int ret = 0, started = 0;
681 int dio_credits; 650 int dio_credits;
682 651
652 if (ext4_has_inline_data(inode))
653 return -ERANGE;
654
683 map.m_lblk = iblock; 655 map.m_lblk = iblock;
684 map.m_len = bh->b_size >> inode->i_blkbits; 656 map.m_len = bh->b_size >> inode->i_blkbits;
685 657
686 if (flags && !handle) { 658 if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
687 /* Direct IO write... */ 659 /* Direct IO write... */
688 if (map.m_len > DIO_MAX_BLOCKS) 660 if (map.m_len > DIO_MAX_BLOCKS)
689 map.m_len = DIO_MAX_BLOCKS; 661 map.m_len = DIO_MAX_BLOCKS;
@@ -798,13 +770,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
798 return NULL; 770 return NULL;
799} 771}
800 772
801static int walk_page_buffers(handle_t *handle, 773int ext4_walk_page_buffers(handle_t *handle,
802 struct buffer_head *head, 774 struct buffer_head *head,
803 unsigned from, 775 unsigned from,
804 unsigned to, 776 unsigned to,
805 int *partial, 777 int *partial,
806 int (*fn)(handle_t *handle, 778 int (*fn)(handle_t *handle,
807 struct buffer_head *bh)) 779 struct buffer_head *bh))
808{ 780{
809 struct buffer_head *bh; 781 struct buffer_head *bh;
810 unsigned block_start, block_end; 782 unsigned block_start, block_end;
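
Note: only the walker's linkage changes above; its body (not shown in this
hunk) visits every buffer head of the page and applies fn to the ones
overlapping [from, to). A minimal userspace sketch of that pattern, assuming
a circular list like bh->b_this_page; struct bh and mark() are stand-ins,
and the kernel flags *partial only for skipped buffers that are not
uptodate:

#include <stdio.h>

struct bh { struct bh *next; };		/* stand-in for buffer_head */

static int mark(struct bh *b) { (void)b; return 0; }

static int walk(struct bh *head, unsigned bsize, unsigned from, unsigned to,
		int *partial, int (*fn)(struct bh *))
{
	unsigned start = 0, end;
	struct bh *b = head;
	int err, ret = 0;

	do {
		end = start + bsize;
		if (end <= from || start >= to) {
			if (partial)
				*partial = 1;	/* this buffer was skipped */
		} else {
			err = fn(b);
			if (!ret)
				ret = err;	/* remember the first error */
		}
		start = end;
		b = b->next;
	} while (b != head);
	return ret;
}

int main(void)
{
	struct bh b[4];
	int i, partial = 0;

	for (i = 0; i < 4; i++)
		b[i].next = &b[(i + 1) % 4];	/* circular, like b_this_page */
	printf("ret=%d partial=%d\n",
	       walk(&b[0], 1024, 1024, 3072, &partial, mark), partial);
	return 0;
}
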
@@ -854,8 +826,8 @@ static int walk_page_buffers(handle_t *handle,
854 * is elevated. We'll still have enough credits for the tiny quotafile 826 * is elevated. We'll still have enough credits for the tiny quotafile
855 * write. 827 * write.
856 */ 828 */
857static int do_journal_get_write_access(handle_t *handle, 829int do_journal_get_write_access(handle_t *handle,
858 struct buffer_head *bh) 830 struct buffer_head *bh)
859{ 831{
860 int dirty = buffer_dirty(bh); 832 int dirty = buffer_dirty(bh);
861 int ret; 833 int ret;
@@ -878,7 +850,7 @@ static int do_journal_get_write_access(handle_t *handle,
878 return ret; 850 return ret;
879} 851}
880 852
881static int ext4_get_block_write(struct inode *inode, sector_t iblock, 853static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
882 struct buffer_head *bh_result, int create); 854 struct buffer_head *bh_result, int create);
883static int ext4_write_begin(struct file *file, struct address_space *mapping, 855static int ext4_write_begin(struct file *file, struct address_space *mapping,
884 loff_t pos, unsigned len, unsigned flags, 856 loff_t pos, unsigned len, unsigned flags,
@@ -902,6 +874,17 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
902 from = pos & (PAGE_CACHE_SIZE - 1); 874 from = pos & (PAGE_CACHE_SIZE - 1);
903 to = from + len; 875 to = from + len;
904 876
877 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
878 ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
879 flags, pagep);
880 if (ret < 0)
881 goto out;
882 if (ret == 1) {
883 ret = 0;
884 goto out;
885 }
886 }
887
905retry: 888retry:
906 handle = ext4_journal_start(inode, needed_blocks); 889 handle = ext4_journal_start(inode, needed_blocks);
907 if (IS_ERR(handle)) { 890 if (IS_ERR(handle)) {
@@ -919,6 +902,7 @@ retry:
919 ret = -ENOMEM; 902 ret = -ENOMEM;
920 goto out; 903 goto out;
921 } 904 }
905
922 *pagep = page; 906 *pagep = page;
923 907
924 if (ext4_should_dioread_nolock(inode)) 908 if (ext4_should_dioread_nolock(inode))
@@ -927,8 +911,9 @@ retry:
927 ret = __block_write_begin(page, pos, len, ext4_get_block); 911 ret = __block_write_begin(page, pos, len, ext4_get_block);
928 912
929 if (!ret && ext4_should_journal_data(inode)) { 913 if (!ret && ext4_should_journal_data(inode)) {
930 ret = walk_page_buffers(handle, page_buffers(page), 914 ret = ext4_walk_page_buffers(handle, page_buffers(page),
931 from, to, NULL, do_journal_get_write_access); 915 from, to, NULL,
916 do_journal_get_write_access);
932 } 917 }
933 918
934 if (ret) { 919 if (ret) {
@@ -983,7 +968,12 @@ static int ext4_generic_write_end(struct file *file,
983 struct inode *inode = mapping->host; 968 struct inode *inode = mapping->host;
984 handle_t *handle = ext4_journal_current_handle(); 969 handle_t *handle = ext4_journal_current_handle();
985 970
986 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 971 if (ext4_has_inline_data(inode))
972 copied = ext4_write_inline_data_end(inode, pos, len,
973 copied, page);
974 else
975 copied = block_write_end(file, mapping, pos,
976 len, copied, page, fsdata);
987 977
988 /* 978 /*
989 * No need to use i_size_read() here, the i_size 979 * No need to use i_size_read() here, the i_size
@@ -1134,16 +1124,21 @@ static int ext4_journalled_write_end(struct file *file,
1134 1124
1135 BUG_ON(!ext4_handle_valid(handle)); 1125 BUG_ON(!ext4_handle_valid(handle));
1136 1126
1137 if (copied < len) { 1127 if (ext4_has_inline_data(inode))
1138 if (!PageUptodate(page)) 1128 copied = ext4_write_inline_data_end(inode, pos, len,
1139 copied = 0; 1129 copied, page);
1140 page_zero_new_buffers(page, from+copied, to); 1130 else {
1141 } 1131 if (copied < len) {
1132 if (!PageUptodate(page))
1133 copied = 0;
1134 page_zero_new_buffers(page, from+copied, to);
1135 }
1142 1136
1143 ret = walk_page_buffers(handle, page_buffers(page), from, 1137 ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
1144 to, &partial, write_end_fn); 1138 to, &partial, write_end_fn);
1145 if (!partial) 1139 if (!partial)
1146 SetPageUptodate(page); 1140 SetPageUptodate(page);
1141 }
1147 new_i_size = pos + copied; 1142 new_i_size = pos + copied;
1148 if (new_i_size > inode->i_size) 1143 if (new_i_size > inode->i_size)
1149 i_size_write(inode, pos+copied); 1144 i_size_write(inode, pos+copied);
@@ -1301,6 +1296,7 @@ static void ext4_da_page_release_reservation(struct page *page,
1301 struct inode *inode = page->mapping->host; 1296 struct inode *inode = page->mapping->host;
1302 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1297 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1303 int num_clusters; 1298 int num_clusters;
1299 ext4_fsblk_t lblk;
1304 1300
1305 head = page_buffers(page); 1301 head = page_buffers(page);
1306 bh = head; 1302 bh = head;
@@ -1310,20 +1306,23 @@ static void ext4_da_page_release_reservation(struct page *page,
1310 if ((offset <= curr_off) && (buffer_delay(bh))) { 1306 if ((offset <= curr_off) && (buffer_delay(bh))) {
1311 to_release++; 1307 to_release++;
1312 clear_buffer_delay(bh); 1308 clear_buffer_delay(bh);
1313 clear_buffer_da_mapped(bh);
1314 } 1309 }
1315 curr_off = next_off; 1310 curr_off = next_off;
1316 } while ((bh = bh->b_this_page) != head); 1311 } while ((bh = bh->b_this_page) != head);
1317 1312
1313 if (to_release) {
1314 lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1315 ext4_es_remove_extent(inode, lblk, to_release);
1316 }
1317
1318 /* If we have released all the blocks belonging to a cluster, then we 1318 /* If we have released all the blocks belonging to a cluster, then we
1319 * need to release the reserved space for that cluster. */ 1319 * need to release the reserved space for that cluster. */
1320 num_clusters = EXT4_NUM_B2C(sbi, to_release); 1320 num_clusters = EXT4_NUM_B2C(sbi, to_release);
1321 while (num_clusters > 0) { 1321 while (num_clusters > 0) {
1322 ext4_fsblk_t lblk;
1323 lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + 1322 lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
1324 ((num_clusters - 1) << sbi->s_cluster_bits); 1323 ((num_clusters - 1) << sbi->s_cluster_bits);
1325 if (sbi->s_cluster_ratio == 1 || 1324 if (sbi->s_cluster_ratio == 1 ||
1326 !ext4_find_delalloc_cluster(inode, lblk, 1)) 1325 !ext4_find_delalloc_cluster(inode, lblk))
1327 ext4_da_release_space(inode, 1); 1326 ext4_da_release_space(inode, 1);
1328 1327
1329 num_clusters--; 1328 num_clusters--;
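
Note: the new lblk computation above is a plain shift between page index and
logical block number. A standalone check of that arithmetic, assuming 4K
pages and 1K blocks (PAGE_CACHE_SHIFT = 12, i_blkbits = 10):

#include <stdio.h>

int main(void)
{
	unsigned page_shift = 12, blkbits = 10;	/* assumed sizes */
	unsigned long index = 3;		/* fourth page of the file */
	unsigned long lblk = index << (page_shift - blkbits);
	unsigned blocks_per_page = 1u << (page_shift - blkbits);

	printf("page %lu -> lblk %lu, %u blocks per page\n",
	       index, lblk, blocks_per_page);	/* page 3 -> lblk 12, 4 */
	return 0;
}
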
@@ -1429,8 +1428,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
1429 clear_buffer_delay(bh); 1428 clear_buffer_delay(bh);
1430 bh->b_blocknr = pblock; 1429 bh->b_blocknr = pblock;
1431 } 1430 }
1432 if (buffer_da_mapped(bh))
1433 clear_buffer_da_mapped(bh);
1434 if (buffer_unwritten(bh) || 1431 if (buffer_unwritten(bh) ||
1435 buffer_mapped(bh)) 1432 buffer_mapped(bh))
1436 BUG_ON(bh->b_blocknr != pblock); 1433 BUG_ON(bh->b_blocknr != pblock);
@@ -1500,9 +1497,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1500 struct pagevec pvec; 1497 struct pagevec pvec;
1501 struct inode *inode = mpd->inode; 1498 struct inode *inode = mpd->inode;
1502 struct address_space *mapping = inode->i_mapping; 1499 struct address_space *mapping = inode->i_mapping;
1500 ext4_lblk_t start, last;
1503 1501
1504 index = mpd->first_page; 1502 index = mpd->first_page;
1505 end = mpd->next_page - 1; 1503 end = mpd->next_page - 1;
1504
1505 start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1506 last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1507 ext4_es_remove_extent(inode, start, last - start + 1);
1508
1509 pagevec_init(&pvec, 0);
1506 while (index <= end) { 1510 while (index <= end) {
1507 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1511 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1508 if (nr_pages == 0) 1512 if (nr_pages == 0)
@@ -1656,15 +1660,6 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1656 1660
1657 for (i = 0; i < map.m_len; i++) 1661 for (i = 0; i < map.m_len; i++)
1658 unmap_underlying_metadata(bdev, map.m_pblk + i); 1662 unmap_underlying_metadata(bdev, map.m_pblk + i);
1659
1660 if (ext4_should_order_data(mpd->inode)) {
1661 err = ext4_jbd2_file_inode(handle, mpd->inode);
1662 if (err) {
1663 /* Only if the journal is aborted */
1664 mpd->retval = err;
1665 goto submit_io;
1666 }
1667 }
1668 } 1663 }
1669 1664
1670 /* 1665 /*
@@ -1795,7 +1790,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1795 * file system block. 1790 * file system block.
1796 */ 1791 */
1797 down_read((&EXT4_I(inode)->i_data_sem)); 1792 down_read((&EXT4_I(inode)->i_data_sem));
1798 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 1793 if (ext4_has_inline_data(inode)) {
1794 /*
1795 * We will soon create blocks for this page, and let
1796 * us pretend as if the blocks aren't allocated yet.
1797 * In case of clusters, we have to handle the work
1798 * of mapping from cluster so that the reserved space
1799 * is calculated properly.
1800 */
1801 if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
1802 ext4_find_delalloc_cluster(inode, map->m_lblk))
1803 map->m_flags |= EXT4_MAP_FROM_CLUSTER;
1804 retval = 0;
1805 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1799 retval = ext4_ext_map_blocks(NULL, inode, map, 0); 1806 retval = ext4_ext_map_blocks(NULL, inode, map, 0);
1800 else 1807 else
1801 retval = ext4_ind_map_blocks(NULL, inode, map, 0); 1808 retval = ext4_ind_map_blocks(NULL, inode, map, 0);
@@ -1814,6 +1821,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1814 goto out_unlock; 1821 goto out_unlock;
1815 } 1822 }
1816 1823
1824 retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
1825 if (retval)
1826 goto out_unlock;
1827
1817 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served 1828 /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
1818 * and it should not appear on the bh->b_state. 1829 * and it should not appear on the bh->b_state.
1819 */ 1830 */
@@ -1842,8 +1853,8 @@ out_unlock:
 1842 * We also have b_blocknr = physical block mapping unwritten extent and b_bdev 1853 * We also have b_blocknr = physical block mapping unwritten extent and b_bdev
1843 * initialized properly. 1854 * initialized properly.
1844 */ 1855 */
1845static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 1856int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
1846 struct buffer_head *bh, int create) 1857 struct buffer_head *bh, int create)
1847{ 1858{
1848 struct ext4_map_blocks map; 1859 struct ext4_map_blocks map;
1849 int ret = 0; 1860 int ret = 0;
@@ -1917,15 +1928,29 @@ static int __ext4_journalled_writepage(struct page *page,
1917{ 1928{
1918 struct address_space *mapping = page->mapping; 1929 struct address_space *mapping = page->mapping;
1919 struct inode *inode = mapping->host; 1930 struct inode *inode = mapping->host;
1920 struct buffer_head *page_bufs; 1931 struct buffer_head *page_bufs = NULL;
1921 handle_t *handle = NULL; 1932 handle_t *handle = NULL;
1922 int ret = 0; 1933 int ret = 0, err = 0;
1923 int err; 1934 int inline_data = ext4_has_inline_data(inode);
1935 struct buffer_head *inode_bh = NULL;
1924 1936
1925 ClearPageChecked(page); 1937 ClearPageChecked(page);
1926 page_bufs = page_buffers(page); 1938
1927 BUG_ON(!page_bufs); 1939 if (inline_data) {
1928 walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); 1940 BUG_ON(page->index != 0);
1941 BUG_ON(len > ext4_get_max_inline_size(inode));
1942 inode_bh = ext4_journalled_write_inline_data(inode, len, page);
1943 if (inode_bh == NULL)
1944 goto out;
1945 } else {
1946 page_bufs = page_buffers(page);
1947 if (!page_bufs) {
1948 BUG();
1949 goto out;
1950 }
1951 ext4_walk_page_buffers(handle, page_bufs, 0, len,
1952 NULL, bget_one);
1953 }
1929 /* As soon as we unlock the page, it can go away, but we have 1954 /* As soon as we unlock the page, it can go away, but we have
1930 * references to buffers so we are safe */ 1955 * references to buffers so we are safe */
1931 unlock_page(page); 1956 unlock_page(page);
@@ -1938,11 +1963,18 @@ static int __ext4_journalled_writepage(struct page *page,
1938 1963
1939 BUG_ON(!ext4_handle_valid(handle)); 1964 BUG_ON(!ext4_handle_valid(handle));
1940 1965
1941 ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, 1966 if (inline_data) {
1942 do_journal_get_write_access); 1967 ret = ext4_journal_get_write_access(handle, inode_bh);
1943 1968
1944 err = walk_page_buffers(handle, page_bufs, 0, len, NULL, 1969 err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
1945 write_end_fn); 1970
1971 } else {
1972 ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
1973 do_journal_get_write_access);
1974
1975 err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
1976 write_end_fn);
1977 }
1946 if (ret == 0) 1978 if (ret == 0)
1947 ret = err; 1979 ret = err;
1948 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1980 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -1950,9 +1982,12 @@ static int __ext4_journalled_writepage(struct page *page,
1950 if (!ret) 1982 if (!ret)
1951 ret = err; 1983 ret = err;
1952 1984
1953 walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); 1985 if (!ext4_has_inline_data(inode))
1986 ext4_walk_page_buffers(handle, page_bufs, 0, len,
1987 NULL, bput_one);
1954 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1988 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1955out: 1989out:
1990 brelse(inode_bh);
1956 return ret; 1991 return ret;
1957} 1992}
1958 1993
@@ -2029,8 +2064,8 @@ static int ext4_writepage(struct page *page,
2029 commit_write = 1; 2064 commit_write = 1;
2030 } 2065 }
2031 page_bufs = page_buffers(page); 2066 page_bufs = page_buffers(page);
2032 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2067 if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2033 ext4_bh_delay_or_unwritten)) { 2068 ext4_bh_delay_or_unwritten)) {
2034 /* 2069 /*
2035 * We don't want to do block allocation, so redirty 2070 * We don't want to do block allocation, so redirty
2036 * the page and return. We may reach here when we do 2071 * the page and return. We may reach here when we do
@@ -2096,7 +2131,8 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2096 * mpage_da_map_and_submit to map a single contiguous memory region 2131 * mpage_da_map_and_submit to map a single contiguous memory region
2097 * and then write them. 2132 * and then write them.
2098 */ 2133 */
2099static int write_cache_pages_da(struct address_space *mapping, 2134static int write_cache_pages_da(handle_t *handle,
2135 struct address_space *mapping,
2100 struct writeback_control *wbc, 2136 struct writeback_control *wbc,
2101 struct mpage_da_data *mpd, 2137 struct mpage_da_data *mpd,
2102 pgoff_t *done_index) 2138 pgoff_t *done_index)
@@ -2175,6 +2211,17 @@ static int write_cache_pages_da(struct address_space *mapping,
2175 wait_on_page_writeback(page); 2211 wait_on_page_writeback(page);
2176 BUG_ON(PageWriteback(page)); 2212 BUG_ON(PageWriteback(page));
2177 2213
2214 /*
2215 * If we have inline data and arrive here, it means that
2216 * we will soon create the block for the 1st page, so
2217 * we'd better clear the inline data here.
2218 */
2219 if (ext4_has_inline_data(inode)) {
2220 BUG_ON(ext4_test_inode_state(inode,
2221 EXT4_STATE_MAY_INLINE_DATA));
2222 ext4_destroy_inline_data(handle, inode);
2223 }
2224
2178 if (mpd->next_page != page->index) 2225 if (mpd->next_page != page->index)
2179 mpd->first_page = page->index; 2226 mpd->first_page = page->index;
2180 mpd->next_page = page->index + 1; 2227 mpd->next_page = page->index + 1;
@@ -2381,7 +2428,8 @@ retry:
2381 * contiguous region of logical blocks that need 2428 * contiguous region of logical blocks that need
2382 * blocks to be allocated by ext4 and submit them. 2429 * blocks to be allocated by ext4 and submit them.
2383 */ 2430 */
2384 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); 2431 ret = write_cache_pages_da(handle, mapping,
2432 wbc, &mpd, &done_index);
2385 /* 2433 /*
2386 * If we have a contiguous extent of pages and we 2434 * If we have a contiguous extent of pages and we
2387 * haven't done the I/O yet, map the blocks and submit 2435 * haven't done the I/O yet, map the blocks and submit
@@ -2445,7 +2493,6 @@ out_writepages:
2445 return ret; 2493 return ret;
2446} 2494}
2447 2495
2448#define FALL_BACK_TO_NONDELALLOC 1
2449static int ext4_nonda_switch(struct super_block *sb) 2496static int ext4_nonda_switch(struct super_block *sb)
2450{ 2497{
2451 s64 free_blocks, dirty_blocks; 2498 s64 free_blocks, dirty_blocks;
@@ -2502,6 +2549,19 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2502 } 2549 }
2503 *fsdata = (void *)0; 2550 *fsdata = (void *)0;
2504 trace_ext4_da_write_begin(inode, pos, len, flags); 2551 trace_ext4_da_write_begin(inode, pos, len, flags);
2552
2553 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
2554 ret = ext4_da_write_inline_data_begin(mapping, inode,
2555 pos, len, flags,
2556 pagep, fsdata);
2557 if (ret < 0)
2558 goto out;
2559 if (ret == 1) {
2560 ret = 0;
2561 goto out;
2562 }
2563 }
2564
2505retry: 2565retry:
2506 /* 2566 /*
2507 * With delayed allocation, we don't log the i_disksize update 2567 * With delayed allocation, we don't log the i_disksize update
@@ -2603,22 +2663,13 @@ static int ext4_da_write_end(struct file *file,
2603 * changes. So let's piggyback the i_disksize mark_inode_dirty 2663 * changes. So let's piggyback the i_disksize mark_inode_dirty
2604 * into that. 2664 * into that.
2605 */ 2665 */
2606
2607 new_i_size = pos + copied; 2666 new_i_size = pos + copied;
2608 if (copied && new_i_size > EXT4_I(inode)->i_disksize) { 2667 if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
2609 if (ext4_da_should_update_i_disksize(page, end)) { 2668 if (ext4_has_inline_data(inode) ||
2669 ext4_da_should_update_i_disksize(page, end)) {
2610 down_write(&EXT4_I(inode)->i_data_sem); 2670 down_write(&EXT4_I(inode)->i_data_sem);
2611 if (new_i_size > EXT4_I(inode)->i_disksize) { 2671 if (new_i_size > EXT4_I(inode)->i_disksize)
2612 /*
2613 * Updating i_disksize when extending file
2614 * without needing block allocation
2615 */
2616 if (ext4_should_order_data(inode))
2617 ret = ext4_jbd2_file_inode(handle,
2618 inode);
2619
2620 EXT4_I(inode)->i_disksize = new_i_size; 2672 EXT4_I(inode)->i_disksize = new_i_size;
2621 }
2622 up_write(&EXT4_I(inode)->i_data_sem); 2673 up_write(&EXT4_I(inode)->i_data_sem);
2623 /* We need to mark inode dirty even if 2674 /* We need to mark inode dirty even if
 2624 * new_i_size is less than inode->i_size 2675 * new_i_size is less than inode->i_size
@@ -2627,8 +2678,16 @@ static int ext4_da_write_end(struct file *file,
2627 ext4_mark_inode_dirty(handle, inode); 2678 ext4_mark_inode_dirty(handle, inode);
2628 } 2679 }
2629 } 2680 }
2630 ret2 = generic_write_end(file, mapping, pos, len, copied, 2681
2682 if (write_mode != CONVERT_INLINE_DATA &&
2683 ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
2684 ext4_has_inline_data(inode))
2685 ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
2686 page);
2687 else
2688 ret2 = generic_write_end(file, mapping, pos, len, copied,
2631 page, fsdata); 2689 page, fsdata);
2690
2632 copied = ret2; 2691 copied = ret2;
2633 if (ret2 < 0) 2692 if (ret2 < 0)
2634 ret = ret2; 2693 ret = ret2;
@@ -2721,6 +2780,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2721 journal_t *journal; 2780 journal_t *journal;
2722 int err; 2781 int err;
2723 2782
2783 /*
2784 * We can get here for an inline file via the FIBMAP ioctl
2785 */
2786 if (ext4_has_inline_data(inode))
2787 return 0;
2788
2724 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 2789 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
2725 test_opt(inode->i_sb, DELALLOC)) { 2790 test_opt(inode->i_sb, DELALLOC)) {
2726 /* 2791 /*
@@ -2766,14 +2831,30 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2766 2831
2767static int ext4_readpage(struct file *file, struct page *page) 2832static int ext4_readpage(struct file *file, struct page *page)
2768{ 2833{
2834 int ret = -EAGAIN;
2835 struct inode *inode = page->mapping->host;
2836
2769 trace_ext4_readpage(page); 2837 trace_ext4_readpage(page);
2770 return mpage_readpage(page, ext4_get_block); 2838
2839 if (ext4_has_inline_data(inode))
2840 ret = ext4_readpage_inline(inode, page);
2841
2842 if (ret == -EAGAIN)
2843 return mpage_readpage(page, ext4_get_block);
2844
2845 return ret;
2771} 2846}
2772 2847
2773static int 2848static int
2774ext4_readpages(struct file *file, struct address_space *mapping, 2849ext4_readpages(struct file *file, struct address_space *mapping,
2775 struct list_head *pages, unsigned nr_pages) 2850 struct list_head *pages, unsigned nr_pages)
2776{ 2851{
2852 struct inode *inode = mapping->host;
2853
2854 /* If the file has inline data, no need to do readpages. */
2855 if (ext4_has_inline_data(inode))
2856 return 0;
2857
2777 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2858 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2778} 2859}
2779 2860
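
Note: ext4_readpage() above adopts a try-then-fall-back convention: the
inline handler returns -EAGAIN to mean "not inline data, take the block
path" and otherwise owns the result. A reduced userspace sketch of that
convention, with read_inline() and read_blocks() as stand-ins:

#include <errno.h>
#include <stdio.h>

static int read_inline(void)
{
	return 0;			/* inline handler served the page */
}

static int read_blocks(void)
{
	return 0;			/* stands in for mpage_readpage() */
}

static int readpage(int has_inline)
{
	int ret = -EAGAIN;

	if (has_inline)
		ret = read_inline();
	if (ret == -EAGAIN)		/* not inline: use the normal path */
		ret = read_blocks();
	return ret;
}

int main(void)
{
	printf("%d %d\n", readpage(0), readpage(1));
	return 0;
}
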
@@ -2799,8 +2880,6 @@ static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offs
2799 2880
2800static void ext4_invalidatepage(struct page *page, unsigned long offset) 2881static void ext4_invalidatepage(struct page *page, unsigned long offset)
2801{ 2882{
2802 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
2803
2804 trace_ext4_invalidatepage(page, offset); 2883 trace_ext4_invalidatepage(page, offset);
2805 2884
2806 /* 2885 /*
@@ -2808,16 +2887,34 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
2808 */ 2887 */
2809 if (ext4_should_dioread_nolock(page->mapping->host)) 2888 if (ext4_should_dioread_nolock(page->mapping->host))
2810 ext4_invalidatepage_free_endio(page, offset); 2889 ext4_invalidatepage_free_endio(page, offset);
2890
2891 /* No journalling happens on data buffers when this function is used */
2892 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
2893
2894 block_invalidatepage(page, offset);
2895}
2896
2897static int __ext4_journalled_invalidatepage(struct page *page,
2898 unsigned long offset)
2899{
2900 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
2901
2902 trace_ext4_journalled_invalidatepage(page, offset);
2903
2811 /* 2904 /*
2812 * If it's a full truncate we just forget about the pending dirtying 2905 * If it's a full truncate we just forget about the pending dirtying
2813 */ 2906 */
2814 if (offset == 0) 2907 if (offset == 0)
2815 ClearPageChecked(page); 2908 ClearPageChecked(page);
2816 2909
2817 if (journal) 2910 return jbd2_journal_invalidatepage(journal, page, offset);
2818 jbd2_journal_invalidatepage(journal, page, offset); 2911}
2819 else 2912
2820 block_invalidatepage(page, offset); 2913/* Wrapper for aops... */
2914static void ext4_journalled_invalidatepage(struct page *page,
2915 unsigned long offset)
2916{
2917 WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0);
2821} 2918}
2822 2919
2823static int ext4_releasepage(struct page *page, gfp_t wait) 2920static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2840,7 +2937,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 2840 * We allocate an uninitialized extent if blocks haven't been allocated. 2937 * We allocate an uninitialized extent if blocks haven't been allocated.
2841 * The extent will be converted to initialized after the IO is complete. 2938 * The extent will be converted to initialized after the IO is complete.
2842 */ 2939 */
2843static int ext4_get_block_write(struct inode *inode, sector_t iblock, 2940int ext4_get_block_write(struct inode *inode, sector_t iblock,
2844 struct buffer_head *bh_result, int create) 2941 struct buffer_head *bh_result, int create)
2845{ 2942{
2846 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 2943 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
@@ -2850,29 +2947,12 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
2850} 2947}
2851 2948
2852static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, 2949static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
2853 struct buffer_head *bh_result, int flags) 2950 struct buffer_head *bh_result, int create)
2854{ 2951{
2855 handle_t *handle = ext4_journal_current_handle(); 2952 ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
2856 struct ext4_map_blocks map; 2953 inode->i_ino, create);
2857 int ret = 0; 2954 return _ext4_get_block(inode, iblock, bh_result,
2858 2955 EXT4_GET_BLOCKS_NO_LOCK);
2859 ext4_debug("ext4_get_block_write_nolock: inode %lu, flag %d\n",
2860 inode->i_ino, flags);
2861
2862 flags = EXT4_GET_BLOCKS_NO_LOCK;
2863
2864 map.m_lblk = iblock;
2865 map.m_len = bh_result->b_size >> inode->i_blkbits;
2866
2867 ret = ext4_map_blocks(handle, inode, &map, flags);
2868 if (ret > 0) {
2869 map_bh(bh_result, inode->i_sb, map.m_pblk);
2870 bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
2871 map.m_flags;
2872 bh_result->b_size = inode->i_sb->s_blocksize * map.m_len;
2873 ret = 0;
2874 }
2875 return ret;
2876} 2956}
2877 2957
2878static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 2958static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
@@ -2978,10 +3058,10 @@ retry:
2978 * fall back to buffered IO. 3058 * fall back to buffered IO.
2979 * 3059 *
2980 * For holes, we fallocate those blocks, mark them as uninitialized 3060 * For holes, we fallocate those blocks, mark them as uninitialized
 2981 * If those blocks were preallocated, we mark sure they are splited, but 3061 * If those blocks were preallocated, we make sure they are split, but
2982 * still keep the range to write as uninitialized. 3062 * still keep the range to write as uninitialized.
2983 * 3063 *
2984 * The unwrritten extents will be converted to written when DIO is completed. 3064 * The unwritten extents will be converted to written when DIO is completed.
 2985 * For async direct IO, since the IO may still be pending when we return, we 3065 * For async direct IO, since the IO may still be pending when we return, we
 2986 * set up an end_io call back function, which will do the conversion 3066 * set up an end_io call back function, which will do the conversion
 2987 * when the async direct IO is completed. 3067 * when the async direct IO is completed.
@@ -2999,125 +3079,120 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
2999 struct inode *inode = file->f_mapping->host; 3079 struct inode *inode = file->f_mapping->host;
3000 ssize_t ret; 3080 ssize_t ret;
3001 size_t count = iov_length(iov, nr_segs); 3081 size_t count = iov_length(iov, nr_segs);
3002 3082 int overwrite = 0;
3083 get_block_t *get_block_func = NULL;
3084 int dio_flags = 0;
3003 loff_t final_size = offset + count; 3085 loff_t final_size = offset + count;
3004 if (rw == WRITE && final_size <= inode->i_size) {
3005 int overwrite = 0;
3006 3086
3007 BUG_ON(iocb->private == NULL); 3087 /* Use the old path for reads and writes beyond i_size. */
3088 if (rw != WRITE || final_size > inode->i_size)
3089 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3008 3090
3009 /* If we do a overwrite dio, i_mutex locking can be released */ 3091 BUG_ON(iocb->private == NULL);
3010 overwrite = *((int *)iocb->private);
3011 3092
3012 if (overwrite) { 3093 /* If we do a overwrite dio, i_mutex locking can be released */
3013 atomic_inc(&inode->i_dio_count); 3094 overwrite = *((int *)iocb->private);
3014 down_read(&EXT4_I(inode)->i_data_sem);
3015 mutex_unlock(&inode->i_mutex);
3016 }
3017 3095
3018 /* 3096 if (overwrite) {
3019 * We could direct write to holes and fallocate. 3097 atomic_inc(&inode->i_dio_count);
3020 * 3098 down_read(&EXT4_I(inode)->i_data_sem);
3021 * Allocated blocks to fill the hole are marked as uninitialized 3099 mutex_unlock(&inode->i_mutex);
3022 * to prevent parallel buffered read to expose the stale data 3100 }
3023 * before DIO complete the data IO.
3024 *
3025 * As to previously fallocated extents, ext4 get_block
3026 * will just simply mark the buffer mapped but still
3027 * keep the extents uninitialized.
3028 *
3029 * for non AIO case, we will convert those unwritten extents
3030 * to written after return back from blockdev_direct_IO.
3031 *
3032 * for async DIO, the conversion needs to be defered when
3033 * the IO is completed. The ext4 end_io callback function
3034 * will be called to take care of the conversion work.
3035 * Here for async case, we allocate an io_end structure to
3036 * hook to the iocb.
3037 */
3038 iocb->private = NULL;
3039 ext4_inode_aio_set(inode, NULL);
3040 if (!is_sync_kiocb(iocb)) {
3041 ext4_io_end_t *io_end =
3042 ext4_init_io_end(inode, GFP_NOFS);
3043 if (!io_end) {
3044 ret = -ENOMEM;
3045 goto retake_lock;
3046 }
3047 io_end->flag |= EXT4_IO_END_DIRECT;
3048 iocb->private = io_end;
3049 /*
3050 * we save the io structure for current async
3051 * direct IO, so that later ext4_map_blocks()
3052 * could flag the io structure whether there
3053 * is a unwritten extents needs to be converted
3054 * when IO is completed.
3055 */
3056 ext4_inode_aio_set(inode, io_end);
3057 }
3058 3101
3059 if (overwrite) 3102 /*
3060 ret = __blockdev_direct_IO(rw, iocb, inode, 3103 * We could direct write to holes and fallocate.
3061 inode->i_sb->s_bdev, iov, 3104 *
3062 offset, nr_segs, 3105 * Allocated blocks to fill the hole are marked as
 3063 ext4_get_block_write_nolock, 3106 * uninitialized to prevent a parallel buffered read from exposing
 3064 ext4_end_io_dio, 3107 * the stale data before DIO completes the data IO.
3065 NULL, 3108 *
3066 0); 3109 * As to previously fallocated extents, ext4 get_block will
3067 else 3110 * just simply mark the buffer mapped but still keep the
3068 ret = __blockdev_direct_IO(rw, iocb, inode, 3111 * extents uninitialized.
3069 inode->i_sb->s_bdev, iov, 3112 *
3070 offset, nr_segs, 3113 * For non AIO case, we will convert those unwritten extents
3071 ext4_get_block_write, 3114 * to written after return back from blockdev_direct_IO.
3072 ext4_end_io_dio, 3115 *
3073 NULL, 3116 * For async DIO, the conversion needs to be deferred when the
3074 DIO_LOCKING); 3117 * IO is completed. The ext4 end_io callback function will be
3075 if (iocb->private) 3118 * called to take care of the conversion work. Here for async
3076 ext4_inode_aio_set(inode, NULL); 3119 * case, we allocate an io_end structure to hook to the iocb.
3120 */
3121 iocb->private = NULL;
3122 ext4_inode_aio_set(inode, NULL);
3123 if (!is_sync_kiocb(iocb)) {
3124 ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
3125 if (!io_end) {
3126 ret = -ENOMEM;
3127 goto retake_lock;
3128 }
3129 io_end->flag |= EXT4_IO_END_DIRECT;
3130 iocb->private = io_end;
3077 /* 3131 /*
3078 * The io_end structure takes a reference to the inode, 3132 * we save the io structure for current async direct
3079 * that structure needs to be destroyed and the 3133 * IO, so that later ext4_map_blocks() could flag the
 3080 * reference to the inode need to be dropped, when IO is 3134 * io structure whether there are unwritten extents that
 3081 * complete, even with 0 byte write, or failed. 3135 * need to be converted when IO is completed.
3082 *
3083 * In the successful AIO DIO case, the io_end structure will be
3084 * desctroyed and the reference to the inode will be dropped
3085 * after the end_io call back function is called.
3086 *
3087 * In the case there is 0 byte write, or error case, since
3088 * VFS direct IO won't invoke the end_io call back function,
3089 * we need to free the end_io structure here.
3090 */ 3136 */
3091 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3137 ext4_inode_aio_set(inode, io_end);
3092 ext4_free_io_end(iocb->private); 3138 }
3093 iocb->private = NULL;
3094 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3095 EXT4_STATE_DIO_UNWRITTEN)) {
3096 int err;
3097 /*
3098 * for non AIO case, since the IO is already
3099 * completed, we could do the conversion right here
3100 */
3101 err = ext4_convert_unwritten_extents(inode,
3102 offset, ret);
3103 if (err < 0)
3104 ret = err;
3105 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3106 }
3107 3139
3108 retake_lock: 3140 if (overwrite) {
 3109 /* take i_mutex locking again if we do an overwrite dio */ 3141 get_block_func = ext4_get_block_write_nolock;
3110 if (overwrite) { 3142 } else {
3111 inode_dio_done(inode); 3143 get_block_func = ext4_get_block_write;
3112 up_read(&EXT4_I(inode)->i_data_sem); 3144 dio_flags = DIO_LOCKING;
3113 mutex_lock(&inode->i_mutex); 3145 }
3114 } 3146 ret = __blockdev_direct_IO(rw, iocb, inode,
3147 inode->i_sb->s_bdev, iov,
3148 offset, nr_segs,
3149 get_block_func,
3150 ext4_end_io_dio,
3151 NULL,
3152 dio_flags);
3153
3154 if (iocb->private)
3155 ext4_inode_aio_set(inode, NULL);
3156 /*
3157 * The io_end structure takes a reference to the inode, that
3158 * structure needs to be destroyed and the reference to the
 3159 * inode needs to be dropped, when IO is complete, even with 0
3160 * byte write, or failed.
3161 *
3162 * In the successful AIO DIO case, the io_end structure will
3163 * be destroyed and the reference to the inode will be dropped
3164 * after the end_io call back function is called.
3165 *
3166 * In the case there is 0 byte write, or error case, since VFS
3167 * direct IO won't invoke the end_io call back function, we
3168 * need to free the end_io structure here.
3169 */
3170 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3171 ext4_free_io_end(iocb->private);
3172 iocb->private = NULL;
3173 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3174 EXT4_STATE_DIO_UNWRITTEN)) {
3175 int err;
3176 /*
3177 * for non AIO case, since the IO is already
3178 * completed, we could do the conversion right here
3179 */
3180 err = ext4_convert_unwritten_extents(inode,
3181 offset, ret);
3182 if (err < 0)
3183 ret = err;
3184 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3185 }
3115 3186
3116 return ret; 3187retake_lock:
 3188 /* take i_mutex locking again if we do an overwrite dio */
3189 if (overwrite) {
3190 inode_dio_done(inode);
3191 up_read(&EXT4_I(inode)->i_data_sem);
3192 mutex_lock(&inode->i_mutex);
3117 } 3193 }
3118 3194
 3119 /* for the write at end of file case, we fall back to the old way */ 3195 return ret;
3120 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3121} 3196}
3122 3197
3123static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3198static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
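
Note: the long rewrite above collapses two nearly identical
__blockdev_direct_IO() calls into one by choosing the get_block callback and
the DIO flags up front. A reduced sketch of that refactor; the callback
bodies and the DIO_LOCKING value are trivial stand-ins:

#include <stdio.h>

typedef int (*get_block_t)(unsigned long blk);

#define DIO_LOCKING 1

static int get_block_write_nolock(unsigned long blk) { return (int)blk; }
static int get_block_write(unsigned long blk) { return (int)blk; }

static void direct_io(int overwrite, unsigned long blk)
{
	get_block_t get_block_func;
	int dio_flags = 0;

	if (overwrite) {
		get_block_func = get_block_write_nolock;
	} else {
		get_block_func = get_block_write;
		dio_flags = DIO_LOCKING;
	}
	/* one shared call site replaces the two duplicated ones */
	printf("flags=%d block=%d\n", dio_flags, get_block_func(blk));
}

int main(void)
{
	direct_io(1, 5);	/* overwrite: no locking flag */
	direct_io(0, 5);	/* normal write: DIO_LOCKING */
	return 0;
}
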
@@ -3134,6 +3209,10 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3134 if (ext4_should_journal_data(inode)) 3209 if (ext4_should_journal_data(inode))
3135 return 0; 3210 return 0;
3136 3211
3212 /* Let buffer I/O handle the inline data case. */
3213 if (ext4_has_inline_data(inode))
3214 return 0;
3215
3137 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); 3216 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
3138 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3217 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3139 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3218 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -3201,7 +3280,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3201 .write_end = ext4_journalled_write_end, 3280 .write_end = ext4_journalled_write_end,
3202 .set_page_dirty = ext4_journalled_set_page_dirty, 3281 .set_page_dirty = ext4_journalled_set_page_dirty,
3203 .bmap = ext4_bmap, 3282 .bmap = ext4_bmap,
3204 .invalidatepage = ext4_invalidatepage, 3283 .invalidatepage = ext4_journalled_invalidatepage,
3205 .releasepage = ext4_releasepage, 3284 .releasepage = ext4_releasepage,
3206 .direct_IO = ext4_direct_IO, 3285 .direct_IO = ext4_direct_IO,
3207 .is_partially_uptodate = block_is_partially_uptodate, 3286 .is_partially_uptodate = block_is_partially_uptodate,
@@ -3531,6 +3610,14 @@ void ext4_truncate(struct inode *inode)
3531 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 3610 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3532 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 3611 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
3533 3612
3613 if (ext4_has_inline_data(inode)) {
3614 int has_inline = 1;
3615
3616 ext4_inline_data_truncate(inode, &has_inline);
3617 if (has_inline)
3618 return;
3619 }
3620
3534 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3621 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3535 ext4_ext_truncate(inode); 3622 ext4_ext_truncate(inode);
3536 else 3623 else
@@ -3756,6 +3843,19 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
3756 } 3843 }
3757} 3844}
3758 3845
3846static inline void ext4_iget_extra_inode(struct inode *inode,
3847 struct ext4_inode *raw_inode,
3848 struct ext4_inode_info *ei)
3849{
3850 __le32 *magic = (void *)raw_inode +
3851 EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
3852 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
3853 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
3854 ext4_find_inline_data_nolock(inode);
3855 } else
3856 EXT4_I(inode)->i_inline_off = 0;
3857}
3858
3759struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 3859struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3760{ 3860{
3761 struct ext4_iloc iloc; 3861 struct ext4_iloc iloc;
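
Note: ext4_iget_extra_inode() above peeks just past the (possibly extended)
on-disk inode body for the in-inode xattr magic before probing for inline
data. A standalone sketch of that probe, assuming the 128-byte old inode
size and ext4's 0xEA020000 magic; endianness is ignored here, whereas the
kernel compares against cpu_to_le32(EXT4_XATTR_MAGIC):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define GOOD_OLD_INODE_SIZE 128
#define XATTR_MAGIC 0xEA020000u

int main(void)
{
	unsigned char raw_inode[256] = {0};
	uint16_t extra_isize = 32;	/* i_extra_isize of this inode */
	uint32_t magic = XATTR_MAGIC, probe;

	/* in-inode xattrs start right after the extended inode body */
	memcpy(raw_inode + GOOD_OLD_INODE_SIZE + extra_isize,
	       &magic, sizeof(magic));

	memcpy(&probe, raw_inode + GOOD_OLD_INODE_SIZE + extra_isize,
	       sizeof(probe));
	printf("has ibody xattr header: %d\n", probe == XATTR_MAGIC);
	return 0;
}
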
@@ -3826,6 +3926,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3826 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 3926 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
3827 3927
3828 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 3928 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
3929 ei->i_inline_off = 0;
3829 ei->i_dir_start_lookup = 0; 3930 ei->i_dir_start_lookup = 0;
3830 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 3931 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
3831 /* We now have enough fields to check if the inode was active or not. 3932 /* We now have enough fields to check if the inode was active or not.
@@ -3898,11 +3999,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3898 ei->i_extra_isize = sizeof(struct ext4_inode) - 3999 ei->i_extra_isize = sizeof(struct ext4_inode) -
3899 EXT4_GOOD_OLD_INODE_SIZE; 4000 EXT4_GOOD_OLD_INODE_SIZE;
3900 } else { 4001 } else {
3901 __le32 *magic = (void *)raw_inode + 4002 ext4_iget_extra_inode(inode, raw_inode, ei);
3902 EXT4_GOOD_OLD_INODE_SIZE +
3903 ei->i_extra_isize;
3904 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
3905 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
3906 } 4003 }
3907 } 4004 }
3908 4005
@@ -3925,17 +4022,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3925 ei->i_file_acl); 4022 ei->i_file_acl);
3926 ret = -EIO; 4023 ret = -EIO;
3927 goto bad_inode; 4024 goto bad_inode;
3928 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4025 } else if (!ext4_has_inline_data(inode)) {
3929 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4026 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
3930 (S_ISLNK(inode->i_mode) && 4027 if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
3931 !ext4_inode_is_fast_symlink(inode))) 4028 (S_ISLNK(inode->i_mode) &&
3932 /* Validate extent which is part of inode */ 4029 !ext4_inode_is_fast_symlink(inode))))
3933 ret = ext4_ext_check_inode(inode); 4030 /* Validate extent which is part of inode */
3934 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4031 ret = ext4_ext_check_inode(inode);
3935 (S_ISLNK(inode->i_mode) && 4032 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
3936 !ext4_inode_is_fast_symlink(inode))) { 4033 (S_ISLNK(inode->i_mode) &&
3937 /* Validate block references which are part of inode */ 4034 !ext4_inode_is_fast_symlink(inode))) {
3938 ret = ext4_ind_check_inode(inode); 4035 /* Validate block references which are part of inode */
4036 ret = ext4_ind_check_inode(inode);
4037 }
3939 } 4038 }
3940 if (ret) 4039 if (ret)
3941 goto bad_inode; 4040 goto bad_inode;
@@ -4122,9 +4221,10 @@ static int ext4_do_update_inode(handle_t *handle,
4122 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4221 cpu_to_le32(new_encode_dev(inode->i_rdev));
4123 raw_inode->i_block[2] = 0; 4222 raw_inode->i_block[2] = 0;
4124 } 4223 }
4125 } else 4224 } else if (!ext4_has_inline_data(inode)) {
4126 for (block = 0; block < EXT4_N_BLOCKS; block++) 4225 for (block = 0; block < EXT4_N_BLOCKS; block++)
4127 raw_inode->i_block[block] = ei->i_data[block]; 4226 raw_inode->i_block[block] = ei->i_data[block];
4227 }
4128 4228
4129 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4229 raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
4130 if (ei->i_extra_isize) { 4230 if (ei->i_extra_isize) {
@@ -4221,6 +4321,47 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
4221} 4321}
4222 4322
4223/* 4323/*
4324 * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
 4325 * buffers that are attached to a page straddling i_size and are undergoing
4326 * commit. In that case we have to wait for commit to finish and try again.
4327 */
4328static void ext4_wait_for_tail_page_commit(struct inode *inode)
4329{
4330 struct page *page;
4331 unsigned offset;
4332 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
4333 tid_t commit_tid = 0;
4334 int ret;
4335
4336 offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
4337 /*
4338 * All buffers in the last page remain valid? Then there's nothing to
4339 * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
4340 * blocksize case
4341 */
4342 if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
4343 return;
4344 while (1) {
4345 page = find_lock_page(inode->i_mapping,
4346 inode->i_size >> PAGE_CACHE_SHIFT);
4347 if (!page)
4348 return;
4349 ret = __ext4_journalled_invalidatepage(page, offset);
4350 unlock_page(page);
4351 page_cache_release(page);
4352 if (ret != -EBUSY)
4353 return;
4354 commit_tid = 0;
4355 read_lock(&journal->j_state_lock);
4356 if (journal->j_committing_transaction)
4357 commit_tid = journal->j_committing_transaction->t_tid;
4358 read_unlock(&journal->j_state_lock);
4359 if (commit_tid)
4360 jbd2_log_wait_commit(journal, commit_tid);
4361 }
4362}
4363
4364/*
4224 * ext4_setattr() 4365 * ext4_setattr()
4225 * 4366 *
4226 * Called from notify_change. 4367 * Called from notify_change.
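
Note: the early return in ext4_wait_for_tail_page_commit() above relies on a
simple observation: if i_size lands inside the last block of its page, no
buffer of that page lies wholly beyond i_size, so nothing needed
invalidating. A standalone check of that offset test, assuming 4K pages with
4K blocks:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned blkbits = 12;			/* 4K blocksize */
	unsigned long i_size = 7 * 4096 + 100;	/* ends mid last page */
	unsigned long offset = i_size & (page_size - 1);

	/* all buffers stay valid iff i_size is inside the page's last block */
	int nothing_to_do = offset > page_size - (1UL << blkbits);
	printf("offset=%lu nothing_to_do=%d\n", offset, nothing_to_do);
	return 0;
}
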
@@ -4333,16 +4474,28 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4333 } 4474 }
4334 4475
4335 if (attr->ia_valid & ATTR_SIZE) { 4476 if (attr->ia_valid & ATTR_SIZE) {
4336 if (attr->ia_size != i_size_read(inode)) { 4477 if (attr->ia_size != inode->i_size) {
4337 truncate_setsize(inode, attr->ia_size); 4478 loff_t oldsize = inode->i_size;
4338 /* Inode size will be reduced, wait for dio in flight. 4479
4339 * Temporarily disable dioread_nolock to prevent 4480 i_size_write(inode, attr->ia_size);
4340 * livelock. */ 4481 /*
4482 * Blocks are going to be removed from the inode. Wait
4483 * for dio in flight. Temporarily disable
4484 * dioread_nolock to prevent livelock.
4485 */
4341 if (orphan) { 4486 if (orphan) {
4342 ext4_inode_block_unlocked_dio(inode); 4487 if (!ext4_should_journal_data(inode)) {
4343 inode_dio_wait(inode); 4488 ext4_inode_block_unlocked_dio(inode);
4344 ext4_inode_resume_unlocked_dio(inode); 4489 inode_dio_wait(inode);
4490 ext4_inode_resume_unlocked_dio(inode);
4491 } else
4492 ext4_wait_for_tail_page_commit(inode);
4345 } 4493 }
4494 /*
4495 * Truncate pagecache after we've waited for commit
4496 * in data=journal mode to make pages freeable.
4497 */
4498 truncate_pagecache(inode, oldsize, inode->i_size);
4346 } 4499 }
4347 ext4_truncate(inode); 4500 ext4_truncate(inode);
4348 } 4501 }
@@ -4811,8 +4964,9 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4811 * journal_start/journal_stop which can block and take a long time 4964 * journal_start/journal_stop which can block and take a long time
4812 */ 4965 */
4813 if (page_has_buffers(page)) { 4966 if (page_has_buffers(page)) {
4814 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 4967 if (!ext4_walk_page_buffers(NULL, page_buffers(page),
4815 ext4_bh_unmapped)) { 4968 0, len, NULL,
4969 ext4_bh_unmapped)) {
4816 /* Wait so that we don't change page under IO */ 4970 /* Wait so that we don't change page under IO */
4817 wait_on_page_writeback(page); 4971 wait_on_page_writeback(page);
4818 ret = VM_FAULT_LOCKED; 4972 ret = VM_FAULT_LOCKED;
@@ -4833,7 +4987,7 @@ retry_alloc:
4833 } 4987 }
4834 ret = __block_page_mkwrite(vma, vmf, get_block); 4988 ret = __block_page_mkwrite(vma, vmf, get_block);
4835 if (!ret && ext4_should_journal_data(inode)) { 4989 if (!ret && ext4_should_journal_data(inode)) {
4836 if (walk_page_buffers(handle, page_buffers(page), 0, 4990 if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
4837 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { 4991 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
4838 unlock_page(page); 4992 unlock_page(page);
4839 ret = VM_FAULT_SIGBUS; 4993 ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 526e55358606..1bf6fe785c4f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1373,7 +1373,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
1373 ex->fe_start += next; 1373 ex->fe_start += next;
1374 1374
1375 while (needed > ex->fe_len && 1375 while (needed > ex->fe_len &&
1376 (buddy = mb_find_buddy(e4b, order, &max))) { 1376 mb_find_buddy(e4b, order, &max)) {
1377 1377
1378 if (block + 1 >= max) 1378 if (block + 1 >= max)
1379 break; 1379 break;
@@ -2607,9 +2607,17 @@ static void ext4_free_data_callback(struct super_block *sb,
2607 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2607 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2608 entry->efd_count, entry->efd_group, entry); 2608 entry->efd_count, entry->efd_group, entry);
2609 2609
2610 if (test_opt(sb, DISCARD)) 2610 if (test_opt(sb, DISCARD)) {
2611 ext4_issue_discard(sb, entry->efd_group, 2611 err = ext4_issue_discard(sb, entry->efd_group,
2612 entry->efd_start_cluster, entry->efd_count); 2612 entry->efd_start_cluster,
2613 entry->efd_count);
2614 if (err && err != -EOPNOTSUPP)
2615 ext4_msg(sb, KERN_WARNING, "discard request in"
2616 " group:%d block:%d count:%d failed"
2617 " with %d", entry->efd_group,
2618 entry->efd_start_cluster,
2619 entry->efd_count, err);
2620 }
2613 2621
2614 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); 2622 err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2615 /* we expect to find existing buddy because it's pinned */ 2623 /* we expect to find existing buddy because it's pinned */
@@ -4310,8 +4318,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4310repeat: 4318repeat:
4311 /* allocate space in core */ 4319 /* allocate space in core */
4312 *errp = ext4_mb_regular_allocator(ac); 4320 *errp = ext4_mb_regular_allocator(ac);
4313 if (*errp) 4321 if (*errp) {
4322 ext4_discard_allocated_blocks(ac);
4314 goto errout; 4323 goto errout;
4324 }
4315 4325
4316 /* as we've just preallocated more space than 4326 /* as we've just preallocated more space than
 4317 * user requested originally, we store allocated 4327 * user requested originally, we store allocated
@@ -4333,10 +4343,10 @@ repeat:
4333 ac->ac_b_ex.fe_len = 0; 4343 ac->ac_b_ex.fe_len = 0;
4334 ac->ac_status = AC_STATUS_CONTINUE; 4344 ac->ac_status = AC_STATUS_CONTINUE;
4335 goto repeat; 4345 goto repeat;
4336 } else if (*errp) 4346 } else if (*errp) {
4337 errout:
4338 ext4_discard_allocated_blocks(ac); 4347 ext4_discard_allocated_blocks(ac);
4339 else { 4348 goto errout;
4349 } else {
4340 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4350 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4341 ar->len = ac->ac_b_ex.fe_len; 4351 ar->len = ac->ac_b_ex.fe_len;
4342 } 4352 }
@@ -4347,6 +4357,7 @@ repeat:
4347 *errp = -ENOSPC; 4357 *errp = -ENOSPC;
4348 } 4358 }
4349 4359
4360errout:
4350 if (*errp) { 4361 if (*errp) {
4351 ac->ac_b_ex.fe_len = 0; 4362 ac->ac_b_ex.fe_len = 0;
4352 ar->len = 0; 4363 ar->len = 0;
@@ -4656,8 +4667,16 @@ do_more:
 4656 * with the group lock held. generate_buddy looks at 4667 * with the group lock held. generate_buddy looks at
 4657 * them with the group lock held 4668 * them with the group lock held
4658 */ 4669 */
4659 if (test_opt(sb, DISCARD)) 4670 if (test_opt(sb, DISCARD)) {
4660 ext4_issue_discard(sb, block_group, bit, count); 4671 err = ext4_issue_discard(sb, block_group, bit, count);
4672 if (err && err != -EOPNOTSUPP)
4673 ext4_msg(sb, KERN_WARNING, "discard request in"
4674 " group:%d block:%d count:%lu failed"
4675 " with %d", block_group, bit, count,
4676 err);
4677 }
4678
4679
4661 ext4_lock_group(sb, block_group); 4680 ext4_lock_group(sb, block_group);
4662 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 4681 mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4663 mb_free_blocks(inode, &e4b, bit, count_clusters); 4682 mb_free_blocks(inode, &e4b, bit, count_clusters);
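
Note: the error handling added here and in ext4_free_data_callback() treats
-EOPNOTSUPP from ext4_issue_discard() as benign (the device simply cannot
discard) and warns about anything else, while the bitmap update proceeds
regardless. A reduced stand-alone version of that policy; issue_discard() is
a stand-in:

#include <errno.h>
#include <stdio.h>

static int issue_discard(int dev_supports_trim)
{
	return dev_supports_trim ? 0 : -EOPNOTSUPP;
}

static void free_blocks(int discard_opt, int dev_supports_trim)
{
	if (discard_opt) {
		int err = issue_discard(dev_supports_trim);

		if (err && err != -EOPNOTSUPP)	/* unsupported is benign */
			fprintf(stderr, "discard failed with %d\n", err);
	}
	/* ... clear the bitmap bits whether or not the discard worked ... */
}

int main(void)
{
	free_blocks(1, 0);	/* no warning: device lacks discard */
	free_blocks(1, 1);	/* discard succeeded */
	return 0;
}
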
@@ -4851,10 +4870,11 @@ error_return:
4851 * one will allocate those blocks, mark it as used in buddy bitmap. This must 4870 * one will allocate those blocks, mark it as used in buddy bitmap. This must
4852 * be called with under the group lock. 4871 * be called with under the group lock.
4853 */ 4872 */
4854static void ext4_trim_extent(struct super_block *sb, int start, int count, 4873static int ext4_trim_extent(struct super_block *sb, int start, int count,
4855 ext4_group_t group, struct ext4_buddy *e4b) 4874 ext4_group_t group, struct ext4_buddy *e4b)
4856{ 4875{
4857 struct ext4_free_extent ex; 4876 struct ext4_free_extent ex;
4877 int ret = 0;
4858 4878
4859 trace_ext4_trim_extent(sb, group, start, count); 4879 trace_ext4_trim_extent(sb, group, start, count);
4860 4880
@@ -4870,9 +4890,10 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
4870 */ 4890 */
4871 mb_mark_used(e4b, &ex); 4891 mb_mark_used(e4b, &ex);
4872 ext4_unlock_group(sb, group); 4892 ext4_unlock_group(sb, group);
4873 ext4_issue_discard(sb, group, start, count); 4893 ret = ext4_issue_discard(sb, group, start, count);
4874 ext4_lock_group(sb, group); 4894 ext4_lock_group(sb, group);
4875 mb_free_blocks(NULL, e4b, start, ex.fe_len); 4895 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4896 return ret;
4876} 4897}
4877 4898
4878/** 4899/**
@@ -4901,7 +4922,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4901 void *bitmap; 4922 void *bitmap;
4902 ext4_grpblk_t next, count = 0, free_count = 0; 4923 ext4_grpblk_t next, count = 0, free_count = 0;
4903 struct ext4_buddy e4b; 4924 struct ext4_buddy e4b;
4904 int ret; 4925 int ret = 0;
4905 4926
4906 trace_ext4_trim_all_free(sb, group, start, max); 4927 trace_ext4_trim_all_free(sb, group, start, max);
4907 4928
@@ -4928,8 +4949,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4928 next = mb_find_next_bit(bitmap, max + 1, start); 4949 next = mb_find_next_bit(bitmap, max + 1, start);
4929 4950
4930 if ((next - start) >= minblocks) { 4951 if ((next - start) >= minblocks) {
4931 ext4_trim_extent(sb, start, 4952 ret = ext4_trim_extent(sb, start,
4932 next - start, group, &e4b); 4953 next - start, group, &e4b);
4954 if (ret && ret != -EOPNOTSUPP)
4955 break;
4956 ret = 0;
4933 count += next - start; 4957 count += next - start;
4934 } 4958 }
4935 free_count += next - start; 4959 free_count += next - start;
@@ -4950,8 +4974,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4950 break; 4974 break;
4951 } 4975 }
4952 4976
4953 if (!ret) 4977 if (!ret) {
4978 ret = count;
4954 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); 4979 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
4980 }
4955out: 4981out:
4956 ext4_unlock_group(sb, group); 4982 ext4_unlock_group(sb, group);
4957 ext4_mb_unload_buddy(&e4b); 4983 ext4_mb_unload_buddy(&e4b);
@@ -4959,7 +4985,7 @@ out:
4959 ext4_debug("trimmed %d blocks in the group %d\n", 4985 ext4_debug("trimmed %d blocks in the group %d\n",
4960 count, group); 4986 count, group);
4961 4987
4962 return count; 4988 return ret;
4963} 4989}
4964 4990
4965/** 4991/**
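
Note: ext4_trim_all_free() now follows the usual kernel convention of
returning a count on success and a negative errno on failure, which is what
lets the FITRIM path both stop on real errors and total up trimmed blocks. A
minimal sketch of that convention; trim_group() is a stand-in:

#include <errno.h>
#include <stdio.h>

/* returns number of blocks trimmed, or a negative errno */
static long trim_group(int fail)
{
	int ret = fail ? -EIO : 0;	/* stands in for ext4_trim_extent() */

	if (ret && ret != -EOPNOTSUPP)
		return ret;		/* propagate real errors */
	return 128;			/* pretend one extent was trimmed */
}

int main(void)
{
	long r = trim_group(0);

	if (r < 0)
		fprintf(stderr, "trim failed: %ld\n", r);
	else
		printf("trimmed %ld blocks\n", r);
	return 0;
}
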
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f1bb32ec0169..db8226d595fa 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -14,6 +14,7 @@
14 14
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include "ext4_jbd2.h" 16#include "ext4_jbd2.h"
17#include "ext4_extents.h"
17 18
18/* 19/*
19 * The contiguous blocks details which can be 20 * The contiguous blocks details which can be
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 292daeeed455..d9cc5ee42f53 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -18,6 +18,7 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "ext4_jbd2.h" 19#include "ext4_jbd2.h"
20#include "ext4.h" 20#include "ext4.h"
21#include "ext4_extents.h"
21 22
22/** 23/**
23 * get_ext_path - Find an extent path for designated logical block number. 24 * get_ext_path - Find an extent path for designated logical block number.
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6d600a69fc9d..f9ed946a448e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -202,13 +202,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
202 struct inode *inode); 202 struct inode *inode);
203 203
204/* checksumming functions */ 204/* checksumming functions */
205#define EXT4_DIRENT_TAIL(block, blocksize) \ 205void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
206 ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ 206 unsigned int blocksize)
207 ((blocksize) - \
208 sizeof(struct ext4_dir_entry_tail))))
209
210static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
211 unsigned int blocksize)
212{ 207{
213 memset(t, 0, sizeof(struct ext4_dir_entry_tail)); 208 memset(t, 0, sizeof(struct ext4_dir_entry_tail));
214 t->det_rec_len = ext4_rec_len_to_disk( 209 t->det_rec_len = ext4_rec_len_to_disk(
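
Note: the EXT4_DIRENT_TAIL macro is moved out of namei.c rather than
dropped; the tail always occupies the last sizeof(struct ext4_dir_entry_tail)
bytes of a directory block. A standalone illustration of that address
arithmetic, with a simplified 12-byte tail struct standing in for ext4's:

#include <stdint.h>
#include <stdio.h>

struct dir_entry_tail {		/* simplified stand-in, 12 bytes */
	uint32_t reserved_zero1;
	uint16_t rec_len;	/* covers the tail itself */
	uint8_t  reserved_zero2;
	uint8_t  file_type;	/* 0xDE marks a checksum tail in ext4 */
	uint32_t checksum;
};

int main(void)
{
	unsigned blocksize = 1024;
	unsigned char block[1024] = {0};
	struct dir_entry_tail *t =
		(struct dir_entry_tail *)(block + blocksize - sizeof(*t));

	printf("tail at offset %u of %u\n",
	       (unsigned)((unsigned char *)t - block), blocksize);
	return 0;
}
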
@@ -261,6 +256,12 @@ static __le32 ext4_dirent_csum(struct inode *inode,
261 return cpu_to_le32(csum); 256 return cpu_to_le32(csum);
262} 257}
263 258
259static void warn_no_space_for_csum(struct inode *inode)
260{
261 ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for "
262 "checksum. Please run e2fsck -D.", inode->i_ino);
263}
264
264int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) 265int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
265{ 266{
266 struct ext4_dir_entry_tail *t; 267 struct ext4_dir_entry_tail *t;
@@ -271,8 +272,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
271 272
272 t = get_dirent_tail(inode, dirent); 273 t = get_dirent_tail(inode, dirent);
273 if (!t) { 274 if (!t) {
274 EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " 275 warn_no_space_for_csum(inode);
275 "leaf for checksum. Please run e2fsck -D.");
276 return 0; 276 return 0;
277 } 277 }
278 278
@@ -294,8 +294,7 @@ static void ext4_dirent_csum_set(struct inode *inode,
294 294
295 t = get_dirent_tail(inode, dirent); 295 t = get_dirent_tail(inode, dirent);
296 if (!t) { 296 if (!t) {
297 EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " 297 warn_no_space_for_csum(inode);
298 "leaf for checksum. Please run e2fsck -D.");
299 return; 298 return;
300 } 299 }
301 300
@@ -303,9 +302,9 @@ static void ext4_dirent_csum_set(struct inode *inode,
303 (void *)t - (void *)dirent); 302 (void *)t - (void *)dirent);
304} 303}
305 304
306static inline int ext4_handle_dirty_dirent_node(handle_t *handle, 305int ext4_handle_dirty_dirent_node(handle_t *handle,
307 struct inode *inode, 306 struct inode *inode,
308 struct buffer_head *bh) 307 struct buffer_head *bh)
309{ 308{
310 ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); 309 ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
311 return ext4_handle_dirty_metadata(handle, inode, bh); 310 return ext4_handle_dirty_metadata(handle, inode, bh);
@@ -377,8 +376,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
377 count = le16_to_cpu(c->count); 376 count = le16_to_cpu(c->count);
378 if (count_offset + (limit * sizeof(struct dx_entry)) > 377 if (count_offset + (limit * sizeof(struct dx_entry)) >
379 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { 378 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
380 EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " 379 warn_no_space_for_csum(inode);
381 "tree checksum found. Run e2fsck -D.");
382 return 1; 380 return 1;
383 } 381 }
384 t = (struct dx_tail *)(((struct dx_entry *)c) + limit); 382 t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
@@ -408,8 +406,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
408 count = le16_to_cpu(c->count); 406 count = le16_to_cpu(c->count);
409 if (count_offset + (limit * sizeof(struct dx_entry)) > 407 if (count_offset + (limit * sizeof(struct dx_entry)) >
410 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { 408 EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
411 EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " 409 warn_no_space_for_csum(inode);
412 "tree checksum. Run e2fsck -D.");
413 return; 410 return;
414 } 411 }
415 t = (struct dx_tail *)(((struct dx_entry *)c) + limit); 412 t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
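ext4_dx_csum_verify() and ext4_dx_csum_set() now funnel their no-room case through the shared warn_no_space_for_csum(); the room test itself is plain arithmetic. A sketch, assuming 8-byte dx_entry (hash + block) and dx_tail (reserved + checksum) structures:

#include <stdio.h>

#define DX_ENTRY_SIZE 8 /* hash + block, assumed */
#define DX_TAIL_SIZE  8 /* reserved + checksum, assumed */

/* Mirrors the condition in ext4_dx_csum_{verify,set}(): the entry
 * array plus the tail must still fit inside one block. */
static int dx_tail_fits(unsigned int count_offset, unsigned int limit,
			unsigned int blocksize)
{
	return count_offset + limit * DX_ENTRY_SIZE
		<= blocksize - DX_TAIL_SIZE;
}

int main(void)
{
	/* 4K block, 32 bytes of dx_root header before the entries */
	printf("limit 507: %s\n",
	       dx_tail_fits(32, 507, 4096) ? "fits" : "no room");
	printf("limit 508: %s\n",
	       dx_tail_fits(32, 508, 4096) ? "fits" : "no room");
	return 0;
}

With a 4K block and a 32-byte dx_root header, 507 entries leave room for the tail and 508 do not, which is why a checksummed index carries one fewer entry per block.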
@@ -725,7 +722,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
725 ext4_warning(dir->i_sb, "Node failed checksum"); 722 ext4_warning(dir->i_sb, "Node failed checksum");
726 brelse(bh); 723 brelse(bh);
727 *err = ERR_BAD_DX_DIR; 724 *err = ERR_BAD_DX_DIR;
728 goto fail; 725 goto fail2;
729 } 726 }
730 set_buffer_verified(bh); 727 set_buffer_verified(bh);
731 728
@@ -890,6 +887,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
890 EXT4_DIR_REC_LEN(0)); 887 EXT4_DIR_REC_LEN(0));
891 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 888 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
892 if (ext4_check_dir_entry(dir, NULL, de, bh, 889 if (ext4_check_dir_entry(dir, NULL, de, bh,
890 bh->b_data, bh->b_size,
893 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 891 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
894 + ((char *)de - bh->b_data))) { 892 + ((char *)de - bh->b_data))) {
895 /* On error, skip the f_pos to the next block. */ 893 /* On error, skip the f_pos to the next block. */
@@ -1007,6 +1005,15 @@ errout:
1007 return (err); 1005 return (err);
1008} 1006}
1009 1007
1008static inline int search_dirblock(struct buffer_head *bh,
1009 struct inode *dir,
1010 const struct qstr *d_name,
1011 unsigned int offset,
1012 struct ext4_dir_entry_2 **res_dir)
1013{
1014 return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
1015 d_name, offset, res_dir);
1016}
1010 1017
1011/* 1018/*
1012 * Directory block splitting, compacting 1019 * Directory block splitting, compacting
@@ -1081,13 +1088,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
1081 dx_set_count(entries, count + 1); 1088 dx_set_count(entries, count + 1);
1082} 1089}
1083 1090
1084static void ext4_update_dx_flag(struct inode *inode)
1085{
1086 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
1087 EXT4_FEATURE_COMPAT_DIR_INDEX))
1088 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
1089}
1090
1091/* 1091/*
1092 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. 1092 * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
1093 * 1093 *
@@ -1107,11 +1107,13 @@ static inline int ext4_match (int len, const char * const name,
1107/* 1107/*
1108 * Returns 0 if not found, -1 on failure, and 1 on success 1108 * Returns 0 if not found, -1 on failure, and 1 on success
1109 */ 1109 */
1110static inline int search_dirblock(struct buffer_head *bh, 1110int search_dir(struct buffer_head *bh,
1111 struct inode *dir, 1111 char *search_buf,
1112 const struct qstr *d_name, 1112 int buf_size,
1113 unsigned int offset, 1113 struct inode *dir,
1114 struct ext4_dir_entry_2 ** res_dir) 1114 const struct qstr *d_name,
1115 unsigned int offset,
1116 struct ext4_dir_entry_2 **res_dir)
1115{ 1117{
1116 struct ext4_dir_entry_2 * de; 1118 struct ext4_dir_entry_2 * de;
1117 char * dlimit; 1119 char * dlimit;
@@ -1119,8 +1121,8 @@ static inline int search_dirblock(struct buffer_head *bh,
1119 const char *name = d_name->name; 1121 const char *name = d_name->name;
1120 int namelen = d_name->len; 1122 int namelen = d_name->len;
1121 1123
1122 de = (struct ext4_dir_entry_2 *) bh->b_data; 1124 de = (struct ext4_dir_entry_2 *)search_buf;
1123 dlimit = bh->b_data + dir->i_sb->s_blocksize; 1125 dlimit = search_buf + buf_size;
1124 while ((char *) de < dlimit) { 1126 while ((char *) de < dlimit) {
1125 /* this code is executed quadratically often */ 1127 /* this code is executed quadratically often */
1126 /* do minimal checking `by hand' */ 1128 /* do minimal checking `by hand' */
@@ -1128,7 +1130,8 @@ static inline int search_dirblock(struct buffer_head *bh,
1128 if ((char *) de + namelen <= dlimit && 1130 if ((char *) de + namelen <= dlimit &&
1129 ext4_match (namelen, name, de)) { 1131 ext4_match (namelen, name, de)) {
1130 /* found a match - just to be sure, do a full check */ 1132 /* found a match - just to be sure, do a full check */
1131 if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) 1133 if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
1134 bh->b_size, offset))
1132 return -1; 1135 return -1;
1133 *res_dir = de; 1136 *res_dir = de;
1134 return 1; 1137 return 1;
@@ -1144,6 +1147,21 @@ static inline int search_dirblock(struct buffer_head *bh,
1144 return 0; 1147 return 0;
1145} 1148}
1146 1149
1150static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
1151 struct ext4_dir_entry *de)
1152{
1153 struct super_block *sb = dir->i_sb;
1154
1155 if (!is_dx(dir))
1156 return 0;
1157 if (block == 0)
1158 return 1;
1159 if (de->inode == 0 &&
1160 ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) ==
1161 sb->s_blocksize)
1162 return 1;
1163 return 0;
1164}
1147 1165
1148/* 1166/*
1149 * ext4_find_entry() 1167 * ext4_find_entry()
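Turning the static search_dirblock() into the exported search_dir() over an explicit (search_buf, buf_size) pair is what lets the same scanner cover inline-data areas as well as directory blocks; the old name survives above as a thin wrapper. A standalone model of the walk (the real function additionally revalidates each candidate via ext4_check_dir_entry() and decodes record lengths with ext4_rec_len_from_disk()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dirent2 {                /* models ext4_dir_entry_2 */
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

/* Walk an arbitrary dirent buffer the way search_dir() now does: the
 * caller hands in any region (a disk block or an inline-data area)
 * together with its size. */
static struct dirent2 *scan_buf(char *buf, int buf_size,
				const char *name, int namelen)
{
	struct dirent2 *de = (struct dirent2 *)buf;
	char *dlimit = buf + buf_size;

	while ((char *)de < dlimit) {
		if (de->rec_len < 8)    /* corrupt entry: stop walking */
			return NULL;
		if (de->inode && de->name_len == namelen &&
		    de->name + namelen <= dlimit &&
		    memcmp(de->name, name, namelen) == 0)
			return de;
		de = (struct dirent2 *)((char *)de + de->rec_len);
	}
	return NULL;
}

int main(void)
{
	char buf[64] = { 0 };
	struct dirent2 *de = (struct dirent2 *)buf;

	de->inode = 12;
	de->name_len = 3;
	memcpy(de->name, "foo", 3);
	de->rec_len = sizeof(buf);  /* last entry spans the buffer */

	printf("found: %s\n",
	       scan_buf(buf, sizeof(buf), "foo", 3) ? "yes" : "no");
	return 0;
}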
@@ -1158,7 +1176,8 @@ static inline int search_dirblock(struct buffer_head *bh,
1158 */ 1176 */
1159static struct buffer_head * ext4_find_entry (struct inode *dir, 1177static struct buffer_head * ext4_find_entry (struct inode *dir,
1160 const struct qstr *d_name, 1178 const struct qstr *d_name,
1161 struct ext4_dir_entry_2 ** res_dir) 1179 struct ext4_dir_entry_2 **res_dir,
1180 int *inlined)
1162{ 1181{
1163 struct super_block *sb; 1182 struct super_block *sb;
1164 struct buffer_head *bh_use[NAMEI_RA_SIZE]; 1183 struct buffer_head *bh_use[NAMEI_RA_SIZE];
@@ -1179,6 +1198,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
1179 namelen = d_name->len; 1198 namelen = d_name->len;
1180 if (namelen > EXT4_NAME_LEN) 1199 if (namelen > EXT4_NAME_LEN)
1181 return NULL; 1200 return NULL;
1201
1202 if (ext4_has_inline_data(dir)) {
1203 int has_inline_data = 1;
1204 ret = ext4_find_inline_entry(dir, d_name, res_dir,
1205 &has_inline_data);
1206 if (has_inline_data) {
1207 if (inlined)
1208 *inlined = 1;
1209 return ret;
1210 }
1211 }
1212
1182 if ((namelen <= 2) && (name[0] == '.') && 1213 if ((namelen <= 2) && (name[0] == '.') &&
1183 (name[1] == '.' || name[1] == '\0')) { 1214 (name[1] == '.' || name[1] == '\0')) {
1184 /* 1215 /*
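ext4_find_entry() now probes inline data before touching any directory block, and the has_inline_data out-parameter makes an inline miss final rather than a reason to fall through. A shape sketch of that contract, with all names invented:

#include <stdio.h>

struct result { int found; };

/* The inline probe clears *has_inline_data only if the directory
 * turned out not to be inline after all; otherwise its answer, hit
 * or miss, is final. */
static struct result *find_inline(int really_inline, int *has_inline_data)
{
	if (!really_inline) {
		*has_inline_data = 0;   /* fall back to block search */
		return NULL;
	}
	return NULL;                    /* inline dir, name not present */
}

static struct result *find_in_blocks(void)
{
	static struct result r = { 1 };
	return &r;
}

static struct result *find_entry(int dir_has_inline_flag, int *inlined)
{
	if (dir_has_inline_flag) {
		int has_inline_data = 1;
		struct result *ret = find_inline(1, &has_inline_data);

		if (has_inline_data) {
			if (inlined)
				*inlined = 1; /* hit lives in inline data */
			return ret;
		}
	}
	return find_in_blocks();
}

int main(void)
{
	int inlined = 0;

	find_entry(1, &inlined);
	printf("searched inline data: %d\n", inlined);
	return 0;
}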
@@ -1244,6 +1275,8 @@ restart:
1244 goto next; 1275 goto next;
1245 } 1276 }
1246 if (!buffer_verified(bh) && 1277 if (!buffer_verified(bh) &&
1278 !is_dx_internal_node(dir, block,
1279 (struct ext4_dir_entry *)bh->b_data) &&
1247 !ext4_dirent_csum_verify(dir, 1280 !ext4_dirent_csum_verify(dir,
1248 (struct ext4_dir_entry *)bh->b_data)) { 1281 (struct ext4_dir_entry *)bh->b_data)) {
1249 EXT4_ERROR_INODE(dir, "checksumming directory " 1282 EXT4_ERROR_INODE(dir, "checksumming directory "
@@ -1361,7 +1394,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
1361 if (dentry->d_name.len > EXT4_NAME_LEN) 1394 if (dentry->d_name.len > EXT4_NAME_LEN)
1362 return ERR_PTR(-ENAMETOOLONG); 1395 return ERR_PTR(-ENAMETOOLONG);
1363 1396
1364 bh = ext4_find_entry(dir, &dentry->d_name, &de); 1397 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
1365 inode = NULL; 1398 inode = NULL;
1366 if (bh) { 1399 if (bh) {
1367 __u32 ino = le32_to_cpu(de->inode); 1400 __u32 ino = le32_to_cpu(de->inode);
@@ -1395,7 +1428,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1395 struct ext4_dir_entry_2 * de; 1428 struct ext4_dir_entry_2 * de;
1396 struct buffer_head *bh; 1429 struct buffer_head *bh;
1397 1430
1398 bh = ext4_find_entry(child->d_inode, &dotdot, &de); 1431 bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
1399 if (!bh) 1432 if (!bh)
1400 return ERR_PTR(-ENOENT); 1433 return ERR_PTR(-ENOENT);
1401 ino = le32_to_cpu(de->inode); 1434 ino = le32_to_cpu(de->inode);
@@ -1593,6 +1626,63 @@ errout:
1593 return NULL; 1626 return NULL;
1594} 1627}
1595 1628
1629int ext4_find_dest_de(struct inode *dir, struct inode *inode,
1630 struct buffer_head *bh,
1631 void *buf, int buf_size,
1632 const char *name, int namelen,
1633 struct ext4_dir_entry_2 **dest_de)
1634{
1635 struct ext4_dir_entry_2 *de;
1636 unsigned short reclen = EXT4_DIR_REC_LEN(namelen);
1637 int nlen, rlen;
1638 unsigned int offset = 0;
1639 char *top;
1640
1641 de = (struct ext4_dir_entry_2 *)buf;
1642 top = buf + buf_size - reclen;
1643 while ((char *) de <= top) {
1644 if (ext4_check_dir_entry(dir, NULL, de, bh,
1645 buf, buf_size, offset))
1646 return -EIO;
1647 if (ext4_match(namelen, name, de))
1648 return -EEXIST;
1649 nlen = EXT4_DIR_REC_LEN(de->name_len);
1650 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
1651 if ((de->inode ? rlen - nlen : rlen) >= reclen)
1652 break;
1653 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1654 offset += rlen;
1655 }
1656 if ((char *) de > top)
1657 return -ENOSPC;
1658
1659 *dest_de = de;
1660 return 0;
1661}
1662
1663void ext4_insert_dentry(struct inode *inode,
1664 struct ext4_dir_entry_2 *de,
1665 int buf_size,
1666 const char *name, int namelen)
1667{
1668
1669 int nlen, rlen;
1670
1671 nlen = EXT4_DIR_REC_LEN(de->name_len);
1672 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
1673 if (de->inode) {
1674 struct ext4_dir_entry_2 *de1 =
1675 (struct ext4_dir_entry_2 *)((char *)de + nlen);
1676 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
1677 de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
1678 de = de1;
1679 }
1680 de->file_type = EXT4_FT_UNKNOWN;
1681 de->inode = cpu_to_le32(inode->i_ino);
1682 ext4_set_de_type(inode->i_sb, de, inode->i_mode);
1683 de->name_len = namelen;
1684 memcpy(de->name, name, namelen);
1685}
1596/* 1686/*
1597 * Add a new entry into a directory (leaf) block. If de is non-NULL, 1687 * Add a new entry into a directory (leaf) block. If de is non-NULL,
1598 * it points to a directory entry which is guaranteed to be large 1688 * it points to a directory entry which is guaranteed to be large
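ext4_find_dest_de() and ext4_insert_dentry() are the old body of add_dirent_to_buf() split into a buffer-agnostic finder and writer, again so the inline-data path can reuse them. The writer's one subtlety is the record split: a live entry keeps only its own aligned length and the slack becomes the new entry's record. A runnable model (the real code encodes rec_len via ext4_rec_len_to_disk() and derives the file type from the inode mode):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dirent2 {
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

/* EXT4_DIR_REC_LEN: 8 bytes of header plus the name, rounded to 4 */
#define REC_LEN(name_len) (((name_len) + 8 + 3) & ~3)

/* Model of ext4_insert_dentry(): if the slot ext4_find_dest_de()
 * picked is live, shrink it to its own aligned length and turn the
 * slack into the record for the new name. */
static void insert_dentry(struct dirent2 *de, uint32_t ino,
			  const char *name, int namelen)
{
	int nlen = REC_LEN(de->name_len);
	int rlen = de->rec_len;

	if (de->inode) {                /* live entry: split it */
		struct dirent2 *de1 = (struct dirent2 *)((char *)de + nlen);

		de1->rec_len = rlen - nlen;
		de->rec_len = nlen;
		de = de1;
	}
	de->inode = ino;
	de->name_len = namelen;
	memcpy(de->name, name, namelen);
}

int main(void)
{
	char block[64] = { 0 };
	struct dirent2 *de = (struct dirent2 *)block;
	struct dirent2 *de1;

	de->inode = 2;
	de->name_len = 1;
	de->name[0] = '.';
	de->rec_len = sizeof(block);    /* all the slack lives here */

	insert_dentry(de, 12, "foo", 3);
	de1 = (struct dirent2 *)(block + de->rec_len);
	printf("'.' rec_len %u; '%.*s' rec_len %u\n", (unsigned)de->rec_len,
	       de1->name_len, de1->name, (unsigned)de1->rec_len);
	return 0;
}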
@@ -1608,12 +1698,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1608 struct inode *dir = dentry->d_parent->d_inode; 1698 struct inode *dir = dentry->d_parent->d_inode;
1609 const char *name = dentry->d_name.name; 1699 const char *name = dentry->d_name.name;
1610 int namelen = dentry->d_name.len; 1700 int namelen = dentry->d_name.len;
1611 unsigned int offset = 0;
1612 unsigned int blocksize = dir->i_sb->s_blocksize; 1701 unsigned int blocksize = dir->i_sb->s_blocksize;
1613 unsigned short reclen; 1702 unsigned short reclen;
1614 int nlen, rlen, err;
1615 char *top;
1616 int csum_size = 0; 1703 int csum_size = 0;
1704 int err;
1617 1705
1618 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1706 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1619 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 1707 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -1621,22 +1709,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1621 1709
1622 reclen = EXT4_DIR_REC_LEN(namelen); 1710 reclen = EXT4_DIR_REC_LEN(namelen);
1623 if (!de) { 1711 if (!de) {
1624 de = (struct ext4_dir_entry_2 *)bh->b_data; 1712 err = ext4_find_dest_de(dir, inode,
1625 top = bh->b_data + (blocksize - csum_size) - reclen; 1713 bh, bh->b_data, blocksize - csum_size,
1626 while ((char *) de <= top) { 1714 name, namelen, &de);
1627 if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) 1715 if (err)
1628 return -EIO; 1716 return err;
1629 if (ext4_match(namelen, name, de))
1630 return -EEXIST;
1631 nlen = EXT4_DIR_REC_LEN(de->name_len);
1632 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1633 if ((de->inode? rlen - nlen: rlen) >= reclen)
1634 break;
1635 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1636 offset += rlen;
1637 }
1638 if ((char *) de > top)
1639 return -ENOSPC;
1640 } 1717 }
1641 BUFFER_TRACE(bh, "get_write_access"); 1718 BUFFER_TRACE(bh, "get_write_access");
1642 err = ext4_journal_get_write_access(handle, bh); 1719 err = ext4_journal_get_write_access(handle, bh);
@@ -1646,19 +1723,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1646 } 1723 }
1647 1724
1648 /* By now the buffer is marked for journaling */ 1725 /* By now the buffer is marked for journaling */
1649 nlen = EXT4_DIR_REC_LEN(de->name_len); 1726 ext4_insert_dentry(inode, de, blocksize, name, namelen);
1650 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); 1727
1651 if (de->inode) {
1652 struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1653 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
1654 de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
1655 de = de1;
1656 }
1657 de->file_type = EXT4_FT_UNKNOWN;
1658 de->inode = cpu_to_le32(inode->i_ino);
1659 ext4_set_de_type(dir->i_sb, de, inode->i_mode);
1660 de->name_len = namelen;
1661 memcpy(de->name, name, namelen);
1662 /* 1728 /*
1663 * XXX shouldn't update any times until successful 1729 * XXX shouldn't update any times until successful
1664 * completion of syscall, but too many callers depend 1730 * completion of syscall, but too many callers depend
@@ -1831,6 +1897,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1831 blocksize = sb->s_blocksize; 1897 blocksize = sb->s_blocksize;
1832 if (!dentry->d_name.len) 1898 if (!dentry->d_name.len)
1833 return -EINVAL; 1899 return -EINVAL;
1900
1901 if (ext4_has_inline_data(dir)) {
1902 retval = ext4_try_add_inline_entry(handle, dentry, inode);
1903 if (retval < 0)
1904 return retval;
1905 if (retval == 1) {
1906 retval = 0;
1907 return retval;
1908 }
1909 }
1910
1834 if (is_dx(dir)) { 1911 if (is_dx(dir)) {
1835 retval = ext4_dx_add_entry(handle, dentry, inode); 1912 retval = ext4_dx_add_entry(handle, dentry, inode);
1836 if (!retval || (retval != ERR_BAD_DX_DIR)) 1913 if (!retval || (retval != ERR_BAD_DX_DIR))
@@ -2036,36 +2113,29 @@ cleanup:
2036} 2113}
2037 2114
2038/* 2115/*
2039 * ext4_delete_entry deletes a directory entry by merging it with the 2116 * ext4_generic_delete_entry deletes a directory entry by merging it
2040 * previous entry 2117 * with the previous entry
2041 */ 2118 */
2042static int ext4_delete_entry(handle_t *handle, 2119int ext4_generic_delete_entry(handle_t *handle,
2043 struct inode *dir, 2120 struct inode *dir,
2044 struct ext4_dir_entry_2 *de_del, 2121 struct ext4_dir_entry_2 *de_del,
2045 struct buffer_head *bh) 2122 struct buffer_head *bh,
2123 void *entry_buf,
2124 int buf_size,
2125 int csum_size)
2046{ 2126{
2047 struct ext4_dir_entry_2 *de, *pde; 2127 struct ext4_dir_entry_2 *de, *pde;
2048 unsigned int blocksize = dir->i_sb->s_blocksize; 2128 unsigned int blocksize = dir->i_sb->s_blocksize;
2049 int csum_size = 0; 2129 int i;
2050 int i, err;
2051
2052 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2053 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2054 csum_size = sizeof(struct ext4_dir_entry_tail);
2055 2130
2056 i = 0; 2131 i = 0;
2057 pde = NULL; 2132 pde = NULL;
2058 de = (struct ext4_dir_entry_2 *) bh->b_data; 2133 de = (struct ext4_dir_entry_2 *)entry_buf;
2059 while (i < bh->b_size - csum_size) { 2134 while (i < buf_size - csum_size) {
2060 if (ext4_check_dir_entry(dir, NULL, de, bh, i)) 2135 if (ext4_check_dir_entry(dir, NULL, de, bh,
2136 bh->b_data, bh->b_size, i))
2061 return -EIO; 2137 return -EIO;
2062 if (de == de_del) { 2138 if (de == de_del) {
2063 BUFFER_TRACE(bh, "get_write_access");
2064 err = ext4_journal_get_write_access(handle, bh);
2065 if (unlikely(err)) {
2066 ext4_std_error(dir->i_sb, err);
2067 return err;
2068 }
2069 if (pde) 2139 if (pde)
2070 pde->rec_len = ext4_rec_len_to_disk( 2140 pde->rec_len = ext4_rec_len_to_disk(
2071 ext4_rec_len_from_disk(pde->rec_len, 2141 ext4_rec_len_from_disk(pde->rec_len,
@@ -2076,12 +2146,6 @@ static int ext4_delete_entry(handle_t *handle,
2076 else 2146 else
2077 de->inode = 0; 2147 de->inode = 0;
2078 dir->i_version++; 2148 dir->i_version++;
2079 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
2080 err = ext4_handle_dirty_dirent_node(handle, dir, bh);
2081 if (unlikely(err)) {
2082 ext4_std_error(dir->i_sb, err);
2083 return err;
2084 }
2085 return 0; 2149 return 0;
2086 } 2150 }
2087 i += ext4_rec_len_from_disk(de->rec_len, blocksize); 2151 i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -2091,6 +2155,48 @@ static int ext4_delete_entry(handle_t *handle,
2091 return -ENOENT; 2155 return -ENOENT;
2092} 2156}
2093 2157
2158static int ext4_delete_entry(handle_t *handle,
2159 struct inode *dir,
2160 struct ext4_dir_entry_2 *de_del,
2161 struct buffer_head *bh)
2162{
2163 int err, csum_size = 0;
2164
2165 if (ext4_has_inline_data(dir)) {
2166 int has_inline_data = 1;
2167 err = ext4_delete_inline_entry(handle, dir, de_del, bh,
2168 &has_inline_data);
2169 if (has_inline_data)
2170 return err;
2171 }
2172
2173 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2174 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2175 csum_size = sizeof(struct ext4_dir_entry_tail);
2176
2177 BUFFER_TRACE(bh, "get_write_access");
2178 err = ext4_journal_get_write_access(handle, bh);
2179 if (unlikely(err))
2180 goto out;
2181
2182 err = ext4_generic_delete_entry(handle, dir, de_del,
2183 bh, bh->b_data,
2184 dir->i_sb->s_blocksize, csum_size);
2185 if (err)
2186 goto out;
2187
2188 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
2189 err = ext4_handle_dirty_dirent_node(handle, dir, bh);
2190 if (unlikely(err))
2191 goto out;
2192
2193 return 0;
2194out:
2195 if (err != -ENOENT)
2196 ext4_std_error(dir->i_sb, err);
2197 return err;
2198}
2199
2094/* 2200/*
2095 * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, 2201 * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
2096 * since this indicates that nlinks count was previously 1. 2202 * since this indicates that nlinks count was previously 1.
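ext4_generic_delete_entry() keeps only the list surgery, while the new ext4_delete_entry() wrapper owns the journaling bracket (write access before the edit, dirty after) and the inline-data fast path; that is why the get_write_access and dirty-metadata calls vanish from the loop above. The surgery itself never compacts the block, as in this model:

#include <stdint.h>
#include <stdio.h>

struct dirent2 {
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

/* Model of ext4_generic_delete_entry(): deletion never compacts a
 * directory; the victim is absorbed into the previous record's
 * length, or merely zeroed if it is the first entry. */
static int delete_entry(char *buf, int buf_size, struct dirent2 *victim)
{
	struct dirent2 *de = (struct dirent2 *)buf, *pde = NULL;
	int i = 0;

	while (i < buf_size) {
		if (de->rec_len < 8)
			return -1;              /* corrupt entry */
		if (de == victim) {
			if (pde)                /* merge into previous */
				pde->rec_len += de->rec_len;
			else                    /* first entry: unlink only */
				de->inode = 0;
			return 0;
		}
		i += de->rec_len;
		pde = de;
		de = (struct dirent2 *)((char *)de + de->rec_len);
	}
	return -1;                              /* not found */
}

int main(void)
{
	char buf[48] = { 0 };
	struct dirent2 *a = (struct dirent2 *)buf;
	struct dirent2 *b = (struct dirent2 *)(buf + 12);

	a->inode = 2;  a->name_len = 1; a->rec_len = 12;
	b->inode = 12; b->name_len = 3; b->rec_len = 36;

	delete_entry(buf, sizeof(buf), b);
	printf("first rec_len now %u\n", (unsigned)a->rec_len); /* 48 */
	return 0;
}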
@@ -2211,21 +2317,94 @@ retry:
2211 return err; 2317 return err;
2212} 2318}
2213 2319
2214static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 2320struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
2321 struct ext4_dir_entry_2 *de,
2322 int blocksize, int csum_size,
2323 unsigned int parent_ino, int dotdot_real_len)
2324{
2325 de->inode = cpu_to_le32(inode->i_ino);
2326 de->name_len = 1;
2327 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
2328 blocksize);
2329 strcpy(de->name, ".");
2330 ext4_set_de_type(inode->i_sb, de, S_IFDIR);
2331
2332 de = ext4_next_entry(de, blocksize);
2333 de->inode = cpu_to_le32(parent_ino);
2334 de->name_len = 2;
2335 if (!dotdot_real_len)
2336 de->rec_len = ext4_rec_len_to_disk(blocksize -
2337 (csum_size + EXT4_DIR_REC_LEN(1)),
2338 blocksize);
2339 else
2340 de->rec_len = ext4_rec_len_to_disk(
2341 EXT4_DIR_REC_LEN(de->name_len), blocksize);
2342 strcpy(de->name, "..");
2343 ext4_set_de_type(inode->i_sb, de, S_IFDIR);
2344
2345 return ext4_next_entry(de, blocksize);
2346}
2347
2348static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
2349 struct inode *inode)
2215{ 2350{
2216 handle_t *handle;
2217 struct inode *inode;
2218 struct buffer_head *dir_block = NULL; 2351 struct buffer_head *dir_block = NULL;
2219 struct ext4_dir_entry_2 *de; 2352 struct ext4_dir_entry_2 *de;
2220 struct ext4_dir_entry_tail *t; 2353 struct ext4_dir_entry_tail *t;
2221 unsigned int blocksize = dir->i_sb->s_blocksize; 2354 unsigned int blocksize = dir->i_sb->s_blocksize;
2222 int csum_size = 0; 2355 int csum_size = 0;
2223 int err, retries = 0; 2356 int err;
2224 2357
2225 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 2358 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
2226 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) 2359 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2227 csum_size = sizeof(struct ext4_dir_entry_tail); 2360 csum_size = sizeof(struct ext4_dir_entry_tail);
2228 2361
2362 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
2363 err = ext4_try_create_inline_dir(handle, dir, inode);
2364 if (err < 0 && err != -ENOSPC)
2365 goto out;
2366 if (!err)
2367 goto out;
2368 }
2369
2370 inode->i_size = EXT4_I(inode)->i_disksize = blocksize;
2371 if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
2372 if (!err) {
2373 err = -EIO;
2374 ext4_error(inode->i_sb,
2375 "Directory hole detected on inode %lu\n",
2376 inode->i_ino);
2377 }
2378 goto out;
2379 }
2380 BUFFER_TRACE(dir_block, "get_write_access");
2381 err = ext4_journal_get_write_access(handle, dir_block);
2382 if (err)
2383 goto out;
2384 de = (struct ext4_dir_entry_2 *)dir_block->b_data;
2385 ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
2386 set_nlink(inode, 2);
2387 if (csum_size) {
2388 t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
2389 initialize_dirent_tail(t, blocksize);
2390 }
2391
2392 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
2393 err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
2394 if (err)
2395 goto out;
2396 set_buffer_verified(dir_block);
2397out:
2398 brelse(dir_block);
2399 return err;
2400}
2401
2402static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2403{
2404 handle_t *handle;
2405 struct inode *inode;
2406 int err, retries = 0;
2407
2229 if (EXT4_DIR_LINK_MAX(dir)) 2408 if (EXT4_DIR_LINK_MAX(dir))
2230 return -EMLINK; 2409 return -EMLINK;
2231 2410
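ext4_init_dot_dotdot() is factored out of ext4_mkdir() so inline directories can build the same two entries; dotdot_real_len selects whether ".." takes only its minimal record or, as in a block directory, stretches to the end of the block minus the checksum tail. The resulting layout is just arithmetic:

#include <stdio.h>

#define REC_LEN(name_len) (((name_len) + 8 + 3) & ~3)

int main(void)
{
	unsigned int blocksize = 4096;
	unsigned int csum_size = 12;  /* dirent tail, when metadata_csum */

	/* "." always takes its minimal record... */
	unsigned int dot_len = REC_LEN(1);
	/* ...and with dotdot_real_len == 0, ".." stretches to the end
	 * of the block minus the checksum tail, so every later
	 * insertion splits the ".." record. */
	unsigned int dotdot_len = blocksize - (csum_size + REC_LEN(1));

	printf(".  rec_len = %u\n", dot_len);
	printf(".. rec_len = %u (ends at byte %u of %u)\n",
	       dotdot_len, dot_len + dotdot_len, blocksize);
	return 0;
}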
@@ -2249,47 +2428,9 @@ retry:
2249 2428
2250 inode->i_op = &ext4_dir_inode_operations; 2429 inode->i_op = &ext4_dir_inode_operations;
2251 inode->i_fop = &ext4_dir_operations; 2430 inode->i_fop = &ext4_dir_operations;
2252 inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 2431 err = ext4_init_new_dir(handle, dir, inode);
2253 if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
2254 if (!err) {
2255 err = -EIO;
2256 ext4_error(inode->i_sb,
2257 "Directory hole detected on inode %lu\n",
2258 inode->i_ino);
2259 }
2260 goto out_clear_inode;
2261 }
2262 BUFFER_TRACE(dir_block, "get_write_access");
2263 err = ext4_journal_get_write_access(handle, dir_block);
2264 if (err)
2265 goto out_clear_inode;
2266 de = (struct ext4_dir_entry_2 *) dir_block->b_data;
2267 de->inode = cpu_to_le32(inode->i_ino);
2268 de->name_len = 1;
2269 de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
2270 blocksize);
2271 strcpy(de->name, ".");
2272 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
2273 de = ext4_next_entry(de, blocksize);
2274 de->inode = cpu_to_le32(dir->i_ino);
2275 de->rec_len = ext4_rec_len_to_disk(blocksize -
2276 (csum_size + EXT4_DIR_REC_LEN(1)),
2277 blocksize);
2278 de->name_len = 2;
2279 strcpy(de->name, "..");
2280 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
2281 set_nlink(inode, 2);
2282
2283 if (csum_size) {
2284 t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
2285 initialize_dirent_tail(t, blocksize);
2286 }
2287
2288 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
2289 err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
2290 if (err) 2432 if (err)
2291 goto out_clear_inode; 2433 goto out_clear_inode;
2292 set_buffer_verified(dir_block);
2293 err = ext4_mark_inode_dirty(handle, inode); 2434 err = ext4_mark_inode_dirty(handle, inode);
2294 if (!err) 2435 if (!err)
2295 err = ext4_add_entry(handle, dentry, inode); 2436 err = ext4_add_entry(handle, dentry, inode);
@@ -2309,7 +2450,6 @@ out_clear_inode:
2309 unlock_new_inode(inode); 2450 unlock_new_inode(inode);
2310 d_instantiate(dentry, inode); 2451 d_instantiate(dentry, inode);
2311out_stop: 2452out_stop:
2312 brelse(dir_block);
2313 ext4_journal_stop(handle); 2453 ext4_journal_stop(handle);
2314 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2454 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2315 goto retry; 2455 goto retry;
@@ -2327,6 +2467,14 @@ static int empty_dir(struct inode *inode)
2327 struct super_block *sb; 2467 struct super_block *sb;
2328 int err = 0; 2468 int err = 0;
2329 2469
2470 if (ext4_has_inline_data(inode)) {
2471 int has_inline_data = 1;
2472
2473 err = empty_inline_dir(inode, &has_inline_data);
2474 if (has_inline_data)
2475 return err;
2476 }
2477
2330 sb = inode->i_sb; 2478 sb = inode->i_sb;
2331 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 2479 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
2332 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 2480 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
@@ -2393,7 +2541,8 @@ static int empty_dir(struct inode *inode)
2393 set_buffer_verified(bh); 2541 set_buffer_verified(bh);
2394 de = (struct ext4_dir_entry_2 *) bh->b_data; 2542 de = (struct ext4_dir_entry_2 *) bh->b_data;
2395 } 2543 }
2396 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { 2544 if (ext4_check_dir_entry(inode, NULL, de, bh,
2545 bh->b_data, bh->b_size, offset)) {
2397 de = (struct ext4_dir_entry_2 *)(bh->b_data + 2546 de = (struct ext4_dir_entry_2 *)(bh->b_data +
2398 sb->s_blocksize); 2547 sb->s_blocksize);
2399 offset = (offset | (sb->s_blocksize - 1)) + 1; 2548 offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2498,7 +2647,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2498 struct ext4_iloc iloc; 2647 struct ext4_iloc iloc;
2499 int err = 0; 2648 int err = 0;
2500 2649
2501 if (!EXT4_SB(inode->i_sb)->s_journal) 2650 if ((!EXT4_SB(inode->i_sb)->s_journal) &&
2651 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
2502 return 0; 2652 return 0;
2503 2653
2504 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); 2654 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
@@ -2579,7 +2729,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2579 return PTR_ERR(handle); 2729 return PTR_ERR(handle);
2580 2730
2581 retval = -ENOENT; 2731 retval = -ENOENT;
2582 bh = ext4_find_entry(dir, &dentry->d_name, &de); 2732 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2583 if (!bh) 2733 if (!bh)
2584 goto end_rmdir; 2734 goto end_rmdir;
2585 2735
@@ -2644,7 +2794,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2644 ext4_handle_sync(handle); 2794 ext4_handle_sync(handle);
2645 2795
2646 retval = -ENOENT; 2796 retval = -ENOENT;
2647 bh = ext4_find_entry(dir, &dentry->d_name, &de); 2797 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2648 if (!bh) 2798 if (!bh)
2649 goto end_unlink; 2799 goto end_unlink;
2650 2800
@@ -2826,8 +2976,39 @@ retry:
2826 return err; 2976 return err;
2827} 2977}
2828 2978
2829#define PARENT_INO(buffer, size) \ 2979
2830 (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) 2980/*
2981 * Try to find the buffer head that contains the parent ('..') entry.
2982 * That is the inode block if the directory is inlined, or the first
2983 * block if it is a normal directory.
2984 */
2985static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
2986 struct inode *inode,
2987 int *retval,
2988 struct ext4_dir_entry_2 **parent_de,
2989 int *inlined)
2990{
2991 struct buffer_head *bh;
2992
2993 if (!ext4_has_inline_data(inode)) {
2994 if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) {
2995 if (!*retval) {
2996 *retval = -EIO;
2997 ext4_error(inode->i_sb,
2998 "Directory hole detected on inode %lu\n",
2999 inode->i_ino);
3000 }
3001 return NULL;
3002 }
3003 *parent_de = ext4_next_entry(
3004 (struct ext4_dir_entry_2 *)bh->b_data,
3005 inode->i_sb->s_blocksize);
3006 return bh;
3007 }
3008
3009 *inlined = 1;
3010 return ext4_get_first_inline_block(inode, parent_de, retval);
3011}
2831 3012
2832/* 3013/*
2833 * Anybody can rename anything with this: the permission checks are left to the 3014 * Anybody can rename anything with this: the permission checks are left to the
@@ -2841,6 +3022,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2841 struct buffer_head *old_bh, *new_bh, *dir_bh; 3022 struct buffer_head *old_bh, *new_bh, *dir_bh;
2842 struct ext4_dir_entry_2 *old_de, *new_de; 3023 struct ext4_dir_entry_2 *old_de, *new_de;
2843 int retval, force_da_alloc = 0; 3024 int retval, force_da_alloc = 0;
3025 int inlined = 0, new_inlined = 0;
3026 struct ext4_dir_entry_2 *parent_de;
2844 3027
2845 dquot_initialize(old_dir); 3028 dquot_initialize(old_dir);
2846 dquot_initialize(new_dir); 3029 dquot_initialize(new_dir);
@@ -2860,7 +3043,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2860 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 3043 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2861 ext4_handle_sync(handle); 3044 ext4_handle_sync(handle);
2862 3045
2863 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); 3046 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
2864 /* 3047 /*
2865 * Check for inode number is _not_ due to possible IO errors. 3048 * Check for inode number is _not_ due to possible IO errors.
2866 * We might rmdir the source, keep it as pwd of some process 3049 * We might rmdir the source, keep it as pwd of some process
@@ -2873,7 +3056,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2873 goto end_rename; 3056 goto end_rename;
2874 3057
2875 new_inode = new_dentry->d_inode; 3058 new_inode = new_dentry->d_inode;
2876 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); 3059 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name,
3060 &new_de, &new_inlined);
2877 if (new_bh) { 3061 if (new_bh) {
2878 if (!new_inode) { 3062 if (!new_inode) {
2879 brelse(new_bh); 3063 brelse(new_bh);
@@ -2887,22 +3071,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2887 goto end_rename; 3071 goto end_rename;
2888 } 3072 }
2889 retval = -EIO; 3073 retval = -EIO;
2890 if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) { 3074 dir_bh = ext4_get_first_dir_block(handle, old_inode,
2891 if (!retval) { 3075 &retval, &parent_de,
2892 retval = -EIO; 3076 &inlined);
2893 ext4_error(old_inode->i_sb, 3077 if (!dir_bh)
2894 "Directory hole detected on inode %lu\n",
2895 old_inode->i_ino);
2896 }
2897 goto end_rename; 3078 goto end_rename;
2898 } 3079 if (!inlined && !buffer_verified(dir_bh) &&
2899 if (!buffer_verified(dir_bh) &&
2900 !ext4_dirent_csum_verify(old_inode, 3080 !ext4_dirent_csum_verify(old_inode,
2901 (struct ext4_dir_entry *)dir_bh->b_data)) 3081 (struct ext4_dir_entry *)dir_bh->b_data))
2902 goto end_rename; 3082 goto end_rename;
2903 set_buffer_verified(dir_bh); 3083 set_buffer_verified(dir_bh);
2904 if (le32_to_cpu(PARENT_INO(dir_bh->b_data, 3084 if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
2905 old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
2906 goto end_rename; 3085 goto end_rename;
2907 retval = -EMLINK; 3086 retval = -EMLINK;
2908 if (!new_inode && new_dir != old_dir && 3087 if (!new_inode && new_dir != old_dir &&
@@ -2931,10 +3110,13 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2931 ext4_current_time(new_dir); 3110 ext4_current_time(new_dir);
2932 ext4_mark_inode_dirty(handle, new_dir); 3111 ext4_mark_inode_dirty(handle, new_dir);
2933 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); 3112 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2934 retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh); 3113 if (!new_inlined) {
2935 if (unlikely(retval)) { 3114 retval = ext4_handle_dirty_dirent_node(handle,
2936 ext4_std_error(new_dir->i_sb, retval); 3115 new_dir, new_bh);
2937 goto end_rename; 3116 if (unlikely(retval)) {
3117 ext4_std_error(new_dir->i_sb, retval);
3118 goto end_rename;
3119 }
2938 } 3120 }
2939 brelse(new_bh); 3121 brelse(new_bh);
2940 new_bh = NULL; 3122 new_bh = NULL;
@@ -2962,7 +3144,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2962 struct buffer_head *old_bh2; 3144 struct buffer_head *old_bh2;
2963 struct ext4_dir_entry_2 *old_de2; 3145 struct ext4_dir_entry_2 *old_de2;
2964 3146
2965 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); 3147 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
3148 &old_de2, NULL);
2966 if (old_bh2) { 3149 if (old_bh2) {
2967 retval = ext4_delete_entry(handle, old_dir, 3150 retval = ext4_delete_entry(handle, old_dir,
2968 old_de2, old_bh2); 3151 old_de2, old_bh2);
@@ -2982,17 +3165,19 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2982 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); 3165 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
2983 ext4_update_dx_flag(old_dir); 3166 ext4_update_dx_flag(old_dir);
2984 if (dir_bh) { 3167 if (dir_bh) {
2985 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 3168 parent_de->inode = cpu_to_le32(new_dir->i_ino);
2986 cpu_to_le32(new_dir->i_ino);
2987 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 3169 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2988 if (is_dx(old_inode)) { 3170 if (!inlined) {
2989 retval = ext4_handle_dirty_dx_node(handle, 3171 if (is_dx(old_inode)) {
2990 old_inode, 3172 retval = ext4_handle_dirty_dx_node(handle,
2991 dir_bh); 3173 old_inode,
3174 dir_bh);
3175 } else {
3176 retval = ext4_handle_dirty_dirent_node(handle,
3177 old_inode, dir_bh);
3178 }
2992 } else { 3179 } else {
2993 retval = ext4_handle_dirty_dirent_node(handle, 3180 retval = ext4_mark_inode_dirty(handle, old_inode);
2994 old_inode,
2995 dir_bh);
2996 } 3181 }
2997 if (retval) { 3182 if (retval) {
2998 ext4_std_error(old_dir->i_sb, retval); 3183 ext4_std_error(old_dir->i_sb, retval);
@@ -3043,23 +3228,19 @@ const struct inode_operations ext4_dir_inode_operations = {
3043 .mknod = ext4_mknod, 3228 .mknod = ext4_mknod,
3044 .rename = ext4_rename, 3229 .rename = ext4_rename,
3045 .setattr = ext4_setattr, 3230 .setattr = ext4_setattr,
3046#ifdef CONFIG_EXT4_FS_XATTR
3047 .setxattr = generic_setxattr, 3231 .setxattr = generic_setxattr,
3048 .getxattr = generic_getxattr, 3232 .getxattr = generic_getxattr,
3049 .listxattr = ext4_listxattr, 3233 .listxattr = ext4_listxattr,
3050 .removexattr = generic_removexattr, 3234 .removexattr = generic_removexattr,
3051#endif
3052 .get_acl = ext4_get_acl, 3235 .get_acl = ext4_get_acl,
3053 .fiemap = ext4_fiemap, 3236 .fiemap = ext4_fiemap,
3054}; 3237};
3055 3238
3056const struct inode_operations ext4_special_inode_operations = { 3239const struct inode_operations ext4_special_inode_operations = {
3057 .setattr = ext4_setattr, 3240 .setattr = ext4_setattr,
3058#ifdef CONFIG_EXT4_FS_XATTR
3059 .setxattr = generic_setxattr, 3241 .setxattr = generic_setxattr,
3060 .getxattr = generic_getxattr, 3242 .getxattr = generic_getxattr,
3061 .listxattr = ext4_listxattr, 3243 .listxattr = ext4_listxattr,
3062 .removexattr = generic_removexattr, 3244 .removexattr = generic_removexattr,
3063#endif
3064 .get_acl = ext4_get_acl, 3245 .get_acl = ext4_get_acl,
3065}; 3246};
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 68e896e12a67..0016fbca2a40 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -27,7 +27,6 @@
27#include "ext4_jbd2.h" 27#include "ext4_jbd2.h"
28#include "xattr.h" 28#include "xattr.h"
29#include "acl.h" 29#include "acl.h"
30#include "ext4_extents.h"
31 30
32static struct kmem_cache *io_page_cachep, *io_end_cachep; 31static struct kmem_cache *io_page_cachep, *io_end_cachep;
33 32
@@ -111,7 +110,7 @@ static int ext4_end_io(ext4_io_end_t *io)
111 inode_dio_done(inode); 110 inode_dio_done(inode);
112 /* Wake up anyone waiting on unwritten extent conversion */ 111 /* Wake up anyone waiting on unwritten extent conversion */
113 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 112 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
114 wake_up_all(ext4_ioend_wq(io->inode)); 113 wake_up_all(ext4_ioend_wq(inode));
115 return ret; 114 return ret;
116} 115}
117 116
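The one-line page-io.c change wakes the ioend queue through the inode local captured at the top of ext4_end_io() instead of re-dereferencing io. Whether this is plain cleanup or a guard against io already being freed at this point is not visible from the hunk, but the pattern it matches is caching what you still need before the containing object can go away. A generic sketch, all names invented:

#include <stdio.h>
#include <stdlib.h>

struct waitq { int unused; };
struct io_end { struct waitq *wq; };

static void wake_up_all(struct waitq *wq)
{
	(void)wq;
	puts("waiters woken");
}

/* Pull every field you will still need out of the object before the
 * step that may free it, then touch only the locals afterwards. */
static void end_io(struct io_end *io)
{
	struct waitq *wq = io->wq;      /* cache first */

	free(io);                       /* io is gone from here on */
	wake_up_all(wq);                /* safe: cached pointer only */
}

int main(void)
{
	static struct waitq q;
	struct io_end *io = malloc(sizeof(*io));

	if (!io)
		return 1;
	io->wq = &q;
	end_io(io);
	return 0;
}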
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 47bf06a2765d..d99387b89edd 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -783,7 +783,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
783 783
784 err = ext4_journal_get_write_access(handle, gdb_bh); 784 err = ext4_journal_get_write_access(handle, gdb_bh);
785 if (unlikely(err)) 785 if (unlikely(err))
786 goto exit_sbh; 786 goto exit_dind;
787 787
788 err = ext4_journal_get_write_access(handle, dind); 788 err = ext4_journal_get_write_access(handle, dind);
789 if (unlikely(err)) 789 if (unlikely(err))
@@ -792,7 +792,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
792 /* ext4_reserve_inode_write() gets a reference on the iloc */ 792 /* ext4_reserve_inode_write() gets a reference on the iloc */
793 err = ext4_reserve_inode_write(handle, inode, &iloc); 793 err = ext4_reserve_inode_write(handle, inode, &iloc);
794 if (unlikely(err)) 794 if (unlikely(err))
795 goto exit_dindj; 795 goto exit_dind;
796 796
797 n_group_desc = ext4_kvmalloc((gdb_num + 1) * 797 n_group_desc = ext4_kvmalloc((gdb_num + 1) *
798 sizeof(struct buffer_head *), 798 sizeof(struct buffer_head *),
@@ -846,12 +846,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
846 846
847exit_inode: 847exit_inode:
848 ext4_kvfree(n_group_desc); 848 ext4_kvfree(n_group_desc);
849 /* ext4_handle_release_buffer(handle, iloc.bh); */
850 brelse(iloc.bh); 849 brelse(iloc.bh);
851exit_dindj:
852 /* ext4_handle_release_buffer(handle, dind); */
853exit_sbh:
854 /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
855exit_dind: 850exit_dind:
856 brelse(dind); 851 brelse(dind);
857exit_bh: 852exit_bh:
@@ -969,14 +964,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
969 } 964 }
970 965
971 for (i = 0; i < reserved_gdb; i++) { 966 for (i = 0; i < reserved_gdb; i++) {
972 if ((err = ext4_journal_get_write_access(handle, primary[i]))) { 967 if ((err = ext4_journal_get_write_access(handle, primary[i])))
973 /*
974 int j;
975 for (j = 0; j < i; j++)
976 ext4_handle_release_buffer(handle, primary[j]);
977 */
978 goto exit_bh; 968 goto exit_bh;
979 }
980 } 969 }
981 970
982 if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) 971 if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 80928f716850..3d4fb81bacd5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,7 +45,7 @@
45#include <linux/freezer.h> 45#include <linux/freezer.h>
46 46
47#include "ext4.h" 47#include "ext4.h"
48#include "ext4_extents.h" 48#include "ext4_extents.h" /* Needed for trace point definitions */
49#include "ext4_jbd2.h" 49#include "ext4_jbd2.h"
50#include "xattr.h" 50#include "xattr.h"
51#include "acl.h" 51#include "acl.h"
@@ -939,10 +939,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
939 return NULL; 939 return NULL;
940 940
941 ei->vfs_inode.i_version = 1; 941 ei->vfs_inode.i_version = 1;
942 ei->vfs_inode.i_data.writeback_index = 0;
943 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 942 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
944 INIT_LIST_HEAD(&ei->i_prealloc_list); 943 INIT_LIST_HEAD(&ei->i_prealloc_list);
945 spin_lock_init(&ei->i_prealloc_lock); 944 spin_lock_init(&ei->i_prealloc_lock);
945 ext4_es_init_tree(&ei->i_es_tree);
946 rwlock_init(&ei->i_es_lock);
946 ei->i_reserved_data_blocks = 0; 947 ei->i_reserved_data_blocks = 0;
947 ei->i_reserved_meta_blocks = 0; 948 ei->i_reserved_meta_blocks = 0;
948 ei->i_allocated_meta_blocks = 0; 949 ei->i_allocated_meta_blocks = 0;
@@ -996,9 +997,7 @@ static void init_once(void *foo)
996 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; 997 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
997 998
998 INIT_LIST_HEAD(&ei->i_orphan); 999 INIT_LIST_HEAD(&ei->i_orphan);
999#ifdef CONFIG_EXT4_FS_XATTR
1000 init_rwsem(&ei->xattr_sem); 1000 init_rwsem(&ei->xattr_sem);
1001#endif
1002 init_rwsem(&ei->i_data_sem); 1001 init_rwsem(&ei->i_data_sem);
1003 inode_init_once(&ei->vfs_inode); 1002 inode_init_once(&ei->vfs_inode);
1004} 1003}
@@ -1031,6 +1030,7 @@ void ext4_clear_inode(struct inode *inode)
1031 clear_inode(inode); 1030 clear_inode(inode);
1032 dquot_drop(inode); 1031 dquot_drop(inode);
1033 ext4_discard_preallocations(inode); 1032 ext4_discard_preallocations(inode);
1033 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
1034 if (EXT4_I(inode)->jinode) { 1034 if (EXT4_I(inode)->jinode) {
1035 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), 1035 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
1036 EXT4_I(inode)->jinode); 1036 EXT4_I(inode)->jinode);
@@ -1447,13 +1447,8 @@ static const struct mount_opts {
1447 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, 1447 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
1448 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, 1448 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
1449 {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, 1449 {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
1450#ifdef CONFIG_EXT4_FS_XATTR
1451 {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, 1450 {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
1452 {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, 1451 {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
1453#else
1454 {Opt_user_xattr, 0, MOPT_NOSUPPORT},
1455 {Opt_nouser_xattr, 0, MOPT_NOSUPPORT},
1456#endif
1457#ifdef CONFIG_EXT4_FS_POSIX_ACL 1452#ifdef CONFIG_EXT4_FS_POSIX_ACL
1458 {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, 1453 {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
1459 {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, 1454 {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
@@ -1650,9 +1645,7 @@ static int parse_options(char *options, struct super_block *sb,
1650 unsigned int *journal_ioprio, 1645 unsigned int *journal_ioprio,
1651 int is_remount) 1646 int is_remount)
1652{ 1647{
1653#ifdef CONFIG_QUOTA
1654 struct ext4_sb_info *sbi = EXT4_SB(sb); 1648 struct ext4_sb_info *sbi = EXT4_SB(sb);
1655#endif
1656 char *p; 1649 char *p;
1657 substring_t args[MAX_OPT_ARGS]; 1650 substring_t args[MAX_OPT_ARGS];
1658 int token; 1651 int token;
@@ -1701,6 +1694,16 @@ static int parse_options(char *options, struct super_block *sb,
1701 } 1694 }
1702 } 1695 }
1703#endif 1696#endif
1697 if (test_opt(sb, DIOREAD_NOLOCK)) {
1698 int blocksize =
1699 BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
1700
1701 if (blocksize < PAGE_CACHE_SIZE) {
1702 ext4_msg(sb, KERN_ERR, "can't mount with "
1703 "dioread_nolock if block size != PAGE_SIZE");
1704 return 0;
1705 }
1706 }
1704 return 1; 1707 return 1;
1705} 1708}
1706 1709
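The dioread_nolock blocksize check moves from ext4_fill_super() into parse_options() so a remount is covered too; the block size comes straight from the superblock's log field. Note the test is "smaller than a page" while the message, carried over from the original, says "!= PAGE_SIZE". The arithmetic:

#include <stdio.h>

#define BLOCK_SIZE 1024 /* the minimum ext4 block size */

int main(void)
{
	unsigned int page_size = 4096;  /* PAGE_CACHE_SIZE, typically */
	unsigned int log_block_size;

	/* s_log_block_size is a shift over 1024: 0 => 1K ... 2 => 4K */
	for (log_block_size = 0; log_block_size <= 2; log_block_size++) {
		unsigned int blocksize = BLOCK_SIZE << log_block_size;

		printf("blocksize %4u: dioread_nolock %s\n", blocksize,
		       blocksize < page_size ? "rejected" : "allowed");
	}
	return 0;
}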
@@ -2217,7 +2220,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2217 __func__, inode->i_ino, inode->i_size); 2220 __func__, inode->i_ino, inode->i_size);
2218 jbd_debug(2, "truncating inode %lu to %lld bytes\n", 2221 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
2219 inode->i_ino, inode->i_size); 2222 inode->i_ino, inode->i_size);
2223 mutex_lock(&inode->i_mutex);
2220 ext4_truncate(inode); 2224 ext4_truncate(inode);
2225 mutex_unlock(&inode->i_mutex);
2221 nr_truncates++; 2226 nr_truncates++;
2222 } else { 2227 } else {
2223 ext4_msg(sb, KERN_DEBUG, 2228 ext4_msg(sb, KERN_DEBUG,
@@ -3202,7 +3207,6 @@ int ext4_calculate_overhead(struct super_block *sb)
3202 ext4_fsblk_t overhead = 0; 3207 ext4_fsblk_t overhead = 0;
3203 char *buf = (char *) get_zeroed_page(GFP_KERNEL); 3208 char *buf = (char *) get_zeroed_page(GFP_KERNEL);
3204 3209
3205 memset(buf, 0, PAGE_SIZE);
3206 if (!buf) 3210 if (!buf)
3207 return -ENOMEM; 3211 return -ENOMEM;
3208 3212
@@ -3229,6 +3233,10 @@ int ext4_calculate_overhead(struct super_block *sb)
3229 memset(buf, 0, PAGE_SIZE); 3233 memset(buf, 0, PAGE_SIZE);
3230 cond_resched(); 3234 cond_resched();
3231 } 3235 }
3236 /* Add the journal blocks as well */
3237 if (sbi->s_journal)
3238 overhead += EXT4_B2C(sbi, sbi->s_journal->j_maxlen);
3239
3232 sbi->s_overhead = overhead; 3240 sbi->s_overhead = overhead;
3233 smp_wmb(); 3241 smp_wmb();
3234 free_page((unsigned long) buf); 3242 free_page((unsigned long) buf);
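ext4_calculate_overhead() now counts the journal as overhead too, converting j_maxlen from blocks to allocation clusters. A sketch that assumes EXT4_B2C() is a right shift by the cluster bits (zero on a non-bigalloc filesystem):

#include <stdint.h>
#include <stdio.h>

/* Assumed reading of EXT4_B2C(): blocks -> clusters is a right shift
 * by s_cluster_bits; a non-bigalloc filesystem shifts by zero. */
static uint64_t b2c(uint64_t blocks, unsigned int cluster_bits)
{
	return blocks >> cluster_bits;
}

int main(void)
{
	uint64_t overhead = 12345;      /* metadata clusters so far */
	uint64_t j_maxlen = 32768;      /* a 128MB journal in 4K blocks */
	unsigned int cluster_bits = 4;  /* 16 blocks per cluster */

	overhead += b2c(j_maxlen, cluster_bits);  /* the new hunk */
	printf("overhead now %llu clusters\n", (unsigned long long)overhead);
	return 0;
}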
@@ -3256,7 +3264,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3256 unsigned int i; 3264 unsigned int i;
3257 int needs_recovery, has_huge_files, has_bigalloc; 3265 int needs_recovery, has_huge_files, has_bigalloc;
3258 __u64 blocks_count; 3266 __u64 blocks_count;
3259 int err; 3267 int err = 0;
3260 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3268 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3261 ext4_group_t first_not_zeroed; 3269 ext4_group_t first_not_zeroed;
3262 3270
@@ -3272,9 +3280,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3272 } 3280 }
3273 sb->s_fs_info = sbi; 3281 sb->s_fs_info = sbi;
3274 sbi->s_sb = sb; 3282 sbi->s_sb = sb;
3275 sbi->s_mount_opt = 0;
3276 sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID);
3277 sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID);
3278 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 3283 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
3279 sbi->s_sb_block = sb_block; 3284 sbi->s_sb_block = sb_block;
3280 if (sb->s_bdev->bd_part) 3285 if (sb->s_bdev->bd_part)
@@ -3285,6 +3290,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3285 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 3290 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
3286 *cp = '!'; 3291 *cp = '!';
3287 3292
3293 /* -EINVAL is default */
3288 ret = -EINVAL; 3294 ret = -EINVAL;
3289 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 3295 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
3290 if (!blocksize) { 3296 if (!blocksize) {
@@ -3369,9 +3375,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3369 if (def_mount_opts & EXT4_DEFM_UID16) 3375 if (def_mount_opts & EXT4_DEFM_UID16)
3370 set_opt(sb, NO_UID32); 3376 set_opt(sb, NO_UID32);
3371 /* xattr user namespace & acls are now defaulted on */ 3377 /* xattr user namespace & acls are now defaulted on */
3372#ifdef CONFIG_EXT4_FS_XATTR
3373 set_opt(sb, XATTR_USER); 3378 set_opt(sb, XATTR_USER);
3374#endif
3375#ifdef CONFIG_EXT4_FS_POSIX_ACL 3379#ifdef CONFIG_EXT4_FS_POSIX_ACL
3376 set_opt(sb, POSIX_ACL); 3380 set_opt(sb, POSIX_ACL);
3377#endif 3381#endif
@@ -3446,15 +3450,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3446 clear_opt(sb, DELALLOC); 3450 clear_opt(sb, DELALLOC);
3447 } 3451 }
3448 3452
3449 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3450 if (test_opt(sb, DIOREAD_NOLOCK)) {
3451 if (blocksize < PAGE_SIZE) {
3452 ext4_msg(sb, KERN_ERR, "can't mount with "
3453 "dioread_nolock if block size != PAGE_SIZE");
3454 goto failed_mount;
3455 }
3456 }
3457
3458 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3453 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3459 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); 3454 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
3460 3455
@@ -3496,6 +3491,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3496 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) 3491 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
3497 goto failed_mount; 3492 goto failed_mount;
3498 3493
3494 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3499 if (blocksize < EXT4_MIN_BLOCK_SIZE || 3495 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
3500 blocksize > EXT4_MAX_BLOCK_SIZE) { 3496 blocksize > EXT4_MAX_BLOCK_SIZE) {
3501 ext4_msg(sb, KERN_ERR, 3497 ext4_msg(sb, KERN_ERR,
@@ -3662,7 +3658,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3662 " too large to mount safely on this system"); 3658 " too large to mount safely on this system");
3663 if (sizeof(sector_t) < 8) 3659 if (sizeof(sector_t) < 8)
3664 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 3660 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
3665 ret = err;
3666 goto failed_mount; 3661 goto failed_mount;
3667 } 3662 }
3668 3663
@@ -3770,7 +3765,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3770 } 3765 }
3771 if (err) { 3766 if (err) {
3772 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3767 ext4_msg(sb, KERN_ERR, "insufficient memory");
3773 ret = err;
3774 goto failed_mount3; 3768 goto failed_mount3;
3775 } 3769 }
3776 3770
@@ -3801,7 +3795,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3801 3795
3802 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3796 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3803 mutex_init(&sbi->s_orphan_lock); 3797 mutex_init(&sbi->s_orphan_lock);
3804 sbi->s_resize_flags = 0;
3805 3798
3806 sb->s_root = NULL; 3799 sb->s_root = NULL;
3807 3800
@@ -3897,8 +3890,8 @@ no_journal:
3897 if (es->s_overhead_clusters) 3890 if (es->s_overhead_clusters)
3898 sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); 3891 sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
3899 else { 3892 else {
3900 ret = ext4_calculate_overhead(sb); 3893 err = ext4_calculate_overhead(sb);
3901 if (ret) 3894 if (err)
3902 goto failed_mount_wq; 3895 goto failed_mount_wq;
3903 } 3896 }
3904 3897
@@ -3910,6 +3903,7 @@ no_journal:
3910 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); 3903 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3911 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3904 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3912 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3905 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3906 ret = -ENOMEM;
3913 goto failed_mount_wq; 3907 goto failed_mount_wq;
3914 } 3908 }
3915 3909
@@ -4012,12 +4006,20 @@ no_journal:
4012 /* Enable quota usage during mount. */ 4006 /* Enable quota usage during mount. */
4013 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && 4007 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
4014 !(sb->s_flags & MS_RDONLY)) { 4008 !(sb->s_flags & MS_RDONLY)) {
4015 ret = ext4_enable_quotas(sb); 4009 err = ext4_enable_quotas(sb);
4016 if (ret) 4010 if (err)
4017 goto failed_mount7; 4011 goto failed_mount7;
4018 } 4012 }
4019#endif /* CONFIG_QUOTA */ 4013#endif /* CONFIG_QUOTA */
4020 4014
4015 if (test_opt(sb, DISCARD)) {
4016 struct request_queue *q = bdev_get_queue(sb->s_bdev);
4017 if (!blk_queue_discard(q))
4018 ext4_msg(sb, KERN_WARNING,
4019 "mounting with \"discard\" option, but "
4020 "the device does not support discard");
4021 }
4022
4021 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " 4023 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
4022 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, 4024 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
4023 *sbi->s_es->s_mount_opts ? "; " : "", orig_data); 4025 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
@@ -4084,7 +4086,7 @@ out_fail:
4084 kfree(sbi); 4086 kfree(sbi);
4085out_free_orig: 4087out_free_orig:
4086 kfree(orig_data); 4088 kfree(orig_data);
4087 return ret; 4089 return err ? err : ret;
4088} 4090}
4089 4091
4090/* 4092/*
@@ -4729,7 +4731,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4729 } 4731 }
4730 4732
4731 ext4_setup_system_zone(sb); 4733 ext4_setup_system_zone(sb);
4732 if (sbi->s_journal == NULL) 4734 if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
4733 ext4_commit_super(sb, 1); 4735 ext4_commit_super(sb, 1);
4734 4736
4735#ifdef CONFIG_QUOTA 4737#ifdef CONFIG_QUOTA
@@ -4790,7 +4792,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4790 4792
4791 buf->f_type = EXT4_SUPER_MAGIC; 4793 buf->f_type = EXT4_SUPER_MAGIC;
4792 buf->f_bsize = sb->s_blocksize; 4794 buf->f_bsize = sb->s_blocksize;
4793 buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, sbi->s_overhead); 4795 buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
4794 bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - 4796 bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
4795 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); 4797 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
4796 /* prevent underflow in case only a little free space is available */ 4798 /* prevent underflow in case only a little free space is available */
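statfs now subtracts the freshly read overhead local rather than the cached sbi->s_overhead, converting clusters back to blocks for f_blocks. A sketch assuming EXT4_C2B() is the left-shift inverse of EXT4_B2C():

#include <stdint.h>
#include <stdio.h>

/* Assumed inverse of EXT4_B2C(): clusters -> blocks */
static uint64_t c2b(uint64_t clusters, unsigned int cluster_bits)
{
	return clusters << cluster_bits;
}

int main(void)
{
	uint64_t blocks_count = 1ULL << 20; /* made-up filesystem size */
	uint64_t overhead = 2048;           /* clusters, as in the hunk */
	unsigned int cluster_bits = 0;      /* no bigalloc */

	uint64_t f_blocks = blocks_count - c2b(overhead, cluster_bits);

	printf("f_blocks = %llu\n", (unsigned long long)f_blocks);
	return 0;
}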
@@ -5282,6 +5284,7 @@ static int __init ext4_init_fs(void)
5282 ext4_li_info = NULL; 5284 ext4_li_info = NULL;
5283 mutex_init(&ext4_li_mtx); 5285 mutex_init(&ext4_li_mtx);
5284 5286
5287 /* Build-time check for flags consistency */
5285 ext4_check_flag_values(); 5288 ext4_check_flag_values();
5286 5289
5287 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { 5290 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
@@ -5289,9 +5292,14 @@ static int __init ext4_init_fs(void)
5289 init_waitqueue_head(&ext4__ioend_wq[i]); 5292 init_waitqueue_head(&ext4__ioend_wq[i]);
5290 } 5293 }
5291 5294
5292 err = ext4_init_pageio(); 5295 err = ext4_init_es();
5293 if (err) 5296 if (err)
5294 return err; 5297 return err;
5298
5299 err = ext4_init_pageio();
5300 if (err)
5301 goto out7;
5302
5295 err = ext4_init_system_zone(); 5303 err = ext4_init_system_zone();
5296 if (err) 5304 if (err)
5297 goto out6; 5305 goto out6;
@@ -5341,6 +5349,9 @@ out5:
5341 ext4_exit_system_zone(); 5349 ext4_exit_system_zone();
5342out6: 5350out6:
5343 ext4_exit_pageio(); 5351 ext4_exit_pageio();
5352out7:
5353 ext4_exit_es();
5354
5344 return err; 5355 return err;
5345} 5356}
5346 5357
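The init path gains ext4_init_es() ahead of ext4_init_pageio(), and the new out7 label slots into the unwind chain so every failure tears down exactly what came up, in reverse order. The shape, with illustrative names:

#include <stdio.h>

static int init_es(void)      { puts("es up");      return 0; }
static void exit_es(void)     { puts("es down");    }
static int init_pageio(void)  { puts("pageio up");  return 0; }
static void exit_pageio(void) { puts("pageio down"); }
static int init_zone(void)    { return -1; } /* simulate a failure */

int main(void)
{
	int err;

	err = init_es();        /* first up... */
	if (err)
		return 1;
	err = init_pageio();
	if (err)
		goto out7;
	err = init_zone();
	if (err)
		goto out6;
	return 0;
out6:
	exit_pageio();
out7:
	exit_es();              /* ...last down */
	return 1;
}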
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index ed9354aff279..ff3711932018 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -35,22 +35,18 @@ const struct inode_operations ext4_symlink_inode_operations = {
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext4_setattr, 37 .setattr = ext4_setattr,
38#ifdef CONFIG_EXT4_FS_XATTR
39 .setxattr = generic_setxattr, 38 .setxattr = generic_setxattr,
40 .getxattr = generic_getxattr, 39 .getxattr = generic_getxattr,
41 .listxattr = ext4_listxattr, 40 .listxattr = ext4_listxattr,
42 .removexattr = generic_removexattr, 41 .removexattr = generic_removexattr,
43#endif
44}; 42};
45 43
46const struct inode_operations ext4_fast_symlink_inode_operations = { 44const struct inode_operations ext4_fast_symlink_inode_operations = {
47 .readlink = generic_readlink, 45 .readlink = generic_readlink,
48 .follow_link = ext4_follow_link, 46 .follow_link = ext4_follow_link,
49 .setattr = ext4_setattr, 47 .setattr = ext4_setattr,
50#ifdef CONFIG_EXT4_FS_XATTR
51 .setxattr = generic_setxattr, 48 .setxattr = generic_setxattr,
52 .getxattr = generic_getxattr, 49 .getxattr = generic_getxattr,
53 .listxattr = ext4_listxattr, 50 .listxattr = ext4_listxattr,
54 .removexattr = generic_removexattr, 51 .removexattr = generic_removexattr,
55#endif
56}; 52};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2cdb98d62980..3a91ebc2b66f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -61,11 +61,6 @@
61#include "xattr.h" 61#include "xattr.h"
62#include "acl.h" 62#include "acl.h"
63 63
64#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
65#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
66#define BFIRST(bh) ENTRY(BHDR(bh)+1)
67#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
68
69#ifdef EXT4_XATTR_DEBUG 64#ifdef EXT4_XATTR_DEBUG
70# define ea_idebug(inode, f...) do { \ 65# define ea_idebug(inode, f...) do { \
71 printk(KERN_DEBUG "inode %s:%lu: ", \ 66 printk(KERN_DEBUG "inode %s:%lu: ", \
@@ -312,7 +307,7 @@ cleanup:
312 return error; 307 return error;
313} 308}
314 309
315static int 310int
316ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, 311ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
317 void *buffer, size_t buffer_size) 312 void *buffer, size_t buffer_size)
318{ 313{
@@ -581,21 +576,6 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
581 return (*min_offs - ((void *)last - base) - sizeof(__u32)); 576 return (*min_offs - ((void *)last - base) - sizeof(__u32));
582} 577}
583 578
584struct ext4_xattr_info {
585 int name_index;
586 const char *name;
587 const void *value;
588 size_t value_len;
589};
590
591struct ext4_xattr_search {
592 struct ext4_xattr_entry *first;
593 void *base;
594 void *end;
595 struct ext4_xattr_entry *here;
596 int not_found;
597};
598
599static int 579static int
600ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) 580ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
601{ 581{
@@ -648,9 +628,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
648 size. Just replace. */ 628 size. Just replace. */
649 s->here->e_value_size = 629 s->here->e_value_size =
650 cpu_to_le32(i->value_len); 630 cpu_to_le32(i->value_len);
651 memset(val + size - EXT4_XATTR_PAD, 0, 631 if (i->value == EXT4_ZERO_XATTR_VALUE) {
652 EXT4_XATTR_PAD); /* Clear pad bytes. */ 632 memset(val, 0, size);
653 memcpy(val, i->value, i->value_len); 633 } else {
634 /* Clear pad bytes first. */
635 memset(val + size - EXT4_XATTR_PAD, 0,
636 EXT4_XATTR_PAD);
637 memcpy(val, i->value, i->value_len);
638 }
654 return 0; 639 return 0;
655 } 640 }
656 641
@@ -689,9 +674,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
689 size_t size = EXT4_XATTR_SIZE(i->value_len); 674 size_t size = EXT4_XATTR_SIZE(i->value_len);
690 void *val = s->base + min_offs - size; 675 void *val = s->base + min_offs - size;
691 s->here->e_value_offs = cpu_to_le16(min_offs - size); 676 s->here->e_value_offs = cpu_to_le16(min_offs - size);
692 memset(val + size - EXT4_XATTR_PAD, 0, 677 if (i->value == EXT4_ZERO_XATTR_VALUE) {
693 EXT4_XATTR_PAD); /* Clear the pad bytes. */ 678 memset(val, 0, size);
694 memcpy(val, i->value, i->value_len); 679 } else {
680 /* Clear the pad bytes first. */
681 memset(val + size - EXT4_XATTR_PAD, 0,
682 EXT4_XATTR_PAD);
683 memcpy(val, i->value, i->value_len);
684 }
695 } 685 }
696 } 686 }
697 return 0; 687 return 0;
@@ -794,7 +784,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
794 int offset = (char *)s->here - bs->bh->b_data; 784 int offset = (char *)s->here - bs->bh->b_data;
795 785
796 unlock_buffer(bs->bh); 786 unlock_buffer(bs->bh);
797 ext4_handle_release_buffer(handle, bs->bh);
798 if (ce) { 787 if (ce) {
799 mb_cache_entry_release(ce); 788 mb_cache_entry_release(ce);
800 ce = NULL; 789 ce = NULL;
@@ -950,14 +939,8 @@ bad_block:
950#undef header 939#undef header
951} 940}
952 941
953struct ext4_xattr_ibody_find { 942int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
954 struct ext4_xattr_search s; 943 struct ext4_xattr_ibody_find *is)
955 struct ext4_iloc iloc;
956};
957
958static int
959ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
960 struct ext4_xattr_ibody_find *is)
961{ 944{
962 struct ext4_xattr_ibody_header *header; 945 struct ext4_xattr_ibody_header *header;
963 struct ext4_inode *raw_inode; 946 struct ext4_inode *raw_inode;
@@ -985,10 +968,47 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
985 return 0; 968 return 0;
986} 969}
987 970
988static int 971int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
989ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, 972 struct ext4_xattr_info *i,
990 struct ext4_xattr_info *i, 973 struct ext4_xattr_ibody_find *is)
991 struct ext4_xattr_ibody_find *is) 974{
975 struct ext4_xattr_ibody_header *header;
976 struct ext4_xattr_search *s = &is->s;
977 int error;
978
979 if (EXT4_I(inode)->i_extra_isize == 0)
980 return -ENOSPC;
981 error = ext4_xattr_set_entry(i, s);
982 if (error) {
983 if (error == -ENOSPC &&
984 ext4_has_inline_data(inode)) {
985 error = ext4_try_to_evict_inline_data(handle, inode,
986 EXT4_XATTR_LEN(strlen(i->name) +
987 EXT4_XATTR_SIZE(i->value_len)));
988 if (error)
989 return error;
990 error = ext4_xattr_ibody_find(inode, i, is);
991 if (error)
992 return error;
993 error = ext4_xattr_set_entry(i, s);
994 }
995 if (error)
996 return error;
997 }
998 header = IHDR(inode, ext4_raw_inode(&is->iloc));
999 if (!IS_LAST_ENTRY(s->first)) {
1000 header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
1001 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
1002 } else {
1003 header->h_magic = cpu_to_le32(0);
1004 ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
1005 }
1006 return 0;
1007}
1008
1009static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
1010 struct ext4_xattr_info *i,
1011 struct ext4_xattr_ibody_find *is)
992{ 1012{
993 struct ext4_xattr_ibody_header *header; 1013 struct ext4_xattr_ibody_header *header;
994 struct ext4_xattr_search *s = &is->s; 1014 struct ext4_xattr_search *s = &is->s;
@@ -1144,9 +1164,17 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
1144{ 1164{
1145 handle_t *handle; 1165 handle_t *handle;
1146 int error, retries = 0; 1166 int error, retries = 0;
1167 int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
1147 1168
1148retry: 1169retry:
1149 handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); 1170 /*
1171 * In case of inline data, we may push out the data to a block,
1172 * So reserve the journal space first.
1173 */
1174 if (ext4_has_inline_data(inode))
1175 credits += ext4_writepage_trans_blocks(inode) + 1;
1176
1177 handle = ext4_journal_start(inode, credits);
1150 if (IS_ERR(handle)) { 1178 if (IS_ERR(handle)) {
1151 error = PTR_ERR(handle); 1179 error = PTR_ERR(handle);
1152 } else { 1180 } else {
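
Both memset branches above key off EXT4_ZERO_XATTR_VALUE, the ((void *)-1) sentinel this series adds in xattr.h: it asks ext4_xattr_set_entry() to reserve value_len bytes and zero-fill them rather than copy from a caller buffer. A hedged sketch of a caller reserving zeroed space this way (the struct and index come from this series; the "data" name is an assumption for illustration):

	struct ext4_xattr_info i = {
		.name_index = EXT4_XATTR_INDEX_SYSTEM,
		.name = "data",			/* assumed xattr name */
		.value = EXT4_ZERO_XATTR_VALUE,	/* reserve and zero, no copy */
		.value_len = len,
	};
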
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 91f31ca7d9af..69eda787a96a 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -21,6 +21,7 @@
21#define EXT4_XATTR_INDEX_TRUSTED 4 21#define EXT4_XATTR_INDEX_TRUSTED 4
22#define EXT4_XATTR_INDEX_LUSTRE 5 22#define EXT4_XATTR_INDEX_LUSTRE 5
23#define EXT4_XATTR_INDEX_SECURITY 6 23#define EXT4_XATTR_INDEX_SECURITY 6
24#define EXT4_XATTR_INDEX_SYSTEM 7
24 25
25struct ext4_xattr_header { 26struct ext4_xattr_header {
26 __le32 h_magic; /* magic number for identification */ 27 __le32 h_magic; /* magic number for identification */
@@ -65,7 +66,32 @@ struct ext4_xattr_entry {
65 EXT4_I(inode)->i_extra_isize)) 66 EXT4_I(inode)->i_extra_isize))
66#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) 67#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
67 68
68# ifdef CONFIG_EXT4_FS_XATTR 69#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
70#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
71#define BFIRST(bh) ENTRY(BHDR(bh)+1)
72#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
73
74#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
75
76struct ext4_xattr_info {
77 int name_index;
78 const char *name;
79 const void *value;
80 size_t value_len;
81};
82
83struct ext4_xattr_search {
84 struct ext4_xattr_entry *first;
85 void *base;
86 void *end;
87 struct ext4_xattr_entry *here;
88 int not_found;
89};
90
91struct ext4_xattr_ibody_find {
92 struct ext4_xattr_search s;
93 struct ext4_iloc iloc;
94};
69 95
70extern const struct xattr_handler ext4_xattr_user_handler; 96extern const struct xattr_handler ext4_xattr_user_handler;
71extern const struct xattr_handler ext4_xattr_trusted_handler; 97extern const struct xattr_handler ext4_xattr_trusted_handler;
@@ -90,60 +116,82 @@ extern void ext4_exit_xattr(void);
90 116
91extern const struct xattr_handler *ext4_xattr_handlers[]; 117extern const struct xattr_handler *ext4_xattr_handlers[];
92 118
93# else /* CONFIG_EXT4_FS_XATTR */ 119extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
94 120 struct ext4_xattr_ibody_find *is);
95static inline int 121extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
96ext4_xattr_get(struct inode *inode, int name_index, const char *name, 122 const char *name,
97 void *buffer, size_t size, int flags) 123 void *buffer, size_t buffer_size);
98{ 124extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
99 return -EOPNOTSUPP; 125 struct ext4_xattr_info *i,
100} 126 struct ext4_xattr_ibody_find *is);
101 127
102static inline int 128extern int ext4_has_inline_data(struct inode *inode);
103ext4_xattr_set(struct inode *inode, int name_index, const char *name, 129extern int ext4_get_inline_size(struct inode *inode);
104 const void *value, size_t size, int flags) 130extern int ext4_get_max_inline_size(struct inode *inode);
105{ 131extern int ext4_find_inline_data_nolock(struct inode *inode);
106 return -EOPNOTSUPP; 132extern void ext4_write_inline_data(struct inode *inode,
107} 133 struct ext4_iloc *iloc,
108 134 void *buffer, loff_t pos,
109static inline int 135 unsigned int len);
110ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, 136extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
111 const char *name, const void *value, size_t size, int flags) 137 unsigned int len);
112{ 138extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
113 return -EOPNOTSUPP; 139 unsigned int len);
114} 140extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
115 141
116static inline void 142extern int ext4_readpage_inline(struct inode *inode, struct page *page);
117ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) 143extern int ext4_try_to_write_inline_data(struct address_space *mapping,
118{ 144 struct inode *inode,
119} 145 loff_t pos, unsigned len,
120 146 unsigned flags,
121static inline void 147 struct page **pagep);
122ext4_xattr_put_super(struct super_block *sb) 148extern int ext4_write_inline_data_end(struct inode *inode,
123{ 149 loff_t pos, unsigned len,
124} 150 unsigned copied,
125 151 struct page *page);
126static __init inline int 152extern struct buffer_head *
127ext4_init_xattr(void) 153ext4_journalled_write_inline_data(struct inode *inode,
128{ 154 unsigned len,
129 return 0; 155 struct page *page);
130} 156extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
131 157 struct inode *inode,
132static inline void 158 loff_t pos, unsigned len,
133ext4_exit_xattr(void) 159 unsigned flags,
134{ 160 struct page **pagep,
135} 161 void **fsdata);
136 162extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
137static inline int 163 unsigned len, unsigned copied,
138ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 164 struct page *page);
139 struct ext4_inode *raw_inode, handle_t *handle) 165extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
140{ 166 struct inode *inode);
141 return -EOPNOTSUPP; 167extern int ext4_try_create_inline_dir(handle_t *handle,
142} 168 struct inode *parent,
143 169 struct inode *inode);
144#define ext4_xattr_handlers NULL 170extern int ext4_read_inline_dir(struct file *filp,
145 171 void *dirent, filldir_t filldir,
146# endif /* CONFIG_EXT4_FS_XATTR */ 172 int *has_inline_data);
173extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
174 const struct qstr *d_name,
175 struct ext4_dir_entry_2 **res_dir,
176 int *has_inline_data);
177extern int ext4_delete_inline_entry(handle_t *handle,
178 struct inode *dir,
179 struct ext4_dir_entry_2 *de_del,
180 struct buffer_head *bh,
181 int *has_inline_data);
182extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
183extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
184 struct ext4_dir_entry_2 **parent_de,
185 int *retval);
186extern int ext4_inline_data_fiemap(struct inode *inode,
187 struct fiemap_extent_info *fieinfo,
188 int *has_inline);
189extern int ext4_try_to_evict_inline_data(handle_t *handle,
190 struct inode *inode,
191 int needed);
192extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
193
194extern int ext4_convert_inline_data(struct inode *inode);
147 195
148#ifdef CONFIG_EXT4_FS_SECURITY 196#ifdef CONFIG_EXT4_FS_SECURITY
149extern int ext4_init_security(handle_t *handle, struct inode *inode, 197extern int ext4_init_security(handle_t *handle, struct inode *inode,
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
new file mode 100644
index 000000000000..fd27e7e6326e
--- /dev/null
+++ b/fs/f2fs/Kconfig
@@ -0,0 +1,53 @@
1config F2FS_FS
2 tristate "F2FS filesystem support (EXPERIMENTAL)"
3 depends on BLOCK
4 help
5 F2FS is based on Log-structured File System (LFS), which supports
6	  versatile "flash-friendly" features. The design focuses on
7	  addressing the fundamental issues in LFS: the snowball effect
8	  of the wandering tree and the high cleaning overhead.
9
10	  Since flash-based storage shows different characteristics depending
11	  on its internal geometry or flash management scheme (aka FTL), F2FS
12	  and its tools support various parameters for configuring the on-disk
13	  layout and for selecting allocation and cleaning algorithms.
14
15 If unsure, say N.
16
17config F2FS_STAT_FS
18 bool "F2FS Status Information"
19 depends on F2FS_FS && DEBUG_FS
20 default y
21 help
22 /sys/kernel/debug/f2fs/ contains information about all the partitions
23	  mounted as f2fs. Each file shows overall f2fs information.
24
25 /sys/kernel/debug/f2fs/status includes:
26 - major file system information managed by f2fs currently
27 - average SIT information about whole segments
28 - current memory footprint consumed by f2fs.
29
30config F2FS_FS_XATTR
31 bool "F2FS extended attributes"
32 depends on F2FS_FS
33 default y
34 help
35 Extended attributes are name:value pairs associated with inodes by
36 the kernel or by users (see the attr(5) manual page, or visit
37 <http://acl.bestbits.at/> for details).
38
39 If unsure, say N.
40
41config F2FS_FS_POSIX_ACL
42 bool "F2FS Access Control Lists"
43 depends on F2FS_FS_XATTR
44 select FS_POSIX_ACL
45 default y
46 help
47 Posix Access Control Lists (ACLs) support permissions for users and
48	  groups beyond the owner/group/world scheme.
49
50 To learn more about Access Control Lists, visit the POSIX ACLs for
51 Linux website <http://acl.bestbits.at/>.
52
53	  If you don't know what Access Control Lists are, say N.
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
new file mode 100644
index 000000000000..27a0820340b9
--- /dev/null
+++ b/fs/f2fs/Makefile
@@ -0,0 +1,7 @@
1obj-$(CONFIG_F2FS_FS) += f2fs.o
2
3f2fs-y := dir.o file.o inode.o namei.o hash.o super.o
4f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
5f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
6f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
7f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
new file mode 100644
index 000000000000..137af4255da6
--- /dev/null
+++ b/fs/f2fs/acl.c
@@ -0,0 +1,412 @@
1/*
2 * fs/f2fs/acl.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * Portions of this code from linux/fs/ext2/acl.c
8 *
9 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 */
15#include <linux/f2fs_fs.h>
16#include "f2fs.h"
17#include "xattr.h"
18#include "acl.h"
19
20#define get_inode_mode(i) ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
21 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
22
23static inline size_t f2fs_acl_size(int count)
24{
25 if (count <= 4) {
26 return sizeof(struct f2fs_acl_header) +
27 count * sizeof(struct f2fs_acl_entry_short);
28 } else {
29 return sizeof(struct f2fs_acl_header) +
30 4 * sizeof(struct f2fs_acl_entry_short) +
31 (count - 4) * sizeof(struct f2fs_acl_entry);
32 }
33}
34
35static inline int f2fs_acl_count(size_t size)
36{
37 ssize_t s;
38 size -= sizeof(struct f2fs_acl_header);
39 s = size - 4 * sizeof(struct f2fs_acl_entry_short);
40 if (s < 0) {
41 if (size % sizeof(struct f2fs_acl_entry_short))
42 return -1;
43 return size / sizeof(struct f2fs_acl_entry_short);
44 } else {
45 if (s % sizeof(struct f2fs_acl_entry))
46 return -1;
47 return s / sizeof(struct f2fs_acl_entry) + 4;
48 }
49}
50
51static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
52{
53 int i, count;
54 struct posix_acl *acl;
55 struct f2fs_acl_header *hdr = (struct f2fs_acl_header *)value;
56 struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1);
57 const char *end = value + size;
58
59 if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION))
60 return ERR_PTR(-EINVAL);
61
62 count = f2fs_acl_count(size);
63 if (count < 0)
64 return ERR_PTR(-EINVAL);
65 if (count == 0)
66 return NULL;
67
68 acl = posix_acl_alloc(count, GFP_KERNEL);
69 if (!acl)
70 return ERR_PTR(-ENOMEM);
71
72 for (i = 0; i < count; i++) {
73
74 if ((char *)entry > end)
75 goto fail;
76
77 acl->a_entries[i].e_tag = le16_to_cpu(entry->e_tag);
78 acl->a_entries[i].e_perm = le16_to_cpu(entry->e_perm);
79
80 switch (acl->a_entries[i].e_tag) {
81 case ACL_USER_OBJ:
82 case ACL_GROUP_OBJ:
83 case ACL_MASK:
84 case ACL_OTHER:
85 entry = (struct f2fs_acl_entry *)((char *)entry +
86 sizeof(struct f2fs_acl_entry_short));
87 break;
88
89 case ACL_USER:
90 acl->a_entries[i].e_uid =
91 make_kuid(&init_user_ns,
92 le32_to_cpu(entry->e_id));
93 entry = (struct f2fs_acl_entry *)((char *)entry +
94 sizeof(struct f2fs_acl_entry));
95 break;
96 case ACL_GROUP:
97 acl->a_entries[i].e_gid =
98 make_kgid(&init_user_ns,
99 le32_to_cpu(entry->e_id));
100 entry = (struct f2fs_acl_entry *)((char *)entry +
101 sizeof(struct f2fs_acl_entry));
102 break;
103 default:
104 goto fail;
105 }
106 }
107 if ((char *)entry != end)
108 goto fail;
109 return acl;
110fail:
111 posix_acl_release(acl);
112 return ERR_PTR(-EINVAL);
113}
114
115static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
116{
117 struct f2fs_acl_header *f2fs_acl;
118 struct f2fs_acl_entry *entry;
119 int i;
120
121 f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
122 sizeof(struct f2fs_acl_entry), GFP_KERNEL);
123 if (!f2fs_acl)
124 return ERR_PTR(-ENOMEM);
125
126 f2fs_acl->a_version = cpu_to_le32(F2FS_ACL_VERSION);
127 entry = (struct f2fs_acl_entry *)(f2fs_acl + 1);
128
129 for (i = 0; i < acl->a_count; i++) {
130
131 entry->e_tag = cpu_to_le16(acl->a_entries[i].e_tag);
132 entry->e_perm = cpu_to_le16(acl->a_entries[i].e_perm);
133
134 switch (acl->a_entries[i].e_tag) {
135 case ACL_USER:
136 entry->e_id = cpu_to_le32(
137 from_kuid(&init_user_ns,
138 acl->a_entries[i].e_uid));
139 entry = (struct f2fs_acl_entry *)((char *)entry +
140 sizeof(struct f2fs_acl_entry));
141 break;
142 case ACL_GROUP:
143 entry->e_id = cpu_to_le32(
144 from_kgid(&init_user_ns,
145 acl->a_entries[i].e_gid));
146 entry = (struct f2fs_acl_entry *)((char *)entry +
147 sizeof(struct f2fs_acl_entry));
148 break;
149 case ACL_USER_OBJ:
150 case ACL_GROUP_OBJ:
151 case ACL_MASK:
152 case ACL_OTHER:
153 entry = (struct f2fs_acl_entry *)((char *)entry +
154 sizeof(struct f2fs_acl_entry_short));
155 break;
156 default:
157 goto fail;
158 }
159 }
160 *size = f2fs_acl_size(acl->a_count);
161 return (void *)f2fs_acl;
162
163fail:
164 kfree(f2fs_acl);
165 return ERR_PTR(-EINVAL);
166}
167
168struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
169{
170 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
171 int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
172 void *value = NULL;
173 struct posix_acl *acl;
174 int retval;
175
176 if (!test_opt(sbi, POSIX_ACL))
177 return NULL;
178
179 acl = get_cached_acl(inode, type);
180 if (acl != ACL_NOT_CACHED)
181 return acl;
182
183 if (type == ACL_TYPE_ACCESS)
184 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
185
186 retval = f2fs_getxattr(inode, name_index, "", NULL, 0);
187 if (retval > 0) {
188 value = kmalloc(retval, GFP_KERNEL);
189 if (!value)
190 return ERR_PTR(-ENOMEM);
191 retval = f2fs_getxattr(inode, name_index, "", value, retval);
192 }
193
194 if (retval > 0)
195 acl = f2fs_acl_from_disk(value, retval);
196 else if (retval == -ENODATA)
197 acl = NULL;
198 else
199 acl = ERR_PTR(retval);
200 kfree(value);
201
202 if (!IS_ERR(acl))
203 set_cached_acl(inode, type, acl);
204
205 return acl;
206}
207
208static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
209{
210 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
211 struct f2fs_inode_info *fi = F2FS_I(inode);
212 int name_index;
213 void *value = NULL;
214 size_t size = 0;
215 int error;
216
217 if (!test_opt(sbi, POSIX_ACL))
218 return 0;
219 if (S_ISLNK(inode->i_mode))
220 return -EOPNOTSUPP;
221
222 switch (type) {
223 case ACL_TYPE_ACCESS:
224 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
225 if (acl) {
226 error = posix_acl_equiv_mode(acl, &inode->i_mode);
227 if (error < 0)
228 return error;
229 set_acl_inode(fi, inode->i_mode);
230 if (error == 0)
231 acl = NULL;
232 }
233 break;
234
235 case ACL_TYPE_DEFAULT:
236 name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
237 if (!S_ISDIR(inode->i_mode))
238 return acl ? -EACCES : 0;
239 break;
240
241 default:
242 return -EINVAL;
243 }
244
245 if (acl) {
246 value = f2fs_acl_to_disk(acl, &size);
247 if (IS_ERR(value)) {
248 cond_clear_inode_flag(fi, FI_ACL_MODE);
249 return (int)PTR_ERR(value);
250 }
251 }
252
253 error = f2fs_setxattr(inode, name_index, "", value, size);
254
255 kfree(value);
256 if (!error)
257 set_cached_acl(inode, type, acl);
258
259 cond_clear_inode_flag(fi, FI_ACL_MODE);
260 return error;
261}
262
263int f2fs_init_acl(struct inode *inode, struct inode *dir)
264{
265 struct posix_acl *acl = NULL;
266 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
267 int error = 0;
268
269 if (!S_ISLNK(inode->i_mode)) {
270 if (test_opt(sbi, POSIX_ACL)) {
271 acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT);
272 if (IS_ERR(acl))
273 return PTR_ERR(acl);
274 }
275 if (!acl)
276 inode->i_mode &= ~current_umask();
277 }
278
279 if (test_opt(sbi, POSIX_ACL) && acl) {
280
281 if (S_ISDIR(inode->i_mode)) {
282 error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
283 if (error)
284 goto cleanup;
285 }
286 error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
287 if (error < 0)
288 return error;
289 if (error > 0)
290 error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
291 }
292cleanup:
293 posix_acl_release(acl);
294 return error;
295}
296
297int f2fs_acl_chmod(struct inode *inode)
298{
299 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
300 struct posix_acl *acl;
301 int error;
302 mode_t mode = get_inode_mode(inode);
303
304 if (!test_opt(sbi, POSIX_ACL))
305 return 0;
306 if (S_ISLNK(mode))
307 return -EOPNOTSUPP;
308
309 acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS);
310 if (IS_ERR(acl) || !acl)
311 return PTR_ERR(acl);
312
313 error = posix_acl_chmod(&acl, GFP_KERNEL, mode);
314 if (error)
315 return error;
316 error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
317 posix_acl_release(acl);
318 return error;
319}
320
321static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list,
322 size_t list_size, const char *name, size_t name_len, int type)
323{
324 struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
325 const char *xname = POSIX_ACL_XATTR_DEFAULT;
326 size_t size;
327
328 if (!test_opt(sbi, POSIX_ACL))
329 return 0;
330
331 if (type == ACL_TYPE_ACCESS)
332 xname = POSIX_ACL_XATTR_ACCESS;
333
334 size = strlen(xname) + 1;
335 if (list && size <= list_size)
336 memcpy(list, xname, size);
337 return size;
338}
339
340static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name,
341 void *buffer, size_t size, int type)
342{
343 struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
344 struct posix_acl *acl;
345 int error;
346
347 if (strcmp(name, "") != 0)
348 return -EINVAL;
349 if (!test_opt(sbi, POSIX_ACL))
350 return -EOPNOTSUPP;
351
352 acl = f2fs_get_acl(dentry->d_inode, type);
353 if (IS_ERR(acl))
354 return PTR_ERR(acl);
355 if (!acl)
356 return -ENODATA;
357 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
358 posix_acl_release(acl);
359
360 return error;
361}
362
363static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name,
364 const void *value, size_t size, int flags, int type)
365{
366 struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
367 struct inode *inode = dentry->d_inode;
368 struct posix_acl *acl = NULL;
369 int error;
370
371 if (strcmp(name, "") != 0)
372 return -EINVAL;
373 if (!test_opt(sbi, POSIX_ACL))
374 return -EOPNOTSUPP;
375 if (!inode_owner_or_capable(inode))
376 return -EPERM;
377
378 if (value) {
379 acl = posix_acl_from_xattr(&init_user_ns, value, size);
380 if (IS_ERR(acl))
381 return PTR_ERR(acl);
382 if (acl) {
383 error = posix_acl_valid(acl);
384 if (error)
385 goto release_and_out;
386 }
387 } else {
388 acl = NULL;
389 }
390
391 error = f2fs_set_acl(inode, type, acl);
392
393release_and_out:
394 posix_acl_release(acl);
395 return error;
396}
397
398const struct xattr_handler f2fs_xattr_acl_default_handler = {
399 .prefix = POSIX_ACL_XATTR_DEFAULT,
400 .flags = ACL_TYPE_DEFAULT,
401 .list = f2fs_xattr_list_acl,
402 .get = f2fs_xattr_get_acl,
403 .set = f2fs_xattr_set_acl,
404};
405
406const struct xattr_handler f2fs_xattr_acl_access_handler = {
407 .prefix = POSIX_ACL_XATTR_ACCESS,
408 .flags = ACL_TYPE_ACCESS,
409 .list = f2fs_xattr_list_acl,
410 .get = f2fs_xattr_get_acl,
411 .set = f2fs_xattr_set_acl,
412};
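
f2fs_get_acl() above uses the common probe-then-read xattr idiom: a first f2fs_getxattr() call with a NULL buffer returns the value size, and a second call fills a buffer of exactly that size. A generic sketch of the idiom (get_xattr() is a stand-in for any getxattr-style API):

	int len = get_xattr(inode, idx, "", NULL, 0);	/* size probe */
	if (len > 0) {
		void *buf = kmalloc(len, GFP_KERNEL);
		if (buf)
			len = get_xattr(inode, idx, "", buf, len);	/* real read */
	}

Since the value can change between the two calls on a live inode, f2fs_acl_from_disk() still validates whatever buffer it is handed.
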
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
new file mode 100644
index 000000000000..80f430674417
--- /dev/null
+++ b/fs/f2fs/acl.h
@@ -0,0 +1,57 @@
1/*
2 * fs/f2fs/acl.h
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * Portions of this code from linux/fs/ext2/acl.h
8 *
9 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 */
15#ifndef __F2FS_ACL_H__
16#define __F2FS_ACL_H__
17
18#include <linux/posix_acl_xattr.h>
19
20#define F2FS_ACL_VERSION 0x0001
21
22struct f2fs_acl_entry {
23 __le16 e_tag;
24 __le16 e_perm;
25 __le32 e_id;
26};
27
28struct f2fs_acl_entry_short {
29 __le16 e_tag;
30 __le16 e_perm;
31};
32
33struct f2fs_acl_header {
34 __le32 a_version;
35};
36
37#ifdef CONFIG_F2FS_FS_POSIX_ACL
38
39extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type);
40extern int f2fs_acl_chmod(struct inode *inode);
41extern int f2fs_init_acl(struct inode *inode, struct inode *dir);
42#else
43#define f2fs_check_acl NULL
44#define f2fs_get_acl NULL
45#define f2fs_set_acl NULL
46
47static inline int f2fs_acl_chmod(struct inode *inode)
48{
49 return 0;
50}
51
52static inline int f2fs_init_acl(struct inode *inode, struct inode *dir)
53{
54 return 0;
55}
56#endif
57#endif /* __F2FS_ACL_H__ */
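
The two entry layouts above drive the size arithmetic in acl.c: f2fs_acl_to_disk() writes the four class tags (USER_OBJ, GROUP_OBJ, MASK, OTHER) in the short form without e_id, and named USER/GROUP entries in the full form. A quick worked check, assuming the declared field widths (4-byte header, 4-byte short entry, 8-byte full entry):

	/* count = 6 -> 4 + 4*4 + 2*8 = 36 bytes on disk */
	size_t sz = sizeof(struct f2fs_acl_header)
		+ 4 * sizeof(struct f2fs_acl_entry_short)
		+ 2 * sizeof(struct f2fs_acl_entry);

f2fs_acl_count() inverts this: strip the header, and if removing four short entries would go negative, the whole value must consist of short entries only.
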
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
new file mode 100644
index 000000000000..ff3c8439af87
--- /dev/null
+++ b/fs/f2fs/checkpoint.c
@@ -0,0 +1,793 @@
1/*
2 * fs/f2fs/checkpoint.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/bio.h>
13#include <linux/mpage.h>
14#include <linux/writeback.h>
15#include <linux/blkdev.h>
16#include <linux/f2fs_fs.h>
17#include <linux/pagevec.h>
18#include <linux/swap.h>
19
20#include "f2fs.h"
21#include "node.h"
22#include "segment.h"
23
24static struct kmem_cache *orphan_entry_slab;
25static struct kmem_cache *inode_entry_slab;
26
27/*
28 * We guarantee no failure on the returned page.
29 */
30struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
31{
32 struct address_space *mapping = sbi->meta_inode->i_mapping;
33 struct page *page = NULL;
34repeat:
35 page = grab_cache_page(mapping, index);
36 if (!page) {
37 cond_resched();
38 goto repeat;
39 }
40
41	/* We wait for writeback only inside grab_meta_page() */
42 wait_on_page_writeback(page);
43 SetPageUptodate(page);
44 return page;
45}
46
47/*
48 * We guarantee no failure on the returned page.
49 */
50struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
51{
52 struct address_space *mapping = sbi->meta_inode->i_mapping;
53 struct page *page;
54repeat:
55 page = grab_cache_page(mapping, index);
56 if (!page) {
57 cond_resched();
58 goto repeat;
59 }
60 if (f2fs_readpage(sbi, page, index, READ_SYNC)) {
61 f2fs_put_page(page, 1);
62 goto repeat;
63 }
64 mark_page_accessed(page);
65
66	/* We do not allow returning an erroneous page */
67 return page;
68}
69
70static int f2fs_write_meta_page(struct page *page,
71 struct writeback_control *wbc)
72{
73 struct inode *inode = page->mapping->host;
74 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
75 int err;
76
77 wait_on_page_writeback(page);
78
79 err = write_meta_page(sbi, page, wbc);
80 if (err) {
81 wbc->pages_skipped++;
82 set_page_dirty(page);
83 }
84
85 dec_page_count(sbi, F2FS_DIRTY_META);
86
87 /* In this case, we should not unlock this page */
88 if (err != AOP_WRITEPAGE_ACTIVATE)
89 unlock_page(page);
90 return err;
91}
92
93static int f2fs_write_meta_pages(struct address_space *mapping,
94 struct writeback_control *wbc)
95{
96 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
97 struct block_device *bdev = sbi->sb->s_bdev;
98 long written;
99
100 if (wbc->for_kupdate)
101 return 0;
102
103 if (get_pages(sbi, F2FS_DIRTY_META) == 0)
104 return 0;
105
106	/* if mounting failed, skip writing node pages */
107 mutex_lock(&sbi->cp_mutex);
108 written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev));
109 mutex_unlock(&sbi->cp_mutex);
110 wbc->nr_to_write -= written;
111 return 0;
112}
113
114long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
115 long nr_to_write)
116{
117 struct address_space *mapping = sbi->meta_inode->i_mapping;
118 pgoff_t index = 0, end = LONG_MAX;
119 struct pagevec pvec;
120 long nwritten = 0;
121 struct writeback_control wbc = {
122 .for_reclaim = 0,
123 };
124
125 pagevec_init(&pvec, 0);
126
127 while (index <= end) {
128 int i, nr_pages;
129 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
130 PAGECACHE_TAG_DIRTY,
131 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
132 if (nr_pages == 0)
133 break;
134
135 for (i = 0; i < nr_pages; i++) {
136 struct page *page = pvec.pages[i];
137 lock_page(page);
138 BUG_ON(page->mapping != mapping);
139 BUG_ON(!PageDirty(page));
140 clear_page_dirty_for_io(page);
141 f2fs_write_meta_page(page, &wbc);
142 if (nwritten++ >= nr_to_write)
143 break;
144 }
145 pagevec_release(&pvec);
146 cond_resched();
147 }
148
149 if (nwritten)
150 f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX);
151
152 return nwritten;
153}
154
155static int f2fs_set_meta_page_dirty(struct page *page)
156{
157 struct address_space *mapping = page->mapping;
158 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
159
160 SetPageUptodate(page);
161 if (!PageDirty(page)) {
162 __set_page_dirty_nobuffers(page);
163 inc_page_count(sbi, F2FS_DIRTY_META);
164 F2FS_SET_SB_DIRT(sbi);
165 return 1;
166 }
167 return 0;
168}
169
170const struct address_space_operations f2fs_meta_aops = {
171 .writepage = f2fs_write_meta_page,
172 .writepages = f2fs_write_meta_pages,
173 .set_page_dirty = f2fs_set_meta_page_dirty,
174};
175
176int check_orphan_space(struct f2fs_sb_info *sbi)
177{
178 unsigned int max_orphans;
179 int err = 0;
180
181 /*
182	 * Considering 512 blocks in a segment, 5 blocks are needed for the cp
183	 * and the log segment summaries. The remaining blocks are used to keep
184	 * orphan entries. With one reserved segment for the cp pack, we can
185	 * have at most 1020 * 507 orphan entries.
186 */
187 max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK;
188 mutex_lock(&sbi->orphan_inode_mutex);
189 if (sbi->n_orphans >= max_orphans)
190 err = -ENOSPC;
191 mutex_unlock(&sbi->orphan_inode_mutex);
192 return err;
193}
194
195void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
196{
197 struct list_head *head, *this;
198 struct orphan_inode_entry *new = NULL, *orphan = NULL;
199
200 mutex_lock(&sbi->orphan_inode_mutex);
201 head = &sbi->orphan_inode_list;
202 list_for_each(this, head) {
203 orphan = list_entry(this, struct orphan_inode_entry, list);
204 if (orphan->ino == ino)
205 goto out;
206 if (orphan->ino > ino)
207 break;
208 orphan = NULL;
209 }
210retry:
211 new = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
212 if (!new) {
213 cond_resched();
214 goto retry;
215 }
216 new->ino = ino;
217
218	/* add the new entry into the list, which is sorted by inode number */
219 if (orphan) {
220 struct orphan_inode_entry *prev;
221
222 /* get previous entry */
223 prev = list_entry(orphan->list.prev, typeof(*prev), list);
224 if (&prev->list != head)
225 /* insert new orphan inode entry */
226 list_add(&new->list, &prev->list);
227 else
228 list_add(&new->list, head);
229 } else {
230 list_add_tail(&new->list, head);
231 }
232 sbi->n_orphans++;
233out:
234 mutex_unlock(&sbi->orphan_inode_mutex);
235}
236
237void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
238{
239 struct list_head *this, *next, *head;
240 struct orphan_inode_entry *orphan;
241
242 mutex_lock(&sbi->orphan_inode_mutex);
243 head = &sbi->orphan_inode_list;
244 list_for_each_safe(this, next, head) {
245 orphan = list_entry(this, struct orphan_inode_entry, list);
246 if (orphan->ino == ino) {
247 list_del(&orphan->list);
248 kmem_cache_free(orphan_entry_slab, orphan);
249 sbi->n_orphans--;
250 break;
251 }
252 }
253 mutex_unlock(&sbi->orphan_inode_mutex);
254}
255
256static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
257{
258 struct inode *inode = f2fs_iget(sbi->sb, ino);
259 BUG_ON(IS_ERR(inode));
260 clear_nlink(inode);
261
262 /* truncate all the data during iput */
263 iput(inode);
264}
265
266int recover_orphan_inodes(struct f2fs_sb_info *sbi)
267{
268 block_t start_blk, orphan_blkaddr, i, j;
269
270 if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
271 return 0;
272
273 sbi->por_doing = 1;
274 start_blk = __start_cp_addr(sbi) + 1;
275 orphan_blkaddr = __start_sum_addr(sbi) - 1;
276
277 for (i = 0; i < orphan_blkaddr; i++) {
278 struct page *page = get_meta_page(sbi, start_blk + i);
279 struct f2fs_orphan_block *orphan_blk;
280
281 orphan_blk = (struct f2fs_orphan_block *)page_address(page);
282 for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
283 nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
284 recover_orphan_inode(sbi, ino);
285 }
286 f2fs_put_page(page, 1);
287 }
288 /* clear Orphan Flag */
289 clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
290 sbi->por_doing = 0;
291 return 0;
292}
293
294static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
295{
296 struct list_head *head, *this, *next;
297 struct f2fs_orphan_block *orphan_blk = NULL;
298 struct page *page = NULL;
299 unsigned int nentries = 0;
300 unsigned short index = 1;
301 unsigned short orphan_blocks;
302
303 orphan_blocks = (unsigned short)((sbi->n_orphans +
304 (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
305
306 mutex_lock(&sbi->orphan_inode_mutex);
307 head = &sbi->orphan_inode_list;
308
309	/* loop over each orphan inode entry and write them into the journal block */
310 list_for_each_safe(this, next, head) {
311 struct orphan_inode_entry *orphan;
312
313 orphan = list_entry(this, struct orphan_inode_entry, list);
314
315 if (nentries == F2FS_ORPHANS_PER_BLOCK) {
316 /*
317			 * If an orphan block is full of 1020 entries,
318			 * we need to flush the current orphan block
319			 * and bring another one into memory.
320 */
321 orphan_blk->blk_addr = cpu_to_le16(index);
322 orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
323 orphan_blk->entry_count = cpu_to_le32(nentries);
324 set_page_dirty(page);
325 f2fs_put_page(page, 1);
326 index++;
327 start_blk++;
328 nentries = 0;
329 page = NULL;
330 }
331 if (page)
332 goto page_exist;
333
334 page = grab_meta_page(sbi, start_blk);
335 orphan_blk = (struct f2fs_orphan_block *)page_address(page);
336 memset(orphan_blk, 0, sizeof(*orphan_blk));
337page_exist:
338 orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
339 }
340 if (!page)
341 goto end;
342
343 orphan_blk->blk_addr = cpu_to_le16(index);
344 orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
345 orphan_blk->entry_count = cpu_to_le32(nentries);
346 set_page_dirty(page);
347 f2fs_put_page(page, 1);
348end:
349 mutex_unlock(&sbi->orphan_inode_mutex);
350}
351
352static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
353 block_t cp_addr, unsigned long long *version)
354{
355 struct page *cp_page_1, *cp_page_2 = NULL;
356 unsigned long blk_size = sbi->blocksize;
357 struct f2fs_checkpoint *cp_block;
358 unsigned long long cur_version = 0, pre_version = 0;
359 unsigned int crc = 0;
360 size_t crc_offset;
361
362 /* Read the 1st cp block in this CP pack */
363 cp_page_1 = get_meta_page(sbi, cp_addr);
364
365 /* get the version number */
366 cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
367 crc_offset = le32_to_cpu(cp_block->checksum_offset);
368 if (crc_offset >= blk_size)
369 goto invalid_cp1;
370
371 crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
372 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
373 goto invalid_cp1;
374
375 pre_version = le64_to_cpu(cp_block->checkpoint_ver);
376
377 /* Read the 2nd cp block in this CP pack */
378 cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
379 cp_page_2 = get_meta_page(sbi, cp_addr);
380
381 cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
382 crc_offset = le32_to_cpu(cp_block->checksum_offset);
383 if (crc_offset >= blk_size)
384 goto invalid_cp2;
385
386 crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
387 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
388 goto invalid_cp2;
389
390 cur_version = le64_to_cpu(cp_block->checkpoint_ver);
391
392 if (cur_version == pre_version) {
393 *version = cur_version;
394 f2fs_put_page(cp_page_2, 1);
395 return cp_page_1;
396 }
397invalid_cp2:
398 f2fs_put_page(cp_page_2, 1);
399invalid_cp1:
400 f2fs_put_page(cp_page_1, 1);
401 return NULL;
402}
403
404int get_valid_checkpoint(struct f2fs_sb_info *sbi)
405{
406 struct f2fs_checkpoint *cp_block;
407 struct f2fs_super_block *fsb = sbi->raw_super;
408 struct page *cp1, *cp2, *cur_page;
409 unsigned long blk_size = sbi->blocksize;
410 unsigned long long cp1_version = 0, cp2_version = 0;
411 unsigned long long cp_start_blk_no;
412
413 sbi->ckpt = kzalloc(blk_size, GFP_KERNEL);
414 if (!sbi->ckpt)
415 return -ENOMEM;
416 /*
417	 * Finding the valid cp block involves reading both
418	 * sets (cp pack 1 and cp pack 2).
419 */
420 cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
421 cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
422
423 /* The second checkpoint pack should start at the next segment */
424 cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
425 cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
426
427 if (cp1 && cp2) {
428 if (ver_after(cp2_version, cp1_version))
429 cur_page = cp2;
430 else
431 cur_page = cp1;
432 } else if (cp1) {
433 cur_page = cp1;
434 } else if (cp2) {
435 cur_page = cp2;
436 } else {
437 goto fail_no_cp;
438 }
439
440 cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
441 memcpy(sbi->ckpt, cp_block, blk_size);
442
443 f2fs_put_page(cp1, 1);
444 f2fs_put_page(cp2, 1);
445 return 0;
446
447fail_no_cp:
448 kfree(sbi->ckpt);
449 return -EINVAL;
450}
451
452void set_dirty_dir_page(struct inode *inode, struct page *page)
453{
454 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
455 struct list_head *head = &sbi->dir_inode_list;
456 struct dir_inode_entry *new;
457 struct list_head *this;
458
459 if (!S_ISDIR(inode->i_mode))
460 return;
461retry:
462 new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
463 if (!new) {
464 cond_resched();
465 goto retry;
466 }
467 new->inode = inode;
468 INIT_LIST_HEAD(&new->list);
469
470 spin_lock(&sbi->dir_inode_lock);
471 list_for_each(this, head) {
472 struct dir_inode_entry *entry;
473 entry = list_entry(this, struct dir_inode_entry, list);
474 if (entry->inode == inode) {
475 kmem_cache_free(inode_entry_slab, new);
476 goto out;
477 }
478 }
479 list_add_tail(&new->list, head);
480 sbi->n_dirty_dirs++;
481
482 BUG_ON(!S_ISDIR(inode->i_mode));
483out:
484 inc_page_count(sbi, F2FS_DIRTY_DENTS);
485 inode_inc_dirty_dents(inode);
486 SetPagePrivate(page);
487
488 spin_unlock(&sbi->dir_inode_lock);
489}
490
491void remove_dirty_dir_inode(struct inode *inode)
492{
493 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
494 struct list_head *head = &sbi->dir_inode_list;
495 struct list_head *this;
496
497 if (!S_ISDIR(inode->i_mode))
498 return;
499
500 spin_lock(&sbi->dir_inode_lock);
501 if (atomic_read(&F2FS_I(inode)->dirty_dents))
502 goto out;
503
504 list_for_each(this, head) {
505 struct dir_inode_entry *entry;
506 entry = list_entry(this, struct dir_inode_entry, list);
507 if (entry->inode == inode) {
508 list_del(&entry->list);
509 kmem_cache_free(inode_entry_slab, entry);
510 sbi->n_dirty_dirs--;
511 break;
512 }
513 }
514out:
515 spin_unlock(&sbi->dir_inode_lock);
516}
517
518void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
519{
520 struct list_head *head = &sbi->dir_inode_list;
521 struct dir_inode_entry *entry;
522 struct inode *inode;
523retry:
524 spin_lock(&sbi->dir_inode_lock);
525 if (list_empty(head)) {
526 spin_unlock(&sbi->dir_inode_lock);
527 return;
528 }
529 entry = list_entry(head->next, struct dir_inode_entry, list);
530 inode = igrab(entry->inode);
531 spin_unlock(&sbi->dir_inode_lock);
532 if (inode) {
533 filemap_flush(inode->i_mapping);
534 iput(inode);
535 } else {
536 /*
537		 * We should submit the bio, since there exist several
538		 * dentry pages under writeback in the freeing inode.
539 */
540 f2fs_submit_bio(sbi, DATA, true);
541 }
542 goto retry;
543}
544
545/*
546 * Freeze all the FS-operations for checkpoint.
547 */
548void block_operations(struct f2fs_sb_info *sbi)
549{
550 int t;
551 struct writeback_control wbc = {
552 .sync_mode = WB_SYNC_ALL,
553 .nr_to_write = LONG_MAX,
554 .for_reclaim = 0,
555 };
556
557 /* Stop renaming operation */
558 mutex_lock_op(sbi, RENAME);
559 mutex_lock_op(sbi, DENTRY_OPS);
560
561retry_dents:
562 /* write all the dirty dentry pages */
563 sync_dirty_dir_inodes(sbi);
564
565 mutex_lock_op(sbi, DATA_WRITE);
566 if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
567 mutex_unlock_op(sbi, DATA_WRITE);
568 goto retry_dents;
569 }
570
571 /* block all the operations */
572 for (t = DATA_NEW; t <= NODE_TRUNC; t++)
573 mutex_lock_op(sbi, t);
574
575 mutex_lock(&sbi->write_inode);
576
577 /*
578	 * POR: we should ensure that there are no dirty node pages
579 * until finishing nat/sit flush.
580 */
581retry:
582 sync_node_pages(sbi, 0, &wbc);
583
584 mutex_lock_op(sbi, NODE_WRITE);
585
586 if (get_pages(sbi, F2FS_DIRTY_NODES)) {
587 mutex_unlock_op(sbi, NODE_WRITE);
588 goto retry;
589 }
590 mutex_unlock(&sbi->write_inode);
591}
592
593static void unblock_operations(struct f2fs_sb_info *sbi)
594{
595 int t;
596 for (t = NODE_WRITE; t >= RENAME; t--)
597 mutex_unlock_op(sbi, t);
598}
599
600static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
601{
602 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
603 nid_t last_nid = 0;
604 block_t start_blk;
605 struct page *cp_page;
606 unsigned int data_sum_blocks, orphan_blocks;
607 unsigned int crc32 = 0;
608 void *kaddr;
609 int i;
610
611 /* Flush all the NAT/SIT pages */
612 while (get_pages(sbi, F2FS_DIRTY_META))
613 sync_meta_pages(sbi, META, LONG_MAX);
614
615 next_free_nid(sbi, &last_nid);
616
617 /*
618 * modify checkpoint
619 * version number is already updated
620 */
621 ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
622 ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
623 ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
624 for (i = 0; i < 3; i++) {
625 ckpt->cur_node_segno[i] =
626 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
627 ckpt->cur_node_blkoff[i] =
628 cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE));
629 ckpt->alloc_type[i + CURSEG_HOT_NODE] =
630 curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
631 }
632 for (i = 0; i < 3; i++) {
633 ckpt->cur_data_segno[i] =
634 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
635 ckpt->cur_data_blkoff[i] =
636 cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA));
637 ckpt->alloc_type[i + CURSEG_HOT_DATA] =
638 curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
639 }
640
641 ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
642 ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
643 ckpt->next_free_nid = cpu_to_le32(last_nid);
644
645 /* 2 cp + n data seg summary + orphan inode blocks */
646 data_sum_blocks = npages_for_summary_flush(sbi);
647 if (data_sum_blocks < 3)
648 set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
649 else
650 clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
651
652 orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
653 / F2FS_ORPHANS_PER_BLOCK;
654 ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks);
655
656 if (is_umount) {
657 set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
658 ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
659 data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE);
660 } else {
661 clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
662 ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
663 data_sum_blocks + orphan_blocks);
664 }
665
666 if (sbi->n_orphans)
667 set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
668 else
669 clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
670
671 /* update SIT/NAT bitmap */
672 get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
673 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
674
675 crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
676 *(__le32 *)((unsigned char *)ckpt +
677 le32_to_cpu(ckpt->checksum_offset))
678 = cpu_to_le32(crc32);
679
680 start_blk = __start_cp_addr(sbi);
681
682 /* write out checkpoint buffer at block 0 */
683 cp_page = grab_meta_page(sbi, start_blk++);
684 kaddr = page_address(cp_page);
685 memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
686 set_page_dirty(cp_page);
687 f2fs_put_page(cp_page, 1);
688
689 if (sbi->n_orphans) {
690 write_orphan_inodes(sbi, start_blk);
691 start_blk += orphan_blocks;
692 }
693
694 write_data_summaries(sbi, start_blk);
695 start_blk += data_sum_blocks;
696 if (is_umount) {
697 write_node_summaries(sbi, start_blk);
698 start_blk += NR_CURSEG_NODE_TYPE;
699 }
700
701	/* write out the checkpoint block */
702 cp_page = grab_meta_page(sbi, start_blk);
703 kaddr = page_address(cp_page);
704 memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
705 set_page_dirty(cp_page);
706 f2fs_put_page(cp_page, 1);
707
708	/* wait for writeback of previously submitted node/meta pages */
709 while (get_pages(sbi, F2FS_WRITEBACK))
710 congestion_wait(BLK_RW_ASYNC, HZ / 50);
711
712 filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
713 filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
714
715 /* update user_block_counts */
716 sbi->last_valid_block_count = sbi->total_valid_block_count;
717 sbi->alloc_valid_block_count = 0;
718
719	/* Here, we have only one bio that carries the CP pack */
720 if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))
721 sbi->sb->s_flags |= MS_RDONLY;
722 else
723 sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
724
725 clear_prefree_segments(sbi);
726 F2FS_RESET_SB_DIRT(sbi);
727}
728
729/*
730 * We guarantee that this checkpoint procedure should not fail.
731 */
732void write_checkpoint(struct f2fs_sb_info *sbi, bool blocked, bool is_umount)
733{
734 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
735 unsigned long long ckpt_ver;
736
737 if (!blocked) {
738 mutex_lock(&sbi->cp_mutex);
739 block_operations(sbi);
740 }
741
742 f2fs_submit_bio(sbi, DATA, true);
743 f2fs_submit_bio(sbi, NODE, true);
744 f2fs_submit_bio(sbi, META, true);
745
746 /*
747 * update checkpoint pack index
748 * Increase the version number so that
749	 * SIT entries and seg summaries are written to the correct place
750 */
751 ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver);
752 ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
753
754 /* write cached NAT/SIT entries to NAT/SIT area */
755 flush_nat_entries(sbi);
756 flush_sit_entries(sbi);
757
758 reset_victim_segmap(sbi);
759
760 /* unlock all the fs_lock[] in do_checkpoint() */
761 do_checkpoint(sbi, is_umount);
762
763 unblock_operations(sbi);
764 mutex_unlock(&sbi->cp_mutex);
765}
766
767void init_orphan_info(struct f2fs_sb_info *sbi)
768{
769 mutex_init(&sbi->orphan_inode_mutex);
770 INIT_LIST_HEAD(&sbi->orphan_inode_list);
771 sbi->n_orphans = 0;
772}
773
774int __init create_checkpoint_caches(void)
775{
776 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
777 sizeof(struct orphan_inode_entry), NULL);
778 if (unlikely(!orphan_entry_slab))
779 return -ENOMEM;
780 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
781 sizeof(struct dir_inode_entry), NULL);
782 if (unlikely(!inode_entry_slab)) {
783 kmem_cache_destroy(orphan_entry_slab);
784 return -ENOMEM;
785 }
786 return 0;
787}
788
789void destroy_checkpoint_caches(void)
790{
791 kmem_cache_destroy(orphan_entry_slab);
792 kmem_cache_destroy(inode_entry_slab);
793}
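
do_checkpoint() above emits one CP pack in a fixed order: head checkpoint block, orphan blocks, data segment summaries, node segment summaries (umount only), and a tail copy of the checkpoint block; hence cp_pack_total_block_count = 2 + data_sum_blocks + orphan_blocks (+ NR_CURSEG_NODE_TYPE on umount). A small worked example, assuming 100 orphan inodes and a compact data summary:

	/* orphan_blocks   = DIV_ROUND_UP(100, 1020) = 1 */
	/* data_sum_blocks = 1 (compact summary)         */
	/* total           = 2 + 1 + 1 = 4 pack blocks   */

validate_checkpoint() later accepts a pack only when its head and tail blocks carry the same checkpoint_ver and both pass the CRC check, so a crash in mid-pack leaves the other pack as the recovery point.
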
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
new file mode 100644
index 000000000000..7bd22a201125
--- /dev/null
+++ b/fs/f2fs/data.c
@@ -0,0 +1,718 @@
1/*
2 * fs/f2fs/data.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/buffer_head.h>
14#include <linux/mpage.h>
15#include <linux/writeback.h>
16#include <linux/backing-dev.h>
17#include <linux/blkdev.h>
18#include <linux/bio.h>
19#include <linux/prefetch.h>
20
21#include "f2fs.h"
22#include "node.h"
23#include "segment.h"
24
25/*
26 * Lock ordering for the change of data block address:
27 * ->data_page
28 * ->node_page
29 * update block addresses in the node page
30 */
31static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
32{
33 struct f2fs_node *rn;
34 __le32 *addr_array;
35 struct page *node_page = dn->node_page;
36 unsigned int ofs_in_node = dn->ofs_in_node;
37
38 wait_on_page_writeback(node_page);
39
40 rn = (struct f2fs_node *)page_address(node_page);
41
42 /* Get physical address of data block */
43 addr_array = blkaddr_in_node(rn);
44 addr_array[ofs_in_node] = cpu_to_le32(new_addr);
45 set_page_dirty(node_page);
46}
47
48int reserve_new_block(struct dnode_of_data *dn)
49{
50 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
51
52 if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
53 return -EPERM;
54 if (!inc_valid_block_count(sbi, dn->inode, 1))
55 return -ENOSPC;
56
57 __set_data_blkaddr(dn, NEW_ADDR);
58 dn->data_blkaddr = NEW_ADDR;
59 sync_inode_page(dn);
60 return 0;
61}
62
63static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
64 struct buffer_head *bh_result)
65{
66 struct f2fs_inode_info *fi = F2FS_I(inode);
67 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
68 pgoff_t start_fofs, end_fofs;
69 block_t start_blkaddr;
70
71 read_lock(&fi->ext.ext_lock);
72 if (fi->ext.len == 0) {
73 read_unlock(&fi->ext.ext_lock);
74 return 0;
75 }
76
77 sbi->total_hit_ext++;
78 start_fofs = fi->ext.fofs;
79 end_fofs = fi->ext.fofs + fi->ext.len - 1;
80 start_blkaddr = fi->ext.blk_addr;
81
82 if (pgofs >= start_fofs && pgofs <= end_fofs) {
83 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
84 size_t count;
85
86 clear_buffer_new(bh_result);
87 map_bh(bh_result, inode->i_sb,
88 start_blkaddr + pgofs - start_fofs);
89 count = end_fofs - pgofs + 1;
90 if (count < (UINT_MAX >> blkbits))
91 bh_result->b_size = (count << blkbits);
92 else
93 bh_result->b_size = UINT_MAX;
94
95 sbi->read_hit_ext++;
96 read_unlock(&fi->ext.ext_lock);
97 return 1;
98 }
99 read_unlock(&fi->ext.ext_lock);
100 return 0;
101}
102
103void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
104{
105 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
106 pgoff_t fofs, start_fofs, end_fofs;
107 block_t start_blkaddr, end_blkaddr;
108
109 BUG_ON(blk_addr == NEW_ADDR);
110 fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node;
111
112 /* Update the page address in the parent node */
113 __set_data_blkaddr(dn, blk_addr);
114
115 write_lock(&fi->ext.ext_lock);
116
117 start_fofs = fi->ext.fofs;
118 end_fofs = fi->ext.fofs + fi->ext.len - 1;
119 start_blkaddr = fi->ext.blk_addr;
120 end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1;
121
122 /* Drop and initialize the matched extent */
123 if (fi->ext.len == 1 && fofs == start_fofs)
124 fi->ext.len = 0;
125
126 /* Initial extent */
127 if (fi->ext.len == 0) {
128 if (blk_addr != NULL_ADDR) {
129 fi->ext.fofs = fofs;
130 fi->ext.blk_addr = blk_addr;
131 fi->ext.len = 1;
132 }
133 goto end_update;
134 }
135
136	/* Front merge */
137 if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
138 fi->ext.fofs--;
139 fi->ext.blk_addr--;
140 fi->ext.len++;
141 goto end_update;
142 }
143
144 /* Back merge */
145 if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) {
146 fi->ext.len++;
147 goto end_update;
148 }
149
150 /* Split the existing extent */
151 if (fi->ext.len > 1 &&
152 fofs >= start_fofs && fofs <= end_fofs) {
153 if ((end_fofs - fofs) < (fi->ext.len >> 1)) {
154 fi->ext.len = fofs - start_fofs;
155 } else {
156 fi->ext.fofs = fofs + 1;
157 fi->ext.blk_addr = start_blkaddr +
158 fofs - start_fofs + 1;
159 fi->ext.len -= fofs - start_fofs + 1;
160 }
161 goto end_update;
162 }
163 write_unlock(&fi->ext.ext_lock);
164 return;
165
166end_update:
167 write_unlock(&fi->ext.ext_lock);
168 sync_inode_page(dn);
169 return;
170}
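
update_extent_cache() above maintains a single contiguous extent per inode and grows it only when the file offset and the disk address advance in lockstep. A quick illustration, assuming a cached extent with fofs = 10, blk_addr = 100, len = 3 (file blocks 10..12 at disk 100..102):

	/* front merge: block 9 written at disk 99   -> fofs 9,  blk 99,  len 4 */
	/* back merge:  block 13 written at disk 103 -> fofs 10, blk 100, len 4 */
	/* rewriting block 11 to a non-adjacent disk block splits the extent,   */
	/* keeping roughly the larger remaining side                            */

check_extent_cache() then serves get_data_block_ro() lookups from this one extent, accounting lookups and hits in total_hit_ext and read_hit_ext.
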
171
172struct page *find_data_page(struct inode *inode, pgoff_t index)
173{
174 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
175 struct address_space *mapping = inode->i_mapping;
176 struct dnode_of_data dn;
177 struct page *page;
178 int err;
179
180 page = find_get_page(mapping, index);
181 if (page && PageUptodate(page))
182 return page;
183 f2fs_put_page(page, 0);
184
185 set_new_dnode(&dn, inode, NULL, NULL, 0);
186 err = get_dnode_of_data(&dn, index, RDONLY_NODE);
187 if (err)
188 return ERR_PTR(err);
189 f2fs_put_dnode(&dn);
190
191 if (dn.data_blkaddr == NULL_ADDR)
192 return ERR_PTR(-ENOENT);
193
194	/* After fallocate(), there is no cached page, but the block is NEW_ADDR */
195 if (dn.data_blkaddr == NEW_ADDR)
196 return ERR_PTR(-EINVAL);
197
198 page = grab_cache_page(mapping, index);
199 if (!page)
200 return ERR_PTR(-ENOMEM);
201
202 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
203 if (err) {
204 f2fs_put_page(page, 1);
205 return ERR_PTR(err);
206 }
207 unlock_page(page);
208 return page;
209}
210
211/*
212 * If it tries to access a hole, return an error, because the
213 * callers (functions in dir.c and GC) should be able to know
214 * whether this page exists or not.
215 */
216struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
217{
218 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
219 struct address_space *mapping = inode->i_mapping;
220 struct dnode_of_data dn;
221 struct page *page;
222 int err;
223
224 set_new_dnode(&dn, inode, NULL, NULL, 0);
225 err = get_dnode_of_data(&dn, index, RDONLY_NODE);
226 if (err)
227 return ERR_PTR(err);
228 f2fs_put_dnode(&dn);
229
230 if (dn.data_blkaddr == NULL_ADDR)
231 return ERR_PTR(-ENOENT);
232
233 page = grab_cache_page(mapping, index);
234 if (!page)
235 return ERR_PTR(-ENOMEM);
236
237 if (PageUptodate(page))
238 return page;
239
240 BUG_ON(dn.data_blkaddr == NEW_ADDR);
241 BUG_ON(dn.data_blkaddr == NULL_ADDR);
242
243 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
244 if (err) {
245 f2fs_put_page(page, 1);
246 return ERR_PTR(err);
247 }
248 return page;
249}
250
251/*
252 * Caller ensures that this data page is never allocated.
253 * A new zero-filled data page is allocated in the page cache.
254 */
255struct page *get_new_data_page(struct inode *inode, pgoff_t index,
256 bool new_i_size)
257{
258 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
259 struct address_space *mapping = inode->i_mapping;
260 struct page *page;
261 struct dnode_of_data dn;
262 int err;
263
264 set_new_dnode(&dn, inode, NULL, NULL, 0);
265 err = get_dnode_of_data(&dn, index, 0);
266 if (err)
267 return ERR_PTR(err);
268
269 if (dn.data_blkaddr == NULL_ADDR) {
270 if (reserve_new_block(&dn)) {
271 f2fs_put_dnode(&dn);
272 return ERR_PTR(-ENOSPC);
273 }
274 }
275 f2fs_put_dnode(&dn);
276
277 page = grab_cache_page(mapping, index);
278 if (!page)
279 return ERR_PTR(-ENOMEM);
280
281 if (PageUptodate(page))
282 return page;
283
284 if (dn.data_blkaddr == NEW_ADDR) {
285 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
286 } else {
287 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
288 if (err) {
289 f2fs_put_page(page, 1);
290 return ERR_PTR(err);
291 }
292 }
293 SetPageUptodate(page);
294
295 if (new_i_size &&
296 i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
297 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
298 mark_inode_dirty_sync(inode);
299 }
300 return page;
301}
302
303static void read_end_io(struct bio *bio, int err)
304{
305 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
306 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
307
308 do {
309 struct page *page = bvec->bv_page;
310
311 if (--bvec >= bio->bi_io_vec)
312 prefetchw(&bvec->bv_page->flags);
313
314 if (uptodate) {
315 SetPageUptodate(page);
316 } else {
317 ClearPageUptodate(page);
318 SetPageError(page);
319 }
320 unlock_page(page);
321 } while (bvec >= bio->bi_io_vec);
322 kfree(bio->bi_private);
323 bio_put(bio);
324}
325
326/*
327 * Fill the locked page with data located in the block address.
328 * The read is synchronous if type is READ_SYNC; the caller must then unlock the page.
329 */
330int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
331 block_t blk_addr, int type)
332{
333 struct block_device *bdev = sbi->sb->s_bdev;
334 bool sync = (type == READ_SYNC);
335 struct bio *bio;
336
337	/* This page may already have been read by other threads */
338 if (PageUptodate(page)) {
339 if (!sync)
340 unlock_page(page);
341 return 0;
342 }
343
344 down_read(&sbi->bio_sem);
345
346 /* Allocate a new bio */
347 bio = f2fs_bio_alloc(bdev, 1);
348
349 /* Initialize the bio */
350 bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
351 bio->bi_end_io = read_end_io;
352
353 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
354 kfree(bio->bi_private);
355 bio_put(bio);
356 up_read(&sbi->bio_sem);
357 return -EFAULT;
358 }
359
360 submit_bio(type, bio);
361 up_read(&sbi->bio_sem);
362
363 /* wait for read completion if sync */
364 if (sync) {
365 lock_page(page);
366 if (PageError(page))
367 return -EIO;
368 }
369 return 0;
370}
371
372/*
373 * This function should be used by the data read flow only, since it
374 * does not check the "create" flag that indicates block allocation.
375 * The reason for this special functionality is to exploit the VFS
376 * readahead mechanism.
377 */
378static int get_data_block_ro(struct inode *inode, sector_t iblock,
379 struct buffer_head *bh_result, int create)
380{
381 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
382 unsigned maxblocks = bh_result->b_size >> blkbits;
383 struct dnode_of_data dn;
384 pgoff_t pgofs;
385 int err;
386
387 /* Get the page offset from the block offset(iblock) */
388 pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
389
390 if (check_extent_cache(inode, pgofs, bh_result))
391 return 0;
392
393 /* When reading holes, we need its node page */
394 set_new_dnode(&dn, inode, NULL, NULL, 0);
395 err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE);
396 if (err)
397 return (err == -ENOENT) ? 0 : err;
398
399 /* It does not support data allocation */
400 BUG_ON(create);
401
402 if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) {
403 int i;
404 unsigned int end_offset;
405
406 end_offset = IS_INODE(dn.node_page) ?
407 ADDRS_PER_INODE :
408 ADDRS_PER_BLOCK;
409
410 clear_buffer_new(bh_result);
411
412		/* Give more consecutive addresses for readahead */
413 for (i = 0; i < end_offset - dn.ofs_in_node; i++)
414 if (((datablock_addr(dn.node_page,
415 dn.ofs_in_node + i))
416 != (dn.data_blkaddr + i)) || maxblocks == i)
417 break;
418 map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
419 bh_result->b_size = (i << blkbits);
420 }
421 f2fs_put_dnode(&dn);
422 return 0;
423}
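/*
 * Editor's note, illustration only: the loop above follows the usual
 * get_block contract -- map the first block, then grow bh_result->b_size
 * to cover as many physically consecutive blocks as the caller asked for,
 * so mpage_readpage(s) can build one large bio instead of many small ones.
 * A userspace sketch of the consecutive-run count:
 */
#include <stdio.h>

static unsigned count_consecutive(const unsigned *blk, unsigned max)
{
	unsigned i;

	for (i = 1; i < max; i++)
		if (blk[i] != blk[0] + i)
			break;
	return i;	/* b_size would become i << blkbits */
}

int main(void)
{
	unsigned addrs[] = { 200, 201, 202, 500, 501 };

	printf("%u consecutive\n", count_consecutive(addrs, 5));	/* prints: 3 */
	return 0;
}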
424
425static int f2fs_read_data_page(struct file *file, struct page *page)
426{
427 return mpage_readpage(page, get_data_block_ro);
428}
429
430static int f2fs_read_data_pages(struct file *file,
431 struct address_space *mapping,
432 struct list_head *pages, unsigned nr_pages)
433{
434 return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro);
435}
436
437int do_write_data_page(struct page *page)
438{
439 struct inode *inode = page->mapping->host;
440 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
441 block_t old_blk_addr, new_blk_addr;
442 struct dnode_of_data dn;
443 int err = 0;
444
445 set_new_dnode(&dn, inode, NULL, NULL, 0);
446 err = get_dnode_of_data(&dn, page->index, RDONLY_NODE);
447 if (err)
448 return err;
449
450 old_blk_addr = dn.data_blkaddr;
451
452 /* This page is already truncated */
453 if (old_blk_addr == NULL_ADDR)
454 goto out_writepage;
455
456 set_page_writeback(page);
457
458 /*
459 * If the current allocation needs SSR,
460 * in-place writes are preferable for the updated data.
461 */
462 if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
463 need_inplace_update(inode)) {
464 rewrite_data_page(F2FS_SB(inode->i_sb), page,
465 old_blk_addr);
466 } else {
467 write_data_page(inode, page, &dn,
468 old_blk_addr, &new_blk_addr);
469 update_extent_cache(new_blk_addr, &dn);
470 F2FS_I(inode)->data_version =
471 le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
472 }
473out_writepage:
474 f2fs_put_dnode(&dn);
475 return err;
476}
477
478static int f2fs_write_data_page(struct page *page,
479 struct writeback_control *wbc)
480{
481 struct inode *inode = page->mapping->host;
482 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
483 loff_t i_size = i_size_read(inode);
484 const pgoff_t end_index = ((unsigned long long) i_size)
485 >> PAGE_CACHE_SHIFT;
486 unsigned offset;
487 int err = 0;
488
489 if (page->index < end_index)
490 goto out;
491
492 /*
493 * If the offset is out-of-range of file size,
494 * this page does not have to be written to disk.
495 */
496 offset = i_size & (PAGE_CACHE_SIZE - 1);
497 if ((page->index >= end_index + 1) || !offset) {
498 if (S_ISDIR(inode->i_mode)) {
499 dec_page_count(sbi, F2FS_DIRTY_DENTS);
500 inode_dec_dirty_dents(inode);
501 }
502 goto unlock_out;
503 }
504
505 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
506out:
507 if (sbi->por_doing)
508 goto redirty_out;
509
510 if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page))
511 goto redirty_out;
512
513 mutex_lock_op(sbi, DATA_WRITE);
514 if (S_ISDIR(inode->i_mode)) {
515 dec_page_count(sbi, F2FS_DIRTY_DENTS);
516 inode_dec_dirty_dents(inode);
517 }
518 err = do_write_data_page(page);
519 if (err && err != -ENOENT) {
520 wbc->pages_skipped++;
521 set_page_dirty(page);
522 }
523 mutex_unlock_op(sbi, DATA_WRITE);
524
525 if (wbc->for_reclaim)
526 f2fs_submit_bio(sbi, DATA, true);
527
528 if (err == -ENOENT)
529 goto unlock_out;
530
531 clear_cold_data(page);
532 unlock_page(page);
533
534 if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode))
535 f2fs_balance_fs(sbi);
536 return 0;
537
538unlock_out:
539 unlock_page(page);
540 return (err == -ENOENT) ? 0 : err;
541
542redirty_out:
543 wbc->pages_skipped++;
544 set_page_dirty(page);
545 return AOP_WRITEPAGE_ACTIVATE;
546}
547
548#define MAX_DESIRED_PAGES_WP 4096
549
550static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
551 void *data)
552{
553 struct address_space *mapping = data;
554 int ret = mapping->a_ops->writepage(page, wbc);
555 mapping_set_error(mapping, ret);
556 return ret;
557}
558
559static int f2fs_write_data_pages(struct address_space *mapping,
560 struct writeback_control *wbc)
561{
562 struct inode *inode = mapping->host;
563 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
564 int ret;
565 long excess_nrtw = 0, desired_nrtw;
566
567 if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) {
568 desired_nrtw = MAX_DESIRED_PAGES_WP;
569 excess_nrtw = desired_nrtw - wbc->nr_to_write;
570 wbc->nr_to_write = desired_nrtw;
571 }
572
573 if (!S_ISDIR(inode->i_mode))
574 mutex_lock(&sbi->writepages);
575 ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
576 if (!S_ISDIR(inode->i_mode))
577 mutex_unlock(&sbi->writepages);
578 f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
579
580 remove_dirty_dir_inode(inode);
581
582 wbc->nr_to_write -= excess_nrtw;
583 return ret;
584}
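/*
 * Editor's note, illustration only: the nr_to_write handling above boosts
 * a small writeback request up to MAX_DESIRED_PAGES_WP (4096); on return
 * the extra budget (excess_nrtw) is subtracted back out, so the caller's
 * accounting stays relative to its original request.  E.g. a request for
 * 16 pages runs with nr_to_write = 4096 and has 4080 subtracted at the end.
 */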
585
586static int f2fs_write_begin(struct file *file, struct address_space *mapping,
587 loff_t pos, unsigned len, unsigned flags,
588 struct page **pagep, void **fsdata)
589{
590 struct inode *inode = mapping->host;
591 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
592 struct page *page;
593 pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
594 struct dnode_of_data dn;
595 int err = 0;
596
597 /* for nobh_write_end */
598 *fsdata = NULL;
599
600 f2fs_balance_fs(sbi);
601
602 page = grab_cache_page_write_begin(mapping, index, flags);
603 if (!page)
604 return -ENOMEM;
605 *pagep = page;
606
607 mutex_lock_op(sbi, DATA_NEW);
608
609 set_new_dnode(&dn, inode, NULL, NULL, 0);
610 err = get_dnode_of_data(&dn, index, 0);
611 if (err) {
612 mutex_unlock_op(sbi, DATA_NEW);
613 f2fs_put_page(page, 1);
614 return err;
615 }
616
617 if (dn.data_blkaddr == NULL_ADDR) {
618 err = reserve_new_block(&dn);
619 if (err) {
620 f2fs_put_dnode(&dn);
621 mutex_unlock_op(sbi, DATA_NEW);
622 f2fs_put_page(page, 1);
623 return err;
624 }
625 }
626 f2fs_put_dnode(&dn);
627
628 mutex_unlock_op(sbi, DATA_NEW);
629
630 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
631 return 0;
632
633 if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
634 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
635 unsigned end = start + len;
636
637 /* Reading beyond i_size is simple: memset to zero */
638 zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
639 return 0;
640 }
641
642 if (dn.data_blkaddr == NEW_ADDR) {
643 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
644 } else {
645 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
646 if (err) {
647 f2fs_put_page(page, 1);
648 return err;
649 }
650 }
651 SetPageUptodate(page);
652 clear_cold_data(page);
653 return 0;
654}
655
656static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
657 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
658{
659 struct file *file = iocb->ki_filp;
660 struct inode *inode = file->f_mapping->host;
661
662 if (rw == WRITE)
663 return 0;
664
665 /* Needs synchronization with the cleaner */
666 return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
667 get_data_block_ro);
668}
669
670static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
671{
672 struct inode *inode = page->mapping->host;
673 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
674 if (S_ISDIR(inode->i_mode) && PageDirty(page)) {
675 dec_page_count(sbi, F2FS_DIRTY_DENTS);
676 inode_dec_dirty_dents(inode);
677 }
678 ClearPagePrivate(page);
679}
680
681static int f2fs_release_data_page(struct page *page, gfp_t wait)
682{
683 ClearPagePrivate(page);
684 return 0;
685}
686
687static int f2fs_set_data_page_dirty(struct page *page)
688{
689 struct address_space *mapping = page->mapping;
690 struct inode *inode = mapping->host;
691
692 SetPageUptodate(page);
693 if (!PageDirty(page)) {
694 __set_page_dirty_nobuffers(page);
695 set_dirty_dir_page(inode, page);
696 return 1;
697 }
698 return 0;
699}
700
701static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
702{
703 return generic_block_bmap(mapping, block, get_data_block_ro);
704}
705
706const struct address_space_operations f2fs_dblock_aops = {
707 .readpage = f2fs_read_data_page,
708 .readpages = f2fs_read_data_pages,
709 .writepage = f2fs_write_data_page,
710 .writepages = f2fs_write_data_pages,
711 .write_begin = f2fs_write_begin,
712 .write_end = nobh_write_end,
713 .set_page_dirty = f2fs_set_data_page_dirty,
714 .invalidatepage = f2fs_invalidate_data_page,
715 .releasepage = f2fs_release_data_page,
716 .direct_IO = f2fs_direct_IO,
717 .bmap = f2fs_bmap,
718};
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
new file mode 100644
index 000000000000..c8c37307b326
--- /dev/null
+++ b/fs/f2fs/debug.c
@@ -0,0 +1,353 @@
1/*
2 * f2fs debugging statistics
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 * Copyright (c) 2012 Linux Foundation
7 * Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */
13
14#include <linux/fs.h>
15#include <linux/backing-dev.h>
16#include <linux/proc_fs.h>
17#include <linux/f2fs_fs.h>
18#include <linux/blkdev.h>
19#include <linux/debugfs.h>
20#include <linux/seq_file.h>
21
22#include "f2fs.h"
23#include "node.h"
24#include "segment.h"
25#include "gc.h"
26
27static LIST_HEAD(f2fs_stat_list);
28static struct dentry *debugfs_root;
29static DEFINE_MUTEX(f2fs_stat_mutex);
30
31static void update_general_status(struct f2fs_sb_info *sbi)
32{
33 struct f2fs_stat_info *si = sbi->stat_info;
34 int i;
35
36	/* validity check of the segment numbers */
37 si->hit_ext = sbi->read_hit_ext;
38 si->total_ext = sbi->total_hit_ext;
39 si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
40 si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
41 si->ndirty_dirs = sbi->n_dirty_dirs;
42 si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
43 si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
44 si->rsvd_segs = reserved_segments(sbi);
45 si->overp_segs = overprovision_segments(sbi);
46 si->valid_count = valid_user_blocks(sbi);
47 si->valid_node_count = valid_node_count(sbi);
48 si->valid_inode_count = valid_inode_count(sbi);
49 si->utilization = utilization(sbi);
50
51 si->free_segs = free_segments(sbi);
52 si->free_secs = free_sections(sbi);
53 si->prefree_count = prefree_segments(sbi);
54 si->dirty_count = dirty_segments(sbi);
55 si->node_pages = sbi->node_inode->i_mapping->nrpages;
56 si->meta_pages = sbi->meta_inode->i_mapping->nrpages;
57 si->nats = NM_I(sbi)->nat_cnt;
58 si->sits = SIT_I(sbi)->dirty_sentries;
59 si->fnids = NM_I(sbi)->fcnt;
60 si->bg_gc = sbi->bg_gc;
61 si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
62 * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
63 / 2;
64 si->util_valid = (int)(written_block_count(sbi) >>
65 sbi->log_blocks_per_seg)
66 * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
67 / 2;
68 si->util_invalid = 50 - si->util_free - si->util_valid;
69 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
70 struct curseg_info *curseg = CURSEG_I(sbi, i);
71 si->curseg[i] = curseg->segno;
72 si->cursec[i] = curseg->segno / sbi->segs_per_sec;
73 si->curzone[i] = si->cursec[i] / sbi->secs_per_zone;
74 }
75
76 for (i = 0; i < 2; i++) {
77 si->segment_count[i] = sbi->segment_count[i];
78 si->block_count[i] = sbi->block_count[i];
79 }
80}
81
82/*
83 * This function calculates the BDF of every segment.
84 */
85static void update_sit_info(struct f2fs_sb_info *sbi)
86{
87 struct f2fs_stat_info *si = sbi->stat_info;
88 unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
89 struct sit_info *sit_i = SIT_I(sbi);
90 unsigned int segno, vblocks;
91 int ndirty = 0;
92
93 bimodal = 0;
94 total_vblocks = 0;
95 blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
96 hblks_per_sec = blks_per_sec / 2;
97 mutex_lock(&sit_i->sentry_lock);
98 for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
99 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
100 dist = abs(vblocks - hblks_per_sec);
101 bimodal += dist * dist;
102
103 if (vblocks > 0 && vblocks < blks_per_sec) {
104 total_vblocks += vblocks;
105 ndirty++;
106 }
107 }
108 mutex_unlock(&sit_i->sentry_lock);
109 dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100;
110 si->bimodal = bimodal / dist;
111 if (si->dirty_count)
112 si->avg_vblocks = total_vblocks / ndirty;
113 else
114 si->avg_vblocks = 0;
115}
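/*
 * Editor's note, illustration only: with hblks_per_sec = blks_per_sec / 2,
 * the loop above sums dist^2 = (vblocks - hblks_per_sec)^2 over all
 * sections and divides by total_sections * hblks_per_sec^2 / 100.  The
 * result is a 0..100 bimodality figure: 0 when every section is exactly
 * half valid, 100 when every section is completely full or completely
 * empty.
 */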
116
117/*
118 * This function calculates the memory footprint.
119 */
120static void update_mem_info(struct f2fs_sb_info *sbi)
121{
122 struct f2fs_stat_info *si = sbi->stat_info;
123 unsigned npages;
124
125 if (si->base_mem)
126 goto get_cache;
127
128 si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
129 si->base_mem += 2 * sizeof(struct f2fs_inode_info);
130 si->base_mem += sizeof(*sbi->ckpt);
131
132 /* build sm */
133 si->base_mem += sizeof(struct f2fs_sm_info);
134
135 /* build sit */
136 si->base_mem += sizeof(struct sit_info);
137 si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry);
138 si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
139 si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
140 if (sbi->segs_per_sec > 1)
141 si->base_mem += sbi->total_sections *
142 sizeof(struct sec_entry);
143 si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
144
145 /* build free segmap */
146 si->base_mem += sizeof(struct free_segmap_info);
147 si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
148 si->base_mem += f2fs_bitmap_size(sbi->total_sections);
149
150 /* build curseg */
151 si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
152 si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE;
153
154 /* build dirty segmap */
155 si->base_mem += sizeof(struct dirty_seglist_info);
156 si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
157 si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi));
158
159	/* build nm */
160 si->base_mem += sizeof(struct f2fs_nm_info);
161 si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
162
163 /* build gc */
164 si->base_mem += sizeof(struct f2fs_gc_kthread);
165
166get_cache:
167 /* free nids */
168 si->cache_mem = NM_I(sbi)->fcnt;
169 si->cache_mem += NM_I(sbi)->nat_cnt;
170 npages = sbi->node_inode->i_mapping->nrpages;
171 si->cache_mem += npages << PAGE_CACHE_SHIFT;
172 npages = sbi->meta_inode->i_mapping->nrpages;
173 si->cache_mem += npages << PAGE_CACHE_SHIFT;
174 si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry);
175 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
176}
177
178static int stat_show(struct seq_file *s, void *v)
179{
180 struct f2fs_stat_info *si, *next;
181 int i = 0;
182 int j;
183
184 mutex_lock(&f2fs_stat_mutex);
185 list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
186
187 update_general_status(si->sbi);
188
189 seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++);
190 seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
191 si->sit_area_segs, si->nat_area_segs);
192 seq_printf(s, "[SSA: %d] [MAIN: %d",
193 si->ssa_area_segs, si->main_area_segs);
194 seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
195 si->overp_segs, si->rsvd_segs);
196 seq_printf(s, "Utilization: %d%% (%d valid blocks)\n",
197 si->utilization, si->valid_count);
198 seq_printf(s, " - Node: %u (Inode: %u, ",
199 si->valid_node_count, si->valid_inode_count);
200 seq_printf(s, "Other: %u)\n - Data: %u\n",
201 si->valid_node_count - si->valid_inode_count,
202 si->valid_count - si->valid_node_count);
203 seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
204 si->main_area_segs, si->main_area_sections,
205 si->main_area_zones);
206 seq_printf(s, " - COLD data: %d, %d, %d\n",
207 si->curseg[CURSEG_COLD_DATA],
208 si->cursec[CURSEG_COLD_DATA],
209 si->curzone[CURSEG_COLD_DATA]);
210 seq_printf(s, " - WARM data: %d, %d, %d\n",
211 si->curseg[CURSEG_WARM_DATA],
212 si->cursec[CURSEG_WARM_DATA],
213 si->curzone[CURSEG_WARM_DATA]);
214 seq_printf(s, " - HOT data: %d, %d, %d\n",
215 si->curseg[CURSEG_HOT_DATA],
216 si->cursec[CURSEG_HOT_DATA],
217 si->curzone[CURSEG_HOT_DATA]);
218 seq_printf(s, " - Dir dnode: %d, %d, %d\n",
219 si->curseg[CURSEG_HOT_NODE],
220 si->cursec[CURSEG_HOT_NODE],
221 si->curzone[CURSEG_HOT_NODE]);
222 seq_printf(s, " - File dnode: %d, %d, %d\n",
223 si->curseg[CURSEG_WARM_NODE],
224 si->cursec[CURSEG_WARM_NODE],
225 si->curzone[CURSEG_WARM_NODE]);
226 seq_printf(s, " - Indir nodes: %d, %d, %d\n",
227 si->curseg[CURSEG_COLD_NODE],
228 si->cursec[CURSEG_COLD_NODE],
229 si->curzone[CURSEG_COLD_NODE]);
230 seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n",
231 si->main_area_segs - si->dirty_count -
232 si->prefree_count - si->free_segs,
233 si->dirty_count);
234 seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n",
235 si->prefree_count, si->free_segs, si->free_secs);
236 seq_printf(s, "GC calls: %d (BG: %d)\n",
237 si->call_count, si->bg_gc);
238 seq_printf(s, " - data segments : %d\n", si->data_segs);
239 seq_printf(s, " - node segments : %d\n", si->node_segs);
240 seq_printf(s, "Try to move %d blocks\n", si->tot_blks);
241 seq_printf(s, " - data blocks : %d\n", si->data_blks);
242 seq_printf(s, " - node blocks : %d\n", si->node_blks);
243 seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
244 si->hit_ext, si->total_ext);
245 seq_printf(s, "\nBalancing F2FS Async:\n");
246 seq_printf(s, " - nodes %4d in %4d\n",
247 si->ndirty_node, si->node_pages);
248 seq_printf(s, " - dents %4d in dirs:%4d\n",
249 si->ndirty_dent, si->ndirty_dirs);
250 seq_printf(s, " - meta %4d in %4d\n",
251 si->ndirty_meta, si->meta_pages);
252 seq_printf(s, " - NATs %5d > %lu\n",
253 si->nats, NM_WOUT_THRESHOLD);
254 seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
255 si->sits, si->fnids);
256 seq_printf(s, "\nDistribution of User Blocks:");
257 seq_printf(s, " [ valid | invalid | free ]\n");
258 seq_printf(s, " [");
259
260 for (j = 0; j < si->util_valid; j++)
261 seq_printf(s, "-");
262 seq_printf(s, "|");
263
264 for (j = 0; j < si->util_invalid; j++)
265 seq_printf(s, "-");
266 seq_printf(s, "|");
267
268 for (j = 0; j < si->util_free; j++)
269 seq_printf(s, "-");
270 seq_printf(s, "]\n\n");
271 seq_printf(s, "SSR: %u blocks in %u segments\n",
272 si->block_count[SSR], si->segment_count[SSR]);
273 seq_printf(s, "LFS: %u blocks in %u segments\n",
274 si->block_count[LFS], si->segment_count[LFS]);
275
276 /* segment usage info */
277 update_sit_info(si->sbi);
278 seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n",
279 si->bimodal, si->avg_vblocks);
280
281 /* memory footprint */
282 update_mem_info(si->sbi);
283 seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n",
284 (si->base_mem + si->cache_mem) >> 10,
285 si->base_mem >> 10, si->cache_mem >> 10);
286 }
287 mutex_unlock(&f2fs_stat_mutex);
288 return 0;
289}
290
291static int stat_open(struct inode *inode, struct file *file)
292{
293 return single_open(file, stat_show, inode->i_private);
294}
295
296static const struct file_operations stat_fops = {
297 .open = stat_open,
298 .read = seq_read,
299 .llseek = seq_lseek,
300 .release = single_release,
301};
302
303int f2fs_build_stats(struct f2fs_sb_info *sbi)
304{
305 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
306 struct f2fs_stat_info *si;
307
308 sbi->stat_info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL);
309 if (!sbi->stat_info)
310 return -ENOMEM;
311
312 si = sbi->stat_info;
313 si->all_area_segs = le32_to_cpu(raw_super->segment_count);
314 si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit);
315 si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat);
316 si->ssa_area_segs = le32_to_cpu(raw_super->segment_count_ssa);
317 si->main_area_segs = le32_to_cpu(raw_super->segment_count_main);
318 si->main_area_sections = le32_to_cpu(raw_super->section_count);
319 si->main_area_zones = si->main_area_sections /
320 le32_to_cpu(raw_super->secs_per_zone);
321 si->sbi = sbi;
322
323 mutex_lock(&f2fs_stat_mutex);
324 list_add_tail(&si->stat_list, &f2fs_stat_list);
325 mutex_unlock(&f2fs_stat_mutex);
326
327 return 0;
328}
329
330void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
331{
332 struct f2fs_stat_info *si = sbi->stat_info;
333
334 mutex_lock(&f2fs_stat_mutex);
335 list_del(&si->stat_list);
336 mutex_unlock(&f2fs_stat_mutex);
337
338 kfree(sbi->stat_info);
339}
340
341void __init f2fs_create_root_stats(void)
342{
343 debugfs_root = debugfs_create_dir("f2fs", NULL);
344 if (debugfs_root)
345 debugfs_create_file("status", S_IRUGO, debugfs_root,
346 NULL, &stat_fops);
347}
348
349void f2fs_destroy_root_stats(void)
350{
351 debugfs_remove_recursive(debugfs_root);
352 debugfs_root = NULL;
353}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
new file mode 100644
index 000000000000..989980e16d0b
--- /dev/null
+++ b/fs/f2fs/dir.c
@@ -0,0 +1,674 @@
1/*
2 * fs/f2fs/dir.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include "f2fs.h"
14#include "node.h"
15#include "acl.h"
16
17static unsigned long dir_blocks(struct inode *inode)
18{
19 return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1))
20 >> PAGE_CACHE_SHIFT;
21}
22
23static unsigned int dir_buckets(unsigned int level)
24{
25 if (level < MAX_DIR_HASH_DEPTH / 2)
26 return 1 << level;
27 else
28 return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1);
29}
30
31static unsigned int bucket_blocks(unsigned int level)
32{
33 if (level < MAX_DIR_HASH_DEPTH / 2)
34 return 2;
35 else
36 return 4;
37}
38
39static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
40 [F2FS_FT_UNKNOWN] = DT_UNKNOWN,
41 [F2FS_FT_REG_FILE] = DT_REG,
42 [F2FS_FT_DIR] = DT_DIR,
43 [F2FS_FT_CHRDEV] = DT_CHR,
44 [F2FS_FT_BLKDEV] = DT_BLK,
45 [F2FS_FT_FIFO] = DT_FIFO,
46 [F2FS_FT_SOCK] = DT_SOCK,
47 [F2FS_FT_SYMLINK] = DT_LNK,
48};
49
50#define S_SHIFT 12
51static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
52 [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE,
53 [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR,
54 [S_IFCHR >> S_SHIFT] = F2FS_FT_CHRDEV,
55 [S_IFBLK >> S_SHIFT] = F2FS_FT_BLKDEV,
56 [S_IFIFO >> S_SHIFT] = F2FS_FT_FIFO,
57 [S_IFSOCK >> S_SHIFT] = F2FS_FT_SOCK,
58 [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK,
59};
60
61static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
62{
63 mode_t mode = inode->i_mode;
64 de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
65}
66
67static unsigned long dir_block_index(unsigned int level, unsigned int idx)
68{
69 unsigned long i;
70 unsigned long bidx = 0;
71
72 for (i = 0; i < level; i++)
73 bidx += dir_buckets(i) * bucket_blocks(i);
74 bidx += idx * bucket_blocks(level);
75 return bidx;
76}
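/*
 * Editor's sketch, illustration only: the multi-level hash layout above,
 * modelled in userspace.  DEPTH stands in for MAX_DIR_HASH_DEPTH and its
 * value here is an assumption; level L below DEPTH/2 holds 1 << L buckets
 * of two blocks, while deeper levels hold a fixed number of four-block
 * buckets.
 */
#include <stdio.h>

#define DEPTH 64	/* assumption, standing in for MAX_DIR_HASH_DEPTH */

static unsigned long buckets(unsigned level)
{
	return level < DEPTH / 2 ? 1UL << level : 1UL << (DEPTH / 2 - 1);
}

static unsigned long bblocks(unsigned level)
{
	return level < DEPTH / 2 ? 2 : 4;
}

static unsigned long block_index(unsigned level, unsigned long idx)
{
	unsigned long bidx = 0;
	unsigned i;

	for (i = 0; i < level; i++)
		bidx += buckets(i) * bblocks(i);
	return bidx + idx * bblocks(level);
}

int main(void)
{
	/* level 0: one bucket at block 0; level 1: buckets at blocks 2 and 4 */
	printf("%lu %lu %lu\n", block_index(0, 0),
	       block_index(1, 0), block_index(1, 1));	/* prints: 0 2 4 */
	return 0;
}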
77
78static bool early_match_name(const char *name, size_t namelen,
79 f2fs_hash_t namehash, struct f2fs_dir_entry *de)
80{
81 if (le16_to_cpu(de->name_len) != namelen)
82 return false;
83
84 if (de->hash_code != namehash)
85 return false;
86
87 return true;
88}
89
90static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
91 const char *name, size_t namelen, int *max_slots,
92 f2fs_hash_t namehash, struct page **res_page)
93{
94 struct f2fs_dir_entry *de;
95 unsigned long bit_pos, end_pos, next_pos;
96 struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
97 int slots;
98
99 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
100 NR_DENTRY_IN_BLOCK, 0);
101 while (bit_pos < NR_DENTRY_IN_BLOCK) {
102 de = &dentry_blk->dentry[bit_pos];
103 slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
104
105 if (early_match_name(name, namelen, namehash, de)) {
106 if (!memcmp(dentry_blk->filename[bit_pos],
107 name, namelen)) {
108 *res_page = dentry_page;
109 goto found;
110 }
111 }
112 next_pos = bit_pos + slots;
113 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
114 NR_DENTRY_IN_BLOCK, next_pos);
115 if (bit_pos >= NR_DENTRY_IN_BLOCK)
116 end_pos = NR_DENTRY_IN_BLOCK;
117 else
118 end_pos = bit_pos;
119 if (*max_slots < end_pos - next_pos)
120 *max_slots = end_pos - next_pos;
121 }
122
123 de = NULL;
124 kunmap(dentry_page);
125found:
126 return de;
127}
128
129static struct f2fs_dir_entry *find_in_level(struct inode *dir,
130 unsigned int level, const char *name, size_t namelen,
131 f2fs_hash_t namehash, struct page **res_page)
132{
133 int s = GET_DENTRY_SLOTS(namelen);
134 unsigned int nbucket, nblock;
135 unsigned int bidx, end_block;
136 struct page *dentry_page;
137 struct f2fs_dir_entry *de = NULL;
138 bool room = false;
139 int max_slots = 0;
140
141 BUG_ON(level > MAX_DIR_HASH_DEPTH);
142
143 nbucket = dir_buckets(level);
144 nblock = bucket_blocks(level);
145
146 bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket);
147 end_block = bidx + nblock;
148
149 for (; bidx < end_block; bidx++) {
150 /* no need to allocate new dentry pages to all the indices */
151 dentry_page = find_data_page(dir, bidx);
152 if (IS_ERR(dentry_page)) {
153 room = true;
154 continue;
155 }
156
157 de = find_in_block(dentry_page, name, namelen,
158 &max_slots, namehash, res_page);
159 if (de)
160 break;
161
162 if (max_slots >= s)
163 room = true;
164 f2fs_put_page(dentry_page, 0);
165 }
166
167 if (!de && room && F2FS_I(dir)->chash != namehash) {
168 F2FS_I(dir)->chash = namehash;
169 F2FS_I(dir)->clevel = level;
170 }
171
172 return de;
173}
174
175/*
176 * Find an entry in the specified directory with the wanted name.
177 * It returns the page where the entry was found (as a parameter - res_page),
178 * and the entry itself. Page is returned mapped and unlocked.
179 * Entry is guaranteed to be valid.
180 */
181struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
182 struct qstr *child, struct page **res_page)
183{
184 const char *name = child->name;
185 size_t namelen = child->len;
186 unsigned long npages = dir_blocks(dir);
187 struct f2fs_dir_entry *de = NULL;
188 f2fs_hash_t name_hash;
189 unsigned int max_depth;
190 unsigned int level;
191
192 if (npages == 0)
193 return NULL;
194
195 *res_page = NULL;
196
197 name_hash = f2fs_dentry_hash(name, namelen);
198 max_depth = F2FS_I(dir)->i_current_depth;
199
200 for (level = 0; level < max_depth; level++) {
201 de = find_in_level(dir, level, name,
202 namelen, name_hash, res_page);
203 if (de)
204 break;
205 }
206 if (!de && F2FS_I(dir)->chash != name_hash) {
207 F2FS_I(dir)->chash = name_hash;
208 F2FS_I(dir)->clevel = level - 1;
209 }
210 return de;
211}
212
213struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
214{
215 struct page *page = NULL;
216 struct f2fs_dir_entry *de = NULL;
217 struct f2fs_dentry_block *dentry_blk = NULL;
218
219 page = get_lock_data_page(dir, 0);
220 if (IS_ERR(page))
221 return NULL;
222
223 dentry_blk = kmap(page);
224 de = &dentry_blk->dentry[1];
225 *p = page;
226 unlock_page(page);
227 return de;
228}
229
230ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
231{
232 ino_t res = 0;
233 struct f2fs_dir_entry *de;
234 struct page *page;
235
236 de = f2fs_find_entry(dir, qstr, &page);
237 if (de) {
238 res = le32_to_cpu(de->ino);
239 kunmap(page);
240 f2fs_put_page(page, 0);
241 }
242
243 return res;
244}
245
246void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
247 struct page *page, struct inode *inode)
248{
249 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
250
251 mutex_lock_op(sbi, DENTRY_OPS);
252 lock_page(page);
253 wait_on_page_writeback(page);
254 de->ino = cpu_to_le32(inode->i_ino);
255 set_de_type(de, inode);
256 kunmap(page);
257 set_page_dirty(page);
258 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
259 mark_inode_dirty(dir);
260
261 /* update parent inode number before releasing dentry page */
262 F2FS_I(inode)->i_pino = dir->i_ino;
263
264 f2fs_put_page(page, 1);
265 mutex_unlock_op(sbi, DENTRY_OPS);
266}
267
268void init_dent_inode(struct dentry *dentry, struct page *ipage)
269{
270 struct f2fs_node *rn;
271
272 if (IS_ERR(ipage))
273 return;
274
275 wait_on_page_writeback(ipage);
276
277	/* copy dentry info to this inode page */
278 rn = (struct f2fs_node *)page_address(ipage);
279 rn->i.i_namelen = cpu_to_le32(dentry->d_name.len);
280 memcpy(rn->i.i_name, dentry->d_name.name, dentry->d_name.len);
281 set_page_dirty(ipage);
282}
283
284static int init_inode_metadata(struct inode *inode, struct dentry *dentry)
285{
286 struct inode *dir = dentry->d_parent->d_inode;
287
288 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
289 int err;
290 err = new_inode_page(inode, dentry);
291 if (err)
292 return err;
293
294 if (S_ISDIR(inode->i_mode)) {
295 err = f2fs_make_empty(inode, dir);
296 if (err) {
297 remove_inode_page(inode);
298 return err;
299 }
300 }
301
302 err = f2fs_init_acl(inode, dir);
303 if (err) {
304 remove_inode_page(inode);
305 return err;
306 }
307 } else {
308 struct page *ipage;
309 ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
310 if (IS_ERR(ipage))
311 return PTR_ERR(ipage);
312 set_cold_node(inode, ipage);
313 init_dent_inode(dentry, ipage);
314 f2fs_put_page(ipage, 1);
315 }
316 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
317 inc_nlink(inode);
318 f2fs_write_inode(inode, NULL);
319 }
320 return 0;
321}
322
323static void update_parent_metadata(struct inode *dir, struct inode *inode,
324 unsigned int current_depth)
325{
326 bool need_dir_update = false;
327
328 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
329 if (S_ISDIR(inode->i_mode)) {
330 inc_nlink(dir);
331 need_dir_update = true;
332 }
333 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
334 }
335 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
336 if (F2FS_I(dir)->i_current_depth != current_depth) {
337 F2FS_I(dir)->i_current_depth = current_depth;
338 need_dir_update = true;
339 }
340
341 if (need_dir_update)
342 f2fs_write_inode(dir, NULL);
343 else
344 mark_inode_dirty(dir);
345
346 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
347 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
348}
349
350static int room_for_filename(struct f2fs_dentry_block *dentry_blk, int slots)
351{
352 int bit_start = 0;
353 int zero_start, zero_end;
354next:
355 zero_start = find_next_zero_bit_le(&dentry_blk->dentry_bitmap,
356 NR_DENTRY_IN_BLOCK,
357 bit_start);
358 if (zero_start >= NR_DENTRY_IN_BLOCK)
359 return NR_DENTRY_IN_BLOCK;
360
361 zero_end = find_next_bit_le(&dentry_blk->dentry_bitmap,
362 NR_DENTRY_IN_BLOCK,
363 zero_start);
364 if (zero_end - zero_start >= slots)
365 return zero_start;
366
367 bit_start = zero_end + 1;
368
369 if (zero_end + 1 >= NR_DENTRY_IN_BLOCK)
370 return NR_DENTRY_IN_BLOCK;
371 goto next;
372}
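/*
 * Editor's sketch, illustration only: room_for_filename() above is a
 * first-fit search for a run of 'slots' clear bits.  The same idea on a
 * plain byte-per-entry map:
 */
#include <stdio.h>

static int first_fit(const char *used, int n, int slots)
{
	int start = 0, end;

	while (start < n) {
		while (start < n && used[start])
			start++;		/* skip occupied entries */
		end = start;
		while (end < n && !used[end])
			end++;			/* measure the free run */
		if (end - start >= slots)
			return start;
		start = end + 1;
	}
	return n;	/* no room, like returning NR_DENTRY_IN_BLOCK */
}

int main(void)
{
	char used[8] = { 1, 1, 0, 1, 0, 0, 0, 1 };

	printf("%d\n", first_fit(used, 8, 3));	/* prints: 4 */
	return 0;
}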
373
374int f2fs_add_link(struct dentry *dentry, struct inode *inode)
375{
376 unsigned int bit_pos;
377 unsigned int level;
378 unsigned int current_depth;
379 unsigned long bidx, block;
380 f2fs_hash_t dentry_hash;
381 struct f2fs_dir_entry *de;
382 unsigned int nbucket, nblock;
383 struct inode *dir = dentry->d_parent->d_inode;
384 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
385 const char *name = dentry->d_name.name;
386 size_t namelen = dentry->d_name.len;
387 struct page *dentry_page = NULL;
388 struct f2fs_dentry_block *dentry_blk = NULL;
389 int slots = GET_DENTRY_SLOTS(namelen);
390 int err = 0;
391 int i;
392
393 dentry_hash = f2fs_dentry_hash(name, dentry->d_name.len);
394 level = 0;
395 current_depth = F2FS_I(dir)->i_current_depth;
396 if (F2FS_I(dir)->chash == dentry_hash) {
397 level = F2FS_I(dir)->clevel;
398 F2FS_I(dir)->chash = 0;
399 }
400
401start:
402 if (current_depth == MAX_DIR_HASH_DEPTH)
403 return -ENOSPC;
404
405 /* Increase the depth, if required */
406 if (level == current_depth)
407 ++current_depth;
408
409 nbucket = dir_buckets(level);
410 nblock = bucket_blocks(level);
411
412 bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
413
414 for (block = bidx; block <= (bidx + nblock - 1); block++) {
415 mutex_lock_op(sbi, DENTRY_OPS);
416 dentry_page = get_new_data_page(dir, block, true);
417 if (IS_ERR(dentry_page)) {
418 mutex_unlock_op(sbi, DENTRY_OPS);
419 return PTR_ERR(dentry_page);
420 }
421
422 dentry_blk = kmap(dentry_page);
423 bit_pos = room_for_filename(dentry_blk, slots);
424 if (bit_pos < NR_DENTRY_IN_BLOCK)
425 goto add_dentry;
426
427 kunmap(dentry_page);
428 f2fs_put_page(dentry_page, 1);
429 mutex_unlock_op(sbi, DENTRY_OPS);
430 }
431
432 /* Move to next level to find the empty slot for new dentry */
433 ++level;
434 goto start;
435add_dentry:
436 err = init_inode_metadata(inode, dentry);
437 if (err)
438 goto fail;
439
440 wait_on_page_writeback(dentry_page);
441
442 de = &dentry_blk->dentry[bit_pos];
443 de->hash_code = dentry_hash;
444 de->name_len = cpu_to_le16(namelen);
445 memcpy(dentry_blk->filename[bit_pos], name, namelen);
446 de->ino = cpu_to_le32(inode->i_ino);
447 set_de_type(de, inode);
448 for (i = 0; i < slots; i++)
449 test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
450 set_page_dirty(dentry_page);
451
452 update_parent_metadata(dir, inode, current_depth);
453
454 /* update parent inode number before releasing dentry page */
455 F2FS_I(inode)->i_pino = dir->i_ino;
456fail:
457 kunmap(dentry_page);
458 f2fs_put_page(dentry_page, 1);
459 mutex_unlock_op(sbi, DENTRY_OPS);
460 return err;
461}
462
463/*
464 * It only removes the dentry from the dentry page; the corresponding
465 * name entry in the name page does not need to be touched during deletion.
466 */
467void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
468 struct inode *inode)
469{
470 struct f2fs_dentry_block *dentry_blk;
471 unsigned int bit_pos;
472 struct address_space *mapping = page->mapping;
473 struct inode *dir = mapping->host;
474 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
475 int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
476 void *kaddr = page_address(page);
477 int i;
478
479 mutex_lock_op(sbi, DENTRY_OPS);
480
481 lock_page(page);
482 wait_on_page_writeback(page);
483
484 dentry_blk = (struct f2fs_dentry_block *)kaddr;
485 bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry;
486 for (i = 0; i < slots; i++)
487 test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
488
489 /* Let's check and deallocate this dentry page */
490 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
491 NR_DENTRY_IN_BLOCK,
492 0);
493 kunmap(page); /* kunmap - pair of f2fs_find_entry */
494 set_page_dirty(page);
495
496 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
497
498 if (inode && S_ISDIR(inode->i_mode)) {
499 drop_nlink(dir);
500 f2fs_write_inode(dir, NULL);
501 } else {
502 mark_inode_dirty(dir);
503 }
504
505 if (inode) {
506 inode->i_ctime = CURRENT_TIME;
507 drop_nlink(inode);
508 if (S_ISDIR(inode->i_mode)) {
509 drop_nlink(inode);
510 i_size_write(inode, 0);
511 }
512 f2fs_write_inode(inode, NULL);
513 if (inode->i_nlink == 0)
514 add_orphan_inode(sbi, inode->i_ino);
515 }
516
517 if (bit_pos == NR_DENTRY_IN_BLOCK) {
518 truncate_hole(dir, page->index, page->index + 1);
519 clear_page_dirty_for_io(page);
520 ClearPageUptodate(page);
521 dec_page_count(sbi, F2FS_DIRTY_DENTS);
522 inode_dec_dirty_dents(dir);
523 }
524 f2fs_put_page(page, 1);
525
526 mutex_unlock_op(sbi, DENTRY_OPS);
527}
528
529int f2fs_make_empty(struct inode *inode, struct inode *parent)
530{
531 struct page *dentry_page;
532 struct f2fs_dentry_block *dentry_blk;
533 struct f2fs_dir_entry *de;
534 void *kaddr;
535
536 dentry_page = get_new_data_page(inode, 0, true);
537 if (IS_ERR(dentry_page))
538 return PTR_ERR(dentry_page);
539
540 kaddr = kmap_atomic(dentry_page);
541 dentry_blk = (struct f2fs_dentry_block *)kaddr;
542
543 de = &dentry_blk->dentry[0];
544 de->name_len = cpu_to_le16(1);
545 de->hash_code = f2fs_dentry_hash(".", 1);
546 de->ino = cpu_to_le32(inode->i_ino);
547 memcpy(dentry_blk->filename[0], ".", 1);
548 set_de_type(de, inode);
549
550 de = &dentry_blk->dentry[1];
551 de->hash_code = f2fs_dentry_hash("..", 2);
552 de->name_len = cpu_to_le16(2);
553 de->ino = cpu_to_le32(parent->i_ino);
554 memcpy(dentry_blk->filename[1], "..", 2);
555 set_de_type(de, inode);
556
557 test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
558 test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
559 kunmap_atomic(kaddr);
560
561 set_page_dirty(dentry_page);
562 f2fs_put_page(dentry_page, 1);
563 return 0;
564}
565
566bool f2fs_empty_dir(struct inode *dir)
567{
568 unsigned long bidx;
569 struct page *dentry_page;
570 unsigned int bit_pos;
571 struct f2fs_dentry_block *dentry_blk;
572 unsigned long nblock = dir_blocks(dir);
573
574 for (bidx = 0; bidx < nblock; bidx++) {
575 void *kaddr;
576 dentry_page = get_lock_data_page(dir, bidx);
577 if (IS_ERR(dentry_page)) {
578 if (PTR_ERR(dentry_page) == -ENOENT)
579 continue;
580 else
581 return false;
582 }
583
584 kaddr = kmap_atomic(dentry_page);
585 dentry_blk = (struct f2fs_dentry_block *)kaddr;
586 if (bidx == 0)
587 bit_pos = 2;
588 else
589 bit_pos = 0;
590 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
591 NR_DENTRY_IN_BLOCK,
592 bit_pos);
593 kunmap_atomic(kaddr);
594
595 f2fs_put_page(dentry_page, 1);
596
597 if (bit_pos < NR_DENTRY_IN_BLOCK)
598 return false;
599 }
600 return true;
601}
602
603static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
604{
605 unsigned long pos = file->f_pos;
606 struct inode *inode = file->f_dentry->d_inode;
607 unsigned long npages = dir_blocks(inode);
608 unsigned char *types = NULL;
609 unsigned int bit_pos = 0, start_bit_pos = 0;
610 int over = 0;
611 struct f2fs_dentry_block *dentry_blk = NULL;
612 struct f2fs_dir_entry *de = NULL;
613 struct page *dentry_page = NULL;
614 unsigned int n = 0;
615 unsigned char d_type = DT_UNKNOWN;
616 int slots;
617
618 types = f2fs_filetype_table;
619 bit_pos = (pos % NR_DENTRY_IN_BLOCK);
620 n = (pos / NR_DENTRY_IN_BLOCK);
621
622 for ( ; n < npages; n++) {
623 dentry_page = get_lock_data_page(inode, n);
624 if (IS_ERR(dentry_page))
625 continue;
626
627 start_bit_pos = bit_pos;
628 dentry_blk = kmap(dentry_page);
629 while (bit_pos < NR_DENTRY_IN_BLOCK) {
630 d_type = DT_UNKNOWN;
631 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
632 NR_DENTRY_IN_BLOCK,
633 bit_pos);
634 if (bit_pos >= NR_DENTRY_IN_BLOCK)
635 break;
636
637 de = &dentry_blk->dentry[bit_pos];
638 if (types && de->file_type < F2FS_FT_MAX)
639 d_type = types[de->file_type];
640
641 over = filldir(dirent,
642 dentry_blk->filename[bit_pos],
643 le16_to_cpu(de->name_len),
644 (n * NR_DENTRY_IN_BLOCK) + bit_pos,
645 le32_to_cpu(de->ino), d_type);
646 if (over) {
647 file->f_pos += bit_pos - start_bit_pos;
648 goto success;
649 }
650 slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
651 bit_pos += slots;
652 }
653 bit_pos = 0;
654 file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK;
655 kunmap(dentry_page);
656 f2fs_put_page(dentry_page, 1);
657 dentry_page = NULL;
658 }
659success:
660 if (dentry_page && !IS_ERR(dentry_page)) {
661 kunmap(dentry_page);
662 f2fs_put_page(dentry_page, 1);
663 }
664
665 return 0;
666}
667
668const struct file_operations f2fs_dir_operations = {
669 .llseek = generic_file_llseek,
670 .read = generic_read_dir,
671 .readdir = f2fs_readdir,
672 .fsync = f2fs_sync_file,
673 .unlocked_ioctl = f2fs_ioctl,
674};
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
new file mode 100644
index 000000000000..c8e2d751ef9c
--- /dev/null
+++ b/fs/f2fs/f2fs.h
@@ -0,0 +1,1087 @@
1/*
2 * fs/f2fs/f2fs.h
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#ifndef _LINUX_F2FS_H
12#define _LINUX_F2FS_H
13
14#include <linux/types.h>
15#include <linux/page-flags.h>
16#include <linux/buffer_head.h>
17#include <linux/slab.h>
18#include <linux/crc32.h>
19#include <linux/magic.h>
20
21/*
22 * For mount options
23 */
24#define F2FS_MOUNT_BG_GC 0x00000001
25#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002
26#define F2FS_MOUNT_DISCARD 0x00000004
27#define F2FS_MOUNT_NOHEAP 0x00000008
28#define F2FS_MOUNT_XATTR_USER 0x00000010
29#define F2FS_MOUNT_POSIX_ACL 0x00000020
30#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
31
32#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
33#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
34#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option)
35
36#define ver_after(a, b) (typecheck(unsigned long long, a) && \
37 typecheck(unsigned long long, b) && \
38 ((long long)((a) - (b)) > 0))
39
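/*
 * Editor's sketch, illustration only: ver_after() compares checkpoint
 * versions the same way the kernel's time_after() compares jiffies --
 * the unsigned subtraction is cast to signed, so the ordering stays
 * correct even if the 64-bit counter ever wrapped.
 */
#include <stdio.h>

static int after(unsigned long long a, unsigned long long b)
{
	return (long long)(a - b) > 0;
}

int main(void)
{
	printf("%d\n", after(5, 3));		/* prints: 1 */
	printf("%d\n", after(1, ~0ULL));	/* prints: 1 -- 1 is "after" a wrapped max */
	return 0;
}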
40typedef u64 block_t;
41typedef u32 nid_t;
42
43struct f2fs_mount_info {
44 unsigned int opt;
45};
46
47static inline __u32 f2fs_crc32(void *buff, size_t len)
48{
49 return crc32_le(F2FS_SUPER_MAGIC, buff, len);
50}
51
52static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size)
53{
54 return f2fs_crc32(buff, buff_size) == blk_crc;
55}
56
57/*
58 * For checkpoint manager
59 */
60enum {
61 NAT_BITMAP,
62 SIT_BITMAP
63};
64
65/* for the list of orphan inodes */
66struct orphan_inode_entry {
67 struct list_head list; /* list head */
68 nid_t ino; /* inode number */
69};
70
71/* for the list of directory inodes */
72struct dir_inode_entry {
73 struct list_head list; /* list head */
74 struct inode *inode; /* vfs inode pointer */
75};
76
77/* for the list of fsync inodes, used only during recovery */
78struct fsync_inode_entry {
79 struct list_head list; /* list head */
80 struct inode *inode; /* vfs inode pointer */
81 block_t blkaddr; /* block address locating the last inode */
82};
83
84#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats))
85#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits))
86
87#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne)
88#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid)
89#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
90#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)
91
92static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
93{
94 int before = nats_in_cursum(rs);
95 rs->n_nats = cpu_to_le16(before + i);
96 return before;
97}
98
99static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
100{
101 int before = sits_in_cursum(rs);
102 rs->n_sits = cpu_to_le16(before + i);
103 return before;
104}
105
106/*
107 * For INODE and NODE manager
108 */
109#define XATTR_NODE_OFFSET (-1) /*
110 * store xattrs to one node block per
111 * file keeping -1 as its node offset to
112 * distinguish from index node blocks.
113 */
114#define RDONLY_NODE 1 /*
115 * specify a read-only mode when getting
116 * a node block. 0 is read-write mode.
117 * used by get_dnode_of_data().
118 */
119#define F2FS_LINK_MAX 32000 /* maximum link count per file */
120
121/* for in-memory extent cache entry */
122struct extent_info {
123 rwlock_t ext_lock; /* rwlock for consistency */
124 unsigned int fofs; /* start offset in a file */
125 u32 blk_addr; /* start block address of the extent */
126	unsigned int len;		/* length of the extent */
127};
128
129/*
130 * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
131 */
132#define FADVISE_COLD_BIT 0x01
133
134struct f2fs_inode_info {
135 struct inode vfs_inode; /* serve a vfs inode */
136 unsigned long i_flags; /* keep an inode flags for ioctl */
137 unsigned char i_advise; /* use to give file attribute hints */
138 unsigned int i_current_depth; /* use only in directory structure */
139 unsigned int i_pino; /* parent inode number */
140 umode_t i_acl_mode; /* keep file acl mode temporarily */
141
142	/* Used below internally in f2fs */
143 unsigned long flags; /* use to pass per-file flags */
144	unsigned long long data_version;/* latest version of data for fsync */
145 atomic_t dirty_dents; /* # of dirty dentry pages */
146 f2fs_hash_t chash; /* hash value of given file name */
147 unsigned int clevel; /* maximum level of given file name */
148 nid_t i_xattr_nid; /* node id that contains xattrs */
149 struct extent_info ext; /* in-memory extent cache entry */
150};
151
152static inline void get_extent_info(struct extent_info *ext,
153 struct f2fs_extent i_ext)
154{
155 write_lock(&ext->ext_lock);
156 ext->fofs = le32_to_cpu(i_ext.fofs);
157 ext->blk_addr = le32_to_cpu(i_ext.blk_addr);
158 ext->len = le32_to_cpu(i_ext.len);
159 write_unlock(&ext->ext_lock);
160}
161
162static inline void set_raw_extent(struct extent_info *ext,
163 struct f2fs_extent *i_ext)
164{
165 read_lock(&ext->ext_lock);
166 i_ext->fofs = cpu_to_le32(ext->fofs);
167 i_ext->blk_addr = cpu_to_le32(ext->blk_addr);
168 i_ext->len = cpu_to_le32(ext->len);
169 read_unlock(&ext->ext_lock);
170}
171
172struct f2fs_nm_info {
173 block_t nat_blkaddr; /* base disk address of NAT */
174 nid_t max_nid; /* maximum possible node ids */
175 nid_t init_scan_nid; /* the first nid to be scanned */
176 nid_t next_scan_nid; /* the next nid to be scanned */
177
178 /* NAT cache management */
179 struct radix_tree_root nat_root;/* root of the nat entry cache */
180	rwlock_t nat_tree_lock;		/* protect the nat entry cache */
181 unsigned int nat_cnt; /* the # of cached nat entries */
182 struct list_head nat_entries; /* cached nat entry list (clean) */
183 struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
184
185 /* free node ids management */
186 struct list_head free_nid_list; /* a list for free nids */
187 spinlock_t free_nid_list_lock; /* protect free nid list */
188 unsigned int fcnt; /* the number of free node id */
189 struct mutex build_lock; /* lock for build free nids */
190
191 /* for checkpoint */
192 char *nat_bitmap; /* NAT bitmap pointer */
193 int bitmap_size; /* bitmap size */
194};
195
196/*
197 * This structure is used as a function parameter.
198 * All the information is dedicated to a given direct node block determined
199 * by the data offset in a file.
200 */
201struct dnode_of_data {
202 struct inode *inode; /* vfs inode pointer */
203 struct page *inode_page; /* its inode page, NULL is possible */
204 struct page *node_page; /* cached direct node page */
205 nid_t nid; /* node id of the direct node block */
206 unsigned int ofs_in_node; /* data offset in the node page */
207 bool inode_page_locked; /* inode page is locked or not */
208 block_t data_blkaddr; /* block address of the node block */
209};
210
211static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
212 struct page *ipage, struct page *npage, nid_t nid)
213{
214 memset(dn, 0, sizeof(*dn));
215 dn->inode = inode;
216 dn->inode_page = ipage;
217 dn->node_page = npage;
218 dn->nid = nid;
219}
220
221/*
222 * For SIT manager
223 *
224 * By default, there are 6 active log areas across the whole main area.
225 * When considering hot and cold data separation to reduce cleaning overhead,
226 * we split 3 for data logs and 3 for node logs as hot, warm, and cold types,
227 * respectively.
228 * In the current design, these numbers should not be changed directly.
229 * Instead, the active_logs=x mount option selects 2, 4, or 6 logs
230 * individually according to the underlying devices. (default: 6)
231 * Just in case, on-disk layout covers maximum 16 logs that consist of 8 for
232 * data and 8 for node logs.
233 */
234#define NR_CURSEG_DATA_TYPE (3)
235#define NR_CURSEG_NODE_TYPE (3)
236#define NR_CURSEG_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE)
237
238enum {
239 CURSEG_HOT_DATA = 0, /* directory entry blocks */
240 CURSEG_WARM_DATA, /* data blocks */
241 CURSEG_COLD_DATA, /* multimedia or GCed data blocks */
242 CURSEG_HOT_NODE, /* direct node blocks of directory files */
243 CURSEG_WARM_NODE, /* direct node blocks of normal files */
244 CURSEG_COLD_NODE, /* indirect node blocks */
245 NO_CHECK_TYPE
246};
247
248struct f2fs_sm_info {
249 struct sit_info *sit_info; /* whole segment information */
250 struct free_segmap_info *free_info; /* free segment information */
251 struct dirty_seglist_info *dirty_info; /* dirty segment information */
252 struct curseg_info *curseg_array; /* active segment information */
253
254 struct list_head wblist_head; /* list of under-writeback pages */
255 spinlock_t wblist_lock; /* lock for checkpoint */
256
257 block_t seg0_blkaddr; /* block address of 0'th segment */
258 block_t main_blkaddr; /* start block address of main area */
259 block_t ssa_blkaddr; /* start block address of SSA area */
260
261 unsigned int segment_count; /* total # of segments */
262 unsigned int main_segments; /* # of segments in main area */
263 unsigned int reserved_segments; /* # of reserved segments */
264 unsigned int ovp_segments; /* # of overprovision segments */
265};
266
267/*
268 * For directory operation
269 */
270#define NODE_DIR1_BLOCK (ADDRS_PER_INODE + 1)
271#define NODE_DIR2_BLOCK (ADDRS_PER_INODE + 2)
272#define NODE_IND1_BLOCK (ADDRS_PER_INODE + 3)
273#define NODE_IND2_BLOCK (ADDRS_PER_INODE + 4)
274#define NODE_DIND_BLOCK (ADDRS_PER_INODE + 5)
275
276/*
277 * For superblock
278 */
279/*
280 * COUNT_TYPE for monitoring
281 *
282 * f2fs monitors the number of several block types, such as pages under
283 * writeback, dirty dentry blocks, dirty node blocks, and dirty meta blocks.
284 */
285enum count_type {
286 F2FS_WRITEBACK,
287 F2FS_DIRTY_DENTS,
288 F2FS_DIRTY_NODES,
289 F2FS_DIRTY_META,
290 NR_COUNT_TYPE,
291};
292
293/*
294 * FS_LOCK nesting subclasses for the lock validator:
295 *
296 * The locking order between these classes is
297 * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW
298 * -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC
299 */
300enum lock_type {
301 RENAME, /* for renaming operations */
302 DENTRY_OPS, /* for directory operations */
303 DATA_WRITE, /* for data write */
304 DATA_NEW, /* for data allocation */
305 DATA_TRUNC, /* for data truncate */
306 NODE_NEW, /* for node allocation */
307 NODE_TRUNC, /* for node truncate */
308 NODE_WRITE, /* for node write */
309 NR_LOCK_TYPE,
310};
311
312/*
313 * The below are the page types of bios used in submit_bio().
314 * The available types are:
315 * DATA			User data pages. They operate in async mode.
316 * NODE			Node pages. They operate in async mode.
317 * META			FS metadata pages such as SIT, NAT, CP.
318 * NR_PAGE_TYPE		The number of page types.
319 * META_FLUSH		Make sure the previous pages are written
320 *			while waiting for the bio's completion.
321 *			Can only be used with META.
322 */
323enum page_type {
324 DATA,
325 NODE,
326 META,
327 NR_PAGE_TYPE,
328 META_FLUSH,
329};
330
331struct f2fs_sb_info {
332 struct super_block *sb; /* pointer to VFS super block */
333 struct buffer_head *raw_super_buf; /* buffer head of raw sb */
334 struct f2fs_super_block *raw_super; /* raw super block pointer */
335 int s_dirty; /* dirty flag for checkpoint */
336
337 /* for node-related operations */
338 struct f2fs_nm_info *nm_info; /* node manager */
339 struct inode *node_inode; /* cache node blocks */
340
341 /* for segment-related operations */
342 struct f2fs_sm_info *sm_info; /* segment manager */
343 struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */
344 sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */
345 struct rw_semaphore bio_sem; /* IO semaphore */
346
347 /* for checkpoint */
348 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
349 struct inode *meta_inode; /* cache meta blocks */
350 struct mutex cp_mutex; /* for checkpoint procedure */
351 struct mutex fs_lock[NR_LOCK_TYPE]; /* for blocking FS operations */
352 struct mutex write_inode; /* mutex for write inode */
353 struct mutex writepages; /* mutex for writepages() */
354	int por_doing;				/* whether recovery is in progress */
355
356 /* for orphan inode management */
357 struct list_head orphan_inode_list; /* orphan inode list */
358 struct mutex orphan_inode_mutex; /* for orphan inode list */
359 unsigned int n_orphans; /* # of orphan inodes */
360
361 /* for directory inode management */
362 struct list_head dir_inode_list; /* dir inode list */
363 spinlock_t dir_inode_lock; /* for dir inode list lock */
364 unsigned int n_dirty_dirs; /* # of dir inodes */
365
366 /* basic file system units */
367 unsigned int log_sectors_per_block; /* log2 sectors per block */
368 unsigned int log_blocksize; /* log2 block size */
369 unsigned int blocksize; /* block size */
370 unsigned int root_ino_num; /* root inode number*/
371 unsigned int node_ino_num; /* node inode number*/
372 unsigned int meta_ino_num; /* meta inode number*/
373 unsigned int log_blocks_per_seg; /* log2 blocks per segment */
374 unsigned int blocks_per_seg; /* blocks per segment */
375 unsigned int segs_per_sec; /* segments per section */
376 unsigned int secs_per_zone; /* sections per zone */
377 unsigned int total_sections; /* total section count */
378 unsigned int total_node_count; /* total node block count */
379 unsigned int total_valid_node_count; /* valid node block count */
380 unsigned int total_valid_inode_count; /* valid inode count */
381 int active_logs; /* # of active logs */
382
383 block_t user_block_count; /* # of user blocks */
384 block_t total_valid_block_count; /* # of valid blocks */
385 block_t alloc_valid_block_count; /* # of allocated blocks */
386 block_t last_valid_block_count; /* for recovery */
387 u32 s_next_generation; /* for NFS support */
388 atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */
389
390 struct f2fs_mount_info mount_opt; /* mount options */
391
392 /* for cleaning operations */
393 struct mutex gc_mutex; /* mutex for GC */
394 struct f2fs_gc_kthread *gc_thread; /* GC thread */
395
396 /*
397 * for stat information.
398 * one is for the LFS mode, and the other is for the SSR mode.
399 */
400 struct f2fs_stat_info *stat_info; /* FS status information */
401 unsigned int segment_count[2]; /* # of allocated segments */
402 unsigned int block_count[2]; /* # of allocated blocks */
403 unsigned int last_victim[2]; /* last victim segment # */
404 int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
405 int bg_gc; /* background gc calls */
406 spinlock_t stat_lock; /* lock for stat operations */
407};
408
409/*
410 * Inline functions
411 */
412static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
413{
414 return container_of(inode, struct f2fs_inode_info, vfs_inode);
415}
416
417static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
418{
419 return sb->s_fs_info;
420}
421
422static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
423{
424 return (struct f2fs_super_block *)(sbi->raw_super);
425}
426
427static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
428{
429 return (struct f2fs_checkpoint *)(sbi->ckpt);
430}
431
432static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
433{
434 return (struct f2fs_nm_info *)(sbi->nm_info);
435}
436
437static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi)
438{
439 return (struct f2fs_sm_info *)(sbi->sm_info);
440}
441
442static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi)
443{
444 return (struct sit_info *)(SM_I(sbi)->sit_info);
445}
446
447static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi)
448{
449 return (struct free_segmap_info *)(SM_I(sbi)->free_info);
450}
451
452static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi)
453{
454 return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info);
455}
456
457static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
458{
459 sbi->s_dirty = 1;
460}
461
462static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
463{
464 sbi->s_dirty = 0;
465}
466
467static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
468{
469 unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
470 return ckpt_flags & f;
471}
472
473static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
474{
475 unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
476 ckpt_flags |= f;
477 cp->ckpt_flags = cpu_to_le32(ckpt_flags);
478}
479
480static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
481{
482 unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
483 ckpt_flags &= (~f);
484 cp->ckpt_flags = cpu_to_le32(ckpt_flags);
485}
486
487static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t)
488{
489 mutex_lock_nested(&sbi->fs_lock[t], t);
490}
491
492static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t)
493{
494 mutex_unlock(&sbi->fs_lock[t]);
495}
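/*
 * Usage sketch (illustrative, not from the original source): callers must
 * take fs_lock[] entries in the enum order documented above, and
 * mutex_lock_nested() passes the enum value as the lockdep subclass so the
 * lock validator can check that nesting. A hypothetical path touching both
 * data and node blocks would nest like this:
 */
static inline void example_fs_lock_nesting(struct f2fs_sb_info *sbi)
{
	mutex_lock_op(sbi, DATA_TRUNC);		/* lower-numbered class first */
	mutex_lock_op(sbi, NODE_TRUNC);		/* higher-numbered class nests inside */
	/* ... truncate data blocks, then node blocks ... */
	mutex_unlock_op(sbi, NODE_TRUNC);
	mutex_unlock_op(sbi, DATA_TRUNC);
}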
496
497/*
498 * Check whether the given nid is within node id range.
499 */
500static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
501{
502	BUG_ON(nid >= NM_I(sbi)->max_nid);
503}
504
505#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1
506
507/*
508 * Check whether the inode has blocks or not
509 */
510static inline int F2FS_HAS_BLOCKS(struct inode *inode)
511{
512 if (F2FS_I(inode)->i_xattr_nid)
513 return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1);
514 else
515 return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS);
516}
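/*
 * Worked example (illustrative): a freshly created file owns only its inode
 * block, so i_blocks == F2FS_DEFAULT_ALLOCATED_BLOCKS == 1; allocating an
 * xattr node block raises that baseline to 2. F2FS_HAS_BLOCKS() therefore
 * reports true only once data or node blocks exist beyond the baseline.
 */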
517
518static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
519 struct inode *inode, blkcnt_t count)
520{
521 block_t valid_block_count;
522
523 spin_lock(&sbi->stat_lock);
524 valid_block_count =
525 sbi->total_valid_block_count + (block_t)count;
526 if (valid_block_count > sbi->user_block_count) {
527 spin_unlock(&sbi->stat_lock);
528 return false;
529 }
530 inode->i_blocks += count;
531 sbi->total_valid_block_count = valid_block_count;
532 sbi->alloc_valid_block_count += (block_t)count;
533 spin_unlock(&sbi->stat_lock);
534 return true;
535}
536
537static inline int dec_valid_block_count(struct f2fs_sb_info *sbi,
538 struct inode *inode,
539 blkcnt_t count)
540{
541 spin_lock(&sbi->stat_lock);
542 BUG_ON(sbi->total_valid_block_count < (block_t) count);
543 BUG_ON(inode->i_blocks < count);
544 inode->i_blocks -= count;
545 sbi->total_valid_block_count -= (block_t)count;
546 spin_unlock(&sbi->stat_lock);
547 return 0;
548}
549
550static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
551{
552 atomic_inc(&sbi->nr_pages[count_type]);
553 F2FS_SET_SB_DIRT(sbi);
554}
555
556static inline void inode_inc_dirty_dents(struct inode *inode)
557{
558 atomic_inc(&F2FS_I(inode)->dirty_dents);
559}
560
561static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
562{
563 atomic_dec(&sbi->nr_pages[count_type]);
564}
565
566static inline void inode_dec_dirty_dents(struct inode *inode)
567{
568 atomic_dec(&F2FS_I(inode)->dirty_dents);
569}
570
571static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
572{
573 return atomic_read(&sbi->nr_pages[count_type]);
574}
575
576static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
577{
578 block_t ret;
579 spin_lock(&sbi->stat_lock);
580 ret = sbi->total_valid_block_count;
581 spin_unlock(&sbi->stat_lock);
582 return ret;
583}
584
585static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
586{
587 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
588
589 /* return NAT or SIT bitmap */
590 if (flag == NAT_BITMAP)
591 return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize);
592 else if (flag == SIT_BITMAP)
593 return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize);
594
595 return 0;
596}
597
598static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
599{
600 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
601 int offset = (flag == NAT_BITMAP) ?
602 le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0;
603 return &ckpt->sit_nat_version_bitmap + offset;
604}
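/*
 * Layout sketch (a reading of the code above, not documented in the source):
 * the checkpoint stores the SIT version bitmap first and the NAT version
 * bitmap right behind it, hence the sit_ver_bitmap_bytesize offset for NAT:
 *
 *	+---------------------------+---------------------------+
 *	| SIT version bitmap        | NAT version bitmap        |
 *	+---------------------------+---------------------------+
 *	^ &ckpt->sit_nat_version_bitmap
 *	                            ^ + sit_ver_bitmap_bytesize
 */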
605
606static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
607{
608 block_t start_addr;
609 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
610 unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver);
611
612 start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
613
614 /*
615	 * an odd-numbered checkpoint should be at cp segment 0,
616	 * and an even-numbered one must be at cp segment 1
617 */
618 if (!(ckpt_version & 1))
619 start_addr += sbi->blocks_per_seg;
620
621 return start_addr;
622}
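/*
 * Worked example (illustrative numbers): with cp_blkaddr = 512 and
 * blocks_per_seg = 512,
 *	checkpoint_ver = 5 (odd)  -> start_addr = 512  (cp segment #0)
 *	checkpoint_ver = 6 (even) -> start_addr = 1024 (cp segment #1)
 * so successive checkpoint packs ping-pong between the two segments.
 */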
623
624static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
625{
626 return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
627}
628
629static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
630 struct inode *inode,
631 unsigned int count)
632{
633 block_t valid_block_count;
634 unsigned int valid_node_count;
635
636 spin_lock(&sbi->stat_lock);
637
638 valid_block_count = sbi->total_valid_block_count + (block_t)count;
639 sbi->alloc_valid_block_count += (block_t)count;
640 valid_node_count = sbi->total_valid_node_count + count;
641
642 if (valid_block_count > sbi->user_block_count) {
643 spin_unlock(&sbi->stat_lock);
644 return false;
645 }
646
647 if (valid_node_count > sbi->total_node_count) {
648 spin_unlock(&sbi->stat_lock);
649 return false;
650 }
651
652 if (inode)
653 inode->i_blocks += count;
654 sbi->total_valid_node_count = valid_node_count;
655 sbi->total_valid_block_count = valid_block_count;
656 spin_unlock(&sbi->stat_lock);
657
658 return true;
659}
660
661static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
662 struct inode *inode,
663 unsigned int count)
664{
665 spin_lock(&sbi->stat_lock);
666
667 BUG_ON(sbi->total_valid_block_count < count);
668 BUG_ON(sbi->total_valid_node_count < count);
669 BUG_ON(inode->i_blocks < count);
670
671 inode->i_blocks -= count;
672 sbi->total_valid_node_count -= count;
673 sbi->total_valid_block_count -= (block_t)count;
674
675 spin_unlock(&sbi->stat_lock);
676}
677
678static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
679{
680 unsigned int ret;
681 spin_lock(&sbi->stat_lock);
682 ret = sbi->total_valid_node_count;
683 spin_unlock(&sbi->stat_lock);
684 return ret;
685}
686
687static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
688{
689 spin_lock(&sbi->stat_lock);
690 BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count);
691 sbi->total_valid_inode_count++;
692 spin_unlock(&sbi->stat_lock);
693}
694
695static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi)
696{
697 spin_lock(&sbi->stat_lock);
698 BUG_ON(!sbi->total_valid_inode_count);
699 sbi->total_valid_inode_count--;
700 spin_unlock(&sbi->stat_lock);
701 return 0;
702}
703
704static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
705{
706 unsigned int ret;
707 spin_lock(&sbi->stat_lock);
708 ret = sbi->total_valid_inode_count;
709 spin_unlock(&sbi->stat_lock);
710 return ret;
711}
712
713static inline void f2fs_put_page(struct page *page, int unlock)
714{
715 if (!page || IS_ERR(page))
716 return;
717
718 if (unlock) {
719 BUG_ON(!PageLocked(page));
720 unlock_page(page);
721 }
722 page_cache_release(page);
723}
724
725static inline void f2fs_put_dnode(struct dnode_of_data *dn)
726{
727 if (dn->node_page)
728 f2fs_put_page(dn->node_page, 1);
729 if (dn->inode_page && dn->node_page != dn->inode_page)
730 f2fs_put_page(dn->inode_page, 0);
731 dn->node_page = NULL;
732 dn->inode_page = NULL;
733}
734
735static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
736 size_t size, void (*ctor)(void *))
737{
738 return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor);
739}
740
741#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino)
742
743static inline bool IS_INODE(struct page *page)
744{
745 struct f2fs_node *p = (struct f2fs_node *)page_address(page);
746 return RAW_IS_INODE(p);
747}
748
749static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
750{
751 return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr;
752}
753
754static inline block_t datablock_addr(struct page *node_page,
755 unsigned int offset)
756{
757 struct f2fs_node *raw_node;
758 __le32 *addr_array;
759 raw_node = (struct f2fs_node *)page_address(node_page);
760 addr_array = blkaddr_in_node(raw_node);
761 return le32_to_cpu(addr_array[offset]);
762}
763
764static inline int f2fs_test_bit(unsigned int nr, char *addr)
765{
766 int mask;
767
768 addr += (nr >> 3);
769 mask = 1 << (7 - (nr & 0x07));
770 return mask & *addr;
771}
772
773static inline int f2fs_set_bit(unsigned int nr, char *addr)
774{
775 int mask;
776 int ret;
777
778 addr += (nr >> 3);
779 mask = 1 << (7 - (nr & 0x07));
780 ret = mask & *addr;
781 *addr |= mask;
782 return ret;
783}
784
785static inline int f2fs_clear_bit(unsigned int nr, char *addr)
786{
787 int mask;
788 int ret;
789
790 addr += (nr >> 3);
791 mask = 1 << (7 - (nr & 0x07));
792 ret = mask & *addr;
793 *addr &= ~mask;
794 return ret;
795}
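/*
 * Worked example (illustrative): for nr = 10,
 *	byte index = 10 >> 3             = 1
 *	bit mask   = 1 << (7 - (10 & 7)) = 1 << 5 = 0x20
 * i.e. these helpers number bits from the MSB of each byte (big-endian bit
 * order), unlike the kernel's little-endian test_bit()/set_bit() family, so
 * on-disk bitmaps keep the same layout regardless of CPU endianness.
 */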
796
797/* used for f2fs_inode_info->flags */
798enum {
799 FI_NEW_INODE, /* indicate newly allocated inode */
800 FI_NEED_CP, /* need to do checkpoint during fsync */
801 FI_INC_LINK, /* need to increment i_nlink */
802 FI_ACL_MODE, /* indicate acl mode */
803 FI_NO_ALLOC, /* should not allocate any blocks */
804};
805
806static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
807{
808 set_bit(flag, &fi->flags);
809}
810
811static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
812{
813 return test_bit(flag, &fi->flags);
814}
815
816static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag)
817{
818 clear_bit(flag, &fi->flags);
819}
820
821static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode)
822{
823 fi->i_acl_mode = mode;
824 set_inode_flag(fi, FI_ACL_MODE);
825}
826
827static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
828{
829 if (is_inode_flag_set(fi, FI_ACL_MODE)) {
830 clear_inode_flag(fi, FI_ACL_MODE);
831 return 1;
832 }
833 return 0;
834}
835
836/*
837 * file.c
838 */
839int f2fs_sync_file(struct file *, loff_t, loff_t, int);
840void truncate_data_blocks(struct dnode_of_data *);
841void f2fs_truncate(struct inode *);
842int f2fs_setattr(struct dentry *, struct iattr *);
843int truncate_hole(struct inode *, pgoff_t, pgoff_t);
844long f2fs_ioctl(struct file *, unsigned int, unsigned long);
845
846/*
847 * inode.c
848 */
849void f2fs_set_inode_flags(struct inode *);
850struct inode *f2fs_iget_nowait(struct super_block *, unsigned long);
851struct inode *f2fs_iget(struct super_block *, unsigned long);
852void update_inode(struct inode *, struct page *);
853int f2fs_write_inode(struct inode *, struct writeback_control *);
854void f2fs_evict_inode(struct inode *);
855
856/*
857 * namei.c
858 */
859struct dentry *f2fs_get_parent(struct dentry *child);
860
861/*
862 * dir.c
863 */
864struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *,
865 struct page **);
866struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
867ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
868void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
869 struct page *, struct inode *);
870void init_dent_inode(struct dentry *, struct page *);
871int f2fs_add_link(struct dentry *, struct inode *);
872void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
873int f2fs_make_empty(struct inode *, struct inode *);
874bool f2fs_empty_dir(struct inode *);
875
876/*
877 * super.c
878 */
879int f2fs_sync_fs(struct super_block *, int);
880extern __printf(3, 4)
881void f2fs_msg(struct super_block *, const char *, const char *, ...);
882
883/*
884 * hash.c
885 */
886f2fs_hash_t f2fs_dentry_hash(const char *, size_t);
887
888/*
889 * node.c
890 */
891struct dnode_of_data;
892struct node_info;
893
894int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
895void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
896int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
897int truncate_inode_blocks(struct inode *, pgoff_t);
898int remove_inode_page(struct inode *);
899int new_inode_page(struct inode *, struct dentry *);
900struct page *new_node_page(struct dnode_of_data *, unsigned int);
901void ra_node_page(struct f2fs_sb_info *, nid_t);
902struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
903struct page *get_node_page_ra(struct page *, int);
904void sync_inode_page(struct dnode_of_data *);
905int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
906bool alloc_nid(struct f2fs_sb_info *, nid_t *);
907void alloc_nid_done(struct f2fs_sb_info *, nid_t);
908void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
909void recover_node_page(struct f2fs_sb_info *, struct page *,
910 struct f2fs_summary *, struct node_info *, block_t);
911int recover_inode_page(struct f2fs_sb_info *, struct page *);
912int restore_node_summary(struct f2fs_sb_info *, unsigned int,
913 struct f2fs_summary_block *);
914void flush_nat_entries(struct f2fs_sb_info *);
915int build_node_manager(struct f2fs_sb_info *);
916void destroy_node_manager(struct f2fs_sb_info *);
917int __init create_node_manager_caches(void);
918void destroy_node_manager_caches(void);
919
920/*
921 * segment.c
922 */
923void f2fs_balance_fs(struct f2fs_sb_info *);
924void invalidate_blocks(struct f2fs_sb_info *, block_t);
925void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
926void clear_prefree_segments(struct f2fs_sb_info *);
927int npages_for_summary_flush(struct f2fs_sb_info *);
928void allocate_new_segments(struct f2fs_sb_info *);
929struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
930struct bio *f2fs_bio_alloc(struct block_device *, int);
931void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync);
932int write_meta_page(struct f2fs_sb_info *, struct page *,
933 struct writeback_control *);
934void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int,
935 block_t, block_t *);
936void write_data_page(struct inode *, struct page *, struct dnode_of_data*,
937 block_t, block_t *);
938void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t);
939void recover_data_page(struct f2fs_sb_info *, struct page *,
940 struct f2fs_summary *, block_t, block_t);
941void rewrite_node_page(struct f2fs_sb_info *, struct page *,
942 struct f2fs_summary *, block_t, block_t);
943void write_data_summaries(struct f2fs_sb_info *, block_t);
944void write_node_summaries(struct f2fs_sb_info *, block_t);
945int lookup_journal_in_cursum(struct f2fs_summary_block *,
946 int, unsigned int, int);
947void flush_sit_entries(struct f2fs_sb_info *);
948int build_segment_manager(struct f2fs_sb_info *);
949void reset_victim_segmap(struct f2fs_sb_info *);
950void destroy_segment_manager(struct f2fs_sb_info *);
951
952/*
953 * checkpoint.c
954 */
955struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
956struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
957long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
958int check_orphan_space(struct f2fs_sb_info *);
959void add_orphan_inode(struct f2fs_sb_info *, nid_t);
960void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
961int recover_orphan_inodes(struct f2fs_sb_info *);
962int get_valid_checkpoint(struct f2fs_sb_info *);
963void set_dirty_dir_page(struct inode *, struct page *);
964void remove_dirty_dir_inode(struct inode *);
965void sync_dirty_dir_inodes(struct f2fs_sb_info *);
966void block_operations(struct f2fs_sb_info *);
967void write_checkpoint(struct f2fs_sb_info *, bool, bool);
968void init_orphan_info(struct f2fs_sb_info *);
969int __init create_checkpoint_caches(void);
970void destroy_checkpoint_caches(void);
971
972/*
973 * data.c
974 */
975int reserve_new_block(struct dnode_of_data *);
976void update_extent_cache(block_t, struct dnode_of_data *);
977struct page *find_data_page(struct inode *, pgoff_t);
978struct page *get_lock_data_page(struct inode *, pgoff_t);
979struct page *get_new_data_page(struct inode *, pgoff_t, bool);
980int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
981int do_write_data_page(struct page *);
982
983/*
984 * gc.c
985 */
986int start_gc_thread(struct f2fs_sb_info *);
987void stop_gc_thread(struct f2fs_sb_info *);
988block_t start_bidx_of_node(unsigned int);
989int f2fs_gc(struct f2fs_sb_info *);
990void build_gc_manager(struct f2fs_sb_info *);
991int __init create_gc_caches(void);
992void destroy_gc_caches(void);
993
994/*
995 * recovery.c
996 */
997void recover_fsync_data(struct f2fs_sb_info *);
998bool space_for_roll_forward(struct f2fs_sb_info *);
999
1000/*
1001 * debug.c
1002 */
1003#ifdef CONFIG_F2FS_STAT_FS
1004struct f2fs_stat_info {
1005 struct list_head stat_list;
1006 struct f2fs_sb_info *sbi;
1007 struct mutex stat_lock;
1008 int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
1009 int main_area_segs, main_area_sections, main_area_zones;
1010 int hit_ext, total_ext;
1011 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
1012 int nats, sits, fnids;
1013 int total_count, utilization;
1014 int bg_gc;
1015 unsigned int valid_count, valid_node_count, valid_inode_count;
1016 unsigned int bimodal, avg_vblocks;
1017 int util_free, util_valid, util_invalid;
1018 int rsvd_segs, overp_segs;
1019 int dirty_count, node_pages, meta_pages;
1020 int prefree_count, call_count;
1021 int tot_segs, node_segs, data_segs, free_segs, free_secs;
1022 int tot_blks, data_blks, node_blks;
1023 int curseg[NR_CURSEG_TYPE];
1024 int cursec[NR_CURSEG_TYPE];
1025 int curzone[NR_CURSEG_TYPE];
1026
1027 unsigned int segment_count[2];
1028 unsigned int block_count[2];
1029 unsigned base_mem, cache_mem;
1030};
1031
1032#define stat_inc_call_count(si) ((si)->call_count++)
1033
1034#define stat_inc_seg_count(sbi, type) \
1035 do { \
1036 struct f2fs_stat_info *si = sbi->stat_info; \
1037 (si)->tot_segs++; \
1038 if (type == SUM_TYPE_DATA) \
1039 si->data_segs++; \
1040 else \
1041 si->node_segs++; \
1042 } while (0)
1043
1044#define stat_inc_tot_blk_count(si, blks) \
1045 (si->tot_blks += (blks))
1046
1047#define stat_inc_data_blk_count(sbi, blks) \
1048 do { \
1049 struct f2fs_stat_info *si = sbi->stat_info; \
1050 stat_inc_tot_blk_count(si, blks); \
1051 si->data_blks += (blks); \
1052 } while (0)
1053
1054#define stat_inc_node_blk_count(sbi, blks) \
1055 do { \
1056 struct f2fs_stat_info *si = sbi->stat_info; \
1057 stat_inc_tot_blk_count(si, blks); \
1058 si->node_blks += (blks); \
1059 } while (0)
1060
1061int f2fs_build_stats(struct f2fs_sb_info *);
1062void f2fs_destroy_stats(struct f2fs_sb_info *);
1063void __init f2fs_create_root_stats(void);
1064void f2fs_destroy_root_stats(void);
1065#else
1066#define stat_inc_call_count(si)
1067#define stat_inc_seg_count(si, type)
1068#define stat_inc_tot_blk_count(si, blks)
1069#define stat_inc_data_blk_count(si, blks)
1070#define stat_inc_node_blk_count(sbi, blks)
1071
1072static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
1073static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
1074static inline void __init f2fs_create_root_stats(void) { }
1075static inline void f2fs_destroy_root_stats(void) { }
1076#endif
1077
1078extern const struct file_operations f2fs_dir_operations;
1079extern const struct file_operations f2fs_file_operations;
1080extern const struct inode_operations f2fs_file_inode_operations;
1081extern const struct address_space_operations f2fs_dblock_aops;
1082extern const struct address_space_operations f2fs_node_aops;
1083extern const struct address_space_operations f2fs_meta_aops;
1084extern const struct inode_operations f2fs_dir_inode_operations;
1085extern const struct inode_operations f2fs_symlink_inode_operations;
1086extern const struct inode_operations f2fs_special_inode_operations;
1087#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
new file mode 100644
index 000000000000..3191b52aafb0
--- /dev/null
+++ b/fs/f2fs/file.c
@@ -0,0 +1,646 @@
1/*
2 * fs/f2fs/file.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/stat.h>
14#include <linux/buffer_head.h>
15#include <linux/writeback.h>
16#include <linux/falloc.h>
17#include <linux/types.h>
18#include <linux/uaccess.h>
19#include <linux/mount.h>
20
21#include "f2fs.h"
22#include "node.h"
23#include "segment.h"
24#include "xattr.h"
25#include "acl.h"
26
27static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
28 struct vm_fault *vmf)
29{
30 struct page *page = vmf->page;
31 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
32 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
33 block_t old_blk_addr;
34 struct dnode_of_data dn;
35 int err;
36
37 f2fs_balance_fs(sbi);
38
39 sb_start_pagefault(inode->i_sb);
40
41 mutex_lock_op(sbi, DATA_NEW);
42
43 /* block allocation */
44 set_new_dnode(&dn, inode, NULL, NULL, 0);
45 err = get_dnode_of_data(&dn, page->index, 0);
46 if (err) {
47 mutex_unlock_op(sbi, DATA_NEW);
48 goto out;
49 }
50
51 old_blk_addr = dn.data_blkaddr;
52
53 if (old_blk_addr == NULL_ADDR) {
54 err = reserve_new_block(&dn);
55 if (err) {
56 f2fs_put_dnode(&dn);
57 mutex_unlock_op(sbi, DATA_NEW);
58 goto out;
59 }
60 }
61 f2fs_put_dnode(&dn);
62
63 mutex_unlock_op(sbi, DATA_NEW);
64
65 lock_page(page);
66 if (page->mapping != inode->i_mapping ||
67 page_offset(page) >= i_size_read(inode) ||
68 !PageUptodate(page)) {
69 unlock_page(page);
70 err = -EFAULT;
71 goto out;
72 }
73
74 /*
75 * check to see if the page is mapped already (no holes)
76 */
77 if (PageMappedToDisk(page))
78 goto out;
79
80 /* fill the page */
81 wait_on_page_writeback(page);
82
83 /* page is wholly or partially inside EOF */
84 if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
85 unsigned offset;
86 offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
87 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
88 }
89 set_page_dirty(page);
90 SetPageUptodate(page);
91
92 file_update_time(vma->vm_file);
93out:
94 sb_end_pagefault(inode->i_sb);
95 return block_page_mkwrite_return(err);
96}
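/*
 * Worked example for the partial-page zeroing above (illustrative numbers):
 * with i_size = 10000 and PAGE_CACHE_SIZE = 4096, page index 2 covers file
 * bytes [8192, 12288). Since (2 + 1) << PAGE_CACHE_SHIFT = 12288 > 10000,
 * offset = 10000 & ~PAGE_CACHE_MASK = 1808, and bytes [1808, 4096) of the
 * page are zeroed so stale data beyond EOF never becomes writable.
 */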
97
98static const struct vm_operations_struct f2fs_file_vm_ops = {
99 .fault = filemap_fault,
100 .page_mkwrite = f2fs_vm_page_mkwrite,
101 .remap_pages = generic_file_remap_pages,
102};
103
104static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode)
105{
106 struct dentry *dentry;
107 nid_t pino;
108
109 inode = igrab(inode);
110 dentry = d_find_any_alias(inode);
111 if (!dentry) {
112 iput(inode);
113 return 0;
114 }
115 pino = dentry->d_parent->d_inode->i_ino;
116 dput(dentry);
117 iput(inode);
118 return !is_checkpointed_node(sbi, pino);
119}
120
121int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
122{
123 struct inode *inode = file->f_mapping->host;
124 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
125 unsigned long long cur_version;
126 int ret = 0;
127 bool need_cp = false;
128 struct writeback_control wbc = {
129 .sync_mode = WB_SYNC_ALL,
130 .nr_to_write = LONG_MAX,
131 .for_reclaim = 0,
132 };
133
134 if (inode->i_sb->s_flags & MS_RDONLY)
135 return 0;
136
137 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
138 if (ret)
139 return ret;
140
141 /* guarantee free sections for fsync */
142 f2fs_balance_fs(sbi);
143
144 mutex_lock(&inode->i_mutex);
145
146 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
147 goto out;
148
149 mutex_lock(&sbi->cp_mutex);
150 cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
151 mutex_unlock(&sbi->cp_mutex);
152
153 if (F2FS_I(inode)->data_version != cur_version &&
154 !(inode->i_state & I_DIRTY))
155 goto out;
156 F2FS_I(inode)->data_version--;
157
158 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
159 need_cp = true;
160 if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP))
161 need_cp = true;
162 if (!space_for_roll_forward(sbi))
163 need_cp = true;
164 if (need_to_sync_dir(sbi, inode))
165 need_cp = true;
166
167 if (need_cp) {
168 /* all the dirty node pages should be flushed for POR */
169 ret = f2fs_sync_fs(inode->i_sb, 1);
170 clear_inode_flag(F2FS_I(inode), FI_NEED_CP);
171 } else {
172 /* if there is no written node page, write its inode page */
173 while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
174 ret = f2fs_write_inode(inode, NULL);
175 if (ret)
176 goto out;
177 }
178 filemap_fdatawait_range(sbi->node_inode->i_mapping,
179 0, LONG_MAX);
180 }
181out:
182 mutex_unlock(&inode->i_mutex);
183 return ret;
184}
185
186static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
187{
188 file_accessed(file);
189 vma->vm_ops = &f2fs_file_vm_ops;
190 return 0;
191}
192
193static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
194{
195 int nr_free = 0, ofs = dn->ofs_in_node;
196 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
197 struct f2fs_node *raw_node;
198 __le32 *addr;
199
200 raw_node = page_address(dn->node_page);
201 addr = blkaddr_in_node(raw_node) + ofs;
202
203 for ( ; count > 0; count--, addr++, dn->ofs_in_node++) {
204 block_t blkaddr = le32_to_cpu(*addr);
205 if (blkaddr == NULL_ADDR)
206 continue;
207
208 update_extent_cache(NULL_ADDR, dn);
209 invalidate_blocks(sbi, blkaddr);
210 dec_valid_block_count(sbi, dn->inode, 1);
211 nr_free++;
212 }
213 if (nr_free) {
214 set_page_dirty(dn->node_page);
215 sync_inode_page(dn);
216 }
217 dn->ofs_in_node = ofs;
218 return nr_free;
219}
220
221void truncate_data_blocks(struct dnode_of_data *dn)
222{
223 truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
224}
225
226static void truncate_partial_data_page(struct inode *inode, u64 from)
227{
228 unsigned offset = from & (PAGE_CACHE_SIZE - 1);
229 struct page *page;
230
231 if (!offset)
232 return;
233
234 page = find_data_page(inode, from >> PAGE_CACHE_SHIFT);
235 if (IS_ERR(page))
236 return;
237
238 lock_page(page);
239 wait_on_page_writeback(page);
240 zero_user(page, offset, PAGE_CACHE_SIZE - offset);
241 set_page_dirty(page);
242 f2fs_put_page(page, 1);
243}
244
245static int truncate_blocks(struct inode *inode, u64 from)
246{
247 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
248 unsigned int blocksize = inode->i_sb->s_blocksize;
249 struct dnode_of_data dn;
250 pgoff_t free_from;
251 int count = 0;
252 int err;
253
254 free_from = (pgoff_t)
255 ((from + blocksize - 1) >> (sbi->log_blocksize));
256
257 mutex_lock_op(sbi, DATA_TRUNC);
258
259 set_new_dnode(&dn, inode, NULL, NULL, 0);
260 err = get_dnode_of_data(&dn, free_from, RDONLY_NODE);
261 if (err) {
262 if (err == -ENOENT)
263 goto free_next;
264 mutex_unlock_op(sbi, DATA_TRUNC);
265 return err;
266 }
267
268 if (IS_INODE(dn.node_page))
269 count = ADDRS_PER_INODE;
270 else
271 count = ADDRS_PER_BLOCK;
272
273 count -= dn.ofs_in_node;
274 BUG_ON(count < 0);
275 if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
276 truncate_data_blocks_range(&dn, count);
277 free_from += count;
278 }
279
280 f2fs_put_dnode(&dn);
281free_next:
282 err = truncate_inode_blocks(inode, free_from);
283 mutex_unlock_op(sbi, DATA_TRUNC);
284
285 /* lastly zero out the first data page */
286 truncate_partial_data_page(inode, from);
287
288 return err;
289}
290
291void f2fs_truncate(struct inode *inode)
292{
293 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
294 S_ISLNK(inode->i_mode)))
295 return;
296
297 if (!truncate_blocks(inode, i_size_read(inode))) {
298 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
299 mark_inode_dirty(inode);
300 }
301
302 f2fs_balance_fs(F2FS_SB(inode->i_sb));
303}
304
305static int f2fs_getattr(struct vfsmount *mnt,
306 struct dentry *dentry, struct kstat *stat)
307{
308 struct inode *inode = dentry->d_inode;
309 generic_fillattr(inode, stat);
310	stat->blocks <<= 3;	/* 4KB blocks to 512-byte sectors */
311 return 0;
312}
313
314#ifdef CONFIG_F2FS_FS_POSIX_ACL
315static void __setattr_copy(struct inode *inode, const struct iattr *attr)
316{
317 struct f2fs_inode_info *fi = F2FS_I(inode);
318 unsigned int ia_valid = attr->ia_valid;
319
320 if (ia_valid & ATTR_UID)
321 inode->i_uid = attr->ia_uid;
322 if (ia_valid & ATTR_GID)
323 inode->i_gid = attr->ia_gid;
324 if (ia_valid & ATTR_ATIME)
325 inode->i_atime = timespec_trunc(attr->ia_atime,
326 inode->i_sb->s_time_gran);
327 if (ia_valid & ATTR_MTIME)
328 inode->i_mtime = timespec_trunc(attr->ia_mtime,
329 inode->i_sb->s_time_gran);
330 if (ia_valid & ATTR_CTIME)
331 inode->i_ctime = timespec_trunc(attr->ia_ctime,
332 inode->i_sb->s_time_gran);
333 if (ia_valid & ATTR_MODE) {
334 umode_t mode = attr->ia_mode;
335
336 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
337 mode &= ~S_ISGID;
338 set_acl_inode(fi, mode);
339 }
340}
341#else
342#define __setattr_copy setattr_copy
343#endif
344
345int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
346{
347 struct inode *inode = dentry->d_inode;
348 struct f2fs_inode_info *fi = F2FS_I(inode);
349 int err;
350
351 err = inode_change_ok(inode, attr);
352 if (err)
353 return err;
354
355 if ((attr->ia_valid & ATTR_SIZE) &&
356 attr->ia_size != i_size_read(inode)) {
357 truncate_setsize(inode, attr->ia_size);
358 f2fs_truncate(inode);
359 }
360
361 __setattr_copy(inode, attr);
362
363 if (attr->ia_valid & ATTR_MODE) {
364 err = f2fs_acl_chmod(inode);
365 if (err || is_inode_flag_set(fi, FI_ACL_MODE)) {
366 inode->i_mode = fi->i_acl_mode;
367 clear_inode_flag(fi, FI_ACL_MODE);
368 }
369 }
370
371 mark_inode_dirty(inode);
372 return err;
373}
374
375const struct inode_operations f2fs_file_inode_operations = {
376 .getattr = f2fs_getattr,
377 .setattr = f2fs_setattr,
378 .get_acl = f2fs_get_acl,
379#ifdef CONFIG_F2FS_FS_XATTR
380 .setxattr = generic_setxattr,
381 .getxattr = generic_getxattr,
382 .listxattr = f2fs_listxattr,
383 .removexattr = generic_removexattr,
384#endif
385};
386
387static void fill_zero(struct inode *inode, pgoff_t index,
388 loff_t start, loff_t len)
389{
390 struct page *page;
391
392 if (!len)
393 return;
394
395 page = get_new_data_page(inode, index, false);
396
397 if (!IS_ERR(page)) {
398 wait_on_page_writeback(page);
399 zero_user(page, start, len);
400 set_page_dirty(page);
401 f2fs_put_page(page, 1);
402 }
403}
404
405int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
406{
407 pgoff_t index;
408 int err;
409
410 for (index = pg_start; index < pg_end; index++) {
411 struct dnode_of_data dn;
412 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
413
414 f2fs_balance_fs(sbi);
415
416 mutex_lock_op(sbi, DATA_TRUNC);
417 set_new_dnode(&dn, inode, NULL, NULL, 0);
418 err = get_dnode_of_data(&dn, index, RDONLY_NODE);
419 if (err) {
420 mutex_unlock_op(sbi, DATA_TRUNC);
421 if (err == -ENOENT)
422 continue;
423 return err;
424 }
425
426 if (dn.data_blkaddr != NULL_ADDR)
427 truncate_data_blocks_range(&dn, 1);
428 f2fs_put_dnode(&dn);
429 mutex_unlock_op(sbi, DATA_TRUNC);
430 }
431 return 0;
432}
433
434static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
435{
436 pgoff_t pg_start, pg_end;
437 loff_t off_start, off_end;
438 int ret = 0;
439
440 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
441 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
442
443 off_start = offset & (PAGE_CACHE_SIZE - 1);
444 off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
445
446 if (pg_start == pg_end) {
447 fill_zero(inode, pg_start, off_start,
448 off_end - off_start);
449 } else {
450 if (off_start)
451 fill_zero(inode, pg_start++, off_start,
452 PAGE_CACHE_SIZE - off_start);
453 if (off_end)
454 fill_zero(inode, pg_end, 0, off_end);
455
456 if (pg_start < pg_end) {
457 struct address_space *mapping = inode->i_mapping;
458 loff_t blk_start, blk_end;
459
460 blk_start = pg_start << PAGE_CACHE_SHIFT;
461 blk_end = pg_end << PAGE_CACHE_SHIFT;
462 truncate_inode_pages_range(mapping, blk_start,
463 blk_end - 1);
464 ret = truncate_hole(inode, pg_start, pg_end);
465 }
466 }
467
468 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
469 i_size_read(inode) <= (offset + len)) {
470 i_size_write(inode, offset);
471 mark_inode_dirty(inode);
472 }
473
474 return ret;
475}
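/*
 * Worked example (illustrative numbers): punching offset = 1000, len = 10000
 * with PAGE_CACHE_SIZE = 4096 gives
 *	pg_start = 0, pg_end = 2, off_start = 1000, off_end = 2808
 * so the tail of page 0 ([1000, 4096)) and the head of page 2 ([0, 2808))
 * are zeroed in place, while page 1 is dropped from the page cache and its
 * block is freed through truncate_hole(inode, 1, 2).
 */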
476
477static int expand_inode_data(struct inode *inode, loff_t offset,
478 loff_t len, int mode)
479{
480 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
481 pgoff_t index, pg_start, pg_end;
482 loff_t new_size = i_size_read(inode);
483 loff_t off_start, off_end;
484 int ret = 0;
485
486 ret = inode_newsize_ok(inode, (len + offset));
487 if (ret)
488 return ret;
489
490 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
491 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
492
493 off_start = offset & (PAGE_CACHE_SIZE - 1);
494 off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
495
496 for (index = pg_start; index <= pg_end; index++) {
497 struct dnode_of_data dn;
498
499 mutex_lock_op(sbi, DATA_NEW);
500
501 set_new_dnode(&dn, inode, NULL, NULL, 0);
502 ret = get_dnode_of_data(&dn, index, 0);
503 if (ret) {
504 mutex_unlock_op(sbi, DATA_NEW);
505 break;
506 }
507
508 if (dn.data_blkaddr == NULL_ADDR) {
509 ret = reserve_new_block(&dn);
510 if (ret) {
511 f2fs_put_dnode(&dn);
512 mutex_unlock_op(sbi, DATA_NEW);
513 break;
514 }
515 }
516 f2fs_put_dnode(&dn);
517
518 mutex_unlock_op(sbi, DATA_NEW);
519
520 if (pg_start == pg_end)
521 new_size = offset + len;
522 else if (index == pg_start && off_start)
523 new_size = (index + 1) << PAGE_CACHE_SHIFT;
524 else if (index == pg_end)
525 new_size = (index << PAGE_CACHE_SHIFT) + off_end;
526 else
527 new_size += PAGE_CACHE_SIZE;
528 }
529
530 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
531 i_size_read(inode) < new_size) {
532 i_size_write(inode, new_size);
533 mark_inode_dirty(inode);
534 }
535
536 return ret;
537}
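/*
 * Worked example (illustrative numbers): preallocating offset = 5000,
 * len = 10000 with PAGE_CACHE_SIZE = 4096 walks pages 1..3:
 *	index 1 (pg_start, off_start = 904) -> new_size = 2 << 12  = 8192
 *	index 2                             -> new_size += 4096    = 12288
 *	index 3 (pg_end, off_end = 2712)    -> new_size = (3 << 12) + 2712
 * which lands exactly on offset + len = 15000.
 */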
538
539static long f2fs_fallocate(struct file *file, int mode,
540 loff_t offset, loff_t len)
541{
542 struct inode *inode = file->f_path.dentry->d_inode;
543 long ret;
544
545 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
546 return -EOPNOTSUPP;
547
548 if (mode & FALLOC_FL_PUNCH_HOLE)
549 ret = punch_hole(inode, offset, len, mode);
550 else
551 ret = expand_inode_data(inode, offset, len, mode);
552
553 if (!ret) {
554 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
555 mark_inode_dirty(inode);
556 }
557 return ret;
558}
559
560#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
561#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
562
563static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
564{
565 if (S_ISDIR(mode))
566 return flags;
567 else if (S_ISREG(mode))
568 return flags & F2FS_REG_FLMASK;
569 else
570 return flags & F2FS_OTHER_FLMASK;
571}
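/*
 * Illustrative effect of the masks above (not spelled out in the source):
 * for a regular file, FS_IOC_SETFLAGS keeps everything except the
 * directory-only flags FS_DIRSYNC_FL and FS_TOPDIR_FL; for special files
 * (devices, fifos, sockets), only FS_NODUMP_FL and FS_NOATIME_FL survive.
 */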
572
573long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
574{
575 struct inode *inode = filp->f_dentry->d_inode;
576 struct f2fs_inode_info *fi = F2FS_I(inode);
577 unsigned int flags;
578 int ret;
579
580 switch (cmd) {
581 case FS_IOC_GETFLAGS:
582 flags = fi->i_flags & FS_FL_USER_VISIBLE;
583 return put_user(flags, (int __user *) arg);
584 case FS_IOC_SETFLAGS:
585 {
586 unsigned int oldflags;
587
588 ret = mnt_want_write(filp->f_path.mnt);
589 if (ret)
590 return ret;
591
592 if (!inode_owner_or_capable(inode)) {
593 ret = -EACCES;
594 goto out;
595 }
596
597 if (get_user(flags, (int __user *) arg)) {
598 ret = -EFAULT;
599 goto out;
600 }
601
602 flags = f2fs_mask_flags(inode->i_mode, flags);
603
604 mutex_lock(&inode->i_mutex);
605
606 oldflags = fi->i_flags;
607
608 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
609 if (!capable(CAP_LINUX_IMMUTABLE)) {
610 mutex_unlock(&inode->i_mutex);
611 ret = -EPERM;
612 goto out;
613 }
614 }
615
616 flags = flags & FS_FL_USER_MODIFIABLE;
617 flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
618 fi->i_flags = flags;
619 mutex_unlock(&inode->i_mutex);
620
621 f2fs_set_inode_flags(inode);
622 inode->i_ctime = CURRENT_TIME;
623 mark_inode_dirty(inode);
624out:
625 mnt_drop_write(filp->f_path.mnt);
626 return ret;
627 }
628 default:
629 return -ENOTTY;
630 }
631}
632
633const struct file_operations f2fs_file_operations = {
634 .llseek = generic_file_llseek,
635 .read = do_sync_read,
636 .write = do_sync_write,
637 .aio_read = generic_file_aio_read,
638 .aio_write = generic_file_aio_write,
639 .open = generic_file_open,
640 .mmap = f2fs_file_mmap,
641 .fsync = f2fs_sync_file,
642 .fallocate = f2fs_fallocate,
643 .unlocked_ioctl = f2fs_ioctl,
644 .splice_read = generic_file_splice_read,
645 .splice_write = generic_file_splice_write,
646};
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
new file mode 100644
index 000000000000..c386910dacc5
--- /dev/null
+++ b/fs/f2fs/gc.c
@@ -0,0 +1,716 @@
1/*
2 * fs/f2fs/gc.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/module.h>
13#include <linux/backing-dev.h>
14#include <linux/proc_fs.h>
15#include <linux/init.h>
16#include <linux/f2fs_fs.h>
17#include <linux/kthread.h>
18#include <linux/delay.h>
19#include <linux/freezer.h>
20#include <linux/blkdev.h>
21
22#include "f2fs.h"
23#include "node.h"
24#include "segment.h"
25#include "gc.h"
26
27static struct kmem_cache *winode_slab;
28
29static int gc_thread_func(void *data)
30{
31 struct f2fs_sb_info *sbi = data;
32 wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
33 long wait_ms;
34
35 wait_ms = GC_THREAD_MIN_SLEEP_TIME;
36
37 do {
38 if (try_to_freeze())
39 continue;
40 else
41 wait_event_interruptible_timeout(*wq,
42 kthread_should_stop(),
43 msecs_to_jiffies(wait_ms));
44 if (kthread_should_stop())
45 break;
46
47 f2fs_balance_fs(sbi);
48
49 if (!test_opt(sbi, BG_GC))
50 continue;
51
52 /*
53 * [GC triggering condition]
54 * 0. GC is not conducted currently.
55 * 1. There are enough dirty segments.
56 * 2. IO subsystem is idle by checking the # of writeback pages.
57 * 3. IO subsystem is idle by checking the # of requests in
58 * bdev's request list.
59 *
60		 * Note) We have to avoid triggering GCs too frequently,
61		 * because some segments may be invalidated soon after
62		 * by user updates or deletions.
63		 * So we wait some time to let dirty segments accumulate.
64 */
65 if (!mutex_trylock(&sbi->gc_mutex))
66 continue;
67
68 if (!is_idle(sbi)) {
69 wait_ms = increase_sleep_time(wait_ms);
70 mutex_unlock(&sbi->gc_mutex);
71 continue;
72 }
73
74 if (has_enough_invalid_blocks(sbi))
75 wait_ms = decrease_sleep_time(wait_ms);
76 else
77 wait_ms = increase_sleep_time(wait_ms);
78
79 sbi->bg_gc++;
80
81 if (f2fs_gc(sbi) == GC_NONE)
82 wait_ms = GC_THREAD_NOGC_SLEEP_TIME;
83 else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME)
84 wait_ms = GC_THREAD_MAX_SLEEP_TIME;
85
86 } while (!kthread_should_stop());
87 return 0;
88}
89
90int start_gc_thread(struct f2fs_sb_info *sbi)
91{
92 struct f2fs_gc_kthread *gc_th;
93
94 gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
95 if (!gc_th)
96 return -ENOMEM;
97
98 sbi->gc_thread = gc_th;
99 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
100 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
101 GC_THREAD_NAME);
102 if (IS_ERR(gc_th->f2fs_gc_task)) {
103 kfree(gc_th);
104 return -ENOMEM;
105 }
106 return 0;
107}
108
109void stop_gc_thread(struct f2fs_sb_info *sbi)
110{
111 struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
112 if (!gc_th)
113 return;
114 kthread_stop(gc_th->f2fs_gc_task);
115 kfree(gc_th);
116 sbi->gc_thread = NULL;
117}
118
119static int select_gc_type(int gc_type)
120{
121 return (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
122}
123
124static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
125 int type, struct victim_sel_policy *p)
126{
127 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
128
129 if (p->alloc_mode) {
130 p->gc_mode = GC_GREEDY;
131 p->dirty_segmap = dirty_i->dirty_segmap[type];
132 p->ofs_unit = 1;
133 } else {
134 p->gc_mode = select_gc_type(gc_type);
135 p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
136 p->ofs_unit = sbi->segs_per_sec;
137 }
138 p->offset = sbi->last_victim[p->gc_mode];
139}
140
141static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
142 struct victim_sel_policy *p)
143{
144 if (p->gc_mode == GC_GREEDY)
145 return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
146 else if (p->gc_mode == GC_CB)
147 return UINT_MAX;
148 else /* No other gc_mode */
149 return 0;
150}
151
152static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
153{
154 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
155 unsigned int segno;
156
157 /*
158	 * If the gc_type is FG_GC, we can reuse the victim segments
159	 * previously selected by background GC.
160	 * Those segments are guaranteed to have few valid blocks.
161 */
162 segno = find_next_bit(dirty_i->victim_segmap[BG_GC],
163 TOTAL_SEGS(sbi), 0);
164 if (segno < TOTAL_SEGS(sbi)) {
165 clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
166 return segno;
167 }
168 return NULL_SEGNO;
169}
170
171static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
172{
173 struct sit_info *sit_i = SIT_I(sbi);
174 unsigned int secno = GET_SECNO(sbi, segno);
175 unsigned int start = secno * sbi->segs_per_sec;
176 unsigned long long mtime = 0;
177 unsigned int vblocks;
178 unsigned char age = 0;
179 unsigned char u;
180 unsigned int i;
181
182 for (i = 0; i < sbi->segs_per_sec; i++)
183 mtime += get_seg_entry(sbi, start + i)->mtime;
184 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
185
186 mtime = div_u64(mtime, sbi->segs_per_sec);
187 vblocks = div_u64(vblocks, sbi->segs_per_sec);
188
189 u = (vblocks * 100) >> sbi->log_blocks_per_seg;
190
191 /* Handle if the system time is changed by user */
192 if (mtime < sit_i->min_mtime)
193 sit_i->min_mtime = mtime;
194 if (mtime > sit_i->max_mtime)
195 sit_i->max_mtime = mtime;
196 if (sit_i->max_mtime != sit_i->min_mtime)
197 age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
198 sit_i->max_mtime - sit_i->min_mtime);
199
200 return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
201}
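/*
 * Worked example (illustrative numbers): for a section with utilization
 * u = 20 (20% valid blocks) and normalized age = 50, the cost-benefit value
 * is
 *	100 * (100 - u) * age / (100 + u) = 100 * 80 * 50 / 120 = 3333
 * and get_cb_cost() returns UINT_MAX - 3333. A colder (older) and emptier
 * section yields a larger benefit, hence a smaller returned cost, and the
 * victim search below keeps the minimum-cost candidate.
 */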
202
203static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno,
204 struct victim_sel_policy *p)
205{
206 if (p->alloc_mode == SSR)
207 return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
208
209 /* alloc_mode == LFS */
210 if (p->gc_mode == GC_GREEDY)
211 return get_valid_blocks(sbi, segno, sbi->segs_per_sec);
212 else
213 return get_cb_cost(sbi, segno);
214}
215
216/*
217 * This function is called from two paths.
218 * One is garbage collection and the other is SSR segment selection.
219 * When it is called during GC, it just gets a victim segment
220 * and does not remove it from the dirty seglist.
221 * When it is called from SSR segment selection, it finds a segment
222 * which has minimum valid blocks and removes it from dirty seglist.
223 */
224static int get_victim_by_default(struct f2fs_sb_info *sbi,
225 unsigned int *result, int gc_type, int type, char alloc_mode)
226{
227 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
228 struct victim_sel_policy p;
229 unsigned int segno;
230 int nsearched = 0;
231
232 p.alloc_mode = alloc_mode;
233 select_policy(sbi, gc_type, type, &p);
234
235 p.min_segno = NULL_SEGNO;
236 p.min_cost = get_max_cost(sbi, &p);
237
238 mutex_lock(&dirty_i->seglist_lock);
239
240 if (p.alloc_mode == LFS && gc_type == FG_GC) {
241 p.min_segno = check_bg_victims(sbi);
242 if (p.min_segno != NULL_SEGNO)
243 goto got_it;
244 }
245
246 while (1) {
247 unsigned long cost;
248
249 segno = find_next_bit(p.dirty_segmap,
250 TOTAL_SEGS(sbi), p.offset);
251 if (segno >= TOTAL_SEGS(sbi)) {
252 if (sbi->last_victim[p.gc_mode]) {
253 sbi->last_victim[p.gc_mode] = 0;
254 p.offset = 0;
255 continue;
256 }
257 break;
258 }
259 p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit;
260
261 if (test_bit(segno, dirty_i->victim_segmap[FG_GC]))
262 continue;
263 if (gc_type == BG_GC &&
264 test_bit(segno, dirty_i->victim_segmap[BG_GC]))
265 continue;
266 if (IS_CURSEC(sbi, GET_SECNO(sbi, segno)))
267 continue;
268
269 cost = get_gc_cost(sbi, segno, &p);
270
271 if (p.min_cost > cost) {
272 p.min_segno = segno;
273 p.min_cost = cost;
274 }
275
276 if (cost == get_max_cost(sbi, &p))
277 continue;
278
279 if (nsearched++ >= MAX_VICTIM_SEARCH) {
280 sbi->last_victim[p.gc_mode] = segno;
281 break;
282 }
283 }
284got_it:
285 if (p.min_segno != NULL_SEGNO) {
286 *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
287 if (p.alloc_mode == LFS) {
288 int i;
289 for (i = 0; i < p.ofs_unit; i++)
290 set_bit(*result + i,
291 dirty_i->victim_segmap[gc_type]);
292 }
293 }
294 mutex_unlock(&dirty_i->seglist_lock);
295
296 return (p.min_segno == NULL_SEGNO) ? 0 : 1;
297}
298
299static const struct victim_selection default_v_ops = {
300 .get_victim = get_victim_by_default,
301};
302
303static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
304{
305 struct list_head *this;
306 struct inode_entry *ie;
307
308 list_for_each(this, ilist) {
309 ie = list_entry(this, struct inode_entry, list);
310 if (ie->inode->i_ino == ino)
311 return ie->inode;
312 }
313 return NULL;
314}
315
316static void add_gc_inode(struct inode *inode, struct list_head *ilist)
317{
318 struct list_head *this;
319 struct inode_entry *new_ie, *ie;
320
321 list_for_each(this, ilist) {
322 ie = list_entry(this, struct inode_entry, list);
323 if (ie->inode == inode) {
324 iput(inode);
325 return;
326 }
327 }
328repeat:
329 new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
330 if (!new_ie) {
331 cond_resched();
332 goto repeat;
333 }
334 new_ie->inode = inode;
335 list_add_tail(&new_ie->list, ilist);
336}
337
338static void put_gc_inode(struct list_head *ilist)
339{
340 struct inode_entry *ie, *next_ie;
341 list_for_each_entry_safe(ie, next_ie, ilist, list) {
342 iput(ie->inode);
343 list_del(&ie->list);
344 kmem_cache_free(winode_slab, ie);
345 }
346}
347
348static int check_valid_map(struct f2fs_sb_info *sbi,
349 unsigned int segno, int offset)
350{
351 struct sit_info *sit_i = SIT_I(sbi);
352 struct seg_entry *sentry;
353 int ret;
354
355 mutex_lock(&sit_i->sentry_lock);
356 sentry = get_seg_entry(sbi, segno);
357 ret = f2fs_test_bit(offset, sentry->cur_valid_map);
358 mutex_unlock(&sit_i->sentry_lock);
359 return ret ? GC_OK : GC_NEXT;
360}
361
362/*
363 * This function compares the node address recorded in the summary with that
364 * in the NAT. If valid, the node is copied with cold status; otherwise (an
365 * invalid node), it is ignored.
366 */
367static int gc_node_segment(struct f2fs_sb_info *sbi,
368 struct f2fs_summary *sum, unsigned int segno, int gc_type)
369{
370 bool initial = true;
371 struct f2fs_summary *entry;
372 int off;
373
374next_step:
375 entry = sum;
376 for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
377 nid_t nid = le32_to_cpu(entry->nid);
378 struct page *node_page;
379 int err;
380
381 /*
382		 * Make sure there are enough free segments to write
383		 * out all the dirty node pages before the next CP.
384		 * So check the space required for the dirty node pages.
385 */
386 if (should_do_checkpoint(sbi)) {
387 mutex_lock(&sbi->cp_mutex);
388 block_operations(sbi);
389 return GC_BLOCKED;
390 }
391
392 err = check_valid_map(sbi, segno, off);
393 if (err == GC_NEXT)
394 continue;
395
396 if (initial) {
397 ra_node_page(sbi, nid);
398 continue;
399 }
400 node_page = get_node_page(sbi, nid);
401 if (IS_ERR(node_page))
402 continue;
403
404 /* set page dirty and write it */
405 if (!PageWriteback(node_page))
406 set_page_dirty(node_page);
407 f2fs_put_page(node_page, 1);
408 stat_inc_node_blk_count(sbi, 1);
409 }
410 if (initial) {
411 initial = false;
412 goto next_step;
413 }
414
415 if (gc_type == FG_GC) {
416 struct writeback_control wbc = {
417 .sync_mode = WB_SYNC_ALL,
418 .nr_to_write = LONG_MAX,
419 .for_reclaim = 0,
420 };
421 sync_node_pages(sbi, 0, &wbc);
422 }
423 return GC_DONE;
424}
425
426/*
427 * Calculate the start block index that the given node offset indicates.
428 * Be careful: the caller must pass only node offsets that indicate direct
429 * node blocks. If an offset pointing to any other type of node block, such
430 * as an indirect or double indirect node block, is given, it is a caller's
431 * bug.
432 */
433block_t start_bidx_of_node(unsigned int node_ofs)
434{
435 unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
436 unsigned int bidx;
437
438 if (node_ofs == 0)
439 return 0;
440
441 if (node_ofs <= 2) {
442 bidx = node_ofs - 1;
443 } else if (node_ofs <= indirect_blks) {
444 int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
445 bidx = node_ofs - 2 - dec;
446 } else {
447 int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
448 bidx = node_ofs - 5 - dec;
449 }
450 return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE;
451}
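/*
 * Worked example (illustrative, assuming NIDS_PER_BLOCK = 1018 so that
 * indirect_blks = 2 * 1018 + 4 = 2040):
 *	node_ofs = 1 (1st direct node)            -> bidx = 0
 *	node_ofs = 2 (2nd direct node)            -> bidx = 1
 *	node_ofs = 4 (1st node under an indirect) -> dec = 0, bidx = 4 - 2 = 2
 * The returned file block index is bidx * ADDRS_PER_BLOCK plus the
 * ADDRS_PER_INODE addresses stored in the inode block itself.
 */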
452
453static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
454 struct node_info *dni, block_t blkaddr, unsigned int *nofs)
455{
456 struct page *node_page;
457 nid_t nid;
458 unsigned int ofs_in_node;
459 block_t source_blkaddr;
460
461 nid = le32_to_cpu(sum->nid);
462 ofs_in_node = le16_to_cpu(sum->ofs_in_node);
463
464 node_page = get_node_page(sbi, nid);
465 if (IS_ERR(node_page))
466 return GC_NEXT;
467
468 get_node_info(sbi, nid, dni);
469
470 if (sum->version != dni->version) {
471 f2fs_put_page(node_page, 1);
472 return GC_NEXT;
473 }
474
475 *nofs = ofs_of_node(node_page);
476 source_blkaddr = datablock_addr(node_page, ofs_in_node);
477 f2fs_put_page(node_page, 1);
478
479 if (source_blkaddr != blkaddr)
480 return GC_NEXT;
481 return GC_OK;
482}
483
484static void move_data_page(struct inode *inode, struct page *page, int gc_type)
485{
486 if (page->mapping != inode->i_mapping)
487 goto out;
488
489 if (inode != page->mapping->host)
490 goto out;
491
492 if (PageWriteback(page))
493 goto out;
494
495 if (gc_type == BG_GC) {
496 set_page_dirty(page);
497 set_cold_data(page);
498 } else {
499 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
500 mutex_lock_op(sbi, DATA_WRITE);
501 if (clear_page_dirty_for_io(page) &&
502 S_ISDIR(inode->i_mode)) {
503 dec_page_count(sbi, F2FS_DIRTY_DENTS);
504 inode_dec_dirty_dents(inode);
505 }
506 set_cold_data(page);
507 do_write_data_page(page);
508 mutex_unlock_op(sbi, DATA_WRITE);
509 clear_cold_data(page);
510 }
511out:
512 f2fs_put_page(page, 1);
513}
514
515/*
516 * This function tries to get the parent node of the victim data block, and
517 * checks the data block's validity. If the block is valid, it is copied with
518 * cold status and the parent node is modified.
519 * If the parent node is not valid or the data block address differs, the
520 * victim data block is ignored.
521 */
522static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
523 struct list_head *ilist, unsigned int segno, int gc_type)
524{
525 struct super_block *sb = sbi->sb;
526 struct f2fs_summary *entry;
527 block_t start_addr;
528 int err, off;
529 int phase = 0;
530
531 start_addr = START_BLOCK(sbi, segno);
532
533next_step:
534 entry = sum;
535 for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
536 struct page *data_page;
537 struct inode *inode;
538 struct node_info dni; /* dnode info for the data */
539 unsigned int ofs_in_node, nofs;
540 block_t start_bidx;
541
542 /*
543		 * Make sure there are enough free segments to write
544		 * out all the dirty node pages before the next CP.
545		 * So check the space required for the dirty node pages.
546 */
547 if (should_do_checkpoint(sbi)) {
548 mutex_lock(&sbi->cp_mutex);
549 block_operations(sbi);
550 err = GC_BLOCKED;
551 goto stop;
552 }
553
554 err = check_valid_map(sbi, segno, off);
555 if (err == GC_NEXT)
556 continue;
557
558 if (phase == 0) {
559 ra_node_page(sbi, le32_to_cpu(entry->nid));
560 continue;
561 }
562
563 /* Get an inode by ino with checking validity */
564 err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs);
565 if (err == GC_NEXT)
566 continue;
567
568 if (phase == 1) {
569 ra_node_page(sbi, dni.ino);
570 continue;
571 }
572
573 start_bidx = start_bidx_of_node(nofs);
574 ofs_in_node = le16_to_cpu(entry->ofs_in_node);
575
576 if (phase == 2) {
577 inode = f2fs_iget_nowait(sb, dni.ino);
578 if (IS_ERR(inode))
579 continue;
580
581 data_page = find_data_page(inode,
582 start_bidx + ofs_in_node);
583 if (IS_ERR(data_page))
584 goto next_iput;
585
586 f2fs_put_page(data_page, 0);
587 add_gc_inode(inode, ilist);
588 } else {
589 inode = find_gc_inode(dni.ino, ilist);
590 if (inode) {
591 data_page = get_lock_data_page(inode,
592 start_bidx + ofs_in_node);
593 if (IS_ERR(data_page))
594 continue;
595 move_data_page(inode, data_page, gc_type);
596 stat_inc_data_blk_count(sbi, 1);
597 }
598 }
599 continue;
600next_iput:
601 iput(inode);
602 }
603 if (++phase < 4)
604 goto next_step;
605 err = GC_DONE;
606stop:
607 if (gc_type == FG_GC)
608 f2fs_submit_bio(sbi, DATA, true);
609 return err;
610}
611
612static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
613 int gc_type, int type)
614{
615 struct sit_info *sit_i = SIT_I(sbi);
616 int ret;
617 mutex_lock(&sit_i->sentry_lock);
618 ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, type, LFS);
619 mutex_unlock(&sit_i->sentry_lock);
620 return ret;
621}
622
623static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
624 struct list_head *ilist, int gc_type)
625{
626 struct page *sum_page;
627 struct f2fs_summary_block *sum;
628 int ret = GC_DONE;
629
630 /* read segment summary of victim */
631 sum_page = get_sum_page(sbi, segno);
632 if (IS_ERR(sum_page))
633 return GC_ERROR;
634
635 /*
636	 * CP needs to lock sum_page. At this point, we don't need to lock
637	 * this page, because the summary page is not going anywhere.
638	 * Also, it will not be updated before GC is done.
639 */
640 unlock_page(sum_page);
641 sum = page_address(sum_page);
642
643 switch (GET_SUM_TYPE((&sum->footer))) {
644 case SUM_TYPE_NODE:
645 ret = gc_node_segment(sbi, sum->entries, segno, gc_type);
646 break;
647 case SUM_TYPE_DATA:
648 ret = gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
649 break;
650 }
651 stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
652 stat_inc_call_count(sbi->stat_info);
653
654 f2fs_put_page(sum_page, 0);
655 return ret;
656}
657
658int f2fs_gc(struct f2fs_sb_info *sbi)
659{
660 struct list_head ilist;
661 unsigned int segno, i;
662 int gc_type = BG_GC;
663 int gc_status = GC_NONE;
664
665 INIT_LIST_HEAD(&ilist);
666gc_more:
667 if (!(sbi->sb->s_flags & MS_ACTIVE))
668 goto stop;
669
670 if (has_not_enough_free_secs(sbi))
671 gc_type = FG_GC;
672
673 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
674 goto stop;
675
676 for (i = 0; i < sbi->segs_per_sec; i++) {
677		/*
678		 * do_garbage_collect() returns one of three gc_status
679		 * values: GC_ERROR, GC_DONE, or GC_BLOCKED.
680		 * If GC finished uncleanly, we have to return
681		 * the victim to the dirty segment list.
682		 */
683 gc_status = do_garbage_collect(sbi, segno + i, &ilist, gc_type);
684 if (gc_status != GC_DONE)
685 break;
686 }
687 if (has_not_enough_free_secs(sbi)) {
688 write_checkpoint(sbi, (gc_status == GC_BLOCKED), false);
689 if (has_not_enough_free_secs(sbi))
690 goto gc_more;
691 }
692stop:
693 mutex_unlock(&sbi->gc_mutex);
694
695 put_gc_inode(&ilist);
696 return gc_status;
697}
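/*
 * Illustrative caller, not part of this patch: foreground callers are
 * expected to take gc_mutex themselves before calling f2fs_gc(), which
 * releases the mutex on exit (see the unlock above). A minimal sketch:
 */
static void balance_fs_sketch(struct f2fs_sb_info *sbi)
{
	if (has_not_enough_free_secs(sbi)) {
		mutex_lock(&sbi->gc_mutex);
		f2fs_gc(sbi);	/* drops gc_mutex before returning */
	}
}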
698
699void build_gc_manager(struct f2fs_sb_info *sbi)
700{
701 DIRTY_I(sbi)->v_ops = &default_v_ops;
702}
703
704int __init create_gc_caches(void)
705{
706 winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
707 sizeof(struct inode_entry), NULL);
708 if (!winode_slab)
709 return -ENOMEM;
710 return 0;
711}
712
713void destroy_gc_caches(void)
714{
715 kmem_cache_destroy(winode_slab);
716}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
new file mode 100644
index 000000000000..b026d9354ccd
--- /dev/null
+++ b/fs/f2fs/gc.h
@@ -0,0 +1,117 @@
1/*
2 * fs/f2fs/gc.h
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#define GC_THREAD_NAME "f2fs_gc_task"
12#define GC_THREAD_MIN_WB_PAGES 1 /*
13 * a threshold to determine
14 * whether IO subsystem is idle
15 * or not
16 */
17#define GC_THREAD_MIN_SLEEP_TIME 10000 /* milliseconds */
18#define GC_THREAD_MAX_SLEEP_TIME 30000
19#define GC_THREAD_NOGC_SLEEP_TIME 10000
20#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
21#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
22
23/* Max. number of dirty segments to search when selecting a victim segment */
24#define MAX_VICTIM_SEARCH 20
25
26enum {
27 GC_NONE = 0,
28 GC_ERROR,
29 GC_OK,
30 GC_NEXT,
31 GC_BLOCKED,
32 GC_DONE,
33};
34
35struct f2fs_gc_kthread {
36 struct task_struct *f2fs_gc_task;
37 wait_queue_head_t gc_wait_queue_head;
38};
39
40struct inode_entry {
41 struct list_head list;
42 struct inode *inode;
43};
44
45/*
46 * inline functions
47 */
48static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
49{
50 if (free_segments(sbi) < overprovision_segments(sbi))
51 return 0;
52 else
53 return (free_segments(sbi) - overprovision_segments(sbi))
54 << sbi->log_blocks_per_seg;
55}
56
57static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi)
58{
59 return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100;
60}
61
62static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
63{
64 block_t reclaimable_user_blocks = sbi->user_block_count -
65 written_block_count(sbi);
66 return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
67}
68
69static inline long increase_sleep_time(long wait)
70{
71 wait += GC_THREAD_MIN_SLEEP_TIME;
72 if (wait > GC_THREAD_MAX_SLEEP_TIME)
73 wait = GC_THREAD_MAX_SLEEP_TIME;
74 return wait;
75}
76
77static inline long decrease_sleep_time(long wait)
78{
79 wait -= GC_THREAD_MIN_SLEEP_TIME;
80 if (wait <= GC_THREAD_MIN_SLEEP_TIME)
81 wait = GC_THREAD_MIN_SLEEP_TIME;
82 return wait;
83}
84
85static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
86{
87 block_t invalid_user_blocks = sbi->user_block_count -
88 written_block_count(sbi);
89	/*
90	 * Background GC is triggered when the following conditions hold:
91	 * 1. enough invalid blocks have accumulated, and
92	 * 2. there is not enough free space left.
93	 */
94 if (invalid_user_blocks > limit_invalid_user_blocks(sbi) &&
95 free_user_blocks(sbi) < limit_free_user_blocks(sbi))
96 return true;
97 return false;
98}
99
100static inline int is_idle(struct f2fs_sb_info *sbi)
101{
102 struct block_device *bdev = sbi->sb->s_bdev;
103 struct request_queue *q = bdev_get_queue(bdev);
104 struct request_list *rl = &q->root_rl;
105 return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]);
106}
107
108static inline bool should_do_checkpoint(struct f2fs_sb_info *sbi)
109{
110 unsigned int pages_per_sec = sbi->segs_per_sec *
111 (1 << sbi->log_blocks_per_seg);
112 int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
113 >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
114 int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
115 >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
116 return free_sections(sbi) <= (node_secs + 2 * dent_secs + 2);
117}
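/*
 * Illustrative sketch, not part of this patch: how a background GC
 * kthread might combine the helpers above. The loop structure and the
 * msleep()/kthread_should_stop() plumbing are assumptions; only the
 * GC_THREAD_* constants and the inline helpers come from this header.
 */
static int gc_thread_sketch(void *data)
{
	struct f2fs_sb_info *sbi = data;
	long wait_ms = GC_THREAD_MIN_SLEEP_TIME;

	while (!kthread_should_stop()) {
		msleep(wait_ms);

		/* back off while the IO subsystem is busy */
		if (!is_idle(sbi)) {
			wait_ms = increase_sleep_time(wait_ms);
			continue;
		}

		/* speed up only once enough invalid blocks piled up */
		if (has_enough_invalid_blocks(sbi))
			wait_ms = decrease_sleep_time(wait_ms);
		else
			wait_ms = GC_THREAD_NOGC_SLEEP_TIME;

		if (mutex_trylock(&sbi->gc_mutex))
			f2fs_gc(sbi);	/* drops gc_mutex itself */
	}
	return 0;
}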
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
new file mode 100644
index 000000000000..6eb8d269b53b
--- /dev/null
+++ b/fs/f2fs/hash.c
@@ -0,0 +1,101 @@
1/*
2 * fs/f2fs/hash.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * Portions of this code from linux/fs/ext3/hash.c
8 *
9 * Copyright (C) 2002 by Theodore Ts'o
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 */
15#include <linux/types.h>
16#include <linux/fs.h>
17#include <linux/f2fs_fs.h>
18#include <linux/cryptohash.h>
19#include <linux/pagemap.h>
20
21#include "f2fs.h"
22
23/*
24 * Hashing code copied from ext3
25 */
26#define DELTA 0x9E3779B9
27
28static void TEA_transform(unsigned int buf[4], unsigned int const in[])
29{
30 __u32 sum = 0;
31 __u32 b0 = buf[0], b1 = buf[1];
32 __u32 a = in[0], b = in[1], c = in[2], d = in[3];
33 int n = 16;
34
35 do {
36 sum += DELTA;
37 b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
38 b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
39 } while (--n);
40
41 buf[0] += b0;
42 buf[1] += b1;
43}
44
45static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num)
46{
47 unsigned pad, val;
48 int i;
49
50 pad = (__u32)len | ((__u32)len << 8);
51 pad |= pad << 16;
52
53 val = pad;
54 if (len > num * 4)
55 len = num * 4;
56 for (i = 0; i < len; i++) {
57 if ((i % 4) == 0)
58 val = pad;
59 val = msg[i] + (val << 8);
60 if ((i % 4) == 3) {
61 *buf++ = val;
62 val = pad;
63 num--;
64 }
65 }
66 if (--num >= 0)
67 *buf++ = val;
68 while (--num >= 0)
69 *buf++ = pad;
70}
71
72f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len)
73{
74 __u32 hash;
75 f2fs_hash_t f2fs_hash;
76 const char *p;
77 __u32 in[8], buf[4];
78
79 if ((len <= 2) && (name[0] == '.') &&
80 (name[1] == '.' || name[1] == '\0'))
81 return 0;
82
83 /* Initialize the default seed for the hash checksum functions */
84 buf[0] = 0x67452301;
85 buf[1] = 0xefcdab89;
86 buf[2] = 0x98badcfe;
87 buf[3] = 0x10325476;
88
89 p = name;
90 while (1) {
91 str2hashbuf(p, len, in, 4);
92 TEA_transform(buf, in);
93 p += 16;
94 if (len <= 16)
95 break;
96 len -= 16;
97 }
98 hash = buf[0];
99 f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT);
100 return f2fs_hash;
101}
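/*
 * Illustrative usage, not part of this patch: a directory lookup
 * computes the dentry hash once and maps it to a bucket. The modulo
 * bucket selection below is a hypothetical simplification.
 */
static unsigned int pick_bucket_sketch(const char *name, size_t len,
					unsigned int nbuckets)
{
	f2fs_hash_t h = f2fs_dentry_hash(name, len);

	return le32_to_cpu(h) % nbuckets;
}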
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
new file mode 100644
index 000000000000..794241777322
--- /dev/null
+++ b/fs/f2fs/inode.c
@@ -0,0 +1,272 @@
1/*
2 * fs/f2fs/inode.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/buffer_head.h>
14#include <linux/writeback.h>
15
16#include "f2fs.h"
17#include "node.h"
18
19struct f2fs_iget_args {
20 u64 ino;
21 int on_free;
22};
23
24void f2fs_set_inode_flags(struct inode *inode)
25{
26 unsigned int flags = F2FS_I(inode)->i_flags;
27
28 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE |
29 S_NOATIME | S_DIRSYNC);
30
31 if (flags & FS_SYNC_FL)
32 inode->i_flags |= S_SYNC;
33 if (flags & FS_APPEND_FL)
34 inode->i_flags |= S_APPEND;
35 if (flags & FS_IMMUTABLE_FL)
36 inode->i_flags |= S_IMMUTABLE;
37 if (flags & FS_NOATIME_FL)
38 inode->i_flags |= S_NOATIME;
39 if (flags & FS_DIRSYNC_FL)
40 inode->i_flags |= S_DIRSYNC;
41}
42
43static int f2fs_iget_test(struct inode *inode, void *data)
44{
45 struct f2fs_iget_args *args = data;
46
47 if (inode->i_ino != args->ino)
48 return 0;
49 if (inode->i_state & (I_FREEING | I_WILL_FREE)) {
50 args->on_free = 1;
51 return 0;
52 }
53 return 1;
54}
55
56struct inode *f2fs_iget_nowait(struct super_block *sb, unsigned long ino)
57{
58 struct f2fs_iget_args args = {
59 .ino = ino,
60 .on_free = 0
61 };
62 struct inode *inode = ilookup5(sb, ino, f2fs_iget_test, &args);
63
64 if (inode)
65 return inode;
66 if (!args.on_free)
67 return f2fs_iget(sb, ino);
68 return ERR_PTR(-ENOENT);
69}
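/*
 * Illustrative usage, not part of this patch: callers such as GC use
 * this helper so they skip, rather than block on, an inode that is
 * concurrently being freed.
 */
static struct inode *grab_inode_sketch(struct super_block *sb, nid_t ino)
{
	struct inode *inode = f2fs_iget_nowait(sb, ino);

	if (IS_ERR(inode))	/* e.g. -ENOENT while I_FREEING */
		return NULL;
	return inode;
}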
70
71static int do_read_inode(struct inode *inode)
72{
73 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
74 struct f2fs_inode_info *fi = F2FS_I(inode);
75 struct page *node_page;
76 struct f2fs_node *rn;
77 struct f2fs_inode *ri;
78
79 /* Check if ino is within scope */
80 check_nid_range(sbi, inode->i_ino);
81
82 node_page = get_node_page(sbi, inode->i_ino);
83 if (IS_ERR(node_page))
84 return PTR_ERR(node_page);
85
86 rn = page_address(node_page);
87 ri = &(rn->i);
88
89 inode->i_mode = le16_to_cpu(ri->i_mode);
90 i_uid_write(inode, le32_to_cpu(ri->i_uid));
91 i_gid_write(inode, le32_to_cpu(ri->i_gid));
92 set_nlink(inode, le32_to_cpu(ri->i_links));
93 inode->i_size = le64_to_cpu(ri->i_size);
94 inode->i_blocks = le64_to_cpu(ri->i_blocks);
95
96 inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
97 inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime);
98 inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
99 inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
100 inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
101 inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
102 inode->i_generation = le32_to_cpu(ri->i_generation);
103
104 fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
105 fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
106 fi->i_flags = le32_to_cpu(ri->i_flags);
107 fi->flags = 0;
108 fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1;
109 fi->i_advise = ri->i_advise;
110 fi->i_pino = le32_to_cpu(ri->i_pino);
111 get_extent_info(&fi->ext, ri->i_ext);
112 f2fs_put_page(node_page, 1);
113 return 0;
114}
115
116struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
117{
118 struct f2fs_sb_info *sbi = F2FS_SB(sb);
119 struct inode *inode;
120 int ret;
121
122 inode = iget_locked(sb, ino);
123 if (!inode)
124 return ERR_PTR(-ENOMEM);
125 if (!(inode->i_state & I_NEW))
126 return inode;
127 if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
128 goto make_now;
129
130 ret = do_read_inode(inode);
131 if (ret)
132 goto bad_inode;
133
134 if (!sbi->por_doing && inode->i_nlink == 0) {
135 ret = -ENOENT;
136 goto bad_inode;
137 }
138
139make_now:
140 if (ino == F2FS_NODE_INO(sbi)) {
141 inode->i_mapping->a_ops = &f2fs_node_aops;
142 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
143 } else if (ino == F2FS_META_INO(sbi)) {
144 inode->i_mapping->a_ops = &f2fs_meta_aops;
145 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
146 } else if (S_ISREG(inode->i_mode)) {
147 inode->i_op = &f2fs_file_inode_operations;
148 inode->i_fop = &f2fs_file_operations;
149 inode->i_mapping->a_ops = &f2fs_dblock_aops;
150 } else if (S_ISDIR(inode->i_mode)) {
151 inode->i_op = &f2fs_dir_inode_operations;
152 inode->i_fop = &f2fs_dir_operations;
153 inode->i_mapping->a_ops = &f2fs_dblock_aops;
154 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE |
155 __GFP_ZERO);
156 } else if (S_ISLNK(inode->i_mode)) {
157 inode->i_op = &f2fs_symlink_inode_operations;
158 inode->i_mapping->a_ops = &f2fs_dblock_aops;
159 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
160 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
161 inode->i_op = &f2fs_special_inode_operations;
162 init_special_inode(inode, inode->i_mode, inode->i_rdev);
163 } else {
164 ret = -EIO;
165 goto bad_inode;
166 }
167 unlock_new_inode(inode);
168
169 return inode;
170
171bad_inode:
172 iget_failed(inode);
173 return ERR_PTR(ret);
174}
175
176void update_inode(struct inode *inode, struct page *node_page)
177{
178 struct f2fs_node *rn;
179 struct f2fs_inode *ri;
180
181 wait_on_page_writeback(node_page);
182
183 rn = page_address(node_page);
184 ri = &(rn->i);
185
186 ri->i_mode = cpu_to_le16(inode->i_mode);
187 ri->i_advise = F2FS_I(inode)->i_advise;
188 ri->i_uid = cpu_to_le32(i_uid_read(inode));
189 ri->i_gid = cpu_to_le32(i_gid_read(inode));
190 ri->i_links = cpu_to_le32(inode->i_nlink);
191 ri->i_size = cpu_to_le64(i_size_read(inode));
192 ri->i_blocks = cpu_to_le64(inode->i_blocks);
193 set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
194
195 ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
196 ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
197 ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
198 ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
199 ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
200 ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
201 ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth);
202 ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid);
203 ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
204 ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
205 ri->i_generation = cpu_to_le32(inode->i_generation);
206 set_cold_node(inode, node_page);
207 set_page_dirty(node_page);
208}
209
210int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
211{
212 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
213 struct page *node_page;
214 bool need_lock = false;
215
216 if (inode->i_ino == F2FS_NODE_INO(sbi) ||
217 inode->i_ino == F2FS_META_INO(sbi))
218 return 0;
219
220 if (wbc)
221 f2fs_balance_fs(sbi);
222
223 node_page = get_node_page(sbi, inode->i_ino);
224 if (IS_ERR(node_page))
225 return PTR_ERR(node_page);
226
227 if (!PageDirty(node_page)) {
228 need_lock = true;
229 f2fs_put_page(node_page, 1);
230 mutex_lock(&sbi->write_inode);
231 node_page = get_node_page(sbi, inode->i_ino);
232 if (IS_ERR(node_page)) {
233 mutex_unlock(&sbi->write_inode);
234 return PTR_ERR(node_page);
235 }
236 }
237 update_inode(inode, node_page);
238 f2fs_put_page(node_page, 1);
239 if (need_lock)
240 mutex_unlock(&sbi->write_inode);
241 return 0;
242}
243
244/*
245 * Called at the last iput() if i_nlink is zero
246 */
247void f2fs_evict_inode(struct inode *inode)
248{
249 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
250
251 truncate_inode_pages(&inode->i_data, 0);
252
253 if (inode->i_ino == F2FS_NODE_INO(sbi) ||
254 inode->i_ino == F2FS_META_INO(sbi))
255 goto no_delete;
256
257 BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents));
258 remove_dirty_dir_inode(inode);
259
260 if (inode->i_nlink || is_bad_inode(inode))
261 goto no_delete;
262
263 set_inode_flag(F2FS_I(inode), FI_NO_ALLOC);
264 i_size_write(inode, 0);
265
266 if (F2FS_HAS_BLOCKS(inode))
267 f2fs_truncate(inode);
268
269 remove_inode_page(inode);
270no_delete:
271 clear_inode(inode);
272}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
new file mode 100644
index 000000000000..1a49b881bac0
--- /dev/null
+++ b/fs/f2fs/namei.c
@@ -0,0 +1,503 @@
1/*
2 * fs/f2fs/namei.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/pagemap.h>
14#include <linux/sched.h>
15#include <linux/ctype.h>
16
17#include "f2fs.h"
18#include "xattr.h"
19#include "acl.h"
20
21static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
22{
23 struct super_block *sb = dir->i_sb;
24 struct f2fs_sb_info *sbi = F2FS_SB(sb);
25 nid_t ino;
26 struct inode *inode;
27 bool nid_free = false;
28 int err;
29
30 inode = new_inode(sb);
31 if (!inode)
32 return ERR_PTR(-ENOMEM);
33
34 mutex_lock_op(sbi, NODE_NEW);
35 if (!alloc_nid(sbi, &ino)) {
36 mutex_unlock_op(sbi, NODE_NEW);
37 err = -ENOSPC;
38 goto fail;
39 }
40 mutex_unlock_op(sbi, NODE_NEW);
41
42 inode->i_uid = current_fsuid();
43
44 if (dir->i_mode & S_ISGID) {
45 inode->i_gid = dir->i_gid;
46 if (S_ISDIR(mode))
47 mode |= S_ISGID;
48 } else {
49 inode->i_gid = current_fsgid();
50 }
51
52 inode->i_ino = ino;
53 inode->i_mode = mode;
54 inode->i_blocks = 0;
55 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
56 inode->i_generation = sbi->s_next_generation++;
57
58 err = insert_inode_locked(inode);
59 if (err) {
60 err = -EINVAL;
61 nid_free = true;
62 goto out;
63 }
64
65 mark_inode_dirty(inode);
66 return inode;
67
68out:
69 clear_nlink(inode);
70 unlock_new_inode(inode);
71fail:
72 iput(inode);
73 if (nid_free)
74 alloc_nid_failed(sbi, ino);
75 return ERR_PTR(err);
76}
77
78static int is_multimedia_file(const unsigned char *s, const char *sub)
79{
80 size_t slen = strlen(s);
81 size_t sublen = strlen(sub);
82 int ret;
83
84 if (sublen > slen)
85 return 1;
86
87 ret = memcmp(s + slen - sublen, sub, sublen);
88 if (ret) { /* compare upper case */
89 int i;
90 char upper_sub[8];
91 for (i = 0; i < sublen && i < sizeof(upper_sub); i++)
92 upper_sub[i] = toupper(sub[i]);
93 return memcmp(s + slen - sublen, upper_sub, sublen);
94 }
95
96 return ret;
97}
98
99/*
100 * Set multimedia files as cold files for hot/cold data separation
101 */
102static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode,
103 const unsigned char *name)
104{
105 int i;
106 __u8 (*extlist)[8] = sbi->raw_super->extension_list;
107
108 int count = le32_to_cpu(sbi->raw_super->extension_count);
109 for (i = 0; i < count; i++) {
110 if (!is_multimedia_file(name, extlist[i])) {
111 F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
112 break;
113 }
114 }
115}
116
117static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
118 bool excl)
119{
120 struct super_block *sb = dir->i_sb;
121 struct f2fs_sb_info *sbi = F2FS_SB(sb);
122 struct inode *inode;
123 nid_t ino = 0;
124 int err;
125
126 f2fs_balance_fs(sbi);
127
128 inode = f2fs_new_inode(dir, mode);
129 if (IS_ERR(inode))
130 return PTR_ERR(inode);
131
132 if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
133 set_cold_file(sbi, inode, dentry->d_name.name);
134
135 inode->i_op = &f2fs_file_inode_operations;
136 inode->i_fop = &f2fs_file_operations;
137 inode->i_mapping->a_ops = &f2fs_dblock_aops;
138 ino = inode->i_ino;
139
140 err = f2fs_add_link(dentry, inode);
141 if (err)
142 goto out;
143
144 alloc_nid_done(sbi, ino);
145
146 if (!sbi->por_doing)
147 d_instantiate(dentry, inode);
148 unlock_new_inode(inode);
149 return 0;
150out:
151 clear_nlink(inode);
152 unlock_new_inode(inode);
153 iput(inode);
154 alloc_nid_failed(sbi, ino);
155 return err;
156}
157
158static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
159 struct dentry *dentry)
160{
161 struct inode *inode = old_dentry->d_inode;
162 struct super_block *sb = dir->i_sb;
163 struct f2fs_sb_info *sbi = F2FS_SB(sb);
164 int err;
165
166 f2fs_balance_fs(sbi);
167
168 inode->i_ctime = CURRENT_TIME;
169 atomic_inc(&inode->i_count);
170
171 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
172 err = f2fs_add_link(dentry, inode);
173 if (err)
174 goto out;
175
176 d_instantiate(dentry, inode);
177 return 0;
178out:
179 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
180 iput(inode);
181 return err;
182}
183
184struct dentry *f2fs_get_parent(struct dentry *child)
185{
186 struct qstr dotdot = QSTR_INIT("..", 2);
187 unsigned long ino = f2fs_inode_by_name(child->d_inode, &dotdot);
188 if (!ino)
189 return ERR_PTR(-ENOENT);
190 return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino));
191}
192
193static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
194 unsigned int flags)
195{
196 struct inode *inode = NULL;
197 struct f2fs_dir_entry *de;
198 struct page *page;
199
200 if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
201 return ERR_PTR(-ENAMETOOLONG);
202
203 de = f2fs_find_entry(dir, &dentry->d_name, &page);
204 if (de) {
205 nid_t ino = le32_to_cpu(de->ino);
206 kunmap(page);
207 f2fs_put_page(page, 0);
208
209 inode = f2fs_iget(dir->i_sb, ino);
210 if (IS_ERR(inode))
211 return ERR_CAST(inode);
212 }
213
214 return d_splice_alias(inode, dentry);
215}
216
217static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
218{
219 struct super_block *sb = dir->i_sb;
220 struct f2fs_sb_info *sbi = F2FS_SB(sb);
221 struct inode *inode = dentry->d_inode;
222 struct f2fs_dir_entry *de;
223 struct page *page;
224 int err = -ENOENT;
225
226 f2fs_balance_fs(sbi);
227
228 de = f2fs_find_entry(dir, &dentry->d_name, &page);
229 if (!de)
230 goto fail;
231
232 err = check_orphan_space(sbi);
233 if (err) {
234 kunmap(page);
235 f2fs_put_page(page, 0);
236 goto fail;
237 }
238
239 f2fs_delete_entry(de, page, inode);
240
241 /* In order to evict this inode, we set it dirty */
242 mark_inode_dirty(inode);
243fail:
244 return err;
245}
246
247static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
248 const char *symname)
249{
250 struct super_block *sb = dir->i_sb;
251 struct f2fs_sb_info *sbi = F2FS_SB(sb);
252 struct inode *inode;
253 size_t symlen = strlen(symname) + 1;
254 int err;
255
256 f2fs_balance_fs(sbi);
257
258 inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
259 if (IS_ERR(inode))
260 return PTR_ERR(inode);
261
262 inode->i_op = &f2fs_symlink_inode_operations;
263 inode->i_mapping->a_ops = &f2fs_dblock_aops;
264
265 err = f2fs_add_link(dentry, inode);
266 if (err)
267 goto out;
268
269 err = page_symlink(inode, symname, symlen);
270 alloc_nid_done(sbi, inode->i_ino);
271
272 d_instantiate(dentry, inode);
273 unlock_new_inode(inode);
274 return err;
275out:
276 clear_nlink(inode);
277 unlock_new_inode(inode);
278 iput(inode);
279 alloc_nid_failed(sbi, inode->i_ino);
280 return err;
281}
282
283static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
284{
285 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
286 struct inode *inode;
287 int err;
288
289 f2fs_balance_fs(sbi);
290
291 inode = f2fs_new_inode(dir, S_IFDIR | mode);
292 if (IS_ERR(inode))
293 return PTR_ERR(inode);
294
295 inode->i_op = &f2fs_dir_inode_operations;
296 inode->i_fop = &f2fs_dir_operations;
297 inode->i_mapping->a_ops = &f2fs_dblock_aops;
298 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
299
300 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
301 err = f2fs_add_link(dentry, inode);
302 if (err)
303 goto out_fail;
304
305 alloc_nid_done(sbi, inode->i_ino);
306
307 d_instantiate(dentry, inode);
308 unlock_new_inode(inode);
309
310 return 0;
311
312out_fail:
313 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
314 clear_nlink(inode);
315 unlock_new_inode(inode);
316 iput(inode);
317 alloc_nid_failed(sbi, inode->i_ino);
318 return err;
319}
320
321static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
322{
323 struct inode *inode = dentry->d_inode;
324 if (f2fs_empty_dir(inode))
325 return f2fs_unlink(dir, dentry);
326 return -ENOTEMPTY;
327}
328
329static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
330 umode_t mode, dev_t rdev)
331{
332 struct super_block *sb = dir->i_sb;
333 struct f2fs_sb_info *sbi = F2FS_SB(sb);
334 struct inode *inode;
335 int err = 0;
336
337 if (!new_valid_dev(rdev))
338 return -EINVAL;
339
340 f2fs_balance_fs(sbi);
341
342 inode = f2fs_new_inode(dir, mode);
343 if (IS_ERR(inode))
344 return PTR_ERR(inode);
345
346 init_special_inode(inode, inode->i_mode, rdev);
347 inode->i_op = &f2fs_special_inode_operations;
348
349 err = f2fs_add_link(dentry, inode);
350 if (err)
351 goto out;
352
353 alloc_nid_done(sbi, inode->i_ino);
354 d_instantiate(dentry, inode);
355 unlock_new_inode(inode);
356 return 0;
357out:
358 clear_nlink(inode);
359 unlock_new_inode(inode);
360 iput(inode);
361 alloc_nid_failed(sbi, inode->i_ino);
362 return err;
363}
364
365static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
366 struct inode *new_dir, struct dentry *new_dentry)
367{
368 struct super_block *sb = old_dir->i_sb;
369 struct f2fs_sb_info *sbi = F2FS_SB(sb);
370 struct inode *old_inode = old_dentry->d_inode;
371 struct inode *new_inode = new_dentry->d_inode;
372 struct page *old_dir_page;
373 struct page *old_page;
374 struct f2fs_dir_entry *old_dir_entry = NULL;
375 struct f2fs_dir_entry *old_entry;
376 struct f2fs_dir_entry *new_entry;
377 int err = -ENOENT;
378
379 f2fs_balance_fs(sbi);
380
381 old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
382 if (!old_entry)
383 goto out;
384
385 if (S_ISDIR(old_inode->i_mode)) {
386 err = -EIO;
387 old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
388 if (!old_dir_entry)
389 goto out_old;
390 }
391
392 mutex_lock_op(sbi, RENAME);
393
394 if (new_inode) {
395 struct page *new_page;
396
397 err = -ENOTEMPTY;
398 if (old_dir_entry && !f2fs_empty_dir(new_inode))
399 goto out_dir;
400
401 err = -ENOENT;
402 new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
403 &new_page);
404 if (!new_entry)
405 goto out_dir;
406
407 f2fs_set_link(new_dir, new_entry, new_page, old_inode);
408
409 new_inode->i_ctime = CURRENT_TIME;
410 if (old_dir_entry)
411 drop_nlink(new_inode);
412 drop_nlink(new_inode);
413 if (!new_inode->i_nlink)
414 add_orphan_inode(sbi, new_inode->i_ino);
415 f2fs_write_inode(new_inode, NULL);
416 } else {
417 err = f2fs_add_link(new_dentry, old_inode);
418 if (err)
419 goto out_dir;
420
421 if (old_dir_entry) {
422 inc_nlink(new_dir);
423 f2fs_write_inode(new_dir, NULL);
424 }
425 }
426
427 old_inode->i_ctime = CURRENT_TIME;
428 set_inode_flag(F2FS_I(old_inode), FI_NEED_CP);
429 mark_inode_dirty(old_inode);
430
431 f2fs_delete_entry(old_entry, old_page, NULL);
432
433 if (old_dir_entry) {
434 if (old_dir != new_dir) {
435 f2fs_set_link(old_inode, old_dir_entry,
436 old_dir_page, new_dir);
437 } else {
438 kunmap(old_dir_page);
439 f2fs_put_page(old_dir_page, 0);
440 }
441 drop_nlink(old_dir);
442 f2fs_write_inode(old_dir, NULL);
443 }
444
445 mutex_unlock_op(sbi, RENAME);
446 return 0;
447
448out_dir:
449 if (old_dir_entry) {
450 kunmap(old_dir_page);
451 f2fs_put_page(old_dir_page, 0);
452 }
453 mutex_unlock_op(sbi, RENAME);
454out_old:
455 kunmap(old_page);
456 f2fs_put_page(old_page, 0);
457out:
458 return err;
459}
460
461const struct inode_operations f2fs_dir_inode_operations = {
462 .create = f2fs_create,
463 .lookup = f2fs_lookup,
464 .link = f2fs_link,
465 .unlink = f2fs_unlink,
466 .symlink = f2fs_symlink,
467 .mkdir = f2fs_mkdir,
468 .rmdir = f2fs_rmdir,
469 .mknod = f2fs_mknod,
470 .rename = f2fs_rename,
471 .setattr = f2fs_setattr,
472 .get_acl = f2fs_get_acl,
473#ifdef CONFIG_F2FS_FS_XATTR
474 .setxattr = generic_setxattr,
475 .getxattr = generic_getxattr,
476 .listxattr = f2fs_listxattr,
477 .removexattr = generic_removexattr,
478#endif
479};
480
481const struct inode_operations f2fs_symlink_inode_operations = {
482 .readlink = generic_readlink,
483 .follow_link = page_follow_link_light,
484 .put_link = page_put_link,
485 .setattr = f2fs_setattr,
486#ifdef CONFIG_F2FS_FS_XATTR
487 .setxattr = generic_setxattr,
488 .getxattr = generic_getxattr,
489 .listxattr = f2fs_listxattr,
490 .removexattr = generic_removexattr,
491#endif
492};
493
494const struct inode_operations f2fs_special_inode_operations = {
495 .setattr = f2fs_setattr,
496 .get_acl = f2fs_get_acl,
497#ifdef CONFIG_F2FS_FS_XATTR
498 .setxattr = generic_setxattr,
499 .getxattr = generic_getxattr,
500 .listxattr = f2fs_listxattr,
501 .removexattr = generic_removexattr,
502#endif
503};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
new file mode 100644
index 000000000000..9bda63c9c166
--- /dev/null
+++ b/fs/f2fs/node.c
@@ -0,0 +1,1760 @@
1/*
2 * fs/f2fs/node.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/mpage.h>
14#include <linux/backing-dev.h>
15#include <linux/blkdev.h>
16#include <linux/pagevec.h>
17#include <linux/swap.h>
18
19#include "f2fs.h"
20#include "node.h"
21#include "segment.h"
22
23static struct kmem_cache *nat_entry_slab;
24static struct kmem_cache *free_nid_slab;
25
26static void clear_node_page_dirty(struct page *page)
27{
28 struct address_space *mapping = page->mapping;
29 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
30	unsigned long flags;
31
32 if (PageDirty(page)) {
33 spin_lock_irqsave(&mapping->tree_lock, flags);
34 radix_tree_tag_clear(&mapping->page_tree,
35 page_index(page),
36 PAGECACHE_TAG_DIRTY);
37 spin_unlock_irqrestore(&mapping->tree_lock, flags);
38
39 clear_page_dirty_for_io(page);
40 dec_page_count(sbi, F2FS_DIRTY_NODES);
41 }
42 ClearPageUptodate(page);
43}
44
45static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
46{
47 pgoff_t index = current_nat_addr(sbi, nid);
48 return get_meta_page(sbi, index);
49}
50
51static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
52{
53 struct page *src_page;
54 struct page *dst_page;
55 pgoff_t src_off;
56 pgoff_t dst_off;
57 void *src_addr;
58 void *dst_addr;
59 struct f2fs_nm_info *nm_i = NM_I(sbi);
60
61 src_off = current_nat_addr(sbi, nid);
62 dst_off = next_nat_addr(sbi, src_off);
63
64 /* get current nat block page with lock */
65 src_page = get_meta_page(sbi, src_off);
66
67 /* Dirty src_page means that it is already the new target NAT page. */
68 if (PageDirty(src_page))
69 return src_page;
70
71 dst_page = grab_meta_page(sbi, dst_off);
72
73 src_addr = page_address(src_page);
74 dst_addr = page_address(dst_page);
75 memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
76 set_page_dirty(dst_page);
77 f2fs_put_page(src_page, 1);
78
79 set_to_next_nat(nm_i, nid);
80
81 return dst_page;
82}
83
84/*
85 * Readahead NAT pages
86 */
87static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
88{
89 struct address_space *mapping = sbi->meta_inode->i_mapping;
90 struct f2fs_nm_info *nm_i = NM_I(sbi);
91 struct page *page;
92 pgoff_t index;
93 int i;
94
95 for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
96 if (nid >= nm_i->max_nid)
97 nid = 0;
98 index = current_nat_addr(sbi, nid);
99
100 page = grab_cache_page(mapping, index);
101 if (!page)
102 continue;
103 if (f2fs_readpage(sbi, page, index, READ)) {
104 f2fs_put_page(page, 1);
105 continue;
106 }
107 page_cache_release(page);
108 }
109}
110
111static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
112{
113 return radix_tree_lookup(&nm_i->nat_root, n);
114}
115
116static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
117 nid_t start, unsigned int nr, struct nat_entry **ep)
118{
119 return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
120}
121
122static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
123{
124 list_del(&e->list);
125 radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
126 nm_i->nat_cnt--;
127 kmem_cache_free(nat_entry_slab, e);
128}
129
130int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
131{
132 struct f2fs_nm_info *nm_i = NM_I(sbi);
133 struct nat_entry *e;
134 int is_cp = 1;
135
136 read_lock(&nm_i->nat_tree_lock);
137 e = __lookup_nat_cache(nm_i, nid);
138 if (e && !e->checkpointed)
139 is_cp = 0;
140 read_unlock(&nm_i->nat_tree_lock);
141 return is_cp;
142}
143
144static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
145{
146 struct nat_entry *new;
147
148 new = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
149 if (!new)
150 return NULL;
151 if (radix_tree_insert(&nm_i->nat_root, nid, new)) {
152 kmem_cache_free(nat_entry_slab, new);
153 return NULL;
154 }
155 memset(new, 0, sizeof(struct nat_entry));
156 nat_set_nid(new, nid);
157 list_add_tail(&new->list, &nm_i->nat_entries);
158 nm_i->nat_cnt++;
159 return new;
160}
161
162static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
163 struct f2fs_nat_entry *ne)
164{
165 struct nat_entry *e;
166retry:
167 write_lock(&nm_i->nat_tree_lock);
168 e = __lookup_nat_cache(nm_i, nid);
169 if (!e) {
170 e = grab_nat_entry(nm_i, nid);
171 if (!e) {
172 write_unlock(&nm_i->nat_tree_lock);
173 goto retry;
174 }
175 nat_set_blkaddr(e, le32_to_cpu(ne->block_addr));
176 nat_set_ino(e, le32_to_cpu(ne->ino));
177 nat_set_version(e, ne->version);
178 e->checkpointed = true;
179 }
180 write_unlock(&nm_i->nat_tree_lock);
181}
182
183static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
184 block_t new_blkaddr)
185{
186 struct f2fs_nm_info *nm_i = NM_I(sbi);
187 struct nat_entry *e;
188retry:
189 write_lock(&nm_i->nat_tree_lock);
190 e = __lookup_nat_cache(nm_i, ni->nid);
191 if (!e) {
192 e = grab_nat_entry(nm_i, ni->nid);
193 if (!e) {
194 write_unlock(&nm_i->nat_tree_lock);
195 goto retry;
196 }
197 e->ni = *ni;
198 e->checkpointed = true;
199 BUG_ON(ni->blk_addr == NEW_ADDR);
200 } else if (new_blkaddr == NEW_ADDR) {
201		/*
202		 * when a nid is reallocated,
203		 * the previous nat entry can remain in the nat cache.
204		 * So, reinitialize it with the new information.
205		 */
206 e->ni = *ni;
207 BUG_ON(ni->blk_addr != NULL_ADDR);
208 }
209
210 if (new_blkaddr == NEW_ADDR)
211 e->checkpointed = false;
212
213 /* sanity check */
214 BUG_ON(nat_get_blkaddr(e) != ni->blk_addr);
215 BUG_ON(nat_get_blkaddr(e) == NULL_ADDR &&
216 new_blkaddr == NULL_ADDR);
217 BUG_ON(nat_get_blkaddr(e) == NEW_ADDR &&
218 new_blkaddr == NEW_ADDR);
219 BUG_ON(nat_get_blkaddr(e) != NEW_ADDR &&
220 nat_get_blkaddr(e) != NULL_ADDR &&
221 new_blkaddr == NEW_ADDR);
222
223	/* increment the version number as the node is removed */
224 if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
225 unsigned char version = nat_get_version(e);
226 nat_set_version(e, inc_node_version(version));
227 }
228
229 /* change address */
230 nat_set_blkaddr(e, new_blkaddr);
231 __set_nat_cache_dirty(nm_i, e);
232 write_unlock(&nm_i->nat_tree_lock);
233}
234
235static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
236{
237 struct f2fs_nm_info *nm_i = NM_I(sbi);
238
239 if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD)
240 return 0;
241
242 write_lock(&nm_i->nat_tree_lock);
243 while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
244 struct nat_entry *ne;
245 ne = list_first_entry(&nm_i->nat_entries,
246 struct nat_entry, list);
247 __del_from_nat_cache(nm_i, ne);
248 nr_shrink--;
249 }
250 write_unlock(&nm_i->nat_tree_lock);
251 return nr_shrink;
252}
253
254/*
255 * This function always returns success
256 */
257void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
258{
259 struct f2fs_nm_info *nm_i = NM_I(sbi);
260 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
261 struct f2fs_summary_block *sum = curseg->sum_blk;
262 nid_t start_nid = START_NID(nid);
263 struct f2fs_nat_block *nat_blk;
264 struct page *page = NULL;
265 struct f2fs_nat_entry ne;
266 struct nat_entry *e;
267 int i;
268
269 memset(&ne, 0, sizeof(struct f2fs_nat_entry));
270 ni->nid = nid;
271
272 /* Check nat cache */
273 read_lock(&nm_i->nat_tree_lock);
274 e = __lookup_nat_cache(nm_i, nid);
275 if (e) {
276 ni->ino = nat_get_ino(e);
277 ni->blk_addr = nat_get_blkaddr(e);
278 ni->version = nat_get_version(e);
279 }
280 read_unlock(&nm_i->nat_tree_lock);
281 if (e)
282 return;
283
284 /* Check current segment summary */
285 mutex_lock(&curseg->curseg_mutex);
286 i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
287 if (i >= 0) {
288 ne = nat_in_journal(sum, i);
289 node_info_from_raw_nat(ni, &ne);
290 }
291 mutex_unlock(&curseg->curseg_mutex);
292 if (i >= 0)
293 goto cache;
294
295 /* Fill node_info from nat page */
296 page = get_current_nat_page(sbi, start_nid);
297 nat_blk = (struct f2fs_nat_block *)page_address(page);
298 ne = nat_blk->entries[nid - start_nid];
299 node_info_from_raw_nat(ni, &ne);
300 f2fs_put_page(page, 1);
301cache:
302 /* cache nat entry */
303 cache_nat_entry(NM_I(sbi), nid, &ne);
304}
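/*
 * Illustrative usage, not part of this patch: translating a nid into
 * its current on-disk block address via the three lookups above
 * (nat cache, then the curseg NAT journal, then the NAT block itself).
 */
static block_t nid_to_blkaddr_sketch(struct f2fs_sb_info *sbi, nid_t nid)
{
	struct node_info ni;

	get_node_info(sbi, nid, &ni);
	return ni.blk_addr;	/* NULL_ADDR means not allocated */
}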
305
306/*
307 * The maximum depth is four.
308 * Offset[0] holds the offset within the raw inode.
309 */
310static int get_node_path(long block, int offset[4], unsigned int noffset[4])
311{
312 const long direct_index = ADDRS_PER_INODE;
313 const long direct_blks = ADDRS_PER_BLOCK;
314 const long dptrs_per_blk = NIDS_PER_BLOCK;
315 const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
316 const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
317 int n = 0;
318 int level = 0;
319
320 noffset[0] = 0;
321
322 if (block < direct_index) {
323 offset[n++] = block;
324 level = 0;
325 goto got;
326 }
327 block -= direct_index;
328 if (block < direct_blks) {
329 offset[n++] = NODE_DIR1_BLOCK;
330 noffset[n] = 1;
331 offset[n++] = block;
332 level = 1;
333 goto got;
334 }
335 block -= direct_blks;
336 if (block < direct_blks) {
337 offset[n++] = NODE_DIR2_BLOCK;
338 noffset[n] = 2;
339 offset[n++] = block;
340 level = 1;
341 goto got;
342 }
343 block -= direct_blks;
344 if (block < indirect_blks) {
345 offset[n++] = NODE_IND1_BLOCK;
346 noffset[n] = 3;
347 offset[n++] = block / direct_blks;
348 noffset[n] = 4 + offset[n - 1];
349 offset[n++] = block % direct_blks;
350 level = 2;
351 goto got;
352 }
353 block -= indirect_blks;
354 if (block < indirect_blks) {
355 offset[n++] = NODE_IND2_BLOCK;
356 noffset[n] = 4 + dptrs_per_blk;
357 offset[n++] = block / direct_blks;
358 noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
359 offset[n++] = block % direct_blks;
360 level = 2;
361 goto got;
362 }
363 block -= indirect_blks;
364 if (block < dindirect_blks) {
365 offset[n++] = NODE_DIND_BLOCK;
366 noffset[n] = 5 + (dptrs_per_blk * 2);
367 offset[n++] = block / indirect_blks;
368 noffset[n] = 6 + (dptrs_per_blk * 2) +
369 offset[n - 1] * (dptrs_per_blk + 1);
370 offset[n++] = (block / direct_blks) % dptrs_per_blk;
371 noffset[n] = 7 + (dptrs_per_blk * 2) +
372 offset[n - 2] * (dptrs_per_blk + 1) +
373 offset[n - 1];
374 offset[n++] = block % direct_blks;
375 level = 3;
376 goto got;
377 } else {
378 BUG();
379 }
380got:
381 return level;
382}
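/*
 * Worked example, not part of this patch, assuming the usual 4KB-block
 * constants ADDRS_PER_INODE = 923 and ADDRS_PER_BLOCK = NIDS_PER_BLOCK
 * = 1018: block 3000 lies past the 923 direct slots and both direct
 * node blocks (923 + 2 * 1018 = 2959), so it resolves through the
 * first indirect node.
 */
static void get_node_path_example(void)
{
	int offset[4];
	unsigned int noffset[4];
	int level = get_node_path(3000, offset, noffset);

	/* 3000 - 2959 = 41: entry 41 of the first direct child */
	BUG_ON(level != 2 || offset[0] != NODE_IND1_BLOCK ||
			offset[1] != 0 || offset[2] != 41);
}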
383
384/*
385 * Caller should call f2fs_put_dnode(dn).
386 */
387int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
388{
389 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
390 struct page *npage[4];
391 struct page *parent;
392 int offset[4];
393 unsigned int noffset[4];
394 nid_t nids[4];
395 int level, i;
396 int err = 0;
397
398 level = get_node_path(index, offset, noffset);
399
400 nids[0] = dn->inode->i_ino;
401 npage[0] = get_node_page(sbi, nids[0]);
402 if (IS_ERR(npage[0]))
403 return PTR_ERR(npage[0]);
404
405 parent = npage[0];
406 nids[1] = get_nid(parent, offset[0], true);
407 dn->inode_page = npage[0];
408 dn->inode_page_locked = true;
409
410 /* get indirect or direct nodes */
411 for (i = 1; i <= level; i++) {
412 bool done = false;
413
414 if (!nids[i] && !ro) {
415 mutex_lock_op(sbi, NODE_NEW);
416
417 /* alloc new node */
418 if (!alloc_nid(sbi, &(nids[i]))) {
419 mutex_unlock_op(sbi, NODE_NEW);
420 err = -ENOSPC;
421 goto release_pages;
422 }
423
424 dn->nid = nids[i];
425 npage[i] = new_node_page(dn, noffset[i]);
426 if (IS_ERR(npage[i])) {
427 alloc_nid_failed(sbi, nids[i]);
428 mutex_unlock_op(sbi, NODE_NEW);
429 err = PTR_ERR(npage[i]);
430 goto release_pages;
431 }
432
433 set_nid(parent, offset[i - 1], nids[i], i == 1);
434 alloc_nid_done(sbi, nids[i]);
435 mutex_unlock_op(sbi, NODE_NEW);
436 done = true;
437 } else if (ro && i == level && level > 1) {
438 npage[i] = get_node_page_ra(parent, offset[i - 1]);
439 if (IS_ERR(npage[i])) {
440 err = PTR_ERR(npage[i]);
441 goto release_pages;
442 }
443 done = true;
444 }
445 if (i == 1) {
446 dn->inode_page_locked = false;
447 unlock_page(parent);
448 } else {
449 f2fs_put_page(parent, 1);
450 }
451
452 if (!done) {
453 npage[i] = get_node_page(sbi, nids[i]);
454 if (IS_ERR(npage[i])) {
455 err = PTR_ERR(npage[i]);
456 f2fs_put_page(npage[0], 0);
457 goto release_out;
458 }
459 }
460 if (i < level) {
461 parent = npage[i];
462 nids[i + 1] = get_nid(parent, offset[i], false);
463 }
464 }
465 dn->nid = nids[level];
466 dn->ofs_in_node = offset[level];
467 dn->node_page = npage[level];
468 dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
469 return 0;
470
471release_pages:
472 f2fs_put_page(parent, 1);
473 if (i > 1)
474 f2fs_put_page(npage[0], 0);
475release_out:
476 dn->inode_page = NULL;
477 dn->node_page = NULL;
478 return err;
479}
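/*
 * Illustrative usage, not part of this patch: reading the block address
 * of one file page through its dnode, per the comment above (callers
 * must release the dnode with f2fs_put_dnode()). Passing 1 for the ro
 * parameter to request a read-only lookup is an assumption here.
 */
static block_t data_blkaddr_sketch(struct inode *inode, pgoff_t index)
{
	struct dnode_of_data dn;
	block_t blkaddr;

	set_new_dnode(&dn, inode, NULL, NULL, 0);
	if (get_dnode_of_data(&dn, index, 1))
		return NULL_ADDR;
	blkaddr = dn.data_blkaddr;
	f2fs_put_dnode(&dn);
	return blkaddr;
}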
480
481static void truncate_node(struct dnode_of_data *dn)
482{
483 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
484 struct node_info ni;
485
486 get_node_info(sbi, dn->nid, &ni);
487 if (dn->inode->i_blocks == 0) {
488 BUG_ON(ni.blk_addr != NULL_ADDR);
489 goto invalidate;
490 }
491 BUG_ON(ni.blk_addr == NULL_ADDR);
492
493 /* Deallocate node address */
494 invalidate_blocks(sbi, ni.blk_addr);
495 dec_valid_node_count(sbi, dn->inode, 1);
496 set_node_addr(sbi, &ni, NULL_ADDR);
497
498 if (dn->nid == dn->inode->i_ino) {
499 remove_orphan_inode(sbi, dn->nid);
500 dec_valid_inode_count(sbi);
501 } else {
502 sync_inode_page(dn);
503 }
504invalidate:
505 clear_node_page_dirty(dn->node_page);
506 F2FS_SET_SB_DIRT(sbi);
507
508 f2fs_put_page(dn->node_page, 1);
509 dn->node_page = NULL;
510}
511
512static int truncate_dnode(struct dnode_of_data *dn)
513{
514 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
515 struct page *page;
516
517 if (dn->nid == 0)
518 return 1;
519
520 /* get direct node */
521 page = get_node_page(sbi, dn->nid);
522 if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
523 return 1;
524 else if (IS_ERR(page))
525 return PTR_ERR(page);
526
527 /* Make dnode_of_data for parameter */
528 dn->node_page = page;
529 dn->ofs_in_node = 0;
530 truncate_data_blocks(dn);
531 truncate_node(dn);
532 return 1;
533}
534
535static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
536 int ofs, int depth)
537{
538 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
539 struct dnode_of_data rdn = *dn;
540 struct page *page;
541 struct f2fs_node *rn;
542 nid_t child_nid;
543 unsigned int child_nofs;
544 int freed = 0;
545 int i, ret;
546
547 if (dn->nid == 0)
548 return NIDS_PER_BLOCK + 1;
549
550 page = get_node_page(sbi, dn->nid);
551 if (IS_ERR(page))
552 return PTR_ERR(page);
553
554 rn = (struct f2fs_node *)page_address(page);
555 if (depth < 3) {
556 for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
557 child_nid = le32_to_cpu(rn->in.nid[i]);
558 if (child_nid == 0)
559 continue;
560 rdn.nid = child_nid;
561 ret = truncate_dnode(&rdn);
562 if (ret < 0)
563 goto out_err;
564 set_nid(page, i, 0, false);
565 }
566 } else {
567 child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
568 for (i = ofs; i < NIDS_PER_BLOCK; i++) {
569 child_nid = le32_to_cpu(rn->in.nid[i]);
570 if (child_nid == 0) {
571 child_nofs += NIDS_PER_BLOCK + 1;
572 continue;
573 }
574 rdn.nid = child_nid;
575 ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
576 if (ret == (NIDS_PER_BLOCK + 1)) {
577 set_nid(page, i, 0, false);
578 child_nofs += ret;
579 } else if (ret < 0 && ret != -ENOENT) {
580 goto out_err;
581 }
582 }
583 freed = child_nofs;
584 }
585
586 if (!ofs) {
587 /* remove current indirect node */
588 dn->node_page = page;
589 truncate_node(dn);
590 freed++;
591 } else {
592 f2fs_put_page(page, 1);
593 }
594 return freed;
595
596out_err:
597 f2fs_put_page(page, 1);
598 return ret;
599}
600
601static int truncate_partial_nodes(struct dnode_of_data *dn,
602 struct f2fs_inode *ri, int *offset, int depth)
603{
604 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
605 struct page *pages[2];
606 nid_t nid[3];
607 nid_t child_nid;
608 int err = 0;
609 int i;
610 int idx = depth - 2;
611
612 nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
613 if (!nid[0])
614 return 0;
615
616 /* get indirect nodes in the path */
617 for (i = 0; i < depth - 1; i++) {
618		/* reference count will be increased */
619 pages[i] = get_node_page(sbi, nid[i]);
620 if (IS_ERR(pages[i])) {
621 depth = i + 1;
622 err = PTR_ERR(pages[i]);
623 goto fail;
624 }
625 nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
626 }
627
628 /* free direct nodes linked to a partial indirect node */
629 for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) {
630 child_nid = get_nid(pages[idx], i, false);
631 if (!child_nid)
632 continue;
633 dn->nid = child_nid;
634 err = truncate_dnode(dn);
635 if (err < 0)
636 goto fail;
637 set_nid(pages[idx], i, 0, false);
638 }
639
640 if (offset[depth - 1] == 0) {
641 dn->node_page = pages[idx];
642 dn->nid = nid[idx];
643 truncate_node(dn);
644 } else {
645 f2fs_put_page(pages[idx], 1);
646 }
647 offset[idx]++;
648 offset[depth - 1] = 0;
649fail:
650 for (i = depth - 3; i >= 0; i--)
651 f2fs_put_page(pages[i], 1);
652 return err;
653}
654
655/*
656 * All the block addresses of data and nodes should be nullified.
657 */
658int truncate_inode_blocks(struct inode *inode, pgoff_t from)
659{
660 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
661 int err = 0, cont = 1;
662 int level, offset[4], noffset[4];
663 unsigned int nofs;
664 struct f2fs_node *rn;
665 struct dnode_of_data dn;
666 struct page *page;
667
668 level = get_node_path(from, offset, noffset);
669
670 page = get_node_page(sbi, inode->i_ino);
671 if (IS_ERR(page))
672 return PTR_ERR(page);
673
674 set_new_dnode(&dn, inode, page, NULL, 0);
675 unlock_page(page);
676
677 rn = page_address(page);
678 switch (level) {
679 case 0:
680 case 1:
681 nofs = noffset[1];
682 break;
683 case 2:
684 nofs = noffset[1];
685 if (!offset[level - 1])
686 goto skip_partial;
687 err = truncate_partial_nodes(&dn, &rn->i, offset, level);
688 if (err < 0 && err != -ENOENT)
689 goto fail;
690 nofs += 1 + NIDS_PER_BLOCK;
691 break;
692 case 3:
693 nofs = 5 + 2 * NIDS_PER_BLOCK;
694 if (!offset[level - 1])
695 goto skip_partial;
696 err = truncate_partial_nodes(&dn, &rn->i, offset, level);
697 if (err < 0 && err != -ENOENT)
698 goto fail;
699 break;
700 default:
701 BUG();
702 }
703
704skip_partial:
705 while (cont) {
706 dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]);
707 switch (offset[0]) {
708 case NODE_DIR1_BLOCK:
709 case NODE_DIR2_BLOCK:
710 err = truncate_dnode(&dn);
711 break;
712
713 case NODE_IND1_BLOCK:
714 case NODE_IND2_BLOCK:
715 err = truncate_nodes(&dn, nofs, offset[1], 2);
716 break;
717
718 case NODE_DIND_BLOCK:
719 err = truncate_nodes(&dn, nofs, offset[1], 3);
720 cont = 0;
721 break;
722
723 default:
724 BUG();
725 }
726 if (err < 0 && err != -ENOENT)
727 goto fail;
728 if (offset[1] == 0 &&
729 rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) {
730 lock_page(page);
731 wait_on_page_writeback(page);
732 rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
733 set_page_dirty(page);
734 unlock_page(page);
735 }
736 offset[1] = 0;
737 offset[0]++;
738 nofs += err;
739 }
740fail:
741 f2fs_put_page(page, 0);
742 return err > 0 ? 0 : err;
743}
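/*
 * Illustrative caller, not part of this patch: a truncate path would
 * nullify everything from the first block past the new i_size, rounding
 * up with the superblock's block size.
 */
static int truncate_blocks_sketch(struct inode *inode, loff_t from)
{
	unsigned int bits = inode->i_sb->s_blocksize_bits;
	pgoff_t free_from = (pgoff_t)((from + (1 << bits) - 1) >> bits);

	return truncate_inode_blocks(inode, free_from);
}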
744
745int remove_inode_page(struct inode *inode)
746{
747 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
748 struct page *page;
749 nid_t ino = inode->i_ino;
750 struct dnode_of_data dn;
751
752 mutex_lock_op(sbi, NODE_TRUNC);
753 page = get_node_page(sbi, ino);
754 if (IS_ERR(page)) {
755 mutex_unlock_op(sbi, NODE_TRUNC);
756 return PTR_ERR(page);
757 }
758
759 if (F2FS_I(inode)->i_xattr_nid) {
760 nid_t nid = F2FS_I(inode)->i_xattr_nid;
761 struct page *npage = get_node_page(sbi, nid);
762
763 if (IS_ERR(npage)) {
764 mutex_unlock_op(sbi, NODE_TRUNC);
765 return PTR_ERR(npage);
766 }
767
768 F2FS_I(inode)->i_xattr_nid = 0;
769 set_new_dnode(&dn, inode, page, npage, nid);
770 dn.inode_page_locked = 1;
771 truncate_node(&dn);
772 }
773
774	/* 0 is possible after f2fs_new_inode() has failed */
775 BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1);
776 set_new_dnode(&dn, inode, page, page, ino);
777 truncate_node(&dn);
778
779 mutex_unlock_op(sbi, NODE_TRUNC);
780 return 0;
781}
782
783int new_inode_page(struct inode *inode, struct dentry *dentry)
784{
785 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
786 struct page *page;
787 struct dnode_of_data dn;
788
789 /* allocate inode page for new inode */
790 set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
791 mutex_lock_op(sbi, NODE_NEW);
792 page = new_node_page(&dn, 0);
793 init_dent_inode(dentry, page);
794 mutex_unlock_op(sbi, NODE_NEW);
795 if (IS_ERR(page))
796 return PTR_ERR(page);
797 f2fs_put_page(page, 1);
798 return 0;
799}
800
801struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
802{
803 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
804 struct address_space *mapping = sbi->node_inode->i_mapping;
805 struct node_info old_ni, new_ni;
806 struct page *page;
807 int err;
808
809 if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
810 return ERR_PTR(-EPERM);
811
812 page = grab_cache_page(mapping, dn->nid);
813 if (!page)
814 return ERR_PTR(-ENOMEM);
815
816 get_node_info(sbi, dn->nid, &old_ni);
817
818 SetPageUptodate(page);
819 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
820
821 /* Reinitialize old_ni with new node page */
822 BUG_ON(old_ni.blk_addr != NULL_ADDR);
823 new_ni = old_ni;
824 new_ni.ino = dn->inode->i_ino;
825
826 if (!inc_valid_node_count(sbi, dn->inode, 1)) {
827 err = -ENOSPC;
828 goto fail;
829 }
830 set_node_addr(sbi, &new_ni, NEW_ADDR);
831 set_cold_node(dn->inode, page);
832
833 dn->node_page = page;
834 sync_inode_page(dn);
835 set_page_dirty(page);
836 if (ofs == 0)
837 inc_valid_inode_count(sbi);
838
839 return page;
840
841fail:
842 clear_node_page_dirty(page);
843 f2fs_put_page(page, 1);
844 return ERR_PTR(err);
845}
846
847static int read_node_page(struct page *page, int type)
848{
849 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
850 struct node_info ni;
851
852 get_node_info(sbi, page->index, &ni);
853
854 if (ni.blk_addr == NULL_ADDR)
855 return -ENOENT;
856 return f2fs_readpage(sbi, page, ni.blk_addr, type);
857}
858
859/*
860 * Readahead a node page
861 */
862void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
863{
864 struct address_space *mapping = sbi->node_inode->i_mapping;
865 struct page *apage;
866
867 apage = find_get_page(mapping, nid);
868 if (apage && PageUptodate(apage))
869 goto release_out;
870 f2fs_put_page(apage, 0);
871
872 apage = grab_cache_page(mapping, nid);
873 if (!apage)
874 return;
875
876 if (read_node_page(apage, READA))
877 goto unlock_out;
878
879 page_cache_release(apage);
880 return;
881
882unlock_out:
883 unlock_page(apage);
884release_out:
885 page_cache_release(apage);
886}
887
888struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
889{
890 int err;
891 struct page *page;
892 struct address_space *mapping = sbi->node_inode->i_mapping;
893
894 page = grab_cache_page(mapping, nid);
895 if (!page)
896 return ERR_PTR(-ENOMEM);
897
898 err = read_node_page(page, READ_SYNC);
899 if (err) {
900 f2fs_put_page(page, 1);
901 return ERR_PTR(err);
902 }
903
904 BUG_ON(nid != nid_of_node(page));
905 mark_page_accessed(page);
906 return page;
907}
908
909/*
910 * Return a locked page for the desired node page,
911 * and readahead up to MAX_RA_NODE sibling node pages.
912 */
913struct page *get_node_page_ra(struct page *parent, int start)
914{
915 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
916 struct address_space *mapping = sbi->node_inode->i_mapping;
917 int i, end;
918 int err = 0;
919 nid_t nid;
920 struct page *page;
921
922 /* First, try getting the desired direct node. */
923 nid = get_nid(parent, start, false);
924 if (!nid)
925 return ERR_PTR(-ENOENT);
926
927 page = find_get_page(mapping, nid);
928 if (page && PageUptodate(page))
929 goto page_hit;
930 f2fs_put_page(page, 0);
931
932repeat:
933 page = grab_cache_page(mapping, nid);
934 if (!page)
935 return ERR_PTR(-ENOMEM);
936
937 err = read_node_page(page, READA);
938 if (err) {
939 f2fs_put_page(page, 1);
940 return ERR_PTR(err);
941 }
942
943 /* Then, try readahead for siblings of the desired node */
944 end = start + MAX_RA_NODE;
945 end = min(end, NIDS_PER_BLOCK);
946 for (i = start + 1; i < end; i++) {
947 nid = get_nid(parent, i, false);
948 if (!nid)
949 continue;
950 ra_node_page(sbi, nid);
951 }
952
953page_hit:
954 lock_page(page);
955 if (PageError(page)) {
956 f2fs_put_page(page, 1);
957 return ERR_PTR(-EIO);
958 }
959
960 /* Has the page been truncated? */
961 if (page->mapping != mapping) {
962 f2fs_put_page(page, 1);
963 goto repeat;
964 }
965 return page;
966}
967
968void sync_inode_page(struct dnode_of_data *dn)
969{
970 if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
971 update_inode(dn->inode, dn->node_page);
972 } else if (dn->inode_page) {
973 if (!dn->inode_page_locked)
974 lock_page(dn->inode_page);
975 update_inode(dn->inode, dn->inode_page);
976 if (!dn->inode_page_locked)
977 unlock_page(dn->inode_page);
978 } else {
979 f2fs_write_inode(dn->inode, NULL);
980 }
981}
982
983int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
984 struct writeback_control *wbc)
985{
986 struct address_space *mapping = sbi->node_inode->i_mapping;
987 pgoff_t index, end;
988 struct pagevec pvec;
989 int step = ino ? 2 : 0;
990 int nwritten = 0, wrote = 0;
991
992 pagevec_init(&pvec, 0);
993
994next_step:
995 index = 0;
996 end = LONG_MAX;
997
998 while (index <= end) {
999 int i, nr_pages;
1000 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1001 PAGECACHE_TAG_DIRTY,
1002 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1003 if (nr_pages == 0)
1004 break;
1005
1006 for (i = 0; i < nr_pages; i++) {
1007 struct page *page = pvec.pages[i];
1008
1009 /*
1010 * flushing sequence with step:
1011 * 0. indirect nodes
1012 * 1. dentry dnodes
1013 * 2. file dnodes
1014 */
1015 if (step == 0 && IS_DNODE(page))
1016 continue;
1017 if (step == 1 && (!IS_DNODE(page) ||
1018 is_cold_node(page)))
1019 continue;
1020 if (step == 2 && (!IS_DNODE(page) ||
1021 !is_cold_node(page)))
1022 continue;
1023
1024			/*
1025			 * In fsync mode,
1026			 * we should not skip writing node pages.
1027			 */
1028 if (ino && ino_of_node(page) == ino)
1029 lock_page(page);
1030 else if (!trylock_page(page))
1031 continue;
1032
1033 if (unlikely(page->mapping != mapping)) {
1034continue_unlock:
1035 unlock_page(page);
1036 continue;
1037 }
1038 if (ino && ino_of_node(page) != ino)
1039 goto continue_unlock;
1040
1041 if (!PageDirty(page)) {
1042 /* someone wrote it for us */
1043 goto continue_unlock;
1044 }
1045
1046 if (!clear_page_dirty_for_io(page))
1047 goto continue_unlock;
1048
1049 /* called by fsync() */
1050 if (ino && IS_DNODE(page)) {
1051 int mark = !is_checkpointed_node(sbi, ino);
1052 set_fsync_mark(page, 1);
1053 if (IS_INODE(page))
1054 set_dentry_mark(page, mark);
1055 nwritten++;
1056 } else {
1057 set_fsync_mark(page, 0);
1058 set_dentry_mark(page, 0);
1059 }
1060 mapping->a_ops->writepage(page, wbc);
1061 wrote++;
1062
1063 if (--wbc->nr_to_write == 0)
1064 break;
1065 }
1066 pagevec_release(&pvec);
1067 cond_resched();
1068
1069 if (wbc->nr_to_write == 0) {
1070 step = 2;
1071 break;
1072 }
1073 }
1074
1075 if (step < 2) {
1076 step++;
1077 goto next_step;
1078 }
1079
1080 if (wrote)
1081 f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL);
1082
1083 return nwritten;
1084}
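/*
 * Illustrative caller, not part of this patch: an fsync() path flushes
 * only one inode's dnodes by passing its ino, which also makes the walk
 * above use lock_page() instead of trylock for that inode's pages.
 */
static int fsync_node_pages_sketch(struct f2fs_sb_info *sbi,
					struct inode *inode)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = LONG_MAX,
		.for_reclaim = 0,
	};

	return sync_node_pages(sbi, inode->i_ino, &wbc);
}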
1085
1086static int f2fs_write_node_page(struct page *page,
1087 struct writeback_control *wbc)
1088{
1089 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1090 nid_t nid;
1091 block_t new_addr;
1092 struct node_info ni;
1093
1094 if (wbc->for_reclaim) {
1095 dec_page_count(sbi, F2FS_DIRTY_NODES);
1096 wbc->pages_skipped++;
1097 set_page_dirty(page);
1098 return AOP_WRITEPAGE_ACTIVATE;
1099 }
1100
1101 wait_on_page_writeback(page);
1102
1103 mutex_lock_op(sbi, NODE_WRITE);
1104
1105 /* get old block addr of this node page */
1106 nid = nid_of_node(page);
1107 BUG_ON(page->index != nid);
1108
1109 get_node_info(sbi, nid, &ni);
1110
1111 /* This page is already truncated */
1112 if (ni.blk_addr == NULL_ADDR)
1113 return 0;
1114
1115 set_page_writeback(page);
1116
1117 /* insert node offset */
1118 write_node_page(sbi, page, nid, ni.blk_addr, &new_addr);
1119 set_node_addr(sbi, &ni, new_addr);
1120 dec_page_count(sbi, F2FS_DIRTY_NODES);
1121
1122 mutex_unlock_op(sbi, NODE_WRITE);
1123 unlock_page(page);
1124 return 0;
1125}
1126
1127/*
1128 * It is very important to gather dirty pages and write them in one batch, so
1129 * that we can submit a big bio without interfering with other data writes.
1130 * By default 512 pages (2MB), i.e. one segment, is a reasonable batch size.
1131 */
1132#define COLLECT_DIRTY_NODES 512
1133static int f2fs_write_node_pages(struct address_space *mapping,
1134 struct writeback_control *wbc)
1135{
1136 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
1137 struct block_device *bdev = sbi->sb->s_bdev;
1138 long nr_to_write = wbc->nr_to_write;
1139
1140 /* First check balancing cached NAT entries */
1141 if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) {
1142 write_checkpoint(sbi, false, false);
1143 return 0;
1144 }
1145
1146 /* collect a number of dirty node pages and write together */
1147 if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES)
1148 return 0;
1149
1150	/* if mounting has failed, skip writing node pages */
1151 wbc->nr_to_write = bio_get_nr_vecs(bdev);
1152 sync_node_pages(sbi, 0, wbc);
1153 wbc->nr_to_write = nr_to_write -
1154 (bio_get_nr_vecs(bdev) - wbc->nr_to_write);
1155 return 0;
1156}
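/*
 * A worked example (plain user-space arithmetic) of the nr_to_write
 * bookkeeping at the end of f2fs_write_node_pages(): the caller's budget is
 * saved, the batch is capped at the bio size, and the pages actually written
 * are then charged back against that budget. The numbers are assumed values.
 */
#include <stdio.h>

int main(void)
{
	long nr_to_write = 1024;	/* caller's budget, saved on entry */
	long bio_vecs = 256;		/* stand-in for bio_get_nr_vecs(bdev) */
	long left_after_sync = 26;	/* wbc->nr_to_write after sync_node_pages() */
	long written = bio_vecs - left_after_sync;

	/* mirrors: wbc->nr_to_write = nr_to_write - (bio_vecs - wbc->nr_to_write) */
	printf("wrote %ld node pages, %ld of the budget remains\n",
	       written, nr_to_write - written);
	return 0;
}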
1157
1158static int f2fs_set_node_page_dirty(struct page *page)
1159{
1160 struct address_space *mapping = page->mapping;
1161 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
1162
1163 SetPageUptodate(page);
1164 if (!PageDirty(page)) {
1165 __set_page_dirty_nobuffers(page);
1166 inc_page_count(sbi, F2FS_DIRTY_NODES);
1167 SetPagePrivate(page);
1168 return 1;
1169 }
1170 return 0;
1171}
1172
1173static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
1174{
1175 struct inode *inode = page->mapping->host;
1176 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1177 if (PageDirty(page))
1178 dec_page_count(sbi, F2FS_DIRTY_NODES);
1179 ClearPagePrivate(page);
1180}
1181
1182static int f2fs_release_node_page(struct page *page, gfp_t wait)
1183{
1184 ClearPagePrivate(page);
1185 return 0;
1186}
1187
1188/*
1189 * Structure of the f2fs node operations
1190 */
1191const struct address_space_operations f2fs_node_aops = {
1192 .writepage = f2fs_write_node_page,
1193 .writepages = f2fs_write_node_pages,
1194 .set_page_dirty = f2fs_set_node_page_dirty,
1195 .invalidatepage = f2fs_invalidate_node_page,
1196 .releasepage = f2fs_release_node_page,
1197};
1198
1199static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head)
1200{
1201 struct list_head *this;
1202 struct free_nid *i = NULL;
1203 list_for_each(this, head) {
1204 i = list_entry(this, struct free_nid, list);
1205 if (i->nid == n)
1206 break;
1207 i = NULL;
1208 }
1209 return i;
1210}
1211
1212static void __del_from_free_nid_list(struct free_nid *i)
1213{
1214 list_del(&i->list);
1215 kmem_cache_free(free_nid_slab, i);
1216}
1217
1218static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
1219{
1220 struct free_nid *i;
1221
1222 if (nm_i->fcnt > 2 * MAX_FREE_NIDS)
1223 return 0;
1224retry:
1225 i = kmem_cache_alloc(free_nid_slab, GFP_NOFS);
1226 if (!i) {
1227 cond_resched();
1228 goto retry;
1229 }
1230 i->nid = nid;
1231 i->state = NID_NEW;
1232
1233 spin_lock(&nm_i->free_nid_list_lock);
1234 if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) {
1235 spin_unlock(&nm_i->free_nid_list_lock);
1236 kmem_cache_free(free_nid_slab, i);
1237 return 0;
1238 }
1239 list_add_tail(&i->list, &nm_i->free_nid_list);
1240 nm_i->fcnt++;
1241 spin_unlock(&nm_i->free_nid_list_lock);
1242 return 1;
1243}
1244
1245static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
1246{
1247 struct free_nid *i;
1248 spin_lock(&nm_i->free_nid_list_lock);
1249 i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
1250 if (i && i->state == NID_NEW) {
1251 __del_from_free_nid_list(i);
1252 nm_i->fcnt--;
1253 }
1254 spin_unlock(&nm_i->free_nid_list_lock);
1255}
1256
1257static int scan_nat_page(struct f2fs_nm_info *nm_i,
1258 struct page *nat_page, nid_t start_nid)
1259{
1260 struct f2fs_nat_block *nat_blk = page_address(nat_page);
1261 block_t blk_addr;
1262 int fcnt = 0;
1263 int i;
1264
1265 /* 0 nid should not be used */
1266 if (start_nid == 0)
1267 ++start_nid;
1268
1269 i = start_nid % NAT_ENTRY_PER_BLOCK;
1270
1271 for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
1272 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
1273 BUG_ON(blk_addr == NEW_ADDR);
1274 if (blk_addr == NULL_ADDR)
1275 fcnt += add_free_nid(nm_i, start_nid);
1276 }
1277 return fcnt;
1278}
1279
1280static void build_free_nids(struct f2fs_sb_info *sbi)
1281{
1282 struct free_nid *fnid, *next_fnid;
1283 struct f2fs_nm_info *nm_i = NM_I(sbi);
1284 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1285 struct f2fs_summary_block *sum = curseg->sum_blk;
1286 nid_t nid = 0;
1287 bool is_cycled = false;
1288 int fcnt = 0;
1289 int i;
1290
1291 nid = nm_i->next_scan_nid;
1292 nm_i->init_scan_nid = nid;
1293
1294 ra_nat_pages(sbi, nid);
1295
1296 while (1) {
1297 struct page *page = get_current_nat_page(sbi, nid);
1298
1299 fcnt += scan_nat_page(nm_i, page, nid);
1300 f2fs_put_page(page, 1);
1301
1302 nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
1303
1304 if (nid >= nm_i->max_nid) {
1305 nid = 0;
1306 is_cycled = true;
1307 }
1308 if (fcnt > MAX_FREE_NIDS)
1309 break;
1310 if (is_cycled && nm_i->init_scan_nid <= nid)
1311 break;
1312 }
1313
1314 nm_i->next_scan_nid = nid;
1315
1316 /* find free nids from current sum_pages */
1317 mutex_lock(&curseg->curseg_mutex);
1318 for (i = 0; i < nats_in_cursum(sum); i++) {
1319 block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
1320 nid = le32_to_cpu(nid_in_journal(sum, i));
1321 if (addr == NULL_ADDR)
1322 add_free_nid(nm_i, nid);
1323 else
1324 remove_free_nid(nm_i, nid);
1325 }
1326 mutex_unlock(&curseg->curseg_mutex);
1327
1328	/* remove free nids that have already been allocated */
1329 list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) {
1330 struct nat_entry *ne;
1331
1332 read_lock(&nm_i->nat_tree_lock);
1333 ne = __lookup_nat_cache(nm_i, fnid->nid);
1334 if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
1335 remove_free_nid(nm_i, fnid->nid);
1336 read_unlock(&nm_i->nat_tree_lock);
1337 }
1338}
1339
1340/*
1341 * If this function returns success, the caller can obtain a new nid
1342 * from the second parameter of this function.
1343 * The returned nid can be used as an ino as well as a nid when an inode is created.
1344 */
1345bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
1346{
1347 struct f2fs_nm_info *nm_i = NM_I(sbi);
1348 struct free_nid *i = NULL;
1349 struct list_head *this;
1350retry:
1351 mutex_lock(&nm_i->build_lock);
1352 if (!nm_i->fcnt) {
1353 /* scan NAT in order to build free nid list */
1354 build_free_nids(sbi);
1355 if (!nm_i->fcnt) {
1356 mutex_unlock(&nm_i->build_lock);
1357 return false;
1358 }
1359 }
1360 mutex_unlock(&nm_i->build_lock);
1361
1362	/*
1363	 * Check fcnt again, since the previous check was racy: we did not
1364	 * hold free_nid_list_lock, so another thread could have consumed
1365	 * all of the free nids in the meantime.
1366	 */
1367 spin_lock(&nm_i->free_nid_list_lock);
1368 if (!nm_i->fcnt) {
1369 spin_unlock(&nm_i->free_nid_list_lock);
1370 goto retry;
1371 }
1372
1373 BUG_ON(list_empty(&nm_i->free_nid_list));
1374 list_for_each(this, &nm_i->free_nid_list) {
1375 i = list_entry(this, struct free_nid, list);
1376 if (i->state == NID_NEW)
1377 break;
1378 }
1379
1380 BUG_ON(i->state != NID_NEW);
1381 *nid = i->nid;
1382 i->state = NID_ALLOC;
1383 nm_i->fcnt--;
1384 spin_unlock(&nm_i->free_nid_list_lock);
1385 return true;
1386}
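/*
 * A minimal pthread sketch of the lock/recheck/retry pattern used by
 * alloc_nid() above: the list is refilled under one lock, but consumers must
 * recheck the count under the list lock because the two critical sections do
 * not overlap. All names here are illustrative, not f2fs symbols.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t build_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int fcnt;

static void build_free_list(void)
{
	fcnt = 4;			/* stand-in for build_free_nids() */
}

static bool alloc_one(int *out)
{
retry:
	pthread_mutex_lock(&build_lock);
	if (!fcnt)
		build_free_list();
	pthread_mutex_unlock(&build_lock);

	/* recheck: another thread may have drained the list in between */
	pthread_mutex_lock(&list_lock);
	if (!fcnt) {
		pthread_mutex_unlock(&list_lock);
		goto retry;
	}
	*out = fcnt--;			/* stand-in for taking a NID_NEW entry */
	pthread_mutex_unlock(&list_lock);
	return true;
}

int main(void)
{
	int nid;

	if (alloc_one(&nid))
		printf("allocated %d\n", nid);
	return 0;
}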
1387
1388/*
1389 * alloc_nid() should be called prior to this function.
1390 */
1391void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
1392{
1393 struct f2fs_nm_info *nm_i = NM_I(sbi);
1394 struct free_nid *i;
1395
1396 spin_lock(&nm_i->free_nid_list_lock);
1397 i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
1398 if (i) {
1399 BUG_ON(i->state != NID_ALLOC);
1400 __del_from_free_nid_list(i);
1401 }
1402 spin_unlock(&nm_i->free_nid_list_lock);
1403}
1404
1405/*
1406 * alloc_nid() should be called prior to this function.
1407 */
1408void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1409{
1410 alloc_nid_done(sbi, nid);
1411 add_free_nid(NM_I(sbi), nid);
1412}
1413
1414void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
1415 struct f2fs_summary *sum, struct node_info *ni,
1416 block_t new_blkaddr)
1417{
1418 rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
1419 set_node_addr(sbi, ni, new_blkaddr);
1420 clear_node_page_dirty(page);
1421}
1422
1423int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1424{
1425 struct address_space *mapping = sbi->node_inode->i_mapping;
1426 struct f2fs_node *src, *dst;
1427 nid_t ino = ino_of_node(page);
1428 struct node_info old_ni, new_ni;
1429 struct page *ipage;
1430
1431 ipage = grab_cache_page(mapping, ino);
1432 if (!ipage)
1433 return -ENOMEM;
1434
1435	/* make sure this ino is not handed out again from the free nid list */
1436 remove_free_nid(NM_I(sbi), ino);
1437
1438 get_node_info(sbi, ino, &old_ni);
1439 SetPageUptodate(ipage);
1440 fill_node_footer(ipage, ino, ino, 0, true);
1441
1442 src = (struct f2fs_node *)page_address(page);
1443 dst = (struct f2fs_node *)page_address(ipage);
1444
1445 memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i);
1446 dst->i.i_size = 0;
1447 dst->i.i_blocks = cpu_to_le64(1);
1448 dst->i.i_links = cpu_to_le32(1);
1449 dst->i.i_xattr_nid = 0;
1450
1451 new_ni = old_ni;
1452 new_ni.ino = ino;
1453
1454 set_node_addr(sbi, &new_ni, NEW_ADDR);
1455 inc_valid_inode_count(sbi);
1456
1457 f2fs_put_page(ipage, 1);
1458 return 0;
1459}
1460
1461int restore_node_summary(struct f2fs_sb_info *sbi,
1462 unsigned int segno, struct f2fs_summary_block *sum)
1463{
1464 struct f2fs_node *rn;
1465 struct f2fs_summary *sum_entry;
1466 struct page *page;
1467 block_t addr;
1468 int i, last_offset;
1469
1470	/* allocate a temporary page for reading node blocks */
1471	page = alloc_page(GFP_NOFS | __GFP_ZERO);
1472	if (!page)	/* alloc_page() returns NULL on failure, not an ERR_PTR */
1473		return -ENOMEM;
1474 lock_page(page);
1475
1476 /* scan the node segment */
1477 last_offset = sbi->blocks_per_seg;
1478 addr = START_BLOCK(sbi, segno);
1479 sum_entry = &sum->entries[0];
1480
1481 for (i = 0; i < last_offset; i++, sum_entry++) {
1482 if (f2fs_readpage(sbi, page, addr, READ_SYNC))
1483 goto out;
1484
1485 rn = (struct f2fs_node *)page_address(page);
1486 sum_entry->nid = rn->footer.nid;
1487 sum_entry->version = 0;
1488 sum_entry->ofs_in_node = 0;
1489 addr++;
1490
1491		/*
1492		 * In order to read the next node page,
1493		 * we must clear the PageUptodate flag.
1494		 */
1495 ClearPageUptodate(page);
1496 }
1497out:
1498 unlock_page(page);
1499 __free_pages(page, 0);
1500 return 0;
1501}
1502
1503static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
1504{
1505 struct f2fs_nm_info *nm_i = NM_I(sbi);
1506 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1507 struct f2fs_summary_block *sum = curseg->sum_blk;
1508 int i;
1509
1510 mutex_lock(&curseg->curseg_mutex);
1511
1512 if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) {
1513 mutex_unlock(&curseg->curseg_mutex);
1514 return false;
1515 }
1516
1517 for (i = 0; i < nats_in_cursum(sum); i++) {
1518 struct nat_entry *ne;
1519 struct f2fs_nat_entry raw_ne;
1520 nid_t nid = le32_to_cpu(nid_in_journal(sum, i));
1521
1522 raw_ne = nat_in_journal(sum, i);
1523retry:
1524 write_lock(&nm_i->nat_tree_lock);
1525 ne = __lookup_nat_cache(nm_i, nid);
1526 if (ne) {
1527 __set_nat_cache_dirty(nm_i, ne);
1528 write_unlock(&nm_i->nat_tree_lock);
1529 continue;
1530 }
1531 ne = grab_nat_entry(nm_i, nid);
1532 if (!ne) {
1533 write_unlock(&nm_i->nat_tree_lock);
1534 goto retry;
1535 }
1536 nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr));
1537 nat_set_ino(ne, le32_to_cpu(raw_ne.ino));
1538 nat_set_version(ne, raw_ne.version);
1539 __set_nat_cache_dirty(nm_i, ne);
1540 write_unlock(&nm_i->nat_tree_lock);
1541 }
1542 update_nats_in_cursum(sum, -i);
1543 mutex_unlock(&curseg->curseg_mutex);
1544 return true;
1545}
1546
1547/*
1548 * This function is called during the checkpointing process.
1549 */
1550void flush_nat_entries(struct f2fs_sb_info *sbi)
1551{
1552 struct f2fs_nm_info *nm_i = NM_I(sbi);
1553 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1554 struct f2fs_summary_block *sum = curseg->sum_blk;
1555 struct list_head *cur, *n;
1556 struct page *page = NULL;
1557 struct f2fs_nat_block *nat_blk = NULL;
1558 nid_t start_nid = 0, end_nid = 0;
1559 bool flushed;
1560
1561 flushed = flush_nats_in_journal(sbi);
1562
1563 if (!flushed)
1564 mutex_lock(&curseg->curseg_mutex);
1565
1566 /* 1) flush dirty nat caches */
1567 list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) {
1568 struct nat_entry *ne;
1569 nid_t nid;
1570 struct f2fs_nat_entry raw_ne;
1571 int offset = -1;
1572 block_t new_blkaddr;
1573
1574 ne = list_entry(cur, struct nat_entry, list);
1575 nid = nat_get_nid(ne);
1576
1577 if (nat_get_blkaddr(ne) == NEW_ADDR)
1578 continue;
1579 if (flushed)
1580 goto to_nat_page;
1581
1582		/* if there is room for nat entries in the curseg summary page */
1583 offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1);
1584 if (offset >= 0) {
1585 raw_ne = nat_in_journal(sum, offset);
1586 goto flush_now;
1587 }
1588to_nat_page:
1589 if (!page || (start_nid > nid || nid > end_nid)) {
1590 if (page) {
1591 f2fs_put_page(page, 1);
1592 page = NULL;
1593 }
1594 start_nid = START_NID(nid);
1595 end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;
1596
1597			/*
1598			 * get the nat block page: marked dirty, with an
1599			 * increased reference count, mapped and locked
1600			 */
1601 page = get_next_nat_page(sbi, start_nid);
1602 nat_blk = page_address(page);
1603 }
1604
1605 BUG_ON(!nat_blk);
1606 raw_ne = nat_blk->entries[nid - start_nid];
1607flush_now:
1608 new_blkaddr = nat_get_blkaddr(ne);
1609
1610 raw_ne.ino = cpu_to_le32(nat_get_ino(ne));
1611 raw_ne.block_addr = cpu_to_le32(new_blkaddr);
1612 raw_ne.version = nat_get_version(ne);
1613
1614 if (offset < 0) {
1615 nat_blk->entries[nid - start_nid] = raw_ne;
1616 } else {
1617 nat_in_journal(sum, offset) = raw_ne;
1618 nid_in_journal(sum, offset) = cpu_to_le32(nid);
1619 }
1620
1621 if (nat_get_blkaddr(ne) == NULL_ADDR) {
1622 write_lock(&nm_i->nat_tree_lock);
1623 __del_from_nat_cache(nm_i, ne);
1624 write_unlock(&nm_i->nat_tree_lock);
1625
1626 /* We can reuse this freed nid at this point */
1627 add_free_nid(NM_I(sbi), nid);
1628 } else {
1629 write_lock(&nm_i->nat_tree_lock);
1630 __clear_nat_cache_dirty(nm_i, ne);
1631 ne->checkpointed = true;
1632 write_unlock(&nm_i->nat_tree_lock);
1633 }
1634 }
1635 if (!flushed)
1636 mutex_unlock(&curseg->curseg_mutex);
1637 f2fs_put_page(page, 1);
1638
1639 /* 2) shrink nat caches if necessary */
1640 try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD);
1641}
1642
1643static int init_node_manager(struct f2fs_sb_info *sbi)
1644{
1645 struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
1646 struct f2fs_nm_info *nm_i = NM_I(sbi);
1647 unsigned char *version_bitmap;
1648 unsigned int nat_segs, nat_blocks;
1649
1650 nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
1651
1652	/* segment_count_nat includes pair segments, so divide by 2 */
1653 nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
1654 nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
1655 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
1656 nm_i->fcnt = 0;
1657 nm_i->nat_cnt = 0;
1658
1659 INIT_LIST_HEAD(&nm_i->free_nid_list);
1660 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
1661 INIT_LIST_HEAD(&nm_i->nat_entries);
1662 INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
1663
1664 mutex_init(&nm_i->build_lock);
1665 spin_lock_init(&nm_i->free_nid_list_lock);
1666 rwlock_init(&nm_i->nat_tree_lock);
1667
1668 nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
1669 nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
1670 nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
1671
1672 nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL);
1673 if (!nm_i->nat_bitmap)
1674 return -ENOMEM;
1675 version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
1676 if (!version_bitmap)
1677 return -EFAULT;
1678
1679 /* copy version bitmap */
1680 memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size);
1681 return 0;
1682}
1683
1684int build_node_manager(struct f2fs_sb_info *sbi)
1685{
1686 int err;
1687
1688 sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
1689 if (!sbi->nm_info)
1690 return -ENOMEM;
1691
1692 err = init_node_manager(sbi);
1693 if (err)
1694 return err;
1695
1696 build_free_nids(sbi);
1697 return 0;
1698}
1699
1700void destroy_node_manager(struct f2fs_sb_info *sbi)
1701{
1702 struct f2fs_nm_info *nm_i = NM_I(sbi);
1703 struct free_nid *i, *next_i;
1704 struct nat_entry *natvec[NATVEC_SIZE];
1705 nid_t nid = 0;
1706 unsigned int found;
1707
1708 if (!nm_i)
1709 return;
1710
1711 /* destroy free nid list */
1712 spin_lock(&nm_i->free_nid_list_lock);
1713 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
1714 BUG_ON(i->state == NID_ALLOC);
1715 __del_from_free_nid_list(i);
1716 nm_i->fcnt--;
1717 }
1718 BUG_ON(nm_i->fcnt);
1719 spin_unlock(&nm_i->free_nid_list_lock);
1720
1721 /* destroy nat cache */
1722 write_lock(&nm_i->nat_tree_lock);
1723 while ((found = __gang_lookup_nat_cache(nm_i,
1724 nid, NATVEC_SIZE, natvec))) {
1725 unsigned idx;
1726 for (idx = 0; idx < found; idx++) {
1727 struct nat_entry *e = natvec[idx];
1728 nid = nat_get_nid(e) + 1;
1729 __del_from_nat_cache(nm_i, e);
1730 }
1731 }
1732 BUG_ON(nm_i->nat_cnt);
1733 write_unlock(&nm_i->nat_tree_lock);
1734
1735 kfree(nm_i->nat_bitmap);
1736 sbi->nm_info = NULL;
1737 kfree(nm_i);
1738}
1739
1740int __init create_node_manager_caches(void)
1741{
1742 nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
1743 sizeof(struct nat_entry), NULL);
1744 if (!nat_entry_slab)
1745 return -ENOMEM;
1746
1747 free_nid_slab = f2fs_kmem_cache_create("free_nid",
1748 sizeof(struct free_nid), NULL);
1749 if (!free_nid_slab) {
1750 kmem_cache_destroy(nat_entry_slab);
1751 return -ENOMEM;
1752 }
1753 return 0;
1754}
1755
1756void destroy_node_manager_caches(void)
1757{
1758 kmem_cache_destroy(free_nid_slab);
1759 kmem_cache_destroy(nat_entry_slab);
1760}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
new file mode 100644
index 000000000000..afdb130f782e
--- /dev/null
+++ b/fs/f2fs/node.h
@@ -0,0 +1,353 @@
1/*
2 * fs/f2fs/node.h
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11/* start node id of a node block dedicated to the given node id */
12#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
13
14/* node block offset on the NAT area dedicated to the given start node id */
15#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
16
17/* # of pages to perform readahead before building free nids */
18#define FREE_NID_PAGES 4
19
20/* maximum # of free node ids to produce during build_free_nids */
21#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
22
23/* maximum readahead size for node pages while getting data blocks */
24#define MAX_RA_NODE 128
25
26/* maximum cached nat entries to manage memory footprint */
27#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK)
28
29/* vector size for gang look-up from nat cache that consists of radix tree */
30#define NATVEC_SIZE 64
31
32/*
33 * For node information
34 */
35struct node_info {
36 nid_t nid; /* node id */
37 nid_t ino; /* inode number of the node's owner */
38 block_t blk_addr; /* block address of the node */
39 unsigned char version; /* version of the node */
40};
41
42struct nat_entry {
43 struct list_head list; /* for clean or dirty nat list */
44 bool checkpointed; /* whether it is checkpointed or not */
45 struct node_info ni; /* in-memory node information */
46};
47
48#define nat_get_nid(nat) (nat->ni.nid)
49#define nat_set_nid(nat, n) (nat->ni.nid = n)
50#define nat_get_blkaddr(nat) (nat->ni.blk_addr)
51#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b)
52#define nat_get_ino(nat) (nat->ni.ino)
53#define nat_set_ino(nat, i) (nat->ni.ino = i)
54#define nat_get_version(nat) (nat->ni.version)
55#define nat_set_version(nat, v) (nat->ni.version = v)
56
57#define __set_nat_cache_dirty(nm_i, ne) \
58	list_move_tail(&ne->list, &nm_i->dirty_nat_entries)
59#define __clear_nat_cache_dirty(nm_i, ne) \
60	list_move_tail(&ne->list, &nm_i->nat_entries)
61#define inc_node_version(version) (++version)
62
63static inline void node_info_from_raw_nat(struct node_info *ni,
64 struct f2fs_nat_entry *raw_ne)
65{
66 ni->ino = le32_to_cpu(raw_ne->ino);
67 ni->blk_addr = le32_to_cpu(raw_ne->block_addr);
68 ni->version = raw_ne->version;
69}
70
71/*
72 * For free nid management
73 */
74enum nid_state {
75 NID_NEW, /* newly added to free nid list */
76 NID_ALLOC /* it is allocated */
77};
78
79struct free_nid {
80 struct list_head list; /* for free node id list */
81 nid_t nid; /* node id */
82 int state; /* in use or not: NID_NEW or NID_ALLOC */
83};
84
85static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
86{
87 struct f2fs_nm_info *nm_i = NM_I(sbi);
88 struct free_nid *fnid;
89
90 if (nm_i->fcnt <= 0)
91 return -1;
92 spin_lock(&nm_i->free_nid_list_lock);
93 fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
94 *nid = fnid->nid;
95 spin_unlock(&nm_i->free_nid_list_lock);
96 return 0;
97}
98
99/*
100 * inline functions
101 */
102static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr)
103{
104 struct f2fs_nm_info *nm_i = NM_I(sbi);
105 memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size);
106}
107
108static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
109{
110 struct f2fs_nm_info *nm_i = NM_I(sbi);
111 pgoff_t block_off;
112 pgoff_t block_addr;
113 int seg_off;
114
115 block_off = NAT_BLOCK_OFFSET(start);
116 seg_off = block_off >> sbi->log_blocks_per_seg;
117
118 block_addr = (pgoff_t)(nm_i->nat_blkaddr +
119 (seg_off << sbi->log_blocks_per_seg << 1) +
120 (block_off & ((1 << sbi->log_blocks_per_seg) - 1)));
121
122 if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
123 block_addr += sbi->blocks_per_seg;
124
125 return block_addr;
126}
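/*
 * A worked example of the NAT pairing arithmetic in current_nat_addr(),
 * runnable in user space. The geometry is assumed for illustration only:
 * 512 blocks per segment, and 455 NAT entries per 4KB block (4096 bytes
 * divided by a 9-byte raw entry).
 */
#include <stdio.h>

#define NAT_ENTRY_PER_BLOCK	455
#define LOG_BLOCKS_PER_SEG	9
#define BLOCKS_PER_SEG		(1 << LOG_BLOCKS_PER_SEG)

static unsigned long nat_addr(unsigned long nat_blkaddr, unsigned long nid,
			      int bitmap_bit)
{
	unsigned long block_off = nid / NAT_ENTRY_PER_BLOCK;	/* NAT_BLOCK_OFFSET */
	unsigned long seg_off = block_off >> LOG_BLOCKS_PER_SEG;
	unsigned long addr = nat_blkaddr +
		(seg_off << LOG_BLOCKS_PER_SEG << 1) +	/* skip past both copies */
		(block_off & (BLOCKS_PER_SEG - 1));

	if (bitmap_bit)		/* the version bitmap selects the second copy */
		addr += BLOCKS_PER_SEG;
	return addr;
}

int main(void)
{
	/* nid 50000 lives in NAT block 109, inside the first segment pair */
	printf("copy 0: block %lu\n", nat_addr(1024, 50000, 0));	/* 1133 */
	printf("copy 1: block %lu\n", nat_addr(1024, 50000, 1));	/* 1645 */
	return 0;
}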
127
128static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi,
129 pgoff_t block_addr)
130{
131 struct f2fs_nm_info *nm_i = NM_I(sbi);
132
133 block_addr -= nm_i->nat_blkaddr;
134 if ((block_addr >> sbi->log_blocks_per_seg) % 2)
135 block_addr -= sbi->blocks_per_seg;
136 else
137 block_addr += sbi->blocks_per_seg;
138
139 return block_addr + nm_i->nat_blkaddr;
140}
141
142static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
143{
144 unsigned int block_off = NAT_BLOCK_OFFSET(start_nid);
145
146 if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
147 f2fs_clear_bit(block_off, nm_i->nat_bitmap);
148 else
149 f2fs_set_bit(block_off, nm_i->nat_bitmap);
150}
151
152static inline void fill_node_footer(struct page *page, nid_t nid,
153 nid_t ino, unsigned int ofs, bool reset)
154{
155 void *kaddr = page_address(page);
156 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
157 if (reset)
158 memset(rn, 0, sizeof(*rn));
159 rn->footer.nid = cpu_to_le32(nid);
160 rn->footer.ino = cpu_to_le32(ino);
161 rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT);
162}
163
164static inline void copy_node_footer(struct page *dst, struct page *src)
165{
166 void *src_addr = page_address(src);
167 void *dst_addr = page_address(dst);
168 struct f2fs_node *src_rn = (struct f2fs_node *)src_addr;
169 struct f2fs_node *dst_rn = (struct f2fs_node *)dst_addr;
170 memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer));
171}
172
173static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
174{
175 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
176 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
177 void *kaddr = page_address(page);
178 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
179 rn->footer.cp_ver = ckpt->checkpoint_ver;
180 rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
181}
182
183static inline nid_t ino_of_node(struct page *node_page)
184{
185 void *kaddr = page_address(node_page);
186 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
187 return le32_to_cpu(rn->footer.ino);
188}
189
190static inline nid_t nid_of_node(struct page *node_page)
191{
192 void *kaddr = page_address(node_page);
193 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
194 return le32_to_cpu(rn->footer.nid);
195}
196
197static inline unsigned int ofs_of_node(struct page *node_page)
198{
199 void *kaddr = page_address(node_page);
200 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
201 unsigned flag = le32_to_cpu(rn->footer.flag);
202 return flag >> OFFSET_BIT_SHIFT;
203}
204
205static inline unsigned long long cpver_of_node(struct page *node_page)
206{
207 void *kaddr = page_address(node_page);
208 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
209 return le64_to_cpu(rn->footer.cp_ver);
210}
211
212static inline block_t next_blkaddr_of_node(struct page *node_page)
213{
214 void *kaddr = page_address(node_page);
215 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
216 return le32_to_cpu(rn->footer.next_blkaddr);
217}
218
219/*
220 * f2fs assigns the following node offsets described as (num).
221 * N = NIDS_PER_BLOCK
222 *
223 * Inode block (0)
224 * |- direct node (1)
225 * |- direct node (2)
226 * |- indirect node (3)
227 * | `- direct node (4 => 4 + N - 1)
228 * |- indirect node (4 + N)
229 * | `- direct node (5 + N => 5 + 2N - 1)
230 * `- double indirect node (5 + 2N)
231 *              `- indirect node (6 + 2N + x(N + 1), for x = 0, 1, 2, ...)
232 *                    `- direct node (all remaining offsets above 6 + 2N)
233 */
234static inline bool IS_DNODE(struct page *node_page)
235{
236 unsigned int ofs = ofs_of_node(node_page);
237 if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
238 ofs == 5 + 2 * NIDS_PER_BLOCK)
239 return false;
240	if (ofs >= 6 + 2 * NIDS_PER_BLOCK) {
241		ofs -= 6 + 2 * NIDS_PER_BLOCK;
242		if (!((long int)ofs % (NIDS_PER_BLOCK + 1)))
243			return false;
244	}
245 return true;
246}
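/*
 * A user-space sketch of the classification above, shrunk to N = 2 so the
 * whole offset map from the comment is printable; the real N is
 * NIDS_PER_BLOCK.
 */
#include <stdio.h>
#include <stdbool.h>

#define N 2

static bool is_dnode_ofs(unsigned int ofs)
{
	if (ofs == 3 || ofs == 4 + N || ofs == 5 + 2 * N)
		return false;			/* the indirect and dind roots */
	if (ofs >= 6 + 2 * N) {
		ofs -= 6 + 2 * N;
		if (!(ofs % (N + 1)))		/* every (N + 1)-th is indirect */
			return false;
	}
	return true;
}

int main(void)
{
	/* expected: offsets 3, 6, 9, 10 and 13 are not dnodes; the rest are */
	for (unsigned int ofs = 0; ofs <= 15; ofs++)
		printf("%2u: %s\n", ofs, is_dnode_ofs(ofs) ? "dnode" : "indirect");
	return 0;
}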
247
248static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
249{
250 struct f2fs_node *rn = (struct f2fs_node *)page_address(p);
251
252 wait_on_page_writeback(p);
253
254 if (i)
255 rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
256 else
257 rn->in.nid[off] = cpu_to_le32(nid);
258 set_page_dirty(p);
259}
260
261static inline nid_t get_nid(struct page *p, int off, bool i)
262{
263 struct f2fs_node *rn = (struct f2fs_node *)page_address(p);
264 if (i)
265 return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]);
266 return le32_to_cpu(rn->in.nid[off]);
267}
268
269/*
270 * Coldness identification:
271 * - Mark cold files in f2fs_inode_info
272 * - Mark cold node blocks in their node footer
273 * - Mark cold data pages in page cache
274 */
275static inline int is_cold_file(struct inode *inode)
276{
277 return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT;
278}
279
280static inline int is_cold_data(struct page *page)
281{
282 return PageChecked(page);
283}
284
285static inline void set_cold_data(struct page *page)
286{
287 SetPageChecked(page);
288}
289
290static inline void clear_cold_data(struct page *page)
291{
292 ClearPageChecked(page);
293}
294
295static inline int is_cold_node(struct page *page)
296{
297 void *kaddr = page_address(page);
298 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
299 unsigned int flag = le32_to_cpu(rn->footer.flag);
300 return flag & (0x1 << COLD_BIT_SHIFT);
301}
302
303static inline unsigned char is_fsync_dnode(struct page *page)
304{
305 void *kaddr = page_address(page);
306 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
307 unsigned int flag = le32_to_cpu(rn->footer.flag);
308 return flag & (0x1 << FSYNC_BIT_SHIFT);
309}
310
311static inline unsigned char is_dent_dnode(struct page *page)
312{
313 void *kaddr = page_address(page);
314 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
315 unsigned int flag = le32_to_cpu(rn->footer.flag);
316 return flag & (0x1 << DENT_BIT_SHIFT);
317}
318
319static inline void set_cold_node(struct inode *inode, struct page *page)
320{
321 struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
322 unsigned int flag = le32_to_cpu(rn->footer.flag);
323
324 if (S_ISDIR(inode->i_mode))
325 flag &= ~(0x1 << COLD_BIT_SHIFT);
326 else
327 flag |= (0x1 << COLD_BIT_SHIFT);
328 rn->footer.flag = cpu_to_le32(flag);
329}
330
331static inline void set_fsync_mark(struct page *page, int mark)
332{
333 void *kaddr = page_address(page);
334 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
335 unsigned int flag = le32_to_cpu(rn->footer.flag);
336 if (mark)
337 flag |= (0x1 << FSYNC_BIT_SHIFT);
338 else
339 flag &= ~(0x1 << FSYNC_BIT_SHIFT);
340 rn->footer.flag = cpu_to_le32(flag);
341}
342
343static inline void set_dentry_mark(struct page *page, int mark)
344{
345 void *kaddr = page_address(page);
346 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
347 unsigned int flag = le32_to_cpu(rn->footer.flag);
348 if (mark)
349 flag |= (0x1 << DENT_BIT_SHIFT);
350 else
351 flag &= ~(0x1 << DENT_BIT_SHIFT);
352 rn->footer.flag = cpu_to_le32(flag);
353}
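/*
 * A user-space sketch of the footer.flag layout shared by the helpers above:
 * the low bits carry the cold/fsync/dentry marks and the node offset sits
 * above them. The enum mirrors the bit order these helpers assume; the
 * actual shift values are defined elsewhere in f2fs and are an assumption
 * here.
 */
#include <stdio.h>

enum { COLD_BIT_SHIFT = 0, FSYNC_BIT_SHIFT, DENT_BIT_SHIFT, OFFSET_BIT_SHIFT };

int main(void)
{
	unsigned int flag = 0;

	flag |= 57u << OFFSET_BIT_SHIFT;	/* node offset, as in fill_node_footer() */
	flag |= 1u << FSYNC_BIT_SHIFT;		/* set_fsync_mark(page, 1) */

	printf("ofs=%u cold=%u fsync=%u dent=%u\n",
	       flag >> OFFSET_BIT_SHIFT,
	       (flag >> COLD_BIT_SHIFT) & 1,
	       (flag >> FSYNC_BIT_SHIFT) & 1,
	       (flag >> DENT_BIT_SHIFT) & 1);
	return 0;
}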
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
new file mode 100644
index 000000000000..f42e4060b399
--- /dev/null
+++ b/fs/f2fs/recovery.c
@@ -0,0 +1,377 @@
1/*
2 * fs/f2fs/recovery.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include "f2fs.h"
14#include "node.h"
15#include "segment.h"
16
17static struct kmem_cache *fsync_entry_slab;
18
19bool space_for_roll_forward(struct f2fs_sb_info *sbi)
20{
21 if (sbi->last_valid_block_count + sbi->alloc_valid_block_count
22 > sbi->user_block_count)
23 return false;
24 return true;
25}
26
27static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
28 nid_t ino)
29{
30 struct list_head *this;
31 struct fsync_inode_entry *entry;
32
33 list_for_each(this, head) {
34 entry = list_entry(this, struct fsync_inode_entry, list);
35 if (entry->inode->i_ino == ino)
36 return entry;
37 }
38 return NULL;
39}
40
41static int recover_dentry(struct page *ipage, struct inode *inode)
42{
43 struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage);
44 struct f2fs_inode *raw_inode = &(raw_node->i);
45 struct dentry dent, parent;
46 struct f2fs_dir_entry *de;
47 struct page *page;
48 struct inode *dir;
49 int err = 0;
50
51 if (!is_dent_dnode(ipage))
52 goto out;
53
54 dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino));
55 if (IS_ERR(dir)) {
56		err = PTR_ERR(dir);
57 goto out;
58 }
59
60 parent.d_inode = dir;
61 dent.d_parent = &parent;
62 dent.d_name.len = le32_to_cpu(raw_inode->i_namelen);
63 dent.d_name.name = raw_inode->i_name;
64
65 de = f2fs_find_entry(dir, &dent.d_name, &page);
66 if (de) {
67 kunmap(page);
68 f2fs_put_page(page, 0);
69 } else {
70 err = f2fs_add_link(&dent, inode);
71 }
72 iput(dir);
73out:
74 kunmap(ipage);
75 return err;
76}
77
78static int recover_inode(struct inode *inode, struct page *node_page)
79{
80 void *kaddr = page_address(node_page);
81 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
82 struct f2fs_inode *raw_inode = &(raw_node->i);
83
84 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
85 i_size_write(inode, le64_to_cpu(raw_inode->i_size));
86	inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_atime);
87 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
88 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
89	inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_atime_nsec);
90 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
91 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
92
93 return recover_dentry(node_page, inode);
94}
95
96static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
97{
98 unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
99 struct curseg_info *curseg;
100 struct page *page;
101 block_t blkaddr;
102 int err = 0;
103
104 /* get node pages in the current segment */
105 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
106 blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff;
107
108 /* read node page */
109	page = alloc_page(GFP_F2FS_ZERO);
110	if (!page)	/* alloc_page() returns NULL on failure, not an ERR_PTR */
111		return -ENOMEM;
112 lock_page(page);
113
114 while (1) {
115 struct fsync_inode_entry *entry;
116
117 if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
118 goto out;
119
120 if (cp_ver != cpver_of_node(page))
121 goto out;
122
123 if (!is_fsync_dnode(page))
124 goto next;
125
126 entry = get_fsync_inode(head, ino_of_node(page));
127 if (entry) {
128 entry->blkaddr = blkaddr;
129 if (IS_INODE(page) && is_dent_dnode(page))
130 set_inode_flag(F2FS_I(entry->inode),
131 FI_INC_LINK);
132 } else {
133 if (IS_INODE(page) && is_dent_dnode(page)) {
134 if (recover_inode_page(sbi, page)) {
135 err = -ENOMEM;
136 goto out;
137 }
138 }
139
140 /* add this fsync inode to the list */
141 entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
142 if (!entry) {
143 err = -ENOMEM;
144 goto out;
145 }
146
147 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
148 if (IS_ERR(entry->inode)) {
149 err = PTR_ERR(entry->inode);
150 kmem_cache_free(fsync_entry_slab, entry);
151 goto out;
152 }
153
154 list_add_tail(&entry->list, head);
155 entry->blkaddr = blkaddr;
156 }
157 if (IS_INODE(page)) {
158 err = recover_inode(entry->inode, page);
159 if (err)
160 goto out;
161 }
162next:
163		/* follow the chain to the next node block */
164 blkaddr = next_blkaddr_of_node(page);
165 ClearPageUptodate(page);
166 }
167out:
168 unlock_page(page);
169 __free_pages(page, 0);
170 return err;
171}
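/*
 * A toy model of the roll-forward walk above: fsynced node blocks are chained
 * through footer.next_blkaddr, and the walk stops at the first block whose
 * checkpoint version no longer matches. All values are made up.
 */
#include <stdio.h>

struct node_blk {
	unsigned long long cp_ver;
	int next;			/* stand-in for next_blkaddr_of_node() */
};

int main(void)
{
	struct node_blk log[] = {
		{ 5, 1 }, { 5, 2 }, { 5, 3 }, { 4, -1 },	/* last one is stale */
	};
	unsigned long long cp_ver = 5;	/* version of the last checkpoint */
	int blk;

	for (blk = 0; blk >= 0 && log[blk].cp_ver == cp_ver; blk = log[blk].next)
		printf("scan node block %d\n", blk);	/* prints blocks 0, 1, 2 */
	return 0;
}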
172
173static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
174 struct list_head *head)
175{
176 struct fsync_inode_entry *entry, *tmp;
177
178 list_for_each_entry_safe(entry, tmp, head, list) {
179 iput(entry->inode);
180 list_del(&entry->list);
181 kmem_cache_free(fsync_entry_slab, entry);
182 }
183}
184
185static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
186 block_t blkaddr)
187{
188 struct seg_entry *sentry;
189 unsigned int segno = GET_SEGNO(sbi, blkaddr);
190 unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
191 (sbi->blocks_per_seg - 1);
192 struct f2fs_summary sum;
193 nid_t ino;
194 void *kaddr;
195 struct inode *inode;
196 struct page *node_page;
197 block_t bidx;
198 int i;
199
200 sentry = get_seg_entry(sbi, segno);
201 if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
202 return;
203
204 /* Get the previous summary */
205 for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
206 struct curseg_info *curseg = CURSEG_I(sbi, i);
207 if (curseg->segno == segno) {
208 sum = curseg->sum_blk->entries[blkoff];
209 break;
210 }
211 }
212 if (i > CURSEG_COLD_DATA) {
213 struct page *sum_page = get_sum_page(sbi, segno);
214 struct f2fs_summary_block *sum_node;
215 kaddr = page_address(sum_page);
216 sum_node = (struct f2fs_summary_block *)kaddr;
217 sum = sum_node->entries[blkoff];
218 f2fs_put_page(sum_page, 1);
219 }
220
221 /* Get the node page */
222 node_page = get_node_page(sbi, le32_to_cpu(sum.nid));
223 bidx = start_bidx_of_node(ofs_of_node(node_page)) +
224 le16_to_cpu(sum.ofs_in_node);
225 ino = ino_of_node(node_page);
226 f2fs_put_page(node_page, 1);
227
228 /* Deallocate previous index in the node page */
229 inode = f2fs_iget_nowait(sbi->sb, ino);
230 if (IS_ERR(inode))
231 return;
232
233 truncate_hole(inode, bidx, bidx + 1);
234 iput(inode);
235}
236
237static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
238 struct page *page, block_t blkaddr)
239{
240 unsigned int start, end;
241 struct dnode_of_data dn;
242 struct f2fs_summary sum;
243 struct node_info ni;
244
245 start = start_bidx_of_node(ofs_of_node(page));
246 if (IS_INODE(page))
247 end = start + ADDRS_PER_INODE;
248 else
249 end = start + ADDRS_PER_BLOCK;
250
251 set_new_dnode(&dn, inode, NULL, NULL, 0);
252 if (get_dnode_of_data(&dn, start, 0))
253 return;
254
255 wait_on_page_writeback(dn.node_page);
256
257 get_node_info(sbi, dn.nid, &ni);
258 BUG_ON(ni.ino != ino_of_node(page));
259 BUG_ON(ofs_of_node(dn.node_page) != ofs_of_node(page));
260
261 for (; start < end; start++) {
262 block_t src, dest;
263
264 src = datablock_addr(dn.node_page, dn.ofs_in_node);
265 dest = datablock_addr(page, dn.ofs_in_node);
266
267 if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) {
268 if (src == NULL_ADDR) {
269 int err = reserve_new_block(&dn);
270 /* We should not get -ENOSPC */
271 BUG_ON(err);
272 }
273
274 /* Check the previous node page having this index */
275 check_index_in_prev_nodes(sbi, dest);
276
277 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
278
279 /* write dummy data page */
280 recover_data_page(sbi, NULL, &sum, src, dest);
281 update_extent_cache(dest, &dn);
282 }
283 dn.ofs_in_node++;
284 }
285
286 /* write node page in place */
287 set_summary(&sum, dn.nid, 0, 0);
288 if (IS_INODE(dn.node_page))
289 sync_inode_page(&dn);
290
291 copy_node_footer(dn.node_page, page);
292 fill_node_footer(dn.node_page, dn.nid, ni.ino,
293 ofs_of_node(page), false);
294 set_page_dirty(dn.node_page);
295
296 recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
297 f2fs_put_dnode(&dn);
298}
299
300static void recover_data(struct f2fs_sb_info *sbi,
301 struct list_head *head, int type)
302{
303 unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
304 struct curseg_info *curseg;
305 struct page *page;
306 block_t blkaddr;
307
308 /* get node pages in the current segment */
309 curseg = CURSEG_I(sbi, type);
310 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
311
312 /* read node page */
313	page = alloc_page(GFP_NOFS | __GFP_ZERO);
314	if (!page)	/* alloc_page() returns NULL on failure */
315		return;
316 lock_page(page);
317
318 while (1) {
319 struct fsync_inode_entry *entry;
320
321 if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
322 goto out;
323
324 if (cp_ver != cpver_of_node(page))
325 goto out;
326
327 entry = get_fsync_inode(head, ino_of_node(page));
328 if (!entry)
329 goto next;
330
331 do_recover_data(sbi, entry->inode, page, blkaddr);
332
333 if (entry->blkaddr == blkaddr) {
334 iput(entry->inode);
335 list_del(&entry->list);
336 kmem_cache_free(fsync_entry_slab, entry);
337 }
338next:
339		/* follow the chain to the next node block */
340 blkaddr = next_blkaddr_of_node(page);
341 ClearPageUptodate(page);
342 }
343out:
344 unlock_page(page);
345 __free_pages(page, 0);
346
347 allocate_new_segments(sbi);
348}
349
350void recover_fsync_data(struct f2fs_sb_info *sbi)
351{
352 struct list_head inode_list;
353
354 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
355 sizeof(struct fsync_inode_entry), NULL);
356 if (unlikely(!fsync_entry_slab))
357 return;
358
359 INIT_LIST_HEAD(&inode_list);
360
361 /* step #1: find fsynced inode numbers */
362 if (find_fsync_dnodes(sbi, &inode_list))
363 goto out;
364
365 if (list_empty(&inode_list))
366 goto out;
367
368 /* step #2: recover data */
369 sbi->por_doing = 1;
370 recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
371 sbi->por_doing = 0;
372 BUG_ON(!list_empty(&inode_list));
373out:
374 destroy_fsync_dnodes(sbi, &inode_list);
375 kmem_cache_destroy(fsync_entry_slab);
376 write_checkpoint(sbi, false, false);
377}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
new file mode 100644
index 000000000000..4b0099066582
--- /dev/null
+++ b/fs/f2fs/segment.c
@@ -0,0 +1,1757 @@
1/*
2 * fs/f2fs/segment.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/bio.h>
14#include <linux/blkdev.h>
15#include <linux/prefetch.h>
16#include <linux/vmalloc.h>
17
18#include "f2fs.h"
19#include "segment.h"
20#include "node.h"
21
22/*
23 * This function balances dirty node and dentry pages.
24 * In addition, it controls garbage collection.
25 */
26void f2fs_balance_fs(struct f2fs_sb_info *sbi)
27{
28	/*
29	 * We should do GC, which ends up with a checkpoint, if there are
30	 * too many dirty dir/node pages and not enough free segments.
31	 */
32 if (has_not_enough_free_secs(sbi)) {
33 mutex_lock(&sbi->gc_mutex);
34 f2fs_gc(sbi);
35 }
36}
37
38static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
39 enum dirty_type dirty_type)
40{
41 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
42
43 /* need not be added */
44 if (IS_CURSEG(sbi, segno))
45 return;
46
47 if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
48 dirty_i->nr_dirty[dirty_type]++;
49
50 if (dirty_type == DIRTY) {
51 struct seg_entry *sentry = get_seg_entry(sbi, segno);
52 dirty_type = sentry->type;
53 if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
54 dirty_i->nr_dirty[dirty_type]++;
55 }
56}
57
58static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
59 enum dirty_type dirty_type)
60{
61 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
62
63 if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type]))
64 dirty_i->nr_dirty[dirty_type]--;
65
66 if (dirty_type == DIRTY) {
67 struct seg_entry *sentry = get_seg_entry(sbi, segno);
68 dirty_type = sentry->type;
69 if (test_and_clear_bit(segno,
70 dirty_i->dirty_segmap[dirty_type]))
71 dirty_i->nr_dirty[dirty_type]--;
72 clear_bit(segno, dirty_i->victim_segmap[FG_GC]);
73 clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
74 }
75}
76
77/*
78 * This must not fail with an error such as -ENOMEM;
79 * adding a dirty entry into the seglist is not a critical operation.
80 * If a given segment is one of the current working segments, it won't be added.
81 */
82void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
83{
84 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
85 unsigned short valid_blocks;
86
87 if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
88 return;
89
90 mutex_lock(&dirty_i->seglist_lock);
91
92 valid_blocks = get_valid_blocks(sbi, segno, 0);
93
94 if (valid_blocks == 0) {
95 __locate_dirty_segment(sbi, segno, PRE);
96 __remove_dirty_segment(sbi, segno, DIRTY);
97 } else if (valid_blocks < sbi->blocks_per_seg) {
98 __locate_dirty_segment(sbi, segno, DIRTY);
99 } else {
100 /* Recovery routine with SSR needs this */
101 __remove_dirty_segment(sbi, segno, DIRTY);
102 }
103
104 mutex_unlock(&dirty_i->seglist_lock);
105 return;
106}
107
108/*
109 * clear_prefree_segments() should be called after the checkpoint is done.
110 */
111static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
112{
113 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
114 unsigned int segno, offset = 0;
115 unsigned int total_segs = TOTAL_SEGS(sbi);
116
117 mutex_lock(&dirty_i->seglist_lock);
118 while (1) {
119 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
120 offset);
121 if (segno >= total_segs)
122 break;
123 __set_test_and_free(sbi, segno);
124 offset = segno + 1;
125 }
126 mutex_unlock(&dirty_i->seglist_lock);
127}
128
129void clear_prefree_segments(struct f2fs_sb_info *sbi)
130{
131 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
132 unsigned int segno, offset = 0;
133 unsigned int total_segs = TOTAL_SEGS(sbi);
134
135 mutex_lock(&dirty_i->seglist_lock);
136 while (1) {
137 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
138 offset);
139 if (segno >= total_segs)
140 break;
141
142 offset = segno + 1;
143 if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
144 dirty_i->nr_dirty[PRE]--;
145
146 /* Let's use trim */
147 if (test_opt(sbi, DISCARD))
148 blkdev_issue_discard(sbi->sb->s_bdev,
149 START_BLOCK(sbi, segno) <<
150 sbi->log_sectors_per_block,
151 1 << (sbi->log_sectors_per_block +
152 sbi->log_blocks_per_seg),
153 GFP_NOFS, 0);
154 }
155 mutex_unlock(&dirty_i->seglist_lock);
156}
157
158static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
159{
160 struct sit_info *sit_i = SIT_I(sbi);
161 if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap))
162 sit_i->dirty_sentries++;
163}
164
165static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
166 unsigned int segno, int modified)
167{
168 struct seg_entry *se = get_seg_entry(sbi, segno);
169 se->type = type;
170 if (modified)
171 __mark_sit_entry_dirty(sbi, segno);
172}
173
174static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
175{
176 struct seg_entry *se;
177 unsigned int segno, offset;
178 long int new_vblocks;
179
180 segno = GET_SEGNO(sbi, blkaddr);
181
182 se = get_seg_entry(sbi, segno);
183 new_vblocks = se->valid_blocks + del;
184 offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1);
185
186 BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) ||
187 (new_vblocks > sbi->blocks_per_seg)));
188
189 se->valid_blocks = new_vblocks;
190 se->mtime = get_mtime(sbi);
191 SIT_I(sbi)->max_mtime = se->mtime;
192
193 /* Update valid block bitmap */
194 if (del > 0) {
195 if (f2fs_set_bit(offset, se->cur_valid_map))
196 BUG();
197 } else {
198 if (!f2fs_clear_bit(offset, se->cur_valid_map))
199 BUG();
200 }
201 if (!f2fs_test_bit(offset, se->ckpt_valid_map))
202 se->ckpt_valid_blocks += del;
203
204 __mark_sit_entry_dirty(sbi, segno);
205
206 /* update total number of valid blocks to be written in ckpt area */
207 SIT_I(sbi)->written_valid_blocks += del;
208
209 if (sbi->segs_per_sec > 1)
210 get_sec_entry(sbi, segno)->valid_blocks += del;
211}
212
213static void refresh_sit_entry(struct f2fs_sb_info *sbi,
214 block_t old_blkaddr, block_t new_blkaddr)
215{
216 update_sit_entry(sbi, new_blkaddr, 1);
217 if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
218 update_sit_entry(sbi, old_blkaddr, -1);
219}
220
221void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
222{
223 unsigned int segno = GET_SEGNO(sbi, addr);
224 struct sit_info *sit_i = SIT_I(sbi);
225
226 BUG_ON(addr == NULL_ADDR);
227 if (addr == NEW_ADDR)
228 return;
229
230 /* add it into sit main buffer */
231 mutex_lock(&sit_i->sentry_lock);
232
233 update_sit_entry(sbi, addr, -1);
234
235 /* add it into dirty seglist */
236 locate_dirty_segment(sbi, segno);
237
238 mutex_unlock(&sit_i->sentry_lock);
239}
240
241/*
242 * This function must be called with curseg_mutex held
243 */
244static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
245 struct f2fs_summary *sum, unsigned short offset)
246{
247 struct curseg_info *curseg = CURSEG_I(sbi, type);
248 void *addr = curseg->sum_blk;
249 addr += offset * sizeof(struct f2fs_summary);
250 memcpy(addr, sum, sizeof(struct f2fs_summary));
251 return;
252}
253
254/*
255 * Calculate the number of current summary pages for writing
256 */
257int npages_for_summary_flush(struct f2fs_sb_info *sbi)
258{
259 int total_size_bytes = 0;
260 int valid_sum_count = 0;
261 int i, sum_space;
262
263 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
264 if (sbi->ckpt->alloc_type[i] == SSR)
265 valid_sum_count += sbi->blocks_per_seg;
266 else
267 valid_sum_count += curseg_blkoff(sbi, i);
268 }
269
270 total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1)
271 + sizeof(struct nat_journal) + 2
272 + sizeof(struct sit_journal) + 2;
273 sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE;
274 if (total_size_bytes < sum_space)
275 return 1;
276 else if (total_size_bytes < 2 * sum_space)
277 return 2;
278 return 3;
279}
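/*
 * A worked instance of the sizing rule above, runnable in user space. The
 * constants are assumptions for illustration (7-byte summaries plus a 1-byte
 * type tag, 4KB pages, a small footer), and the nat/sit journal struct sizes
 * are left out, keeping only the two 2-byte entry counts.
 */
#include <stdio.h>

#define SUMMARY_SIZE	7
#define SUM_SPACE	(4096 - 20)

int main(void)
{
	int valid_sum_count = 3 * 512;	/* three data logs in SSR, worst case */
	int total = valid_sum_count * (SUMMARY_SIZE + 1) + 2 + 2;

	if (total < SUM_SPACE)
		printf("1 page\n");
	else if (total < 2 * SUM_SPACE)
		printf("2 pages\n");
	else
		printf("3 pages\n");	/* 12292 bytes here, so 3 pages */
	return 0;
}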
280
281/*
282 * Caller should put this summary page
283 */
284struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
285{
286 return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno));
287}
288
289static void write_sum_page(struct f2fs_sb_info *sbi,
290 struct f2fs_summary_block *sum_blk, block_t blk_addr)
291{
292 struct page *page = grab_meta_page(sbi, blk_addr);
293 void *kaddr = page_address(page);
294 memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE);
295 set_page_dirty(page);
296 f2fs_put_page(page, 1);
297}
298
299static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi,
300 int ofs_unit, int type)
301{
302 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
303 unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
304 unsigned int segno, next_segno, i;
305 int ofs = 0;
306
307	/*
308	 * If there are not enough reserved sections,
309	 * we should not reuse prefree segments.
310	 */
311 if (has_not_enough_free_secs(sbi))
312 return NULL_SEGNO;
313
314	/*
315	 * NODE pages should not reuse prefree segments, since that
316	 * information is needed for sudden-power-off recovery (SPOR).
317	 */
318 if (IS_NODESEG(type))
319 return NULL_SEGNO;
320next:
321 segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs++);
322 ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit;
323 if (segno < TOTAL_SEGS(sbi)) {
324 /* skip intermediate segments in a section */
325 if (segno % ofs_unit)
326 goto next;
327
328 /* skip if whole section is not prefree */
329 next_segno = find_next_zero_bit(prefree_segmap,
330 TOTAL_SEGS(sbi), segno + 1);
331 if (next_segno - segno < ofs_unit)
332 goto next;
333
334 /* skip if whole section was not free at the last checkpoint */
335 for (i = 0; i < ofs_unit; i++)
336 if (get_seg_entry(sbi, segno)->ckpt_valid_blocks)
337 goto next;
338 return segno;
339 }
340 return NULL_SEGNO;
341}
342
343/*
344 * Find a new segment from the free segmap in the right allocation order.
345 * This function must succeed; otherwise it is a BUG.
346 */
347static void get_new_segment(struct f2fs_sb_info *sbi,
348 unsigned int *newseg, bool new_sec, int dir)
349{
350 struct free_segmap_info *free_i = FREE_I(sbi);
351 unsigned int total_secs = sbi->total_sections;
352 unsigned int segno, secno, zoneno;
353 unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone;
354 unsigned int hint = *newseg / sbi->segs_per_sec;
355 unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
356 unsigned int left_start = hint;
357 bool init = true;
358 int go_left = 0;
359 int i;
360
361 write_lock(&free_i->segmap_lock);
362
363 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
364 segno = find_next_zero_bit(free_i->free_segmap,
365 TOTAL_SEGS(sbi), *newseg + 1);
366 if (segno < TOTAL_SEGS(sbi))
367 goto got_it;
368 }
369find_other_zone:
370 secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint);
371 if (secno >= total_secs) {
372 if (dir == ALLOC_RIGHT) {
373 secno = find_next_zero_bit(free_i->free_secmap,
374 total_secs, 0);
375 BUG_ON(secno >= total_secs);
376 } else {
377 go_left = 1;
378 left_start = hint - 1;
379 }
380 }
381 if (go_left == 0)
382 goto skip_left;
383
384 while (test_bit(left_start, free_i->free_secmap)) {
385 if (left_start > 0) {
386 left_start--;
387 continue;
388 }
389 left_start = find_next_zero_bit(free_i->free_secmap,
390 total_secs, 0);
391 BUG_ON(left_start >= total_secs);
392 break;
393 }
394 secno = left_start;
395skip_left:
396 hint = secno;
397 segno = secno * sbi->segs_per_sec;
398 zoneno = secno / sbi->secs_per_zone;
399
400 /* give up on finding another zone */
401 if (!init)
402 goto got_it;
403 if (sbi->secs_per_zone == 1)
404 goto got_it;
405 if (zoneno == old_zoneno)
406 goto got_it;
407 if (dir == ALLOC_LEFT) {
408 if (!go_left && zoneno + 1 >= total_zones)
409 goto got_it;
410 if (go_left && zoneno == 0)
411 goto got_it;
412 }
413 for (i = 0; i < NR_CURSEG_TYPE; i++)
414 if (CURSEG_I(sbi, i)->zone == zoneno)
415 break;
416
417 if (i < NR_CURSEG_TYPE) {
418		/* zone is in use, try another */
419 if (go_left)
420 hint = zoneno * sbi->secs_per_zone - 1;
421 else if (zoneno + 1 >= total_zones)
422 hint = 0;
423 else
424 hint = (zoneno + 1) * sbi->secs_per_zone;
425 init = false;
426 goto find_other_zone;
427 }
428got_it:
429 /* set it as dirty segment in free segmap */
430 BUG_ON(test_bit(segno, free_i->free_segmap));
431 __set_inuse(sbi, segno);
432 *newseg = segno;
433 write_unlock(&free_i->segmap_lock);
434}
435
436static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
437{
438 struct curseg_info *curseg = CURSEG_I(sbi, type);
439 struct summary_footer *sum_footer;
440
441 curseg->segno = curseg->next_segno;
442 curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
443 curseg->next_blkoff = 0;
444 curseg->next_segno = NULL_SEGNO;
445
446 sum_footer = &(curseg->sum_blk->footer);
447 memset(sum_footer, 0, sizeof(struct summary_footer));
448 if (IS_DATASEG(type))
449 SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA);
450 if (IS_NODESEG(type))
451 SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE);
452 __set_sit_entry_type(sbi, type, curseg->segno, modified);
453}
454
455/*
456 * Allocate a current working segment.
457 * This function always allocates a free segment in LFS manner.
458 */
459static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
460{
461 struct curseg_info *curseg = CURSEG_I(sbi, type);
462 unsigned int segno = curseg->segno;
463 int dir = ALLOC_LEFT;
464
465 write_sum_page(sbi, curseg->sum_blk,
466 GET_SUM_BLOCK(sbi, curseg->segno));
467 if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
468 dir = ALLOC_RIGHT;
469
470 if (test_opt(sbi, NOHEAP))
471 dir = ALLOC_RIGHT;
472
473 get_new_segment(sbi, &segno, new_sec, dir);
474 curseg->next_segno = segno;
475 reset_curseg(sbi, type, 1);
476 curseg->alloc_type = LFS;
477}
478
479static void __next_free_blkoff(struct f2fs_sb_info *sbi,
480 struct curseg_info *seg, block_t start)
481{
482 struct seg_entry *se = get_seg_entry(sbi, seg->segno);
483 block_t ofs;
484 for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) {
485 if (!f2fs_test_bit(ofs, se->ckpt_valid_map)
486 && !f2fs_test_bit(ofs, se->cur_valid_map))
487 break;
488 }
489 seg->next_blkoff = ofs;
490}
491
492/*
493 * If a segment is written in the LFS manner, the next block offset is simply
494 * obtained by increasing the current block offset. However, if a segment is
495 * written in the SSR manner, the next block offset is found by __next_free_blkoff
496 */
497static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
498 struct curseg_info *seg)
499{
500 if (seg->alloc_type == SSR)
501 __next_free_blkoff(sbi, seg, seg->next_blkoff + 1);
502 else
503 seg->next_blkoff++;
504}
505
506/*
507 * This function always allocates a used segment (from the dirty seglist) in
508 * the SSR manner, so it has to recover the segment's existing valid-block info
509 */
510static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
511{
512 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
513 struct curseg_info *curseg = CURSEG_I(sbi, type);
514 unsigned int new_segno = curseg->next_segno;
515 struct f2fs_summary_block *sum_node;
516 struct page *sum_page;
517
518 write_sum_page(sbi, curseg->sum_blk,
519 GET_SUM_BLOCK(sbi, curseg->segno));
520 __set_test_and_inuse(sbi, new_segno);
521
522 mutex_lock(&dirty_i->seglist_lock);
523 __remove_dirty_segment(sbi, new_segno, PRE);
524 __remove_dirty_segment(sbi, new_segno, DIRTY);
525 mutex_unlock(&dirty_i->seglist_lock);
526
527 reset_curseg(sbi, type, 1);
528 curseg->alloc_type = SSR;
529 __next_free_blkoff(sbi, curseg, 0);
530
531 if (reuse) {
532 sum_page = get_sum_page(sbi, new_segno);
533 sum_node = (struct f2fs_summary_block *)page_address(sum_page);
534 memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
535 f2fs_put_page(sum_page, 1);
536 }
537}
538
539/*
540 * Flush out the current segment and replace it with a new segment.
541 * This function must succeed; otherwise it is a BUG.
542 */
543static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
544 int type, bool force)
545{
546 struct curseg_info *curseg = CURSEG_I(sbi, type);
547 unsigned int ofs_unit;
548
549 if (force) {
550 new_curseg(sbi, type, true);
551 goto out;
552 }
553
554 ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec;
555 curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type);
556
557 if (curseg->next_segno != NULL_SEGNO)
558 change_curseg(sbi, type, false);
559 else if (type == CURSEG_WARM_NODE)
560 new_curseg(sbi, type, false);
561 else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
562 change_curseg(sbi, type, true);
563 else
564 new_curseg(sbi, type, false);
565out:
566 sbi->segment_count[curseg->alloc_type]++;
567}
568
569void allocate_new_segments(struct f2fs_sb_info *sbi)
570{
571 struct curseg_info *curseg;
572 unsigned int old_curseg;
573 int i;
574
575 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
576 curseg = CURSEG_I(sbi, i);
577 old_curseg = curseg->segno;
578 SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
579 locate_dirty_segment(sbi, old_curseg);
580 }
581}
582
583static const struct segment_allocation default_salloc_ops = {
584 .allocate_segment = allocate_segment_by_default,
585};
586
587static void f2fs_end_io_write(struct bio *bio, int err)
588{
589 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
590 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
591 struct bio_private *p = bio->bi_private;
592
593 do {
594 struct page *page = bvec->bv_page;
595
596 if (--bvec >= bio->bi_io_vec)
597 prefetchw(&bvec->bv_page->flags);
598 if (!uptodate) {
599 SetPageError(page);
600 if (page->mapping)
601 set_bit(AS_EIO, &page->mapping->flags);
602 set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
603 }
604 end_page_writeback(page);
605 dec_page_count(p->sbi, F2FS_WRITEBACK);
606 } while (bvec >= bio->bi_io_vec);
607
608 if (p->is_sync)
609 complete(p->wait);
610 kfree(p);
611 bio_put(bio);
612}
613
614struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
615{
616 struct bio *bio;
617 struct bio_private *priv;
618retry:
619 priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
620 if (!priv) {
621 cond_resched();
622 goto retry;
623 }
624
625 /* No failure on bio allocation */
626 bio = bio_alloc(GFP_NOIO, npages);
627 bio->bi_bdev = bdev;
628 bio->bi_private = priv;
629 return bio;
630}
631
632static void do_submit_bio(struct f2fs_sb_info *sbi,
633 enum page_type type, bool sync)
634{
635 int rw = sync ? WRITE_SYNC : WRITE;
636 enum page_type btype = type > META ? META : type;
637
638 if (type >= META_FLUSH)
639 rw = WRITE_FLUSH_FUA;
640
641 if (sbi->bio[btype]) {
642 struct bio_private *p = sbi->bio[btype]->bi_private;
643 p->sbi = sbi;
644 sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
645 if (type == META_FLUSH) {
646 DECLARE_COMPLETION_ONSTACK(wait);
647 p->is_sync = true;
648 p->wait = &wait;
649 submit_bio(rw, sbi->bio[btype]);
650 wait_for_completion(&wait);
651 } else {
652 p->is_sync = false;
653 submit_bio(rw, sbi->bio[btype]);
654 }
655 sbi->bio[btype] = NULL;
656 }
657}
658
659void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync)
660{
661 down_write(&sbi->bio_sem);
662 do_submit_bio(sbi, type, sync);
663 up_write(&sbi->bio_sem);
664}
665
666static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
667 block_t blk_addr, enum page_type type)
668{
669 struct block_device *bdev = sbi->sb->s_bdev;
670
671 verify_block_addr(sbi, blk_addr);
672
673 down_write(&sbi->bio_sem);
674
675 inc_page_count(sbi, F2FS_WRITEBACK);
676
677 if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1)
678 do_submit_bio(sbi, type, false);
679alloc_new:
680 if (sbi->bio[type] == NULL) {
681 sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev));
682 sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
683 /*
684 * The end_io will be assigned at the submission phase.
685 * Until then, let bio_add_page() merge consecutive IOs as much
686 * as possible.
687 */
688 }
689
690 if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) <
691 PAGE_CACHE_SIZE) {
692 do_submit_bio(sbi, type, false);
693 goto alloc_new;
694 }
695
696 sbi->last_block_in_bio[type] = blk_addr;
697
698 up_write(&sbi->bio_sem);
699}
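submit_write_page() above batches contiguous writes: a page is appended to the per-type bio only if its block address directly follows the previous one, and any gap (or a full bio) forces a submit. Here is a minimal userspace sketch of just that merging policy, with an assumed 4-page bio capacity standing in for bio_get_nr_vecs().

#include <stdio.h>

#define MAX_PAGES_PER_BIO 4	/* assumed stand-in for bio_get_nr_vecs() */

static long last_block = -2;	/* last block added to the open "bio" */
static int nr_pages;		/* pages in the open "bio" */

static void submit(void)
{
	if (nr_pages)
		printf("submit bio: %d page(s) ending at block %ld\n",
		       nr_pages, last_block);
	nr_pages = 0;
}

static void write_page(long blk_addr)
{
	if (nr_pages && last_block != blk_addr - 1)
		submit();			/* discontiguous: flush */
	if (nr_pages == MAX_PAGES_PER_BIO)
		submit();			/* full: flush, then reopen */
	nr_pages++;
	last_block = blk_addr;
}

int main(void)
{
	long blocks[] = { 100, 101, 102, 200, 201, 500 };

	for (unsigned int i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++)
		write_page(blocks[i]);
	submit();				/* final flush */
	return 0;	/* bios of 3, 2 and 1 page(s) are "submitted" */
}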
700
701static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
702{
703 struct curseg_info *curseg = CURSEG_I(sbi, type);
704 if (curseg->next_blkoff < sbi->blocks_per_seg)
705 return true;
706 return false;
707}
708
709static int __get_segment_type_2(struct page *page, enum page_type p_type)
710{
711 if (p_type == DATA)
712 return CURSEG_HOT_DATA;
713 else
714 return CURSEG_HOT_NODE;
715}
716
717static int __get_segment_type_4(struct page *page, enum page_type p_type)
718{
719 if (p_type == DATA) {
720 struct inode *inode = page->mapping->host;
721
722 if (S_ISDIR(inode->i_mode))
723 return CURSEG_HOT_DATA;
724 else
725 return CURSEG_COLD_DATA;
726 } else {
727 if (IS_DNODE(page) && !is_cold_node(page))
728 return CURSEG_HOT_NODE;
729 else
730 return CURSEG_COLD_NODE;
731 }
732}
733
734static int __get_segment_type_6(struct page *page, enum page_type p_type)
735{
736 if (p_type == DATA) {
737 struct inode *inode = page->mapping->host;
738
739 if (S_ISDIR(inode->i_mode))
740 return CURSEG_HOT_DATA;
741 else if (is_cold_data(page) || is_cold_file(inode))
742 return CURSEG_COLD_DATA;
743 else
744 return CURSEG_WARM_DATA;
745 } else {
746 if (IS_DNODE(page))
747 return is_cold_node(page) ? CURSEG_WARM_NODE :
748 CURSEG_HOT_NODE;
749 else
750 return CURSEG_COLD_NODE;
751 }
752}
753
754static int __get_segment_type(struct page *page, enum page_type p_type)
755{
756 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
757 switch (sbi->active_logs) {
758 case 2:
759 return __get_segment_type_2(page, p_type);
760 case 4:
761 return __get_segment_type_4(page, p_type);
762 }
763 /* NR_CURSEG_TYPE(6) logs by default */
764 BUG_ON(sbi->active_logs != NR_CURSEG_TYPE);
765 return __get_segment_type_6(page, p_type);
766}
767
768static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
769 block_t old_blkaddr, block_t *new_blkaddr,
770 struct f2fs_summary *sum, enum page_type p_type)
771{
772 struct sit_info *sit_i = SIT_I(sbi);
773 struct curseg_info *curseg;
774 unsigned int old_cursegno;
775 int type;
776
777 type = __get_segment_type(page, p_type);
778 curseg = CURSEG_I(sbi, type);
779
780 mutex_lock(&curseg->curseg_mutex);
781
782 *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
783 old_cursegno = curseg->segno;
784
785 /*
786 * __add_sum_entry must be called under the curseg_mutex
787 * because this function updates a summary entry in the
788 * current summary block.
789 */
790 __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
791
792 mutex_lock(&sit_i->sentry_lock);
793 __refresh_next_blkoff(sbi, curseg);
794 sbi->block_count[curseg->alloc_type]++;
795
796 /*
797 * SIT information should be updated before segment allocation,
798 * since SSR needs the latest valid block information.
799 */
800 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
801
802 if (!__has_curseg_space(sbi, type))
803 sit_i->s_ops->allocate_segment(sbi, type, false);
804
805 locate_dirty_segment(sbi, old_cursegno);
806 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
807 mutex_unlock(&sit_i->sentry_lock);
808
809 if (p_type == NODE)
810 fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
811
812 /* write out the dirty page to the bdev */
813 submit_write_page(sbi, page, *new_blkaddr, p_type);
814
815 mutex_unlock(&curseg->curseg_mutex);
816}
817
818int write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
819 struct writeback_control *wbc)
820{
821 if (wbc->for_reclaim)
822 return AOP_WRITEPAGE_ACTIVATE;
823
824 set_page_writeback(page);
825 submit_write_page(sbi, page, page->index, META);
826 return 0;
827}
828
829void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
830 unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
831{
832 struct f2fs_summary sum;
833 set_summary(&sum, nid, 0, 0);
834 do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE);
835}
836
837void write_data_page(struct inode *inode, struct page *page,
838 struct dnode_of_data *dn, block_t old_blkaddr,
839 block_t *new_blkaddr)
840{
841 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
842 struct f2fs_summary sum;
843 struct node_info ni;
844
845 BUG_ON(old_blkaddr == NULL_ADDR);
846 get_node_info(sbi, dn->nid, &ni);
847 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
848
849 do_write_page(sbi, page, old_blkaddr,
850 new_blkaddr, &sum, DATA);
851}
852
853void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page,
854 block_t old_blk_addr)
855{
856 submit_write_page(sbi, page, old_blk_addr, DATA);
857}
858
859void recover_data_page(struct f2fs_sb_info *sbi,
860 struct page *page, struct f2fs_summary *sum,
861 block_t old_blkaddr, block_t new_blkaddr)
862{
863 struct sit_info *sit_i = SIT_I(sbi);
864 struct curseg_info *curseg;
865 unsigned int segno, old_cursegno;
866 struct seg_entry *se;
867 int type;
868
869 segno = GET_SEGNO(sbi, new_blkaddr);
870 se = get_seg_entry(sbi, segno);
871 type = se->type;
872
873 if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
874 if (old_blkaddr == NULL_ADDR)
875 type = CURSEG_COLD_DATA;
876 else
877 type = CURSEG_WARM_DATA;
878 }
879 curseg = CURSEG_I(sbi, type);
880
881 mutex_lock(&curseg->curseg_mutex);
882 mutex_lock(&sit_i->sentry_lock);
883
884 old_cursegno = curseg->segno;
885
886 /* change the current segment */
887 if (segno != curseg->segno) {
888 curseg->next_segno = segno;
889 change_curseg(sbi, type, true);
890 }
891
892 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
893 (sbi->blocks_per_seg - 1);
894 __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
895
896 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
897
898 locate_dirty_segment(sbi, old_cursegno);
899 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
900
901 mutex_unlock(&sit_i->sentry_lock);
902 mutex_unlock(&curseg->curseg_mutex);
903}
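The `& (sbi->blocks_per_seg - 1)` used above is the usual power-of-two trick: since blocks_per_seg is a power of two, the mask yields the block offset within the segment without a division. A worked userspace example, with an assumed geometry:

#include <stdio.h>

int main(void)
{
	unsigned int seg0_blkaddr = 512;	/* assumed start of segment 0 */
	unsigned int blocks_per_seg = 512;	/* power of two by design */
	unsigned int blk_addr = 1500;

	unsigned int segoff = blk_addr - seg0_blkaddr;		/* 988 */
	unsigned int segno = segoff / blocks_per_seg;		/* 1 */
	unsigned int blkoff = segoff & (blocks_per_seg - 1);	/* 988 % 512 = 476 */

	printf("block %u -> segment %u, offset %u\n", blk_addr, segno, blkoff);
	return 0;
}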
904
905void rewrite_node_page(struct f2fs_sb_info *sbi,
906 struct page *page, struct f2fs_summary *sum,
907 block_t old_blkaddr, block_t new_blkaddr)
908{
909 struct sit_info *sit_i = SIT_I(sbi);
910 int type = CURSEG_WARM_NODE;
911 struct curseg_info *curseg;
912 unsigned int segno, old_cursegno;
913 block_t next_blkaddr = next_blkaddr_of_node(page);
914 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
915
916 curseg = CURSEG_I(sbi, type);
917
918 mutex_lock(&curseg->curseg_mutex);
919 mutex_lock(&sit_i->sentry_lock);
920
921 segno = GET_SEGNO(sbi, new_blkaddr);
922 old_cursegno = curseg->segno;
923
924 /* change the current segment */
925 if (segno != curseg->segno) {
926 curseg->next_segno = segno;
927 change_curseg(sbi, type, true);
928 }
929 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
930 (sbi->blocks_per_seg - 1);
931 __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
932
933 /* change the current log to the next block addr in advance */
934 if (next_segno != segno) {
935 curseg->next_segno = next_segno;
936 change_curseg(sbi, type, true);
937 }
938 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) &
939 (sbi->blocks_per_seg - 1);
940
941 /* rewrite node page */
942 set_page_writeback(page);
943 submit_write_page(sbi, page, new_blkaddr, NODE);
944 f2fs_submit_bio(sbi, NODE, true);
945 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
946
947 locate_dirty_segment(sbi, old_cursegno);
948 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
949
950 mutex_unlock(&sit_i->sentry_lock);
951 mutex_unlock(&curseg->curseg_mutex);
952}
953
954static int read_compacted_summaries(struct f2fs_sb_info *sbi)
955{
956 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
957 struct curseg_info *seg_i;
958 unsigned char *kaddr;
959 struct page *page;
960 block_t start;
961 int i, j, offset;
962
963 start = start_sum_block(sbi);
964
965 page = get_meta_page(sbi, start++);
966 kaddr = (unsigned char *)page_address(page);
967
968 /* Step 1: restore nat cache */
969 seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
970 memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE);
971
972 /* Step 2: restore sit cache */
973 seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
974 memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE,
975 SUM_JOURNAL_SIZE);
976 offset = 2 * SUM_JOURNAL_SIZE;
977
978 /* Step 3: restore summary entries */
979 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
980 unsigned short blk_off;
981 unsigned int segno;
982
983 seg_i = CURSEG_I(sbi, i);
984 segno = le32_to_cpu(ckpt->cur_data_segno[i]);
985 blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]);
986 seg_i->next_segno = segno;
987 reset_curseg(sbi, i, 0);
988 seg_i->alloc_type = ckpt->alloc_type[i];
989 seg_i->next_blkoff = blk_off;
990
991 if (seg_i->alloc_type == SSR)
992 blk_off = sbi->blocks_per_seg;
993
994 for (j = 0; j < blk_off; j++) {
995 struct f2fs_summary *s;
996 s = (struct f2fs_summary *)(kaddr + offset);
997 seg_i->sum_blk->entries[j] = *s;
998 offset += SUMMARY_SIZE;
999 if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
1000 SUM_FOOTER_SIZE)
1001 continue;
1002
1003 f2fs_put_page(page, 1);
1004 page = NULL;
1005
1006 page = get_meta_page(sbi, start++);
1007 kaddr = (unsigned char *)page_address(page);
1008 offset = 0;
1009 }
1010 }
1011 f2fs_put_page(page, 1);
1012 return 0;
1013}
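The loop above walks the compacted format: the NAT and SIT journals come first, followed by tightly packed summary entries, with the footer area of each page left unused. The following standalone sketch shows the capacity arithmetic; all sizes are hardcoded assumptions for illustration, not authoritative on-disk constants.

#include <stdio.h>

#define PAGE_SIZE	 4096
#define SUM_JOURNAL_SIZE 512	/* assumed size of one journal area */
#define SUMMARY_SIZE	 7	/* assumed size of struct f2fs_summary */
#define SUM_FOOTER_SIZE	 5	/* assumed size of the summary footer */

int main(void)
{
	int usable = PAGE_SIZE - SUM_FOOTER_SIZE;
	int first_page = usable - 2 * SUM_JOURNAL_SIZE;	/* after both journals */

	printf("entries in first page: %d\n", first_page / SUMMARY_SIZE);
	printf("entries in later pages: %d\n", usable / SUMMARY_SIZE);
	return 0;
}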
1014
1015static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1016{
1017 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
1018 struct f2fs_summary_block *sum;
1019 struct curseg_info *curseg;
1020 struct page *new;
1021 unsigned short blk_off;
1022 unsigned int segno = 0;
1023 block_t blk_addr = 0;
1024
1025 /* get segment number and block addr */
1026 if (IS_DATASEG(type)) {
1027 segno = le32_to_cpu(ckpt->cur_data_segno[type]);
1028 blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
1029 CURSEG_HOT_DATA]);
1030 if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
1031 blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
1032 else
1033 blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
1034 } else {
1035 segno = le32_to_cpu(ckpt->cur_node_segno[type -
1036 CURSEG_HOT_NODE]);
1037 blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
1038 CURSEG_HOT_NODE]);
1039 if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
1040 blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
1041 type - CURSEG_HOT_NODE);
1042 else
1043 blk_addr = GET_SUM_BLOCK(sbi, segno);
1044 }
1045
1046 new = get_meta_page(sbi, blk_addr);
1047 sum = (struct f2fs_summary_block *)page_address(new);
1048
1049 if (IS_NODESEG(type)) {
1050 if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) {
1051 struct f2fs_summary *ns = &sum->entries[0];
1052 int i;
1053 for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
1054 ns->version = 0;
1055 ns->ofs_in_node = 0;
1056 }
1057 } else {
1058 if (restore_node_summary(sbi, segno, sum)) {
1059 f2fs_put_page(new, 1);
1060 return -EINVAL;
1061 }
1062 }
1063 }
1064
1065 /* set uncompleted segment to curseg */
1066 curseg = CURSEG_I(sbi, type);
1067 mutex_lock(&curseg->curseg_mutex);
1068 memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE);
1069 curseg->next_segno = segno;
1070 reset_curseg(sbi, type, 0);
1071 curseg->alloc_type = ckpt->alloc_type[type];
1072 curseg->next_blkoff = blk_off;
1073 mutex_unlock(&curseg->curseg_mutex);
1074 f2fs_put_page(new, 1);
1075 return 0;
1076}
1077
1078static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
1079{
1080 int type = CURSEG_HOT_DATA;
1081
1082 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
1083 /* restore for compacted data summary */
1084 if (read_compacted_summaries(sbi))
1085 return -EINVAL;
1086 type = CURSEG_HOT_NODE;
1087 }
1088
1089 for (; type <= CURSEG_COLD_NODE; type++)
1090 if (read_normal_summaries(sbi, type))
1091 return -EINVAL;
1092 return 0;
1093}
1094
1095static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
1096{
1097 struct page *page;
1098 unsigned char *kaddr;
1099 struct f2fs_summary *summary;
1100 struct curseg_info *seg_i;
1101 int written_size = 0;
1102 int i, j;
1103
1104 page = grab_meta_page(sbi, blkaddr++);
1105 kaddr = (unsigned char *)page_address(page);
1106
1107 /* Step 1: write nat cache */
1108 seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
1109 memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE);
1110 written_size += SUM_JOURNAL_SIZE;
1111
1112 /* Step 2: write sit cache */
1113 seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
1114 memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits,
1115 SUM_JOURNAL_SIZE);
1116 written_size += SUM_JOURNAL_SIZE;
1117
1118 set_page_dirty(page);
1119
1120 /* Step 3: write summary entries */
1121 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
1122 unsigned short blkoff;
1123 seg_i = CURSEG_I(sbi, i);
1124 if (sbi->ckpt->alloc_type[i] == SSR)
1125 blkoff = sbi->blocks_per_seg;
1126 else
1127 blkoff = curseg_blkoff(sbi, i);
1128
1129 for (j = 0; j < blkoff; j++) {
1130 if (!page) {
1131 page = grab_meta_page(sbi, blkaddr++);
1132 kaddr = (unsigned char *)page_address(page);
1133 written_size = 0;
1134 }
1135 summary = (struct f2fs_summary *)(kaddr + written_size);
1136 *summary = seg_i->sum_blk->entries[j];
1137 written_size += SUMMARY_SIZE;
1138 set_page_dirty(page);
1139
1140 if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
1141 SUM_FOOTER_SIZE)
1142 continue;
1143
1144 f2fs_put_page(page, 1);
1145 page = NULL;
1146 }
1147 }
1148 if (page)
1149 f2fs_put_page(page, 1);
1150}
1151
1152static void write_normal_summaries(struct f2fs_sb_info *sbi,
1153 block_t blkaddr, int type)
1154{
1155 int i, end;
1156 if (IS_DATASEG(type))
1157 end = type + NR_CURSEG_DATA_TYPE;
1158 else
1159 end = type + NR_CURSEG_NODE_TYPE;
1160
1161 for (i = type; i < end; i++) {
1162 struct curseg_info *sum = CURSEG_I(sbi, i);
1163 mutex_lock(&sum->curseg_mutex);
1164 write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type));
1165 mutex_unlock(&sum->curseg_mutex);
1166 }
1167}
1168
1169void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
1170{
1171 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG))
1172 write_compacted_summaries(sbi, start_blk);
1173 else
1174 write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
1175}
1176
1177void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
1178{
1179 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
1180 write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
1181 return;
1182}
1183
1184int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
1185 unsigned int val, int alloc)
1186{
1187 int i;
1188
1189 if (type == NAT_JOURNAL) {
1190 for (i = 0; i < nats_in_cursum(sum); i++) {
1191 if (le32_to_cpu(nid_in_journal(sum, i)) == val)
1192 return i;
1193 }
1194 if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES)
1195 return update_nats_in_cursum(sum, 1);
1196 } else if (type == SIT_JOURNAL) {
1197 for (i = 0; i < sits_in_cursum(sum); i++)
1198 if (le32_to_cpu(segno_in_journal(sum, i)) == val)
1199 return i;
1200 if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES)
1201 return update_sits_in_cursum(sum, 1);
1202 }
1203 return -1;
1204}
1205
1206static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
1207 unsigned int segno)
1208{
1209 struct sit_info *sit_i = SIT_I(sbi);
1210 unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno);
1211 block_t blk_addr = sit_i->sit_base_addr + offset;
1212
1213 check_seg_range(sbi, segno);
1214
1215 /* calculate sit block address */
1216 if (f2fs_test_bit(offset, sit_i->sit_bitmap))
1217 blk_addr += sit_i->sit_blocks;
1218
1219 return get_meta_page(sbi, blk_addr);
1220}
1221
1222static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
1223 unsigned int start)
1224{
1225 struct sit_info *sit_i = SIT_I(sbi);
1226 struct page *src_page, *dst_page;
1227 pgoff_t src_off, dst_off;
1228 void *src_addr, *dst_addr;
1229
1230 src_off = current_sit_addr(sbi, start);
1231 dst_off = next_sit_addr(sbi, src_off);
1232
1233 /* get current sit block page without lock */
1234 src_page = get_meta_page(sbi, src_off);
1235 dst_page = grab_meta_page(sbi, dst_off);
1236 BUG_ON(PageDirty(src_page));
1237
1238 src_addr = page_address(src_page);
1239 dst_addr = page_address(dst_page);
1240 memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
1241
1242 set_page_dirty(dst_page);
1243 f2fs_put_page(src_page, 1);
1244
1245 set_to_next_sit(sit_i, start);
1246
1247 return dst_page;
1248}
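get_next_sit_page() implements f2fs's two-copy (ping-pong) SIT scheme: each SIT block exists at two locations, a bitmap bit selects the current copy, and every rewrite goes to the other location and then flips the bit, so a crash mid-write never corrupts the live copy. A userspace sketch with assumed addresses:

#include <stdio.h>

#define SIT_BASE	1024	/* assumed start of SIT area */
#define SIT_BLOCKS	64	/* assumed blocks per SIT copy */

static unsigned char sit_bitmap[SIT_BLOCKS / 8];

static unsigned long current_sit_addr(unsigned int off)
{
	int in_copy_b = sit_bitmap[off >> 3] & (1 << (off & 7));

	return SIT_BASE + off + (in_copy_b ? SIT_BLOCKS : 0);
}

static void rewrite_sit_block(unsigned int off)
{
	printf("read  copy at block %lu\n", current_sit_addr(off));
	sit_bitmap[off >> 3] ^= 1 << (off & 7);	/* flip to the other copy */
	printf("write copy at block %lu\n", current_sit_addr(off));
}

int main(void)
{
	rewrite_sit_block(5);	/* A -> B: read 1029, write 1093 */
	rewrite_sit_block(5);	/* B -> A: read 1093, write 1029 */
	return 0;
}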
1249
1250static bool flush_sits_in_journal(struct f2fs_sb_info *sbi)
1251{
1252 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1253 struct f2fs_summary_block *sum = curseg->sum_blk;
1254 int i;
1255
1256 /*
1257 * If the journal area in the current summary is full of sit entries,
1258 * all the sit entries will be flushed. Otherwise, they cannot be
1259 * replaced with newly hot sit entries.
1260 */
1261 if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) {
1262 for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
1263 unsigned int segno;
1264 segno = le32_to_cpu(segno_in_journal(sum, i));
1265 __mark_sit_entry_dirty(sbi, segno);
1266 }
1267 update_sits_in_cursum(sum, -sits_in_cursum(sum));
1268 return 1;
1269 }
1270 return 0;
1271}
1272
1273/*
1274 * CP calls this function, which flushes SIT entries including sit_journal,
1275 * and moves prefree segs to free segs.
1276 */
1277void flush_sit_entries(struct f2fs_sb_info *sbi)
1278{
1279 struct sit_info *sit_i = SIT_I(sbi);
1280 unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
1281 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1282 struct f2fs_summary_block *sum = curseg->sum_blk;
1283 unsigned long nsegs = TOTAL_SEGS(sbi);
1284 struct page *page = NULL;
1285 struct f2fs_sit_block *raw_sit = NULL;
1286 unsigned int start = 0, end = 0;
1287 unsigned int segno = -1;
1288 bool flushed;
1289
1290 mutex_lock(&curseg->curseg_mutex);
1291 mutex_lock(&sit_i->sentry_lock);
1292
1293 /*
1294 * "flushed" indicates whether sit entries in journal are flushed
1295 * to the SIT area or not.
1296 */
1297 flushed = flush_sits_in_journal(sbi);
1298
1299 while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) {
1300 struct seg_entry *se = get_seg_entry(sbi, segno);
1301 int sit_offset, offset;
1302
1303 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
1304
1305 if (flushed)
1306 goto to_sit_page;
1307
1308 offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1);
1309 if (offset >= 0) {
1310 segno_in_journal(sum, offset) = cpu_to_le32(segno);
1311 seg_info_to_raw_sit(se, &sit_in_journal(sum, offset));
1312 goto flush_done;
1313 }
1314to_sit_page:
1315 if (!page || (start > segno) || (segno > end)) {
1316 if (page) {
1317 f2fs_put_page(page, 1);
1318 page = NULL;
1319 }
1320
1321 start = START_SEGNO(sit_i, segno);
1322 end = start + SIT_ENTRY_PER_BLOCK - 1;
1323
1324 /* read sit block that will be updated */
1325 page = get_next_sit_page(sbi, start);
1326 raw_sit = page_address(page);
1327 }
1328
1329 /* update entry in SIT block */
1330 seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]);
1331flush_done:
1332 __clear_bit(segno, bitmap);
1333 sit_i->dirty_sentries--;
1334 }
1335 mutex_unlock(&sit_i->sentry_lock);
1336 mutex_unlock(&curseg->curseg_mutex);
1337
1338 /* writeout last modified SIT block */
1339 f2fs_put_page(page, 1);
1340
1341 set_prefree_as_free_segments(sbi);
1342}
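The flush loop above visits exactly the segments whose bit is set in dirty_sentries_bitmap and clears each bit once the entry has been journaled or written to a SIT block. A minimal userspace sketch of that walk, with find_next_bit() replaced by a hand-rolled scan:

#include <stdio.h>

#define NSEGS 64

static unsigned long long dirty_map;	/* one 64-bit word is enough here */

static int find_next_set_bit(unsigned long long map, int from)
{
	for (int i = from; i < NSEGS; i++)
		if (map & (1ULL << i))
			return i;
	return NSEGS;
}

int main(void)
{
	int segno = -1;

	dirty_map = (1ULL << 3) | (1ULL << 17) | (1ULL << 42);	/* dirty segments */

	while ((segno = find_next_set_bit(dirty_map, segno + 1)) < NSEGS) {
		printf("flush SIT entry for segment %d\n", segno);
		dirty_map &= ~(1ULL << segno);		/* like __clear_bit() */
	}
	return 0;
}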
1343
1344static int build_sit_info(struct f2fs_sb_info *sbi)
1345{
1346 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
1347 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
1348 struct sit_info *sit_i;
1349 unsigned int sit_segs, start;
1350 char *src_bitmap, *dst_bitmap;
1351 unsigned int bitmap_size;
1352
1353 /* allocate memory for SIT information */
1354 sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL);
1355 if (!sit_i)
1356 return -ENOMEM;
1357
1358 SM_I(sbi)->sit_info = sit_i;
1359
1360 sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry));
1361 if (!sit_i->sentries)
1362 return -ENOMEM;
1363
1364 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
1365 sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
1366 if (!sit_i->dirty_sentries_bitmap)
1367 return -ENOMEM;
1368
1369 for (start = 0; start < TOTAL_SEGS(sbi); start++) {
1370 sit_i->sentries[start].cur_valid_map
1371 = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
1372 sit_i->sentries[start].ckpt_valid_map
1373 = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
1374 if (!sit_i->sentries[start].cur_valid_map
1375 || !sit_i->sentries[start].ckpt_valid_map)
1376 return -ENOMEM;
1377 }
1378
1379 if (sbi->segs_per_sec > 1) {
1380 sit_i->sec_entries = vzalloc(sbi->total_sections *
1381 sizeof(struct sec_entry));
1382 if (!sit_i->sec_entries)
1383 return -ENOMEM;
1384 }
1385
1386 /* get information related with SIT */
1387 sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1;
1388
1389 /* setup SIT bitmap from checkpoint pack */
1390 bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
1391 src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
1392
1393 dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
1394 if (!dst_bitmap)
1395 return -ENOMEM;
1396 memcpy(dst_bitmap, src_bitmap, bitmap_size);
1397
1398 /* init SIT information */
1399 sit_i->s_ops = &default_salloc_ops;
1400
1401 sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
1402 sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
1403 sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count);
1404 sit_i->sit_bitmap = dst_bitmap;
1405 sit_i->bitmap_size = bitmap_size;
1406 sit_i->dirty_sentries = 0;
1407 sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
1408 sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
1409 sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec;
1410 mutex_init(&sit_i->sentry_lock);
1411 return 0;
1412}
1413
1414static int build_free_segmap(struct f2fs_sb_info *sbi)
1415{
1416 struct f2fs_sm_info *sm_info = SM_I(sbi);
1417 struct free_segmap_info *free_i;
1418 unsigned int bitmap_size, sec_bitmap_size;
1419
1420 /* allocate memory for free segmap information */
1421 free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL);
1422 if (!free_i)
1423 return -ENOMEM;
1424
1425 SM_I(sbi)->free_info = free_i;
1426
1427 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
1428 free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
1429 if (!free_i->free_segmap)
1430 return -ENOMEM;
1431
1432 sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections);
1433 free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
1434 if (!free_i->free_secmap)
1435 return -ENOMEM;
1436
1437 /* set all segments as dirty temporarily */
1438 memset(free_i->free_segmap, 0xff, bitmap_size);
1439 memset(free_i->free_secmap, 0xff, sec_bitmap_size);
1440
1441 /* init free segmap information */
1442 free_i->start_segno =
1443 (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
1444 free_i->free_segments = 0;
1445 free_i->free_sections = 0;
1446 rwlock_init(&free_i->segmap_lock);
1447 return 0;
1448}
1449
1450static int build_curseg(struct f2fs_sb_info *sbi)
1451{
1452 struct curseg_info *array;
1453 int i;
1454
1455 array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL);
1456 if (!array)
1457 return -ENOMEM;
1458
1459 SM_I(sbi)->curseg_array = array;
1460
1461 for (i = 0; i < NR_CURSEG_TYPE; i++) {
1462 mutex_init(&array[i].curseg_mutex);
1463 array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
1464 if (!array[i].sum_blk)
1465 return -ENOMEM;
1466 array[i].segno = NULL_SEGNO;
1467 array[i].next_blkoff = 0;
1468 }
1469 return restore_curseg_summaries(sbi);
1470}
1471
1472static void build_sit_entries(struct f2fs_sb_info *sbi)
1473{
1474 struct sit_info *sit_i = SIT_I(sbi);
1475 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1476 struct f2fs_summary_block *sum = curseg->sum_blk;
1477 unsigned int start;
1478
1479 for (start = 0; start < TOTAL_SEGS(sbi); start++) {
1480 struct seg_entry *se = &sit_i->sentries[start];
1481 struct f2fs_sit_block *sit_blk;
1482 struct f2fs_sit_entry sit;
1483 struct page *page;
1484 int i;
1485
1486 mutex_lock(&curseg->curseg_mutex);
1487 for (i = 0; i < sits_in_cursum(sum); i++) {
1488 if (le32_to_cpu(segno_in_journal(sum, i)) == start) {
1489 sit = sit_in_journal(sum, i);
1490 mutex_unlock(&curseg->curseg_mutex);
1491 goto got_it;
1492 }
1493 }
1494 mutex_unlock(&curseg->curseg_mutex);
1495 page = get_current_sit_page(sbi, start);
1496 sit_blk = (struct f2fs_sit_block *)page_address(page);
1497 sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
1498 f2fs_put_page(page, 1);
1499got_it:
1500 check_block_count(sbi, start, &sit);
1501 seg_info_from_raw_sit(se, &sit);
1502 if (sbi->segs_per_sec > 1) {
1503 struct sec_entry *e = get_sec_entry(sbi, start);
1504 e->valid_blocks += se->valid_blocks;
1505 }
1506 }
1507}
1508
1509static void init_free_segmap(struct f2fs_sb_info *sbi)
1510{
1511 unsigned int start;
1512 int type;
1513
1514 for (start = 0; start < TOTAL_SEGS(sbi); start++) {
1515 struct seg_entry *sentry = get_seg_entry(sbi, start);
1516 if (!sentry->valid_blocks)
1517 __set_free(sbi, start);
1518 }
1519
1520 /* mark the current segments as in-use */
1521 for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) {
1522 struct curseg_info *curseg_t = CURSEG_I(sbi, type);
1523 __set_test_and_inuse(sbi, curseg_t->segno);
1524 }
1525}
1526
1527static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1528{
1529 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1530 struct free_segmap_info *free_i = FREE_I(sbi);
1531 unsigned int segno = 0, offset = 0;
1532 unsigned short valid_blocks;
1533
1534 while (segno < TOTAL_SEGS(sbi)) {
1535 /* find dirty segment based on free segmap */
1536 segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset);
1537 if (segno >= TOTAL_SEGS(sbi))
1538 break;
1539 offset = segno + 1;
1540 valid_blocks = get_valid_blocks(sbi, segno, 0);
1541 if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks)
1542 continue;
1543 mutex_lock(&dirty_i->seglist_lock);
1544 __locate_dirty_segment(sbi, segno, DIRTY);
1545 mutex_unlock(&dirty_i->seglist_lock);
1546 }
1547}
1548
1549static int init_victim_segmap(struct f2fs_sb_info *sbi)
1550{
1551 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1552 unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
1553
1554 dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
1555 dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
1556 if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC])
1557 return -ENOMEM;
1558 return 0;
1559}
1560
1561static int build_dirty_segmap(struct f2fs_sb_info *sbi)
1562{
1563 struct dirty_seglist_info *dirty_i;
1564 unsigned int bitmap_size, i;
1565
1566 /* allocate memory for dirty segments list information */
1567 dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL);
1568 if (!dirty_i)
1569 return -ENOMEM;
1570
1571 SM_I(sbi)->dirty_info = dirty_i;
1572 mutex_init(&dirty_i->seglist_lock);
1573
1574 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
1575
1576 for (i = 0; i < NR_DIRTY_TYPE; i++) {
1577 dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
1578 if (!dirty_i->dirty_segmap[i])
1579 return -ENOMEM;
1580 }
1581
1582 init_dirty_segmap(sbi);
1583 return init_victim_segmap(sbi);
1584}
1585
1586/*
1587 * Update min, max modified time for cost-benefit GC algorithm
1588 */
1589static void init_min_max_mtime(struct f2fs_sb_info *sbi)
1590{
1591 struct sit_info *sit_i = SIT_I(sbi);
1592 unsigned int segno;
1593
1594 mutex_lock(&sit_i->sentry_lock);
1595
1596 sit_i->min_mtime = LLONG_MAX;
1597
1598 for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
1599 unsigned int i;
1600 unsigned long long mtime = 0;
1601
1602 for (i = 0; i < sbi->segs_per_sec; i++)
1603 mtime += get_seg_entry(sbi, segno + i)->mtime;
1604
1605 mtime = div_u64(mtime, sbi->segs_per_sec);
1606
1607 if (sit_i->min_mtime > mtime)
1608 sit_i->min_mtime = mtime;
1609 }
1610 sit_i->max_mtime = get_mtime(sbi);
1611 mutex_unlock(&sit_i->sentry_lock);
1612}
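For sections spanning several segments, the code above averages the segments' modification times to get a per-section mtime for the cost-benefit GC. A worked example with made-up values (the kernel uses div_u64() because the dividend is 64-bit):

#include <stdio.h>

int main(void)
{
	unsigned long long seg_mtime[4] = { 100, 120, 90, 110 };
	unsigned long long sum = 0;
	int segs_per_sec = 4, i;	/* assumed section geometry */

	for (i = 0; i < segs_per_sec; i++)
		sum += seg_mtime[i];

	/* section mtime = average of its segments: (100+120+90+110)/4 = 105 */
	printf("section mtime = %llu\n", sum / segs_per_sec);
	return 0;
}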
1613
1614int build_segment_manager(struct f2fs_sb_info *sbi)
1615{
1616 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
1617 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
1618 struct f2fs_sm_info *sm_info;
1619 int err;
1620
1621 sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL);
1622 if (!sm_info)
1623 return -ENOMEM;
1624
1625 /* init sm info */
1626 sbi->sm_info = sm_info;
1627 INIT_LIST_HEAD(&sm_info->wblist_head);
1628 spin_lock_init(&sm_info->wblist_lock);
1629 sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
1630 sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
1631 sm_info->segment_count = le32_to_cpu(raw_super->segment_count);
1632 sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
1633 sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
1634 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
1635 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1636
1637 err = build_sit_info(sbi);
1638 if (err)
1639 return err;
1640 err = build_free_segmap(sbi);
1641 if (err)
1642 return err;
1643 err = build_curseg(sbi);
1644 if (err)
1645 return err;
1646
1647 /* reinit free segmap based on SIT */
1648 build_sit_entries(sbi);
1649
1650 init_free_segmap(sbi);
1651 err = build_dirty_segmap(sbi);
1652 if (err)
1653 return err;
1654
1655 init_min_max_mtime(sbi);
1656 return 0;
1657}
1658
1659static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
1660 enum dirty_type dirty_type)
1661{
1662 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1663
1664 mutex_lock(&dirty_i->seglist_lock);
1665 kfree(dirty_i->dirty_segmap[dirty_type]);
1666 dirty_i->nr_dirty[dirty_type] = 0;
1667 mutex_unlock(&dirty_i->seglist_lock);
1668}
1669
1670void reset_victim_segmap(struct f2fs_sb_info *sbi)
1671{
1672 unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
1673 memset(DIRTY_I(sbi)->victim_segmap[FG_GC], 0, bitmap_size);
1674}
1675
1676static void destroy_victim_segmap(struct f2fs_sb_info *sbi)
1677{
1678 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1679
1680 kfree(dirty_i->victim_segmap[FG_GC]);
1681 kfree(dirty_i->victim_segmap[BG_GC]);
1682}
1683
1684static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
1685{
1686 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1687 int i;
1688
1689 if (!dirty_i)
1690 return;
1691
1692 /* discard pre-free/dirty segments list */
1693 for (i = 0; i < NR_DIRTY_TYPE; i++)
1694 discard_dirty_segmap(sbi, i);
1695
1696 destroy_victim_segmap(sbi);
1697 SM_I(sbi)->dirty_info = NULL;
1698 kfree(dirty_i);
1699}
1700
1701static void destroy_curseg(struct f2fs_sb_info *sbi)
1702{
1703 struct curseg_info *array = SM_I(sbi)->curseg_array;
1704 int i;
1705
1706 if (!array)
1707 return;
1708 SM_I(sbi)->curseg_array = NULL;
1709 for (i = 0; i < NR_CURSEG_TYPE; i++)
1710 kfree(array[i].sum_blk);
1711 kfree(array);
1712}
1713
1714static void destroy_free_segmap(struct f2fs_sb_info *sbi)
1715{
1716 struct free_segmap_info *free_i = SM_I(sbi)->free_info;
1717 if (!free_i)
1718 return;
1719 SM_I(sbi)->free_info = NULL;
1720 kfree(free_i->free_segmap);
1721 kfree(free_i->free_secmap);
1722 kfree(free_i);
1723}
1724
1725static void destroy_sit_info(struct f2fs_sb_info *sbi)
1726{
1727 struct sit_info *sit_i = SIT_I(sbi);
1728 unsigned int start;
1729
1730 if (!sit_i)
1731 return;
1732
1733 if (sit_i->sentries) {
1734 for (start = 0; start < TOTAL_SEGS(sbi); start++) {
1735 kfree(sit_i->sentries[start].cur_valid_map);
1736 kfree(sit_i->sentries[start].ckpt_valid_map);
1737 }
1738 }
1739 vfree(sit_i->sentries);
1740 vfree(sit_i->sec_entries);
1741 kfree(sit_i->dirty_sentries_bitmap);
1742
1743 SM_I(sbi)->sit_info = NULL;
1744 kfree(sit_i->sit_bitmap);
1745 kfree(sit_i);
1746}
1747
1748void destroy_segment_manager(struct f2fs_sb_info *sbi)
1749{
1750 struct f2fs_sm_info *sm_info = SM_I(sbi);
1751 destroy_dirty_segmap(sbi);
1752 destroy_curseg(sbi);
1753 destroy_free_segmap(sbi);
1754 destroy_sit_info(sbi);
1755 sbi->sm_info = NULL;
1756 kfree(sm_info);
1757}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
new file mode 100644
index 000000000000..66a288a52fd3
--- /dev/null
+++ b/fs/f2fs/segment.h
@@ -0,0 +1,631 @@
1/*
2 * fs/f2fs/segment.h
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11/* constant macro */
12#define NULL_SEGNO ((unsigned int)(~0))
13
14/* V: Logical segment # in volume, R: Relative segment # in main area */
15#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
16#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
17
18#define IS_DATASEG(t) \
19 ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \
20 (t == CURSEG_WARM_DATA))
21
22#define IS_NODESEG(t) \
23 ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \
24 (t == CURSEG_WARM_NODE))
25
26#define IS_CURSEG(sbi, segno) \
27 ((segno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
28 (segno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
29 (segno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
30 (segno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
31 (segno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
32 (segno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
33
34#define IS_CURSEC(sbi, secno) \
35 ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
36 sbi->segs_per_sec) || \
37 (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
38 sbi->segs_per_sec) || \
39 (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
40 sbi->segs_per_sec) || \
41 (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
42 sbi->segs_per_sec) || \
43 (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
44 sbi->segs_per_sec) || \
45 (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
46 sbi->segs_per_sec)) \
47
48#define START_BLOCK(sbi, segno) \
49 (SM_I(sbi)->seg0_blkaddr + \
50 (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
51#define NEXT_FREE_BLKADDR(sbi, curseg) \
52 (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
53
54#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr)
55
56#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \
57 ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
58#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
59 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
60#define GET_SEGNO(sbi, blk_addr) \
61 (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \
62 NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
63 GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
64#define GET_SECNO(sbi, segno) \
65 ((segno) / sbi->segs_per_sec)
66#define GET_ZONENO_FROM_SEGNO(sbi, segno) \
67 ((segno / sbi->segs_per_sec) / sbi->secs_per_zone)
68
69#define GET_SUM_BLOCK(sbi, segno) \
70 ((sbi->sm_info->ssa_blkaddr) + segno)
71
72#define GET_SUM_TYPE(footer) ((footer)->entry_type)
73#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type)
74
75#define SIT_ENTRY_OFFSET(sit_i, segno) \
76 (segno % sit_i->sents_per_block)
77#define SIT_BLOCK_OFFSET(sit_i, segno) \
78 (segno / SIT_ENTRY_PER_BLOCK)
79#define START_SEGNO(sit_i, segno) \
80 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
81#define f2fs_bitmap_size(nr) \
82 (BITS_TO_LONGS(nr) * sizeof(unsigned long))
83#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
84
85#define SECTOR_FROM_BLOCK(sbi, blk_addr) \
86 (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
87
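SECTOR_FROM_BLOCK above converts a block address to a 512-byte-sector address with a shift. A worked example assuming 4KB blocks:

#include <stdio.h>

int main(void)
{
	int log_blocksize = 12;		/* 4096-byte block */
	int log_sector_size = 9;	/* 512-byte sector */
	unsigned long blk_addr = 100;

	/* shift by 3: one 4KB block covers 8 sectors */
	printf("block %lu -> sector %lu\n", blk_addr,
	       blk_addr << (log_blocksize - log_sector_size));	/* 800 */
	return 0;
}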
88/* during checkpoint, bio_private is used to synchronize the last bio */
89struct bio_private {
90 struct f2fs_sb_info *sbi;
91 bool is_sync;
92 void *wait;
93};
94
95/*
96 * indicate a block allocation direction: RIGHT and LEFT.
97 * RIGHT means allocating new sections towards the end of the volume.
98 * LEFT means the opposite direction.
99 */
100enum {
101 ALLOC_RIGHT = 0,
102 ALLOC_LEFT
103};
104
105/*
106 * In the victim_sel_policy->alloc_mode, there are two block allocation modes.
107 * LFS writes data sequentially with cleaning operations.
108 * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
109 */
110enum {
111 LFS = 0,
112 SSR
113};
114
115/*
116 * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes.
117 * GC_CB is based on cost-benefit algorithm.
118 * GC_GREEDY is based on greedy algorithm.
119 */
120enum {
121 GC_CB = 0,
122 GC_GREEDY
123};
124
125/*
126 * BG_GC means the background cleaning job.
127 * FG_GC means the on-demand cleaning job.
128 */
129enum {
130 BG_GC = 0,
131 FG_GC
132};
133
134/* for a function parameter to select a victim segment */
135struct victim_sel_policy {
136 int alloc_mode; /* LFS or SSR */
137 int gc_mode; /* GC_CB or GC_GREEDY */
138 unsigned long *dirty_segmap; /* dirty segment bitmap */
139 unsigned int offset; /* last scanned bitmap offset */
140 unsigned int ofs_unit; /* bitmap search unit */
141 unsigned int min_cost; /* minimum cost */
142 unsigned int min_segno; /* segment # having min. cost */
143};
144
145struct seg_entry {
146 unsigned short valid_blocks; /* # of valid blocks */
147 unsigned char *cur_valid_map; /* validity bitmap of blocks */
148 /*
149 * # of valid blocks and the validity bitmap stored in the last
150 * checkpoint pack. This information is used by the SSR mode.
151 */
152 unsigned short ckpt_valid_blocks;
153 unsigned char *ckpt_valid_map;
154 unsigned char type; /* segment type like CURSEG_XXX_TYPE */
155 unsigned long long mtime; /* modification time of the segment */
156};
157
158struct sec_entry {
159 unsigned int valid_blocks; /* # of valid blocks in a section */
160};
161
162struct segment_allocation {
163 void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
164};
165
166struct sit_info {
167 const struct segment_allocation *s_ops;
168
169 block_t sit_base_addr; /* start block address of SIT area */
170 block_t sit_blocks; /* # of blocks used by SIT area */
171 block_t written_valid_blocks; /* # of valid blocks in main area */
172 char *sit_bitmap; /* SIT bitmap pointer */
173 unsigned int bitmap_size; /* SIT bitmap size */
174
175 unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */
176 unsigned int dirty_sentries; /* # of dirty sentries */
177 unsigned int sents_per_block; /* # of SIT entries per block */
178 struct mutex sentry_lock; /* to protect SIT cache */
179 struct seg_entry *sentries; /* SIT segment-level cache */
180 struct sec_entry *sec_entries; /* SIT section-level cache */
181
182 /* for cost-benefit algorithm in cleaning procedure */
183 unsigned long long elapsed_time; /* elapsed time after mount */
184 unsigned long long mounted_time; /* mount time */
185 unsigned long long min_mtime; /* min. modification time */
186 unsigned long long max_mtime; /* max. modification time */
187};
188
189struct free_segmap_info {
190 unsigned int start_segno; /* start segment number logically */
191 unsigned int free_segments; /* # of free segments */
192 unsigned int free_sections; /* # of free sections */
193 rwlock_t segmap_lock; /* free segmap lock */
194 unsigned long *free_segmap; /* free segment bitmap */
195 unsigned long *free_secmap; /* free section bitmap */
196};
197
198/* Notice: The order of dirty types is the same as CURSEG_XXX in f2fs.h */
199enum dirty_type {
200 DIRTY_HOT_DATA, /* dirty segments assigned as hot data logs */
201 DIRTY_WARM_DATA, /* dirty segments assigned as warm data logs */
202 DIRTY_COLD_DATA, /* dirty segments assigned as cold data logs */
203 DIRTY_HOT_NODE, /* dirty segments assigned as hot node logs */
204 DIRTY_WARM_NODE, /* dirty segments assigned as warm node logs */
205 DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
206 DIRTY, /* to count # of dirty segments */
207 PRE, /* to count # of entirely obsolete segments */
208 NR_DIRTY_TYPE
209};
210
211struct dirty_seglist_info {
212	const struct victim_selection *v_ops;	/* victim selection operation */
213 unsigned long *dirty_segmap[NR_DIRTY_TYPE];
214 struct mutex seglist_lock; /* lock for segment bitmaps */
215 int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */
216 unsigned long *victim_segmap[2]; /* BG_GC, FG_GC */
217};
218
219/* victim selection function for cleaning and SSR */
220struct victim_selection {
221 int (*get_victim)(struct f2fs_sb_info *, unsigned int *,
222 int, int, char);
223};
224
225/* for active log information */
226struct curseg_info {
227 struct mutex curseg_mutex; /* lock for consistency */
228 struct f2fs_summary_block *sum_blk; /* cached summary block */
229 unsigned char alloc_type; /* current allocation type */
230 unsigned int segno; /* current segment number */
231 unsigned short next_blkoff; /* next block offset to write */
232 unsigned int zone; /* current zone number */
233 unsigned int next_segno; /* preallocated segment */
234};
235
236/*
237 * inline functions
238 */
239static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
240{
241 return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
242}
243
244static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi,
245 unsigned int segno)
246{
247 struct sit_info *sit_i = SIT_I(sbi);
248 return &sit_i->sentries[segno];
249}
250
251static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi,
252 unsigned int segno)
253{
254 struct sit_info *sit_i = SIT_I(sbi);
255 return &sit_i->sec_entries[GET_SECNO(sbi, segno)];
256}
257
258static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
259 unsigned int segno, int section)
260{
261 /*
262 * In order to get # of valid blocks in a section instantly from many
263 * segments, f2fs manages two counting structures separately.
264 */
265 if (section > 1)
266 return get_sec_entry(sbi, segno)->valid_blocks;
267 else
268 return get_seg_entry(sbi, segno)->valid_blocks;
269}
270
271static inline void seg_info_from_raw_sit(struct seg_entry *se,
272 struct f2fs_sit_entry *rs)
273{
274 se->valid_blocks = GET_SIT_VBLOCKS(rs);
275 se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs);
276 memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
277 memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
278 se->type = GET_SIT_TYPE(rs);
279 se->mtime = le64_to_cpu(rs->mtime);
280}
281
282static inline void seg_info_to_raw_sit(struct seg_entry *se,
283 struct f2fs_sit_entry *rs)
284{
285 unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) |
286 se->valid_blocks;
287 rs->vblocks = cpu_to_le16(raw_vblocks);
288 memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE);
289 memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
290 se->ckpt_valid_blocks = se->valid_blocks;
291 rs->mtime = cpu_to_le64(se->mtime);
292}
293
294static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
295 unsigned int max, unsigned int segno)
296{
297 unsigned int ret;
298 read_lock(&free_i->segmap_lock);
299 ret = find_next_bit(free_i->free_segmap, max, segno);
300 read_unlock(&free_i->segmap_lock);
301 return ret;
302}
303
304static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
305{
306 struct free_segmap_info *free_i = FREE_I(sbi);
307 unsigned int secno = segno / sbi->segs_per_sec;
308 unsigned int start_segno = secno * sbi->segs_per_sec;
309 unsigned int next;
310
311 write_lock(&free_i->segmap_lock);
312 clear_bit(segno, free_i->free_segmap);
313 free_i->free_segments++;
314
315 next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno);
316 if (next >= start_segno + sbi->segs_per_sec) {
317 clear_bit(secno, free_i->free_secmap);
318 free_i->free_sections++;
319 }
320 write_unlock(&free_i->segmap_lock);
321}
322
323static inline void __set_inuse(struct f2fs_sb_info *sbi,
324 unsigned int segno)
325{
326 struct free_segmap_info *free_i = FREE_I(sbi);
327 unsigned int secno = segno / sbi->segs_per_sec;
328 set_bit(segno, free_i->free_segmap);
329 free_i->free_segments--;
330 if (!test_and_set_bit(secno, free_i->free_secmap))
331 free_i->free_sections--;
332}
333
334static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
335 unsigned int segno)
336{
337 struct free_segmap_info *free_i = FREE_I(sbi);
338 unsigned int secno = segno / sbi->segs_per_sec;
339 unsigned int start_segno = secno * sbi->segs_per_sec;
340 unsigned int next;
341
342 write_lock(&free_i->segmap_lock);
343 if (test_and_clear_bit(segno, free_i->free_segmap)) {
344 free_i->free_segments++;
345
346 next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi),
347 start_segno);
348 if (next >= start_segno + sbi->segs_per_sec) {
349 if (test_and_clear_bit(secno, free_i->free_secmap))
350 free_i->free_sections++;
351 }
352 }
353 write_unlock(&free_i->segmap_lock);
354}
355
356static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
357 unsigned int segno)
358{
359 struct free_segmap_info *free_i = FREE_I(sbi);
360 unsigned int secno = segno / sbi->segs_per_sec;
361 write_lock(&free_i->segmap_lock);
362 if (!test_and_set_bit(segno, free_i->free_segmap)) {
363 free_i->free_segments--;
364 if (!test_and_set_bit(secno, free_i->free_secmap))
365 free_i->free_sections--;
366 }
367 write_unlock(&free_i->segmap_lock);
368}
369
370static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
371 void *dst_addr)
372{
373 struct sit_info *sit_i = SIT_I(sbi);
374 memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size);
375}
376
377static inline block_t written_block_count(struct f2fs_sb_info *sbi)
378{
379 struct sit_info *sit_i = SIT_I(sbi);
380 block_t vblocks;
381
382 mutex_lock(&sit_i->sentry_lock);
383 vblocks = sit_i->written_valid_blocks;
384 mutex_unlock(&sit_i->sentry_lock);
385
386 return vblocks;
387}
388
389static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
390{
391 struct free_segmap_info *free_i = FREE_I(sbi);
392 unsigned int free_segs;
393
394 read_lock(&free_i->segmap_lock);
395 free_segs = free_i->free_segments;
396 read_unlock(&free_i->segmap_lock);
397
398 return free_segs;
399}
400
401static inline int reserved_segments(struct f2fs_sb_info *sbi)
402{
403 return SM_I(sbi)->reserved_segments;
404}
405
406static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
407{
408 struct free_segmap_info *free_i = FREE_I(sbi);
409 unsigned int free_secs;
410
411 read_lock(&free_i->segmap_lock);
412 free_secs = free_i->free_sections;
413 read_unlock(&free_i->segmap_lock);
414
415 return free_secs;
416}
417
418static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
419{
420 return DIRTY_I(sbi)->nr_dirty[PRE];
421}
422
423static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
424{
425 return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
426 DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_DATA] +
427 DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_DATA] +
428 DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_NODE] +
429 DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_NODE] +
430 DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_NODE];
431}
432
433static inline int overprovision_segments(struct f2fs_sb_info *sbi)
434{
435 return SM_I(sbi)->ovp_segments;
436}
437
438static inline int overprovision_sections(struct f2fs_sb_info *sbi)
439{
440 return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec;
441}
442
443static inline int reserved_sections(struct f2fs_sb_info *sbi)
444{
445 return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec;
446}
447
448static inline bool need_SSR(struct f2fs_sb_info *sbi)
449{
450 return (free_sections(sbi) < overprovision_sections(sbi));
451}
452
453static inline int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
454{
455 struct curseg_info *curseg = CURSEG_I(sbi, type);
456 return DIRTY_I(sbi)->v_ops->get_victim(sbi,
457 &(curseg)->next_segno, BG_GC, type, SSR);
458}
459
460static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi)
461{
462 unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) *
463 sbi->segs_per_sec;
464 int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
465 >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
466 int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
467 >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
468
469 if (sbi->por_doing)
470 return false;
471
472 if (free_sections(sbi) <= (node_secs + 2 * dent_secs +
473 reserved_sections(sbi)))
474 return true;
475 return false;
476}
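The check above estimates how many sections the dirty node and dentry pages would consume if flushed (dentry sections are double-weighted) and reports pressure when the free sections cannot cover them plus the reserve. A worked userspace example with assumed counts:

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	int log_blocks_per_seg = 9, segs_per_sec = 1;	/* assumed geometry */
	int pages_per_sec = (1 << log_blocks_per_seg) * segs_per_sec;	/* 512 */
	int dirty_nodes = 1000, dirty_dents = 300;	/* assumed page counts */
	int reserved_secs = 2, free_secs = 5;

	int node_secs = ((dirty_nodes + pages_per_sec - 1)
				>> log_blocks_per_seg) / segs_per_sec;	/* 2 */
	int dent_secs = ((dirty_dents + pages_per_sec - 1)
				>> log_blocks_per_seg) / segs_per_sec;	/* 1 */

	/* 5 <= 2 + 2*1 + 2, so more free sections are needed first */
	bool not_enough = free_secs <= node_secs + 2 * dent_secs + reserved_secs;

	printf("need checkpoint/GC first: %s\n", not_enough ? "yes" : "no");
	return 0;
}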
477
478static inline int utilization(struct f2fs_sb_info *sbi)
479{
480 return (long int)valid_user_blocks(sbi) * 100 /
481 (long int)sbi->user_block_count;
482}
483
484/*
485 * Sometimes it is better for f2fs to drop its out-of-place update policy.
486 * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write
487 * data in place, like other traditional file systems.
488 * But MIN_IPU_UTIL is currently set to 100 percent, which means in-place
489 * updates are disabled. See need_inplace_update() below.
490 */
491#define MIN_IPU_UTIL 100
492static inline bool need_inplace_update(struct inode *inode)
493{
494 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
495 if (S_ISDIR(inode->i_mode))
496 return false;
497 if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL)
498 return true;
499 return false;
500}
501
502static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
503 int type)
504{
505 struct curseg_info *curseg = CURSEG_I(sbi, type);
506 return curseg->segno;
507}
508
509static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi,
510 int type)
511{
512 struct curseg_info *curseg = CURSEG_I(sbi, type);
513 return curseg->alloc_type;
514}
515
516static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
517{
518 struct curseg_info *curseg = CURSEG_I(sbi, type);
519 return curseg->next_blkoff;
520}
521
522static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
523{
524 unsigned int end_segno = SM_I(sbi)->segment_count - 1;
525 BUG_ON(segno > end_segno);
526}
527
528/*
529 * This function is used only for debugging.
530 * NOTE: In the future, we should remove this function.
531 */
532static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
533{
534 struct f2fs_sm_info *sm_info = SM_I(sbi);
535 block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
536 block_t start_addr = sm_info->seg0_blkaddr;
537 block_t end_addr = start_addr + total_blks - 1;
538 BUG_ON(blk_addr < start_addr);
539 BUG_ON(blk_addr > end_addr);
540}
541
542/*
543 * A summary block is always treated as an invalid block
544 */
545static inline void check_block_count(struct f2fs_sb_info *sbi,
546 int segno, struct f2fs_sit_entry *raw_sit)
547{
548 struct f2fs_sm_info *sm_info = SM_I(sbi);
549 unsigned int end_segno = sm_info->segment_count - 1;
550 int valid_blocks = 0;
551 int i;
552
553 /* check segment usage */
554 BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
555
556 /* check boundary of a given segment number */
557 BUG_ON(segno > end_segno);
558
559 /* check bitmap with valid block count */
560 for (i = 0; i < sbi->blocks_per_seg; i++)
561 if (f2fs_test_bit(i, raw_sit->valid_map))
562 valid_blocks++;
563 BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
564}
565
566static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
567 unsigned int start)
568{
569 struct sit_info *sit_i = SIT_I(sbi);
570 unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start);
571 block_t blk_addr = sit_i->sit_base_addr + offset;
572
573 check_seg_range(sbi, start);
574
575 /* calculate sit block address */
576 if (f2fs_test_bit(offset, sit_i->sit_bitmap))
577 blk_addr += sit_i->sit_blocks;
578
579 return blk_addr;
580}
581
582static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
583 pgoff_t block_addr)
584{
585 struct sit_info *sit_i = SIT_I(sbi);
586 block_addr -= sit_i->sit_base_addr;
587 if (block_addr < sit_i->sit_blocks)
588 block_addr += sit_i->sit_blocks;
589 else
590 block_addr -= sit_i->sit_blocks;
591
592 return block_addr + sit_i->sit_base_addr;
593}
594
595static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
596{
597 unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start);
598
599 if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
600 f2fs_clear_bit(block_off, sit_i->sit_bitmap);
601 else
602 f2fs_set_bit(block_off, sit_i->sit_bitmap);
603}
604
605static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
606{
607 struct sit_info *sit_i = SIT_I(sbi);
608 return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec -
609 sit_i->mounted_time;
610}
611
612static inline void set_summary(struct f2fs_summary *sum, nid_t nid,
613 unsigned int ofs_in_node, unsigned char version)
614{
615 sum->nid = cpu_to_le32(nid);
616 sum->ofs_in_node = cpu_to_le16(ofs_in_node);
617 sum->version = version;
618}
619
620static inline block_t start_sum_block(struct f2fs_sb_info *sbi)
621{
622 return __start_cp_addr(sbi) +
623 le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
624}
625
626static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
627{
628 return __start_cp_addr(sbi) +
629 le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count)
630 - (base + 1) + type;
631}
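sum_blk_addr() counts backwards from the end of the checkpoint pack, where the summary blocks of the active logs are stored. A worked example with assumed pack geometry:

#include <stdio.h>

int main(void)
{
	unsigned long cp_start = 2000;		/* assumed __start_cp_addr() */
	unsigned long cp_total_blocks = 10;	/* assumed pack size */
	int nr_curseg_type = 6;

	/* with base = nr_curseg_type, log "type" lands at blocks 2003..2008 */
	for (int type = 0; type < nr_curseg_type; type++)
		printf("log %d summary at block %lu\n", type,
		       cp_start + cp_total_blocks - (nr_curseg_type + 1) + type);
	return 0;
}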
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
new file mode 100644
index 000000000000..37fad04c8669
--- /dev/null
+++ b/fs/f2fs/super.c
@@ -0,0 +1,701 @@
1/*
2 * fs/f2fs/super.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/module.h>
12#include <linux/init.h>
13#include <linux/fs.h>
14#include <linux/statfs.h>
15#include <linux/proc_fs.h>
16#include <linux/buffer_head.h>
17#include <linux/backing-dev.h>
18#include <linux/kthread.h>
19#include <linux/parser.h>
20#include <linux/mount.h>
21#include <linux/seq_file.h>
22#include <linux/random.h>
23#include <linux/exportfs.h>
24#include <linux/f2fs_fs.h>
25
26#include "f2fs.h"
27#include "node.h"
28#include "xattr.h"
29
30static struct kmem_cache *f2fs_inode_cachep;
31
32enum {
33 Opt_gc_background_off,
34 Opt_disable_roll_forward,
35 Opt_discard,
36 Opt_noheap,
37 Opt_nouser_xattr,
38 Opt_noacl,
39 Opt_active_logs,
40 Opt_disable_ext_identify,
41 Opt_err,
42};
43
44static match_table_t f2fs_tokens = {
45 {Opt_gc_background_off, "background_gc_off"},
46 {Opt_disable_roll_forward, "disable_roll_forward"},
47 {Opt_discard, "discard"},
48 {Opt_noheap, "no_heap"},
49 {Opt_nouser_xattr, "nouser_xattr"},
50 {Opt_noacl, "noacl"},
51 {Opt_active_logs, "active_logs=%u"},
52 {Opt_disable_ext_identify, "disable_ext_identify"},
53 {Opt_err, NULL},
54};
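/*
 * A sketch of how this table is consumed by parse_options() below
 * (hypothetical mount string, untested):
 *
 *	data = "background_gc_off,active_logs=6"
 *	strsep -> "background_gc_off" -> Opt_gc_background_off
 *	                              -> clear_opt(sbi, BG_GC)
 *	strsep -> "active_logs=6"     -> Opt_active_logs; the "%u" is
 *	                                 captured into args[0], so
 *	                                 match_int(args, &arg) yields 6
 */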
55
56void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
57{
58 struct va_format vaf;
59 va_list args;
60
61 va_start(args, fmt);
62 vaf.fmt = fmt;
63 vaf.va = &args;
64 printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf);
65 va_end(args);
66}
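/*
 * Example call site (hypothetical message): the level argument is a
 * KERN_* prefix string and %pV expands the nested va_format, e.g.
 *
 *	f2fs_msg(sb, KERN_WARNING, "invalid segment count (%u)", segcnt);
 *
 * prints "F2FS-fs (sdb1): invalid segment count (...)" at warning
 * level, where "sdb1" stands in for sb->s_id.
 */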
67
68static void init_once(void *foo)
69{
70 struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
71
72 inode_init_once(&fi->vfs_inode);
73}
74
75static struct inode *f2fs_alloc_inode(struct super_block *sb)
76{
77 struct f2fs_inode_info *fi;
78
79 fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO);
80 if (!fi)
81 return NULL;
82
83 init_once((void *) fi);
84
85 /* Initialize f2fs-specific inode info */
86 fi->vfs_inode.i_version = 1;
87 atomic_set(&fi->dirty_dents, 0);
88 fi->i_current_depth = 1;
89 fi->i_advise = 0;
90 rwlock_init(&fi->ext.ext_lock);
91
92 set_inode_flag(fi, FI_NEW_INODE);
93
94 return &fi->vfs_inode;
95}
96
97static void f2fs_i_callback(struct rcu_head *head)
98{
99 struct inode *inode = container_of(head, struct inode, i_rcu);
100 kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode));
101}
102
103static void f2fs_destroy_inode(struct inode *inode)
104{
105 call_rcu(&inode->i_rcu, f2fs_i_callback);
106}
107
108static void f2fs_put_super(struct super_block *sb)
109{
110 struct f2fs_sb_info *sbi = F2FS_SB(sb);
111
112 f2fs_destroy_stats(sbi);
113 stop_gc_thread(sbi);
114
115 write_checkpoint(sbi, false, true);
116
117 iput(sbi->node_inode);
118 iput(sbi->meta_inode);
119
120 /* destroy f2fs internal modules */
121 destroy_node_manager(sbi);
122 destroy_segment_manager(sbi);
123
124 kfree(sbi->ckpt);
125
126 sb->s_fs_info = NULL;
127 brelse(sbi->raw_super_buf);
128 kfree(sbi);
129}
130
131int f2fs_sync_fs(struct super_block *sb, int sync)
132{
133 struct f2fs_sb_info *sbi = F2FS_SB(sb);
134
135 if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
136 return 0;
137
138 if (sync)
139 write_checkpoint(sbi, false, false);
140 else
141 f2fs_balance_fs(sbi);
142
143 return 0;
144}
145
146static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
147{
148 struct super_block *sb = dentry->d_sb;
149 struct f2fs_sb_info *sbi = F2FS_SB(sb);
150 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
151 block_t total_count, user_block_count, start_count, ovp_count;
152
153 total_count = le64_to_cpu(sbi->raw_super->block_count);
154 user_block_count = sbi->user_block_count;
155 start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr);
156 ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
157 buf->f_type = F2FS_SUPER_MAGIC;
158 buf->f_bsize = sbi->blocksize;
159
160 buf->f_blocks = total_count - start_count;
161 buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
162 buf->f_bavail = user_block_count - valid_user_blocks(sbi);
163
164 buf->f_files = sbi->total_node_count;
165 buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi);
166
167 buf->f_namelen = F2FS_MAX_NAME_LEN;
168 buf->f_fsid.val[0] = (u32)id;
169 buf->f_fsid.val[1] = (u32)(id >> 32);
170
171 return 0;
172}
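/*
 * Worked example of the free-space accounting above (illustrative
 * numbers, not from a real image): with log_blocks_per_seg = 9 and
 * 20 overprovision segments, ovp_count = 20 << 9 = 10240 blocks, which
 * is subtracted from f_bfree so that over-provisioned space is never
 * reported as free to userspace.
 */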
173
174static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
175{
176 struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
177
178 if (test_opt(sbi, BG_GC))
179 seq_puts(seq, ",background_gc_on");
180 else
181 seq_puts(seq, ",background_gc_off");
182 if (test_opt(sbi, DISABLE_ROLL_FORWARD))
183 seq_puts(seq, ",disable_roll_forward");
184 if (test_opt(sbi, DISCARD))
185 seq_puts(seq, ",discard");
186 if (test_opt(sbi, NOHEAP))
187 seq_puts(seq, ",no_heap_alloc");
188#ifdef CONFIG_F2FS_FS_XATTR
189 if (test_opt(sbi, XATTR_USER))
190 seq_puts(seq, ",user_xattr");
191 else
192 seq_puts(seq, ",nouser_xattr");
193#endif
194#ifdef CONFIG_F2FS_FS_POSIX_ACL
195 if (test_opt(sbi, POSIX_ACL))
196 seq_puts(seq, ",acl");
197 else
198 seq_puts(seq, ",noacl");
199#endif
200 if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
201 seq_puts(seq, ",disable_ext_identify");
202
203 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
204
205 return 0;
206}
207
208static struct super_operations f2fs_sops = {
209 .alloc_inode = f2fs_alloc_inode,
210 .destroy_inode = f2fs_destroy_inode,
211 .write_inode = f2fs_write_inode,
212 .show_options = f2fs_show_options,
213 .evict_inode = f2fs_evict_inode,
214 .put_super = f2fs_put_super,
215 .sync_fs = f2fs_sync_fs,
216 .statfs = f2fs_statfs,
217};
218
219static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
220 u64 ino, u32 generation)
221{
222 struct f2fs_sb_info *sbi = F2FS_SB(sb);
223 struct inode *inode;
224
225 if (ino < F2FS_ROOT_INO(sbi))
226 return ERR_PTR(-ESTALE);
227
228 /*
229 * f2fs_iget isn't quite right if the inode is currently unallocated!
230 * However f2fs_iget currently does appropriate checks to handle stale
231 * inodes so everything is OK.
232 */
233 inode = f2fs_iget(sb, ino);
234 if (IS_ERR(inode))
235 return ERR_CAST(inode);
236 if (generation && inode->i_generation != generation) {
237 /* we didn't find the right inode.. */
238 iput(inode);
239 return ERR_PTR(-ESTALE);
240 }
241 return inode;
242}
243
244static struct dentry *f2fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
245 int fh_len, int fh_type)
246{
247 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
248 f2fs_nfs_get_inode);
249}
250
251static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid,
252 int fh_len, int fh_type)
253{
254 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
255 f2fs_nfs_get_inode);
256}
257
258static const struct export_operations f2fs_export_ops = {
259 .fh_to_dentry = f2fs_fh_to_dentry,
260 .fh_to_parent = f2fs_fh_to_parent,
261 .get_parent = f2fs_get_parent,
262};
263
264static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi,
265 char *options)
266{
267 substring_t args[MAX_OPT_ARGS];
268 char *p;
269 int arg = 0;
270
271 if (!options)
272 return 0;
273
274 while ((p = strsep(&options, ",")) != NULL) {
275 int token;
276 if (!*p)
277 continue;
278 /*
279 * Initialize args struct so we know whether arg was
280 * found; some options take optional arguments.
281 */
282 args[0].to = args[0].from = NULL;
283 token = match_token(p, f2fs_tokens, args);
284
285 switch (token) {
286 case Opt_gc_background_off:
287 clear_opt(sbi, BG_GC);
288 break;
289 case Opt_disable_roll_forward:
290 set_opt(sbi, DISABLE_ROLL_FORWARD);
291 break;
292 case Opt_discard:
293 set_opt(sbi, DISCARD);
294 break;
295 case Opt_noheap:
296 set_opt(sbi, NOHEAP);
297 break;
298#ifdef CONFIG_F2FS_FS_XATTR
299 case Opt_nouser_xattr:
300 clear_opt(sbi, XATTR_USER);
301 break;
302#else
303 case Opt_nouser_xattr:
304 f2fs_msg(sb, KERN_INFO,
305 "nouser_xattr options not supported");
306 break;
307#endif
308#ifdef CONFIG_F2FS_FS_POSIX_ACL
309 case Opt_noacl:
310 clear_opt(sbi, POSIX_ACL);
311 break;
312#else
313 case Opt_noacl:
314 f2fs_msg(sb, KERN_INFO, "noacl option not supported");
315 break;
316#endif
317 case Opt_active_logs:
318 if (args->from && match_int(args, &arg))
319 return -EINVAL;
320 if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE)
321 return -EINVAL;
322 sbi->active_logs = arg;
323 break;
324 case Opt_disable_ext_identify:
325 set_opt(sbi, DISABLE_EXT_IDENTIFY);
326 break;
327 default:
328 f2fs_msg(sb, KERN_ERR,
329 "Unrecognized mount option \"%s\" or missing value",
330 p);
331 return -EINVAL;
332 }
333 }
334 return 0;
335}
336
337static loff_t max_file_size(unsigned bits)
338{
339 loff_t result = ADDRS_PER_INODE;
340 loff_t leaf_count = ADDRS_PER_BLOCK;
341
342 /* two direct node blocks */
343 result += (leaf_count * 2);
344
345 /* two indirect node blocks */
346 leaf_count *= NIDS_PER_BLOCK;
347 result += (leaf_count * 2);
348
349 /* one double indirect node block */
350 leaf_count *= NIDS_PER_BLOCK;
351 result += leaf_count;
352
353 result <<= bits;
354 return result;
355}
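/*
 * Worked example, assuming the layout constants of this era (the
 * values ADDRS_PER_INODE = 923 and ADDRS_PER_BLOCK = NIDS_PER_BLOCK =
 * 1018 are assumptions here) and 4KB blocks (bits = 12):
 *
 *	result = 923			direct pointers in the inode
 *	       + 2 * 1018		two direct node blocks
 *	       + 2 * 1018 * 1018	two indirect node blocks
 *	       + 1018 * 1018 * 1018	one double-indirect node block
 *	       = 1,057,053,439 blocks	-> << 12 ~= 3.94 TiB max file size
 */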
356
357static int sanity_check_raw_super(struct super_block *sb,
358 struct f2fs_super_block *raw_super)
359{
360 unsigned int blocksize;
361
362 if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
363 f2fs_msg(sb, KERN_INFO,
364 "Magic Mismatch, valid(0x%x) - read(0x%x)",
365 F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic));
366 return 1;
367 }
368
369 /* Currently, support only 4KB block size */
370 blocksize = 1 << le32_to_cpu(raw_super->log_blocksize);
371 if (blocksize != PAGE_CACHE_SIZE) {
372 f2fs_msg(sb, KERN_INFO,
373 "Invalid blocksize (%u), supports only 4KB\n",
374 blocksize);
375 return 1;
376 }
377 if (le32_to_cpu(raw_super->log_sectorsize) !=
378 F2FS_LOG_SECTOR_SIZE) {
379 f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize");
380 return 1;
381 }
382 if (le32_to_cpu(raw_super->log_sectors_per_block) !=
383 F2FS_LOG_SECTORS_PER_BLOCK) {
384 f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block");
385 return 1;
386 }
387 return 0;
388}
389
390static int sanity_check_ckpt(struct f2fs_super_block *raw_super,
391 struct f2fs_checkpoint *ckpt)
392{
393 unsigned int total, fsmeta;
394
395 total = le32_to_cpu(raw_super->segment_count);
396 fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
397 fsmeta += le32_to_cpu(raw_super->segment_count_sit);
398 fsmeta += le32_to_cpu(raw_super->segment_count_nat);
399 fsmeta += le32_to_cpu(ckpt->rsvd_segment_count);
400 fsmeta += le32_to_cpu(raw_super->segment_count_ssa);
401
402 if (fsmeta >= total)
403 return 1;
404 return 0;
405}
406
407static void init_sb_info(struct f2fs_sb_info *sbi)
408{
409 struct f2fs_super_block *raw_super = sbi->raw_super;
410 int i;
411
412 sbi->log_sectors_per_block =
413 le32_to_cpu(raw_super->log_sectors_per_block);
414 sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize);
415 sbi->blocksize = 1 << sbi->log_blocksize;
416 sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
417 sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg;
418 sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
419 sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
420 sbi->total_sections = le32_to_cpu(raw_super->section_count);
421 sbi->total_node_count =
422 (le32_to_cpu(raw_super->segment_count_nat) / 2)
423 * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK;
424 sbi->root_ino_num = le32_to_cpu(raw_super->root_ino);
425 sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
426 sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
427
428 for (i = 0; i < NR_COUNT_TYPE; i++)
429 atomic_set(&sbi->nr_pages[i], 0);
430}
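/*
 * The division by two in total_node_count reflects the two on-disk
 * copies kept for every NAT block. With illustrative numbers (all
 * assumed: NAT_ENTRY_PER_BLOCK = 455, 8 NAT segments, 512 blocks per
 * segment): total_node_count = (8 / 2) * 512 * 455 = 931,840 nodes.
 */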
431
432static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
433{
434 struct f2fs_sb_info *sbi;
435 struct f2fs_super_block *raw_super;
436 struct buffer_head *raw_super_buf;
437 struct inode *root;
438 long err = -EINVAL;
439 int i;
440
441 /* allocate memory for f2fs-specific super block info */
442 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
443 if (!sbi)
444 return -ENOMEM;
445
446 /* set a block size */
447 if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) {
448 f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
449 goto free_sbi;
450 }
451
452 /* read f2fs raw super block */
453 raw_super_buf = sb_bread(sb, 0);
454 if (!raw_super_buf) {
455 err = -EIO;
456 f2fs_msg(sb, KERN_ERR, "unable to read superblock");
457 goto free_sbi;
458 }
459 raw_super = (struct f2fs_super_block *)
460 ((char *)raw_super_buf->b_data + F2FS_SUPER_OFFSET);
461
462 /* init some FS parameters */
463 sbi->active_logs = NR_CURSEG_TYPE;
464
465 set_opt(sbi, BG_GC);
466
467#ifdef CONFIG_F2FS_FS_XATTR
468 set_opt(sbi, XATTR_USER);
469#endif
470#ifdef CONFIG_F2FS_FS_POSIX_ACL
471 set_opt(sbi, POSIX_ACL);
472#endif
473 /* parse mount options */
474 if (parse_options(sb, sbi, (char *)data))
475 goto free_sb_buf;
476
477 /* sanity checking of raw super */
478 if (sanity_check_raw_super(sb, raw_super)) {
479 f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem");
480 goto free_sb_buf;
481 }
482
483 sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
484 sb->s_max_links = F2FS_LINK_MAX;
485 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
486
487 sb->s_op = &f2fs_sops;
488 sb->s_xattr = f2fs_xattr_handlers;
489 sb->s_export_op = &f2fs_export_ops;
490 sb->s_magic = F2FS_SUPER_MAGIC;
491 sb->s_fs_info = sbi;
492 sb->s_time_gran = 1;
493 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
494 (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
495 memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
496
497 /* init f2fs-specific super block info */
498 sbi->sb = sb;
499 sbi->raw_super = raw_super;
500 sbi->raw_super_buf = raw_super_buf;
501 mutex_init(&sbi->gc_mutex);
502 mutex_init(&sbi->write_inode);
503 mutex_init(&sbi->writepages);
504 mutex_init(&sbi->cp_mutex);
505 for (i = 0; i < NR_LOCK_TYPE; i++)
506 mutex_init(&sbi->fs_lock[i]);
507 sbi->por_doing = 0;
508 spin_lock_init(&sbi->stat_lock);
509 init_rwsem(&sbi->bio_sem);
510 init_sb_info(sbi);
511
512 /* get an inode for meta space */
513 sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
514 if (IS_ERR(sbi->meta_inode)) {
515 f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
516 err = PTR_ERR(sbi->meta_inode);
517 goto free_sb_buf;
518 }
519
520 err = get_valid_checkpoint(sbi);
521 if (err) {
522 f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint");
523 goto free_meta_inode;
524 }
525
526 /* sanity checking of checkpoint */
527 err = -EINVAL;
528 if (sanity_check_ckpt(raw_super, sbi->ckpt)) {
529 f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint");
530 goto free_cp;
531 }
532
533 sbi->total_valid_node_count =
534 le32_to_cpu(sbi->ckpt->valid_node_count);
535 sbi->total_valid_inode_count =
536 le32_to_cpu(sbi->ckpt->valid_inode_count);
537 sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count);
538 sbi->total_valid_block_count =
539 le64_to_cpu(sbi->ckpt->valid_block_count);
540 sbi->last_valid_block_count = sbi->total_valid_block_count;
541 sbi->alloc_valid_block_count = 0;
542 INIT_LIST_HEAD(&sbi->dir_inode_list);
543 spin_lock_init(&sbi->dir_inode_lock);
544
545 init_orphan_info(sbi);
546
547 /* setup f2fs internal modules */
548 err = build_segment_manager(sbi);
549 if (err) {
550 f2fs_msg(sb, KERN_ERR,
551 "Failed to initialize F2FS segment manager");
552 goto free_sm;
553 }
554 err = build_node_manager(sbi);
555 if (err) {
556 f2fs_msg(sb, KERN_ERR,
557 "Failed to initialize F2FS node manager");
558 goto free_nm;
559 }
560
561 build_gc_manager(sbi);
562
563 /* get an inode for node space */
564 sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
565 if (IS_ERR(sbi->node_inode)) {
566 f2fs_msg(sb, KERN_ERR, "Failed to read node inode");
567 err = PTR_ERR(sbi->node_inode);
568 goto free_nm;
569 }
570
571 /* if there are any orphan nodes, free them */
572 err = -EINVAL;
573 if (recover_orphan_inodes(sbi))
574 goto free_node_inode;
575
576 /* read root inode and dentry */
577 root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
578 if (IS_ERR(root)) {
579 f2fs_msg(sb, KERN_ERR, "Failed to read root inode");
580 err = PTR_ERR(root);
581 goto free_node_inode;
582 }
583 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size)
584 goto free_root_inode;
585
586 sb->s_root = d_make_root(root); /* allocate root dentry */
587 if (!sb->s_root) {
588 err = -ENOMEM;
589 goto free_root_inode;
590 }
591
592 /* recover fsynced data */
593 if (!test_opt(sbi, DISABLE_ROLL_FORWARD))
594 recover_fsync_data(sbi);
595
596 /* After POR, we can run background GC thread */
597 err = start_gc_thread(sbi);
598 if (err)
599 goto fail;
600
601 err = f2fs_build_stats(sbi);
602 if (err)
603 goto fail;
604
605 return 0;
606fail:
607 stop_gc_thread(sbi);
608free_root_inode:
609 dput(sb->s_root);
610 sb->s_root = NULL;
611free_node_inode:
612 iput(sbi->node_inode);
613free_nm:
614 destroy_node_manager(sbi);
615free_sm:
616 destroy_segment_manager(sbi);
617free_cp:
618 kfree(sbi->ckpt);
619free_meta_inode:
620 make_bad_inode(sbi->meta_inode);
621 iput(sbi->meta_inode);
622free_sb_buf:
623 brelse(raw_super_buf);
624free_sbi:
625 kfree(sbi);
626 return err;
627}
628
629static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
630 const char *dev_name, void *data)
631{
632 return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
633}
634
635static struct file_system_type f2fs_fs_type = {
636 .owner = THIS_MODULE,
637 .name = "f2fs",
638 .mount = f2fs_mount,
639 .kill_sb = kill_block_super,
640 .fs_flags = FS_REQUIRES_DEV,
641};
642
643static int __init init_inodecache(void)
644{
645 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
646 sizeof(struct f2fs_inode_info), NULL);
647 if (f2fs_inode_cachep == NULL)
648 return -ENOMEM;
649 return 0;
650}
651
652static void destroy_inodecache(void)
653{
654 /*
655 * Make sure all delayed rcu free inodes are flushed before we
656 * destroy cache.
657 */
658 rcu_barrier();
659 kmem_cache_destroy(f2fs_inode_cachep);
660}
661
662static int __init init_f2fs_fs(void)
663{
664 int err;
665
666 err = init_inodecache();
667 if (err)
668 goto fail;
669 err = create_node_manager_caches();
670 if (err)
671 goto fail;
672 err = create_gc_caches();
673 if (err)
674 goto fail;
675 err = create_checkpoint_caches();
676 if (err)
677 goto fail;
678 err = register_filesystem(&f2fs_fs_type);
679 if (err)
680 goto fail;
681 f2fs_create_root_stats();
682fail:
683 return err;
684}
685
686static void __exit exit_f2fs_fs(void)
687{
688 f2fs_destroy_root_stats();
689 unregister_filesystem(&f2fs_fs_type);
690 destroy_checkpoint_caches();
691 destroy_gc_caches();
692 destroy_node_manager_caches();
693 destroy_inodecache();
694}
695
696module_init(init_f2fs_fs)
697module_exit(exit_f2fs_fs)
698
699MODULE_AUTHOR("Samsung Electronics's Praesto Team");
700MODULE_DESCRIPTION("Flash Friendly File System");
701MODULE_LICENSE("GPL");
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
new file mode 100644
index 000000000000..8038c0496504
--- /dev/null
+++ b/fs/f2fs/xattr.c
@@ -0,0 +1,443 @@
1/*
2 * fs/f2fs/xattr.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * Portions of this code from linux/fs/ext2/xattr.c
8 *
9 * Copyright (C) 2001-2003 Andreas Gruenbacher <agruen@suse.de>
10 *
11 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
12 * Extended attributes for symlinks and special files added per
13 * suggestion of Luka Renko <luka.renko@hermes.si>.
14 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
15 * Red Hat Inc.
16 *
17 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License version 2 as
19 * published by the Free Software Foundation.
20 */
21#include <linux/rwsem.h>
22#include <linux/f2fs_fs.h>
23#include "f2fs.h"
24#include "xattr.h"
25
26static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
27 size_t list_size, const char *name, size_t name_len, int type)
28{
29 struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
30 int total_len, prefix_len = 0;
31 const char *prefix = NULL;
32
33 switch (type) {
34 case F2FS_XATTR_INDEX_USER:
35 if (!test_opt(sbi, XATTR_USER))
36 return -EOPNOTSUPP;
37 prefix = XATTR_USER_PREFIX;
38 prefix_len = XATTR_USER_PREFIX_LEN;
39 break;
40 case F2FS_XATTR_INDEX_TRUSTED:
41 if (!capable(CAP_SYS_ADMIN))
42 return -EPERM;
43 prefix = XATTR_TRUSTED_PREFIX;
44 prefix_len = XATTR_TRUSTED_PREFIX_LEN;
45 break;
46 default:
47 return -EINVAL;
48 }
49
50 total_len = prefix_len + name_len + 1;
51 if (list && total_len <= list_size) {
52 memcpy(list, prefix, prefix_len);
53 memcpy(list+prefix_len, name, name_len);
54 list[prefix_len + name_len] = '\0';
55 }
56 return total_len;
57}
58
59static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
60 void *buffer, size_t size, int type)
61{
62 struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
63
64 switch (type) {
65 case F2FS_XATTR_INDEX_USER:
66 if (!test_opt(sbi, XATTR_USER))
67 return -EOPNOTSUPP;
68 break;
69 case F2FS_XATTR_INDEX_TRUSTED:
70 if (!capable(CAP_SYS_ADMIN))
71 return -EPERM;
72 break;
73 default:
74 return -EINVAL;
75 }
76 if (strcmp(name, "") == 0)
77 return -EINVAL;
78 return f2fs_getxattr(dentry->d_inode, type, name,
79 buffer, size);
80}
81
82static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
83 const void *value, size_t size, int flags, int type)
84{
85 struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
86
87 switch (type) {
88 case F2FS_XATTR_INDEX_USER:
89 if (!test_opt(sbi, XATTR_USER))
90 return -EOPNOTSUPP;
91 break;
92 case F2FS_XATTR_INDEX_TRUSTED:
93 if (!capable(CAP_SYS_ADMIN))
94 return -EPERM;
95 break;
96 default:
97 return -EINVAL;
98 }
99 if (strcmp(name, "") == 0)
100 return -EINVAL;
101
102 return f2fs_setxattr(dentry->d_inode, type, name, value, size);
103}
104
105static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
106 size_t list_size, const char *name, size_t name_len, int type)
107{
108 const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
109 size_t size;
110
111 if (type != F2FS_XATTR_INDEX_ADVISE)
112 return 0;
113
114 size = strlen(xname) + 1;
115 if (list && size <= list_size)
116 memcpy(list, xname, size);
117 return size;
118}
119
120static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
121 void *buffer, size_t size, int type)
122{
123 struct inode *inode = dentry->d_inode;
124
125 if (strcmp(name, "") != 0)
126 return -EINVAL;
127
128 *((char *)buffer) = F2FS_I(inode)->i_advise;
129 return sizeof(char);
130}
131
132static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
133 const void *value, size_t size, int flags, int type)
134{
135 struct inode *inode = dentry->d_inode;
136
137 if (strcmp(name, "") != 0)
138 return -EINVAL;
139 if (!inode_owner_or_capable(inode))
140 return -EPERM;
141 if (value == NULL)
142 return -EINVAL;
143
144 F2FS_I(inode)->i_advise |= *(char *)value;
145 return 0;
146}
147
148const struct xattr_handler f2fs_xattr_user_handler = {
149 .prefix = XATTR_USER_PREFIX,
150 .flags = F2FS_XATTR_INDEX_USER,
151 .list = f2fs_xattr_generic_list,
152 .get = f2fs_xattr_generic_get,
153 .set = f2fs_xattr_generic_set,
154};
155
156const struct xattr_handler f2fs_xattr_trusted_handler = {
157 .prefix = XATTR_TRUSTED_PREFIX,
158 .flags = F2FS_XATTR_INDEX_TRUSTED,
159 .list = f2fs_xattr_generic_list,
160 .get = f2fs_xattr_generic_get,
161 .set = f2fs_xattr_generic_set,
162};
163
164const struct xattr_handler f2fs_xattr_advise_handler = {
165 .prefix = F2FS_SYSTEM_ADVISE_PREFIX,
166 .flags = F2FS_XATTR_INDEX_ADVISE,
167 .list = f2fs_xattr_advise_list,
168 .get = f2fs_xattr_advise_get,
169 .set = f2fs_xattr_advise_set,
170};
171
172static const struct xattr_handler *f2fs_xattr_handler_map[] = {
173 [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
174#ifdef CONFIG_F2FS_FS_POSIX_ACL
175 [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler,
176 [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
177#endif
178 [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
179 [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
180};
181
182const struct xattr_handler *f2fs_xattr_handlers[] = {
183 &f2fs_xattr_user_handler,
184#ifdef CONFIG_F2FS_FS_POSIX_ACL
185 &f2fs_xattr_acl_access_handler,
186 &f2fs_xattr_acl_default_handler,
187#endif
188 &f2fs_xattr_trusted_handler,
189 &f2fs_xattr_advise_handler,
190 NULL,
191};
192
193static inline const struct xattr_handler *f2fs_xattr_handler(int name_index)
194{
195 const struct xattr_handler *handler = NULL;
196
197 if (name_index > 0 && name_index < ARRAY_SIZE(f2fs_xattr_handler_map))
198 handler = f2fs_xattr_handler_map[name_index];
199 return handler;
200}
201
202int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
203 void *buffer, size_t buffer_size)
204{
205 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
206 struct f2fs_inode_info *fi = F2FS_I(inode);
207 struct f2fs_xattr_entry *entry;
208 struct page *page;
209 void *base_addr;
210 int error = 0, found = 0;
211 size_t value_len, name_len;
212
213 if (name == NULL)
214 return -EINVAL;
215 name_len = strlen(name);
216
217 if (!fi->i_xattr_nid)
218 return -ENODATA;
219
220 page = get_node_page(sbi, fi->i_xattr_nid);
221 base_addr = page_address(page);
222
223 list_for_each_xattr(entry, base_addr) {
224 if (entry->e_name_index != name_index)
225 continue;
226 if (entry->e_name_len != name_len)
227 continue;
228 if (!memcmp(entry->e_name, name, name_len)) {
229 found = 1;
230 break;
231 }
232 }
233 if (!found) {
234 error = -ENODATA;
235 goto cleanup;
236 }
237
238 value_len = le16_to_cpu(entry->e_value_size);
239
240 if (buffer && value_len > buffer_size) {
241 error = -ERANGE;
242 goto cleanup;
243 }
244
245 if (buffer) {
246 char *pval = entry->e_name + entry->e_name_len;
247 memcpy(buffer, pval, value_len);
248 }
249 error = value_len;
250
251cleanup:
252 f2fs_put_page(page, 1);
253 return error;
254}
255
256ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
257{
258 struct inode *inode = dentry->d_inode;
259 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
260 struct f2fs_inode_info *fi = F2FS_I(inode);
261 struct f2fs_xattr_entry *entry;
262 struct page *page;
263 void *base_addr;
264 int error = 0;
265 size_t rest = buffer_size;
266
267 if (!fi->i_xattr_nid)
268 return 0;
269
270 page = get_node_page(sbi, fi->i_xattr_nid);
271 base_addr = page_address(page);
272
273 list_for_each_xattr(entry, base_addr) {
274 const struct xattr_handler *handler =
275 f2fs_xattr_handler(entry->e_name_index);
276 size_t size;
277
278 if (!handler)
279 continue;
280
281 size = handler->list(dentry, buffer, rest, entry->e_name,
282 entry->e_name_len, handler->flags);
283 if (buffer && size > rest) {
284 error = -ERANGE;
285 goto cleanup;
286 }
287
288 if (buffer)
289 buffer += size;
290 rest -= size;
291 }
292 error = buffer_size - rest;
293cleanup:
294 f2fs_put_page(page, 1);
295 return error;
296}
297
298int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
299 const void *value, size_t value_len)
300{
301 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
302 struct f2fs_inode_info *fi = F2FS_I(inode);
303 struct f2fs_xattr_header *header = NULL;
304 struct f2fs_xattr_entry *here, *last;
305 struct page *page;
306 void *base_addr;
307 int error, found, free, newsize;
308 size_t name_len;
309 char *pval;
310
311 if (name == NULL)
312 return -EINVAL;
313 name_len = strlen(name);
314
315 if (value == NULL)
316 value_len = 0;
317
318 if (name_len > 255 || value_len > MAX_VALUE_LEN)
319 return -ERANGE;
320
321 f2fs_balance_fs(sbi);
322
323 mutex_lock_op(sbi, NODE_NEW);
324 if (!fi->i_xattr_nid) {
325 /* Allocate new attribute block */
326 struct dnode_of_data dn;
327
328 if (!alloc_nid(sbi, &fi->i_xattr_nid)) {
329 mutex_unlock_op(sbi, NODE_NEW);
330 return -ENOSPC;
331 }
332 set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
333 mark_inode_dirty(inode);
334
335 page = new_node_page(&dn, XATTR_NODE_OFFSET);
336 if (IS_ERR(page)) {
337 alloc_nid_failed(sbi, fi->i_xattr_nid);
338 fi->i_xattr_nid = 0;
339 mutex_unlock_op(sbi, NODE_NEW);
340 return PTR_ERR(page);
341 }
342
343 alloc_nid_done(sbi, fi->i_xattr_nid);
344 base_addr = page_address(page);
345 header = XATTR_HDR(base_addr);
346 header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
347 header->h_refcount = cpu_to_le32(1);
348 } else {
349 /* The inode already has an extended attribute block. */
350 page = get_node_page(sbi, fi->i_xattr_nid);
351 if (IS_ERR(page)) {
352 mutex_unlock_op(sbi, NODE_NEW);
353 return PTR_ERR(page);
354 }
355
356 base_addr = page_address(page);
357 header = XATTR_HDR(base_addr);
358 }
359
360 if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) {
361 error = -EIO;
362 goto cleanup;
363 }
364
365 /* find entry with wanted name. */
366 found = 0;
367 list_for_each_xattr(here, base_addr) {
368 if (here->e_name_index != name_index)
369 continue;
370 if (here->e_name_len != name_len)
371 continue;
372 if (!memcmp(here->e_name, name, name_len)) {
373 found = 1;
374 break;
375 }
376 }
377
378 last = here;
379
380 while (!IS_XATTR_LAST_ENTRY(last))
381 last = XATTR_NEXT_ENTRY(last);
382
383 newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) +
384 name_len + value_len);
385
386 /* 1. Check space */
387 if (value) {
388 /* If value is NULL, it is a remove operation.
389 * In case of an update operation, we calculate the free space.
390 */
391 free = MIN_OFFSET - ((char *)last - (char *)header);
392 if (found)
393 free = free - ENTRY_SIZE(here);
394
395 if (free < newsize) {
396 error = -ENOSPC;
397 goto cleanup;
398 }
399 }
400
401 /* 2. Remove old entry */
402 if (found) {
403 /* If the entry is found, remove the old entry.
404 * If not found, no removal is needed.
405 */
406 struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here);
407 int oldsize = ENTRY_SIZE(here);
408
409 memmove(here, next, (char *)last - (char *)next);
410 last = (struct f2fs_xattr_entry *)((char *)last - oldsize);
411 memset(last, 0, oldsize);
412 }
413
414 /* 3. Write new entry */
415 if (value) {
416 /* The old entry was removed above;
417 * just write the new entry. */
418 memset(last, 0, newsize);
419 last->e_name_index = name_index;
420 last->e_name_len = name_len;
421 memcpy(last->e_name, name, name_len);
422 pval = last->e_name + name_len;
423 memcpy(pval, value, value_len);
424 last->e_value_size = cpu_to_le16(value_len);
425 }
426
427 set_page_dirty(page);
428 f2fs_put_page(page, 1);
429
430 if (is_inode_flag_set(fi, FI_ACL_MODE)) {
431 inode->i_mode = fi->i_acl_mode;
432 inode->i_ctime = CURRENT_TIME;
433 clear_inode_flag(fi, FI_ACL_MODE);
434 }
435 f2fs_write_inode(inode, NULL);
436 mutex_unlock_op(sbi, NODE_NEW);
437
438 return 0;
439cleanup:
440 f2fs_put_page(page, 1);
441 mutex_unlock_op(sbi, NODE_NEW);
442 return error;
443}
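/*
 * How this path is reached from userspace (a plausible, untested
 * mapping): the VFS strips the handler prefix and the handler's .flags
 * value is passed in as the name index, e.g.
 *
 *	setfattr -n user.project -v alpha file
 *	  -> f2fs_xattr_generic_set(dentry, "project", "alpha", 5, 0,
 *	                            F2FS_XATTR_INDEX_USER)
 *	  -> f2fs_setxattr(inode, F2FS_XATTR_INDEX_USER, "project",
 *	                   "alpha", 5)
 */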
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
new file mode 100644
index 000000000000..49c9558305e3
--- /dev/null
+++ b/fs/f2fs/xattr.h
@@ -0,0 +1,145 @@
1/*
2 * fs/f2fs/xattr.h
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * Portions of this code from linux/fs/ext2/xattr.h
8 *
9 * On-disk format of extended attributes for the ext2 filesystem.
10 *
11 * (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License version 2 as
15 * published by the Free Software Foundation.
16 */
17#ifndef __F2FS_XATTR_H__
18#define __F2FS_XATTR_H__
19
20#include <linux/init.h>
21#include <linux/xattr.h>
22
23/* Magic value in attribute blocks */
24#define F2FS_XATTR_MAGIC 0xF2F52011
25
26/* Maximum number of references to one attribute block */
27#define F2FS_XATTR_REFCOUNT_MAX 1024
28
29/* Name indexes */
30#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise"
31#define F2FS_XATTR_INDEX_USER 1
32#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2
33#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
34#define F2FS_XATTR_INDEX_TRUSTED 4
35#define F2FS_XATTR_INDEX_LUSTRE 5
36#define F2FS_XATTR_INDEX_SECURITY 6
37#define F2FS_XATTR_INDEX_ADVISE 7
38
39struct f2fs_xattr_header {
40 __le32 h_magic; /* magic number for identification */
41 __le32 h_refcount; /* reference count */
42 __u32 h_reserved[4]; /* zero right now */
43};
44
45struct f2fs_xattr_entry {
46 __u8 e_name_index;
47 __u8 e_name_len;
48 __le16 e_value_size; /* size of attribute value */
49 char e_name[0]; /* attribute name */
50};
51
52#define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr))
53#define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr))
54#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr)+1))
55#define XATTR_ROUND (3)
56
57#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND)
58
59#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \
60 entry->e_name_len + le16_to_cpu(entry->e_value_size)))
61
62#define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\
63 ENTRY_SIZE(entry)))
64
65#define IS_XATTR_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
66
67#define list_for_each_xattr(entry, addr) \
68 for (entry = XATTR_FIRST_ENTRY(addr);\
69 !IS_XATTR_LAST_ENTRY(entry);\
70 entry = XATTR_NEXT_ENTRY(entry))
71
72
73#define MIN_OFFSET XATTR_ALIGN(PAGE_SIZE - \
74 sizeof(struct node_footer) - \
75 sizeof(__u32))
76
77#define MAX_VALUE_LEN (MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \
78 sizeof(struct f2fs_xattr_entry))
79
80/*
81 * On-disk structure of f2fs_xattr
82 * We use only 1 block for xattr.
83 *
84 * +--------------------+
85 * | f2fs_xattr_header |
86 * | |
87 * +--------------------+
88 * | f2fs_xattr_entry |
89 * | .e_name_index = 1 |
90 * | .e_name_len = 3 |
91 * | .e_value_size = 14 |
92 * | .e_name = "foo" |
93 * | "value_of_xattr" |<- value_offs = e_name + e_name_len
94 * +--------------------+
95 * | f2fs_xattr_entry |
96 * | .e_name_index = 4 |
97 * | .e_name = "bar" |
98 * +--------------------+
99 * | |
100 * | Free |
101 * | |
102 * +--------------------+<- MIN_OFFSET
103 * | node_footer |
104 * | (nid, ino, offset) |
105 * +--------------------+
106 *
107 */
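/*
 * Worked example for the "foo" entry in the diagram above:
 * ENTRY_SIZE = XATTR_ALIGN(4 + 3 + 14) = XATTR_ALIGN(21) = 24 bytes,
 * since the fixed part of an entry is 4 bytes and XATTR_ALIGN rounds
 * up to a 4-byte boundary. With 4KB pages and a 24-byte node_footer
 * (size assumed), MIN_OFFSET = XATTR_ALIGN(4096 - 24 - 4) = 4068 and
 * MAX_VALUE_LEN = 4068 - 24 - 4 = 4040 bytes for a single value.
 */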
108
109#ifdef CONFIG_F2FS_FS_XATTR
110extern const struct xattr_handler f2fs_xattr_user_handler;
111extern const struct xattr_handler f2fs_xattr_trusted_handler;
112extern const struct xattr_handler f2fs_xattr_acl_access_handler;
113extern const struct xattr_handler f2fs_xattr_acl_default_handler;
114extern const struct xattr_handler f2fs_xattr_advise_handler;
115
116extern const struct xattr_handler *f2fs_xattr_handlers[];
117
118extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
119 const void *value, size_t value_len);
120extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
121 void *buffer, size_t buffer_size);
122extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
123 size_t buffer_size);
124
125#else
126
127#define f2fs_xattr_handlers NULL
128static inline int f2fs_setxattr(struct inode *inode, int name_index,
129 const char *name, const void *value, size_t value_len)
130{
131 return -EOPNOTSUPP;
132}
133static inline int f2fs_getxattr(struct inode *inode, int name_index,
134 const char *name, void *buffer, size_t buffer_size)
135{
136 return -EOPNOTSUPP;
137}
138static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
139 size_t buffer_size)
140{
141 return -EOPNOTSUPP;
142}
143#endif
144
145#endif /* __F2FS_XATTR_H__ */
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 2a182342442e..58bf744dbf39 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -461,8 +461,7 @@ static int fat_parse_short(struct super_block *sb,
 }
 
 /*
- * Return values: negative -> error, 0 -> not found, positive -> found,
- * value is the total amount of slots, including the shortname entry.
+ * Return values: negative -> error/not found, 0 -> found.
  */
 int fat_search_long(struct inode *inode, const unsigned char *name,
 		    int name_len, struct fat_slot_info *sinfo)
@@ -1255,7 +1254,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
 
 	sinfo->nr_slots = nr_slots;
 
-	/* First stage: search free direcotry entries */
+	/* First stage: search free directory entries */
 	free_slots = nr_bhs = 0;
 	bh = prev = NULL;
 	pos = 0;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 623f36f0423b..12701a567752 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -29,6 +29,7 @@ struct fat_mount_options {
 	unsigned short fs_fmask;
 	unsigned short fs_dmask;
 	unsigned short codepage;	/* Codepage for shortname conversions */
+	int time_offset;	/* Offset of timestamps from UTC (in minutes) */
 	char *iocharset;	/* Charset used for filename input/display */
 	unsigned short shortname;	/* flags for shortname display/create rule */
 	unsigned char name_check;	/* r = relaxed, n = normal, s = strict */
@@ -45,7 +46,7 @@ struct fat_mount_options {
 		flush:1,	/* write things quickly */
 		nocase:1,	/* Does this need case conversion? 0=need case conversion*/
 		usefree:1,	/* Use free_clusters for FAT32 */
-		tz_utc:1,	/* Filesystem timestamps are in UTC */
+		tz_set:1,	/* Filesystem timestamps' offset set */
 		rodir:1,	/* allow ATTR_RO for directory */
 		discard:1,	/* Issue discard requests on deletions */
 		nfs:1;		/* Do extra work needed for NFS export */
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 5bafaad00530..f8f491677a4a 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -26,6 +26,7 @@
 #include <linux/writeback.h>
 #include <linux/log2.h>
 #include <linux/hash.h>
+#include <linux/blkdev.h>
 #include <asm/unaligned.h>
 #include "fat.h"
 
@@ -725,7 +726,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
 	if (opts->allow_utime)
 		seq_printf(m, ",allow_utime=%04o", opts->allow_utime);
 	if (sbi->nls_disk)
-		seq_printf(m, ",codepage=%s", sbi->nls_disk->charset);
+		/* strip "cp" prefix from displayed option */
+		seq_printf(m, ",codepage=%s", &sbi->nls_disk->charset[2]);
 	if (isvfat) {
 		if (sbi->nls_io)
 			seq_printf(m, ",iocharset=%s", sbi->nls_io->charset);
@@ -777,8 +779,12 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
 	}
 	if (opts->flush)
 		seq_puts(m, ",flush");
-	if (opts->tz_utc)
-		seq_puts(m, ",tz=UTC");
+	if (opts->tz_set) {
+		if (opts->time_offset)
+			seq_printf(m, ",time_offset=%d", opts->time_offset);
+		else
+			seq_puts(m, ",tz=UTC");
+	}
 	if (opts->errors == FAT_ERRORS_CONT)
 		seq_puts(m, ",errors=continue");
 	else if (opts->errors == FAT_ERRORS_PANIC)
@@ -800,7 +806,8 @@ enum {
 	Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
 	Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
 	Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
-	Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_err,
+	Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
+	Opt_err,
 };
 
 static const match_table_t fat_tokens = {
@@ -825,6 +832,7 @@ static const match_table_t fat_tokens = {
 	{Opt_immutable, "sys_immutable"},
 	{Opt_flush, "flush"},
 	{Opt_tz_utc, "tz=UTC"},
+	{Opt_time_offset, "time_offset=%d"},
 	{Opt_err_cont, "errors=continue"},
 	{Opt_err_panic, "errors=panic"},
 	{Opt_err_ro, "errors=remount-ro"},
@@ -909,7 +917,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
 	opts->utf8 = opts->unicode_xlate = 0;
 	opts->numtail = 1;
 	opts->usefree = opts->nocase = 0;
-	opts->tz_utc = 0;
+	opts->tz_set = 0;
 	opts->nfs = 0;
 	opts->errors = FAT_ERRORS_RO;
 	*debug = 0;
@@ -965,48 +973,57 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
 			break;
 		case Opt_uid:
 			if (match_int(&args[0], &option))
-				return 0;
+				return -EINVAL;
 			opts->fs_uid = make_kuid(current_user_ns(), option);
 			if (!uid_valid(opts->fs_uid))
-				return 0;
+				return -EINVAL;
 			break;
 		case Opt_gid:
 			if (match_int(&args[0], &option))
-				return 0;
+				return -EINVAL;
 			opts->fs_gid = make_kgid(current_user_ns(), option);
 			if (!gid_valid(opts->fs_gid))
-				return 0;
+				return -EINVAL;
 			break;
 		case Opt_umask:
 			if (match_octal(&args[0], &option))
-				return 0;
+				return -EINVAL;
 			opts->fs_fmask = opts->fs_dmask = option;
 			break;
 		case Opt_dmask:
 			if (match_octal(&args[0], &option))
-				return 0;
+				return -EINVAL;
 			opts->fs_dmask = option;
 			break;
 		case Opt_fmask:
 			if (match_octal(&args[0], &option))
-				return 0;
+				return -EINVAL;
 			opts->fs_fmask = option;
 			break;
 		case Opt_allow_utime:
 			if (match_octal(&args[0], &option))
-				return 0;
+				return -EINVAL;
 			opts->allow_utime = option & (S_IWGRP | S_IWOTH);
 			break;
 		case Opt_codepage:
 			if (match_int(&args[0], &option))
-				return 0;
+				return -EINVAL;
 			opts->codepage = option;
 			break;
 		case Opt_flush:
 			opts->flush = 1;
 			break;
+		case Opt_time_offset:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			if (option < -12 * 60 || option > 12 * 60)
+				return -EINVAL;
+			opts->tz_set = 1;
+			opts->time_offset = option;
+			break;
 		case Opt_tz_utc:
-			opts->tz_utc = 1;
+			opts->tz_set = 1;
+			opts->time_offset = 0;
 			break;
 		case Opt_err_cont:
 			opts->errors = FAT_ERRORS_CONT;
@@ -1327,7 +1344,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 	sbi->dir_entries = get_unaligned_le16(&b->dir_entries);
 	if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
 		if (!silent)
-			fat_msg(sb, KERN_ERR, "bogus directroy-entries per block"
+			fat_msg(sb, KERN_ERR, "bogus directory-entries per block"
 			       " (%u)", sbi->dir_entries);
 		brelse(bh);
 		goto out_invalid;
@@ -1431,6 +1448,14 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 		goto out_fail;
 	}
 
+	if (sbi->options.discard) {
+		struct request_queue *q = bdev_get_queue(sb->s_bdev);
+		if (!blk_queue_discard(q))
+			fat_msg(sb, KERN_WARNING,
+				"mounting with \"discard\" option, but "
+				"the device does not support discard");
+	}
+
 	return 0;
 
 out_invalid:
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 6d93360ca0cc..359d307b5507 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -135,6 +135,10 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
 		}
 		if (ret < 0)
 			return ret;
+		/*
+		 * FIXME:Although we can add this cache, fat_cache_add() is
+		 * assuming to be called after linear search with fat_cache_id.
+		 */
 //		fat_cache_add(inode, new_fclus, new_dclus);
 	} else {
 		MSDOS_I(inode)->i_start = new_dclus;
@@ -212,8 +216,10 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
 		    + days_in_year[month] + day
 		    + DAYS_DELTA) * SECS_PER_DAY;
 
-	if (!sbi->options.tz_utc)
+	if (!sbi->options.tz_set)
 		second += sys_tz.tz_minuteswest * SECS_PER_MIN;
+	else
+		second -= sbi->options.time_offset * SECS_PER_MIN;
 
 	if (time_cs) {
 		ts->tv_sec = second + (time_cs / 100);
@@ -229,8 +235,9 @@ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
 		   __le16 *time, __le16 *date, u8 *time_cs)
 {
 	struct tm tm;
-	time_to_tm(ts->tv_sec, sbi->options.tz_utc ? 0 :
-		   -sys_tz.tz_minuteswest * 60, &tm);
+	time_to_tm(ts->tv_sec,
+		   (sbi->options.tz_set ? sbi->options.time_offset :
+		    -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm);
 
 	/* FAT can only support year between 1980 to 2107 */
 	if (tm.tm_year < 1980 - 1900) {
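/*
 * Worked example of the new conversion (a plausible, untested
 * invocation): for a card written on a UTC+1 machine, mount with
 * -o time_offset=60. FAT stores local time, so for a 12:00 timestamp
 * fat_time_fat2unix() computes second -= 60 * SECS_PER_MIN, giving
 * 11:00 UTC regardless of the host's sys_tz setting.
 */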
diff --git a/fs/fhandle.c b/fs/fhandle.c
index f775bfdd6e4a..999ff5c3cab0 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -22,7 +22,7 @@ static long do_sys_name_to_handle(struct path *path,
 	struct file_handle *handle = NULL;
 
 	/*
-	 * We need t make sure wether the file system
+	 * We need to make sure whether the file system
 	 * support decoding of the file handle
 	 */
 	if (!path->dentry->d_sb->s_export_op ||
@@ -40,7 +40,7 @@ static long do_sys_name_to_handle(struct path *path,
 	if (!handle)
 		return -ENOMEM;
 
 	/* convert handle size to multiple of sizeof(u32) */
 	handle_dwords = f_handle.handle_bytes >> 2;
 
 	/* we ask for a non connected handle */
@@ -52,7 +52,7 @@ static long do_sys_name_to_handle(struct path *path,
 	handle_bytes = handle_dwords * sizeof(u32);
 	handle->handle_bytes = handle_bytes;
 	if ((handle->handle_bytes > f_handle.handle_bytes) ||
-	    (retval == 255) || (retval == -ENOSPC)) {
+	    (retval == FILEID_INVALID) || (retval == -ENOSPC)) {
 		/* As per old exportfs_encode_fh documentation
 		 * we could return ENOSPC to indicate overflow
 		 * But file system returned 255 always. So handle
diff --git a/fs/file.c b/fs/file.c
index eff23162485f..2b3570b7caeb 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -490,7 +490,7 @@ void exit_files(struct task_struct *tsk)
 	}
 }
 
-static void __devinit fdtable_defer_list_init(int cpu)
+static void fdtable_defer_list_init(int cpu)
 {
 	struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
 	spin_lock_init(&fddef->lock);
@@ -519,12 +519,6 @@ struct files_struct init_files = {
 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_task.file_lock),
 };
 
-void daemonize_descriptors(void)
-{
-	atomic_inc(&init_files.count);
-	reset_files_struct(&init_files);
-}
-
 /*
  * allocate a file descriptor, mark it busy.
  */
diff --git a/fs/file_table.c b/fs/file_table.c
index a72bf9ddd0d2..de9e9653d611 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -458,8 +458,8 @@ void mark_files_ro(struct super_block *sb)
 		spin_unlock(&f->f_lock);
 		if (file_check_writeable(f) != 0)
 			continue;
+		__mnt_drop_write(f->f_path.mnt);
 		file_release_write(f);
-		mnt_drop_write_file(f);
 	} while_file_list_for_each_entry;
 	lg_global_unlock(&files_lglock);
 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3e3422f7f0a4..310972b72a66 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1034,7 +1034,7 @@ int bdi_writeback_thread(void *data)
 	while (!kthread_freezable_should_stop(NULL)) {
 		/*
 		 * Remove own delayed wake-up timer, since we are already awake
-		 * and we'll take care of the preriodic write-back.
+		 * and we'll take care of the periodic write-back.
 		 */
 		del_timer(&wb->wakeup_timer);
 
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 5df4775fea03..fe6ca583bbc0 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -164,27 +164,3 @@ struct fs_struct init_fs = {
 	.seq		= SEQCNT_ZERO,
 	.umask		= 0022,
 };
-
-void daemonize_fs_struct(void)
-{
-	struct fs_struct *fs = current->fs;
-
-	if (fs) {
-		int kill;
-
-		task_lock(current);
-
-		spin_lock(&init_fs.lock);
-		init_fs.users++;
-		spin_unlock(&init_fs.lock);
-
-		spin_lock(&fs->lock);
-		current->fs = &init_fs;
-		kill = !--fs->users;
-		spin_unlock(&fs->lock);
-
-		task_unlock(current);
-		if (kill)
-			free_fs_struct(fs);
-	}
-}
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index 6a3c48abd677..b52aed1dca97 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -314,10 +314,10 @@ EXPORT_SYMBOL(fscache_add_cache);
  */
 void fscache_io_error(struct fscache_cache *cache)
 {
-	set_bit(FSCACHE_IOERROR, &cache->flags);
-
-	printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
-	       cache->ops->name);
+	if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags))
+		printk(KERN_ERR "FS-Cache:"
+		       " Cache '%s' stopped due to I/O error\n",
+		       cache->ops->name);
 }
 EXPORT_SYMBOL(fscache_io_error);
 
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 990535071a8a..8dcb114758e3 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -370,6 +370,66 @@ cant_attach_object:
 }
 
 /*
+ * Invalidate an object. Callable with spinlocks held.
+ */
+void __fscache_invalidate(struct fscache_cookie *cookie)
+{
+	struct fscache_object *object;
+
+	_enter("{%s}", cookie->def->name);
+
+	fscache_stat(&fscache_n_invalidates);
+
+	/* Only permit invalidation of data files. Invalidating an index will
+	 * require the caller to release all its attachments to the tree rooted
+	 * there, and if it's doing that, it may as well just retire the
+	 * cookie.
+	 */
+	ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
+
+	/* We will be updating the cookie too. */
+	BUG_ON(!cookie->def->get_aux);
+
+	/* If there's an object, we tell the object state machine to handle the
+	 * invalidation on our behalf, otherwise there's nothing to do.
+	 */
+	if (!hlist_empty(&cookie->backing_objects)) {
+		spin_lock(&cookie->lock);
+
+		if (!hlist_empty(&cookie->backing_objects) &&
+		    !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING,
+				      &cookie->flags)) {
+			object = hlist_entry(cookie->backing_objects.first,
+					     struct fscache_object,
+					     cookie_link);
+			if (object->state < FSCACHE_OBJECT_DYING)
+				fscache_raise_event(
+					object, FSCACHE_OBJECT_EV_INVALIDATE);
+		}
+
+		spin_unlock(&cookie->lock);
+	}
+
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_invalidate);
+
+/*
+ * Wait for object invalidation to complete.
+ */
+void __fscache_wait_on_invalidate(struct fscache_cookie *cookie)
+{
+	_enter("%p", cookie);
+
+	wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING,
+		    fscache_wait_bit_interruptible,
+		    TASK_UNINTERRUPTIBLE);
+
+	_leave("");
+}
+EXPORT_SYMBOL(__fscache_wait_on_invalidate);
+
+/*
  * update the index entries backing a cookie
  */
 void __fscache_update_cookie(struct fscache_cookie *cookie)
@@ -442,16 +502,34 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
 
 	event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
 
+try_again:
 	spin_lock(&cookie->lock);
 
 	/* break links with all the active objects */
 	while (!hlist_empty(&cookie->backing_objects)) {
+		int n_reads;
 		object = hlist_entry(cookie->backing_objects.first,
 				     struct fscache_object,
 				     cookie_link);
 
 		_debug("RELEASE OBJ%x", object->debug_id);
 
+		set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
+		n_reads = atomic_read(&object->n_reads);
+		if (n_reads) {
+			int n_ops = object->n_ops;
+			int n_in_progress = object->n_in_progress;
+			spin_unlock(&cookie->lock);
+			printk(KERN_ERR "FS-Cache:"
+			       " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
+			       cookie->def->name,
+			       n_reads, n_ops, n_in_progress);
+			wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
+				    fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+			printk("Wait finished\n");
+			goto try_again;
+		}
+
 		/* detach each cache object from the object cookie */
 		spin_lock(&object->lock);
 		hlist_del_init(&object->cookie_link);
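/*
 * Sketch of intended netfs usage of the new entry points (assuming the
 * public fscache_invalidate()/fscache_wait_on_invalidate() wrappers
 * that front the __fscache_* functions added above), e.g. when a
 * server-side truncate makes the cached pages stale:
 *
 *	fscache_invalidate(cookie);		// kick the state machine
 *	fscache_wait_on_invalidate(cookie);	// block until purged
 */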
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index f6aad48d38a8..ee38fef4be51 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -121,12 +121,19 @@ extern int fscache_submit_exclusive_op(struct fscache_object *,
121 struct fscache_operation *); 121 struct fscache_operation *);
122extern int fscache_submit_op(struct fscache_object *, 122extern int fscache_submit_op(struct fscache_object *,
123 struct fscache_operation *); 123 struct fscache_operation *);
124extern int fscache_cancel_op(struct fscache_operation *); 124extern int fscache_cancel_op(struct fscache_operation *,
125 void (*)(struct fscache_operation *));
126extern void fscache_cancel_all_ops(struct fscache_object *);
125extern void fscache_abort_object(struct fscache_object *); 127extern void fscache_abort_object(struct fscache_object *);
126extern void fscache_start_operations(struct fscache_object *); 128extern void fscache_start_operations(struct fscache_object *);
127extern void fscache_operation_gc(struct work_struct *); 129extern void fscache_operation_gc(struct work_struct *);
128 130
129/* 131/*
132 * page.c
133 */
134extern void fscache_invalidate_writes(struct fscache_cookie *);
135
136/*
130 * proc.c 137 * proc.c
131 */ 138 */
132#ifdef CONFIG_PROC_FS 139#ifdef CONFIG_PROC_FS
@@ -194,6 +201,7 @@ extern atomic_t fscache_n_store_vmscan_not_storing;
194extern atomic_t fscache_n_store_vmscan_gone; 201extern atomic_t fscache_n_store_vmscan_gone;
195extern atomic_t fscache_n_store_vmscan_busy; 202extern atomic_t fscache_n_store_vmscan_busy;
196extern atomic_t fscache_n_store_vmscan_cancelled; 203extern atomic_t fscache_n_store_vmscan_cancelled;
204extern atomic_t fscache_n_store_vmscan_wait;
197 205
198extern atomic_t fscache_n_marks; 206extern atomic_t fscache_n_marks;
199extern atomic_t fscache_n_uncaches; 207extern atomic_t fscache_n_uncaches;
@@ -205,6 +213,9 @@ extern atomic_t fscache_n_acquires_ok;
205extern atomic_t fscache_n_acquires_nobufs; 213extern atomic_t fscache_n_acquires_nobufs;
206extern atomic_t fscache_n_acquires_oom; 214extern atomic_t fscache_n_acquires_oom;
207 215
216extern atomic_t fscache_n_invalidates;
217extern atomic_t fscache_n_invalidates_run;
218
208extern atomic_t fscache_n_updates; 219extern atomic_t fscache_n_updates;
209extern atomic_t fscache_n_updates_null; 220extern atomic_t fscache_n_updates_null;
210extern atomic_t fscache_n_updates_run; 221extern atomic_t fscache_n_updates_run;
@@ -237,6 +248,7 @@ extern atomic_t fscache_n_cop_alloc_object;
237extern atomic_t fscache_n_cop_lookup_object; 248extern atomic_t fscache_n_cop_lookup_object;
238extern atomic_t fscache_n_cop_lookup_complete; 249extern atomic_t fscache_n_cop_lookup_complete;
239extern atomic_t fscache_n_cop_grab_object; 250extern atomic_t fscache_n_cop_grab_object;
251extern atomic_t fscache_n_cop_invalidate_object;
240extern atomic_t fscache_n_cop_update_object; 252extern atomic_t fscache_n_cop_update_object;
241extern atomic_t fscache_n_cop_drop_object; 253extern atomic_t fscache_n_cop_drop_object;
242extern atomic_t fscache_n_cop_put_object; 254extern atomic_t fscache_n_cop_put_object;
@@ -278,6 +290,7 @@ extern const struct file_operations fscache_stats_fops;
278static inline void fscache_raise_event(struct fscache_object *object, 290static inline void fscache_raise_event(struct fscache_object *object,
279 unsigned event) 291 unsigned event)
280{ 292{
293 BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
281 if (!test_and_set_bit(event, &object->events) && 294 if (!test_and_set_bit(event, &object->events) &&
282 test_bit(event, &object->event_mask)) 295 test_bit(event, &object->event_mask))
283 fscache_enqueue_object(object); 296 fscache_enqueue_object(object);
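The new BUG_ON() pairs with the event_mask change in object.c below: events are bit positions in object->events, and any event at or beyond NR_FSCACHE_OBJECT_EVENTS would set a bit that FSCACHE_OBJECT_EVENTS_MASK filters out, so the state machine would never consume it and the object could wedge silently. The invariant in miniature (constants invented to show the assumed shape):

#define NR_MY_EVENTS    8
#define MY_EVENTS_MASK  ((1UL << NR_MY_EVENTS) - 1)     /* assumed layout */

static void raise_event_sketch(unsigned long *events, unsigned event)
{
        BUG_ON(event >= NR_MY_EVENTS);  /* a stray higher bit would be invisible
                                         * to fls(events & MASK) forever */
        set_bit(event, events);
}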
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index ebe29c581380..f27c89d17885 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -245,7 +245,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
245 obj->n_in_progress, 245 obj->n_in_progress,
246 obj->n_exclusive, 246 obj->n_exclusive,
247 atomic_read(&obj->n_reads), 247 atomic_read(&obj->n_reads),
248 obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, 248 obj->event_mask,
249 obj->events, 249 obj->events,
250 obj->flags, 250 obj->flags,
251 work_busy(&obj->work)); 251 work_busy(&obj->work));
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index b6b897c550ac..50d41c180211 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,6 +14,7 @@
14 14
15#define FSCACHE_DEBUG_LEVEL COOKIE 15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { 20const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
@@ -22,6 +23,7 @@ const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
22 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", 23 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
23 [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE", 24 [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
24 [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE", 25 [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
26 [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING",
25 [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING", 27 [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
26 [FSCACHE_OBJECT_DYING] = "OBJECT_DYING", 28 [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
27 [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING", 29 [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
@@ -39,6 +41,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
39 [FSCACHE_OBJECT_CREATING] = "CRTN", 41 [FSCACHE_OBJECT_CREATING] = "CRTN",
40 [FSCACHE_OBJECT_AVAILABLE] = "AVBL", 42 [FSCACHE_OBJECT_AVAILABLE] = "AVBL",
41 [FSCACHE_OBJECT_ACTIVE] = "ACTV", 43 [FSCACHE_OBJECT_ACTIVE] = "ACTV",
44 [FSCACHE_OBJECT_INVALIDATING] = "INVL",
42 [FSCACHE_OBJECT_UPDATING] = "UPDT", 45 [FSCACHE_OBJECT_UPDATING] = "UPDT",
43 [FSCACHE_OBJECT_DYING] = "DYNG", 46 [FSCACHE_OBJECT_DYING] = "DYNG",
44 [FSCACHE_OBJECT_LC_DYING] = "LCDY", 47 [FSCACHE_OBJECT_LC_DYING] = "LCDY",
@@ -54,6 +57,7 @@ static void fscache_put_object(struct fscache_object *);
54static void fscache_initialise_object(struct fscache_object *); 57static void fscache_initialise_object(struct fscache_object *);
55static void fscache_lookup_object(struct fscache_object *); 58static void fscache_lookup_object(struct fscache_object *);
56static void fscache_object_available(struct fscache_object *); 59static void fscache_object_available(struct fscache_object *);
60static void fscache_invalidate_object(struct fscache_object *);
57static void fscache_release_object(struct fscache_object *); 61static void fscache_release_object(struct fscache_object *);
58static void fscache_withdraw_object(struct fscache_object *); 62static void fscache_withdraw_object(struct fscache_object *);
59static void fscache_enqueue_dependents(struct fscache_object *); 63static void fscache_enqueue_dependents(struct fscache_object *);
@@ -79,6 +83,15 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
79} 83}
80 84
81/* 85/*
86 * Notify netfs of invalidation completion.
87 */
88static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
89{
90 if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
91 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
92}
93
94/*
82 * process events that have been sent to an object's state machine 95 * process events that have been sent to an object's state machine
83 * - initiates parent lookup 96 * - initiates parent lookup
84 * - does object lookup 97 * - does object lookup
@@ -90,6 +103,7 @@ static void fscache_object_state_machine(struct fscache_object *object)
90{ 103{
91 enum fscache_object_state new_state; 104 enum fscache_object_state new_state;
92 struct fscache_cookie *cookie; 105 struct fscache_cookie *cookie;
106 int event;
93 107
94 ASSERT(object != NULL); 108 ASSERT(object != NULL);
95 109
@@ -101,7 +115,8 @@ static void fscache_object_state_machine(struct fscache_object *object)
101 /* wait for the parent object to become ready */ 115 /* wait for the parent object to become ready */
102 case FSCACHE_OBJECT_INIT: 116 case FSCACHE_OBJECT_INIT:
103 object->event_mask = 117 object->event_mask =
104 ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); 118 FSCACHE_OBJECT_EVENTS_MASK &
119 ~(1 << FSCACHE_OBJECT_EV_CLEARED);
105 fscache_initialise_object(object); 120 fscache_initialise_object(object);
106 goto done; 121 goto done;
107 122
@@ -125,6 +140,16 @@ static void fscache_object_state_machine(struct fscache_object *object)
125 case FSCACHE_OBJECT_ACTIVE: 140 case FSCACHE_OBJECT_ACTIVE:
126 goto active_transit; 141 goto active_transit;
127 142
143 /* Invalidate an object on disk */
144 case FSCACHE_OBJECT_INVALIDATING:
145 clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
146 fscache_stat(&fscache_n_invalidates_run);
147 fscache_stat(&fscache_n_cop_invalidate_object);
148 fscache_invalidate_object(object);
149 fscache_stat_d(&fscache_n_cop_invalidate_object);
150 fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
151 goto active_transit;
152
128 /* update the object metadata on disk */ 153 /* update the object metadata on disk */
129 case FSCACHE_OBJECT_UPDATING: 154 case FSCACHE_OBJECT_UPDATING:
130 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); 155 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
@@ -251,13 +276,17 @@ static void fscache_object_state_machine(struct fscache_object *object)
251 276
252 /* determine the transition from a lookup state */ 277 /* determine the transition from a lookup state */
253lookup_transit: 278lookup_transit:
254 switch (fls(object->events & object->event_mask) - 1) { 279 event = fls(object->events & object->event_mask) - 1;
280 switch (event) {
255 case FSCACHE_OBJECT_EV_WITHDRAW: 281 case FSCACHE_OBJECT_EV_WITHDRAW:
256 case FSCACHE_OBJECT_EV_RETIRE: 282 case FSCACHE_OBJECT_EV_RETIRE:
257 case FSCACHE_OBJECT_EV_RELEASE: 283 case FSCACHE_OBJECT_EV_RELEASE:
258 case FSCACHE_OBJECT_EV_ERROR: 284 case FSCACHE_OBJECT_EV_ERROR:
259 new_state = FSCACHE_OBJECT_LC_DYING; 285 new_state = FSCACHE_OBJECT_LC_DYING;
260 goto change_state; 286 goto change_state;
287 case FSCACHE_OBJECT_EV_INVALIDATE:
288 new_state = FSCACHE_OBJECT_INVALIDATING;
289 goto change_state;
261 case FSCACHE_OBJECT_EV_REQUEUE: 290 case FSCACHE_OBJECT_EV_REQUEUE:
262 goto done; 291 goto done;
263 case -1: 292 case -1:
@@ -268,13 +297,17 @@ lookup_transit:
268 297
269 /* determine the transition from an active state */ 298 /* determine the transition from an active state */
270active_transit: 299active_transit:
271 switch (fls(object->events & object->event_mask) - 1) { 300 event = fls(object->events & object->event_mask) - 1;
301 switch (event) {
272 case FSCACHE_OBJECT_EV_WITHDRAW: 302 case FSCACHE_OBJECT_EV_WITHDRAW:
273 case FSCACHE_OBJECT_EV_RETIRE: 303 case FSCACHE_OBJECT_EV_RETIRE:
274 case FSCACHE_OBJECT_EV_RELEASE: 304 case FSCACHE_OBJECT_EV_RELEASE:
275 case FSCACHE_OBJECT_EV_ERROR: 305 case FSCACHE_OBJECT_EV_ERROR:
276 new_state = FSCACHE_OBJECT_DYING; 306 new_state = FSCACHE_OBJECT_DYING;
277 goto change_state; 307 goto change_state;
308 case FSCACHE_OBJECT_EV_INVALIDATE:
309 new_state = FSCACHE_OBJECT_INVALIDATING;
310 goto change_state;
278 case FSCACHE_OBJECT_EV_UPDATE: 311 case FSCACHE_OBJECT_EV_UPDATE:
279 new_state = FSCACHE_OBJECT_UPDATING; 312 new_state = FSCACHE_OBJECT_UPDATING;
280 goto change_state; 313 goto change_state;
@@ -287,7 +320,8 @@ active_transit:
287 320
288 /* determine the transition from a terminal state */ 321 /* determine the transition from a terminal state */
289terminal_transit: 322terminal_transit:
290 switch (fls(object->events & object->event_mask) - 1) { 323 event = fls(object->events & object->event_mask) - 1;
324 switch (event) {
291 case FSCACHE_OBJECT_EV_WITHDRAW: 325 case FSCACHE_OBJECT_EV_WITHDRAW:
292 new_state = FSCACHE_OBJECT_WITHDRAWING; 326 new_state = FSCACHE_OBJECT_WITHDRAWING;
293 goto change_state; 327 goto change_state;
@@ -320,8 +354,8 @@ done:
320 354
321unsupported_event: 355unsupported_event:
322 printk(KERN_ERR "FS-Cache:" 356 printk(KERN_ERR "FS-Cache:"
323 " Unsupported event %lx [mask %lx] in state %s\n", 357 " Unsupported event %d [%lx/%lx] in state %s\n",
324 object->events, object->event_mask, 358 event, object->events, object->event_mask,
325 fscache_object_states[object->state]); 359 fscache_object_states[object->state]);
326 BUG(); 360 BUG();
327} 361}
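Hoisting the fls() result into a local is what lets the rewritten unsupported_event report name the offending event instead of only dumping raw bitmasks. The idiom: fls() returns the 1-based index of the highest set bit (0 when none are set), so the expression yields the highest-numbered pending, unmasked event, or -1 if there is nothing to do. As a sketch:

/* Illustrative: pick the next event exactly as the state machine does. */
static int next_pending_event(unsigned long events, unsigned long mask)
{
        return fls(events & mask) - 1;  /* highest bit wins; -1 == none pending */
}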
@@ -587,8 +621,6 @@ static void fscache_object_available(struct fscache_object *object)
587 if (object->n_in_progress == 0) { 621 if (object->n_in_progress == 0) {
588 if (object->n_ops > 0) { 622 if (object->n_ops > 0) {
589 ASSERTCMP(object->n_ops, >=, object->n_obj_ops); 623 ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
590 ASSERTIF(object->n_ops > object->n_obj_ops,
591 !list_empty(&object->pending_ops));
592 fscache_start_operations(object); 624 fscache_start_operations(object);
593 } else { 625 } else {
594 ASSERT(list_empty(&object->pending_ops)); 626 ASSERT(list_empty(&object->pending_ops));
@@ -681,6 +713,7 @@ static void fscache_withdraw_object(struct fscache_object *object)
681 if (object->cookie == cookie) { 713 if (object->cookie == cookie) {
682 hlist_del_init(&object->cookie_link); 714 hlist_del_init(&object->cookie_link);
683 object->cookie = NULL; 715 object->cookie = NULL;
716 fscache_invalidation_complete(cookie);
684 detached = true; 717 detached = true;
685 } 718 }
686 spin_unlock(&cookie->lock); 719 spin_unlock(&cookie->lock);
@@ -890,3 +923,55 @@ enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
890 return result; 923 return result;
891} 924}
892EXPORT_SYMBOL(fscache_check_aux); 925EXPORT_SYMBOL(fscache_check_aux);
926
927/*
928 * Asynchronously invalidate an object.
929 */
930static void fscache_invalidate_object(struct fscache_object *object)
931{
932 struct fscache_operation *op;
933 struct fscache_cookie *cookie = object->cookie;
934
935 _enter("{OBJ%x}", object->debug_id);
936
937 /* Reject any new read/write ops and abort any that are pending. */
938 fscache_invalidate_writes(cookie);
939 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
940 fscache_cancel_all_ops(object);
941
942 /* Now we have to wait for in-progress reads and writes */
943 op = kzalloc(sizeof(*op), GFP_KERNEL);
944 if (!op) {
945 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
946 _leave(" [ENOMEM]");
947 return;
948 }
949
950 fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
951 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
952
953 spin_lock(&cookie->lock);
954 if (fscache_submit_exclusive_op(object, op) < 0)
955 goto submit_op_failed;
956 spin_unlock(&cookie->lock);
957 fscache_put_operation(op);
958
959 /* Once we've completed the invalidation, we know there will be no data
960 * stored in the cache and thus we can reinstate the data-check-skip
961 * optimisation.
962 */
963 set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
964
965 /* We can allow read and write requests to come in once again. They'll
966 * queue up behind our exclusive invalidation operation.
967 */
968 fscache_invalidation_complete(cookie);
969 _leave("");
970 return;
971
972submit_op_failed:
973 spin_unlock(&cookie->lock);
974 kfree(op);
975 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
976 _leave(" [EIO]");
977}
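For orientation, the exclusive op built in fscache_invalidate_object() is processed by the backend's ->invalidate_object() method. A sketch of what such a processor must now do under the new operation accounting (mycache_* is an invented backend; only the fscache_op_complete() call is mandated by this series):

static void mycache_invalidate_object(struct fscache_operation *op)
{
        /* ... discard op->object's backing store on disk ... */

        /* Required by the new accounting: this exclusive op holds
         * n_in_progress and gates everything queued behind it, so it
         * must be explicitly completed.  false == ran to completion,
         * not cancelled. */
        fscache_op_complete(op, false);
}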
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 30afdfa7aec7..762a9ec4ffa4 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -37,6 +37,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
37 ASSERT(op->processor != NULL); 37 ASSERT(op->processor != NULL);
38 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); 38 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
39 ASSERTCMP(atomic_read(&op->usage), >, 0); 39 ASSERTCMP(atomic_read(&op->usage), >, 0);
40 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
40 41
41 fscache_stat(&fscache_n_op_enqueue); 42 fscache_stat(&fscache_n_op_enqueue);
42 switch (op->flags & FSCACHE_OP_TYPE) { 43 switch (op->flags & FSCACHE_OP_TYPE) {
@@ -64,6 +65,9 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
64static void fscache_run_op(struct fscache_object *object, 65static void fscache_run_op(struct fscache_object *object,
65 struct fscache_operation *op) 66 struct fscache_operation *op)
66{ 67{
68 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
69
70 op->state = FSCACHE_OP_ST_IN_PROGRESS;
67 object->n_in_progress++; 71 object->n_in_progress++;
68 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) 72 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
69 wake_up_bit(&op->flags, FSCACHE_OP_WAITING); 73 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -84,18 +88,21 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
84 88
85 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); 89 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
86 90
91 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
92 ASSERTCMP(atomic_read(&op->usage), >, 0);
93
87 spin_lock(&object->lock); 94 spin_lock(&object->lock);
88 ASSERTCMP(object->n_ops, >=, object->n_in_progress); 95 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
89 ASSERTCMP(object->n_ops, >=, object->n_exclusive); 96 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
90 ASSERT(list_empty(&op->pend_link)); 97 ASSERT(list_empty(&op->pend_link));
91 98
92 ret = -ENOBUFS; 99 op->state = FSCACHE_OP_ST_PENDING;
93 if (fscache_object_is_active(object)) { 100 if (fscache_object_is_active(object)) {
94 op->object = object; 101 op->object = object;
95 object->n_ops++; 102 object->n_ops++;
96 object->n_exclusive++; /* reads and writes must wait */ 103 object->n_exclusive++; /* reads and writes must wait */
97 104
98 if (object->n_ops > 1) { 105 if (object->n_in_progress > 0) {
99 atomic_inc(&op->usage); 106 atomic_inc(&op->usage);
100 list_add_tail(&op->pend_link, &object->pending_ops); 107 list_add_tail(&op->pend_link, &object->pending_ops);
101 fscache_stat(&fscache_n_op_pend); 108 fscache_stat(&fscache_n_op_pend);
@@ -121,8 +128,11 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
121 fscache_stat(&fscache_n_op_pend); 128 fscache_stat(&fscache_n_op_pend);
122 ret = 0; 129 ret = 0;
123 } else { 130 } else {
124 /* not allowed to submit ops in any other state */ 131 /* If we're in any other state, there must have been an I/O
125 BUG(); 132 * error of some nature.
133 */
134 ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags));
135 ret = -EIO;
126 } 136 }
127 137
128 spin_unlock(&object->lock); 138 spin_unlock(&object->lock);
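The gate change from n_ops > 1 to n_in_progress > 0 follows from the new accounting: n_in_progress counts ops actually executing, and fscache_op_complete() below restarts the pending queue when it reaches zero, so that is the condition an exclusive op needs to queue behind. In outline (a paraphrase of the hunk, not literal code):

if (object->n_in_progress > 0) {
        /* something is genuinely running: queue behind it; the last
         * fscache_op_complete() will call fscache_start_operations() */
        list_add_tail(&op->pend_link, &object->pending_ops);
} else {
        fscache_run_op(object, op);     /* nothing running: start at once */
}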
@@ -186,6 +196,7 @@ int fscache_submit_op(struct fscache_object *object,
186 _enter("{OBJ%x OP%x},{%u}", 196 _enter("{OBJ%x OP%x},{%u}",
187 object->debug_id, op->debug_id, atomic_read(&op->usage)); 197 object->debug_id, op->debug_id, atomic_read(&op->usage));
188 198
199 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
189 ASSERTCMP(atomic_read(&op->usage), >, 0); 200 ASSERTCMP(atomic_read(&op->usage), >, 0);
190 201
191 spin_lock(&object->lock); 202 spin_lock(&object->lock);
@@ -196,6 +207,7 @@ int fscache_submit_op(struct fscache_object *object,
196 ostate = object->state; 207 ostate = object->state;
197 smp_rmb(); 208 smp_rmb();
198 209
210 op->state = FSCACHE_OP_ST_PENDING;
199 if (fscache_object_is_active(object)) { 211 if (fscache_object_is_active(object)) {
200 op->object = object; 212 op->object = object;
201 object->n_ops++; 213 object->n_ops++;
@@ -225,12 +237,15 @@ int fscache_submit_op(struct fscache_object *object,
225 object->state == FSCACHE_OBJECT_LC_DYING || 237 object->state == FSCACHE_OBJECT_LC_DYING ||
226 object->state == FSCACHE_OBJECT_WITHDRAWING) { 238 object->state == FSCACHE_OBJECT_WITHDRAWING) {
227 fscache_stat(&fscache_n_op_rejected); 239 fscache_stat(&fscache_n_op_rejected);
240 op->state = FSCACHE_OP_ST_CANCELLED;
228 ret = -ENOBUFS; 241 ret = -ENOBUFS;
229 } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { 242 } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
230 fscache_report_unexpected_submission(object, op, ostate); 243 fscache_report_unexpected_submission(object, op, ostate);
231 ASSERT(!fscache_object_is_active(object)); 244 ASSERT(!fscache_object_is_active(object));
245 op->state = FSCACHE_OP_ST_CANCELLED;
232 ret = -ENOBUFS; 246 ret = -ENOBUFS;
233 } else { 247 } else {
248 op->state = FSCACHE_OP_ST_CANCELLED;
234 ret = -ENOBUFS; 249 ret = -ENOBUFS;
235 } 250 }
236 251
@@ -283,20 +298,28 @@ void fscache_start_operations(struct fscache_object *object)
283/* 298/*
284 * cancel an operation that's pending on an object 299 * cancel an operation that's pending on an object
285 */ 300 */
286int fscache_cancel_op(struct fscache_operation *op) 301int fscache_cancel_op(struct fscache_operation *op,
302 void (*do_cancel)(struct fscache_operation *))
287{ 303{
288 struct fscache_object *object = op->object; 304 struct fscache_object *object = op->object;
289 int ret; 305 int ret;
290 306
291 _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); 307 _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
292 308
309 ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING);
310 ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED);
311 ASSERTCMP(atomic_read(&op->usage), >, 0);
312
293 spin_lock(&object->lock); 313 spin_lock(&object->lock);
294 314
295 ret = -EBUSY; 315 ret = -EBUSY;
296 if (!list_empty(&op->pend_link)) { 316 if (op->state == FSCACHE_OP_ST_PENDING) {
317 ASSERT(!list_empty(&op->pend_link));
297 fscache_stat(&fscache_n_op_cancelled); 318 fscache_stat(&fscache_n_op_cancelled);
298 list_del_init(&op->pend_link); 319 list_del_init(&op->pend_link);
299 object->n_ops--; 320 if (do_cancel)
321 do_cancel(op);
322 op->state = FSCACHE_OP_ST_CANCELLED;
300 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) 323 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
301 object->n_exclusive--; 324 object->n_exclusive--;
302 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) 325 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
@@ -311,6 +334,70 @@ int fscache_cancel_op(struct fscache_operation *op)
311} 334}
312 335
313/* 336/*
337 * Cancel all pending operations on an object
338 */
339void fscache_cancel_all_ops(struct fscache_object *object)
340{
341 struct fscache_operation *op;
342
343 _enter("OBJ%x", object->debug_id);
344
345 spin_lock(&object->lock);
346
347 while (!list_empty(&object->pending_ops)) {
348 op = list_entry(object->pending_ops.next,
349 struct fscache_operation, pend_link);
350 fscache_stat(&fscache_n_op_cancelled);
351 list_del_init(&op->pend_link);
352
353 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
354 op->state = FSCACHE_OP_ST_CANCELLED;
355
356 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
357 object->n_exclusive--;
358 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
359 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
360 fscache_put_operation(op);
361 cond_resched_lock(&object->lock);
362 }
363
364 spin_unlock(&object->lock);
365 _leave("");
366}
367
368/*
369 * Record the completion or cancellation of an in-progress operation.
370 */
371void fscache_op_complete(struct fscache_operation *op, bool cancelled)
372{
373 struct fscache_object *object = op->object;
374
375 _enter("OBJ%x", object->debug_id);
376
377 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
378 ASSERTCMP(object->n_in_progress, >, 0);
379 ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
380 object->n_exclusive, >, 0);
381 ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
382 object->n_in_progress, ==, 1);
383
384 spin_lock(&object->lock);
385
386 op->state = cancelled ?
387 FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE;
388
389 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
390 object->n_exclusive--;
391 object->n_in_progress--;
392 if (object->n_in_progress == 0)
393 fscache_start_operations(object);
394
395 spin_unlock(&object->lock);
396 _leave("");
397}
398EXPORT_SYMBOL(fscache_op_complete);
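Taken together, the assertions added in this file pin each operation to a strict lifecycle, replacing the old FSCACHE_OP_DEAD flag. Reconstructed as an enum, with the state names taken from the hunks above; the exact ordering is inferred from the >=/!= comparisons and the zero-valued BLANK entry is an assumption:

enum fscache_operation_state {
        FSCACHE_OP_ST_BLANK,            /* assumed: freshly zeroed op */
        FSCACHE_OP_ST_INITIALISED,      /* fscache_operation_init() done */
        FSCACHE_OP_ST_PENDING,          /* queued on object->pending_ops */
        FSCACHE_OP_ST_IN_PROGRESS,      /* dispatched; counted in n_in_progress */
        FSCACHE_OP_ST_COMPLETE,         /* fscache_op_complete(op, false) */
        FSCACHE_OP_ST_CANCELLED,        /* cancelled while pending or running */
        FSCACHE_OP_ST_DEAD              /* last ref dropped; GC only */
};

Submission moves INITIALISED to PENDING, fscache_run_op() moves PENDING to IN_PROGRESS, and fscache_put_operation() insists on COMPLETE or CANCELLED before marking the op DEAD.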
399
400/*
314 * release an operation 401 * release an operation
315 * - queues pending ops if this is the last in-progress op 402 * - queues pending ops if this is the last in-progress op
316 */ 403 */
@@ -328,8 +415,9 @@ void fscache_put_operation(struct fscache_operation *op)
328 return; 415 return;
329 416
330 _debug("PUT OP"); 417 _debug("PUT OP");
331 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) 418 ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE,
332 BUG(); 419 op->state, ==, FSCACHE_OP_ST_CANCELLED);
420 op->state = FSCACHE_OP_ST_DEAD;
333 421
334 fscache_stat(&fscache_n_op_release); 422 fscache_stat(&fscache_n_op_release);
335 423
@@ -340,8 +428,14 @@ void fscache_put_operation(struct fscache_operation *op)
340 428
341 object = op->object; 429 object = op->object;
342 430
343 if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) 431 if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) {
344 atomic_dec(&object->n_reads); 432 if (atomic_dec_and_test(&object->n_reads)) {
433 clear_bit(FSCACHE_COOKIE_WAITING_ON_READS,
434 &object->cookie->flags);
435 wake_up_bit(&object->cookie->flags,
436 FSCACHE_COOKIE_WAITING_ON_READS);
437 }
438 }
345 439
346 /* now... we may get called with the object spinlock held, so we 440 /* now... we may get called with the object spinlock held, so we
347 * complete the cleanup here only if we can immediately acquire the 441 * complete the cleanup here only if we can immediately acquire the
@@ -359,16 +453,6 @@ void fscache_put_operation(struct fscache_operation *op)
359 return; 453 return;
360 } 454 }
361 455
362 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
363 ASSERTCMP(object->n_exclusive, >, 0);
364 object->n_exclusive--;
365 }
366
367 ASSERTCMP(object->n_in_progress, >, 0);
368 object->n_in_progress--;
369 if (object->n_in_progress == 0)
370 fscache_start_operations(object);
371
372 ASSERTCMP(object->n_ops, >, 0); 456 ASSERTCMP(object->n_ops, >, 0);
373 object->n_ops--; 457 object->n_ops--;
374 if (object->n_ops == 0) 458 if (object->n_ops == 0)
@@ -407,23 +491,14 @@ void fscache_operation_gc(struct work_struct *work)
407 spin_unlock(&cache->op_gc_list_lock); 491 spin_unlock(&cache->op_gc_list_lock);
408 492
409 object = op->object; 493 object = op->object;
494 spin_lock(&object->lock);
410 495
411 _debug("GC DEFERRED REL OBJ%x OP%x", 496 _debug("GC DEFERRED REL OBJ%x OP%x",
412 object->debug_id, op->debug_id); 497 object->debug_id, op->debug_id);
413 fscache_stat(&fscache_n_op_gc); 498 fscache_stat(&fscache_n_op_gc);
414 499
415 ASSERTCMP(atomic_read(&op->usage), ==, 0); 500 ASSERTCMP(atomic_read(&op->usage), ==, 0);
416 501 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_DEAD);
417 spin_lock(&object->lock);
418 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
419 ASSERTCMP(object->n_exclusive, >, 0);
420 object->n_exclusive--;
421 }
422
423 ASSERTCMP(object->n_in_progress, >, 0);
424 object->n_in_progress--;
425 if (object->n_in_progress == 0)
426 fscache_start_operations(object);
427 502
428 ASSERTCMP(object->n_ops, >, 0); 503 ASSERTCMP(object->n_ops, >, 0);
429 object->n_ops--; 504 object->n_ops--;
@@ -431,6 +506,7 @@ void fscache_operation_gc(struct work_struct *work)
431 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); 506 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
432 507
433 spin_unlock(&object->lock); 508 spin_unlock(&object->lock);
509 kfree(op);
434 510
435 } while (count++ < 20); 511 } while (count++ < 20);
436 512
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 3f7a59bfa7ad..ff000e52072d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -56,6 +56,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
56 56
57 _enter("%p,%p,%x", cookie, page, gfp); 57 _enter("%p,%p,%x", cookie, page, gfp);
58 58
59try_again:
59 rcu_read_lock(); 60 rcu_read_lock();
60 val = radix_tree_lookup(&cookie->stores, page->index); 61 val = radix_tree_lookup(&cookie->stores, page->index);
61 if (!val) { 62 if (!val) {
@@ -104,11 +105,19 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
104 return true; 105 return true;
105 106
106page_busy: 107page_busy:
107 /* we might want to wait here, but that could deadlock the allocator as 108 /* We will wait here if we're allowed to, but that could deadlock the
108 * the work threads writing to the cache may all end up sleeping 109 * allocator as the work threads writing to the cache may all end up
109 * on memory allocation */ 110 * sleeping on memory allocation, so we may need to impose a timeout
110 fscache_stat(&fscache_n_store_vmscan_busy); 111 * too. */
111 return false; 112 if (!(gfp & __GFP_WAIT)) {
113 fscache_stat(&fscache_n_store_vmscan_busy);
114 return false;
115 }
116
117 fscache_stat(&fscache_n_store_vmscan_wait);
118 __fscache_wait_on_page_write(cookie, page);
119 gfp &= ~__GFP_WAIT;
120 goto try_again;
112} 121}
113EXPORT_SYMBOL(__fscache_maybe_release_page); 122EXPORT_SYMBOL(__fscache_maybe_release_page);
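The rewritten page_busy path sleeps at most once: if the caller's gfp mask allows blocking, it waits for the pending store, strips __GFP_WAIT, and retries, so a still-busy page on the second pass falls through to the old non-blocking failure rather than sleeping again. The control flow restated (page_has_pending_store() is an invented helper; __fscache_wait_on_page_write() is the real wait used above):

static bool release_page_sketch(struct fscache_cookie *cookie,
                                struct page *page, gfp_t gfp)
{
try_again:
        if (!page_has_pending_store(cookie, page))
                return true;                    /* nothing holds the page */
        if (!(gfp & __GFP_WAIT))
                return false;                   /* vmscan said: do not block */
        __fscache_wait_on_page_write(cookie, page);     /* may sleep */
        gfp &= ~__GFP_WAIT;                     /* guarantee at most one sleep */
        goto try_again;
}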
114 123
@@ -162,6 +171,7 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
162 fscache_abort_object(object); 171 fscache_abort_object(object);
163 } 172 }
164 173
174 fscache_op_complete(op, true);
165 _leave(""); 175 _leave("");
166} 176}
167 177
@@ -223,6 +233,8 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
223 233
224 _enter("{OP%x}", op->op.debug_id); 234 _enter("{OP%x}", op->op.debug_id);
225 235
236 ASSERTCMP(op->n_pages, ==, 0);
237
226 fscache_hist(fscache_retrieval_histogram, op->start_time); 238 fscache_hist(fscache_retrieval_histogram, op->start_time);
227 if (op->context) 239 if (op->context)
228 fscache_put_context(op->op.object->cookie, op->context); 240 fscache_put_context(op->op.object->cookie, op->context);
@@ -291,6 +303,17 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
291} 303}
292 304
293/* 305/*
306 * Handle cancellation of a pending retrieval op
307 */
308static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
309{
310 struct fscache_retrieval *op =
311 container_of(_op, struct fscache_retrieval, op);
312
313 op->n_pages = 0;
314}
315
316/*
294 * wait for an object to become active (or dead) 317 * wait for an object to become active (or dead)
295 */ 318 */
296static int fscache_wait_for_retrieval_activation(struct fscache_object *object, 319static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
@@ -307,8 +330,8 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
307 fscache_stat(stat_op_waits); 330 fscache_stat(stat_op_waits);
308 if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, 331 if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
309 fscache_wait_bit_interruptible, 332 fscache_wait_bit_interruptible,
310 TASK_INTERRUPTIBLE) < 0) { 333 TASK_INTERRUPTIBLE) != 0) {
311 ret = fscache_cancel_op(&op->op); 334 ret = fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);
312 if (ret == 0) 335 if (ret == 0)
313 return -ERESTARTSYS; 336 return -ERESTARTSYS;
314 337
@@ -320,7 +343,14 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
320 _debug("<<< GO"); 343 _debug("<<< GO");
321 344
322check_if_dead: 345check_if_dead:
346 if (op->op.state == FSCACHE_OP_ST_CANCELLED) {
347 fscache_stat(stat_object_dead);
348 _leave(" = -ENOBUFS [cancelled]");
349 return -ENOBUFS;
350 }
323 if (unlikely(fscache_object_is_dead(object))) { 351 if (unlikely(fscache_object_is_dead(object))) {
352 pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->op.state);
353 fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);
324 fscache_stat(stat_object_dead); 354 fscache_stat(stat_object_dead);
325 return -ENOBUFS; 355 return -ENOBUFS;
326 } 356 }
@@ -353,6 +383,11 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
353 if (hlist_empty(&cookie->backing_objects)) 383 if (hlist_empty(&cookie->backing_objects))
354 goto nobufs; 384 goto nobufs;
355 385
386 if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
387 _leave(" = -ENOBUFS [invalidating]");
388 return -ENOBUFS;
389 }
390
356 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); 391 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
357 ASSERTCMP(page, !=, NULL); 392 ASSERTCMP(page, !=, NULL);
358 393
@@ -364,6 +399,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
364 _leave(" = -ENOMEM"); 399 _leave(" = -ENOMEM");
365 return -ENOMEM; 400 return -ENOMEM;
366 } 401 }
402 op->n_pages = 1;
367 403
368 spin_lock(&cookie->lock); 404 spin_lock(&cookie->lock);
369 405
@@ -375,10 +411,10 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
375 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); 411 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
376 412
377 atomic_inc(&object->n_reads); 413 atomic_inc(&object->n_reads);
378 set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); 414 __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
379 415
380 if (fscache_submit_op(object, &op->op) < 0) 416 if (fscache_submit_op(object, &op->op) < 0)
381 goto nobufs_unlock; 417 goto nobufs_unlock_dec;
382 spin_unlock(&cookie->lock); 418 spin_unlock(&cookie->lock);
383 419
384 fscache_stat(&fscache_n_retrieval_ops); 420 fscache_stat(&fscache_n_retrieval_ops);
@@ -425,6 +461,8 @@ error:
425 _leave(" = %d", ret); 461 _leave(" = %d", ret);
426 return ret; 462 return ret;
427 463
464nobufs_unlock_dec:
465 atomic_dec(&object->n_reads);
428nobufs_unlock: 466nobufs_unlock:
429 spin_unlock(&cookie->lock); 467 spin_unlock(&cookie->lock);
430 kfree(op); 468 kfree(op);
@@ -472,6 +510,11 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
472 if (hlist_empty(&cookie->backing_objects)) 510 if (hlist_empty(&cookie->backing_objects))
473 goto nobufs; 511 goto nobufs;
474 512
513 if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
514 _leave(" = -ENOBUFS [invalidating]");
515 return -ENOBUFS;
516 }
517
475 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); 518 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
476 ASSERTCMP(*nr_pages, >, 0); 519 ASSERTCMP(*nr_pages, >, 0);
477 ASSERT(!list_empty(pages)); 520 ASSERT(!list_empty(pages));
@@ -482,6 +525,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
482 op = fscache_alloc_retrieval(mapping, end_io_func, context); 525 op = fscache_alloc_retrieval(mapping, end_io_func, context);
483 if (!op) 526 if (!op)
484 return -ENOMEM; 527 return -ENOMEM;
528 op->n_pages = *nr_pages;
485 529
486 spin_lock(&cookie->lock); 530 spin_lock(&cookie->lock);
487 531
@@ -491,10 +535,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
491 struct fscache_object, cookie_link); 535 struct fscache_object, cookie_link);
492 536
493 atomic_inc(&object->n_reads); 537 atomic_inc(&object->n_reads);
494 set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); 538 __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
495 539
496 if (fscache_submit_op(object, &op->op) < 0) 540 if (fscache_submit_op(object, &op->op) < 0)
497 goto nobufs_unlock; 541 goto nobufs_unlock_dec;
498 spin_unlock(&cookie->lock); 542 spin_unlock(&cookie->lock);
499 543
500 fscache_stat(&fscache_n_retrieval_ops); 544 fscache_stat(&fscache_n_retrieval_ops);
@@ -541,6 +585,8 @@ error:
541 _leave(" = %d", ret); 585 _leave(" = %d", ret);
542 return ret; 586 return ret;
543 587
588nobufs_unlock_dec:
589 atomic_dec(&object->n_reads);
544nobufs_unlock: 590nobufs_unlock:
545 spin_unlock(&cookie->lock); 591 spin_unlock(&cookie->lock);
546 kfree(op); 592 kfree(op);
@@ -577,12 +623,18 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
577 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); 623 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
578 ASSERTCMP(page, !=, NULL); 624 ASSERTCMP(page, !=, NULL);
579 625
626 if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
627 _leave(" = -ENOBUFS [invalidating]");
628 return -ENOBUFS;
629 }
630
580 if (fscache_wait_for_deferred_lookup(cookie) < 0) 631 if (fscache_wait_for_deferred_lookup(cookie) < 0)
581 return -ERESTARTSYS; 632 return -ERESTARTSYS;
582 633
583 op = fscache_alloc_retrieval(page->mapping, NULL, NULL); 634 op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
584 if (!op) 635 if (!op)
585 return -ENOMEM; 636 return -ENOMEM;
637 op->n_pages = 1;
586 638
587 spin_lock(&cookie->lock); 639 spin_lock(&cookie->lock);
588 640
@@ -658,9 +710,27 @@ static void fscache_write_op(struct fscache_operation *_op)
658 spin_lock(&object->lock); 710 spin_lock(&object->lock);
659 cookie = object->cookie; 711 cookie = object->cookie;
660 712
661 if (!fscache_object_is_active(object) || !cookie) { 713 if (!fscache_object_is_active(object)) {
714 /* If we get here, then the on-disk cache object likely no longer
715 * exists, so we should just cancel this write operation.
716 */
717 spin_unlock(&object->lock);
718 fscache_op_complete(&op->op, false);
719 _leave(" [inactive]");
720 return;
721 }
722
723 if (!cookie) {
724 /* If we get here, then the cookie belonging to the object was
725 * detached, probably by the cookie being withdrawn due to
726 * memory pressure, which means that the pages from which we might
727 * write to the cache no longer exist - therefore, we can just
728 * cancel this write operation.
729 */
662 spin_unlock(&object->lock); 730 spin_unlock(&object->lock);
663 _leave(""); 731 fscache_op_complete(&op->op, false);
732 _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}",
733 _op->flags, _op->state, object->state, object->flags);
664 return; 734 return;
665 } 735 }
666 736
@@ -696,6 +766,7 @@ static void fscache_write_op(struct fscache_operation *_op)
696 fscache_end_page_write(object, page); 766 fscache_end_page_write(object, page);
697 if (ret < 0) { 767 if (ret < 0) {
698 fscache_abort_object(object); 768 fscache_abort_object(object);
769 fscache_op_complete(&op->op, true);
699 } else { 770 } else {
700 fscache_enqueue_operation(&op->op); 771 fscache_enqueue_operation(&op->op);
701 } 772 }
@@ -710,6 +781,38 @@ superseded:
710 spin_unlock(&cookie->stores_lock); 781 spin_unlock(&cookie->stores_lock);
711 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); 782 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
712 spin_unlock(&object->lock); 783 spin_unlock(&object->lock);
784 fscache_op_complete(&op->op, true);
785 _leave("");
786}
787
788/*
789 * Clear the pages pending writing for invalidation
790 */
791void fscache_invalidate_writes(struct fscache_cookie *cookie)
792{
793 struct page *page;
794 void *results[16];
795 int n, i;
796
797 _enter("");
798
799 while (spin_lock(&cookie->stores_lock),
800 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
801 ARRAY_SIZE(results),
802 FSCACHE_COOKIE_PENDING_TAG),
803 n > 0) {
804 for (i = n - 1; i >= 0; i--) {
805 page = results[i];
806 radix_tree_delete(&cookie->stores, page->index);
807 }
808
809 spin_unlock(&cookie->stores_lock);
810
811 for (i = n - 1; i >= 0; i--)
812 page_cache_release(results[i]);
813 }
814
815 spin_unlock(&cookie->stores_lock);
713 _leave(""); 816 _leave("");
714} 817}
715 818
@@ -759,7 +862,12 @@ int __fscache_write_page(struct fscache_cookie *cookie,
759 862
760 fscache_stat(&fscache_n_stores); 863 fscache_stat(&fscache_n_stores);
761 864
762 op = kzalloc(sizeof(*op), GFP_NOIO); 865 if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
866 _leave(" = -ENOBUFS [invalidating]");
867 return -ENOBUFS;
868 }
869
870 op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY);
763 if (!op) 871 if (!op)
764 goto nomem; 872 goto nomem;
765 873
@@ -915,6 +1023,40 @@ done:
915EXPORT_SYMBOL(__fscache_uncache_page); 1023EXPORT_SYMBOL(__fscache_uncache_page);
916 1024
917/** 1025/**
1026 * fscache_mark_page_cached - Mark a page as being cached
1027 * @op: The retrieval op pages are being marked for
1028 * @page: The page to be marked
1029 *
1030 * Mark a netfs page as being cached. After this is called, the netfs
1031 * must call fscache_uncache_page() to remove the mark.
1032 */
1033void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page)
1034{
1035 struct fscache_cookie *cookie = op->op.object->cookie;
1036
1037#ifdef CONFIG_FSCACHE_STATS
1038 atomic_inc(&fscache_n_marks);
1039#endif
1040
1041 _debug("- mark %p{%lx}", page, page->index);
1042 if (TestSetPageFsCache(page)) {
1043 static bool once_only;
1044 if (!once_only) {
1045 once_only = true;
1046 printk(KERN_WARNING "FS-Cache:"
1047 " Cookie type %s marked page %lx"
1048 " multiple times\n",
1049 cookie->def->name, page->index);
1050 }
1051 }
1052
1053 if (cookie->def->mark_page_cached)
1054 cookie->def->mark_page_cached(cookie->netfs_data,
1055 op->mapping, page);
1056}
1057EXPORT_SYMBOL(fscache_mark_page_cached);
1058
1059/**
918 * fscache_mark_pages_cached - Mark pages as being cached 1060 * fscache_mark_pages_cached - Mark pages as being cached
919 * @op: The retrieval op pages are being marked for 1061 * @op: The retrieval op pages are being marked for
920 * @pagevec: The pages to be marked 1062 * @pagevec: The pages to be marked
@@ -925,32 +1067,11 @@ EXPORT_SYMBOL(__fscache_uncache_page);
925void fscache_mark_pages_cached(struct fscache_retrieval *op, 1067void fscache_mark_pages_cached(struct fscache_retrieval *op,
926 struct pagevec *pagevec) 1068 struct pagevec *pagevec)
927{ 1069{
928 struct fscache_cookie *cookie = op->op.object->cookie;
929 unsigned long loop; 1070 unsigned long loop;
930 1071
931#ifdef CONFIG_FSCACHE_STATS 1072 for (loop = 0; loop < pagevec->nr; loop++)
932 atomic_add(pagevec->nr, &fscache_n_marks); 1073 fscache_mark_page_cached(op, pagevec->pages[loop]);
933#endif
934
935 for (loop = 0; loop < pagevec->nr; loop++) {
936 struct page *page = pagevec->pages[loop];
937
938 _debug("- mark %p{%lx}", page, page->index);
939 if (TestSetPageFsCache(page)) {
940 static bool once_only;
941 if (!once_only) {
942 once_only = true;
943 printk(KERN_WARNING "FS-Cache:"
944 " Cookie type %s marked page %lx"
945 " multiple times\n",
946 cookie->def->name, page->index);
947 }
948 }
949 }
950 1074
951 if (cookie->def->mark_pages_cached)
952 cookie->def->mark_pages_cached(cookie->netfs_data,
953 op->mapping, pagevec);
954 pagevec_reinit(pagevec); 1075 pagevec_reinit(pagevec);
955} 1076}
956EXPORT_SYMBOL(fscache_mark_pages_cached); 1077EXPORT_SYMBOL(fscache_mark_pages_cached);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 4765190d537f..8179e8bc4a3d 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -69,6 +69,7 @@ atomic_t fscache_n_store_vmscan_not_storing;
69atomic_t fscache_n_store_vmscan_gone; 69atomic_t fscache_n_store_vmscan_gone;
70atomic_t fscache_n_store_vmscan_busy; 70atomic_t fscache_n_store_vmscan_busy;
71atomic_t fscache_n_store_vmscan_cancelled; 71atomic_t fscache_n_store_vmscan_cancelled;
72atomic_t fscache_n_store_vmscan_wait;
72 73
73atomic_t fscache_n_marks; 74atomic_t fscache_n_marks;
74atomic_t fscache_n_uncaches; 75atomic_t fscache_n_uncaches;
@@ -80,6 +81,9 @@ atomic_t fscache_n_acquires_ok;
80atomic_t fscache_n_acquires_nobufs; 81atomic_t fscache_n_acquires_nobufs;
81atomic_t fscache_n_acquires_oom; 82atomic_t fscache_n_acquires_oom;
82 83
84atomic_t fscache_n_invalidates;
85atomic_t fscache_n_invalidates_run;
86
83atomic_t fscache_n_updates; 87atomic_t fscache_n_updates;
84atomic_t fscache_n_updates_null; 88atomic_t fscache_n_updates_null;
85atomic_t fscache_n_updates_run; 89atomic_t fscache_n_updates_run;
@@ -112,6 +116,7 @@ atomic_t fscache_n_cop_alloc_object;
112atomic_t fscache_n_cop_lookup_object; 116atomic_t fscache_n_cop_lookup_object;
113atomic_t fscache_n_cop_lookup_complete; 117atomic_t fscache_n_cop_lookup_complete;
114atomic_t fscache_n_cop_grab_object; 118atomic_t fscache_n_cop_grab_object;
119atomic_t fscache_n_cop_invalidate_object;
115atomic_t fscache_n_cop_update_object; 120atomic_t fscache_n_cop_update_object;
116atomic_t fscache_n_cop_drop_object; 121atomic_t fscache_n_cop_drop_object;
117atomic_t fscache_n_cop_put_object; 122atomic_t fscache_n_cop_put_object;
@@ -168,6 +173,10 @@ static int fscache_stats_show(struct seq_file *m, void *v)
168 atomic_read(&fscache_n_object_created), 173 atomic_read(&fscache_n_object_created),
169 atomic_read(&fscache_n_object_lookups_timed_out)); 174 atomic_read(&fscache_n_object_lookups_timed_out));
170 175
176 seq_printf(m, "Invals : n=%u run=%u\n",
177 atomic_read(&fscache_n_invalidates),
178 atomic_read(&fscache_n_invalidates_run));
179
171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n", 180 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
172 atomic_read(&fscache_n_updates), 181 atomic_read(&fscache_n_updates),
173 atomic_read(&fscache_n_updates_null), 182 atomic_read(&fscache_n_updates_null),
@@ -224,11 +233,12 @@ static int fscache_stats_show(struct seq_file *m, void *v)
224 atomic_read(&fscache_n_store_radix_deletes), 233 atomic_read(&fscache_n_store_radix_deletes),
225 atomic_read(&fscache_n_store_pages_over_limit)); 234 atomic_read(&fscache_n_store_pages_over_limit));
226 235
227 seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n", 236 seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u wt=%u\n",
228 atomic_read(&fscache_n_store_vmscan_not_storing), 237 atomic_read(&fscache_n_store_vmscan_not_storing),
229 atomic_read(&fscache_n_store_vmscan_gone), 238 atomic_read(&fscache_n_store_vmscan_gone),
230 atomic_read(&fscache_n_store_vmscan_busy), 239 atomic_read(&fscache_n_store_vmscan_busy),
231 atomic_read(&fscache_n_store_vmscan_cancelled)); 240 atomic_read(&fscache_n_store_vmscan_cancelled),
241 atomic_read(&fscache_n_store_vmscan_wait));
232 242
233 seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n", 243 seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n",
234 atomic_read(&fscache_n_op_pend), 244 atomic_read(&fscache_n_op_pend),
@@ -246,7 +256,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
246 atomic_read(&fscache_n_cop_lookup_object), 256 atomic_read(&fscache_n_cop_lookup_object),
247 atomic_read(&fscache_n_cop_lookup_complete), 257 atomic_read(&fscache_n_cop_lookup_complete),
248 atomic_read(&fscache_n_cop_grab_object)); 258 atomic_read(&fscache_n_cop_grab_object));
249 seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n", 259 seq_printf(m, "CacheOp: inv=%d upo=%d dro=%d pto=%d atc=%d syn=%d\n",
260 atomic_read(&fscache_n_cop_invalidate_object),
250 atomic_read(&fscache_n_cop_update_object), 261 atomic_read(&fscache_n_cop_update_object),
251 atomic_read(&fscache_n_cop_drop_object), 262 atomic_read(&fscache_n_cop_drop_object),
252 atomic_read(&fscache_n_cop_put_object), 263 atomic_read(&fscache_n_cop_put_object),
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 0cf160a94eda..1b2f6c2c3aaf 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -4,12 +4,24 @@ config FUSE_FS
4 With FUSE it is possible to implement a fully functional filesystem 4 With FUSE it is possible to implement a fully functional filesystem
5 in a userspace program. 5 in a userspace program.
6 6
7 There's also companion library: libfuse. This library along with 7 There's also a companion library: libfuse2. This library is available
8 utilities is available from the FUSE homepage: 8 from the FUSE homepage:
9 <http://fuse.sourceforge.net/> 9 <http://fuse.sourceforge.net/>
10 although chances are your distribution already has that library
11 installed if you've installed the "fuse" package itself.
10 12
11 See <file:Documentation/filesystems/fuse.txt> for more information. 13 See <file:Documentation/filesystems/fuse.txt> for more information.
12 See <file:Documentation/Changes> for needed library/utility version. 14 See <file:Documentation/Changes> for needed library/utility version.
13 15
14 If you want to develop a userspace FS, or if you want to use 16 If you want to develop a userspace FS, or if you want to use
15 a filesystem based on FUSE, answer Y or M. 17 a filesystem based on FUSE, answer Y or M.
18
19config CUSE
20 tristate "Character device in Userspace support"
21 depends on FUSE_FS
22 help
23 This FUSE extension allows character devices to be
24 implemented in userspace.
25
26 If you want to develop or use a userspace character device
27 based on CUSE, answer Y or M.
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index ee8d55042298..e397b675b029 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -45,7 +45,6 @@
45#include <linux/miscdevice.h> 45#include <linux/miscdevice.h>
46#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/spinlock.h>
49#include <linux/stat.h> 48#include <linux/stat.h>
50#include <linux/module.h> 49#include <linux/module.h>
51 50
@@ -63,7 +62,7 @@ struct cuse_conn {
63 bool unrestricted_ioctl; 62 bool unrestricted_ioctl;
64}; 63};
65 64
66static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */ 65static DEFINE_MUTEX(cuse_lock); /* protects registration */
67static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; 66static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];
68static struct class *cuse_class; 67static struct class *cuse_class;
69 68
@@ -114,14 +113,14 @@ static int cuse_open(struct inode *inode, struct file *file)
114 int rc; 113 int rc;
115 114
116 /* look up and get the connection */ 115 /* look up and get the connection */
117 spin_lock(&cuse_lock); 116 mutex_lock(&cuse_lock);
118 list_for_each_entry(pos, cuse_conntbl_head(devt), list) 117 list_for_each_entry(pos, cuse_conntbl_head(devt), list)
119 if (pos->dev->devt == devt) { 118 if (pos->dev->devt == devt) {
120 fuse_conn_get(&pos->fc); 119 fuse_conn_get(&pos->fc);
121 cc = pos; 120 cc = pos;
122 break; 121 break;
123 } 122 }
124 spin_unlock(&cuse_lock); 123 mutex_unlock(&cuse_lock);
125 124
126 /* dead? */ 125 /* dead? */
127 if (!cc) 126 if (!cc)
@@ -267,7 +266,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
267static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) 266static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
268{ 267{
269 char *end = p + len; 268 char *end = p + len;
270 char *key, *val; 269 char *uninitialized_var(key), *uninitialized_var(val);
271 int rc; 270 int rc;
272 271
273 while (true) { 272 while (true) {
@@ -305,14 +304,14 @@ static void cuse_gendev_release(struct device *dev)
305 */ 304 */
306static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) 305static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
307{ 306{
308 struct cuse_conn *cc = fc_to_cc(fc); 307 struct cuse_conn *cc = fc_to_cc(fc), *pos;
309 struct cuse_init_out *arg = req->out.args[0].value; 308 struct cuse_init_out *arg = req->out.args[0].value;
310 struct page *page = req->pages[0]; 309 struct page *page = req->pages[0];
311 struct cuse_devinfo devinfo = { }; 310 struct cuse_devinfo devinfo = { };
312 struct device *dev; 311 struct device *dev;
313 struct cdev *cdev; 312 struct cdev *cdev;
314 dev_t devt; 313 dev_t devt;
315 int rc; 314 int rc, i;
316 315
317 if (req->out.h.error || 316 if (req->out.h.error ||
318 arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { 317 arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
@@ -356,15 +355,24 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
356 dev_set_drvdata(dev, cc); 355 dev_set_drvdata(dev, cc);
357 dev_set_name(dev, "%s", devinfo.name); 356 dev_set_name(dev, "%s", devinfo.name);
358 357
358 mutex_lock(&cuse_lock);
359
360 /* make sure the device-name is unique */
361 for (i = 0; i < CUSE_CONNTBL_LEN; ++i) {
362 list_for_each_entry(pos, &cuse_conntbl[i], list)
363 if (!strcmp(dev_name(pos->dev), dev_name(dev)))
364 goto err_unlock;
365 }
366
359 rc = device_add(dev); 367 rc = device_add(dev);
360 if (rc) 368 if (rc)
361 goto err_device; 369 goto err_unlock;
362 370
363 /* register cdev */ 371 /* register cdev */
364 rc = -ENOMEM; 372 rc = -ENOMEM;
365 cdev = cdev_alloc(); 373 cdev = cdev_alloc();
366 if (!cdev) 374 if (!cdev)
367 goto err_device; 375 goto err_unlock;
368 376
369 cdev->owner = THIS_MODULE; 377 cdev->owner = THIS_MODULE;
370 cdev->ops = &cuse_frontend_fops; 378 cdev->ops = &cuse_frontend_fops;
@@ -377,9 +385,8 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
377 cc->cdev = cdev; 385 cc->cdev = cdev;
378 386
379 /* make the device available */ 387 /* make the device available */
380 spin_lock(&cuse_lock);
381 list_add(&cc->list, cuse_conntbl_head(devt)); 388 list_add(&cc->list, cuse_conntbl_head(devt));
382 spin_unlock(&cuse_lock); 389 mutex_unlock(&cuse_lock);
383 390
384 /* announce device availability */ 391 /* announce device availability */
385 dev_set_uevent_suppress(dev, 0); 392 dev_set_uevent_suppress(dev, 0);
@@ -391,7 +398,8 @@ out:
391 398
392err_cdev: 399err_cdev:
393 cdev_del(cdev); 400 cdev_del(cdev);
394err_device: 401err_unlock:
402 mutex_unlock(&cuse_lock);
395 put_device(dev); 403 put_device(dev);
396err_region: 404err_region:
397 unregister_chrdev_region(devt, 1); 405 unregister_chrdev_region(devt, 1);
@@ -520,9 +528,9 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
520 int rc; 528 int rc;
521 529
522 /* remove from the conntbl, no more access from this point on */ 530 /* remove from the conntbl, no more access from this point on */
523 spin_lock(&cuse_lock); 531 mutex_lock(&cuse_lock);
524 list_del_init(&cc->list); 532 list_del_init(&cc->list);
525 spin_unlock(&cuse_lock); 533 mutex_unlock(&cuse_lock);
526 534
527 /* remove device */ 535 /* remove device */
528 if (cc->dev) 536 if (cc->dev)
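The cuse_lock conversion is forced by the new uniqueness check: the scan of cuse_conntbl, the device_add() call, and the list_add() must form one critical section, and device_add() can sleep, which rules out a spinlock. The registration path in compressed form (reg_lock/reg_table stand in for the cuse names):

mutex_lock(&reg_lock);                  /* was spin_lock(); must permit sleeping */
list_for_each_entry(pos, reg_table, list)
        if (!strcmp(dev_name(pos->dev), dev_name(dev)))
                goto err_unlock;        /* duplicate name: reject atomically */
rc = device_add(dev);                   /* may sleep - illegal under a spinlock */
if (rc)
        goto err_unlock;
list_add(&cc->list, reg_table);         /* visible only once fully registered */
mutex_unlock(&reg_lock);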
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8c23fa7a91e6..e83351aa5bad 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -92,8 +92,8 @@ static void __fuse_put_request(struct fuse_req *req)
92 92
93static void fuse_req_init_context(struct fuse_req *req) 93static void fuse_req_init_context(struct fuse_req *req)
94{ 94{
95 req->in.h.uid = current_fsuid(); 95 req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
96 req->in.h.gid = current_fsgid(); 96 req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
97 req->in.h.pid = current->pid; 97 req->in.h.pid = current->pid;
98} 98}
99 99
@@ -692,8 +692,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
692 struct page *oldpage = *pagep; 692 struct page *oldpage = *pagep;
693 struct page *newpage; 693 struct page *newpage;
694 struct pipe_buffer *buf = cs->pipebufs; 694 struct pipe_buffer *buf = cs->pipebufs;
695 struct address_space *mapping;
696 pgoff_t index;
697 695
698 unlock_request(cs->fc, cs->req); 696 unlock_request(cs->fc, cs->req);
699 fuse_copy_finish(cs); 697 fuse_copy_finish(cs);
@@ -724,9 +722,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
724 if (fuse_check_page(newpage) != 0) 722 if (fuse_check_page(newpage) != 0)
725 goto out_fallback_unlock; 723 goto out_fallback_unlock;
726 724
727 mapping = oldpage->mapping;
728 index = oldpage->index;
729
730 /* 725 /*
731 * This is a new and locked page, it shouldn't be mapped or 726 * This is a new and locked page, it shouldn't be mapped or
732 * have any special flags on it 727 * have any special flags on it
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 324bc0850534..b7c09f9eb40c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -818,8 +818,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
 	stat->ino = attr->ino;
 	stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
 	stat->nlink = attr->nlink;
-	stat->uid = attr->uid;
-	stat->gid = attr->gid;
+	stat->uid = make_kuid(&init_user_ns, attr->uid);
+	stat->gid = make_kgid(&init_user_ns, attr->gid);
 	stat->rdev = inode->i_rdev;
 	stat->atime.tv_sec = attr->atime;
 	stat->atime.tv_nsec = attr->atimensec;
@@ -1007,12 +1007,12 @@ int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
 	rcu_read_lock();
 	ret = 0;
 	cred = __task_cred(task);
-	if (cred->euid == fc->user_id &&
-	    cred->suid == fc->user_id &&
-	    cred->uid == fc->user_id &&
-	    cred->egid == fc->group_id &&
-	    cred->sgid == fc->group_id &&
-	    cred->gid == fc->group_id)
+	if (uid_eq(cred->euid, fc->user_id) &&
+	    uid_eq(cred->suid, fc->user_id) &&
+	    uid_eq(cred->uid, fc->user_id) &&
+	    gid_eq(cred->egid, fc->group_id) &&
+	    gid_eq(cred->sgid, fc->group_id) &&
+	    gid_eq(cred->gid, fc->group_id))
 		ret = 1;
 	rcu_read_unlock();
 
@@ -1306,9 +1306,9 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
 	if (ivalid & ATTR_MODE)
 		arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode;
 	if (ivalid & ATTR_UID)
-		arg->valid |= FATTR_UID, arg->uid = iattr->ia_uid;
+		arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
 	if (ivalid & ATTR_GID)
-		arg->valid |= FATTR_GID, arg->gid = iattr->ia_gid;
+		arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
 	if (ivalid & ATTR_SIZE)
 		arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size;
 	if (ivalid & ATTR_ATIME) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 78d2837bc940..f3ab824fa302 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1599,19 +1599,19 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
 	return err ? 0 : outarg.block;
 }
 
-static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
+static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
 {
 	loff_t retval;
 	struct inode *inode = file->f_path.dentry->d_inode;
 
 	/* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
-	if (origin == SEEK_CUR || origin == SEEK_SET)
-		return generic_file_llseek(file, offset, origin);
+	if (whence == SEEK_CUR || whence == SEEK_SET)
+		return generic_file_llseek(file, offset, whence);
 
 	mutex_lock(&inode->i_mutex);
 	retval = fuse_update_attributes(inode, NULL, file, NULL);
 	if (!retval)
-		retval = generic_file_llseek(file, offset, origin);
+		retval = generic_file_llseek(file, offset, whence);
 	mutex_unlock(&inode->i_mutex);
 
 	return retval;
@@ -2177,8 +2177,8 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	return ret;
 }
 
-long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
+static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 			 loff_t length)
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
@@ -2213,7 +2213,6 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(fuse_file_fallocate);
 
 static const struct file_operations fuse_file_operations = {
 	.llseek		= fuse_file_llseek,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e24dd74e3068..e105a53fc72d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -333,10 +333,10 @@ struct fuse_conn {
 	atomic_t count;
 
 	/** The user id for this mount */
-	uid_t user_id;
+	kuid_t user_id;
 
 	/** The group id for this mount */
-	gid_t group_id;
+	kgid_t group_id;
 
 	/** The fuse mount flags for this mount */
 	unsigned flags;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f0eda124cffb..73ca6b72beaf 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -60,8 +60,8 @@ MODULE_PARM_DESC(max_user_congthresh,
 struct fuse_mount_data {
 	int fd;
 	unsigned rootmode;
-	unsigned user_id;
-	unsigned group_id;
+	kuid_t user_id;
+	kgid_t group_id;
 	unsigned fd_present:1;
 	unsigned rootmode_present:1;
 	unsigned user_id_present:1;
@@ -164,8 +164,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 	inode->i_ino = fuse_squash_ino(attr->ino);
 	inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
 	set_nlink(inode, attr->nlink);
-	inode->i_uid = attr->uid;
-	inode->i_gid = attr->gid;
+	inode->i_uid = make_kuid(&init_user_ns, attr->uid);
+	inode->i_gid = make_kgid(&init_user_ns, attr->gid);
 	inode->i_blocks = attr->blocks;
 	inode->i_atime.tv_sec = attr->atime;
 	inode->i_atime.tv_nsec = attr->atimensec;
@@ -492,14 +492,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
 		case OPT_USER_ID:
 			if (match_int(&args[0], &value))
 				return 0;
-			d->user_id = value;
+			d->user_id = make_kuid(current_user_ns(), value);
+			if (!uid_valid(d->user_id))
+				return 0;
 			d->user_id_present = 1;
 			break;
 
 		case OPT_GROUP_ID:
 			if (match_int(&args[0], &value))
 				return 0;
-			d->group_id = value;
+			d->group_id = make_kgid(current_user_ns(), value);
+			if (!gid_valid(d->group_id))
+				return 0;
 			d->group_id_present = 1;
 			break;
 
@@ -540,8 +544,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
 	struct super_block *sb = root->d_sb;
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
-	seq_printf(m, ",user_id=%u", fc->user_id);
-	seq_printf(m, ",group_id=%u", fc->group_id);
+	seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
+	seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
 	if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
 		seq_puts(m, ",default_permissions");
 	if (fc->flags & FUSE_ALLOW_OTHER)
@@ -989,7 +993,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!file)
 		goto err;
 
-	if (file->f_op != &fuse_dev_operations)
+	if ((file->f_op != &fuse_dev_operations) ||
+	    (file->f_cred->user_ns != &init_user_ns))
 		goto err_fput;
 
 	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
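
The parse_fuse_opt() change above is the standard validation pattern for numeric id mount options: map the value through the caller's user namespace, then reject ids that have no mapping there. A sketch under the same assumption of a kernel build context; sketch_parse_uid() is a hypothetical helper, not a fuse function:

#include <linux/uidgid.h>
#include <linux/cred.h>
#include <linux/errno.h>

static int sketch_parse_uid(unsigned int value, kuid_t *out)
{
	kuid_t uid = make_kuid(current_user_ns(), value);

	if (!uid_valid(uid))
		return -EINVAL;	/* no mapping in this namespace */
	*out = uid;
	return 0;
}
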
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 01c4975da4bc..30de4f2a2ea9 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -643,7 +643,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 			goto out_unlock;
 
 		requested = data_blocks + ind_blocks;
-		error = gfs2_inplace_reserve(ip, requested);
+		error = gfs2_inplace_reserve(ip, requested, 0);
 		if (error)
 			goto out_qunlock;
 	}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1fd3ae237bdd..a68e91bcef3d 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -991,6 +991,41 @@ unlock:
 	return err;
 }
 
+/**
+ * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
+ * @inode: The inode being truncated
+ * @oldsize: The original (larger) size
+ * @newsize: The new smaller size
+ *
+ * With jdata files, we have to journal a revoke for each block which is
+ * truncated. As a result, we need to split this into separate transactions
+ * if the number of pages being truncated gets too large.
+ */
+
+#define GFS2_JTRUNC_REVOKES 8192
+
+static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
+	u64 chunk;
+	int error;
+
+	while (oldsize != newsize) {
+		chunk = oldsize - newsize;
+		if (chunk > max_chunk)
+			chunk = max_chunk;
+		truncate_pagecache(inode, oldsize, oldsize - chunk);
+		oldsize -= chunk;
+		gfs2_trans_end(sdp);
+		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
 static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -1000,8 +1035,10 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
 	int journaled = gfs2_is_jdata(ip);
 	int error;
 
-	error = gfs2_trans_begin(sdp,
-				 RES_DINODE + (journaled ? RES_JDATA : 0), 0);
+	if (journaled)
+		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
+	else
+		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
 	if (error)
 		return error;
 
@@ -1026,7 +1063,16 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_dinode_out(ip, dibh->b_data);
 
-	truncate_pagecache(inode, oldsize, newsize);
+	if (journaled)
+		error = gfs2_journaled_truncate(inode, oldsize, newsize);
+	else
+		truncate_pagecache(inode, oldsize, newsize);
+
+	if (error) {
+		brelse(dibh);
+		return error;
+	}
+
 out_brelse:
 	brelse(dibh);
 out:
@@ -1178,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size)
 	if (error)
 		return error;
 
-	error = gfs2_inplace_reserve(ip, 1);
+	error = gfs2_inplace_reserve(ip, 1, 0);
 	if (error)
 		goto do_grow_qunlock;
 	unstuff = 1;
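
The gfs2_journaled_truncate() wrapper added above bounds each transaction to GFS2_JTRUNC_REVOKES revokes, i.e. at most 8192 * block size bytes of pagecache per pass (32 MiB with 4096-byte blocks). A stand-alone C illustration of just the chunking arithmetic, with the kernel calls reduced to a printf; the sizes are example values, not taken from a real filesystem:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t oldsize = 100ULL << 20;	/* truncate a 100 MiB file */
	uint64_t newsize = 0;
	uint64_t max_chunk = 8192 * 4096;	/* revokes * block size */

	while (oldsize != newsize) {
		uint64_t chunk = oldsize - newsize;

		if (chunk > max_chunk)
			chunk = max_chunk;
		/* kernel: truncate_pagecache(), then end and restart the
		 * transaction so the revokes fit in the journal */
		printf("pass: %llu -> %llu\n", (unsigned long long)oldsize,
		       (unsigned long long)(oldsize - chunk));
		oldsize -= chunk;
	}
	return 0;
}
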
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 259b088cfc4c..9a35670fdc38 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1676,16 +1676,11 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 				be16_add_cpu(&leaf->lf_entries, 1);
 			}
 			brelse(bh);
-			error = gfs2_meta_inode_buffer(ip, &bh);
-			if (error)
-				break;
-			gfs2_trans_add_bh(ip->i_gl, bh, 1);
 			ip->i_entries++;
 			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 			if (S_ISDIR(nip->i_inode.i_mode))
 				inc_nlink(&ip->i_inode);
-			gfs2_dinode_out(ip, bh->b_data);
-			brelse(bh);
+			mark_inode_dirty(inode);
 			error = 0;
 			break;
 		}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e056b4ce4877..991ab2d484dd 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -44,7 +44,7 @@
  * gfs2_llseek - seek to a location in a file
  * @file: the file
  * @offset: the offset
- * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
+ * @whence: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
  *
  * SEEK_END requires the glock for the file because it references the
  * file's size.
@@ -52,26 +52,26 @@
  * Returns: The new offset, or errno
  */
 
-static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
+static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
 	struct gfs2_holder i_gh;
 	loff_t error;
 
-	switch (origin) {
+	switch (whence) {
 	case SEEK_END: /* These reference inode->i_size */
 	case SEEK_DATA:
 	case SEEK_HOLE:
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
 					   &i_gh);
 		if (!error) {
-			error = generic_file_llseek(file, offset, origin);
+			error = generic_file_llseek(file, offset, whence);
 			gfs2_glock_dq_uninit(&i_gh);
 		}
 		break;
 	case SEEK_CUR:
 	case SEEK_SET:
-		error = generic_file_llseek(file, offset, origin);
+		error = generic_file_llseek(file, offset, whence);
 		break;
 	default:
 		error = -EINVAL;
@@ -432,7 +432,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		goto out_unlock;
 	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
-	ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks);
+	ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
 	if (ret)
 		goto out_quota_unlock;
 
@@ -825,7 +825,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 retry:
 	gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
 
-	error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks);
+	error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
 	if (error) {
 		if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
 			bytes >>= 1;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e6c2fd53cab2..992c5c0cb504 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -55,8 +55,6 @@ struct gfs2_glock_iter {
 
 typedef void (*glock_examiner) (struct gfs2_glock * gl);
 
-static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
-#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
 static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
 
 static struct dentry *gfs2_root;
@@ -107,10 +105,12 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
 {
 	struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
 
-	if (gl->gl_ops->go_flags & GLOF_ASPACE)
+	if (gl->gl_ops->go_flags & GLOF_ASPACE) {
 		kmem_cache_free(gfs2_glock_aspace_cachep, gl);
-	else
+	} else {
+		kfree(gl->gl_lksb.sb_lvbptr);
 		kmem_cache_free(gfs2_glock_cachep, gl);
+	}
 }
 
 void gfs2_glock_free(struct gfs2_glock *gl)
@@ -537,8 +537,8 @@ __acquires(&gl->gl_spin)
 	    (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
 		clear_bit(GLF_BLOCKING, &gl->gl_flags);
 	spin_unlock(&gl->gl_spin);
-	if (glops->go_xmote_th)
-		glops->go_xmote_th(gl);
+	if (glops->go_sync)
+		glops->go_sync(gl);
 	if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
 		glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
 	clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
@@ -547,7 +547,10 @@ __acquires(&gl->gl_spin)
 	if (sdp->sd_lockstruct.ls_ops->lm_lock) {
 		/* lock_dlm */
 		ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
-		GLOCK_BUG_ON(gl, ret);
+		if (ret) {
+			printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret);
+			GLOCK_BUG_ON(gl, 1);
+		}
 	} else { /* lock_nolock */
 		finish_xmote(gl, target);
 		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -736,6 +739,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	if (!gl)
 		return -ENOMEM;
 
+	memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
+
+	if (glops->go_flags & GLOF_LVB) {
+		gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL);
+		if (!gl->gl_lksb.sb_lvbptr) {
+			kmem_cache_free(cachep, gl);
+			return -ENOMEM;
+		}
+	}
+
 	atomic_inc(&sdp->sd_glock_disposal);
 	gl->gl_sbd = sdp;
 	gl->gl_flags = 0;
@@ -753,9 +766,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	preempt_enable();
 	gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0;
 	gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0;
-	memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
-	memset(gl->gl_lvb, 0, 32 * sizeof(char));
-	gl->gl_lksb.sb_lvbptr = gl->gl_lvb;
 	gl->gl_tchange = jiffies;
 	gl->gl_object = NULL;
 	gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
@@ -768,7 +778,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		mapping->host = s->s_bdev->bd_inode;
 		mapping->flags = 0;
 		mapping_set_gfp_mask(mapping, GFP_NOFS);
-		mapping->assoc_mapping = NULL;
+		mapping->private_data = NULL;
 		mapping->backing_dev_info = s->s_bdi;
 		mapping->writeback_index = 0;
 	}
@@ -777,6 +787,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	tmp = search_bucket(hash, sdp, &name);
 	if (tmp) {
 		spin_unlock_bucket(hash);
+		kfree(gl->gl_lksb.sb_lvbptr);
 		kmem_cache_free(cachep, gl);
 		atomic_dec(&sdp->sd_glock_disposal);
 		gl = tmp;
@@ -1013,7 +1024,7 @@ trap_recursive:
 	printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
 	printk(KERN_ERR "lock type: %d req lock state : %d\n",
 	       gh->gh_gl->gl_name.ln_type, gh->gh_state);
-	__dump_glock(NULL, gl);
+	gfs2_dump_glock(NULL, gl);
 	BUG();
 }
 
@@ -1508,7 +1519,7 @@ static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
 {
 	int ret;
 	spin_lock(&gl->gl_spin);
-	ret = __dump_glock(seq, gl);
+	ret = gfs2_dump_glock(seq, gl);
 	spin_unlock(&gl->gl_spin);
 	return ret;
 }
@@ -1528,6 +1539,7 @@ static void dump_glock_func(struct gfs2_glock *gl)
 
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 {
+	set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
 	glock_hash_walk(clear_glock, sdp);
 	flush_workqueue(glock_workqueue);
 	wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
@@ -1655,7 +1667,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
 }
 
 /**
- * __dump_glock - print information about a glock
+ * gfs2_dump_glock - print information about a glock
  * @seq: The seq_file struct
  * @gl: the glock
  *
@@ -1672,7 +1684,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
  * Returns: 0 on success, -ENOBUFS when we run out of space
  */
 
-static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
+int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	unsigned long long dtime;
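
The glock.c hunks above move the lock value block from a fixed 32-byte array inside every glock to a kzalloc'd buffer that only GLOF_LVB glock types carry, so each allocation needs a matching kfree on both free paths (the RCU dealloc and the lost-race path in gfs2_glock_get()). A reduced sketch of that pairing with a stand-in struct; GFS2_MIN_LVB_SIZE is replaced by a local constant:

#include <linux/slab.h>
#include <linux/errno.h>

#define SKETCH_GLOF_LVB 2
#define SKETCH_LVB_SIZE 32	/* stand-in for GFS2_MIN_LVB_SIZE */

struct sketch_glock {
	unsigned long go_flags;
	char *lvbptr;
};

static int sketch_glock_init(struct sketch_glock *gl)
{
	gl->lvbptr = NULL;
	if (gl->go_flags & SKETCH_GLOF_LVB) {
		gl->lvbptr = kzalloc(SKETCH_LVB_SIZE, GFP_KERNEL);
		if (!gl->lvbptr)
			return -ENOMEM;
	}
	return 0;
}

static void sketch_glock_free(struct sketch_glock *gl)
{
	kfree(gl->lvbptr);	/* kfree(NULL) is a no-op */
}
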
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 307ac31df781..fd580b7861d5 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -178,33 +178,33 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
 	return NULL;
 }
 
-int gfs2_glock_get(struct gfs2_sbd *sdp,
-		   u64 number, const struct gfs2_glock_operations *glops,
-		   int create, struct gfs2_glock **glp);
-void gfs2_glock_hold(struct gfs2_glock *gl);
-void gfs2_glock_put_nolock(struct gfs2_glock *gl);
-void gfs2_glock_put(struct gfs2_glock *gl);
-void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
-		      struct gfs2_holder *gh);
-void gfs2_holder_reinit(unsigned int state, unsigned flags,
-			struct gfs2_holder *gh);
-void gfs2_holder_uninit(struct gfs2_holder *gh);
-int gfs2_glock_nq(struct gfs2_holder *gh);
-int gfs2_glock_poll(struct gfs2_holder *gh);
-int gfs2_glock_wait(struct gfs2_holder *gh);
-void gfs2_glock_dq(struct gfs2_holder *gh);
-void gfs2_glock_dq_wait(struct gfs2_holder *gh);
-
-void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
-int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
-		      u64 number, const struct gfs2_glock_operations *glops,
-		      unsigned int state, int flags, struct gfs2_holder *gh);
-
-int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
-
-__printf(2, 3)
+extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
+			  const struct gfs2_glock_operations *glops,
+			  int create, struct gfs2_glock **glp);
+extern void gfs2_glock_hold(struct gfs2_glock *gl);
+extern void gfs2_glock_put_nolock(struct gfs2_glock *gl);
+extern void gfs2_glock_put(struct gfs2_glock *gl);
+extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
+			     unsigned flags, struct gfs2_holder *gh);
+extern void gfs2_holder_reinit(unsigned int state, unsigned flags,
+			       struct gfs2_holder *gh);
+extern void gfs2_holder_uninit(struct gfs2_holder *gh);
+extern int gfs2_glock_nq(struct gfs2_holder *gh);
+extern int gfs2_glock_poll(struct gfs2_holder *gh);
+extern int gfs2_glock_wait(struct gfs2_holder *gh);
+extern void gfs2_glock_dq(struct gfs2_holder *gh);
+extern void gfs2_glock_dq_wait(struct gfs2_holder *gh);
+extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
+extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
+			     const struct gfs2_glock_operations *glops,
+			     unsigned int state, int flags,
+			     struct gfs2_holder *gh);
+extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+extern void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
+#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
+extern __printf(2, 3)
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 
 /**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 32cc4fde975c..78d4184ffc7d 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -74,7 +74,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 
 		gfs2_trans_add_revoke(sdp, bd);
 	}
-	BUG_ON(!fsync && atomic_read(&gl->gl_ail_count));
+	GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count));
 	spin_unlock(&sdp->sd_ail_lock);
 	gfs2_log_unlock(sdp);
 }
@@ -96,7 +96,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	tr.tr_ip = (unsigned long)__builtin_return_address(0);
 	sb_start_intwrite(sdp->sd_vfs);
 	gfs2_log_reserve(sdp, tr.tr_reserved);
-	BUG_ON(current->journal_info);
+	WARN_ON_ONCE(current->journal_info);
 	current->journal_info = &tr;
 
 	__gfs2_ail_flush(gl, 0);
@@ -139,7 +139,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 
 	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
 		return;
-	BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE);
+	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
 
 	gfs2_log_flush(gl->gl_sbd, gl);
 	filemap_fdatawrite(metamapping);
@@ -168,7 +168,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 {
 	struct address_space *mapping = gfs2_glock2aspace(gl);
 
-	BUG_ON(!(flags & DIO_METADATA));
+	WARN_ON_ONCE(!(flags & DIO_METADATA));
 	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
 	truncate_inode_pages(mapping, 0);
 
@@ -197,7 +197,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
 	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
 		return;
 
-	BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE);
+	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
 
 	gfs2_log_flush(gl->gl_sbd, gl);
 	filemap_fdatawrite(metamapping);
@@ -536,7 +536,7 @@ const struct gfs2_glock_operations gfs2_meta_glops = {
 };
 
 const struct gfs2_glock_operations gfs2_inode_glops = {
-	.go_xmote_th = inode_go_sync,
+	.go_sync = inode_go_sync,
 	.go_inval = inode_go_inval,
 	.go_demote_ok = inode_go_demote_ok,
 	.go_lock = inode_go_lock,
@@ -546,17 +546,17 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 };
 
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
-	.go_xmote_th = rgrp_go_sync,
+	.go_sync = rgrp_go_sync,
 	.go_inval = rgrp_go_inval,
 	.go_lock = gfs2_rgrp_go_lock,
 	.go_unlock = gfs2_rgrp_go_unlock,
 	.go_dump = gfs2_rgrp_dump,
 	.go_type = LM_TYPE_RGRP,
-	.go_flags = GLOF_ASPACE,
+	.go_flags = GLOF_ASPACE | GLOF_LVB,
 };
 
 const struct gfs2_glock_operations gfs2_trans_glops = {
-	.go_xmote_th = trans_go_sync,
+	.go_sync = trans_go_sync,
 	.go_xmote_bh = trans_go_xmote_bh,
 	.go_demote_ok = trans_go_demote_ok,
 	.go_type = LM_TYPE_NONDISK,
@@ -577,6 +577,7 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = {
 
 const struct gfs2_glock_operations gfs2_quota_glops = {
 	.go_type = LM_TYPE_QUOTA,
+	.go_flags = GLOF_LVB,
 };
 
 const struct gfs2_glock_operations gfs2_journal_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 3d469d37345e..c373a24fedd9 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -205,7 +205,7 @@ struct lm_lockname {
 
 
 struct gfs2_glock_operations {
-	void (*go_xmote_th) (struct gfs2_glock *gl);
+	void (*go_sync) (struct gfs2_glock *gl);
 	int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
 	void (*go_inval) (struct gfs2_glock *gl, int flags);
 	int (*go_demote_ok) (const struct gfs2_glock *gl);
@@ -216,6 +216,7 @@ struct gfs2_glock_operations {
 	const int go_type;
 	const unsigned long go_flags;
 #define GLOF_ASPACE 1
+#define GLOF_LVB 2
 };
 
 enum {
@@ -321,7 +322,6 @@ struct gfs2_glock {
 	ktime_t gl_dstamp;
 	struct gfs2_lkstats gl_stats;
 	struct dlm_lksb gl_lksb;
-	char gl_lvb[32];
 	unsigned long gl_tchange;
 	void *gl_object;
 
@@ -539,6 +539,7 @@ enum {
 	SDF_DEMOTE = 5,
 	SDF_NOJOURNALID = 6,
 	SDF_RORECOVERY = 7, /* read only recovery */
+	SDF_SKIP_DLM_UNLOCK = 8,
 };
 
 #define GFS2_FSNAME_LEN 256
@@ -621,6 +622,7 @@ struct gfs2_sbd {
 	u32 sd_hash_bsize_shift;
 	u32 sd_hash_ptrs; /* Number of pointers in a hash block */
 	u32 sd_qc_per_block;
+	u32 sd_blocks_per_bitmap;
 	u32 sd_max_dirres; /* Max blocks needed to add a directory entry */
 	u32 sd_max_height; /* Max height of a file's metadata tree */
 	u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 381893ceefa4..2b6f5698ef18 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -364,34 +364,34 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 	return 0;
 }
 
-static void munge_mode_uid_gid(struct gfs2_inode *dip, umode_t *mode,
-			       unsigned int *uid, unsigned int *gid)
+static void munge_mode_uid_gid(const struct gfs2_inode *dip,
+			       struct inode *inode)
 {
 	if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
 	    (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) {
-		if (S_ISDIR(*mode))
-			*mode |= S_ISUID;
+		if (S_ISDIR(inode->i_mode))
+			inode->i_mode |= S_ISUID;
 		else if (dip->i_inode.i_uid != current_fsuid())
-			*mode &= ~07111;
-		*uid = dip->i_inode.i_uid;
+			inode->i_mode &= ~07111;
+		inode->i_uid = dip->i_inode.i_uid;
 	} else
-		*uid = current_fsuid();
+		inode->i_uid = current_fsuid();
 
 	if (dip->i_inode.i_mode & S_ISGID) {
-		if (S_ISDIR(*mode))
-			*mode |= S_ISGID;
-		*gid = dip->i_inode.i_gid;
+		if (S_ISDIR(inode->i_mode))
+			inode->i_mode |= S_ISGID;
+		inode->i_gid = dip->i_inode.i_gid;
 	} else
-		*gid = current_fsgid();
+		inode->i_gid = current_fsgid();
 }
 
-static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
+static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	int error;
 	int dblocks = 1;
 
-	error = gfs2_inplace_reserve(dip, RES_DINODE);
+	error = gfs2_inplace_reserve(ip, RES_DINODE, flags);
 	if (error)
 		goto out;
 
@@ -399,12 +399,15 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
 	if (error)
 		goto out_ipreserv;
 
-	error = gfs2_alloc_blocks(dip, no_addr, &dblocks, 1, generation);
+	error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation);
+	ip->i_no_formal_ino = ip->i_generation;
+	ip->i_inode.i_ino = ip->i_no_addr;
+	ip->i_goal = ip->i_no_addr;
 
 	gfs2_trans_end(sdp);
 
 out_ipreserv:
-	gfs2_inplace_release(dip);
+	gfs2_inplace_release(ip);
 out:
 	return error;
 }
@@ -429,52 +432,42 @@ static void gfs2_init_dir(struct buffer_head *dibh,
 /**
  * init_dinode - Fill in a new dinode structure
  * @dip: The directory this inode is being created in
- * @gl: The glock covering the new inode
- * @inum: The inode number
- * @mode: The file permissions
- * @uid: The uid of the new inode
- * @gid: The gid of the new inode
- * @generation: The generation number of the new inode
- * @dev: The device number (if a device node)
+ * @ip: The inode
  * @symname: The symlink destination (if a symlink)
- * @size: The inode size (ignored for directories)
 * @bhp: The buffer head (returned to caller)
 *
 */

-static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
-			const struct gfs2_inum_host *inum, umode_t mode,
-			unsigned int uid, unsigned int gid,
-			const u64 *generation, dev_t dev, const char *symname,
-			unsigned size, struct buffer_head **bhp)
+static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
+			const char *symname, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_dinode *di;
 	struct buffer_head *dibh;
 	struct timespec tv = CURRENT_TIME;
 
-	dibh = gfs2_meta_new(gl, inum->no_addr);
-	gfs2_trans_add_bh(gl, dibh, 1);
+	dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr);
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 	di = (struct gfs2_dinode *)dibh->b_data;
 
-	di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
-	di->di_num.no_addr = cpu_to_be64(inum->no_addr);
-	di->di_mode = cpu_to_be32(mode);
-	di->di_uid = cpu_to_be32(uid);
-	di->di_gid = cpu_to_be32(gid);
+	di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
+	di->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
+	di->di_mode = cpu_to_be32(ip->i_inode.i_mode);
+	di->di_uid = cpu_to_be32(ip->i_inode.i_uid);
+	di->di_gid = cpu_to_be32(ip->i_inode.i_gid);
 	di->di_nlink = 0;
-	di->di_size = cpu_to_be64(size);
+	di->di_size = cpu_to_be64(ip->i_inode.i_size);
 	di->di_blocks = cpu_to_be64(1);
 	di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec);
-	di->di_major = cpu_to_be32(MAJOR(dev));
-	di->di_minor = cpu_to_be32(MINOR(dev));
-	di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
-	di->di_generation = cpu_to_be64(*generation);
+	di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev));
+	di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev));
+	di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_no_addr);
+	di->di_generation = cpu_to_be64(ip->i_generation);
 	di->di_flags = 0;
 	di->__pad1 = 0;
-	di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0);
+	di->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) ? GFS2_FORMAT_DE : 0);
 	di->di_height = 0;
 	di->__pad2 = 0;
 	di->__pad3 = 0;
@@ -487,7 +480,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec);
 	memset(&di->di_reserved, 0, sizeof(di->di_reserved));
 
-	switch(mode & S_IFMT) {
+	switch(ip->i_inode.i_mode & S_IFMT) {
 	case S_IFREG:
 		if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
 		    gfs2_tune_get(sdp, gt_new_files_jdata))
@@ -502,7 +495,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 		gfs2_init_dir(dibh, dip);
 		break;
 	case S_IFLNK:
-		memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, size);
+		memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, ip->i_inode.i_size);
 		break;
 	}
 
@@ -511,25 +504,22 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	*bhp = dibh;
 }
 
-static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
-		       umode_t mode, const struct gfs2_inum_host *inum,
-		       const u64 *generation, dev_t dev, const char *symname,
-		       unsigned int size, struct buffer_head **bhp)
+static int make_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
+		       const char *symname, struct buffer_head **bhp)
 {
+	struct inode *inode = &ip->i_inode;
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
-	unsigned int uid, gid;
 	int error;
 
-	munge_mode_uid_gid(dip, &mode, &uid, &gid);
 	error = gfs2_rindex_update(sdp);
 	if (error)
 		return error;
 
-	error = gfs2_quota_lock(dip, uid, gid);
+	error = gfs2_quota_lock(dip, inode->i_uid, inode->i_gid);
 	if (error)
 		return error;
 
-	error = gfs2_quota_check(dip, uid, gid);
+	error = gfs2_quota_check(dip, inode->i_uid, inode->i_gid);
 	if (error)
 		goto out_quota;
 
@@ -537,8 +527,8 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	if (error)
 		goto out_quota;
 
-	init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, symname, size, bhp);
-	gfs2_quota_change(dip, +1, uid, gid);
+	init_dinode(dip, ip, symname, bhp);
+	gfs2_quota_change(dip, +1, inode->i_uid, inode->i_gid);
 	gfs2_trans_end(sdp);
 
 out_quota:
@@ -570,7 +560,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 	if (error)
 		goto fail_quota_locks;
 
-	error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres);
+	error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0);
 	if (error)
 		goto fail_quota_locks;
 
@@ -657,19 +647,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	struct inode *inode = NULL;
 	struct gfs2_inode *dip = GFS2_I(dir), *ip;
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
-	struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
+	struct gfs2_glock *io_gl;
 	int error;
-	u64 generation;
 	struct buffer_head *bh = NULL;
+	u32 aflags = 0;
 
 	if (!name->len || name->len > GFS2_FNAMESIZE)
 		return -ENAMETOOLONG;
 
-	/* We need a reservation to allocate the new dinode block. The
-	   directory ip temporarily points to the reservation, but this is
-	   being done to get a set of contiguous blocks for the new dinode.
-	   Since this is a create, we don't have a sizehint yet, so it will
-	   have to use the minimum reservation size. */
 	error = gfs2_rs_alloc(dip);
 	if (error)
 		return error;
@@ -688,45 +673,72 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (error)
 		goto fail_gunlock;
 
-	error = alloc_dinode(dip, &inum.no_addr, &generation);
+	inode = new_inode(sdp->sd_vfs);
+	if (!inode) {
+		gfs2_glock_dq_uninit(ghs);
+		return -ENOMEM;
+	}
+	ip = GFS2_I(inode);
+	error = gfs2_rs_alloc(ip);
 	if (error)
-		goto fail_gunlock;
-	inum.no_formal_ino = generation;
+		goto fail_free_inode;
+
+	set_bit(GIF_INVALID, &ip->i_flags);
+	inode->i_mode = mode;
+	inode->i_rdev = dev;
+	inode->i_size = size;
+	munge_mode_uid_gid(dip, inode);
+	ip->i_goal = dip->i_goal;
 
-	error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops,
-				  LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
+	if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) ||
+	    (dip->i_diskflags & GFS2_DIF_TOPDIR))
+		aflags |= GFS2_AF_ORLOV;
+
+	error = alloc_dinode(ip, aflags);
 	if (error)
-		goto fail_gunlock;
+		goto fail_free_inode;
 
-	error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, symname, size, &bh);
+	error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
 	if (error)
-		goto fail_gunlock2;
+		goto fail_free_inode;
 
-	inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
-				  inum.no_formal_ino, 0);
-	if (IS_ERR(inode))
+	ip->i_gl->gl_object = ip;
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
+	if (error)
+		goto fail_free_inode;
+
+	error = make_dinode(dip, ip, symname, &bh);
+	if (error)
 		goto fail_gunlock2;
 
-	ip = GFS2_I(inode);
-	error = gfs2_inode_refresh(ip);
+	error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
 	if (error)
 		goto fail_gunlock2;
 
-	error = gfs2_rs_alloc(ip);
+	error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
 	if (error)
 		goto fail_gunlock2;
 
+	ip->i_iopen_gh.gh_gl->gl_object = ip;
+	gfs2_glock_put(io_gl);
+	gfs2_set_iop(inode);
+	insert_inode_hash(inode);
+
+	error = gfs2_inode_refresh(ip);
+	if (error)
+		goto fail_gunlock3;
+
 	error = gfs2_acl_create(dip, inode);
 	if (error)
-		goto fail_gunlock2;
+		goto fail_gunlock3;
 
 	error = gfs2_security_init(dip, ip, name);
 	if (error)
-		goto fail_gunlock2;
+		goto fail_gunlock3;
 
 	error = link_dinode(dip, name, ip);
 	if (error)
-		goto fail_gunlock2;
+		goto fail_gunlock3;
 
 	if (bh)
 		brelse(bh);
@@ -739,8 +751,20 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	d_instantiate(dentry, inode);
 	return 0;
 
+fail_gunlock3:
+	gfs2_glock_dq_uninit(ghs + 1);
+	if (ip->i_gl)
+		gfs2_glock_put(ip->i_gl);
+	goto fail_gunlock;
+
 fail_gunlock2:
 	gfs2_glock_dq_uninit(ghs + 1);
+fail_free_inode:
+	if (ip->i_gl)
+		gfs2_glock_put(ip->i_gl);
+	gfs2_rs_delete(ip);
+	free_inode_nonrcu(inode);
+	inode = NULL;
 fail_gunlock:
 	gfs2_glock_dq_uninit(ghs);
 	if (inode && !IS_ERR(inode)) {
@@ -748,7 +772,6 @@ fail_gunlock:
 		iput(inode);
 	}
 fail:
-	gfs2_rs_delete(dip);
 	if (bh)
 		brelse(bh);
 	return error;
@@ -884,7 +907,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (error)
 		goto out_gunlock;
 
-	error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres);
+	error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0);
 	if (error)
 		goto out_gunlock_q;
 
@@ -977,7 +1000,6 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
  * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it
  * @dip: The parent directory
  * @name: The name of the entry in the parent directory
- * @bh: The inode buffer for the inode to be removed
 * @inode: The inode to be removed
 *
 * Called with all the locks and in a transaction. This will only be
@@ -987,8 +1009,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 */

 static int gfs2_unlink_inode(struct gfs2_inode *dip,
-			     const struct dentry *dentry,
-			     struct buffer_head *bh)
+			     const struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -1028,7 +1049,6 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 	struct gfs2_sbd *sdp = GFS2_SB(dir);
 	struct inode *inode = dentry->d_inode;
 	struct gfs2_inode *ip = GFS2_I(inode);
-	struct buffer_head *bh;
 	struct gfs2_holder ghs[3];
 	struct gfs2_rgrpd *rgd;
 	int error;
@@ -1077,14 +1097,9 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 
 	error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0);
 	if (error)
-		goto out_gunlock;
-
-	error = gfs2_meta_inode_buffer(ip, &bh);
-	if (error)
 		goto out_end_trans;
 
-	error = gfs2_unlink_inode(dip, dentry, bh);
-	brelse(bh);
+	error = gfs2_unlink_inode(dip, dentry);
 
 out_end_trans:
 	gfs2_trans_end(sdp);
@@ -1365,7 +1380,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	if (error)
 		goto out_gunlock;
 
-	error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres);
+	error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres, 0);
 	if (error)
 		goto out_gunlock_q;
 
@@ -1384,14 +1399,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 
 	/* Remove the target file, if it exists */
 
-	if (nip) {
-		struct buffer_head *bh;
-		error = gfs2_meta_inode_buffer(nip, &bh);
-		if (error)
-			goto out_end_trans;
-		error = gfs2_unlink_inode(ndip, ndentry, bh);
-		brelse(bh);
-	}
+	if (nip)
+		error = gfs2_unlink_inode(ndip, ndentry);
 
 	if (dir_rename) {
 		error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
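
The reworked gfs2_create_inode() above replaces the old two-label unwind with a ladder (fail_gunlock3 -> fail_gunlock2 -> fail_free_inode -> fail_gunlock), where each later label undoes strictly more acquired state and falls through to the earlier cleanup. A generic sketch of that goto-ladder shape with malloc'd stand-ins for the inode, glock and iopen holder; nothing here is gfs2 code:

#include <stdlib.h>

static int sketch_create(void)
{
	void *a, *b, *c;

	a = malloc(16);		/* stands in for the new VFS inode */
	if (!a)
		goto fail;
	b = malloc(16);		/* stands in for the inode glock */
	if (!b)
		goto fail_free_a;
	c = malloc(16);		/* stands in for the iopen glock holder */
	if (!c)
		goto fail_free_b;
	return 0;		/* success: caller now owns a, b and c */

fail_free_b:
	free(b);
fail_free_a:
	free(a);
fail:
	return -1;
}
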
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0fb6539b0c8c..9802de0f85e6 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -120,8 +120,8 @@ static void gdlm_ast(void *arg)
 	gfs2_update_reply_times(gl);
 	BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
 
-	if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID)
-		memset(gl->gl_lvb, 0, GDLM_LVB_SIZE);
+	if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr)
+		memset(gl->gl_lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
 
 	switch (gl->gl_lksb.sb_status) {
 	case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
@@ -203,8 +203,10 @@ static int make_mode(const unsigned int lmstate)
 static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
 		      const int req)
 {
-	u32 lkf = DLM_LKF_VALBLK;
-	u32 lkid = gl->gl_lksb.sb_lkid;
+	u32 lkf = 0;
+
+	if (gl->gl_lksb.sb_lvbptr)
+		lkf |= DLM_LKF_VALBLK;
 
 	if (gfs_flags & LM_FLAG_TRY)
 		lkf |= DLM_LKF_NOQUEUE;
@@ -228,7 +230,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
 		BUG();
 	}
 
-	if (lkid != 0) {
+	if (gl->gl_lksb.sb_lkid != 0) {
 		lkf |= DLM_LKF_CONVERT;
 		if (test_bit(GLF_BLOCKING, &gl->gl_flags))
 			lkf |= DLM_LKF_QUECVT;
@@ -239,6 +241,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
 
 static void gfs2_reverse_hex(char *c, u64 value)
 {
+	*c = '0';
 	while (value) {
 		*c-- = hex_asc[value & 0x0f];
 		value >>= 4;
@@ -278,6 +281,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	int lvb_needs_unlock = 0;
 	int error;
 
 	if (gl->gl_lksb.sb_lkid == 0) {
@@ -289,6 +293,18 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
 	gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
 	gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
 	gfs2_update_request_times(gl);
+
+	/* don't want to skip dlm_unlock writing the lvb when lock is ex */
+
+	if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE))
+		lvb_needs_unlock = 1;
+
+	if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
+	    !lvb_needs_unlock) {
+		gfs2_glock_free(gl);
+		return;
+	}
+
 	error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
 			   NULL, gl);
 	if (error) {
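
Condensed, the new fast path in gdlm_put_lock() is a two-input decision: during unmount (SDF_SKIP_DLM_UNLOCK set) the dlm_unlock() round trip is skipped, except when the glock is held exclusively with an LVB, because only dlm_unlock() would write that LVB back to the lock manager. A stand-alone restatement of the predicate, with local stand-ins for the patch's fields:

#include <stdbool.h>

static bool sketch_skip_dlm_unlock(bool skip_flag_set, bool has_lvb,
				   bool state_is_exclusive)
{
	bool lvb_needs_unlock = has_lvb && state_is_exclusive;

	return skip_flag_set && !lvb_needs_unlock;
}
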
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index e443966c8106..0e3554edb8f2 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -278,6 +278,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
 	sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
 				sizeof(struct gfs2_meta_header)) /
 				sizeof(struct gfs2_quota_change);
+	sdp->sd_blocks_per_bitmap = (sdp->sd_sb.sb_bsize -
+				     sizeof(struct gfs2_meta_header))
+				    * GFS2_NBBY; /* not the rgrp bitmap, subsequent bitmaps only */
 
 	/* Compute maximum reservation required to add a entry to a directory */
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c5af8e18f27a..ae55e248c3b7 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -816,7 +816,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 	blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
 
 	reserved = 1 + (nalloc * (data_blocks + ind_blocks));
-	error = gfs2_inplace_reserve(ip, reserved);
+	error = gfs2_inplace_reserve(ip, reserved, 0);
 	if (error)
 		goto out_alloc;
 
@@ -869,7 +869,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
 	if (error < 0)
 		return error;
 
-	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
 	qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
 	qlvb->__pad = 0;
 	qlvb->qb_limit = q.qu_limit;
@@ -893,7 +893,7 @@ restart:
 	if (error)
 		return error;
 
-	qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+	qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
 
 	if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
 		gfs2_glock_dq_uninit(q_gh);
@@ -1506,7 +1506,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
 	if (error)
 		goto out;
 
-	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+	qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
 	fdq->d_version = FS_DQUOT_VERSION;
 	fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
 	fdq->d_id = from_kqid(&init_user_ns, qid);
@@ -1605,7 +1605,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
 	gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
 			       &data_blocks, &ind_blocks);
 	blocks = 1 + data_blocks + ind_blocks;
-	error = gfs2_inplace_reserve(ip, blocks);
+	error = gfs2_inplace_reserve(ip, blocks, 0);
 	if (error)
 		goto out_i;
 	blocks += gfs2_rg_blocks(ip, blocks);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 38fe18f2f055..b7eff078fe90 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -16,6 +16,7 @@
16#include <linux/prefetch.h> 16#include <linux/prefetch.h>
17#include <linux/blkdev.h> 17#include <linux/blkdev.h>
18#include <linux/rbtree.h> 18#include <linux/rbtree.h>
19#include <linux/random.h>
19 20
20#include "gfs2.h" 21#include "gfs2.h"
21#include "incore.h" 22#include "incore.h"
@@ -251,22 +252,25 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
251static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) 252static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
252{ 253{
253 u64 rblock = block - rbm->rgd->rd_data0; 254 u64 rblock = block - rbm->rgd->rd_data0;
254 u32 goal = (u32)rblock; 255 u32 x;
255 int x;
256 256
257 if (WARN_ON_ONCE(rblock > UINT_MAX)) 257 if (WARN_ON_ONCE(rblock > UINT_MAX))
258 return -EINVAL; 258 return -EINVAL;
259 if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) 259 if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
260 return -E2BIG; 260 return -E2BIG;
261 261
262 for (x = 0; x < rbm->rgd->rd_length; x++) { 262 rbm->bi = rbm->rgd->rd_bits;
263 rbm->bi = rbm->rgd->rd_bits + x; 263 rbm->offset = (u32)(rblock);
264 if (goal < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) { 264 /* Check if the block is within the first bitmap block */
265 rbm->offset = goal - (rbm->bi->bi_start * GFS2_NBBY); 265 if (rbm->offset < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY)
266 break; 266 return 0;
267 }
268 }
269 267
268 /* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
269 rbm->offset += (sizeof(struct gfs2_rgrp) -
270 sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
271 x = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
272 rbm->offset -= x * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
273 rbm->bi += x;
270 return 0; 274 return 0;
271} 275}
272 276
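The rewritten gfs2_rbm_from_block() drops the per-bitmap loop: blocks in the first bitmap (which shares its block with the larger rgrp header) are handled up front, and everything else is located with a single division after biasing the offset by the header-size difference. A standalone sketch of that arithmetic (struct sizes are assumed values):

	#include <stdio.h>

	#define GFS2_NBBY 4

	int main(void)
	{
		unsigned int per_bitmap = 16288;		/* sd_blocks_per_bitmap, assumed */
		unsigned int bias = (128 - 24) * GFS2_NBBY;	/* gfs2_rgrp vs meta header, sizes assumed */
		unsigned int offset = 40000 + bias;		/* example block offset within the rgrp */
		unsigned int x = offset / per_bitmap;		/* index added to rd_bits */

		offset -= x * per_bitmap;			/* offset within that bitmap */
		printf("bitmap %u, offset %u\n", x, offset);	/* bitmap 2, offset 7840 */
		return 0;
	}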
@@ -346,10 +350,14 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
346 BUG_ON(len < chunk_size); 350 BUG_ON(len < chunk_size);
347 len -= chunk_size; 351 len -= chunk_size;
348 block = gfs2_rbm_to_block(&rbm); 352 block = gfs2_rbm_to_block(&rbm);
349 gfs2_rbm_from_block(&rbm, block + chunk_size); 353 if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
350 n_unaligned = 3; 354 n_unaligned = 0;
351 if (ptr) 355 break;
356 }
357 if (ptr) {
358 n_unaligned = 3;
352 break; 359 break;
360 }
353 n_unaligned = len & 3; 361 n_unaligned = len & 3;
354 } 362 }
355 363
@@ -553,22 +561,20 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
553 */ 561 */
554int gfs2_rs_alloc(struct gfs2_inode *ip) 562int gfs2_rs_alloc(struct gfs2_inode *ip)
555{ 563{
556 struct gfs2_blkreserv *res; 564 int error = 0;
557 565
566 down_write(&ip->i_rw_mutex);
558 if (ip->i_res) 567 if (ip->i_res)
559 return 0; 568 goto out;
560
561 res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
562 if (!res)
563 return -ENOMEM;
564 569
565 RB_CLEAR_NODE(&res->rs_node); 570 ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
571 if (!ip->i_res) {
572 error = -ENOMEM;
573 goto out;
574 }
566 575
567 down_write(&ip->i_rw_mutex); 576 RB_CLEAR_NODE(&ip->i_res->rs_node);
568 if (ip->i_res) 577out:
569 kmem_cache_free(gfs2_rsrv_cachep, res);
570 else
571 ip->i_res = res;
572 up_write(&ip->i_rw_mutex); 578 up_write(&ip->i_rw_mutex);
573 return 0; 579 return error;
574} 580}
@@ -875,7 +881,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
875 goto fail; 881 goto fail;
876 882
877 rgd->rd_gl->gl_object = rgd; 883 rgd->rd_gl->gl_object = rgd;
878 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lvb; 884 rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
879 rgd->rd_flags &= ~GFS2_RDF_UPTODATE; 885 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
880 if (rgd->rd_data > sdp->sd_max_rg_data) 886 if (rgd->rd_data > sdp->sd_max_rg_data)
881 sdp->sd_max_rg_data = rgd->rd_data; 887 sdp->sd_max_rg_data = rgd->rd_data;
@@ -1420,6 +1426,9 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1420 rs->rs_free = extlen; 1426 rs->rs_free = extlen;
1421 rs->rs_inum = ip->i_no_addr; 1427 rs->rs_inum = ip->i_no_addr;
1422 rs_insert(ip); 1428 rs_insert(ip);
1429 } else {
1430 if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1431 rgd->rd_last_alloc = 0;
1423 } 1432 }
1424} 1433}
1425 1434
@@ -1678,13 +1687,105 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
1678 return; 1687 return;
1679} 1688}
1680 1689
1690/**
1691 * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1692 * @rgd: The rgrp in question
1693 * @loops: An indication of how picky we can be (0=very, 1=less so)
1694 *
1695 * This function uses the recently added glock statistics in order to
 1696 * figure out whether a particular resource group is suffering from
1697 * contention from multiple nodes. This is done purely on the basis
1698 * of timings, since this is the only data we have to work with and
1699 * our aim here is to reject a resource group which is highly contended
 1700 * but (very importantly) not to do this too often, in order to ensure
 1701 * that we do not end up introducing fragmentation by changing resource
1702 * groups when not actually required.
1703 *
 1704 * The calculation is fairly simple: we want to know whether the SRTTB
1705 * (i.e. smoothed round trip time for blocking operations) to acquire
1706 * the lock for this rgrp's glock is significantly greater than the
1707 * time taken for resource groups on average. We introduce a margin in
1708 * the form of the variable @var which is computed as the sum of the two
 1709 * respective variances, and multiplied by a factor depending on @loops
1710 * and whether we have a lot of data to base the decision on. This is
 1711 * then tested against the squared difference of the means in order to
1712 * decide whether the result is statistically significant or not.
1713 *
1714 * Returns: A boolean verdict on the congestion status
1715 */
1716
1717static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1718{
1719 const struct gfs2_glock *gl = rgd->rd_gl;
1720 const struct gfs2_sbd *sdp = gl->gl_sbd;
1721 struct gfs2_lkstats *st;
1722 s64 r_dcount, l_dcount;
1723 s64 r_srttb, l_srttb;
1724 s64 srttb_diff;
1725 s64 sqr_diff;
1726 s64 var;
1727
1728 preempt_disable();
1729 st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1730 r_srttb = st->stats[GFS2_LKS_SRTTB];
1731 r_dcount = st->stats[GFS2_LKS_DCOUNT];
1732 var = st->stats[GFS2_LKS_SRTTVARB] +
1733 gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1734 preempt_enable();
1735
1736 l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1737 l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1738
1739 if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0))
1740 return false;
1741
1742 srttb_diff = r_srttb - l_srttb;
1743 sqr_diff = srttb_diff * srttb_diff;
1744
1745 var *= 2;
1746 if (l_dcount < 8 || r_dcount < 8)
1747 var *= 2;
1748 if (loops == 1)
1749 var *= 2;
1750
1751 return ((srttb_diff < 0) && (sqr_diff > var));
1752}
1753
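Restated as standalone C for clarity, with the per-cpu stat lookups factored out (same arithmetic as the function above):

	#include <stdbool.h>

	/* l_* are this glock's stats, r_* the per-cpu rgrp averages. */
	static bool rgrp_congested(long long l_srttb, long long l_dcount,
				   long long r_srttb, long long r_dcount,
				   long long var, int loops)
	{
		long long diff, sqr_diff;

		if (l_dcount < 1 || r_dcount < 1 || r_srttb == 0)
			return false;			/* too little data to judge */

		diff = r_srttb - l_srttb;		/* negative: this rgrp is slower */
		sqr_diff = diff * diff;

		var *= 2;				/* base margin */
		if (l_dcount < 8 || r_dcount < 8)
			var *= 2;			/* few samples: widen the margin */
		if (loops == 1)
			var *= 2;			/* later pass: be less picky */

		return (diff < 0) && (sqr_diff > var);
	}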
1754/**
 1755 * gfs2_rgrp_used_recently - test if an rgrp's glock has been used recently
1756 * @rs: The block reservation with the rgrp to test
1757 * @msecs: The time limit in milliseconds
1758 *
1759 * Returns: True if the rgrp glock has been used within the time limit
1760 */
1761static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1762 u64 msecs)
1763{
1764 u64 tdiff;
1765
1766 tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1767 rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1768
1769 return tdiff > (msecs * 1000 * 1000);
1770}
1771
1772static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1773{
1774 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1775 u32 skip;
1776
1777 get_random_bytes(&skip, sizeof(skip));
1778 return skip % sdp->sd_rgrps;
1779}
1780
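gfs2_orlov_skip() hands new directories a uniformly random number of resource groups to skip, in the spirit of ext2's Orlov allocator. A userspace sketch of the same idea (a libc RNG standing in for get_random_bytes()):

	#include <stdio.h>
	#include <stdlib.h>
	#include <time.h>

	int main(void)
	{
		unsigned int nr_rgrps = 17;	/* sdp->sd_rgrps, example value */

		srand((unsigned int)time(NULL));
		printf("skip %u rgrps\n", (unsigned int)rand() % nr_rgrps);
		return 0;
	}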
1681static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) 1781static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1682{ 1782{
1683 struct gfs2_rgrpd *rgd = *pos; 1783 struct gfs2_rgrpd *rgd = *pos;
1784 struct gfs2_sbd *sdp = rgd->rd_sbd;
1684 1785
1685 rgd = gfs2_rgrpd_get_next(rgd); 1786 rgd = gfs2_rgrpd_get_next(rgd);
1686 if (rgd == NULL) 1787 if (rgd == NULL)
1687 rgd = gfs2_rgrpd_get_next(NULL); 1788 rgd = gfs2_rgrpd_get_first(sdp);
1688 *pos = rgd; 1789 *pos = rgd;
1689 if (rgd != begin) /* If we didn't wrap */ 1790 if (rgd != begin) /* If we didn't wrap */
1690 return true; 1791 return true;
@@ -1699,14 +1800,15 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b
1699 * Returns: errno 1800 * Returns: errno
1700 */ 1801 */
1701 1802
1702int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) 1803int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
1703{ 1804{
1704 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1805 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1705 struct gfs2_rgrpd *begin = NULL; 1806 struct gfs2_rgrpd *begin = NULL;
1706 struct gfs2_blkreserv *rs = ip->i_res; 1807 struct gfs2_blkreserv *rs = ip->i_res;
1707 int error = 0, rg_locked, flags = LM_FLAG_TRY; 1808 int error = 0, rg_locked, flags = 0;
1708 u64 last_unlinked = NO_BLOCK; 1809 u64 last_unlinked = NO_BLOCK;
1709 int loops = 0; 1810 int loops = 0;
1811 u32 skip = 0;
1710 1812
1711 if (sdp->sd_args.ar_rgrplvb) 1813 if (sdp->sd_args.ar_rgrplvb)
1712 flags |= GL_SKIP; 1814 flags |= GL_SKIP;
@@ -1720,6 +1822,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
1720 } else { 1822 } else {
1721 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); 1823 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
1722 } 1824 }
1825 if (S_ISDIR(ip->i_inode.i_mode) && (aflags & GFS2_AF_ORLOV))
1826 skip = gfs2_orlov_skip(ip);
1723 if (rs->rs_rbm.rgd == NULL) 1827 if (rs->rs_rbm.rgd == NULL)
1724 return -EBADSLT; 1828 return -EBADSLT;
1725 1829
@@ -1728,13 +1832,20 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
1728 1832
1729 if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { 1833 if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
1730 rg_locked = 0; 1834 rg_locked = 0;
1835 if (skip && skip--)
1836 goto next_rgrp;
1837 if (!gfs2_rs_active(rs) && (loops < 2) &&
1838 gfs2_rgrp_used_recently(rs, 1000) &&
1839 gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
1840 goto next_rgrp;
1731 error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, 1841 error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
1732 LM_ST_EXCLUSIVE, flags, 1842 LM_ST_EXCLUSIVE, flags,
1733 &rs->rs_rgd_gh); 1843 &rs->rs_rgd_gh);
1734 if (error == GLR_TRYFAILED)
1735 goto next_rgrp;
1736 if (unlikely(error)) 1844 if (unlikely(error))
1737 return error; 1845 return error;
1846 if (!gfs2_rs_active(rs) && (loops < 2) &&
1847 gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
1848 goto skip_rgrp;
1738 if (sdp->sd_args.ar_rgrplvb) { 1849 if (sdp->sd_args.ar_rgrplvb) {
1739 error = update_rgrp_lvb(rs->rs_rbm.rgd); 1850 error = update_rgrp_lvb(rs->rs_rbm.rgd);
1740 if (unlikely(error)) { 1851 if (unlikely(error)) {
@@ -1781,12 +1892,13 @@ next_rgrp:
1781 /* Find the next rgrp, and continue looking */ 1892 /* Find the next rgrp, and continue looking */
1782 if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) 1893 if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
1783 continue; 1894 continue;
1895 if (skip)
1896 continue;
1784 1897
1785 /* If we've scanned all the rgrps, but found no free blocks 1898 /* If we've scanned all the rgrps, but found no free blocks
1786 * then this checks for some less likely conditions before 1899 * then this checks for some less likely conditions before
1787 * trying again. 1900 * trying again.
1788 */ 1901 */
1789 flags &= ~LM_FLAG_TRY;
1790 loops++; 1902 loops++;
1791 /* Check that fs hasn't grown if writing to rindex */ 1903 /* Check that fs hasn't grown if writing to rindex */
1792 if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { 1904 if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 24077958dcf6..842185853f6b 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,7 +39,8 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
39 39
40extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); 40extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
41 41
42extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested); 42#define GFS2_AF_ORLOV 1
43extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 flags);
43extern void gfs2_inplace_release(struct gfs2_inode *ip); 44extern void gfs2_inplace_release(struct gfs2_inode *ip);
44 45
45extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, 46extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
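A sketch of the resulting calling convention (the quota and xattr call sites above pass 0; a directory-create call site is assumed as the GFS2_AF_ORLOV user):

	/* ordinary allocations keep the old behaviour */
	error = gfs2_inplace_reserve(ip, requested, 0);

	/* assumed caller: spread newly created directories */
	error = gfs2_inplace_reserve(ip, requested, GFS2_AF_ORLOV);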
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index bbdc78af60ca..2ee13e841e9f 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -486,7 +486,7 @@ TRACE_EVENT(gfs2_block_alloc,
486 ), 486 ),
487 487
488 TP_fast_assign( 488 TP_fast_assign(
489 __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; 489 __entry->dev = rgd->rd_gl->gl_sbd->sd_vfs->s_dev;
490 __entry->start = block; 490 __entry->start = block;
491 __entry->inum = ip->i_no_addr; 491 __entry->inum = ip->i_no_addr;
492 __entry->len = len; 492 __entry->len = len;
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index db330e5518cd..76c144b3c9bb 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
734 if (error) 734 if (error)
735 return error; 735 return error;
736 736
737 error = gfs2_inplace_reserve(ip, blks); 737 error = gfs2_inplace_reserve(ip, blks, 0);
738 if (error) 738 if (error)
739 goto out_gunlock_q; 739 goto out_gunlock_q;
740 740
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 0b35903219bc..d47f11658c17 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -35,6 +35,16 @@ static int hfs_readpage(struct file *file, struct page *page)
35 return block_read_full_page(page, hfs_get_block); 35 return block_read_full_page(page, hfs_get_block);
36} 36}
37 37
38static void hfs_write_failed(struct address_space *mapping, loff_t to)
39{
40 struct inode *inode = mapping->host;
41
42 if (to > inode->i_size) {
43 truncate_pagecache(inode, to, inode->i_size);
44 hfs_file_truncate(inode);
45 }
46}
47
38static int hfs_write_begin(struct file *file, struct address_space *mapping, 48static int hfs_write_begin(struct file *file, struct address_space *mapping,
39 loff_t pos, unsigned len, unsigned flags, 49 loff_t pos, unsigned len, unsigned flags,
40 struct page **pagep, void **fsdata) 50 struct page **pagep, void **fsdata)
@@ -45,11 +55,8 @@ static int hfs_write_begin(struct file *file, struct address_space *mapping,
45 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 55 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
46 hfs_get_block, 56 hfs_get_block,
47 &HFS_I(mapping->host)->phys_size); 57 &HFS_I(mapping->host)->phys_size);
48 if (unlikely(ret)) { 58 if (unlikely(ret))
49 loff_t isize = mapping->host->i_size; 59 hfs_write_failed(mapping, pos + len);
50 if (pos + len > isize)
51 vmtruncate(mapping->host, isize);
52 }
53 60
54 return ret; 61 return ret;
55} 62}
@@ -120,6 +127,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
120 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 127 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
121{ 128{
122 struct file *file = iocb->ki_filp; 129 struct file *file = iocb->ki_filp;
130 struct address_space *mapping = file->f_mapping;
123 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 131 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
124 ssize_t ret; 132 ssize_t ret;
125 133
@@ -135,7 +143,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
135 loff_t end = offset + iov_length(iov, nr_segs); 143 loff_t end = offset + iov_length(iov, nr_segs);
136 144
137 if (end > isize) 145 if (end > isize)
138 vmtruncate(inode, isize); 146 hfs_write_failed(mapping, end);
139 } 147 }
140 148
141 return ret; 149 return ret;
@@ -617,9 +625,12 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
617 attr->ia_size != i_size_read(inode)) { 625 attr->ia_size != i_size_read(inode)) {
618 inode_dio_wait(inode); 626 inode_dio_wait(inode);
619 627
620 error = vmtruncate(inode, attr->ia_size); 628 error = inode_newsize_ok(inode, attr->ia_size);
621 if (error) 629 if (error)
622 return error; 630 return error;
631
632 truncate_setsize(inode, attr->ia_size);
633 hfs_file_truncate(inode);
623 } 634 }
624 635
625 setattr_copy(inode, attr); 636 setattr_copy(inode, attr);
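Every vmtruncate() removal in this series follows the same recipe; condensed into one hedged sketch (fs_file_truncate() is a stand-in for the per-filesystem helper):

	/* Sketch of the shared pattern, not a drop-in replacement. */
	static int setattr_size(struct inode *inode, loff_t newsize)
	{
		int error;

		error = inode_newsize_ok(inode, newsize);	/* limit and permission checks */
		if (error)
			return error;

		truncate_setsize(inode, newsize);	/* update i_size, trim pagecache */
		fs_file_truncate(inode);		/* free on-disk blocks (assumed name) */
		return 0;
	}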
@@ -668,7 +679,6 @@ static const struct file_operations hfs_file_operations = {
668 679
669static const struct inode_operations hfs_file_inode_operations = { 680static const struct inode_operations hfs_file_inode_operations = {
670 .lookup = hfs_file_lookup, 681 .lookup = hfs_file_lookup,
671 .truncate = hfs_file_truncate,
672 .setattr = hfs_inode_setattr, 682 .setattr = hfs_inode_setattr,
673 .setxattr = hfs_setxattr, 683 .setxattr = hfs_setxattr,
674 .getxattr = hfs_getxattr, 684 .getxattr = hfs_getxattr,
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index 4cfbe2edd296..6feefc0cb48a 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -176,12 +176,14 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
176 dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); 176 dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
177 /* are all of the bits in range? */ 177 /* are all of the bits in range? */
178 if ((offset + count) > sbi->total_blocks) 178 if ((offset + count) > sbi->total_blocks)
179 return -2; 179 return -ENOENT;
180 180
181 mutex_lock(&sbi->alloc_mutex); 181 mutex_lock(&sbi->alloc_mutex);
182 mapping = sbi->alloc_file->i_mapping; 182 mapping = sbi->alloc_file->i_mapping;
183 pnr = offset / PAGE_CACHE_BITS; 183 pnr = offset / PAGE_CACHE_BITS;
184 page = read_mapping_page(mapping, pnr, NULL); 184 page = read_mapping_page(mapping, pnr, NULL);
185 if (IS_ERR(page))
186 goto kaboom;
185 pptr = kmap(page); 187 pptr = kmap(page);
186 curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; 188 curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
187 end = pptr + PAGE_CACHE_BITS / 32; 189 end = pptr + PAGE_CACHE_BITS / 32;
@@ -214,6 +216,8 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
214 set_page_dirty(page); 216 set_page_dirty(page);
215 kunmap(page); 217 kunmap(page);
216 page = read_mapping_page(mapping, ++pnr, NULL); 218 page = read_mapping_page(mapping, ++pnr, NULL);
219 if (IS_ERR(page))
220 goto kaboom;
217 pptr = kmap(page); 221 pptr = kmap(page);
218 curr = pptr; 222 curr = pptr;
219 end = pptr + PAGE_CACHE_BITS / 32; 223 end = pptr + PAGE_CACHE_BITS / 32;
@@ -232,4 +236,11 @@ out:
232 mutex_unlock(&sbi->alloc_mutex); 236 mutex_unlock(&sbi->alloc_mutex);
233 237
234 return 0; 238 return 0;
239
240kaboom:
241 printk(KERN_CRIT "hfsplus: unable to mark blocks free: error %ld\n",
242 PTR_ERR(page));
243 mutex_unlock(&sbi->alloc_mutex);
244
245 return -EIO;
235} 246}
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 21023d9f8ff3..685d07d0ed18 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -159,7 +159,7 @@ void hfs_btree_close(struct hfs_btree *tree)
159 kfree(tree); 159 kfree(tree);
160} 160}
161 161
162void hfs_btree_write(struct hfs_btree *tree) 162int hfs_btree_write(struct hfs_btree *tree)
163{ 163{
164 struct hfs_btree_header_rec *head; 164 struct hfs_btree_header_rec *head;
165 struct hfs_bnode *node; 165 struct hfs_bnode *node;
@@ -168,7 +168,7 @@ void hfs_btree_write(struct hfs_btree *tree)
168 node = hfs_bnode_find(tree, 0); 168 node = hfs_bnode_find(tree, 0);
169 if (IS_ERR(node)) 169 if (IS_ERR(node))
170 /* panic? */ 170 /* panic? */
171 return; 171 return -EIO;
172 /* Load the header */ 172 /* Load the header */
173 page = node->page[0]; 173 page = node->page[0];
174 head = (struct hfs_btree_header_rec *)(kmap(page) + 174 head = (struct hfs_btree_header_rec *)(kmap(page) +
@@ -186,6 +186,7 @@ void hfs_btree_write(struct hfs_btree *tree)
186 kunmap(page); 186 kunmap(page);
187 set_page_dirty(page); 187 set_page_dirty(page);
188 hfs_bnode_put(node); 188 hfs_bnode_put(node);
189 return 0;
189} 190}
190 191
191static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) 192static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx)
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 5849e3ef35cc..eba76eab6d62 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -329,6 +329,7 @@ static int hfsplus_free_extents(struct super_block *sb,
329{ 329{
330 u32 count, start; 330 u32 count, start;
331 int i; 331 int i;
332 int err = 0;
332 333
333 hfsplus_dump_extent(extent); 334 hfsplus_dump_extent(extent);
334 for (i = 0; i < 8; extent++, i++) { 335 for (i = 0; i < 8; extent++, i++) {
@@ -345,18 +346,33 @@ found:
345 for (;;) { 346 for (;;) {
346 start = be32_to_cpu(extent->start_block); 347 start = be32_to_cpu(extent->start_block);
347 if (count <= block_nr) { 348 if (count <= block_nr) {
348 hfsplus_block_free(sb, start, count); 349 err = hfsplus_block_free(sb, start, count);
350 if (err) {
351 printk(KERN_ERR "hfs: can't free extent\n");
352 dprint(DBG_EXTENT, " start: %u count: %u\n",
353 start, count);
354 }
349 extent->block_count = 0; 355 extent->block_count = 0;
350 extent->start_block = 0; 356 extent->start_block = 0;
351 block_nr -= count; 357 block_nr -= count;
352 } else { 358 } else {
353 count -= block_nr; 359 count -= block_nr;
354 hfsplus_block_free(sb, start + count, block_nr); 360 err = hfsplus_block_free(sb, start + count, block_nr);
361 if (err) {
362 printk(KERN_ERR "hfs: can't free extent\n");
363 dprint(DBG_EXTENT, " start: %u count: %u\n",
364 start, count);
365 }
355 extent->block_count = cpu_to_be32(count); 366 extent->block_count = cpu_to_be32(count);
356 block_nr = 0; 367 block_nr = 0;
357 } 368 }
358 if (!block_nr || !i) 369 if (!block_nr || !i) {
359 return 0; 370 /*
371 * Try to free all extents and
 372 * return only the last error
373 */
374 return err;
375 }
360 i--; 376 i--;
361 extent--; 377 extent--;
362 count = be32_to_cpu(extent->block_count); 378 count = be32_to_cpu(extent->block_count);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index c571de224b15..a6da86b1b4c1 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -335,7 +335,7 @@ int hfsplus_block_free(struct super_block *, u32, u32);
335/* btree.c */ 335/* btree.c */
336struct hfs_btree *hfs_btree_open(struct super_block *, u32); 336struct hfs_btree *hfs_btree_open(struct super_block *, u32);
337void hfs_btree_close(struct hfs_btree *); 337void hfs_btree_close(struct hfs_btree *);
338void hfs_btree_write(struct hfs_btree *); 338int hfs_btree_write(struct hfs_btree *);
339struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *); 339struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *);
340void hfs_bmap_free(struct hfs_bnode *); 340void hfs_bmap_free(struct hfs_bnode *);
341 341
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 2172aa5976f5..799b336b59f9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -28,6 +28,16 @@ static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
28 return block_write_full_page(page, hfsplus_get_block, wbc); 28 return block_write_full_page(page, hfsplus_get_block, wbc);
29} 29}
30 30
31static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
32{
33 struct inode *inode = mapping->host;
34
35 if (to > inode->i_size) {
36 truncate_pagecache(inode, to, inode->i_size);
37 hfsplus_file_truncate(inode);
38 }
39}
40
31static int hfsplus_write_begin(struct file *file, struct address_space *mapping, 41static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
32 loff_t pos, unsigned len, unsigned flags, 42 loff_t pos, unsigned len, unsigned flags,
33 struct page **pagep, void **fsdata) 43 struct page **pagep, void **fsdata)
@@ -38,11 +48,8 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
38 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 48 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
39 hfsplus_get_block, 49 hfsplus_get_block,
40 &HFSPLUS_I(mapping->host)->phys_size); 50 &HFSPLUS_I(mapping->host)->phys_size);
41 if (unlikely(ret)) { 51 if (unlikely(ret))
42 loff_t isize = mapping->host->i_size; 52 hfsplus_write_failed(mapping, pos + len);
43 if (pos + len > isize)
44 vmtruncate(mapping->host, isize);
45 }
46 53
47 return ret; 54 return ret;
48} 55}
@@ -116,6 +123,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
116 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 123 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
117{ 124{
118 struct file *file = iocb->ki_filp; 125 struct file *file = iocb->ki_filp;
126 struct address_space *mapping = file->f_mapping;
119 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 127 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
120 ssize_t ret; 128 ssize_t ret;
121 129
@@ -131,7 +139,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
131 loff_t end = offset + iov_length(iov, nr_segs); 139 loff_t end = offset + iov_length(iov, nr_segs);
132 140
133 if (end > isize) 141 if (end > isize)
134 vmtruncate(inode, isize); 142 hfsplus_write_failed(mapping, end);
135 } 143 }
136 144
137 return ret; 145 return ret;
@@ -300,10 +308,8 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
300 if ((attr->ia_valid & ATTR_SIZE) && 308 if ((attr->ia_valid & ATTR_SIZE) &&
301 attr->ia_size != i_size_read(inode)) { 309 attr->ia_size != i_size_read(inode)) {
302 inode_dio_wait(inode); 310 inode_dio_wait(inode);
303 311 truncate_setsize(inode, attr->ia_size);
304 error = vmtruncate(inode, attr->ia_size); 312 hfsplus_file_truncate(inode);
305 if (error)
306 return error;
307 } 313 }
308 314
309 setattr_copy(inode, attr); 315 setattr_copy(inode, attr);
@@ -358,7 +364,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
358 364
359static const struct inode_operations hfsplus_file_inode_operations = { 365static const struct inode_operations hfsplus_file_inode_operations = {
360 .lookup = hfsplus_file_lookup, 366 .lookup = hfsplus_file_lookup,
361 .truncate = hfsplus_file_truncate,
362 .setattr = hfsplus_setattr, 367 .setattr = hfsplus_setattr,
363 .setxattr = hfsplus_setxattr, 368 .setxattr = hfsplus_setxattr,
364 .getxattr = hfsplus_getxattr, 369 .getxattr = hfsplus_getxattr,
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 811a84d2d964..796198d26553 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -127,8 +127,14 @@ static int hfsplus_system_write_inode(struct inode *inode)
127 hfsplus_mark_mdb_dirty(inode->i_sb); 127 hfsplus_mark_mdb_dirty(inode->i_sb);
128 } 128 }
129 hfsplus_inode_write_fork(inode, fork); 129 hfsplus_inode_write_fork(inode, fork);
130 if (tree) 130 if (tree) {
131 hfs_btree_write(tree); 131 int err = hfs_btree_write(tree);
132 if (err) {
133 printk(KERN_ERR "hfs: b-tree write err: %d, ino %lu\n",
134 err, inode->i_ino);
135 return err;
136 }
137 }
132 return 0; 138 return 0;
133} 139}
134 140
@@ -226,6 +232,7 @@ out:
226 232
227static void delayed_sync_fs(struct work_struct *work) 233static void delayed_sync_fs(struct work_struct *work)
228{ 234{
235 int err;
229 struct hfsplus_sb_info *sbi; 236 struct hfsplus_sb_info *sbi;
230 237
231 sbi = container_of(work, struct hfsplus_sb_info, sync_work.work); 238 sbi = container_of(work, struct hfsplus_sb_info, sync_work.work);
@@ -234,7 +241,9 @@ static void delayed_sync_fs(struct work_struct *work)
234 sbi->work_queued = 0; 241 sbi->work_queued = 0;
235 spin_unlock(&sbi->work_lock); 242 spin_unlock(&sbi->work_lock);
236 243
237 hfsplus_sync_fs(sbi->alloc_file->i_sb, 1); 244 err = hfsplus_sync_fs(sbi->alloc_file->i_sb, 1);
245 if (err)
246 printk(KERN_ERR "hfs: delayed sync fs err %d\n", err);
238} 247}
239 248
240void hfsplus_mark_mdb_dirty(struct super_block *sb) 249void hfsplus_mark_mdb_dirty(struct super_block *sb)
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 89d2a5803ae3..fbfe2df5624b 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -50,7 +50,7 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
50 return disk_secno; 50 return disk_secno;
51} 51}
52 52
53static void hpfs_truncate(struct inode *i) 53void hpfs_truncate(struct inode *i)
54{ 54{
55 if (IS_IMMUTABLE(i)) return /*-EPERM*/; 55 if (IS_IMMUTABLE(i)) return /*-EPERM*/;
56 hpfs_lock_assert(i->i_sb); 56 hpfs_lock_assert(i->i_sb);
@@ -105,6 +105,16 @@ static int hpfs_readpage(struct file *file, struct page *page)
105 return block_read_full_page(page,hpfs_get_block); 105 return block_read_full_page(page,hpfs_get_block);
106} 106}
107 107
108static void hpfs_write_failed(struct address_space *mapping, loff_t to)
109{
110 struct inode *inode = mapping->host;
111
112 if (to > inode->i_size) {
113 truncate_pagecache(inode, to, inode->i_size);
114 hpfs_truncate(inode);
115 }
116}
117
108static int hpfs_write_begin(struct file *file, struct address_space *mapping, 118static int hpfs_write_begin(struct file *file, struct address_space *mapping,
109 loff_t pos, unsigned len, unsigned flags, 119 loff_t pos, unsigned len, unsigned flags,
110 struct page **pagep, void **fsdata) 120 struct page **pagep, void **fsdata)
@@ -115,11 +125,8 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
115 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 125 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
116 hpfs_get_block, 126 hpfs_get_block,
117 &hpfs_i(mapping->host)->mmu_private); 127 &hpfs_i(mapping->host)->mmu_private);
118 if (unlikely(ret)) { 128 if (unlikely(ret))
119 loff_t isize = mapping->host->i_size; 129 hpfs_write_failed(mapping, pos + len);
120 if (pos + len > isize)
121 vmtruncate(mapping->host, isize);
122 }
123 130
124 return ret; 131 return ret;
125} 132}
@@ -166,6 +173,5 @@ const struct file_operations hpfs_file_ops =
166 173
167const struct inode_operations hpfs_file_iops = 174const struct inode_operations hpfs_file_iops =
168{ 175{
169 .truncate = hpfs_truncate,
170 .setattr = hpfs_setattr, 176 .setattr = hpfs_setattr,
171}; 177};
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 7102aaecc244..b7ae286646b5 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -252,6 +252,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
252/* file.c */ 252/* file.c */
253 253
254int hpfs_file_fsync(struct file *, loff_t, loff_t, int); 254int hpfs_file_fsync(struct file *, loff_t, loff_t, int);
255void hpfs_truncate(struct inode *);
255extern const struct file_operations hpfs_file_ops; 256extern const struct file_operations hpfs_file_ops;
256extern const struct inode_operations hpfs_file_iops; 257extern const struct inode_operations hpfs_file_iops;
257extern const struct address_space_operations hpfs_aops; 258extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 804a9a842cbc..5dc06c837105 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -277,9 +277,12 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
277 277
278 if ((attr->ia_valid & ATTR_SIZE) && 278 if ((attr->ia_valid & ATTR_SIZE) &&
279 attr->ia_size != i_size_read(inode)) { 279 attr->ia_size != i_size_read(inode)) {
280 error = vmtruncate(inode, attr->ia_size); 280 error = inode_newsize_ok(inode, attr->ia_size);
281 if (error) 281 if (error)
282 goto out_unlock; 282 goto out_unlock;
283
284 truncate_setsize(inode, attr->ia_size);
285 hpfs_truncate(inode);
283 } 286 }
284 287
285 setattr_copy(inode, attr); 288 setattr_copy(inode, attr);
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 78f21f8dc2ec..43b315f2002b 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -710,7 +710,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
710 struct vfsmount *proc_mnt; 710 struct vfsmount *proc_mnt;
711 int err = -ENOENT; 711 int err = -ENOENT;
712 712
713 proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); 713 proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt);
714 if (IS_ERR(proc_mnt)) 714 if (IS_ERR(proc_mnt))
715 goto out; 715 goto out;
716 716
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c5bc355d8243..78bde32ea951 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * hugetlbpage-backed filesystem. Based on ramfs. 2 * hugetlbpage-backed filesystem. Based on ramfs.
3 * 3 *
4 * William Irwin, 2002 4 * Nadia Yvette Chambers, 2002
5 * 5 *
6 * Copyright (C) 2002 Linus Torvalds. 6 * Copyright (C) 2002 Linus Torvalds.
7 */ 7 */
@@ -151,8 +151,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
151{ 151{
152 struct mm_struct *mm = current->mm; 152 struct mm_struct *mm = current->mm;
153 struct vm_area_struct *vma; 153 struct vm_area_struct *vma;
154 unsigned long start_addr;
155 struct hstate *h = hstate_file(file); 154 struct hstate *h = hstate_file(file);
155 struct vm_unmapped_area_info info;
156 156
157 if (len & ~huge_page_mask(h)) 157 if (len & ~huge_page_mask(h))
158 return -EINVAL; 158 return -EINVAL;
@@ -173,39 +173,13 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
173 return addr; 173 return addr;
174 } 174 }
175 175
176 if (len > mm->cached_hole_size) 176 info.flags = 0;
177 start_addr = mm->free_area_cache; 177 info.length = len;
178 else { 178 info.low_limit = TASK_UNMAPPED_BASE;
179 start_addr = TASK_UNMAPPED_BASE; 179 info.high_limit = TASK_SIZE;
180 mm->cached_hole_size = 0; 180 info.align_mask = PAGE_MASK & ~huge_page_mask(h);
181 } 181 info.align_offset = 0;
182 182 return vm_unmapped_area(&info);
183full_search:
184 addr = ALIGN(start_addr, huge_page_size(h));
185
186 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
187 /* At this point: (!vma || addr < vma->vm_end). */
188 if (TASK_SIZE - len < addr) {
189 /*
190 * Start a new search - just in case we missed
191 * some holes.
192 */
193 if (start_addr != TASK_UNMAPPED_BASE) {
194 start_addr = TASK_UNMAPPED_BASE;
195 mm->cached_hole_size = 0;
196 goto full_search;
197 }
198 return -ENOMEM;
199 }
200
201 if (!vma || addr + len <= vma->vm_start) {
202 mm->free_area_cache = addr + len;
203 return addr;
204 }
205 if (addr + mm->cached_hole_size < vma->vm_start)
206 mm->cached_hole_size = vma->vm_start - addr;
207 addr = ALIGN(vma->vm_end, huge_page_size(h));
208 }
209} 183}
210#endif 184#endif
211 185
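The deleted full_search loop and its free_area_cache/cached_hole_size bookkeeping collapse into one descriptor that the core now searches; the sketch below is the same call sequence as the hunk, reformatted with designated initializers:

	struct vm_unmapped_area_info info = {
		.flags		= 0,				/* bottom-up search */
		.length		= len,
		.low_limit	= TASK_UNMAPPED_BASE,
		.high_limit	= TASK_SIZE,
		.align_mask	= PAGE_MASK & ~huge_page_mask(h),
		.align_offset	= 0,
	};

	return vm_unmapped_area(&info);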
@@ -608,11 +582,11 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
608 int rc; 582 int rc;
609 583
610 rc = migrate_huge_page_move_mapping(mapping, newpage, page); 584 rc = migrate_huge_page_move_mapping(mapping, newpage, page);
611 if (rc) 585 if (rc != MIGRATEPAGE_SUCCESS)
612 return rc; 586 return rc;
613 migrate_page_copy(newpage, page); 587 migrate_page_copy(newpage, page);
614 588
615 return 0; 589 return MIGRATEPAGE_SUCCESS;
616} 590}
617 591
618static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 592static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -923,7 +897,7 @@ static struct file_system_type hugetlbfs_fs_type = {
923 .kill_sb = kill_litter_super, 897 .kill_sb = kill_litter_super,
924}; 898};
925 899
926static struct vfsmount *hugetlbfs_vfsmount; 900static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
927 901
928static int can_do_hugetlb_shm(void) 902static int can_do_hugetlb_shm(void)
929{ 903{
@@ -932,9 +906,22 @@ static int can_do_hugetlb_shm(void)
932 return capable(CAP_IPC_LOCK) || in_group_p(shm_group); 906 return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
933} 907}
934 908
909static int get_hstate_idx(int page_size_log)
910{
911 struct hstate *h;
912
913 if (!page_size_log)
914 return default_hstate_idx;
915 h = size_to_hstate(1 << page_size_log);
916 if (!h)
917 return -1;
918 return h - hstates;
919}
920
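A worked example for get_hstate_idx() (values assumed): a page_size_log of 0 keeps the default hstate, 21 asks size_to_hstate() for the 2 MiB hstate, 30 for the 1 GiB one, and any unsupported log yields -1, which hugetlb_file_setup() turns into -ENODEV.

	#include <stdio.h>

	int main(void)
	{
		unsigned int page_size_log = 21;	/* e.g. encoded in shmget() flags, assumed */

		printf("%llu MiB huge pages requested\n",
		       (1ULL << page_size_log) >> 20);	/* 2 */
		return 0;
	}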
935struct file *hugetlb_file_setup(const char *name, unsigned long addr, 921struct file *hugetlb_file_setup(const char *name, unsigned long addr,
936 size_t size, vm_flags_t acctflag, 922 size_t size, vm_flags_t acctflag,
937 struct user_struct **user, int creat_flags) 923 struct user_struct **user,
924 int creat_flags, int page_size_log)
938{ 925{
939 int error = -ENOMEM; 926 int error = -ENOMEM;
940 struct file *file; 927 struct file *file;
@@ -944,9 +931,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
944 struct qstr quick_string; 931 struct qstr quick_string;
945 struct hstate *hstate; 932 struct hstate *hstate;
946 unsigned long num_pages; 933 unsigned long num_pages;
934 int hstate_idx;
935
936 hstate_idx = get_hstate_idx(page_size_log);
937 if (hstate_idx < 0)
938 return ERR_PTR(-ENODEV);
947 939
948 *user = NULL; 940 *user = NULL;
949 if (!hugetlbfs_vfsmount) 941 if (!hugetlbfs_vfsmount[hstate_idx])
950 return ERR_PTR(-ENOENT); 942 return ERR_PTR(-ENOENT);
951 943
952 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 944 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
@@ -963,7 +955,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
963 } 955 }
964 } 956 }
965 957
966 root = hugetlbfs_vfsmount->mnt_root; 958 root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
967 quick_string.name = name; 959 quick_string.name = name;
968 quick_string.len = strlen(quick_string.name); 960 quick_string.len = strlen(quick_string.name);
969 quick_string.hash = 0; 961 quick_string.hash = 0;
@@ -971,7 +963,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
971 if (!path.dentry) 963 if (!path.dentry)
972 goto out_shm_unlock; 964 goto out_shm_unlock;
973 965
974 path.mnt = mntget(hugetlbfs_vfsmount); 966 path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
975 error = -ENOSPC; 967 error = -ENOSPC;
976 inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); 968 inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
977 if (!inode) 969 if (!inode)
@@ -1011,8 +1003,9 @@ out_shm_unlock:
1011 1003
1012static int __init init_hugetlbfs_fs(void) 1004static int __init init_hugetlbfs_fs(void)
1013{ 1005{
1006 struct hstate *h;
1014 int error; 1007 int error;
1015 struct vfsmount *vfsmount; 1008 int i;
1016 1009
1017 error = bdi_init(&hugetlbfs_backing_dev_info); 1010 error = bdi_init(&hugetlbfs_backing_dev_info);
1018 if (error) 1011 if (error)
@@ -1029,14 +1022,26 @@ static int __init init_hugetlbfs_fs(void)
1029 if (error) 1022 if (error)
1030 goto out; 1023 goto out;
1031 1024
1032 vfsmount = kern_mount(&hugetlbfs_fs_type); 1025 i = 0;
1026 for_each_hstate(h) {
1027 char buf[50];
1028 unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
1033 1029
1034 if (!IS_ERR(vfsmount)) { 1030 snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
1035 hugetlbfs_vfsmount = vfsmount; 1031 hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
1036 return 0; 1032 buf);
1037 }
1038 1033
1039 error = PTR_ERR(vfsmount); 1034 if (IS_ERR(hugetlbfs_vfsmount[i])) {
1035 pr_err("hugetlb: Cannot mount internal hugetlbfs for "
1036 "page size %uK", ps_kb);
1037 error = PTR_ERR(hugetlbfs_vfsmount[i]);
1038 hugetlbfs_vfsmount[i] = NULL;
1039 }
1040 i++;
1041 }
1042 /* Non default hstates are optional */
1043 if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
1044 return 0;
1040 1045
1041 out: 1046 out:
1042 kmem_cache_destroy(hugetlbfs_inode_cachep); 1047 kmem_cache_destroy(hugetlbfs_inode_cachep);
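A standalone check of the mount-option arithmetic above (order 9 with PAGE_SHIFT 12, the usual x86-64 2 MiB hstate, is an assumed example):

	#include <stdio.h>

	int main(void)
	{
		unsigned int order = 9, page_shift = 12;	/* assumed values */
		unsigned int ps_kb = 1U << (order + page_shift - 10);
		char buf[50];

		snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
		puts(buf);	/* pagesize=2048K */
		return 0;
	}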
@@ -1047,13 +1052,19 @@ static int __init init_hugetlbfs_fs(void)
1047 1052
1048static void __exit exit_hugetlbfs_fs(void) 1053static void __exit exit_hugetlbfs_fs(void)
1049{ 1054{
1055 struct hstate *h;
1056 int i;
1057
1058
1050 /* 1059 /*
1051 * Make sure all delayed rcu free inodes are flushed before we 1060 * Make sure all delayed rcu free inodes are flushed before we
1052 * destroy cache. 1061 * destroy cache.
1053 */ 1062 */
1054 rcu_barrier(); 1063 rcu_barrier();
1055 kmem_cache_destroy(hugetlbfs_inode_cachep); 1064 kmem_cache_destroy(hugetlbfs_inode_cachep);
1056 kern_unmount(hugetlbfs_vfsmount); 1065 i = 0;
1066 for_each_hstate(h)
1067 kern_unmount(hugetlbfs_vfsmount[i++]);
1057 unregister_filesystem(&hugetlbfs_fs_type); 1068 unregister_filesystem(&hugetlbfs_fs_type);
1058 bdi_destroy(&hugetlbfs_backing_dev_info); 1069 bdi_destroy(&hugetlbfs_backing_dev_info);
1059} 1070}
diff --git a/fs/inode.c b/fs/inode.c
index 64999f144153..14084b72b259 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -165,7 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
165 mapping->host = inode; 165 mapping->host = inode;
166 mapping->flags = 0; 166 mapping->flags = 0;
167 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 167 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
168 mapping->assoc_mapping = NULL; 168 mapping->private_data = NULL;
169 mapping->backing_dev_info = &default_backing_dev_info; 169 mapping->backing_dev_info = &default_backing_dev_info;
170 mapping->writeback_index = 0; 170 mapping->writeback_index = 0;
171 171
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index a2862339323b..81cc7eaff863 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -446,7 +446,8 @@ int __log_start_commit(journal_t *journal, tid_t target)
446 * currently running transaction (if it exists). Otherwise, 446 * currently running transaction (if it exists). Otherwise,
447 * the target tid must be an old one. 447 * the target tid must be an old one.
448 */ 448 */
449 if (journal->j_running_transaction && 449 if (journal->j_commit_request != target &&
450 journal->j_running_transaction &&
450 journal->j_running_transaction->t_tid == target) { 451 journal->j_running_transaction->t_tid == target) {
451 /* 452 /*
452 * We want a new commit: OK, mark the request and wakeup the 453 * We want a new commit: OK, mark the request and wakeup the
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 7f5120bf0ec2..071d6905f0dd 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1259,7 +1259,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
1259 goto not_jbd; 1259 goto not_jbd;
1260 } 1260 }
1261 1261
1262 /* keep track of wether or not this transaction modified us */ 1262 /* keep track of whether or not this transaction modified us */
1263 was_modified = jh->b_modified; 1263 was_modified = jh->b_modified;
1264 1264
1265 /* 1265 /*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 484b8d1c6cb6..dbf41f9452db 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -60,7 +60,6 @@ EXPORT_SYMBOL(jbd2_journal_get_create_access);
60EXPORT_SYMBOL(jbd2_journal_get_undo_access); 60EXPORT_SYMBOL(jbd2_journal_get_undo_access);
61EXPORT_SYMBOL(jbd2_journal_set_triggers); 61EXPORT_SYMBOL(jbd2_journal_set_triggers);
62EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 62EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
63EXPORT_SYMBOL(jbd2_journal_release_buffer);
64EXPORT_SYMBOL(jbd2_journal_forget); 63EXPORT_SYMBOL(jbd2_journal_forget);
65#if 0 64#if 0
66EXPORT_SYMBOL(journal_sync_buffer); 65EXPORT_SYMBOL(journal_sync_buffer);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a74ba4659549..df9f29760efa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -209,7 +209,8 @@ repeat:
209 if (!new_transaction) 209 if (!new_transaction)
210 goto alloc_transaction; 210 goto alloc_transaction;
211 write_lock(&journal->j_state_lock); 211 write_lock(&journal->j_state_lock);
212 if (!journal->j_running_transaction) { 212 if (!journal->j_running_transaction &&
213 !journal->j_barrier_count) {
213 jbd2_get_transaction(journal, new_transaction); 214 jbd2_get_transaction(journal, new_transaction);
214 new_transaction = NULL; 215 new_transaction = NULL;
215 } 216 }
@@ -1207,17 +1208,6 @@ out:
1207 return ret; 1208 return ret;
1208} 1209}
1209 1210
1210/*
1211 * jbd2_journal_release_buffer: undo a get_write_access without any buffer
1212 * updates, if the update decided in the end that it didn't need access.
1213 *
1214 */
1215void
1216jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1217{
1218 BUFFER_TRACE(bh, "entry");
1219}
1220
1221/** 1211/**
1222 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. 1212 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1223 * @handle: transaction handle 1213 * @handle: transaction handle
@@ -1261,7 +1251,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1261 goto not_jbd; 1251 goto not_jbd;
1262 } 1252 }
1263 1253
1264 /* keep track of wether or not this transaction modified us */ 1254 /* keep track of whether or not this transaction modified us */
1265 was_modified = jh->b_modified; 1255 was_modified = jh->b_modified;
1266 1256
1267 /* 1257 /*
@@ -1850,7 +1840,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
1850 1840
1851 BUFFER_TRACE(bh, "entry"); 1841 BUFFER_TRACE(bh, "entry");
1852 1842
1853retry:
1854 /* 1843 /*
1855 * It is safe to proceed here without the j_list_lock because the 1844 * It is safe to proceed here without the j_list_lock because the
1856 * buffers cannot be stolen by try_to_free_buffers as long as we are 1845 * buffers cannot be stolen by try_to_free_buffers as long as we are
@@ -1945,14 +1934,11 @@ retry:
1945 * for commit and try again. 1934 * for commit and try again.
1946 */ 1935 */
1947 if (partial_page) { 1936 if (partial_page) {
1948 tid_t tid = journal->j_committing_transaction->t_tid;
1949
1950 jbd2_journal_put_journal_head(jh); 1937 jbd2_journal_put_journal_head(jh);
1951 spin_unlock(&journal->j_list_lock); 1938 spin_unlock(&journal->j_list_lock);
1952 jbd_unlock_bh_state(bh); 1939 jbd_unlock_bh_state(bh);
1953 write_unlock(&journal->j_state_lock); 1940 write_unlock(&journal->j_state_lock);
1954 jbd2_log_wait_commit(journal, tid); 1941 return -EBUSY;
1955 goto retry;
1956 } 1942 }
1957 /* 1943 /*
1958 * OK, buffer won't be reachable after truncate. We just set 1944 * OK, buffer won't be reachable after truncate. We just set
@@ -2013,21 +1999,23 @@ zap_buffer_unlocked:
2013 * @page: page to flush 1999 * @page: page to flush
2014 * @offset: length of page to invalidate. 2000 * @offset: length of page to invalidate.
2015 * 2001 *
2016 * Reap page buffers containing data after offset in page. 2002 * Reap page buffers containing data after offset in page. Can return -EBUSY
2017 * 2003 * if buffers are part of the committing transaction and the page is straddling
2004 * i_size. Caller then has to wait for current commit and try again.
2018 */ 2005 */
2019void jbd2_journal_invalidatepage(journal_t *journal, 2006int jbd2_journal_invalidatepage(journal_t *journal,
2020 struct page *page, 2007 struct page *page,
2021 unsigned long offset) 2008 unsigned long offset)
2022{ 2009{
2023 struct buffer_head *head, *bh, *next; 2010 struct buffer_head *head, *bh, *next;
2024 unsigned int curr_off = 0; 2011 unsigned int curr_off = 0;
2025 int may_free = 1; 2012 int may_free = 1;
2013 int ret = 0;
2026 2014
2027 if (!PageLocked(page)) 2015 if (!PageLocked(page))
2028 BUG(); 2016 BUG();
2029 if (!page_has_buffers(page)) 2017 if (!page_has_buffers(page))
2030 return; 2018 return 0;
2031 2019
2032 /* We will potentially be playing with lists other than just the 2020 /* We will potentially be playing with lists other than just the
2033 * data lists (especially for journaled data mode), so be 2021 * data lists (especially for journaled data mode), so be
@@ -2041,9 +2029,11 @@ void jbd2_journal_invalidatepage(journal_t *journal,
2041 if (offset <= curr_off) { 2029 if (offset <= curr_off) {
2042 /* This block is wholly outside the truncation point */ 2030 /* This block is wholly outside the truncation point */
2043 lock_buffer(bh); 2031 lock_buffer(bh);
2044 may_free &= journal_unmap_buffer(journal, bh, 2032 ret = journal_unmap_buffer(journal, bh, offset > 0);
2045 offset > 0);
2046 unlock_buffer(bh); 2033 unlock_buffer(bh);
2034 if (ret < 0)
2035 return ret;
2036 may_free &= ret;
2047 } 2037 }
2048 curr_off = next_off; 2038 curr_off = next_off;
2049 bh = next; 2039 bh = next;
@@ -2054,6 +2044,7 @@ void jbd2_journal_invalidatepage(journal_t *journal,
2054 if (may_free && try_to_free_buffers(page)) 2044 if (may_free && try_to_free_buffers(page))
2055 J_ASSERT(!page_has_buffers(page)); 2045 J_ASSERT(!page_has_buffers(page));
2056 } 2046 }
2047 return 0;
2057} 2048}
2058 2049
2059/* 2050/*
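With the retry loop gone from journal_unmap_buffer(), waiting for the committing transaction is now the caller's job. A heavily simplified sketch of the intended caller pattern (real callers such as ext4 need their own locking, omitted here; the tid capture mirrors the deleted lines):

	ret = jbd2_journal_invalidatepage(journal, page, offset);
	if (ret == -EBUSY) {
		/* page straddles i_size over a committing transaction */
		tid_t tid = journal->j_committing_transaction->t_tid;

		jbd2_log_wait_commit(journal, tid);
		/* ...then retry the invalidate... */
	}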
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 0c96eb52c797..03310721712f 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -417,14 +417,16 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
417 spin_unlock(&c->erase_completion_lock); 417 spin_unlock(&c->erase_completion_lock);
418 418
419 ret = jffs2_prealloc_raw_node_refs(c, jeb, 1); 419 ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
420 if (ret) 420
421 return ret;
422 /* Just lock it again and continue. Nothing much can change because 421 /* Just lock it again and continue. Nothing much can change because
423 we hold c->alloc_sem anyway. In fact, it's not entirely clear why 422 we hold c->alloc_sem anyway. In fact, it's not entirely clear why
424 we hold c->erase_completion_lock in the majority of this function... 423 we hold c->erase_completion_lock in the majority of this function...
425 but that's a question for another (more caffeine-rich) day. */ 424 but that's a question for another (more caffeine-rich) day. */
426 spin_lock(&c->erase_completion_lock); 425 spin_lock(&c->erase_completion_lock);
427 426
427 if (ret)
428 return ret;
429
428 waste = jeb->free_size; 430 waste = jeb->free_size;
429 jffs2_link_node_ref(c, jeb, 431 jffs2_link_node_ref(c, jeb,
430 (jeb->offset + c->sector_size - waste) | REF_OBSOLETE, 432 (jeb->offset + c->sector_size - waste) | REF_OBSOLETE,
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 9d3afd157f99..dd7442c58358 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -119,9 +119,12 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
119 iattr->ia_size != i_size_read(inode)) { 119 iattr->ia_size != i_size_read(inode)) {
120 inode_dio_wait(inode); 120 inode_dio_wait(inode);
121 121
122 rc = vmtruncate(inode, iattr->ia_size); 122 rc = inode_newsize_ok(inode, iattr->ia_size);
123 if (rc) 123 if (rc)
124 return rc; 124 return rc;
125
126 truncate_setsize(inode, iattr->ia_size);
127 jfs_truncate(inode);
125 } 128 }
126 129
127 setattr_copy(inode, iattr); 130 setattr_copy(inode, iattr);
@@ -133,7 +136,6 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
133} 136}
134 137
135const struct inode_operations jfs_file_inode_operations = { 138const struct inode_operations jfs_file_inode_operations = {
136 .truncate = jfs_truncate,
137 .setxattr = jfs_setxattr, 139 .setxattr = jfs_setxattr,
138 .getxattr = jfs_getxattr, 140 .getxattr = jfs_getxattr,
139 .listxattr = jfs_listxattr, 141 .listxattr = jfs_listxattr,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 4692bf3ca8cb..b7dc47ba675e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -300,6 +300,16 @@ static int jfs_readpages(struct file *file, struct address_space *mapping,
300 return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); 300 return mpage_readpages(mapping, pages, nr_pages, jfs_get_block);
301} 301}
302 302
303static void jfs_write_failed(struct address_space *mapping, loff_t to)
304{
305 struct inode *inode = mapping->host;
306
307 if (to > inode->i_size) {
308 truncate_pagecache(inode, to, inode->i_size);
309 jfs_truncate(inode);
310 }
311}
312
303static int jfs_write_begin(struct file *file, struct address_space *mapping, 313static int jfs_write_begin(struct file *file, struct address_space *mapping,
304 loff_t pos, unsigned len, unsigned flags, 314 loff_t pos, unsigned len, unsigned flags,
305 struct page **pagep, void **fsdata) 315 struct page **pagep, void **fsdata)
@@ -308,11 +318,8 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
308 318
309 ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata, 319 ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
310 jfs_get_block); 320 jfs_get_block);
311 if (unlikely(ret)) { 321 if (unlikely(ret))
312 loff_t isize = mapping->host->i_size; 322 jfs_write_failed(mapping, pos + len);
313 if (pos + len > isize)
314 vmtruncate(mapping->host, isize);
315 }
316 323
317 return ret; 324 return ret;
318} 325}
@@ -326,6 +333,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
326 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 333 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
327{ 334{
328 struct file *file = iocb->ki_filp; 335 struct file *file = iocb->ki_filp;
336 struct address_space *mapping = file->f_mapping;
329 struct inode *inode = file->f_mapping->host; 337 struct inode *inode = file->f_mapping->host;
330 ssize_t ret; 338 ssize_t ret;
331 339
@@ -341,7 +349,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
341 loff_t end = offset + iov_length(iov, nr_segs); 349 loff_t end = offset + iov_length(iov, nr_segs);
342 350
343 if (end > isize) 351 if (end > isize)
344 vmtruncate(inode, isize); 352 jfs_write_failed(mapping, end);
345 } 353 }
346 354
347 return ret; 355 return ret;
diff --git a/fs/libfs.c b/fs/libfs.c
index 7cc37ca19cd8..916da8c4158b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -81,11 +81,11 @@ int dcache_dir_close(struct inode *inode, struct file *file)
81 return 0; 81 return 0;
82} 82}
83 83
84loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) 84loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
85{ 85{
86 struct dentry *dentry = file->f_path.dentry; 86 struct dentry *dentry = file->f_path.dentry;
87 mutex_lock(&dentry->d_inode->i_mutex); 87 mutex_lock(&dentry->d_inode->i_mutex);
88 switch (origin) { 88 switch (whence) {
89 case 1: 89 case 1:
90 offset += file->f_pos; 90 offset += file->f_pos;
91 case 0: 91 case 0:
@@ -369,8 +369,6 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr)
369 struct inode *inode = dentry->d_inode; 369 struct inode *inode = dentry->d_inode;
370 int error; 370 int error;
371 371
372 WARN_ON_ONCE(inode->i_op->truncate);
373
374 error = inode_change_ok(inode, iattr); 372 error = inode_change_ok(inode, iattr);
375 if (error) 373 if (error)
376 return error; 374 return error;
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 13ad1539fbf2..00ec0b9c94d1 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -64,10 +64,6 @@ static void nlm4_compute_offsets(const struct nlm_lock *lock,
64{ 64{
65 const struct file_lock *fl = &lock->fl; 65 const struct file_lock *fl = &lock->fl;
66 66
67 BUG_ON(fl->fl_start > NLM4_OFFSET_MAX);
68 BUG_ON(fl->fl_end > NLM4_OFFSET_MAX &&
69 fl->fl_end != OFFSET_MAX);
70
71 *l_offset = loff_t_to_s64(fl->fl_start); 67 *l_offset = loff_t_to_s64(fl->fl_start);
72 if (fl->fl_end == OFFSET_MAX) 68 if (fl->fl_end == OFFSET_MAX)
73 *l_len = 0; 69 *l_len = 0;
@@ -122,7 +118,6 @@ static void encode_netobj(struct xdr_stream *xdr,
122{ 118{
123 __be32 *p; 119 __be32 *p;
124 120
125 BUG_ON(length > XDR_MAX_NETOBJ);
126 p = xdr_reserve_space(xdr, 4 + length); 121 p = xdr_reserve_space(xdr, 4 + length);
127 xdr_encode_opaque(p, data, length); 122 xdr_encode_opaque(p, data, length);
128} 123}
@@ -156,7 +151,6 @@ out_overflow:
156static void encode_cookie(struct xdr_stream *xdr, 151static void encode_cookie(struct xdr_stream *xdr,
157 const struct nlm_cookie *cookie) 152 const struct nlm_cookie *cookie)
158{ 153{
159 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
160 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); 154 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
161} 155}
162 156
@@ -198,7 +192,6 @@ out_overflow:
198 */ 192 */
199static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) 193static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
200{ 194{
201 BUG_ON(fh->size > NFS3_FHSIZE);
202 encode_netobj(xdr, (u8 *)&fh->data, fh->size); 195 encode_netobj(xdr, (u8 *)&fh->data, fh->size);
203} 196}
204 197
@@ -336,7 +329,6 @@ static void encode_caller_name(struct xdr_stream *xdr, const char *name)
336 u32 length = strlen(name); 329 u32 length = strlen(name);
337 __be32 *p; 330 __be32 *p;
338 331
339 BUG_ON(length > NLM_MAXSTRLEN);
340 p = xdr_reserve_space(xdr, 4 + length); 332 p = xdr_reserve_space(xdr, 4 + length);
341 xdr_encode_opaque(p, name, length); 333 xdr_encode_opaque(p, name, length);
342} 334}
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 05d29124c6ab..54f9e6ce0430 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -141,7 +141,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
141 141
142static void nlmclnt_release_lockargs(struct nlm_rqst *req) 142static void nlmclnt_release_lockargs(struct nlm_rqst *req)
143{ 143{
144 BUG_ON(req->a_args.lock.fl.fl_ops != NULL); 144 WARN_ON_ONCE(req->a_args.lock.fl.fl_ops != NULL);
145} 145}
146 146
147/** 147/**
@@ -465,7 +465,6 @@ static const struct file_lock_operations nlmclnt_lock_ops = {
465 465
466static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host) 466static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host)
467{ 467{
468 BUG_ON(fl->fl_ops != NULL);
469 fl->fl_u.nfs_fl.state = 0; 468 fl->fl_u.nfs_fl.state = 0;
470 fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner); 469 fl->fl_u.nfs_fl.owner = nlm_find_lockowner(host, fl->fl_owner);
471 INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list); 470 INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list);
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 982d2676e1f8..9a55797a1cd4 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -60,10 +60,6 @@ static void nlm_compute_offsets(const struct nlm_lock *lock,
60{ 60{
61 const struct file_lock *fl = &lock->fl; 61 const struct file_lock *fl = &lock->fl;
62 62
63 BUG_ON(fl->fl_start > NLM_OFFSET_MAX);
64 BUG_ON(fl->fl_end > NLM_OFFSET_MAX &&
65 fl->fl_end != OFFSET_MAX);
66
67 *l_offset = loff_t_to_s32(fl->fl_start); 63 *l_offset = loff_t_to_s32(fl->fl_start);
68 if (fl->fl_end == OFFSET_MAX) 64 if (fl->fl_end == OFFSET_MAX)
69 *l_len = 0; 65 *l_len = 0;
@@ -119,7 +115,6 @@ static void encode_netobj(struct xdr_stream *xdr,
119{ 115{
120 __be32 *p; 116 __be32 *p;
121 117
122 BUG_ON(length > XDR_MAX_NETOBJ);
123 p = xdr_reserve_space(xdr, 4 + length); 118 p = xdr_reserve_space(xdr, 4 + length);
124 xdr_encode_opaque(p, data, length); 119 xdr_encode_opaque(p, data, length);
125} 120}
@@ -153,7 +148,6 @@ out_overflow:
153static void encode_cookie(struct xdr_stream *xdr, 148static void encode_cookie(struct xdr_stream *xdr,
154 const struct nlm_cookie *cookie) 149 const struct nlm_cookie *cookie)
155{ 150{
156 BUG_ON(cookie->len > NLM_MAXCOOKIELEN);
157 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len); 151 encode_netobj(xdr, (u8 *)&cookie->data, cookie->len);
158} 152}
159 153
@@ -195,7 +189,6 @@ out_overflow:
195 */ 189 */
196static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh) 190static void encode_fh(struct xdr_stream *xdr, const struct nfs_fh *fh)
197{ 191{
198 BUG_ON(fh->size != NFS2_FHSIZE);
199 encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE); 192 encode_netobj(xdr, (u8 *)&fh->data, NFS2_FHSIZE);
200} 193}
201 194
@@ -330,7 +323,6 @@ static void encode_caller_name(struct xdr_stream *xdr, const char *name)
330 u32 length = strlen(name); 323 u32 length = strlen(name);
331 __be32 *p; 324 __be32 *p;
332 325
333 BUG_ON(length > NLM_MAXSTRLEN);
334 p = xdr_reserve_space(xdr, 4 + length); 326 p = xdr_reserve_space(xdr, 4 + length);
335 xdr_encode_opaque(p, name, length); 327 xdr_encode_opaque(p, name, length);
336} 328}
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index f9b22e58f78f..0e17090c310f 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -177,9 +177,6 @@ static void nlm_destroy_host_locked(struct nlm_host *host)
177 177
178 dprintk("lockd: destroy host %s\n", host->h_name); 178 dprintk("lockd: destroy host %s\n", host->h_name);
179 179
180 BUG_ON(!list_empty(&host->h_lockowners));
181 BUG_ON(atomic_read(&host->h_count));
182
183 hlist_del_init(&host->h_hash); 180 hlist_del_init(&host->h_hash);
184 181
185 nsm_unmonitor(host); 182 nsm_unmonitor(host);
@@ -289,13 +286,12 @@ void nlmclnt_release_host(struct nlm_host *host)
289 286
290 dprintk("lockd: release client host %s\n", host->h_name); 287 dprintk("lockd: release client host %s\n", host->h_name);
291 288
292 BUG_ON(atomic_read(&host->h_count) < 0); 289 WARN_ON_ONCE(host->h_server);
293 BUG_ON(host->h_server);
294 290
295 if (atomic_dec_and_test(&host->h_count)) { 291 if (atomic_dec_and_test(&host->h_count)) {
296 BUG_ON(!list_empty(&host->h_lockowners)); 292 WARN_ON_ONCE(!list_empty(&host->h_lockowners));
297 BUG_ON(!list_empty(&host->h_granted)); 293 WARN_ON_ONCE(!list_empty(&host->h_granted));
298 BUG_ON(!list_empty(&host->h_reclaim)); 294 WARN_ON_ONCE(!list_empty(&host->h_reclaim));
299 295
300 mutex_lock(&nlm_host_mutex); 296 mutex_lock(&nlm_host_mutex);
301 nlm_destroy_host_locked(host); 297 nlm_destroy_host_locked(host);
@@ -412,8 +408,7 @@ void nlmsvc_release_host(struct nlm_host *host)
412 408
413 dprintk("lockd: release server host %s\n", host->h_name); 409 dprintk("lockd: release server host %s\n", host->h_name);
414 410
415 BUG_ON(atomic_read(&host->h_count) < 0); 411 WARN_ON_ONCE(!host->h_server);
416 BUG_ON(!host->h_server);
417 atomic_dec(&host->h_count); 412 atomic_dec(&host->h_count);
418} 413}
419 414
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 3d7e09bcc0e9..3c2cfc683631 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -154,8 +154,6 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
154 .rpc_resp = res, 154 .rpc_resp = res,
155 }; 155 };
156 156
157 BUG_ON(clnt == NULL);
158
159 memset(res, 0, sizeof(*res)); 157 memset(res, 0, sizeof(*res));
160 158
161 msg.rpc_proc = &clnt->cl_procinfo[proc]; 159 msg.rpc_proc = &clnt->cl_procinfo[proc];
@@ -466,7 +464,6 @@ static void encode_nsm_string(struct xdr_stream *xdr, const char *string)
466 const u32 len = strlen(string); 464 const u32 len = strlen(string);
467 __be32 *p; 465 __be32 *p;
468 466
469 BUG_ON(len > SM_MAXSTRLEN);
470 p = xdr_reserve_space(xdr, 4 + len); 467 p = xdr_reserve_space(xdr, 4 + len);
471 xdr_encode_opaque(p, string, len); 468 xdr_encode_opaque(p, string, len);
472} 469}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index adb90116d36b..af49e2d6941a 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -33,7 +33,7 @@
33 * are being written out - and waiting for GC to make progress, naturally. 33 * are being written out - and waiting for GC to make progress, naturally.
34 * 34 *
35 * So we cannot just call iget() or some variant of it, but first have to check 35 * So we cannot just call iget() or some variant of it, but first have to check
36 * wether the inode in question might be in I_FREEING state. Therefore we 36 * whether the inode in question might be in I_FREEING state. Therefore we
37 * maintain our own per-sb list of "almost deleted" inodes and check against 37 * maintain our own per-sb list of "almost deleted" inodes and check against
38 * that list first. Normally this should be at most 1-2 entries long. 38 * that list first. Normally this should be at most 1-2 entries long.
39 * 39 *
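The comment describes the guard this design implies: before iget(), consult the per-superblock list of "almost deleted" inodes so a lookup cannot resurrect an inode still in I_FREEING. A hypothetical sketch of such a check (all foo_* names and fields are invented for illustration; logfs's real list handling is not part of this hunk):

static struct inode *foo_safe_iget(struct super_block *sb, ino_t ino)
{
	struct foo_super *fs = sb->s_fs_info;
	struct foo_dying_inode *di;

	spin_lock(&fs->dying_lock);
	list_for_each_entry(di, &fs->dying_list, list) {
		if (di->ino == ino) {
			/* still being freed; caller must wait and retry */
			spin_unlock(&fs->dying_lock);
			return ERR_PTR(-EAGAIN);
		}
	}
	spin_unlock(&fs->dying_lock);
	return foo_iget(sb, ino);
}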
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index e1a3b6bf6324..9a59cbade2fb 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1887,9 +1887,15 @@ int logfs_truncate(struct inode *inode, u64 target)
1887 logfs_put_wblocks(sb, NULL, 1); 1887 logfs_put_wblocks(sb, NULL, 1);
1888 } 1888 }
1889 1889
1890 if (!err) 1890 if (!err) {
1891 err = vmtruncate(inode, target); 1891 err = inode_newsize_ok(inode, target);
1892 if (err)
1893 goto out;
1894
1895 truncate_setsize(inode, target);
1896 }
1892 1897
1898 out:
1893 /* I don't trust error recovery yet. */ 1899 /* I don't trust error recovery yet. */
1894 WARN_ON(err); 1900 WARN_ON(err);
1895 return err; 1901 return err;
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 4493ce695ab8..adc6f5494231 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -34,9 +34,12 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
34 34
35 if ((attr->ia_valid & ATTR_SIZE) && 35 if ((attr->ia_valid & ATTR_SIZE) &&
36 attr->ia_size != i_size_read(inode)) { 36 attr->ia_size != i_size_read(inode)) {
37 error = vmtruncate(inode, attr->ia_size); 37 error = inode_newsize_ok(inode, attr->ia_size);
38 if (error) 38 if (error)
39 return error; 39 return error;
40
41 truncate_setsize(inode, attr->ia_size);
42 minix_truncate(inode);
40 } 43 }
41 44
42 setattr_copy(inode, attr); 45 setattr_copy(inode, attr);
@@ -45,7 +48,6 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
45} 48}
46 49
47const struct inode_operations minix_file_inode_operations = { 50const struct inode_operations minix_file_inode_operations = {
48 .truncate = minix_truncate,
49 .setattr = minix_setattr, 51 .setattr = minix_setattr,
50 .getattr = minix_getattr, 52 .getattr = minix_getattr,
51}; 53};
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 4fc5f8ab1c44..99541cceb584 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -395,6 +395,16 @@ int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
395 return __block_write_begin(page, pos, len, minix_get_block); 395 return __block_write_begin(page, pos, len, minix_get_block);
396} 396}
397 397
398static void minix_write_failed(struct address_space *mapping, loff_t to)
399{
400 struct inode *inode = mapping->host;
401
402 if (to > inode->i_size) {
403 truncate_pagecache(inode, to, inode->i_size);
404 minix_truncate(inode);
405 }
406}
407
398static int minix_write_begin(struct file *file, struct address_space *mapping, 408static int minix_write_begin(struct file *file, struct address_space *mapping,
399 loff_t pos, unsigned len, unsigned flags, 409 loff_t pos, unsigned len, unsigned flags,
400 struct page **pagep, void **fsdata) 410 struct page **pagep, void **fsdata)
@@ -403,11 +413,8 @@ static int minix_write_begin(struct file *file, struct address_space *mapping,
403 413
404 ret = block_write_begin(mapping, pos, len, flags, pagep, 414 ret = block_write_begin(mapping, pos, len, flags, pagep,
405 minix_get_block); 415 minix_get_block);
406 if (unlikely(ret)) { 416 if (unlikely(ret))
407 loff_t isize = mapping->host->i_size; 417 minix_write_failed(mapping, pos + len);
408 if (pos + len > isize)
409 vmtruncate(mapping->host, isize);
410 }
411 418
412 return ret; 419 return ret;
413} 420}
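The minix conversion above shows the full shape of the ->truncate/vmtruncate removal this series applies across filesystems: size changes are validated with inode_newsize_ok(), applied with truncate_setsize(), and followed by the filesystem's own block-freeing helper, while a failed write_begin trims any pagecache instantiated past the old EOF. A minimal sketch of the pattern, with hypothetical foo_* names standing in for the per-filesystem helpers:

static void foo_write_failed(struct address_space *mapping, loff_t to)
{
	struct inode *inode = mapping->host;

	/* A short write may have instantiated pagecache past the old EOF. */
	if (to > inode->i_size) {
		truncate_pagecache(inode, to, inode->i_size);
		foo_truncate_blocks(inode);	/* fs-specific block freeing */
	}
}

static int foo_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error = inode_change_ok(inode, attr);

	if (error)
		return error;

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		error = inode_newsize_ok(inode, attr->ia_size);
		if (error)
			return error;
		/* Update i_size and shrink the pagecache, then free blocks. */
		truncate_setsize(inode, attr->ia_size);
		foo_truncate_blocks(inode);
	}

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}

With the ->truncate inode operation gone, nothing truncates blocks behind the filesystem's back; every truncation path is explicit in the filesystem itself.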
diff --git a/fs/mount.h b/fs/mount.h
index 4f291f9de641..cd5007980400 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -4,8 +4,11 @@
4 4
5struct mnt_namespace { 5struct mnt_namespace {
6 atomic_t count; 6 atomic_t count;
7 unsigned int proc_inum;
7 struct mount * root; 8 struct mount * root;
8 struct list_head list; 9 struct list_head list;
10 struct user_namespace *user_ns;
11 u64 seq; /* Sequence number to prevent loops */
9 wait_queue_head_t poll; 12 wait_queue_head_t poll;
10 int event; 13 int event;
11}; 14};
diff --git a/fs/namei.c b/fs/namei.c
index 5f4cdf3ad913..43a97ee1d4c8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1275,9 +1275,7 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
1275 *need_lookup = false; 1275 *need_lookup = false;
1276 dentry = d_lookup(dir, name); 1276 dentry = d_lookup(dir, name);
1277 if (dentry) { 1277 if (dentry) {
1278 if (d_need_lookup(dentry)) { 1278 if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1279 *need_lookup = true;
1280 } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1281 error = d_revalidate(dentry, flags); 1279 error = d_revalidate(dentry, flags);
1282 if (unlikely(error <= 0)) { 1280 if (unlikely(error <= 0)) {
1283 if (error < 0) { 1281 if (error < 0) {
@@ -1383,8 +1381,6 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name,
1383 return -ECHILD; 1381 return -ECHILD;
1384 nd->seq = seq; 1382 nd->seq = seq;
1385 1383
1386 if (unlikely(d_need_lookup(dentry)))
1387 goto unlazy;
1388 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { 1384 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1389 status = d_revalidate(dentry, nd->flags); 1385 status = d_revalidate(dentry, nd->flags);
1390 if (unlikely(status <= 0)) { 1386 if (unlikely(status <= 0)) {
@@ -1410,11 +1406,6 @@ unlazy:
1410 if (unlikely(!dentry)) 1406 if (unlikely(!dentry))
1411 goto need_lookup; 1407 goto need_lookup;
1412 1408
1413 if (unlikely(d_need_lookup(dentry))) {
1414 dput(dentry);
1415 goto need_lookup;
1416 }
1417
1418 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) 1409 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1419 status = d_revalidate(dentry, nd->flags); 1410 status = d_revalidate(dentry, nd->flags);
1420 if (unlikely(status <= 0)) { 1411 if (unlikely(status <= 0)) {
@@ -1859,7 +1850,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1859 if (flags & LOOKUP_ROOT) { 1850 if (flags & LOOKUP_ROOT) {
1860 struct inode *inode = nd->root.dentry->d_inode; 1851 struct inode *inode = nd->root.dentry->d_inode;
1861 if (*name) { 1852 if (*name) {
1862 if (!inode->i_op->lookup) 1853 if (!can_lookup(inode))
1863 return -ENOTDIR; 1854 return -ENOTDIR;
1864 retval = inode_permission(inode, MAY_EXEC); 1855 retval = inode_permission(inode, MAY_EXEC);
1865 if (retval) 1856 if (retval)
@@ -1903,6 +1894,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1903 get_fs_pwd(current->fs, &nd->path); 1894 get_fs_pwd(current->fs, &nd->path);
1904 } 1895 }
1905 } else { 1896 } else {
1897 /* Caller must check execute permissions on the starting path component */
1906 struct fd f = fdget_raw(dfd); 1898 struct fd f = fdget_raw(dfd);
1907 struct dentry *dentry; 1899 struct dentry *dentry;
1908 1900
@@ -1912,16 +1904,10 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1912 dentry = f.file->f_path.dentry; 1904 dentry = f.file->f_path.dentry;
1913 1905
1914 if (*name) { 1906 if (*name) {
1915 if (!S_ISDIR(dentry->d_inode->i_mode)) { 1907 if (!can_lookup(dentry->d_inode)) {
1916 fdput(f); 1908 fdput(f);
1917 return -ENOTDIR; 1909 return -ENOTDIR;
1918 } 1910 }
1919
1920 retval = inode_permission(dentry->d_inode, MAY_EXEC);
1921 if (retval) {
1922 fdput(f);
1923 return retval;
1924 }
1925 } 1911 }
1926 1912
1927 nd->path = f.file->f_path; 1913 nd->path = f.file->f_path;
@@ -2189,15 +2175,19 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
2189 * path-walking is complete. 2175 * path-walking is complete.
2190 */ 2176 */
2191static struct filename * 2177static struct filename *
2192user_path_parent(int dfd, const char __user *path, struct nameidata *nd) 2178user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
2179 unsigned int flags)
2193{ 2180{
2194 struct filename *s = getname(path); 2181 struct filename *s = getname(path);
2195 int error; 2182 int error;
2196 2183
2184 /* only LOOKUP_REVAL is allowed in extra flags */
2185 flags &= LOOKUP_REVAL;
2186
2197 if (IS_ERR(s)) 2187 if (IS_ERR(s))
2198 return s; 2188 return s;
2199 2189
2200 error = filename_lookup(dfd, s, LOOKUP_PARENT, nd); 2190 error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);
2201 if (error) { 2191 if (error) {
2202 putname(s); 2192 putname(s);
2203 return ERR_PTR(error); 2193 return ERR_PTR(error);
@@ -3044,12 +3034,22 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3044 return file; 3034 return file;
3045} 3035}
3046 3036
3047struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir) 3037struct dentry *kern_path_create(int dfd, const char *pathname,
3038 struct path *path, unsigned int lookup_flags)
3048{ 3039{
3049 struct dentry *dentry = ERR_PTR(-EEXIST); 3040 struct dentry *dentry = ERR_PTR(-EEXIST);
3050 struct nameidata nd; 3041 struct nameidata nd;
3051 int err2; 3042 int err2;
3052 int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); 3043 int error;
3044 bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
3045
3046 /*
3047 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
3048 * other flags passed in are ignored!
3049 */
3050 lookup_flags &= LOOKUP_REVAL;
3051
3052 error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd);
3053 if (error) 3053 if (error)
3054 return ERR_PTR(error); 3054 return ERR_PTR(error);
3055 3055
@@ -3113,13 +3113,14 @@ void done_path_create(struct path *path, struct dentry *dentry)
3113} 3113}
3114EXPORT_SYMBOL(done_path_create); 3114EXPORT_SYMBOL(done_path_create);
3115 3115
3116struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) 3116struct dentry *user_path_create(int dfd, const char __user *pathname,
3117 struct path *path, unsigned int lookup_flags)
3117{ 3118{
3118 struct filename *tmp = getname(pathname); 3119 struct filename *tmp = getname(pathname);
3119 struct dentry *res; 3120 struct dentry *res;
3120 if (IS_ERR(tmp)) 3121 if (IS_ERR(tmp))
3121 return ERR_CAST(tmp); 3122 return ERR_CAST(tmp);
3122 res = kern_path_create(dfd, tmp->name, path, is_dir); 3123 res = kern_path_create(dfd, tmp->name, path, lookup_flags);
3123 putname(tmp); 3124 putname(tmp);
3124 return res; 3125 return res;
3125} 3126}
@@ -3175,12 +3176,13 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
3175 struct dentry *dentry; 3176 struct dentry *dentry;
3176 struct path path; 3177 struct path path;
3177 int error; 3178 int error;
3179 unsigned int lookup_flags = 0;
3178 3180
3179 error = may_mknod(mode); 3181 error = may_mknod(mode);
3180 if (error) 3182 if (error)
3181 return error; 3183 return error;
3182 3184retry:
3183 dentry = user_path_create(dfd, filename, &path, 0); 3185 dentry = user_path_create(dfd, filename, &path, lookup_flags);
3184 if (IS_ERR(dentry)) 3186 if (IS_ERR(dentry))
3185 return PTR_ERR(dentry); 3187 return PTR_ERR(dentry);
3186 3188
@@ -3203,6 +3205,10 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
3203 } 3205 }
3204out: 3206out:
3205 done_path_create(&path, dentry); 3207 done_path_create(&path, dentry);
3208 if (retry_estale(error, lookup_flags)) {
3209 lookup_flags |= LOOKUP_REVAL;
3210 goto retry;
3211 }
3206 return error; 3212 return error;
3207} 3213}
3208 3214
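This mknodat change introduces the ESTALE retry scaffolding that the following hunks replicate in mkdirat, rmdir, unlinkat, symlinkat, linkat and renameat: perform the lookup without LOOKUP_REVAL first, and if the operation fails with ESTALE, retry exactly once with LOOKUP_REVAL set so every path component is revalidated against the server rather than trusted from the dcache. The gate is the retry_estale() helper added to include/linux/namei.h elsewhere in this series, roughly:

static inline bool retry_estale(const long error, const unsigned int flags)
{
	/*
	 * Retry only on ESTALE, and only if this was not already a
	 * LOOKUP_REVAL pass.
	 */
	return error == -ESTALE && !(flags & LOOKUP_REVAL);
}

Because each retry ORs LOOKUP_REVAL into the flags before jumping back, the predicate is false on the second pass, so every syscall retries at most once.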
@@ -3241,8 +3247,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
3241 struct dentry *dentry; 3247 struct dentry *dentry;
3242 struct path path; 3248 struct path path;
3243 int error; 3249 int error;
3250 unsigned int lookup_flags = LOOKUP_DIRECTORY;
3244 3251
3245 dentry = user_path_create(dfd, pathname, &path, 1); 3252retry:
3253 dentry = user_path_create(dfd, pathname, &path, lookup_flags);
3246 if (IS_ERR(dentry)) 3254 if (IS_ERR(dentry))
3247 return PTR_ERR(dentry); 3255 return PTR_ERR(dentry);
3248 3256
@@ -3252,6 +3260,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
3252 if (!error) 3260 if (!error)
3253 error = vfs_mkdir(path.dentry->d_inode, dentry, mode); 3261 error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
3254 done_path_create(&path, dentry); 3262 done_path_create(&path, dentry);
3263 if (retry_estale(error, lookup_flags)) {
3264 lookup_flags |= LOOKUP_REVAL;
3265 goto retry;
3266 }
3255 return error; 3267 return error;
3256} 3268}
3257 3269
@@ -3327,8 +3339,9 @@ static long do_rmdir(int dfd, const char __user *pathname)
3327 struct filename *name; 3339 struct filename *name;
3328 struct dentry *dentry; 3340 struct dentry *dentry;
3329 struct nameidata nd; 3341 struct nameidata nd;
3330 3342 unsigned int lookup_flags = 0;
3331 name = user_path_parent(dfd, pathname, &nd); 3343retry:
3344 name = user_path_parent(dfd, pathname, &nd, lookup_flags);
3332 if (IS_ERR(name)) 3345 if (IS_ERR(name))
3333 return PTR_ERR(name); 3346 return PTR_ERR(name);
3334 3347
@@ -3370,6 +3383,10 @@ exit2:
3370exit1: 3383exit1:
3371 path_put(&nd.path); 3384 path_put(&nd.path);
3372 putname(name); 3385 putname(name);
3386 if (retry_estale(error, lookup_flags)) {
3387 lookup_flags |= LOOKUP_REVAL;
3388 goto retry;
3389 }
3373 return error; 3390 return error;
3374} 3391}
3375 3392
@@ -3423,8 +3440,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
3423 struct dentry *dentry; 3440 struct dentry *dentry;
3424 struct nameidata nd; 3441 struct nameidata nd;
3425 struct inode *inode = NULL; 3442 struct inode *inode = NULL;
3426 3443 unsigned int lookup_flags = 0;
3427 name = user_path_parent(dfd, pathname, &nd); 3444retry:
3445 name = user_path_parent(dfd, pathname, &nd, lookup_flags);
3428 if (IS_ERR(name)) 3446 if (IS_ERR(name))
3429 return PTR_ERR(name); 3447 return PTR_ERR(name);
3430 3448
@@ -3462,6 +3480,11 @@ exit2:
3462exit1: 3480exit1:
3463 path_put(&nd.path); 3481 path_put(&nd.path);
3464 putname(name); 3482 putname(name);
3483 if (retry_estale(error, lookup_flags)) {
3484 lookup_flags |= LOOKUP_REVAL;
3485 inode = NULL;
3486 goto retry;
3487 }
3465 return error; 3488 return error;
3466 3489
3467slashes: 3490slashes:
@@ -3513,12 +3536,13 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
3513 struct filename *from; 3536 struct filename *from;
3514 struct dentry *dentry; 3537 struct dentry *dentry;
3515 struct path path; 3538 struct path path;
3539 unsigned int lookup_flags = 0;
3516 3540
3517 from = getname(oldname); 3541 from = getname(oldname);
3518 if (IS_ERR(from)) 3542 if (IS_ERR(from))
3519 return PTR_ERR(from); 3543 return PTR_ERR(from);
3520 3544retry:
3521 dentry = user_path_create(newdfd, newname, &path, 0); 3545 dentry = user_path_create(newdfd, newname, &path, lookup_flags);
3522 error = PTR_ERR(dentry); 3546 error = PTR_ERR(dentry);
3523 if (IS_ERR(dentry)) 3547 if (IS_ERR(dentry))
3524 goto out_putname; 3548 goto out_putname;
@@ -3527,6 +3551,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
3527 if (!error) 3551 if (!error)
3528 error = vfs_symlink(path.dentry->d_inode, dentry, from->name); 3552 error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
3529 done_path_create(&path, dentry); 3553 done_path_create(&path, dentry);
3554 if (retry_estale(error, lookup_flags)) {
3555 lookup_flags |= LOOKUP_REVAL;
3556 goto retry;
3557 }
3530out_putname: 3558out_putname:
3531 putname(from); 3559 putname(from);
3532 return error; 3560 return error;
@@ -3613,12 +3641,13 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3613 3641
3614 if (flags & AT_SYMLINK_FOLLOW) 3642 if (flags & AT_SYMLINK_FOLLOW)
3615 how |= LOOKUP_FOLLOW; 3643 how |= LOOKUP_FOLLOW;
3616 3644retry:
3617 error = user_path_at(olddfd, oldname, how, &old_path); 3645 error = user_path_at(olddfd, oldname, how, &old_path);
3618 if (error) 3646 if (error)
3619 return error; 3647 return error;
3620 3648
3621 new_dentry = user_path_create(newdfd, newname, &new_path, 0); 3649 new_dentry = user_path_create(newdfd, newname, &new_path,
3650 (how & LOOKUP_REVAL));
3622 error = PTR_ERR(new_dentry); 3651 error = PTR_ERR(new_dentry);
3623 if (IS_ERR(new_dentry)) 3652 if (IS_ERR(new_dentry))
3624 goto out; 3653 goto out;
@@ -3635,6 +3664,10 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3635 error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); 3664 error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
3636out_dput: 3665out_dput:
3637 done_path_create(&new_path, new_dentry); 3666 done_path_create(&new_path, new_dentry);
3667 if (retry_estale(error, how)) {
3668 how |= LOOKUP_REVAL;
3669 goto retry;
3670 }
3638out: 3671out:
3639 path_put(&old_path); 3672 path_put(&old_path);
3640 3673
@@ -3807,15 +3840,17 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
3807 struct nameidata oldnd, newnd; 3840 struct nameidata oldnd, newnd;
3808 struct filename *from; 3841 struct filename *from;
3809 struct filename *to; 3842 struct filename *to;
3843 unsigned int lookup_flags = 0;
3844 bool should_retry = false;
3810 int error; 3845 int error;
3811 3846retry:
3812 from = user_path_parent(olddfd, oldname, &oldnd); 3847 from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
3813 if (IS_ERR(from)) { 3848 if (IS_ERR(from)) {
3814 error = PTR_ERR(from); 3849 error = PTR_ERR(from);
3815 goto exit; 3850 goto exit;
3816 } 3851 }
3817 3852
3818 to = user_path_parent(newdfd, newname, &newnd); 3853 to = user_path_parent(newdfd, newname, &newnd, lookup_flags);
3819 if (IS_ERR(to)) { 3854 if (IS_ERR(to)) {
3820 error = PTR_ERR(to); 3855 error = PTR_ERR(to);
3821 goto exit1; 3856 goto exit1;
@@ -3887,11 +3922,18 @@ exit3:
3887 unlock_rename(new_dir, old_dir); 3922 unlock_rename(new_dir, old_dir);
3888 mnt_drop_write(oldnd.path.mnt); 3923 mnt_drop_write(oldnd.path.mnt);
3889exit2: 3924exit2:
3925 if (retry_estale(error, lookup_flags))
3926 should_retry = true;
3890 path_put(&newnd.path); 3927 path_put(&newnd.path);
3891 putname(to); 3928 putname(to);
3892exit1: 3929exit1:
3893 path_put(&oldnd.path); 3930 path_put(&oldnd.path);
3894 putname(from); 3931 putname(from);
3932 if (should_retry) {
3933 should_retry = false;
3934 lookup_flags |= LOOKUP_REVAL;
3935 goto retry;
3936 }
3895exit: 3937exit:
3896 return error; 3938 return error;
3897} 3939}
diff --git a/fs/namespace.c b/fs/namespace.c
index 24960626bb6b..55605c552787 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -12,6 +12,7 @@
12#include <linux/export.h> 12#include <linux/export.h>
13#include <linux/capability.h> 13#include <linux/capability.h>
14#include <linux/mnt_namespace.h> 14#include <linux/mnt_namespace.h>
15#include <linux/user_namespace.h>
15#include <linux/namei.h> 16#include <linux/namei.h>
16#include <linux/security.h> 17#include <linux/security.h>
17#include <linux/idr.h> 18#include <linux/idr.h>
@@ -20,6 +21,7 @@
20#include <linux/fs_struct.h> /* get_fs_root et.al. */ 21#include <linux/fs_struct.h> /* get_fs_root et.al. */
21#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 22#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
22#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/proc_fs.h>
23#include "pnode.h" 25#include "pnode.h"
24#include "internal.h" 26#include "internal.h"
25 27
@@ -311,7 +313,7 @@ int __mnt_want_write(struct vfsmount *m)
311 * incremented count after it has set MNT_WRITE_HOLD. 313 * incremented count after it has set MNT_WRITE_HOLD.
312 */ 314 */
313 smp_mb(); 315 smp_mb();
314 while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) 316 while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
315 cpu_relax(); 317 cpu_relax();
316 /* 318 /*
317 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will 319 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
@@ -784,7 +786,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
784 if (!mnt) 786 if (!mnt)
785 return ERR_PTR(-ENOMEM); 787 return ERR_PTR(-ENOMEM);
786 788
787 if (flag & (CL_SLAVE | CL_PRIVATE)) 789 if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
788 mnt->mnt_group_id = 0; /* not a peer of original */ 790 mnt->mnt_group_id = 0; /* not a peer of original */
789 else 791 else
790 mnt->mnt_group_id = old->mnt_group_id; 792 mnt->mnt_group_id = old->mnt_group_id;
@@ -805,7 +807,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
805 list_add_tail(&mnt->mnt_instance, &sb->s_mounts); 807 list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
806 br_write_unlock(&vfsmount_lock); 808 br_write_unlock(&vfsmount_lock);
807 809
808 if (flag & CL_SLAVE) { 810 if ((flag & CL_SLAVE) ||
811 ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
809 list_add(&mnt->mnt_slave, &old->mnt_slave_list); 812 list_add(&mnt->mnt_slave, &old->mnt_slave_list);
810 mnt->mnt_master = old; 813 mnt->mnt_master = old;
811 CLEAR_MNT_SHARED(mnt); 814 CLEAR_MNT_SHARED(mnt);
@@ -1266,7 +1269,7 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1266 goto dput_and_out; 1269 goto dput_and_out;
1267 1270
1268 retval = -EPERM; 1271 retval = -EPERM;
1269 if (!capable(CAP_SYS_ADMIN)) 1272 if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
1270 goto dput_and_out; 1273 goto dput_and_out;
1271 1274
1272 retval = do_umount(mnt, flags); 1275 retval = do_umount(mnt, flags);
@@ -1292,7 +1295,7 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
1292 1295
1293static int mount_is_safe(struct path *path) 1296static int mount_is_safe(struct path *path)
1294{ 1297{
1295 if (capable(CAP_SYS_ADMIN)) 1298 if (ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
1296 return 0; 1299 return 0;
1297 return -EPERM; 1300 return -EPERM;
1298#ifdef notyet 1301#ifdef notyet
@@ -1308,6 +1311,26 @@ static int mount_is_safe(struct path *path)
1308#endif 1311#endif
1309} 1312}
1310 1313
1314static bool mnt_ns_loop(struct path *path)
1315{
1316 /* Could bind mounting the mount namespace inode cause a
1317 * mount namespace loop?
1318 */
1319 struct inode *inode = path->dentry->d_inode;
1320 struct proc_inode *ei;
1321 struct mnt_namespace *mnt_ns;
1322
1323 if (!proc_ns_inode(inode))
1324 return false;
1325
1326 ei = PROC_I(inode);
1327 if (ei->ns_ops != &mntns_operations)
1328 return false;
1329
1330 mnt_ns = ei->ns;
1331 return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1332}
1333
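Since every namespace is stamped with a strictly increasing seq at creation, current->nsproxy->mnt_ns->seq >= mnt_ns->seq holds exactly when the namespace being bind-mounted is the current one or an older one. Refusing those cases means a namespace can only ever hold references to strictly newer namespaces, so a reference cycle between mount namespaces cannot form.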
1311struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, 1334struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1312 int flag) 1335 int flag)
1313{ 1336{
@@ -1610,7 +1633,7 @@ static int do_change_type(struct path *path, int flag)
1610 int type; 1633 int type;
1611 int err = 0; 1634 int err = 0;
1612 1635
1613 if (!capable(CAP_SYS_ADMIN)) 1636 if (!ns_capable(mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
1614 return -EPERM; 1637 return -EPERM;
1615 1638
1616 if (path->dentry != path->mnt->mnt_root) 1639 if (path->dentry != path->mnt->mnt_root)
@@ -1655,6 +1678,10 @@ static int do_loopback(struct path *path, const char *old_name,
1655 if (err) 1678 if (err)
1656 return err; 1679 return err;
1657 1680
1681 err = -EINVAL;
1682 if (mnt_ns_loop(&old_path))
1683 goto out;
1684
1658 err = lock_mount(path); 1685 err = lock_mount(path);
1659 if (err) 1686 if (err)
1660 goto out; 1687 goto out;
@@ -1770,7 +1797,7 @@ static int do_move_mount(struct path *path, const char *old_name)
1770 struct mount *p; 1797 struct mount *p;
1771 struct mount *old; 1798 struct mount *old;
1772 int err = 0; 1799 int err = 0;
1773 if (!capable(CAP_SYS_ADMIN)) 1800 if (!ns_capable(real_mount(path->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN))
1774 return -EPERM; 1801 return -EPERM;
1775 if (!old_name || !*old_name) 1802 if (!old_name || !*old_name)
1776 return -EINVAL; 1803 return -EINVAL;
@@ -1857,21 +1884,6 @@ static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1857 return ERR_PTR(err); 1884 return ERR_PTR(err);
1858} 1885}
1859 1886
1860static struct vfsmount *
1861do_kern_mount(const char *fstype, int flags, const char *name, void *data)
1862{
1863 struct file_system_type *type = get_fs_type(fstype);
1864 struct vfsmount *mnt;
1865 if (!type)
1866 return ERR_PTR(-ENODEV);
1867 mnt = vfs_kern_mount(type, flags, name, data);
1868 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1869 !mnt->mnt_sb->s_subtype)
1870 mnt = fs_set_subtype(mnt, fstype);
1871 put_filesystem(type);
1872 return mnt;
1873}
1874
1875/* 1887/*
1876 * add a mount into a namespace's mount tree 1888 * add a mount into a namespace's mount tree
1877 */ 1889 */
@@ -1917,20 +1929,46 @@ unlock:
1917 * create a new mount for userspace and request it to be added into the 1929 * create a new mount for userspace and request it to be added into the
1918 * namespace's tree 1930 * namespace's tree
1919 */ 1931 */
1920static int do_new_mount(struct path *path, const char *type, int flags, 1932static int do_new_mount(struct path *path, const char *fstype, int flags,
1921 int mnt_flags, const char *name, void *data) 1933 int mnt_flags, const char *name, void *data)
1922{ 1934{
1935 struct file_system_type *type;
1936 struct user_namespace *user_ns;
1923 struct vfsmount *mnt; 1937 struct vfsmount *mnt;
1924 int err; 1938 int err;
1925 1939
1926 if (!type) 1940 if (!fstype)
1927 return -EINVAL; 1941 return -EINVAL;
1928 1942
1929 /* we need capabilities... */ 1943 /* we need capabilities... */
1930 if (!capable(CAP_SYS_ADMIN)) 1944 user_ns = real_mount(path->mnt)->mnt_ns->user_ns;
1945 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
1931 return -EPERM; 1946 return -EPERM;
1932 1947
1933 mnt = do_kern_mount(type, flags, name, data); 1948 type = get_fs_type(fstype);
1949 if (!type)
1950 return -ENODEV;
1951
1952 if (user_ns != &init_user_ns) {
1953 if (!(type->fs_flags & FS_USERNS_MOUNT)) {
1954 put_filesystem(type);
1955 return -EPERM;
1956 }
1957 /* Only in special cases allow devices from mounts
1958 * created outside the initial user namespace.
1959 */
1960 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
1961 flags |= MS_NODEV;
1962 mnt_flags |= MNT_NODEV;
1963 }
1964 }
1965
1966 mnt = vfs_kern_mount(type, flags, name, data);
1967 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1968 !mnt->mnt_sb->s_subtype)
1969 mnt = fs_set_subtype(mnt, fstype);
1970
1971 put_filesystem(type);
1934 if (IS_ERR(mnt)) 1972 if (IS_ERR(mnt))
1935 return PTR_ERR(mnt); 1973 return PTR_ERR(mnt);
1936 1974
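The net effect: a filesystem is mountable from a non-initial user namespace only if it opts in with FS_USERNS_MOUNT, and unless it additionally sets FS_USERNS_DEV_MOUNT the mount is forced nodev, so an unprivileged namespace cannot use it to gain access to device nodes. Folding the old do_kern_mount() body in here is what makes room for these checks between get_fs_type() and vfs_kern_mount().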
@@ -2261,18 +2299,42 @@ dput_out:
2261 return retval; 2299 return retval;
2262} 2300}
2263 2301
2264static struct mnt_namespace *alloc_mnt_ns(void) 2302static void free_mnt_ns(struct mnt_namespace *ns)
2303{
2304 proc_free_inum(ns->proc_inum);
2305 put_user_ns(ns->user_ns);
2306 kfree(ns);
2307}
2308
2309/*
2310 * Assign a sequence number so we can detect when we attempt to bind
2311 * mount a reference to an older mount namespace into the current
2312 * mount namespace, preventing reference counting loops. A 64bit
 2313 * counter incrementing even at 10GHz would take about 58 years to wrap,
 2314 * and real creation rates are far slower, so we can ignore wrap-around.
2315 */
2316static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
2317
2318static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
2265{ 2319{
2266 struct mnt_namespace *new_ns; 2320 struct mnt_namespace *new_ns;
2321 int ret;
2267 2322
2268 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); 2323 new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
2269 if (!new_ns) 2324 if (!new_ns)
2270 return ERR_PTR(-ENOMEM); 2325 return ERR_PTR(-ENOMEM);
2326 ret = proc_alloc_inum(&new_ns->proc_inum);
2327 if (ret) {
2328 kfree(new_ns);
2329 return ERR_PTR(ret);
2330 }
2331 new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
2271 atomic_set(&new_ns->count, 1); 2332 atomic_set(&new_ns->count, 1);
2272 new_ns->root = NULL; 2333 new_ns->root = NULL;
2273 INIT_LIST_HEAD(&new_ns->list); 2334 INIT_LIST_HEAD(&new_ns->list);
2274 init_waitqueue_head(&new_ns->poll); 2335 init_waitqueue_head(&new_ns->poll);
2275 new_ns->event = 0; 2336 new_ns->event = 0;
2337 new_ns->user_ns = get_user_ns(user_ns);
2276 return new_ns; 2338 return new_ns;
2277} 2339}
2278 2340
@@ -2281,24 +2343,28 @@ static struct mnt_namespace *alloc_mnt_ns(void)
2281 * copied from the namespace of the passed in task structure. 2343 * copied from the namespace of the passed in task structure.
2282 */ 2344 */
2283static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, 2345static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2284 struct fs_struct *fs) 2346 struct user_namespace *user_ns, struct fs_struct *fs)
2285{ 2347{
2286 struct mnt_namespace *new_ns; 2348 struct mnt_namespace *new_ns;
2287 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; 2349 struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
2288 struct mount *p, *q; 2350 struct mount *p, *q;
2289 struct mount *old = mnt_ns->root; 2351 struct mount *old = mnt_ns->root;
2290 struct mount *new; 2352 struct mount *new;
2353 int copy_flags;
2291 2354
2292 new_ns = alloc_mnt_ns(); 2355 new_ns = alloc_mnt_ns(user_ns);
2293 if (IS_ERR(new_ns)) 2356 if (IS_ERR(new_ns))
2294 return new_ns; 2357 return new_ns;
2295 2358
2296 down_write(&namespace_sem); 2359 down_write(&namespace_sem);
2297 /* First pass: copy the tree topology */ 2360 /* First pass: copy the tree topology */
2298 new = copy_tree(old, old->mnt.mnt_root, CL_COPY_ALL | CL_EXPIRE); 2361 copy_flags = CL_COPY_ALL | CL_EXPIRE;
2362 if (user_ns != mnt_ns->user_ns)
2363 copy_flags |= CL_SHARED_TO_SLAVE;
2364 new = copy_tree(old, old->mnt.mnt_root, copy_flags);
2299 if (IS_ERR(new)) { 2365 if (IS_ERR(new)) {
2300 up_write(&namespace_sem); 2366 up_write(&namespace_sem);
2301 kfree(new_ns); 2367 free_mnt_ns(new_ns);
2302 return ERR_CAST(new); 2368 return ERR_CAST(new);
2303 } 2369 }
2304 new_ns->root = new; 2370 new_ns->root = new;
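When the copy crosses into a different user namespace, CL_SHARED_TO_SLAVE downgrades shared mounts to slaves in the clone (see the clone_mnt() hunk above): mount events still propagate from the original namespace into the new one, but nothing the less-privileged namespace mounts can propagate back.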
@@ -2339,7 +2405,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
2339} 2405}
2340 2406
2341struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, 2407struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2342 struct fs_struct *new_fs) 2408 struct user_namespace *user_ns, struct fs_struct *new_fs)
2343{ 2409{
2344 struct mnt_namespace *new_ns; 2410 struct mnt_namespace *new_ns;
2345 2411
@@ -2349,7 +2415,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2349 if (!(flags & CLONE_NEWNS)) 2415 if (!(flags & CLONE_NEWNS))
2350 return ns; 2416 return ns;
2351 2417
2352 new_ns = dup_mnt_ns(ns, new_fs); 2418 new_ns = dup_mnt_ns(ns, user_ns, new_fs);
2353 2419
2354 put_mnt_ns(ns); 2420 put_mnt_ns(ns);
2355 return new_ns; 2421 return new_ns;
@@ -2361,7 +2427,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2361 */ 2427 */
2362static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) 2428static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
2363{ 2429{
2364 struct mnt_namespace *new_ns = alloc_mnt_ns(); 2430 struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
2365 if (!IS_ERR(new_ns)) { 2431 if (!IS_ERR(new_ns)) {
2366 struct mount *mnt = real_mount(m); 2432 struct mount *mnt = real_mount(m);
2367 mnt->mnt_ns = new_ns; 2433 mnt->mnt_ns = new_ns;
@@ -2501,7 +2567,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2501 struct mount *new_mnt, *root_mnt; 2567 struct mount *new_mnt, *root_mnt;
2502 int error; 2568 int error;
2503 2569
2504 if (!capable(CAP_SYS_ADMIN)) 2570 if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN))
2505 return -EPERM; 2571 return -EPERM;
2506 2572
2507 error = user_path_dir(new_root, &new); 2573 error = user_path_dir(new_root, &new);
@@ -2583,8 +2649,13 @@ static void __init init_mount_tree(void)
2583 struct vfsmount *mnt; 2649 struct vfsmount *mnt;
2584 struct mnt_namespace *ns; 2650 struct mnt_namespace *ns;
2585 struct path root; 2651 struct path root;
2652 struct file_system_type *type;
2586 2653
2587 mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); 2654 type = get_fs_type("rootfs");
2655 if (!type)
2656 panic("Can't find rootfs type");
2657 mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
2658 put_filesystem(type);
2588 if (IS_ERR(mnt)) 2659 if (IS_ERR(mnt))
2589 panic("Can't create rootfs"); 2660 panic("Can't create rootfs");
2590 2661
@@ -2647,7 +2718,7 @@ void put_mnt_ns(struct mnt_namespace *ns)
2647 br_write_unlock(&vfsmount_lock); 2718 br_write_unlock(&vfsmount_lock);
2648 up_write(&namespace_sem); 2719 up_write(&namespace_sem);
2649 release_mounts(&umount_list); 2720 release_mounts(&umount_list);
2650 kfree(ns); 2721 free_mnt_ns(ns);
2651} 2722}
2652 2723
2653struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) 2724struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
@@ -2681,3 +2752,72 @@ bool our_mnt(struct vfsmount *mnt)
2681{ 2752{
2682 return check_mnt(real_mount(mnt)); 2753 return check_mnt(real_mount(mnt));
2683} 2754}
2755
2756static void *mntns_get(struct task_struct *task)
2757{
2758 struct mnt_namespace *ns = NULL;
2759 struct nsproxy *nsproxy;
2760
2761 rcu_read_lock();
2762 nsproxy = task_nsproxy(task);
2763 if (nsproxy) {
2764 ns = nsproxy->mnt_ns;
2765 get_mnt_ns(ns);
2766 }
2767 rcu_read_unlock();
2768
2769 return ns;
2770}
2771
2772static void mntns_put(void *ns)
2773{
2774 put_mnt_ns(ns);
2775}
2776
2777static int mntns_install(struct nsproxy *nsproxy, void *ns)
2778{
2779 struct fs_struct *fs = current->fs;
2780 struct mnt_namespace *mnt_ns = ns;
2781 struct path root;
2782
2783 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
2784 !nsown_capable(CAP_SYS_CHROOT) ||
2785 !nsown_capable(CAP_SYS_ADMIN))
2786 return -EPERM;
2787
2788 if (fs->users != 1)
2789 return -EINVAL;
2790
2791 get_mnt_ns(mnt_ns);
2792 put_mnt_ns(nsproxy->mnt_ns);
2793 nsproxy->mnt_ns = mnt_ns;
2794
2795 /* Find the root */
2796 root.mnt = &mnt_ns->root->mnt;
2797 root.dentry = mnt_ns->root->mnt.mnt_root;
2798 path_get(&root);
 2799 while (d_mountpoint(root.dentry) && follow_down_one(&root))
2800 ;
2801
2802 /* Update the pwd and root */
2803 set_fs_pwd(fs, &root);
2804 set_fs_root(fs, &root);
2805
2806 path_put(&root);
2807 return 0;
2808}
2809
2810static unsigned int mntns_inum(void *ns)
2811{
2812 struct mnt_namespace *mnt_ns = ns;
2813 return mnt_ns->proc_inum;
2814}
2815
2816const struct proc_ns_operations mntns_operations = {
2817 .name = "mnt",
2818 .type = CLONE_NEWNS,
2819 .get = mntns_get,
2820 .put = mntns_put,
2821 .install = mntns_install,
2822 .inum = mntns_inum,
2823};
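With mntns_operations registered, /proc/<pid>/ns/mnt becomes a handle that setns(2) can consume; mntns_install() above is what runs in the kernel on that call, requiring CAP_SYS_ADMIN over the target namespace and an unshared fs_struct (fs->users == 1), then rebinding the caller's root and cwd to the target namespace's root. A minimal userspace sketch (illustrative, not from the patch):

/* enter-mntns.c: join the mount namespace of <pid> (needs privilege) */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[64];
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	snprintf(path, sizeof(path), "/proc/%s/ns/mnt", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0 || setns(fd, CLONE_NEWNS) < 0) {
		perror("setns");
		return 1;
	}
	/* from here on, the target namespace's mount tree is visible */
	execlp("sh", "sh", NULL);
	perror("execlp");
	return 1;
}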
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d7e9fe77188a..1acdad7fcec7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -976,9 +976,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
976 goto out; 976 goto out;
977 977
978 if (attr->ia_size != i_size_read(inode)) { 978 if (attr->ia_size != i_size_read(inode)) {
979 result = vmtruncate(inode, attr->ia_size); 979 truncate_setsize(inode, attr->ia_size);
980 if (result)
981 goto out;
982 mark_inode_dirty(inode); 980 mark_inode_dirty(inode);
983 } 981 }
984 } 982 }
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index be20a7e171a0..63d14a99483d 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -89,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
89 /* 89 /*
90 * If I understand ncp_read_kernel() properly, the above always 90 * If I understand ncp_read_kernel() properly, the above always
91 * fetches from the network, here the analogue of disk. 91 * fetches from the network, here the analogue of disk.
92 * -- wli 92 * -- nyc
93 */ 93 */
94 count_vm_event(PGMAJFAULT); 94 count_vm_event(PGMAJFAULT);
95 mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); 95 mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT);
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index b7db60897f91..cce2c057bd2d 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -24,7 +24,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
24 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ 24 delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
25 nfs4namespace.o nfs4getroot.o nfs4client.o 25 nfs4namespace.o nfs4getroot.o nfs4client.o
26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 26nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
27nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o 27nfsv4-$(CONFIG_NFS_V4_1) += nfs4session.o pnfs.o pnfs_dev.o
28 28
29obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o 29obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
30nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o 30nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index f1027b06a1a9..4fa788c93f46 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -40,6 +40,7 @@
40#include <linux/pagevec.h> 40#include <linux/pagevec.h>
41 41
42#include "../pnfs.h" 42#include "../pnfs.h"
43#include "../nfs4session.h"
43#include "../internal.h" 44#include "../internal.h"
44#include "blocklayout.h" 45#include "blocklayout.h"
45 46
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index dded26368111..862a2f16db64 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -118,7 +118,6 @@ int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd)
118 struct dentry *dir; 118 struct dentry *dir;
119 119
120 dir = rpc_d_lookup_sb(sb, "cache"); 120 dir = rpc_d_lookup_sb(sb, "cache");
121 BUG_ON(dir == NULL);
122 ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd); 121 ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd);
123 dput(dir); 122 dput(dir);
124 return ret; 123 return ret;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 4251c2ae06ad..efd54f0a4c46 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -142,7 +142,7 @@ extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
142 142
143struct cb_recallslotargs { 143struct cb_recallslotargs {
144 struct sockaddr *crsa_addr; 144 struct sockaddr *crsa_addr;
145 uint32_t crsa_target_max_slots; 145 uint32_t crsa_target_highest_slotid;
146}; 146};
147extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, 147extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
148 void *dummy, 148 void *dummy,
@@ -167,8 +167,6 @@ extern __be32 nfs4_callback_layoutrecall(
167 struct cb_layoutrecallargs *args, 167 struct cb_layoutrecallargs *args,
168 void *dummy, struct cb_process_state *cps); 168 void *dummy, struct cb_process_state *cps);
169 169
170extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
171
172struct cb_devicenotifyitem { 170struct cb_devicenotifyitem {
173 uint32_t cbd_notify_type; 171 uint32_t cbd_notify_type;
174 uint32_t cbd_layout_type; 172 uint32_t cbd_layout_type;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 76b4a7a3e559..264d1aa935f2 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -14,6 +14,7 @@
14#include "delegation.h" 14#include "delegation.h"
15#include "internal.h" 15#include "internal.h"
16#include "pnfs.h" 16#include "pnfs.h"
17#include "nfs4session.h"
17 18
18#ifdef NFS_DEBUG 19#ifdef NFS_DEBUG
19#define NFSDBG_FACILITY NFSDBG_CALLBACK 20#define NFSDBG_FACILITY NFSDBG_CALLBACK
@@ -205,7 +206,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
205 206
206 list_for_each_entry(lo, &server->layouts, plh_layouts) { 207 list_for_each_entry(lo, &server->layouts, plh_layouts) {
207 ino = igrab(lo->plh_inode); 208 ino = igrab(lo->plh_inode);
208 if (ino) 209 if (!ino)
209 continue; 210 continue;
210 spin_lock(&ino->i_lock); 211 spin_lock(&ino->i_lock);
211 /* Is this layout in the process of being freed? */ 212 /* Is this layout in the process of being freed? */
@@ -216,7 +217,6 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
216 } 217 }
217 pnfs_get_layout_hdr(lo); 218 pnfs_get_layout_hdr(lo);
218 spin_unlock(&ino->i_lock); 219 spin_unlock(&ino->i_lock);
219 BUG_ON(!list_empty(&lo->plh_bulk_recall));
220 list_add(&lo->plh_bulk_recall, &recall_list); 220 list_add(&lo->plh_bulk_recall, &recall_list);
221 } 221 }
222 } 222 }
@@ -562,23 +562,16 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
562 if (!cps->clp) /* set in cb_sequence */ 562 if (!cps->clp) /* set in cb_sequence */
563 goto out; 563 goto out;
564 564
565 dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target max slots %d\n", 565 dprintk_rcu("NFS: CB_RECALL_SLOT request from %s target highest slotid %d\n",
566 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR), 566 rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR),
567 args->crsa_target_max_slots); 567 args->crsa_target_highest_slotid);
568 568
569 fc_tbl = &cps->clp->cl_session->fc_slot_table; 569 fc_tbl = &cps->clp->cl_session->fc_slot_table;
570 570
571 status = htonl(NFS4ERR_BAD_HIGH_SLOT);
572 if (args->crsa_target_max_slots > fc_tbl->max_slots ||
573 args->crsa_target_max_slots < 1)
574 goto out;
575
576 status = htonl(NFS4_OK); 571 status = htonl(NFS4_OK);
577 if (args->crsa_target_max_slots == fc_tbl->max_slots)
578 goto out;
579 572
580 fc_tbl->target_max_slots = args->crsa_target_max_slots; 573 nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
581 nfs41_handle_recall_slot(cps->clp); 574 nfs41_server_notify_target_slotid_update(cps->clp);
582out: 575out:
583 dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); 576 dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
584 return status; 577 return status;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 742ff4ffced7..59461c957d9d 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -16,6 +16,7 @@
16#include "nfs4_fs.h" 16#include "nfs4_fs.h"
17#include "callback.h" 17#include "callback.h"
18#include "internal.h" 18#include "internal.h"
19#include "nfs4session.h"
19 20
20#define CB_OP_TAGLEN_MAXSZ (512) 21#define CB_OP_TAGLEN_MAXSZ (512)
21#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ) 22#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
@@ -520,7 +521,7 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
520 p = read_buf(xdr, 4); 521 p = read_buf(xdr, 4);
521 if (unlikely(p == NULL)) 522 if (unlikely(p == NULL))
522 return htonl(NFS4ERR_BADXDR); 523 return htonl(NFS4ERR_BADXDR);
523 args->crsa_target_max_slots = ntohl(*p++); 524 args->crsa_target_highest_slotid = ntohl(*p++);
524 return 0; 525 return 0;
525} 526}
526 527
@@ -762,7 +763,7 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
762 * A single slot, so highest used slotid is either 0 or -1 763 * A single slot, so highest used slotid is either 0 or -1
763 */ 764 */
764 tbl->highest_used_slotid = NFS4_NO_SLOT; 765 tbl->highest_used_slotid = NFS4_NO_SLOT;
765 nfs4_check_drain_bc_complete(session); 766 nfs4_session_drain_complete(session, tbl);
766 spin_unlock(&tbl->slot_tbl_lock); 767 spin_unlock(&tbl->slot_tbl_lock);
767} 768}
768 769
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8b39a42ac35e..9f3c66438d0e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -277,7 +277,7 @@ void nfs_put_client(struct nfs_client *clp)
277 nfs_cb_idr_remove_locked(clp); 277 nfs_cb_idr_remove_locked(clp);
278 spin_unlock(&nn->nfs_client_lock); 278 spin_unlock(&nn->nfs_client_lock);
279 279
280 BUG_ON(!list_empty(&clp->cl_superblocks)); 280 WARN_ON_ONCE(!list_empty(&clp->cl_superblocks));
281 281
282 clp->rpc_ops->free_client(clp); 282 clp->rpc_ops->free_client(clp);
283 } 283 }
@@ -615,8 +615,7 @@ EXPORT_SYMBOL_GPL(nfs_create_rpc_client);
615 */ 615 */
616static void nfs_destroy_server(struct nfs_server *server) 616static void nfs_destroy_server(struct nfs_server *server)
617{ 617{
618 if (!(server->flags & NFS_MOUNT_LOCAL_FLOCK) || 618 if (server->nlm_host)
619 !(server->flags & NFS_MOUNT_LOCAL_FCNTL))
620 nlmclnt_done(server->nlm_host); 619 nlmclnt_done(server->nlm_host);
621} 620}
622 621
@@ -1061,10 +1060,6 @@ struct nfs_server *nfs_create_server(struct nfs_mount_info *mount_info,
1061 if (error < 0) 1060 if (error < 0)
1062 goto error; 1061 goto error;
1063 1062
1064 BUG_ON(!server->nfs_client);
1065 BUG_ON(!server->nfs_client->rpc_ops);
1066 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1067
1068 /* Probe the root fh to retrieve its FSID */ 1063 /* Probe the root fh to retrieve its FSID */
1069 error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr); 1064 error = nfs_probe_fsinfo(server, mount_info->mntfh, fattr);
1070 if (error < 0) 1065 if (error < 0)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b9e66b7e0c14..1b2d7eb93796 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -871,7 +871,7 @@ out:
871 return res; 871 return res;
872} 872}
873 873
874static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) 874static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
875{ 875{
876 struct dentry *dentry = filp->f_path.dentry; 876 struct dentry *dentry = filp->f_path.dentry;
877 struct inode *inode = dentry->d_inode; 877 struct inode *inode = dentry->d_inode;
@@ -880,10 +880,10 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
880 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", 880 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
881 dentry->d_parent->d_name.name, 881 dentry->d_parent->d_name.name,
882 dentry->d_name.name, 882 dentry->d_name.name,
883 offset, origin); 883 offset, whence);
884 884
885 mutex_lock(&inode->i_mutex); 885 mutex_lock(&inode->i_mutex);
886 switch (origin) { 886 switch (whence) {
887 case 1: 887 case 1:
888 offset += filp->f_pos; 888 offset += filp->f_pos;
889 case 0: 889 case 0:
@@ -979,10 +979,11 @@ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)
979 * particular file and the "nocto" mount flag is not set. 979 * particular file and the "nocto" mount flag is not set.
980 * 980 *
981 */ 981 */
982static inline 982static
983int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags) 983int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
984{ 984{
985 struct nfs_server *server = NFS_SERVER(inode); 985 struct nfs_server *server = NFS_SERVER(inode);
986 int ret;
986 987
987 if (IS_AUTOMOUNT(inode)) 988 if (IS_AUTOMOUNT(inode))
988 return 0; 989 return 0;
@@ -993,9 +994,13 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
993 if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) && 994 if ((flags & LOOKUP_OPEN) && !(server->flags & NFS_MOUNT_NOCTO) &&
994 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) 995 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
995 goto out_force; 996 goto out_force;
996 return 0; 997out:
998 return (inode->i_nlink == 0) ? -ENOENT : 0;
997out_force: 999out_force:
998 return __nfs_revalidate_inode(server, inode); 1000 ret = __nfs_revalidate_inode(server, inode);
1001 if (ret != 0)
1002 return ret;
1003 goto out;
999} 1004}
1000 1005
1001/* 1006/*
@@ -1156,11 +1161,14 @@ static int nfs_dentry_delete(const struct dentry *dentry)
1156 1161
1157} 1162}
1158 1163
1164/* Ensure that we revalidate inode->i_nlink */
1159static void nfs_drop_nlink(struct inode *inode) 1165static void nfs_drop_nlink(struct inode *inode)
1160{ 1166{
1161 spin_lock(&inode->i_lock); 1167 spin_lock(&inode->i_lock);
1162 if (inode->i_nlink > 0) 1168 /* drop the inode if we're reasonably sure this is the last link */
1163 drop_nlink(inode); 1169 if (inode->i_nlink == 1)
1170 clear_nlink(inode);
1171 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
1164 spin_unlock(&inode->i_lock); 1172 spin_unlock(&inode->i_lock);
1165} 1173}
1166 1174
@@ -1175,8 +1183,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
1175 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; 1183 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
1176 1184
1177 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 1185 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
1178 drop_nlink(inode);
1179 nfs_complete_unlink(dentry, inode); 1186 nfs_complete_unlink(dentry, inode);
1187 nfs_drop_nlink(inode);
1180 } 1188 }
1181 iput(inode); 1189 iput(inode);
1182} 1190}
@@ -1647,10 +1655,8 @@ static int nfs_safe_remove(struct dentry *dentry)
1647 if (inode != NULL) { 1655 if (inode != NULL) {
1648 NFS_PROTO(inode)->return_delegation(inode); 1656 NFS_PROTO(inode)->return_delegation(inode);
1649 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1657 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1650 /* The VFS may want to delete this inode */
1651 if (error == 0) 1658 if (error == 0)
1652 nfs_drop_nlink(inode); 1659 nfs_drop_nlink(inode);
1653 nfs_mark_for_revalidate(inode);
1654 } else 1660 } else
1655 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1661 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
1656 if (error == -ENOENT) 1662 if (error == -ENOENT)
@@ -2147,12 +2153,16 @@ static int nfs_open_permission_mask(int openflags)
2147{ 2153{
2148 int mask = 0; 2154 int mask = 0;
2149 2155
2150 if ((openflags & O_ACCMODE) != O_WRONLY) 2156 if (openflags & __FMODE_EXEC) {
2151 mask |= MAY_READ; 2157 /* ONLY check exec rights */
2152 if ((openflags & O_ACCMODE) != O_RDONLY) 2158 mask = MAY_EXEC;
2153 mask |= MAY_WRITE; 2159 } else {
2154 if (openflags & __FMODE_EXEC) 2160 if ((openflags & O_ACCMODE) != O_WRONLY)
2155 mask |= MAY_EXEC; 2161 mask |= MAY_READ;
2162 if ((openflags & O_ACCMODE) != O_RDONLY)
2163 mask |= MAY_WRITE;
2164 }
2165
2156 return mask; 2166 return mask;
2157} 2167}
2158 2168
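Worked through the cases: O_RDONLY maps to MAY_READ, O_WRONLY to MAY_WRITE, O_RDWR to MAY_READ|MAY_WRITE, but an open carrying __FMODE_EXEC (an exec of a file on NFS) now asks the server for MAY_EXEC alone rather than MAY_READ|MAY_EXEC, so a mode-0111 executable that the user cannot read can still be executed.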
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index cae26cbd59ee..0bd7a55a5f07 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -266,21 +266,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
266 struct nfs_page *req = nfs_list_entry(hdr->pages.next); 266 struct nfs_page *req = nfs_list_entry(hdr->pages.next);
267 struct page *page = req->wb_page; 267 struct page *page = req->wb_page;
268 268
269 if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { 269 if (!PageCompound(page) && bytes < hdr->good_bytes)
270 if (bytes > hdr->good_bytes) 270 set_page_dirty(page);
271 zero_user(page, 0, PAGE_SIZE);
272 else if (hdr->good_bytes - bytes < PAGE_SIZE)
273 zero_user_segment(page,
274 hdr->good_bytes & ~PAGE_MASK,
275 PAGE_SIZE);
276 }
277 if (!PageCompound(page)) {
278 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
279 if (bytes < hdr->good_bytes)
280 set_page_dirty(page);
281 } else
282 set_page_dirty(page);
283 }
284 bytes += req->wb_bytes; 271 bytes += req->wb_bytes;
285 nfs_list_remove_request(req); 272 nfs_list_remove_request(req);
286 nfs_direct_readpage_release(req); 273 nfs_direct_readpage_release(req);
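With the NFS_IOHDR_EOF zero-filling moved out of the O_DIRECT completion (the generic read path handles short reads after this series), all that remains here is dirtying pages the server actually filled. The surviving condition, as a standalone sketch:

    #include <stdbool.h>

    struct toy_page { bool compound; bool dirty; };

    /* sketch: dirty the page only if it is an ordinary page and the
     * request starts inside the byte range the server really returned */
    static void maybe_dirty(struct toy_page *page,
                            unsigned long bytes, unsigned long good_bytes)
    {
        if (!page->compound && bytes < good_bytes)
            page->dirty = true;
    }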
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 582bb8866131..3c2b893665ba 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -119,18 +119,18 @@ force_reval:
119 return __nfs_revalidate_inode(server, inode); 119 return __nfs_revalidate_inode(server, inode);
120} 120}
121 121
122loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) 122loff_t nfs_file_llseek(struct file *filp, loff_t offset, int whence)
123{ 123{
124 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", 124 dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
125 filp->f_path.dentry->d_parent->d_name.name, 125 filp->f_path.dentry->d_parent->d_name.name,
126 filp->f_path.dentry->d_name.name, 126 filp->f_path.dentry->d_name.name,
127 offset, origin); 127 offset, whence);
128 128
129 /* 129 /*
130 * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate 130 * whence == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate
131 * the cached file length 131 * the cached file length
132 */ 132 */
133 if (origin != SEEK_SET && origin != SEEK_CUR) { 133 if (whence != SEEK_SET && whence != SEEK_CUR) {
134 struct inode *inode = filp->f_mapping->host; 134 struct inode *inode = filp->f_mapping->host;
135 135
136 int retval = nfs_revalidate_file_size(inode, filp); 136 int retval = nfs_revalidate_file_size(inode, filp);
@@ -138,7 +138,7 @@ loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
138 return (loff_t)retval; 138 return (loff_t)retval;
139 } 139 }
140 140
141 return generic_file_llseek(filp, offset, origin); 141 return generic_file_llseek(filp, offset, whence);
142} 142}
143EXPORT_SYMBOL_GPL(nfs_file_llseek); 143EXPORT_SYMBOL_GPL(nfs_file_llseek);
144 144
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index c817787fbdb4..24d1d1c5fcaf 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -307,6 +307,7 @@ void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
307 nfs_fscache_inode_unlock(inode); 307 nfs_fscache_inode_unlock(inode);
308 } 308 }
309} 309}
310EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie);
310 311
311/* 312/*
312 * Replace a per-inode cookie due to revalidation detecting a file having 313 * Replace a per-inode cookie due to revalidation detecting a file having
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index c5b11b53ff33..4ecb76652eba 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -153,6 +153,22 @@ static inline void nfs_readpage_to_fscache(struct inode *inode,
153} 153}
154 154
155/* 155/*
156 * Invalidate the contents of fscache for this inode. This will not sleep.
157 */
158static inline void nfs_fscache_invalidate(struct inode *inode)
159{
160 fscache_invalidate(NFS_I(inode)->fscache);
161}
162
163/*
164 * Wait for an object to finish being invalidated.
165 */
166static inline void nfs_fscache_wait_on_invalidate(struct inode *inode)
167{
168 fscache_wait_on_invalidate(NFS_I(inode)->fscache);
169}
170
171/*
156 * indicate the client caching state as readable text 172 * indicate the client caching state as readable text
157 */ 173 */
158static inline const char *nfs_server_fscache_state(struct nfs_server *server) 174static inline const char *nfs_server_fscache_state(struct nfs_server *server)
@@ -162,7 +178,6 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server)
162 return "no "; 178 return "no ";
163} 179}
164 180
165
166#else /* CONFIG_NFS_FSCACHE */ 181#else /* CONFIG_NFS_FSCACHE */
167static inline int nfs_fscache_register(void) { return 0; } 182static inline int nfs_fscache_register(void) { return 0; }
168static inline void nfs_fscache_unregister(void) {} 183static inline void nfs_fscache_unregister(void) {}
@@ -205,6 +220,10 @@ static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
205static inline void nfs_readpage_to_fscache(struct inode *inode, 220static inline void nfs_readpage_to_fscache(struct inode *inode,
206 struct page *page, int sync) {} 221 struct page *page, int sync) {}
207 222
223
224static inline void nfs_fscache_invalidate(struct inode *inode) {}
225static inline void nfs_fscache_wait_on_invalidate(struct inode *inode) {}
226
208static inline const char *nfs_server_fscache_state(struct nfs_server *server) 227static inline const char *nfs_server_fscache_state(struct nfs_server *server)
209{ 228{
210 return "no "; 229 return "no ";
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 9cc4a3fbf4b0..bc3968fa81e5 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -193,19 +193,15 @@ static int nfs_idmap_init_keyring(void)
193 if (!cred) 193 if (!cred)
194 return -ENOMEM; 194 return -ENOMEM;
195 195
196 keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, 196 keyring = keyring_alloc(".id_resolver", 0, 0, cred,
197 (KEY_POS_ALL & ~KEY_POS_SETATTR) | 197 (KEY_POS_ALL & ~KEY_POS_SETATTR) |
198 KEY_USR_VIEW | KEY_USR_READ, 198 KEY_USR_VIEW | KEY_USR_READ,
199 KEY_ALLOC_NOT_IN_QUOTA); 199 KEY_ALLOC_NOT_IN_QUOTA, NULL);
200 if (IS_ERR(keyring)) { 200 if (IS_ERR(keyring)) {
201 ret = PTR_ERR(keyring); 201 ret = PTR_ERR(keyring);
202 goto failed_put_cred; 202 goto failed_put_cred;
203 } 203 }
204 204
205 ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
206 if (ret < 0)
207 goto failed_put_key;
208
209 ret = register_key_type(&key_type_id_resolver); 205 ret = register_key_type(&key_type_id_resolver);
210 if (ret < 0) 206 if (ret < 0)
211 goto failed_put_key; 207 goto failed_put_key;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6fa01aea2488..ebeb94ce1b0b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -107,13 +107,19 @@ u64 nfs_compat_user_ino64(u64 fileid)
107 return ino; 107 return ino;
108} 108}
109 109
110int nfs_drop_inode(struct inode *inode)
111{
112 return NFS_STALE(inode) || generic_drop_inode(inode);
113}
114EXPORT_SYMBOL_GPL(nfs_drop_inode);
115
110void nfs_clear_inode(struct inode *inode) 116void nfs_clear_inode(struct inode *inode)
111{ 117{
112 /* 118 /*
113 * The following should never happen... 119 * The following should never happen...
114 */ 120 */
115 BUG_ON(nfs_have_writebacks(inode)); 121 WARN_ON_ONCE(nfs_have_writebacks(inode));
116 BUG_ON(!list_empty(&NFS_I(inode)->open_files)); 122 WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files));
117 nfs_zap_acl_cache(inode); 123 nfs_zap_acl_cache(inode);
118 nfs_access_zap_cache(inode); 124 nfs_access_zap_cache(inode);
119 nfs_fscache_release_inode_cookie(inode); 125 nfs_fscache_release_inode_cookie(inode);
@@ -155,10 +161,12 @@ static void nfs_zap_caches_locked(struct inode *inode)
155 nfsi->attrtimeo_timestamp = jiffies; 161 nfsi->attrtimeo_timestamp = jiffies;
156 162
157 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); 163 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
158 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) 164 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
159 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; 165 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
160 else 166 nfs_fscache_invalidate(inode);
167 } else {
161 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; 168 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
169 }
162} 170}
163 171
164void nfs_zap_caches(struct inode *inode) 172void nfs_zap_caches(struct inode *inode)
@@ -173,6 +181,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
173 if (mapping->nrpages != 0) { 181 if (mapping->nrpages != 0) {
174 spin_lock(&inode->i_lock); 182 spin_lock(&inode->i_lock);
175 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; 183 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
184 nfs_fscache_invalidate(inode);
176 spin_unlock(&inode->i_lock); 185 spin_unlock(&inode->i_lock);
177 } 186 }
178} 187}
@@ -875,7 +884,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
875 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 884 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
876 spin_unlock(&inode->i_lock); 885 spin_unlock(&inode->i_lock);
877 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 886 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
878 nfs_fscache_reset_inode_cookie(inode); 887 nfs_fscache_wait_on_invalidate(inode);
879 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", 888 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
880 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 889 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
881 return 0; 890 return 0;
@@ -951,6 +960,10 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
951 i_size_write(inode, nfs_size_to_loff_t(fattr->size)); 960 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
952 ret |= NFS_INO_INVALID_ATTR; 961 ret |= NFS_INO_INVALID_ATTR;
953 } 962 }
963
964 if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
965 nfs_fscache_invalidate(inode);
966
954 return ret; 967 return ret;
955} 968}
956 969
@@ -1199,8 +1212,10 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
1199 struct nfs_inode *nfsi = NFS_I(inode); 1212 struct nfs_inode *nfsi = NFS_I(inode);
1200 1213
1201 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1214 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
1202 if (S_ISDIR(inode->i_mode)) 1215 if (S_ISDIR(inode->i_mode)) {
1203 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 1216 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
1217 nfs_fscache_invalidate(inode);
1218 }
1204 if ((fattr->valid & NFS_ATTR_FATTR) == 0) 1219 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
1205 return 0; 1220 return 0;
1206 return nfs_refresh_inode_locked(inode, fattr); 1221 return nfs_refresh_inode_locked(inode, fattr);
@@ -1488,6 +1503,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1488 (save_cache_validity & NFS_INO_REVAL_FORCED)) 1503 (save_cache_validity & NFS_INO_REVAL_FORCED))
1489 nfsi->cache_validity |= invalid; 1504 nfsi->cache_validity |= invalid;
1490 1505
1506 if (invalid & NFS_INO_INVALID_DATA)
1507 nfs_fscache_invalidate(inode);
1508
1491 return 0; 1509 return 0;
1492 out_err: 1510 out_err:
1493 /* 1511 /*
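Each place in inode.c that raises NFS_INO_INVALID_DATA now also punches out the FS-Cache object, so the persistent cache can never outlive the page cache it mirrors. The invariant, as a toy sketch:

    #define INO_INVALID_DATA 0x1

    struct toy_inode {
        unsigned int cache_validity;
        unsigned long fscache_gen;    /* stands in for the FS-Cache object */
    };

    /* sketch of the invariant this patch enforces: page cache and
     * disk cache are invalidated together, never separately */
    static void mark_data_invalid(struct toy_inode *inode)
    {
        inode->cache_validity |= INO_INVALID_DATA;
        inode->fscache_gen++;    /* nfs_fscache_invalidate() stand-in */
    }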
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 05521cadac2e..f0e6c7df1a07 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -18,27 +18,6 @@ struct nfs_string;
18 */ 18 */
19#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) 19#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1)
20 20
21/*
22 * Determine if sessions are in use.
23 */
24static inline int nfs4_has_session(const struct nfs_client *clp)
25{
26#ifdef CONFIG_NFS_V4_1
27 if (clp->cl_session)
28 return 1;
29#endif /* CONFIG_NFS_V4_1 */
30 return 0;
31}
32
33static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
34{
35#ifdef CONFIG_NFS_V4_1
36 if (nfs4_has_session(clp))
37 return (clp->cl_session->flags & SESSION4_PERSIST);
38#endif /* CONFIG_NFS_V4_1 */
39 return 0;
40}
41
42static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) 21static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
43{ 22{
44 if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) 23 if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
@@ -276,8 +255,6 @@ extern const u32 nfs41_maxwrite_overhead;
276extern struct rpc_procinfo nfs4_procedures[]; 255extern struct rpc_procinfo nfs4_procedures[];
277#endif 256#endif
278 257
279extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
280
281/* proc.c */ 258/* proc.c */
282void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 259void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
283extern struct nfs_client *nfs_init_client(struct nfs_client *clp, 260extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
@@ -319,6 +296,7 @@ extern struct workqueue_struct *nfsiod_workqueue;
319extern struct inode *nfs_alloc_inode(struct super_block *sb); 296extern struct inode *nfs_alloc_inode(struct super_block *sb);
320extern void nfs_destroy_inode(struct inode *); 297extern void nfs_destroy_inode(struct inode *);
321extern int nfs_write_inode(struct inode *, struct writeback_control *); 298extern int nfs_write_inode(struct inode *, struct writeback_control *);
299extern int nfs_drop_inode(struct inode *);
322extern void nfs_clear_inode(struct inode *); 300extern void nfs_clear_inode(struct inode *);
323extern void nfs_evict_inode(struct inode *); 301extern void nfs_evict_inode(struct inode *);
324void nfs_zap_acl_cache(struct inode *inode); 302void nfs_zap_acl_cache(struct inode *inode);
@@ -386,9 +364,6 @@ extern int nfs_initiate_read(struct rpc_clnt *clnt,
386extern void nfs_read_prepare(struct rpc_task *task, void *calldata); 364extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
387extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, 365extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
388 struct nfs_pgio_header *hdr); 366 struct nfs_pgio_header *hdr);
389extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
390 struct inode *inode,
391 const struct nfs_pgio_completion_ops *compl_ops);
392extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); 367extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
393extern void nfs_readdata_release(struct nfs_read_data *rdata); 368extern void nfs_readdata_release(struct nfs_read_data *rdata);
394 369
@@ -411,9 +386,6 @@ extern struct nfs_write_header *nfs_writehdr_alloc(void);
411extern void nfs_writehdr_free(struct nfs_pgio_header *hdr); 386extern void nfs_writehdr_free(struct nfs_pgio_header *hdr);
412extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, 387extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
413 struct nfs_pgio_header *hdr); 388 struct nfs_pgio_header *hdr);
414extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
415 struct inode *inode, int ioflags,
416 const struct nfs_pgio_completion_ops *compl_ops);
417extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); 389extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
418extern void nfs_writedata_release(struct nfs_write_data *wdata); 390extern void nfs_writedata_release(struct nfs_write_data *wdata);
419extern void nfs_commit_free(struct nfs_commit_data *p); 391extern void nfs_commit_free(struct nfs_commit_data *p);
@@ -474,18 +446,6 @@ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
474 const struct rpc_timeout *timeparms, 446 const struct rpc_timeout *timeparms,
475 const char *ip_addr, 447 const char *ip_addr,
476 rpc_authflavor_t authflavour); 448 rpc_authflavor_t authflavour);
477extern int _nfs4_call_sync(struct rpc_clnt *clnt,
478 struct nfs_server *server,
479 struct rpc_message *msg,
480 struct nfs4_sequence_args *args,
481 struct nfs4_sequence_res *res,
482 int cache_reply);
483extern int _nfs4_call_sync_session(struct rpc_clnt *clnt,
484 struct nfs_server *server,
485 struct rpc_message *msg,
486 struct nfs4_sequence_args *args,
487 struct nfs4_sequence_res *res,
488 int cache_reply);
489extern int nfs40_walk_client_list(struct nfs_client *clp, 449extern int nfs40_walk_client_list(struct nfs_client *clp,
490 struct nfs_client **result, 450 struct nfs_client **result,
491 struct rpc_cred *cred); 451 struct rpc_cred *cred);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 015f71f8f62c..91a6faf811ac 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -169,6 +169,9 @@ int nfs_mount(struct nfs_mount_request *info)
169 (info->hostname ? info->hostname : "server"), 169 (info->hostname ? info->hostname : "server"),
170 info->dirpath); 170 info->dirpath);
171 171
172 if (strlen(info->dirpath) > MNTPATHLEN)
173 return -ENAMETOOLONG;
174
172 if (info->noresvport) 175 if (info->noresvport)
173 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 176 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
174 177
@@ -242,6 +245,9 @@ void nfs_umount(const struct nfs_mount_request *info)
242 struct rpc_clnt *clnt; 245 struct rpc_clnt *clnt;
243 int status; 246 int status;
244 247
248 if (strlen(info->dirpath) > MNTPATHLEN)
249 return;
250
245 if (info->noresvport) 251 if (info->noresvport)
246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 252 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
247 253
@@ -283,7 +289,6 @@ static void encode_mntdirpath(struct xdr_stream *xdr, const char *pathname)
283 const u32 pathname_len = strlen(pathname); 289 const u32 pathname_len = strlen(pathname);
284 __be32 *p; 290 __be32 *p;
285 291
286 BUG_ON(pathname_len > MNTPATHLEN);
287 p = xdr_reserve_space(xdr, 4 + pathname_len); 292 p = xdr_reserve_space(xdr, 4 + pathname_len);
288 xdr_encode_opaque(p, pathname, pathname_len); 293 xdr_encode_opaque(p, pathname, pathname_len);
289} 294}
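Rather than BUG() deep inside the XDR encoder, the MNTPATHLEN check moves to the two entry points: nfs_mount() fails with -ENAMETOOLONG and nfs_umount() quietly returns, so an oversized export path can no longer crash the box. A sketch of validating at the boundary (MNTPATHLEN is 1024 in the mount protocol):

    #include <errno.h>
    #include <string.h>

    #define MNTPATHLEN 1024    /* per the MNT protocol, RFC 1813 */

    /* sketch: validate at the entry point, not in the encoder */
    static int do_mount(const char *dirpath)
    {
        if (strlen(dirpath) > MNTPATHLEN)
            return -ENAMETOOLONG;
        /* ... build and send the MNT call ... */
        return 0;
    }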
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index dd057bc6b65b..fc8dc20fdeb9 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -177,11 +177,31 @@ out_nofree:
177 return mnt; 177 return mnt;
178} 178}
179 179
180static int
181nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
182{
183 if (NFS_FH(dentry->d_inode)->size != 0)
184 return nfs_getattr(mnt, dentry, stat);
185 generic_fillattr(dentry->d_inode, stat);
186 return 0;
187}
188
189static int
190nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr)
191{
192 if (NFS_FH(dentry->d_inode)->size != 0)
193 return nfs_setattr(dentry, attr);
194 return -EACCES;
195}
196
180const struct inode_operations nfs_mountpoint_inode_operations = { 197const struct inode_operations nfs_mountpoint_inode_operations = {
181 .getattr = nfs_getattr, 198 .getattr = nfs_getattr,
199 .setattr = nfs_setattr,
182}; 200};
183 201
184const struct inode_operations nfs_referral_inode_operations = { 202const struct inode_operations nfs_referral_inode_operations = {
203 .getattr = nfs_namespace_getattr,
204 .setattr = nfs_namespace_setattr,
185}; 205};
186 206
187static void nfs_expire_automounts(struct work_struct *work) 207static void nfs_expire_automounts(struct work_struct *work)
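Referral inodes carry an empty filehandle until the referral is actually crossed, so the new nfs_namespace_getattr()/nfs_namespace_setattr() wrappers key off NFS_FH(inode)->size: with a real handle they forward to the ordinary NFS operations, otherwise getattr is satisfied from the cached inode and setattr is refused. A stripped-down sketch of that dispatch (the stat/iattr plumbing is omitted):

    #include <errno.h>

    struct fh { unsigned int size; };

    /* sketch: only talk to the server when a real filehandle exists */
    static int referral_getattr(struct fh *fh)
    {
        if (fh->size != 0)
            return 0;    /* would call nfs_getattr() here */
        /* otherwise fill from the cached inode (generic_fillattr()) */
        return 0;
    }

    static int referral_setattr(struct fh *fh)
    {
        if (fh->size != 0)
            return 0;    /* would call nfs_setattr() here */
        return -EACCES;  /* nothing server-side to change yet */
    }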
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index d04f0df7be55..06b9df49f7f7 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -195,7 +195,6 @@ static void encode_fhandle(struct xdr_stream *xdr, const struct nfs_fh *fh)
195{ 195{
196 __be32 *p; 196 __be32 *p;
197 197
198 BUG_ON(fh->size != NFS2_FHSIZE);
199 p = xdr_reserve_space(xdr, NFS2_FHSIZE); 198 p = xdr_reserve_space(xdr, NFS2_FHSIZE);
200 memcpy(p, fh->data, NFS2_FHSIZE); 199 memcpy(p, fh->data, NFS2_FHSIZE);
201} 200}
@@ -388,7 +387,7 @@ static void encode_filename(struct xdr_stream *xdr,
388{ 387{
389 __be32 *p; 388 __be32 *p;
390 389
391 BUG_ON(length > NFS2_MAXNAMLEN); 390 WARN_ON_ONCE(length > NFS2_MAXNAMLEN);
392 p = xdr_reserve_space(xdr, 4 + length); 391 p = xdr_reserve_space(xdr, 4 + length);
393 xdr_encode_opaque(p, name, length); 392 xdr_encode_opaque(p, name, length);
394} 393}
@@ -428,7 +427,6 @@ static void encode_path(struct xdr_stream *xdr, struct page **pages, u32 length)
428{ 427{
429 __be32 *p; 428 __be32 *p;
430 429
431 BUG_ON(length > NFS2_MAXPATHLEN);
432 p = xdr_reserve_space(xdr, 4); 430 p = xdr_reserve_space(xdr, 4);
433 *p = cpu_to_be32(length); 431 *p = cpu_to_be32(length);
434 xdr_write_pages(xdr, pages, 0, length); 432 xdr_write_pages(xdr, pages, 0, length);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 69322096c325..70efb63b1e42 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -24,14 +24,14 @@
24 24
25#define NFSDBG_FACILITY NFSDBG_PROC 25#define NFSDBG_FACILITY NFSDBG_PROC
26 26
 27/* A wrapper to handle the EJUKEBOX and EKEYEXPIRED error messages */ 27/* A wrapper to handle the EJUKEBOX error message */

28static int 28static int
29nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) 29nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
30{ 30{
31 int res; 31 int res;
32 do { 32 do {
33 res = rpc_call_sync(clnt, msg, flags); 33 res = rpc_call_sync(clnt, msg, flags);
34 if (res != -EJUKEBOX && res != -EKEYEXPIRED) 34 if (res != -EJUKEBOX)
35 break; 35 break;
36 freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME); 36 freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
37 res = -ERESTARTSYS; 37 res = -ERESTARTSYS;
@@ -44,7 +44,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
44static int 44static int
45nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) 45nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode)
46{ 46{
47 if (task->tk_status != -EJUKEBOX && task->tk_status != -EKEYEXPIRED) 47 if (task->tk_status != -EJUKEBOX)
48 return 0; 48 return 0;
49 if (task->tk_status == -EJUKEBOX) 49 if (task->tk_status == -EJUKEBOX)
50 nfs_inc_stats(inode, NFSIOS_DELAY); 50 nfs_inc_stats(inode, NFSIOS_DELAY);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 6cbe89400dfc..bffc32406fbf 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -198,7 +198,7 @@ static void encode_filename3(struct xdr_stream *xdr,
198{ 198{
199 __be32 *p; 199 __be32 *p;
200 200
201 BUG_ON(length > NFS3_MAXNAMLEN); 201 WARN_ON_ONCE(length > NFS3_MAXNAMLEN);
202 p = xdr_reserve_space(xdr, 4 + length); 202 p = xdr_reserve_space(xdr, 4 + length);
203 xdr_encode_opaque(p, name, length); 203 xdr_encode_opaque(p, name, length);
204} 204}
@@ -238,7 +238,6 @@ out_overflow:
238static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages, 238static void encode_nfspath3(struct xdr_stream *xdr, struct page **pages,
239 const u32 length) 239 const u32 length)
240{ 240{
241 BUG_ON(length > NFS3_MAXPATHLEN);
242 encode_uint32(xdr, length); 241 encode_uint32(xdr, length);
243 xdr_write_pages(xdr, pages, 0, length); 242 xdr_write_pages(xdr, pages, 0, length);
244} 243}
@@ -388,7 +387,6 @@ out_overflow:
388 */ 387 */
389static void encode_ftype3(struct xdr_stream *xdr, const u32 type) 388static void encode_ftype3(struct xdr_stream *xdr, const u32 type)
390{ 389{
391 BUG_ON(type > NF3FIFO);
392 encode_uint32(xdr, type); 390 encode_uint32(xdr, type);
393} 391}
394 392
@@ -443,7 +441,7 @@ static void encode_nfs_fh3(struct xdr_stream *xdr, const struct nfs_fh *fh)
443{ 441{
444 __be32 *p; 442 __be32 *p;
445 443
446 BUG_ON(fh->size > NFS3_FHSIZE); 444 WARN_ON_ONCE(fh->size > NFS3_FHSIZE);
447 p = xdr_reserve_space(xdr, 4 + fh->size); 445 p = xdr_reserve_space(xdr, 4 + fh->size);
448 xdr_encode_opaque(p, fh->data, fh->size); 446 xdr_encode_opaque(p, fh->data, fh->size);
449} 447}
@@ -1339,6 +1337,7 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
1339 error = nfsacl_encode(xdr->buf, base, args->inode, 1337 error = nfsacl_encode(xdr->buf, base, args->inode,
1340 (args->mask & NFS_ACL) ? 1338 (args->mask & NFS_ACL) ?
1341 args->acl_access : NULL, 1, 0); 1339 args->acl_access : NULL, 1, 0);
1340 /* FIXME: this is just broken */
1342 BUG_ON(error < 0); 1341 BUG_ON(error < 0);
1343 error = nfsacl_encode(xdr->buf, base + error, args->inode, 1342 error = nfsacl_encode(xdr->buf, base + error, args->inode,
1344 (args->mask & NFS_DFACL) ? 1343 (args->mask & NFS_DFACL) ?
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a525fdefccde..a3f488b074a2 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -11,6 +11,8 @@
11 11
12#if IS_ENABLED(CONFIG_NFS_V4) 12#if IS_ENABLED(CONFIG_NFS_V4)
13 13
14#define NFS4_MAX_LOOP_ON_RECOVER (10)
15
14struct idmap; 16struct idmap;
15 17
16enum nfs4_client_state { 18enum nfs4_client_state {
@@ -21,18 +23,12 @@ enum nfs4_client_state {
21 NFS4CLNT_RECLAIM_NOGRACE, 23 NFS4CLNT_RECLAIM_NOGRACE,
22 NFS4CLNT_DELEGRETURN, 24 NFS4CLNT_DELEGRETURN,
23 NFS4CLNT_SESSION_RESET, 25 NFS4CLNT_SESSION_RESET,
24 NFS4CLNT_RECALL_SLOT,
25 NFS4CLNT_LEASE_CONFIRM, 26 NFS4CLNT_LEASE_CONFIRM,
26 NFS4CLNT_SERVER_SCOPE_MISMATCH, 27 NFS4CLNT_SERVER_SCOPE_MISMATCH,
27 NFS4CLNT_PURGE_STATE, 28 NFS4CLNT_PURGE_STATE,
28 NFS4CLNT_BIND_CONN_TO_SESSION, 29 NFS4CLNT_BIND_CONN_TO_SESSION,
29}; 30};
30 31
31enum nfs4_session_state {
32 NFS4_SESSION_INITING,
33 NFS4_SESSION_DRAINING,
34};
35
36#define NFS4_RENEW_TIMEOUT 0x01 32#define NFS4_RENEW_TIMEOUT 0x01
37#define NFS4_RENEW_DELEGATION_CB 0x02 33#define NFS4_RENEW_DELEGATION_CB 0x02
38 34
@@ -43,8 +39,7 @@ struct nfs4_minor_version_ops {
43 struct nfs_server *server, 39 struct nfs_server *server,
44 struct rpc_message *msg, 40 struct rpc_message *msg,
45 struct nfs4_sequence_args *args, 41 struct nfs4_sequence_args *args,
46 struct nfs4_sequence_res *res, 42 struct nfs4_sequence_res *res);
47 int cache_reply);
48 bool (*match_stateid)(const nfs4_stateid *, 43 bool (*match_stateid)(const nfs4_stateid *,
49 const nfs4_stateid *); 44 const nfs4_stateid *);
50 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, 45 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
@@ -241,18 +236,14 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
241 return server->nfs_client->cl_session; 236 return server->nfs_client->cl_session;
242} 237}
243 238
244extern bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy);
245extern int nfs4_setup_sequence(const struct nfs_server *server, 239extern int nfs4_setup_sequence(const struct nfs_server *server,
246 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 240 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
247 struct rpc_task *task); 241 struct rpc_task *task);
248extern int nfs41_setup_sequence(struct nfs4_session *session, 242extern int nfs41_setup_sequence(struct nfs4_session *session,
249 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 243 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
250 struct rpc_task *task); 244 struct rpc_task *task);
251extern void nfs4_destroy_session(struct nfs4_session *session);
252extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
253extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *); 245extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *);
254extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *); 246extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);
255extern int nfs4_init_session(struct nfs_server *server);
256extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 247extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
257 struct nfs_fsinfo *fsinfo); 248 struct nfs_fsinfo *fsinfo);
258extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, 249extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
@@ -280,11 +271,7 @@ static inline int nfs4_setup_sequence(const struct nfs_server *server,
280 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 271 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
281 struct rpc_task *task) 272 struct rpc_task *task)
282{ 273{
283 return 0; 274 rpc_call_start(task);
284}
285
286static inline int nfs4_init_session(struct nfs_server *server)
287{
288 return 0; 275 return 0;
289} 276}
290 277
@@ -321,17 +308,20 @@ extern void nfs4_renew_state(struct work_struct *);
321 308
322/* nfs4state.c */ 309/* nfs4state.c */
323struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp); 310struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp);
311struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
324struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); 312struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
325int nfs4_discover_server_trunking(struct nfs_client *clp, 313int nfs4_discover_server_trunking(struct nfs_client *clp,
326 struct nfs_client **); 314 struct nfs_client **);
327int nfs40_discover_server_trunking(struct nfs_client *clp, 315int nfs40_discover_server_trunking(struct nfs_client *clp,
328 struct nfs_client **, struct rpc_cred *); 316 struct nfs_client **, struct rpc_cred *);
329#if defined(CONFIG_NFS_V4_1) 317#if defined(CONFIG_NFS_V4_1)
330struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
331struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); 318struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
332int nfs41_discover_server_trunking(struct nfs_client *clp, 319int nfs41_discover_server_trunking(struct nfs_client *clp,
333 struct nfs_client **, struct rpc_cred *); 320 struct nfs_client **, struct rpc_cred *);
334extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); 321extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
322extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp);
323extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp);
324
335#else 325#else
336static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) 326static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
337{ 327{
@@ -349,11 +339,12 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
349extern void nfs_inode_find_state_and_recover(struct inode *inode, 339extern void nfs_inode_find_state_and_recover(struct inode *inode,
350 const nfs4_stateid *stateid); 340 const nfs4_stateid *stateid);
351extern void nfs4_schedule_lease_recovery(struct nfs_client *); 341extern void nfs4_schedule_lease_recovery(struct nfs_client *);
342extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
343extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
352extern void nfs4_schedule_state_manager(struct nfs_client *); 344extern void nfs4_schedule_state_manager(struct nfs_client *);
353extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); 345extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
354extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); 346extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
355extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 347extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
356extern void nfs41_handle_recall_slot(struct nfs_client *clp);
357extern void nfs41_handle_server_scope(struct nfs_client *, 348extern void nfs41_handle_server_scope(struct nfs_client *,
358 struct nfs41_server_scope **); 349 struct nfs41_server_scope **);
359extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 350extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 6bacfde1319a..2e9779b58b7a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -12,6 +12,7 @@
12#include "internal.h" 12#include "internal.h"
13#include "callback.h" 13#include "callback.h"
14#include "delegation.h" 14#include "delegation.h"
15#include "nfs4session.h"
15#include "pnfs.h" 16#include "pnfs.h"
16#include "netns.h" 17#include "netns.h"
17 18
@@ -235,11 +236,10 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
235 error = nfs4_discover_server_trunking(clp, &old); 236 error = nfs4_discover_server_trunking(clp, &old);
236 if (error < 0) 237 if (error < 0)
237 goto error; 238 goto error;
239 nfs_put_client(clp);
238 if (clp != old) { 240 if (clp != old) {
239 clp->cl_preserve_clid = true; 241 clp->cl_preserve_clid = true;
240 nfs_put_client(clp);
241 clp = old; 242 clp = old;
242 atomic_inc(&clp->cl_count);
243 } 243 }
244 244
245 return clp; 245 return clp;
@@ -305,7 +305,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
305 .clientid = new->cl_clientid, 305 .clientid = new->cl_clientid,
306 .confirm = new->cl_confirm, 306 .confirm = new->cl_confirm,
307 }; 307 };
308 int status; 308 int status = -NFS4ERR_STALE_CLIENTID;
309 309
310 spin_lock(&nn->nfs_client_lock); 310 spin_lock(&nn->nfs_client_lock);
311 list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { 311 list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
@@ -331,40 +331,33 @@ int nfs40_walk_client_list(struct nfs_client *new,
331 331
332 if (prev) 332 if (prev)
333 nfs_put_client(prev); 333 nfs_put_client(prev);
334 prev = pos;
334 335
335 status = nfs4_proc_setclientid_confirm(pos, &clid, cred); 336 status = nfs4_proc_setclientid_confirm(pos, &clid, cred);
336 if (status == 0) { 337 switch (status) {
338 case -NFS4ERR_STALE_CLIENTID:
339 break;
340 case 0:
337 nfs4_swap_callback_idents(pos, new); 341 nfs4_swap_callback_idents(pos, new);
338 342
339 nfs_put_client(pos); 343 prev = NULL;
340 *result = pos; 344 *result = pos;
341 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", 345 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
342 __func__, pos, atomic_read(&pos->cl_count)); 346 __func__, pos, atomic_read(&pos->cl_count));
343 return 0; 347 default:
344 } 348 goto out;
345 if (status != -NFS4ERR_STALE_CLIENTID) {
346 nfs_put_client(pos);
347 dprintk("NFS: <-- %s status = %d, no result\n",
348 __func__, status);
349 return status;
350 } 349 }
351 350
352 spin_lock(&nn->nfs_client_lock); 351 spin_lock(&nn->nfs_client_lock);
353 prev = pos;
354 } 352 }
353 spin_unlock(&nn->nfs_client_lock);
355 354
356 /* 355 /* No match found. The server lost our clientid */
357 * No matching nfs_client found. This should be impossible, 356out:
358 * because the new nfs_client has already been added to
359 * nfs_client_list by nfs_get_client().
360 *
361 * Don't BUG(), since the caller is holding a mutex.
362 */
363 if (prev) 357 if (prev)
364 nfs_put_client(prev); 358 nfs_put_client(prev);
365 spin_unlock(&nn->nfs_client_lock); 359 dprintk("NFS: <-- %s status = %d\n", __func__, status);
366 pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); 360 return status;
367 return -NFS4ERR_STALE_CLIENTID;
368} 361}
369 362
370#ifdef CONFIG_NFS_V4_1 363#ifdef CONFIG_NFS_V4_1
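The rewritten nfs40_walk_client_list() closes a reference-counting hole: prev now takes over the pinned pos before the unlocked SETCLIENTID_CONFIRM round trip, the result is threaded through a switch, and a miss returns the expected -NFS4ERR_STALE_CLIENTID instead of the old pr_err() — a trunking probe that finds nothing is normal, not a bug. A toy sketch of the pinned-cursor walk, with a free-on-zero refcount standing in for nfs_put_client():

    #include <stdio.h>
    #include <stdlib.h>

    struct node { int refs; int id; struct node *next; };

    static void get(struct node *n) { n->refs++; }
    static void put(struct node *n) { if (--n->refs == 0) free(n); }

    /* stand-in for the unlocked SETCLIENTID_CONFIRM round trip */
    static int confirm(struct node *n) { return n->id == 2 ? 0 : -1; }

    /* sketch: pin the cursor before dropping the list lock, exactly
     * the prev/pos dance the patch makes watertight */
    static struct node *walk(struct node *head)
    {
        struct node *pos, *prev = NULL;

        /* spin_lock(&nn->nfs_client_lock); */
        for (pos = head; pos; pos = pos->next) {
            get(pos);            /* pin pos across the unlocked call */
            /* spin_unlock(...);  slow RPC happens here */
            if (prev)
                put(prev);       /* safe to drop the previous cursor */
            prev = pos;
            if (confirm(pos) == 0)
                return pos;      /* caller inherits the reference */
            /* spin_lock(...); */
        }
        if (prev)
            put(prev);
        /* spin_unlock(...); */
        return NULL;             /* expected "no match", not an error */
    }

    int main(void)
    {
        struct node *b = calloc(1, sizeof(*b));
        struct node *a = calloc(1, sizeof(*a));
        b->id = 2; b->refs = 1;
        a->id = 1; a->refs = 1; a->next = b;
        struct node *hit = walk(a);
        printf("matched id %d\n", hit ? hit->id : -1);
        return 0;
    }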
@@ -431,7 +424,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
431{ 424{
432 struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); 425 struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
433 struct nfs_client *pos, *n, *prev = NULL; 426 struct nfs_client *pos, *n, *prev = NULL;
434 int error; 427 int status = -NFS4ERR_STALE_CLIENTID;
435 428
436 spin_lock(&nn->nfs_client_lock); 429 spin_lock(&nn->nfs_client_lock);
437 list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { 430 list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
@@ -447,14 +440,17 @@ int nfs41_walk_client_list(struct nfs_client *new,
447 nfs_put_client(prev); 440 nfs_put_client(prev);
448 prev = pos; 441 prev = pos;
449 442
450 error = nfs_wait_client_init_complete(pos); 443 nfs4_schedule_lease_recovery(pos);
451 if (error < 0) { 444 status = nfs_wait_client_init_complete(pos);
445 if (status < 0) {
452 nfs_put_client(pos); 446 nfs_put_client(pos);
453 spin_lock(&nn->nfs_client_lock); 447 spin_lock(&nn->nfs_client_lock);
454 continue; 448 continue;
455 } 449 }
456 450 status = pos->cl_cons_state;
457 spin_lock(&nn->nfs_client_lock); 451 spin_lock(&nn->nfs_client_lock);
452 if (status < 0)
453 continue;
458 } 454 }
459 455
460 if (pos->rpc_ops != new->rpc_ops) 456 if (pos->rpc_ops != new->rpc_ops)
@@ -472,6 +468,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
472 if (!nfs4_match_serverowners(pos, new)) 468 if (!nfs4_match_serverowners(pos, new))
473 continue; 469 continue;
474 470
471 atomic_inc(&pos->cl_count);
475 spin_unlock(&nn->nfs_client_lock); 472 spin_unlock(&nn->nfs_client_lock);
476 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", 473 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
477 __func__, pos, atomic_read(&pos->cl_count)); 474 __func__, pos, atomic_read(&pos->cl_count));
@@ -480,16 +477,10 @@ int nfs41_walk_client_list(struct nfs_client *new,
480 return 0; 477 return 0;
481 } 478 }
482 479
483 /* 480 /* No matching nfs_client found. */
484 * No matching nfs_client found. This should be impossible,
485 * because the new nfs_client has already been added to
486 * nfs_client_list by nfs_get_client().
487 *
488 * Don't BUG(), since the caller is holding a mutex.
489 */
490 spin_unlock(&nn->nfs_client_lock); 481 spin_unlock(&nn->nfs_client_lock);
491 pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); 482 dprintk("NFS: <-- %s status = %d\n", __func__, status);
492 return -NFS4ERR_STALE_CLIENTID; 483 return status;
493} 484}
494#endif /* CONFIG_NFS_V4_1 */ 485#endif /* CONFIG_NFS_V4_1 */
495 486
@@ -713,10 +704,6 @@ static int nfs4_server_common_setup(struct nfs_server *server,
713 struct nfs_fattr *fattr; 704 struct nfs_fattr *fattr;
714 int error; 705 int error;
715 706
716 BUG_ON(!server->nfs_client);
717 BUG_ON(!server->nfs_client->rpc_ops);
718 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
719
720 /* data servers support only a subset of NFSv4.1 */ 707 /* data servers support only a subset of NFSv4.1 */
721 if (is_ds_only_client(server->nfs_client)) 708 if (is_ds_only_client(server->nfs_client))
722 return -EPROTONOSUPPORT; 709 return -EPROTONOSUPPORT;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index afddd6639afb..08ddcccb8887 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -5,6 +5,7 @@
5 */ 5 */
6#include <linux/nfs_fs.h> 6#include <linux/nfs_fs.h>
7#include "internal.h" 7#include "internal.h"
8#include "fscache.h"
8#include "pnfs.h" 9#include "pnfs.h"
9 10
10#define NFSDBG_FACILITY NFSDBG_FILE 11#define NFSDBG_FACILITY NFSDBG_FILE
@@ -20,7 +21,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
20 struct iattr attr; 21 struct iattr attr;
21 int err; 22 int err;
22 23
23 BUG_ON(inode != dentry->d_inode);
24 /* 24 /*
25 * If no cached dentry exists or if it's negative, NFSv4 handled the 25 * If no cached dentry exists or if it's negative, NFSv4 handled the
26 * opens in ->lookup() or ->create(). 26 * opens in ->lookup() or ->create().
@@ -75,6 +75,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
75 75
76 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 76 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
77 nfs_file_set_open_context(filp, ctx); 77 nfs_file_set_open_context(filp, ctx);
78 nfs_fscache_set_inode_cookie(inode, filp);
78 err = 0; 79 err = 0;
79 80
80out_put_ctx: 81out_put_ctx:
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 2e45fd9c02a3..194c48410336 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -35,6 +35,7 @@
35 35
36#include <linux/sunrpc/metrics.h> 36#include <linux/sunrpc/metrics.h>
37 37
38#include "nfs4session.h"
38#include "internal.h" 39#include "internal.h"
39#include "delegation.h" 40#include "delegation.h"
40#include "nfs4filelayout.h" 41#include "nfs4filelayout.h"
@@ -178,7 +179,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
178 break; 179 break;
179 case -NFS4ERR_DELAY: 180 case -NFS4ERR_DELAY:
180 case -NFS4ERR_GRACE: 181 case -NFS4ERR_GRACE:
181 case -EKEYEXPIRED:
182 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); 182 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
183 break; 183 break;
184 case -NFS4ERR_RETRY_UNCACHED_REP: 184 case -NFS4ERR_RETRY_UNCACHED_REP:
@@ -306,12 +306,10 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
306 } 306 }
307 rdata->read_done_cb = filelayout_read_done_cb; 307 rdata->read_done_cb = filelayout_read_done_cb;
308 308
309 if (nfs41_setup_sequence(rdata->ds_clp->cl_session, 309 nfs41_setup_sequence(rdata->ds_clp->cl_session,
310 &rdata->args.seq_args, &rdata->res.seq_res, 310 &rdata->args.seq_args,
311 task)) 311 &rdata->res.seq_res,
312 return; 312 task);
313
314 rpc_call_start(task);
315} 313}
316 314
317static void filelayout_read_call_done(struct rpc_task *task, void *data) 315static void filelayout_read_call_done(struct rpc_task *task, void *data)
@@ -408,12 +406,10 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
408 rpc_exit(task, 0); 406 rpc_exit(task, 0);
409 return; 407 return;
410 } 408 }
411 if (nfs41_setup_sequence(wdata->ds_clp->cl_session, 409 nfs41_setup_sequence(wdata->ds_clp->cl_session,
412 &wdata->args.seq_args, &wdata->res.seq_res, 410 &wdata->args.seq_args,
413 task)) 411 &wdata->res.seq_res,
414 return; 412 task);
415
416 rpc_call_start(task);
417} 413}
418 414
419static void filelayout_write_call_done(struct rpc_task *task, void *data) 415static void filelayout_write_call_done(struct rpc_task *task, void *data)
@@ -449,12 +445,10 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data)
449{ 445{
450 struct nfs_commit_data *wdata = data; 446 struct nfs_commit_data *wdata = data;
451 447
452 if (nfs41_setup_sequence(wdata->ds_clp->cl_session, 448 nfs41_setup_sequence(wdata->ds_clp->cl_session,
453 &wdata->args.seq_args, &wdata->res.seq_res, 449 &wdata->args.seq_args,
454 task)) 450 &wdata->res.seq_res,
455 return; 451 task);
456
457 rpc_call_start(task);
458} 452}
459 453
460static void filelayout_write_commit_done(struct rpc_task *task, void *data) 454static void filelayout_write_commit_done(struct rpc_task *task, void *data)
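All three filelayout ->rpc_call_prepare hooks shrink the same way because nfs41_setup_sequence() now issues rpc_call_start() itself once a slot is available. The shape of the change, under assumed stand-in types:

    struct rpc_task; struct nfs4_session; struct seq_args; struct seq_res;

    /* assumed stand-in; the real helper lives in fs/nfs/nfs4proc.c */
    int setup_sequence(struct nfs4_session *s, struct seq_args *a,
                       struct seq_res *r, struct rpc_task *t);

    /* before: every prepare hook had to start the task on success
     *     if (setup_sequence(s, a, r, t))
     *         return;
     *     rpc_call_start(t);
     * after: the helper owns that decision, callers just call it */
    static void commit_prepare(struct nfs4_session *s, struct seq_args *a,
                               struct seq_res *r, struct rpc_task *t)
    {
        setup_sequence(s, a, r, t);
    }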
@@ -512,7 +506,6 @@ filelayout_read_pagelist(struct nfs_read_data *data)
512 loff_t offset = data->args.offset; 506 loff_t offset = data->args.offset;
513 u32 j, idx; 507 u32 j, idx;
514 struct nfs_fh *fh; 508 struct nfs_fh *fh;
515 int status;
516 509
517 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", 510 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
518 __func__, hdr->inode->i_ino, 511 __func__, hdr->inode->i_ino,
@@ -538,9 +531,8 @@ filelayout_read_pagelist(struct nfs_read_data *data)
538 data->mds_offset = offset; 531 data->mds_offset = offset;
539 532
540 /* Perform an asynchronous read to ds */ 533 /* Perform an asynchronous read to ds */
541 status = nfs_initiate_read(ds->ds_clp->cl_rpcclient, data, 534 nfs_initiate_read(ds->ds_clp->cl_rpcclient, data,
542 &filelayout_read_call_ops, RPC_TASK_SOFTCONN); 535 &filelayout_read_call_ops, RPC_TASK_SOFTCONN);
543 BUG_ON(status != 0);
544 return PNFS_ATTEMPTED; 536 return PNFS_ATTEMPTED;
545} 537}
546 538
@@ -554,7 +546,6 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
554 loff_t offset = data->args.offset; 546 loff_t offset = data->args.offset;
555 u32 j, idx; 547 u32 j, idx;
556 struct nfs_fh *fh; 548 struct nfs_fh *fh;
557 int status;
558 549
559 /* Retrieve the correct rpc_client for the byte range */ 550 /* Retrieve the correct rpc_client for the byte range */
560 j = nfs4_fl_calc_j_index(lseg, offset); 551 j = nfs4_fl_calc_j_index(lseg, offset);
@@ -579,10 +570,9 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
579 data->args.offset = filelayout_get_dserver_offset(lseg, offset); 570 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
580 571
581 /* Perform an asynchronous write */ 572 /* Perform an asynchronous write */
582 status = nfs_initiate_write(ds->ds_clp->cl_rpcclient, data, 573 nfs_initiate_write(ds->ds_clp->cl_rpcclient, data,
583 &filelayout_write_call_ops, sync, 574 &filelayout_write_call_ops, sync,
584 RPC_TASK_SOFTCONN); 575 RPC_TASK_SOFTCONN);
585 BUG_ON(status != 0);
586 return PNFS_ATTEMPTED; 576 return PNFS_ATTEMPTED;
587} 577}
588 578
@@ -909,7 +899,7 @@ static void
909filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, 899filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
910 struct nfs_page *req) 900 struct nfs_page *req)
911{ 901{
912 BUG_ON(pgio->pg_lseg != NULL); 902 WARN_ON_ONCE(pgio->pg_lseg != NULL);
913 903
914 if (req->wb_offset != req->wb_pgbase) { 904 if (req->wb_offset != req->wb_pgbase) {
915 /* 905 /*
@@ -939,7 +929,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
939 struct nfs_commit_info cinfo; 929 struct nfs_commit_info cinfo;
940 int status; 930 int status;
941 931
942 BUG_ON(pgio->pg_lseg != NULL); 932 WARN_ON_ONCE(pgio->pg_lseg != NULL);
943 933
944 if (req->wb_offset != req->wb_pgbase) 934 if (req->wb_offset != req->wb_pgbase)
945 goto out_mds; 935 goto out_mds;
@@ -1187,7 +1177,6 @@ static void filelayout_recover_commit_reqs(struct list_head *dst,
1187 */ 1177 */
1188 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { 1178 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1189 if (transfer_commit_list(&b->written, dst, cinfo, 0)) { 1179 if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
1190 BUG_ON(!list_empty(&b->written));
1191 pnfs_put_lseg(b->wlseg); 1180 pnfs_put_lseg(b->wlseg);
1192 b->wlseg = NULL; 1181 b->wlseg = NULL;
1193 } 1182 }
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index a8eaa9b7bb0f..b720064bcd7f 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -33,6 +33,7 @@
33#include <linux/module.h> 33#include <linux/module.h>
34 34
35#include "internal.h" 35#include "internal.h"
36#include "nfs4session.h"
36#include "nfs4filelayout.h" 37#include "nfs4filelayout.h"
37 38
38#define NFSDBG_FACILITY NFSDBG_PNFS_LD 39#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -162,8 +163,6 @@ nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
162 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, 163 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
163 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); 164 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
164 165
165 BUG_ON(list_empty(&ds->ds_addrs));
166
167 list_for_each_entry(da, &ds->ds_addrs, da_node) { 166 list_for_each_entry(da, &ds->ds_addrs, da_node) {
168 dprintk("%s: DS %s: trying address %s\n", 167 dprintk("%s: DS %s: trying address %s\n",
169 __func__, ds->ds_remotestr, da->da_remotestr); 168 __func__, ds->ds_remotestr, da->da_remotestr);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5eec4429970c..cf747ef86650 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -52,7 +52,6 @@
52#include <linux/mount.h> 52#include <linux/mount.h>
53#include <linux/module.h> 53#include <linux/module.h>
54#include <linux/nfs_idmap.h> 54#include <linux/nfs_idmap.h>
55#include <linux/sunrpc/bc_xprt.h>
56#include <linux/xattr.h> 55#include <linux/xattr.h>
57#include <linux/utsname.h> 56#include <linux/utsname.h>
58#include <linux/freezer.h> 57#include <linux/freezer.h>
@@ -64,14 +63,14 @@
64#include "callback.h" 63#include "callback.h"
65#include "pnfs.h" 64#include "pnfs.h"
66#include "netns.h" 65#include "netns.h"
66#include "nfs4session.h"
67#include "fscache.h"
67 68
68#define NFSDBG_FACILITY NFSDBG_PROC 69#define NFSDBG_FACILITY NFSDBG_PROC
69 70
70#define NFS4_POLL_RETRY_MIN (HZ/10) 71#define NFS4_POLL_RETRY_MIN (HZ/10)
71#define NFS4_POLL_RETRY_MAX (15*HZ) 72#define NFS4_POLL_RETRY_MAX (15*HZ)
72 73
73#define NFS4_MAX_LOOP_ON_RECOVER (10)
74
75struct nfs4_opendata; 74struct nfs4_opendata;
76static int _nfs4_proc_open(struct nfs4_opendata *data); 75static int _nfs4_proc_open(struct nfs4_opendata *data);
77static int _nfs4_recover_proc_open(struct nfs4_opendata *data); 76static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
@@ -206,7 +205,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
206{ 205{
207 __be32 *start, *p; 206 __be32 *start, *p;
208 207
209 BUG_ON(readdir->count < 80);
210 if (cookie > 2) { 208 if (cookie > 2) {
211 readdir->cookie = cookie; 209 readdir->cookie = cookie;
212 memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier)); 210 memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier));
@@ -256,22 +254,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
256 kunmap_atomic(start); 254 kunmap_atomic(start);
257} 255}
258 256
259static int nfs4_wait_clnt_recover(struct nfs_client *clp)
260{
261 int res;
262
263 might_sleep();
264
265 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
266 nfs_wait_bit_killable, TASK_KILLABLE);
267 if (res)
268 return res;
269
270 if (clp->cl_cons_state < 0)
271 return clp->cl_cons_state;
272 return 0;
273}
274
275static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) 257static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
276{ 258{
277 int res = 0; 259 int res = 0;
@@ -351,7 +333,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
351 } 333 }
352 case -NFS4ERR_GRACE: 334 case -NFS4ERR_GRACE:
353 case -NFS4ERR_DELAY: 335 case -NFS4ERR_DELAY:
354 case -EKEYEXPIRED:
355 ret = nfs4_delay(server->client, &exception->timeout); 336 ret = nfs4_delay(server->client, &exception->timeout);
356 if (ret != 0) 337 if (ret != 0)
357 break; 338 break;
@@ -397,144 +378,136 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
397 378
398#if defined(CONFIG_NFS_V4_1) 379#if defined(CONFIG_NFS_V4_1)
399 380
400/*
401 * nfs4_free_slot - free a slot and efficiently update slot table.
402 *
403 * freeing a slot is trivially done by clearing its respective bit
404 * in the bitmap.
405 * If the freed slotid equals highest_used_slotid we want to update it
406 * so that the server would be able to size down the slot table if needed,
407 * otherwise we know that the highest_used_slotid is still in use.
408 * When updating highest_used_slotid there may be "holes" in the bitmap
409 * so we need to scan down from highest_used_slotid to 0 looking for the now
410 * highest slotid in use.
411 * If none found, highest_used_slotid is set to NFS4_NO_SLOT.
412 *
413 * Must be called while holding tbl->slot_tbl_lock
414 */
415static void
416nfs4_free_slot(struct nfs4_slot_table *tbl, u32 slotid)
417{
418 BUG_ON(slotid >= NFS4_MAX_SLOT_TABLE);
419 /* clear used bit in bitmap */
420 __clear_bit(slotid, tbl->used_slots);
421
422 /* update highest_used_slotid when it is freed */
423 if (slotid == tbl->highest_used_slotid) {
424 slotid = find_last_bit(tbl->used_slots, tbl->max_slots);
425 if (slotid < tbl->max_slots)
426 tbl->highest_used_slotid = slotid;
427 else
428 tbl->highest_used_slotid = NFS4_NO_SLOT;
429 }
430 dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
431 slotid, tbl->highest_used_slotid);
432}
433
434bool nfs4_set_task_privileged(struct rpc_task *task, void *dummy)
435{
436 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
437 return true;
438}
439
440/*
441 * Signal state manager thread if session fore channel is drained
442 */
443static void nfs4_check_drain_fc_complete(struct nfs4_session *ses)
444{
445 if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
446 rpc_wake_up_first(&ses->fc_slot_table.slot_tbl_waitq,
447 nfs4_set_task_privileged, NULL);
448 return;
449 }
450
451 if (ses->fc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
452 return;
453
454 dprintk("%s COMPLETE: Session Fore Channel Drained\n", __func__);
455 complete(&ses->fc_slot_table.complete);
456}
457
458/*
459 * Signal state manager thread if session back channel is drained
460 */
461void nfs4_check_drain_bc_complete(struct nfs4_session *ses)
462{
463 if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state) ||
464 ses->bc_slot_table.highest_used_slotid != NFS4_NO_SLOT)
465 return;
466 dprintk("%s COMPLETE: Session Back Channel Drained\n", __func__);
467 complete(&ses->bc_slot_table.complete);
468}
469
470static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) 381static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
471{ 382{
383 struct nfs4_session *session;
472 struct nfs4_slot_table *tbl; 384 struct nfs4_slot_table *tbl;
385 bool send_new_highest_used_slotid = false;
473 386
474 tbl = &res->sr_session->fc_slot_table;
475 if (!res->sr_slot) { 387 if (!res->sr_slot) {
476 /* just wake up the next guy waiting since 388 /* just wake up the next guy waiting since
 477 * we may not have consumed a slot after all */ 389 * we may not have consumed a slot after all */
478 dprintk("%s: No slot\n", __func__); 390 dprintk("%s: No slot\n", __func__);
479 return; 391 return;
480 } 392 }
393 tbl = res->sr_slot->table;
394 session = tbl->session;
481 395
482 spin_lock(&tbl->slot_tbl_lock); 396 spin_lock(&tbl->slot_tbl_lock);
483 nfs4_free_slot(tbl, res->sr_slot - tbl->slots); 397 /* Be nice to the server: try to ensure that the last transmitted
484 nfs4_check_drain_fc_complete(res->sr_session); 398 * value for highest_user_slotid <= target_highest_slotid
399 */
400 if (tbl->highest_used_slotid > tbl->target_highest_slotid)
401 send_new_highest_used_slotid = true;
402
403 if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) {
404 send_new_highest_used_slotid = false;
405 goto out_unlock;
406 }
407 nfs4_free_slot(tbl, res->sr_slot);
408
409 if (tbl->highest_used_slotid != NFS4_NO_SLOT)
410 send_new_highest_used_slotid = false;
411out_unlock:
485 spin_unlock(&tbl->slot_tbl_lock); 412 spin_unlock(&tbl->slot_tbl_lock);
486 res->sr_slot = NULL; 413 res->sr_slot = NULL;
414 if (send_new_highest_used_slotid)
415 nfs41_server_notify_highest_slotid_update(session->clp);
487} 416}
488 417
489static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) 418static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
490{ 419{
491 unsigned long timestamp; 420 struct nfs4_session *session;
421 struct nfs4_slot *slot;
492 struct nfs_client *clp; 422 struct nfs_client *clp;
493 423 bool interrupted = false;
494 /* 424 int ret = 1;
495 * sr_status remains 1 if an RPC level error occurred. The server
496 * may or may not have processed the sequence operation..
497 * Proceed as if the server received and processed the sequence
498 * operation.
499 */
500 if (res->sr_status == 1)
501 res->sr_status = NFS_OK;
502 425
503 /* don't increment the sequence number if the task wasn't sent */ 426 /* don't increment the sequence number if the task wasn't sent */
504 if (!RPC_WAS_SENT(task)) 427 if (!RPC_WAS_SENT(task))
505 goto out; 428 goto out;
506 429
430 slot = res->sr_slot;
431 session = slot->table->session;
432
433 if (slot->interrupted) {
434 slot->interrupted = 0;
435 interrupted = true;
436 }
437
507 /* Check the SEQUENCE operation status */ 438 /* Check the SEQUENCE operation status */
508 switch (res->sr_status) { 439 switch (res->sr_status) {
509 case 0: 440 case 0:
510 /* Update the slot's sequence and clientid lease timer */ 441 /* Update the slot's sequence and clientid lease timer */
511 ++res->sr_slot->seq_nr; 442 ++slot->seq_nr;
512 timestamp = res->sr_renewal_time; 443 clp = session->clp;
513 clp = res->sr_session->clp; 444 do_renew_lease(clp, res->sr_timestamp);
514 do_renew_lease(clp, timestamp);
515 /* Check sequence flags */ 445 /* Check sequence flags */
516 if (res->sr_status_flags != 0) 446 if (res->sr_status_flags != 0)
517 nfs4_schedule_lease_recovery(clp); 447 nfs4_schedule_lease_recovery(clp);
448 nfs41_update_target_slotid(slot->table, slot, res);
518 break; 449 break;
450 case 1:
451 /*
452 * sr_status remains 1 if an RPC level error occurred.
453 * The server may or may not have processed the sequence
 454 * operation.
455 * Mark the slot as having hosted an interrupted RPC call.
456 */
457 slot->interrupted = 1;
458 goto out;
519 case -NFS4ERR_DELAY: 459 case -NFS4ERR_DELAY:
520 /* The server detected a resend of the RPC call and 460 /* The server detected a resend of the RPC call and
521 * returned NFS4ERR_DELAY as per Section 2.10.6.2 461 * returned NFS4ERR_DELAY as per Section 2.10.6.2
522 * of RFC5661. 462 * of RFC5661.
523 */ 463 */
524 dprintk("%s: slot=%td seq=%d: Operation in progress\n", 464 dprintk("%s: slot=%u seq=%u: Operation in progress\n",
525 __func__, 465 __func__,
526 res->sr_slot - res->sr_session->fc_slot_table.slots, 466 slot->slot_nr,
527 res->sr_slot->seq_nr); 467 slot->seq_nr);
528 goto out_retry; 468 goto out_retry;
469 case -NFS4ERR_BADSLOT:
470 /*
471 * The slot id we used was probably retired. Try again
472 * using a different slot id.
473 */
474 goto retry_nowait;
475 case -NFS4ERR_SEQ_MISORDERED:
476 /*
477 * Was the last operation on this sequence interrupted?
478 * If so, retry after bumping the sequence number.
479 */
480 if (interrupted) {
481 ++slot->seq_nr;
482 goto retry_nowait;
483 }
484 /*
485 * Could this slot have been previously retired?
486 * If so, then the server may be expecting seq_nr = 1!
487 */
488 if (slot->seq_nr != 1) {
489 slot->seq_nr = 1;
490 goto retry_nowait;
491 }
492 break;
493 case -NFS4ERR_SEQ_FALSE_RETRY:
494 ++slot->seq_nr;
495 goto retry_nowait;
529 default: 496 default:
530 /* Just update the slot sequence no. */ 497 /* Just update the slot sequence no. */
531 ++res->sr_slot->seq_nr; 498 ++slot->seq_nr;
532 } 499 }
533out: 500out:
534 /* The session may be reset by one of the error handlers. */ 501 /* The session may be reset by one of the error handlers. */
535 dprintk("%s: Error %d, freeing the slot\n", __func__, res->sr_status); 502 dprintk("%s: Error %d, freeing the slot\n", __func__, res->sr_status);
536 nfs41_sequence_free_slot(res); 503 nfs41_sequence_free_slot(res);
537 return 1; 504 return ret;
505retry_nowait:
506 if (rpc_restart_call_prepare(task)) {
507 task->tk_status = 0;
508 ret = 0;
509 }
510 goto out;
538out_retry: 511out_retry:
539 if (!rpc_restart_call(task)) 512 if (!rpc_restart_call(task))
540 goto out; 513 goto out;
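The retry ladder for NFS4ERR_SEQ_MISORDERED in the hunk above is easier to follow as a pure function. A sketch under the assumption that the slot carries only seq_nr and the new interrupted flag (_demo names are hypothetical; the transitions mirror the patch):

struct slot_demo {
	unsigned int seq_nr;
	int interrupted;
};

/* Returns 1 to retry on a fresh sequence number, 0 to give up. */
static int seq_misordered_demo(struct slot_demo *slot)
{
	if (slot->interrupted) {
		/* The previous call on this slot never completed; the
		 * server may have seen it, so bump past its seq_nr. */
		slot->interrupted = 0;
		++slot->seq_nr;
		return 1;
	}
	if (slot->seq_nr != 1) {
		/* The slot may have been retired; a retired slot is
		 * expected to restart at seq_nr == 1. */
		slot->seq_nr = 1;
		return 1;
	}
	return 0;
}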
@@ -545,55 +518,27 @@ out_retry:
545static int nfs4_sequence_done(struct rpc_task *task, 518static int nfs4_sequence_done(struct rpc_task *task,
546 struct nfs4_sequence_res *res) 519 struct nfs4_sequence_res *res)
547{ 520{
548 if (res->sr_session == NULL) 521 if (res->sr_slot == NULL)
549 return 1; 522 return 1;
550 return nfs41_sequence_done(task, res); 523 return nfs41_sequence_done(task, res);
551} 524}
552 525
553/*
554 * nfs4_find_slot - efficiently look for a free slot
555 *
556 * nfs4_find_slot looks for an unset bit in the used_slots bitmap.
557 * If found, we mark the slot as used, update the highest_used_slotid,
558 * and respectively set up the sequence operation args.
559 * The slot number is returned if found, or NFS4_NO_SLOT otherwise.
560 *
561 * Note: must be called with under the slot_tbl_lock.
562 */
563static u32
564nfs4_find_slot(struct nfs4_slot_table *tbl)
565{
566 u32 slotid;
567 u32 ret_id = NFS4_NO_SLOT;
568
569 dprintk("--> %s used_slots=%04lx highest_used=%u max_slots=%u\n",
570 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
571 tbl->max_slots);
572 slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slots);
573 if (slotid >= tbl->max_slots)
574 goto out;
575 __set_bit(slotid, tbl->used_slots);
576 if (slotid > tbl->highest_used_slotid ||
577 tbl->highest_used_slotid == NFS4_NO_SLOT)
578 tbl->highest_used_slotid = slotid;
579 ret_id = slotid;
580out:
581 dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d \n",
582 __func__, tbl->used_slots[0], tbl->highest_used_slotid, ret_id);
583 return ret_id;
584}
585
586static void nfs41_init_sequence(struct nfs4_sequence_args *args, 526static void nfs41_init_sequence(struct nfs4_sequence_args *args,
587 struct nfs4_sequence_res *res, int cache_reply) 527 struct nfs4_sequence_res *res, int cache_reply)
588{ 528{
589 args->sa_session = NULL; 529 args->sa_slot = NULL;
590 args->sa_cache_this = 0; 530 args->sa_cache_this = 0;
531 args->sa_privileged = 0;
591 if (cache_reply) 532 if (cache_reply)
592 args->sa_cache_this = 1; 533 args->sa_cache_this = 1;
593 res->sr_session = NULL;
594 res->sr_slot = NULL; 534 res->sr_slot = NULL;
595} 535}
596 536
537static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
538{
539 args->sa_privileged = 1;
540}
541
597int nfs41_setup_sequence(struct nfs4_session *session, 542int nfs41_setup_sequence(struct nfs4_session *session,
598 struct nfs4_sequence_args *args, 543 struct nfs4_sequence_args *args,
599 struct nfs4_sequence_res *res, 544 struct nfs4_sequence_res *res,
@@ -601,59 +546,59 @@ int nfs41_setup_sequence(struct nfs4_session *session,
601{ 546{
602 struct nfs4_slot *slot; 547 struct nfs4_slot *slot;
603 struct nfs4_slot_table *tbl; 548 struct nfs4_slot_table *tbl;
604 u32 slotid;
605 549
606 dprintk("--> %s\n", __func__); 550 dprintk("--> %s\n", __func__);
607 /* slot already allocated? */ 551 /* slot already allocated? */
608 if (res->sr_slot != NULL) 552 if (res->sr_slot != NULL)
609 return 0; 553 goto out_success;
610 554
611 tbl = &session->fc_slot_table; 555 tbl = &session->fc_slot_table;
612 556
557 task->tk_timeout = 0;
558
613 spin_lock(&tbl->slot_tbl_lock); 559 spin_lock(&tbl->slot_tbl_lock);
614 if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && 560 if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
615 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { 561 !args->sa_privileged) {
616 /* The state manager will wait until the slot table is empty */ 562 /* The state manager will wait until the slot table is empty */
617 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
618 spin_unlock(&tbl->slot_tbl_lock);
619 dprintk("%s session is draining\n", __func__); 563 dprintk("%s session is draining\n", __func__);
620 return -EAGAIN; 564 goto out_sleep;
621 }
622
623 if (!rpc_queue_empty(&tbl->slot_tbl_waitq) &&
624 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
625 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
626 spin_unlock(&tbl->slot_tbl_lock);
627 dprintk("%s enforce FIFO order\n", __func__);
628 return -EAGAIN;
629 } 565 }
630 566
631 slotid = nfs4_find_slot(tbl); 567 slot = nfs4_alloc_slot(tbl);
632 if (slotid == NFS4_NO_SLOT) { 568 if (IS_ERR(slot)) {
633 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); 569 /* If out of memory, try again in 1/4 second */
634 spin_unlock(&tbl->slot_tbl_lock); 570 if (slot == ERR_PTR(-ENOMEM))
571 task->tk_timeout = HZ >> 2;
635 dprintk("<-- %s: no free slots\n", __func__); 572 dprintk("<-- %s: no free slots\n", __func__);
636 return -EAGAIN; 573 goto out_sleep;
637 } 574 }
638 spin_unlock(&tbl->slot_tbl_lock); 575 spin_unlock(&tbl->slot_tbl_lock);
639 576
640 rpc_task_set_priority(task, RPC_PRIORITY_NORMAL); 577 args->sa_slot = slot;
641 slot = tbl->slots + slotid;
642 args->sa_session = session;
643 args->sa_slotid = slotid;
644 578
645 dprintk("<-- %s slotid=%d seqid=%d\n", __func__, slotid, slot->seq_nr); 579 dprintk("<-- %s slotid=%d seqid=%d\n", __func__,
580 slot->slot_nr, slot->seq_nr);
646 581
647 res->sr_session = session;
648 res->sr_slot = slot; 582 res->sr_slot = slot;
649 res->sr_renewal_time = jiffies; 583 res->sr_timestamp = jiffies;
650 res->sr_status_flags = 0; 584 res->sr_status_flags = 0;
651 /* 585 /*
652 * sr_status is only set in decode_sequence, and so will remain 586 * sr_status is only set in decode_sequence, and so will remain
653 * set to 1 if an rpc level failure occurs. 587 * set to 1 if an rpc level failure occurs.
654 */ 588 */
655 res->sr_status = 1; 589 res->sr_status = 1;
590out_success:
591 rpc_call_start(task);
656 return 0; 592 return 0;
593out_sleep:
594 /* Privileged tasks are queued with top priority */
595 if (args->sa_privileged)
596 rpc_sleep_on_priority(&tbl->slot_tbl_waitq, task,
597 NULL, RPC_PRIORITY_PRIVILEGED);
598 else
599 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
600 spin_unlock(&tbl->slot_tbl_lock);
601 return -EAGAIN;
657} 602}
658EXPORT_SYMBOL_GPL(nfs41_setup_sequence); 603EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
659 604
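The sa_privileged flag introduced above replaces the old RPC_PRIORITY_PRIVILEGED dance: while the state manager drains the slot table, only privileged (state-recovery) requests may still allocate a slot, and privileged sleepers are queued at top priority so recovery is never starved. A sketch of the admission gate, with booleans standing in for the session state bit and the args field (_demo names are hypothetical):

#include <stdbool.h>

static bool may_take_slot_demo(bool session_draining, bool sa_privileged)
{
	/* Ordinary tasks sleep on slot_tbl_waitq while the session
	 * drains; recovery's own requests must be allowed through. */
	return sa_privileged || !session_draining;
}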
@@ -665,12 +610,14 @@ int nfs4_setup_sequence(const struct nfs_server *server,
665 struct nfs4_session *session = nfs4_get_session(server); 610 struct nfs4_session *session = nfs4_get_session(server);
666 int ret = 0; 611 int ret = 0;
667 612
668 if (session == NULL) 613 if (session == NULL) {
614 rpc_call_start(task);
669 goto out; 615 goto out;
616 }
670 617
671 dprintk("--> %s clp %p session %p sr_slot %td\n", 618 dprintk("--> %s clp %p session %p sr_slot %d\n",
672 __func__, session->clp, session, res->sr_slot ? 619 __func__, session->clp, session, res->sr_slot ?
673 res->sr_slot - session->fc_slot_table.slots : -1); 620 res->sr_slot->slot_nr : -1);
674 621
675 ret = nfs41_setup_sequence(session, args, res, task); 622 ret = nfs41_setup_sequence(session, args, res, task);
676out: 623out:
@@ -687,19 +634,11 @@ struct nfs41_call_sync_data {
687static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) 634static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
688{ 635{
689 struct nfs41_call_sync_data *data = calldata; 636 struct nfs41_call_sync_data *data = calldata;
637 struct nfs4_session *session = nfs4_get_session(data->seq_server);
690 638
691 dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); 639 dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
692 640
693 if (nfs4_setup_sequence(data->seq_server, data->seq_args, 641 nfs41_setup_sequence(session, data->seq_args, data->seq_res, task);
694 data->seq_res, task))
695 return;
696 rpc_call_start(task);
697}
698
699static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata)
700{
701 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
702 nfs41_call_sync_prepare(task, calldata);
703} 642}
704 643
705static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) 644static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
@@ -714,17 +653,11 @@ static const struct rpc_call_ops nfs41_call_sync_ops = {
714 .rpc_call_done = nfs41_call_sync_done, 653 .rpc_call_done = nfs41_call_sync_done,
715}; 654};
716 655
717static const struct rpc_call_ops nfs41_call_priv_sync_ops = {
718 .rpc_call_prepare = nfs41_call_priv_sync_prepare,
719 .rpc_call_done = nfs41_call_sync_done,
720};
721
722static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, 656static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
723 struct nfs_server *server, 657 struct nfs_server *server,
724 struct rpc_message *msg, 658 struct rpc_message *msg,
725 struct nfs4_sequence_args *args, 659 struct nfs4_sequence_args *args,
726 struct nfs4_sequence_res *res, 660 struct nfs4_sequence_res *res)
727 int privileged)
728{ 661{
729 int ret; 662 int ret;
730 struct rpc_task *task; 663 struct rpc_task *task;
@@ -740,8 +673,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
740 .callback_data = &data 673 .callback_data = &data
741 }; 674 };
742 675
743 if (privileged)
744 task_setup.callback_ops = &nfs41_call_priv_sync_ops;
745 task = rpc_run_task(&task_setup); 676 task = rpc_run_task(&task_setup);
746 if (IS_ERR(task)) 677 if (IS_ERR(task))
747 ret = PTR_ERR(task); 678 ret = PTR_ERR(task);
@@ -752,24 +683,18 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
752 return ret; 683 return ret;
753} 684}
754 685
755int _nfs4_call_sync_session(struct rpc_clnt *clnt,
756 struct nfs_server *server,
757 struct rpc_message *msg,
758 struct nfs4_sequence_args *args,
759 struct nfs4_sequence_res *res,
760 int cache_reply)
761{
762 nfs41_init_sequence(args, res, cache_reply);
763 return nfs4_call_sync_sequence(clnt, server, msg, args, res, 0);
764}
765
766#else 686#else
767static inline 687static
768void nfs41_init_sequence(struct nfs4_sequence_args *args, 688void nfs41_init_sequence(struct nfs4_sequence_args *args,
769 struct nfs4_sequence_res *res, int cache_reply) 689 struct nfs4_sequence_res *res, int cache_reply)
770{ 690{
771} 691}
772 692
693static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
694{
695}
696
697
773static int nfs4_sequence_done(struct rpc_task *task, 698static int nfs4_sequence_done(struct rpc_task *task,
774 struct nfs4_sequence_res *res) 699 struct nfs4_sequence_res *res)
775{ 700{
@@ -777,18 +702,17 @@ static int nfs4_sequence_done(struct rpc_task *task,
777} 702}
778#endif /* CONFIG_NFS_V4_1 */ 703#endif /* CONFIG_NFS_V4_1 */
779 704
705static
780int _nfs4_call_sync(struct rpc_clnt *clnt, 706int _nfs4_call_sync(struct rpc_clnt *clnt,
781 struct nfs_server *server, 707 struct nfs_server *server,
782 struct rpc_message *msg, 708 struct rpc_message *msg,
783 struct nfs4_sequence_args *args, 709 struct nfs4_sequence_args *args,
784 struct nfs4_sequence_res *res, 710 struct nfs4_sequence_res *res)
785 int cache_reply)
786{ 711{
787 nfs41_init_sequence(args, res, cache_reply);
788 return rpc_call_sync(clnt, msg, 0); 712 return rpc_call_sync(clnt, msg, 0);
789} 713}
790 714
791static inline 715static
792int nfs4_call_sync(struct rpc_clnt *clnt, 716int nfs4_call_sync(struct rpc_clnt *clnt,
793 struct nfs_server *server, 717 struct nfs_server *server,
794 struct rpc_message *msg, 718 struct rpc_message *msg,
@@ -796,8 +720,9 @@ int nfs4_call_sync(struct rpc_clnt *clnt,
796 struct nfs4_sequence_res *res, 720 struct nfs4_sequence_res *res,
797 int cache_reply) 721 int cache_reply)
798{ 722{
723 nfs41_init_sequence(args, res, cache_reply);
799 return server->nfs_client->cl_mvops->call_sync(clnt, server, msg, 724 return server->nfs_client->cl_mvops->call_sync(clnt, server, msg,
800 args, res, cache_reply); 725 args, res);
801} 726}
802 727
803static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) 728static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
@@ -809,6 +734,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
809 if (!cinfo->atomic || cinfo->before != dir->i_version) 734 if (!cinfo->atomic || cinfo->before != dir->i_version)
810 nfs_force_lookup_revalidate(dir); 735 nfs_force_lookup_revalidate(dir);
811 dir->i_version = cinfo->after; 736 dir->i_version = cinfo->after;
737 nfs_fscache_invalidate(dir);
812 spin_unlock(&dir->i_lock); 738 spin_unlock(&dir->i_lock);
813} 739}
814 740
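The revalidation test that the hunk above extends with an fscache invalidation can be read as a single predicate on the change_info returned by the server. A sketch with hypothetical _demo stand-ins for struct nfs4_change_info and the cached i_version:

#include <stdbool.h>
#include <stdint.h>

struct change_info_demo {
	bool atomic;		/* server applied the change atomically */
	uint64_t before, after;
};

static bool must_revalidate_demo(const struct change_info_demo *ci,
				 uint64_t cached_i_version)
{
	/* A non-atomic update, or a "before" value that no longer
	 * matches what we cached, means other changes slipped in. */
	return !ci->atomic || ci->before != cached_i_version;
}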
@@ -1445,13 +1371,6 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1445 nfs_inode_find_state_and_recover(state->inode, 1371 nfs_inode_find_state_and_recover(state->inode,
1446 stateid); 1372 stateid);
1447 nfs4_schedule_stateid_recovery(server, state); 1373 nfs4_schedule_stateid_recovery(server, state);
1448 case -EKEYEXPIRED:
1449 /*
1450 * User RPCSEC_GSS context has expired.
1451 * We cannot recover this stateid now, so
1452 * skip it and allow recovery thread to
1453 * proceed.
1454 */
1455 case -ENOMEM: 1374 case -ENOMEM:
1456 err = 0; 1375 err = 0;
1457 goto out; 1376 goto out;
@@ -1574,20 +1493,12 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
1574 &data->o_res.seq_res, 1493 &data->o_res.seq_res,
1575 task) != 0) 1494 task) != 0)
1576 nfs_release_seqid(data->o_arg.seqid); 1495 nfs_release_seqid(data->o_arg.seqid);
1577 else
1578 rpc_call_start(task);
1579 return; 1496 return;
1580unlock_no_action: 1497unlock_no_action:
1581 rcu_read_unlock(); 1498 rcu_read_unlock();
1582out_no_action: 1499out_no_action:
1583 task->tk_action = NULL; 1500 task->tk_action = NULL;
1584 1501 nfs4_sequence_done(task, &data->o_res.seq_res);
1585}
1586
1587static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata)
1588{
1589 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
1590 nfs4_open_prepare(task, calldata);
1591} 1502}
1592 1503
1593static void nfs4_open_done(struct rpc_task *task, void *calldata) 1504static void nfs4_open_done(struct rpc_task *task, void *calldata)
@@ -1648,12 +1559,6 @@ static const struct rpc_call_ops nfs4_open_ops = {
1648 .rpc_release = nfs4_open_release, 1559 .rpc_release = nfs4_open_release,
1649}; 1560};
1650 1561
1651static const struct rpc_call_ops nfs4_recover_open_ops = {
1652 .rpc_call_prepare = nfs4_recover_open_prepare,
1653 .rpc_call_done = nfs4_open_done,
1654 .rpc_release = nfs4_open_release,
1655};
1656
1657static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) 1562static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
1658{ 1563{
1659 struct inode *dir = data->dir->d_inode; 1564 struct inode *dir = data->dir->d_inode;
@@ -1683,7 +1588,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
1683 data->rpc_status = 0; 1588 data->rpc_status = 0;
1684 data->cancelled = 0; 1589 data->cancelled = 0;
1685 if (isrecover) 1590 if (isrecover)
1686 task_setup_data.callback_ops = &nfs4_recover_open_ops; 1591 nfs4_set_sequence_privileged(&o_arg->seq_args);
1687 task = rpc_run_task(&task_setup_data); 1592 task = rpc_run_task(&task_setup_data);
1688 if (IS_ERR(task)) 1593 if (IS_ERR(task))
1689 return PTR_ERR(task); 1594 return PTR_ERR(task);
@@ -1721,7 +1626,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
1721 1626
1722static int nfs4_opendata_access(struct rpc_cred *cred, 1627static int nfs4_opendata_access(struct rpc_cred *cred,
1723 struct nfs4_opendata *opendata, 1628 struct nfs4_opendata *opendata,
1724 struct nfs4_state *state, fmode_t fmode) 1629 struct nfs4_state *state, fmode_t fmode,
1630 int openflags)
1725{ 1631{
1726 struct nfs_access_entry cache; 1632 struct nfs_access_entry cache;
1727 u32 mask; 1633 u32 mask;
@@ -1733,11 +1639,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
1733 1639
1734 mask = 0; 1640 mask = 0;
1735 /* don't check MAY_WRITE - a newly created file may not have 1641 /* don't check MAY_WRITE - a newly created file may not have
1736 * write mode bits, but POSIX allows the creating process to write */ 1642 * write mode bits, but POSIX allows the creating process to write.
1737 if (fmode & FMODE_READ) 1643 * use openflags to check for exec, because fmode won't
1738 mask |= MAY_READ; 1644 * always have FMODE_EXEC set when file open for exec. */
1739 if (fmode & FMODE_EXEC) 1645 if (openflags & __FMODE_EXEC) {
1740 mask |= MAY_EXEC; 1646 /* ONLY check for exec rights */
1647 mask = MAY_EXEC;
1648 } else if (fmode & FMODE_READ)
1649 mask = MAY_READ;
1741 1650
1742 cache.cred = cred; 1651 cache.cred = cred;
1743 cache.jiffies = jiffies; 1652 cache.jiffies = jiffies;
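The new mask selection above consults openflags because fmode does not reliably carry FMODE_EXEC when a file is opened for exec. A sketch of the resulting decision, with illustrative flag values (not the kernel's):

#define FMODE_READ_DEMO   0x1u
#define __FMODE_EXEC_DEMO 0x20u
#define MAY_READ_DEMO     0x4u
#define MAY_EXEC_DEMO     0x1u

static unsigned int open_access_mask_demo(unsigned int fmode,
					  unsigned int openflags)
{
	if (openflags & __FMODE_EXEC_DEMO)
		return MAY_EXEC_DEMO;	/* exec(): check exec bits only */
	if (fmode & FMODE_READ_DEMO)
		return MAY_READ_DEMO;
	return 0;	/* never MAY_WRITE: POSIX lets the creator write */
}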
@@ -1789,24 +1698,6 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1789 return 0; 1698 return 0;
1790} 1699}
1791 1700
1792static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
1793{
1794 unsigned int loop;
1795 int ret;
1796
1797 for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
1798 ret = nfs4_wait_clnt_recover(clp);
1799 if (ret != 0)
1800 break;
1801 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1802 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
1803 break;
1804 nfs4_schedule_state_manager(clp);
1805 ret = -EIO;
1806 }
1807 return ret;
1808}
1809
1810static int nfs4_recover_expired_lease(struct nfs_server *server) 1701static int nfs4_recover_expired_lease(struct nfs_server *server)
1811{ 1702{
1812 return nfs4_client_recover_expired_lease(server->nfs_client); 1703 return nfs4_client_recover_expired_lease(server->nfs_client);
@@ -2009,7 +1900,7 @@ static int _nfs4_do_open(struct inode *dir,
2009 if (server->caps & NFS_CAP_POSIX_LOCK) 1900 if (server->caps & NFS_CAP_POSIX_LOCK)
2010 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); 1901 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
2011 1902
2012 status = nfs4_opendata_access(cred, opendata, state, fmode); 1903 status = nfs4_opendata_access(cred, opendata, state, fmode, flags);
2013 if (status != 0) 1904 if (status != 0)
2014 goto err_opendata_put; 1905 goto err_opendata_put;
2015 1906
@@ -2282,6 +2173,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2282 if (!call_close) { 2173 if (!call_close) {
2283 /* Note: exit _without_ calling nfs4_close_done */ 2174 /* Note: exit _without_ calling nfs4_close_done */
2284 task->tk_action = NULL; 2175 task->tk_action = NULL;
2176 nfs4_sequence_done(task, &calldata->res.seq_res);
2285 goto out; 2177 goto out;
2286 } 2178 }
2287 2179
@@ -2299,8 +2191,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2299 &calldata->res.seq_res, 2191 &calldata->res.seq_res,
2300 task) != 0) 2192 task) != 0)
2301 nfs_release_seqid(calldata->arg.seqid); 2193 nfs_release_seqid(calldata->arg.seqid);
2302 else
2303 rpc_call_start(task);
2304out: 2194out:
2305 dprintk("%s: done!\n", __func__); 2195 dprintk("%s: done!\n", __func__);
2306} 2196}
@@ -2533,7 +2423,8 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
2533 rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS]; 2423 rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS];
2534 2424
2535 len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array)); 2425 len = rpcauth_list_flavors(flav_array, ARRAY_SIZE(flav_array));
2536 BUG_ON(len < 0); 2426 if (len < 0)
2427 return len;
2537 2428
2538 for (i = 0; i < len; i++) { 2429 for (i = 0; i < len; i++) {
2539 /* AUTH_UNIX is the default flavor if none was specified, 2430 /* AUTH_UNIX is the default flavor if none was specified,
@@ -3038,12 +2929,10 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
3038 2929
3039static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) 2930static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data)
3040{ 2931{
3041 if (nfs4_setup_sequence(NFS_SERVER(data->dir), 2932 nfs4_setup_sequence(NFS_SERVER(data->dir),
3042 &data->args.seq_args, 2933 &data->args.seq_args,
3043 &data->res.seq_res, 2934 &data->res.seq_res,
3044 task)) 2935 task);
3045 return;
3046 rpc_call_start(task);
3047} 2936}
3048 2937
3049static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) 2938static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
@@ -3071,12 +2960,10 @@ static void nfs4_proc_rename_setup(struct rpc_message *msg, struct inode *dir)
3071 2960
3072static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data) 2961static void nfs4_proc_rename_rpc_prepare(struct rpc_task *task, struct nfs_renamedata *data)
3073{ 2962{
3074 if (nfs4_setup_sequence(NFS_SERVER(data->old_dir), 2963 nfs4_setup_sequence(NFS_SERVER(data->old_dir),
3075 &data->args.seq_args, 2964 &data->args.seq_args,
3076 &data->res.seq_res, 2965 &data->res.seq_res,
3077 task)) 2966 task);
3078 return;
3079 rpc_call_start(task);
3080} 2967}
3081 2968
3082static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, 2969static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
@@ -3362,9 +3249,6 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
3362 int mode = sattr->ia_mode; 3249 int mode = sattr->ia_mode;
3363 int status = -ENOMEM; 3250 int status = -ENOMEM;
3364 3251
3365 BUG_ON(!(sattr->ia_valid & ATTR_MODE));
3366 BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode));
3367
3368 data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK); 3252 data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);
3369 if (data == NULL) 3253 if (data == NULL)
3370 goto out; 3254 goto out;
@@ -3380,10 +3264,13 @@ static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
3380 data->arg.ftype = NF4CHR; 3264 data->arg.ftype = NF4CHR;
3381 data->arg.u.device.specdata1 = MAJOR(rdev); 3265 data->arg.u.device.specdata1 = MAJOR(rdev);
3382 data->arg.u.device.specdata2 = MINOR(rdev); 3266 data->arg.u.device.specdata2 = MINOR(rdev);
3267 } else if (!S_ISSOCK(mode)) {
3268 status = -EINVAL;
3269 goto out_free;
3383 } 3270 }
3384 3271
3385 status = nfs4_do_create(dir, dentry, data); 3272 status = nfs4_do_create(dir, dentry, data);
3386 3273out_free:
3387 nfs4_free_createdata(data); 3274 nfs4_free_createdata(data);
3388out: 3275out:
3389 return status; 3276 return status;
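With the BUG_ON pair gone, _nfs4_proc_mknod now rejects unsupported file types with -EINVAL instead of crashing. A sketch of the resulting mode-to-type mapping, with hypothetical NF4*_DEMO values:

#include <errno.h>
#include <sys/stat.h>

enum { NF4FIFO_DEMO = 1, NF4BLK_DEMO, NF4CHR_DEMO, NF4SOCK_DEMO };

static int mode_to_nf4type_demo(mode_t mode)
{
	if (S_ISFIFO(mode))
		return NF4FIFO_DEMO;
	if (S_ISBLK(mode))
		return NF4BLK_DEMO;
	if (S_ISCHR(mode))
		return NF4CHR_DEMO;
	if (S_ISSOCK(mode))
		return NF4SOCK_DEMO;
	return -EINVAL;		/* caller frees createdata and bails out */
}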
@@ -3565,12 +3452,10 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message
3565 3452
3566static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) 3453static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
3567{ 3454{
3568 if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), 3455 nfs4_setup_sequence(NFS_SERVER(data->header->inode),
3569 &data->args.seq_args, 3456 &data->args.seq_args,
3570 &data->res.seq_res, 3457 &data->res.seq_res,
3571 task)) 3458 task);
3572 return;
3573 rpc_call_start(task);
3574} 3459}
3575 3460
3576static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) 3461static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
@@ -3631,22 +3516,18 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
3631 3516
3632static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) 3517static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
3633{ 3518{
3634 if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), 3519 nfs4_setup_sequence(NFS_SERVER(data->header->inode),
3635 &data->args.seq_args, 3520 &data->args.seq_args,
3636 &data->res.seq_res, 3521 &data->res.seq_res,
3637 task)) 3522 task);
3638 return;
3639 rpc_call_start(task);
3640} 3523}
3641 3524
3642static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) 3525static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
3643{ 3526{
3644 if (nfs4_setup_sequence(NFS_SERVER(data->inode), 3527 nfs4_setup_sequence(NFS_SERVER(data->inode),
3645 &data->args.seq_args, 3528 &data->args.seq_args,
3646 &data->res.seq_res, 3529 &data->res.seq_res,
3647 task)) 3530 task);
3648 return;
3649 rpc_call_start(task);
3650} 3531}
3651 3532
3652static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) 3533static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data)
@@ -3937,8 +3818,13 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
3937 goto out_free; 3818 goto out_free;
3938 } 3819 }
3939 nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); 3820 nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len);
3940 if (buf) 3821 if (buf) {
3822 if (res.acl_len > buflen) {
3823 ret = -ERANGE;
3824 goto out_free;
3825 }
3941 _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len); 3826 _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len);
3827 }
3942out_ok: 3828out_ok:
3943 ret = res.acl_len; 3829 ret = res.acl_len;
3944out_free: 3830out_free:
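The added buflen check gives __nfs4_get_acl_uncached the usual getxattr contract: a NULL buffer probes the required size, a too-small buffer fails with -ERANGE. A sketch of that contract (copy_acl_demo is a hypothetical helper):

#include <errno.h>
#include <stddef.h>
#include <string.h>

static long copy_acl_demo(char *buf, size_t buflen,
			  const char *acl, size_t acl_len)
{
	if (buf) {
		if (acl_len > buflen)
			return -ERANGE;	/* caller's buffer is too small */
		memcpy(buf, acl, acl_len);
	}
	return (long)acl_len;	/* NULL buf just probes the size */
}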
@@ -4085,7 +3971,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
4085 case -NFS4ERR_DELAY: 3971 case -NFS4ERR_DELAY:
4086 nfs_inc_server_stats(server, NFSIOS_DELAY); 3972 nfs_inc_server_stats(server, NFSIOS_DELAY);
4087 case -NFS4ERR_GRACE: 3973 case -NFS4ERR_GRACE:
4088 case -EKEYEXPIRED:
4089 rpc_delay(task, NFS4_POLL_RETRY_MAX); 3974 rpc_delay(task, NFS4_POLL_RETRY_MAX);
4090 task->tk_status = 0; 3975 task->tk_status = 0;
4091 return -EAGAIN; 3976 return -EAGAIN;
@@ -4293,11 +4178,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
4293 4178
4294 d_data = (struct nfs4_delegreturndata *)data; 4179 d_data = (struct nfs4_delegreturndata *)data;
4295 4180
4296 if (nfs4_setup_sequence(d_data->res.server, 4181 nfs4_setup_sequence(d_data->res.server,
4297 &d_data->args.seq_args, 4182 &d_data->args.seq_args,
4298 &d_data->res.seq_res, task)) 4183 &d_data->res.seq_res,
4299 return; 4184 task);
4300 rpc_call_start(task);
4301} 4185}
4302#endif /* CONFIG_NFS_V4_1 */ 4186#endif /* CONFIG_NFS_V4_1 */
4303 4187
@@ -4543,6 +4427,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
4543 if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { 4427 if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
4544 /* Note: exit _without_ running nfs4_locku_done */ 4428 /* Note: exit _without_ running nfs4_locku_done */
4545 task->tk_action = NULL; 4429 task->tk_action = NULL;
4430 nfs4_sequence_done(task, &calldata->res.seq_res);
4546 return; 4431 return;
4547 } 4432 }
4548 calldata->timestamp = jiffies; 4433 calldata->timestamp = jiffies;
@@ -4551,8 +4436,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
4551 &calldata->res.seq_res, 4436 &calldata->res.seq_res,
4552 task) != 0) 4437 task) != 0)
4553 nfs_release_seqid(calldata->arg.seqid); 4438 nfs_release_seqid(calldata->arg.seqid);
4554 else
4555 rpc_call_start(task);
4556} 4439}
4557 4440
4558static const struct rpc_call_ops nfs4_locku_ops = { 4441static const struct rpc_call_ops nfs4_locku_ops = {
@@ -4696,8 +4579,9 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
4696 return; 4579 return;
4697 /* Do we need to do an open_to_lock_owner? */ 4580 /* Do we need to do an open_to_lock_owner? */
4698 if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { 4581 if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
4699 if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) 4582 if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
4700 goto out_release_lock_seqid; 4583 goto out_release_lock_seqid;
4584 }
4701 data->arg.open_stateid = &state->stateid; 4585 data->arg.open_stateid = &state->stateid;
4702 data->arg.new_lock_owner = 1; 4586 data->arg.new_lock_owner = 1;
4703 data->res.open_seqid = data->arg.open_seqid; 4587 data->res.open_seqid = data->arg.open_seqid;
@@ -4707,20 +4591,12 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
4707 if (nfs4_setup_sequence(data->server, 4591 if (nfs4_setup_sequence(data->server,
4708 &data->arg.seq_args, 4592 &data->arg.seq_args,
4709 &data->res.seq_res, 4593 &data->res.seq_res,
4710 task) == 0) { 4594 task) == 0)
4711 rpc_call_start(task);
4712 return; 4595 return;
4713 }
4714 nfs_release_seqid(data->arg.open_seqid); 4596 nfs_release_seqid(data->arg.open_seqid);
4715out_release_lock_seqid: 4597out_release_lock_seqid:
4716 nfs_release_seqid(data->arg.lock_seqid); 4598 nfs_release_seqid(data->arg.lock_seqid);
4717 dprintk("%s: done, ret = %d\n", __func__, task->tk_status); 4599 dprintk("%s: done, ret = %d\n", __func__, data->rpc_status);
4718}
4719
4720static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata)
4721{
4722 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
4723 nfs4_lock_prepare(task, calldata);
4724} 4600}
4725 4601
4726static void nfs4_lock_done(struct rpc_task *task, void *calldata) 4602static void nfs4_lock_done(struct rpc_task *task, void *calldata)
@@ -4775,12 +4651,6 @@ static const struct rpc_call_ops nfs4_lock_ops = {
4775 .rpc_release = nfs4_lock_release, 4651 .rpc_release = nfs4_lock_release,
4776}; 4652};
4777 4653
4778static const struct rpc_call_ops nfs4_recover_lock_ops = {
4779 .rpc_call_prepare = nfs4_recover_lock_prepare,
4780 .rpc_call_done = nfs4_lock_done,
4781 .rpc_release = nfs4_lock_release,
4782};
4783
4784static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) 4654static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
4785{ 4655{
4786 switch (error) { 4656 switch (error) {
@@ -4823,15 +4693,15 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
4823 return -ENOMEM; 4693 return -ENOMEM;
4824 if (IS_SETLKW(cmd)) 4694 if (IS_SETLKW(cmd))
4825 data->arg.block = 1; 4695 data->arg.block = 1;
4826 if (recovery_type > NFS_LOCK_NEW) {
4827 if (recovery_type == NFS_LOCK_RECLAIM)
4828 data->arg.reclaim = NFS_LOCK_RECLAIM;
4829 task_setup_data.callback_ops = &nfs4_recover_lock_ops;
4830 }
4831 nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1); 4696 nfs41_init_sequence(&data->arg.seq_args, &data->res.seq_res, 1);
4832 msg.rpc_argp = &data->arg; 4697 msg.rpc_argp = &data->arg;
4833 msg.rpc_resp = &data->res; 4698 msg.rpc_resp = &data->res;
4834 task_setup_data.callback_data = data; 4699 task_setup_data.callback_data = data;
4700 if (recovery_type > NFS_LOCK_NEW) {
4701 if (recovery_type == NFS_LOCK_RECLAIM)
4702 data->arg.reclaim = NFS_LOCK_RECLAIM;
4703 nfs4_set_sequence_privileged(&data->arg.seq_args);
4704 }
4835 task = rpc_run_task(&task_setup_data); 4705 task = rpc_run_task(&task_setup_data);
4836 if (IS_ERR(task)) 4706 if (IS_ERR(task))
4837 return PTR_ERR(task); 4707 return PTR_ERR(task);
@@ -5100,15 +4970,6 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
5100 nfs4_schedule_stateid_recovery(server, state); 4970 nfs4_schedule_stateid_recovery(server, state);
5101 err = 0; 4971 err = 0;
5102 goto out; 4972 goto out;
5103 case -EKEYEXPIRED:
5104 /*
5105 * User RPCSEC_GSS context has expired.
5106 * We cannot recover this stateid now, so
5107 * skip it and allow recovery thread to
5108 * proceed.
5109 */
5110 err = 0;
5111 goto out;
5112 case -ENOMEM: 4973 case -ENOMEM:
5113 case -NFS4ERR_DENIED: 4974 case -NFS4ERR_DENIED:
5114 /* kill_proc(fl->fl_pid, SIGLOST, 1); */ 4975 /* kill_proc(fl->fl_pid, SIGLOST, 1); */
@@ -5357,7 +5218,6 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
5357 }; 5218 };
5358 5219
5359 dprintk("--> %s\n", __func__); 5220 dprintk("--> %s\n", __func__);
5360 BUG_ON(clp == NULL);
5361 5221
5362 res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); 5222 res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
5363 if (unlikely(res.session == NULL)) { 5223 if (unlikely(res.session == NULL)) {
@@ -5569,20 +5429,16 @@ struct nfs4_get_lease_time_data {
5569static void nfs4_get_lease_time_prepare(struct rpc_task *task, 5429static void nfs4_get_lease_time_prepare(struct rpc_task *task,
5570 void *calldata) 5430 void *calldata)
5571{ 5431{
5572 int ret;
5573 struct nfs4_get_lease_time_data *data = 5432 struct nfs4_get_lease_time_data *data =
5574 (struct nfs4_get_lease_time_data *)calldata; 5433 (struct nfs4_get_lease_time_data *)calldata;
5575 5434
5576 dprintk("--> %s\n", __func__); 5435 dprintk("--> %s\n", __func__);
5577 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
5578 /* just set up the sequence, do not trigger session recovery 5436 /* just set up the sequence, do not trigger session recovery
5579 since we're invoked within one */ 5437 since we're invoked within one */
5580 ret = nfs41_setup_sequence(data->clp->cl_session, 5438 nfs41_setup_sequence(data->clp->cl_session,
5581 &data->args->la_seq_args, 5439 &data->args->la_seq_args,
5582 &data->res->lr_seq_res, task); 5440 &data->res->lr_seq_res,
5583 5441 task);
5584 BUG_ON(ret == -EAGAIN);
5585 rpc_call_start(task);
5586 dprintk("<-- %s\n", __func__); 5442 dprintk("<-- %s\n", __func__);
5587} 5443}
5588 5444
@@ -5644,6 +5500,7 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
5644 int status; 5500 int status;
5645 5501
5646 nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); 5502 nfs41_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0);
5503 nfs4_set_sequence_privileged(&args.la_seq_args);
5647 dprintk("--> %s\n", __func__); 5504 dprintk("--> %s\n", __func__);
5648 task = rpc_run_task(&task_setup); 5505 task = rpc_run_task(&task_setup);
5649 5506
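The pattern above — nfs41_init_sequence() followed by nfs4_set_sequence_privileged() — is how callers running inside state recovery now mark their own requests. A sketch of the pairing, with a two-field struct as a stand-in for struct nfs4_sequence_args (hypothetical _demo names):

struct seq_args_demo {
	void *sa_slot;
	unsigned char sa_cache_this : 1;
	unsigned char sa_privileged : 1;
};

static void init_sequence_demo(struct seq_args_demo *args, int cache_reply)
{
	args->sa_slot = 0;
	args->sa_cache_this = cache_reply ? 1 : 0;
	args->sa_privileged = 0;	/* privilege is opt-in, per call */
}

static void set_privileged_demo(struct seq_args_demo *args)
{
	args->sa_privileged = 1;
}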
@@ -5658,145 +5515,6 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
5658 return status; 5515 return status;
5659} 5516}
5660 5517
5661static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags)
5662{
5663 return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags);
5664}
5665
5666static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl,
5667 struct nfs4_slot *new,
5668 u32 max_slots,
5669 u32 ivalue)
5670{
5671 struct nfs4_slot *old = NULL;
5672 u32 i;
5673
5674 spin_lock(&tbl->slot_tbl_lock);
5675 if (new) {
5676 old = tbl->slots;
5677 tbl->slots = new;
5678 tbl->max_slots = max_slots;
5679 }
5680 tbl->highest_used_slotid = NFS4_NO_SLOT;
5681 for (i = 0; i < tbl->max_slots; i++)
5682 tbl->slots[i].seq_nr = ivalue;
5683 spin_unlock(&tbl->slot_tbl_lock);
5684 kfree(old);
5685}
5686
5687/*
5688 * (re)Initialise a slot table
5689 */
5690static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
5691 u32 ivalue)
5692{
5693 struct nfs4_slot *new = NULL;
5694 int ret = -ENOMEM;
5695
5696 dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
5697 max_reqs, tbl->max_slots);
5698
5699 /* Does the newly negotiated max_reqs match the existing slot table? */
5700 if (max_reqs != tbl->max_slots) {
5701 new = nfs4_alloc_slots(max_reqs, GFP_NOFS);
5702 if (!new)
5703 goto out;
5704 }
5705 ret = 0;
5706
5707 nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue);
5708 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
5709 tbl, tbl->slots, tbl->max_slots);
5710out:
5711 dprintk("<-- %s: return %d\n", __func__, ret);
5712 return ret;
5713}
5714
5715/* Destroy the slot table */
5716static void nfs4_destroy_slot_tables(struct nfs4_session *session)
5717{
5718 if (session->fc_slot_table.slots != NULL) {
5719 kfree(session->fc_slot_table.slots);
5720 session->fc_slot_table.slots = NULL;
5721 }
5722 if (session->bc_slot_table.slots != NULL) {
5723 kfree(session->bc_slot_table.slots);
5724 session->bc_slot_table.slots = NULL;
5725 }
5726 return;
5727}
5728
5729/*
5730 * Initialize or reset the forechannel and backchannel tables
5731 */
5732static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
5733{
5734 struct nfs4_slot_table *tbl;
5735 int status;
5736
5737 dprintk("--> %s\n", __func__);
5738 /* Fore channel */
5739 tbl = &ses->fc_slot_table;
5740 status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
5741 if (status) /* -ENOMEM */
5742 return status;
5743 /* Back channel */
5744 tbl = &ses->bc_slot_table;
5745 status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
5746 if (status && tbl->slots == NULL)
5747 /* Fore and back channel share a connection so get
5748 * both slot tables or neither */
5749 nfs4_destroy_slot_tables(ses);
5750 return status;
5751}
5752
5753struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
5754{
5755 struct nfs4_session *session;
5756 struct nfs4_slot_table *tbl;
5757
5758 session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
5759 if (!session)
5760 return NULL;
5761
5762 tbl = &session->fc_slot_table;
5763 tbl->highest_used_slotid = NFS4_NO_SLOT;
5764 spin_lock_init(&tbl->slot_tbl_lock);
5765 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
5766 init_completion(&tbl->complete);
5767
5768 tbl = &session->bc_slot_table;
5769 tbl->highest_used_slotid = NFS4_NO_SLOT;
5770 spin_lock_init(&tbl->slot_tbl_lock);
5771 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
5772 init_completion(&tbl->complete);
5773
5774 session->session_state = 1<<NFS4_SESSION_INITING;
5775
5776 session->clp = clp;
5777 return session;
5778}
5779
5780void nfs4_destroy_session(struct nfs4_session *session)
5781{
5782 struct rpc_xprt *xprt;
5783 struct rpc_cred *cred;
5784
5785 cred = nfs4_get_exchange_id_cred(session->clp);
5786 nfs4_proc_destroy_session(session, cred);
5787 if (cred)
5788 put_rpccred(cred);
5789
5790 rcu_read_lock();
5791 xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
5792 rcu_read_unlock();
5793 dprintk("%s Destroy backchannel for xprt %p\n",
5794 __func__, xprt);
5795 xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
5796 nfs4_destroy_slot_tables(session);
5797 kfree(session);
5798}
5799
5800/* 5518/*
5801 * Initialize the values to be used by the client in CREATE_SESSION 5519 * Initialize the values to be used by the client in CREATE_SESSION
5802 * If nfs4_init_session has set the fore channel request and response sizes, 5520 * If nfs4_init_session has set the fore channel request and response sizes,
@@ -5809,8 +5527,8 @@ void nfs4_destroy_session(struct nfs4_session *session)
5809static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) 5527static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
5810{ 5528{
5811 struct nfs4_session *session = args->client->cl_session; 5529 struct nfs4_session *session = args->client->cl_session;
5812 unsigned int mxrqst_sz = session->fc_attrs.max_rqst_sz, 5530 unsigned int mxrqst_sz = session->fc_target_max_rqst_sz,
5813 mxresp_sz = session->fc_attrs.max_resp_sz; 5531 mxresp_sz = session->fc_target_max_resp_sz;
5814 5532
5815 if (mxrqst_sz == 0) 5533 if (mxrqst_sz == 0)
5816 mxrqst_sz = NFS_MAX_FILE_IO_SIZE; 5534 mxrqst_sz = NFS_MAX_FILE_IO_SIZE;
@@ -5919,10 +5637,9 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
5919 5637
5920 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 5638 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
5921 5639
5922 if (!status) 5640 if (!status) {
5923 /* Verify the session's negotiated channel_attrs values */ 5641 /* Verify the session's negotiated channel_attrs values */
5924 status = nfs4_verify_channel_attrs(&args, session); 5642 status = nfs4_verify_channel_attrs(&args, session);
5925 if (!status) {
5926 /* Increment the clientid slot sequence id */ 5643 /* Increment the clientid slot sequence id */
5927 clp->cl_seqid++; 5644 clp->cl_seqid++;
5928 } 5645 }
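The re-bracing above changes when cl_seqid advances: the clientid slot sequence id is now bumped whenever the server processed our CREATE_SESSION call, even if the negotiated channel attributes fail local verification. A sketch of the resulting rule (_demo names are hypothetical):

static int create_session_demo(int rpc_status, int verify_status,
			       unsigned int *cl_seqid)
{
	int status = rpc_status;

	if (!status) {
		status = verify_status;	/* verify channel attrs */
		++*cl_seqid;		/* server consumed this seqid */
	}
	return status;
}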
@@ -5992,83 +5709,6 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
5992} 5709}
5993 5710
5994/* 5711/*
5995 * With sessions, the client is not marked ready until after a
5996 * successful EXCHANGE_ID and CREATE_SESSION.
5997 *
5998 * Map errors cl_cons_state errors to EPROTONOSUPPORT to indicate
5999 * other versions of NFS can be tried.
6000 */
6001static int nfs41_check_session_ready(struct nfs_client *clp)
6002{
6003 int ret;
6004
6005 if (clp->cl_cons_state == NFS_CS_SESSION_INITING) {
6006 ret = nfs4_client_recover_expired_lease(clp);
6007 if (ret)
6008 return ret;
6009 }
6010 if (clp->cl_cons_state < NFS_CS_READY)
6011 return -EPROTONOSUPPORT;
6012 smp_rmb();
6013 return 0;
6014}
6015
6016int nfs4_init_session(struct nfs_server *server)
6017{
6018 struct nfs_client *clp = server->nfs_client;
6019 struct nfs4_session *session;
6020 unsigned int rsize, wsize;
6021
6022 if (!nfs4_has_session(clp))
6023 return 0;
6024
6025 session = clp->cl_session;
6026 spin_lock(&clp->cl_lock);
6027 if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
6028
6029 rsize = server->rsize;
6030 if (rsize == 0)
6031 rsize = NFS_MAX_FILE_IO_SIZE;
6032 wsize = server->wsize;
6033 if (wsize == 0)
6034 wsize = NFS_MAX_FILE_IO_SIZE;
6035
6036 session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
6037 session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
6038 }
6039 spin_unlock(&clp->cl_lock);
6040
6041 return nfs41_check_session_ready(clp);
6042}
6043
6044int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
6045{
6046 struct nfs4_session *session = clp->cl_session;
6047 int ret;
6048
6049 spin_lock(&clp->cl_lock);
6050 if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
6051 /*
6052 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the
6053 * DS lease to be equal to the MDS lease.
6054 */
6055 clp->cl_lease_time = lease_time;
6056 clp->cl_last_renewal = jiffies;
6057 }
6058 spin_unlock(&clp->cl_lock);
6059
6060 ret = nfs41_check_session_ready(clp);
6061 if (ret)
6062 return ret;
6063 /* Test for the DS role */
6064 if (!is_ds_client(clp))
6065 return -ENODEV;
6066 return 0;
6067}
6068EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
6069
6070
6071/*
6072 * Renew the cl_session lease. 5712 * Renew the cl_session lease.
6073 */ 5713 */
6074struct nfs4_sequence_data { 5714struct nfs4_sequence_data {
@@ -6133,9 +5773,7 @@ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
6133 args = task->tk_msg.rpc_argp; 5773 args = task->tk_msg.rpc_argp;
6134 res = task->tk_msg.rpc_resp; 5774 res = task->tk_msg.rpc_resp;
6135 5775
6136 if (nfs41_setup_sequence(clp->cl_session, args, res, task)) 5776 nfs41_setup_sequence(clp->cl_session, args, res, task);
6137 return;
6138 rpc_call_start(task);
6139} 5777}
6140 5778
6141static const struct rpc_call_ops nfs41_sequence_ops = { 5779static const struct rpc_call_ops nfs41_sequence_ops = {
@@ -6144,7 +5782,9 @@ static const struct rpc_call_ops nfs41_sequence_ops = {
6144 .rpc_release = nfs41_sequence_release, 5782 .rpc_release = nfs41_sequence_release,
6145}; 5783};
6146 5784
6147static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) 5785static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
5786 struct rpc_cred *cred,
5787 bool is_privileged)
6148{ 5788{
6149 struct nfs4_sequence_data *calldata; 5789 struct nfs4_sequence_data *calldata;
6150 struct rpc_message msg = { 5790 struct rpc_message msg = {
@@ -6166,6 +5806,8 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
6166 return ERR_PTR(-ENOMEM); 5806 return ERR_PTR(-ENOMEM);
6167 } 5807 }
6168 nfs41_init_sequence(&calldata->args, &calldata->res, 0); 5808 nfs41_init_sequence(&calldata->args, &calldata->res, 0);
5809 if (is_privileged)
5810 nfs4_set_sequence_privileged(&calldata->args);
6169 msg.rpc_argp = &calldata->args; 5811 msg.rpc_argp = &calldata->args;
6170 msg.rpc_resp = &calldata->res; 5812 msg.rpc_resp = &calldata->res;
6171 calldata->clp = clp; 5813 calldata->clp = clp;
@@ -6181,7 +5823,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
6181 5823
6182 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) 5824 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
6183 return 0; 5825 return 0;
6184 task = _nfs41_proc_sequence(clp, cred); 5826 task = _nfs41_proc_sequence(clp, cred, false);
6185 if (IS_ERR(task)) 5827 if (IS_ERR(task))
6186 ret = PTR_ERR(task); 5828 ret = PTR_ERR(task);
6187 else 5829 else
@@ -6195,7 +5837,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
6195 struct rpc_task *task; 5837 struct rpc_task *task;
6196 int ret; 5838 int ret;
6197 5839
6198 task = _nfs41_proc_sequence(clp, cred); 5840 task = _nfs41_proc_sequence(clp, cred, true);
6199 if (IS_ERR(task)) { 5841 if (IS_ERR(task)) {
6200 ret = PTR_ERR(task); 5842 ret = PTR_ERR(task);
6201 goto out; 5843 goto out;
@@ -6224,13 +5866,10 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
6224{ 5866{
6225 struct nfs4_reclaim_complete_data *calldata = data; 5867 struct nfs4_reclaim_complete_data *calldata = data;
6226 5868
6227 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); 5869 nfs41_setup_sequence(calldata->clp->cl_session,
6228 if (nfs41_setup_sequence(calldata->clp->cl_session, 5870 &calldata->arg.seq_args,
6229 &calldata->arg.seq_args, 5871 &calldata->res.seq_res,
6230 &calldata->res.seq_res, task)) 5872 task);
6231 return;
6232
6233 rpc_call_start(task);
6234} 5873}
6235 5874
6236static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) 5875static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
@@ -6307,6 +5946,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
6307 calldata->arg.one_fs = 0; 5946 calldata->arg.one_fs = 0;
6308 5947
6309 nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0); 5948 nfs41_init_sequence(&calldata->arg.seq_args, &calldata->res.seq_res, 0);
5949 nfs4_set_sequence_privileged(&calldata->arg.seq_args);
6310 msg.rpc_argp = &calldata->arg; 5950 msg.rpc_argp = &calldata->arg;
6311 msg.rpc_resp = &calldata->res; 5951 msg.rpc_resp = &calldata->res;
6312 task_setup_data.callback_data = calldata; 5952 task_setup_data.callback_data = calldata;
@@ -6330,6 +5970,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
6330{ 5970{
6331 struct nfs4_layoutget *lgp = calldata; 5971 struct nfs4_layoutget *lgp = calldata;
6332 struct nfs_server *server = NFS_SERVER(lgp->args.inode); 5972 struct nfs_server *server = NFS_SERVER(lgp->args.inode);
5973 struct nfs4_session *session = nfs4_get_session(server);
6333 5974
6334 dprintk("--> %s\n", __func__); 5975 dprintk("--> %s\n", __func__);
6335 /* Note there is a race here, where a CB_LAYOUTRECALL can come in 5976 /* Note there is a race here, where a CB_LAYOUTRECALL can come in
@@ -6337,16 +5978,14 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
6337 * However, that is not so catastrophic, and there seems 5978 * However, that is not so catastrophic, and there seems
6338 * to be no way to prevent it completely. 5979 * to be no way to prevent it completely.
6339 */ 5980 */
6340 if (nfs4_setup_sequence(server, &lgp->args.seq_args, 5981 if (nfs41_setup_sequence(session, &lgp->args.seq_args,
6341 &lgp->res.seq_res, task)) 5982 &lgp->res.seq_res, task))
6342 return; 5983 return;
6343 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, 5984 if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
6344 NFS_I(lgp->args.inode)->layout, 5985 NFS_I(lgp->args.inode)->layout,
6345 lgp->args.ctx->state)) { 5986 lgp->args.ctx->state)) {
6346 rpc_exit(task, NFS4_OK); 5987 rpc_exit(task, NFS4_OK);
6347 return;
6348 } 5988 }
6349 rpc_call_start(task);
6350} 5989}
6351 5990
6352static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) 5991static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
@@ -6359,7 +5998,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
6359 5998
6360 dprintk("--> %s\n", __func__); 5999 dprintk("--> %s\n", __func__);
6361 6000
6362 if (!nfs4_sequence_done(task, &lgp->res.seq_res)) 6001 if (!nfs41_sequence_done(task, &lgp->res.seq_res))
6363 goto out; 6002 goto out;
6364 6003
6365 switch (task->tk_status) { 6004 switch (task->tk_status) {
@@ -6510,10 +6149,10 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
6510 struct nfs4_layoutreturn *lrp = calldata; 6149 struct nfs4_layoutreturn *lrp = calldata;
6511 6150
6512 dprintk("--> %s\n", __func__); 6151 dprintk("--> %s\n", __func__);
6513 if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args, 6152 nfs41_setup_sequence(lrp->clp->cl_session,
6514 &lrp->res.seq_res, task)) 6153 &lrp->args.seq_args,
6515 return; 6154 &lrp->res.seq_res,
6516 rpc_call_start(task); 6155 task);
6517} 6156}
6518 6157
6519static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) 6158static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
@@ -6523,7 +6162,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
6523 6162
6524 dprintk("--> %s\n", __func__); 6163 dprintk("--> %s\n", __func__);
6525 6164
6526 if (!nfs4_sequence_done(task, &lrp->res.seq_res)) 6165 if (!nfs41_sequence_done(task, &lrp->res.seq_res))
6527 return; 6166 return;
6528 6167
6529 server = NFS_SERVER(lrp->args.inode); 6168 server = NFS_SERVER(lrp->args.inode);
@@ -6672,11 +6311,12 @@ static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
6672{ 6311{
6673 struct nfs4_layoutcommit_data *data = calldata; 6312 struct nfs4_layoutcommit_data *data = calldata;
6674 struct nfs_server *server = NFS_SERVER(data->args.inode); 6313 struct nfs_server *server = NFS_SERVER(data->args.inode);
6314 struct nfs4_session *session = nfs4_get_session(server);
6675 6315
6676 if (nfs4_setup_sequence(server, &data->args.seq_args, 6316 nfs41_setup_sequence(session,
6677 &data->res.seq_res, task)) 6317 &data->args.seq_args,
6678 return; 6318 &data->res.seq_res,
6679 rpc_call_start(task); 6319 task);
6680} 6320}
6681 6321
6682static void 6322static void
@@ -6685,7 +6325,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
6685 struct nfs4_layoutcommit_data *data = calldata; 6325 struct nfs4_layoutcommit_data *data = calldata;
6686 struct nfs_server *server = NFS_SERVER(data->args.inode); 6326 struct nfs_server *server = NFS_SERVER(data->args.inode);
6687 6327
6688 if (!nfs4_sequence_done(task, &data->res.seq_res)) 6328 if (!nfs41_sequence_done(task, &data->res.seq_res))
6689 return; 6329 return;
6690 6330
6691 switch (task->tk_status) { /* Just ignore these failures */ 6331 switch (task->tk_status) { /* Just ignore these failures */
@@ -6873,7 +6513,9 @@ static int _nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6873 6513
6874 dprintk("NFS call test_stateid %p\n", stateid); 6514 dprintk("NFS call test_stateid %p\n", stateid);
6875 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); 6515 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
6876 status = nfs4_call_sync_sequence(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); 6516 nfs4_set_sequence_privileged(&args.seq_args);
6517 status = nfs4_call_sync_sequence(server->client, server, &msg,
6518 &args.seq_args, &res.seq_res);
6877 if (status != NFS_OK) { 6519 if (status != NFS_OK) {
6878 dprintk("NFS reply test_stateid: failed, %d\n", status); 6520 dprintk("NFS reply test_stateid: failed, %d\n", status);
6879 return status; 6521 return status;
@@ -6920,8 +6562,9 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid)
6920 6562
6921 dprintk("NFS call free_stateid %p\n", stateid); 6563 dprintk("NFS call free_stateid %p\n", stateid);
6922 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); 6564 nfs41_init_sequence(&args.seq_args, &res.seq_res, 0);
6565 nfs4_set_sequence_privileged(&args.seq_args);
6923 status = nfs4_call_sync_sequence(server->client, server, &msg, 6566 status = nfs4_call_sync_sequence(server->client, server, &msg,
6924 &args.seq_args, &res.seq_res, 1); 6567 &args.seq_args, &res.seq_res);
6925 dprintk("NFS reply free_stateid: %d\n", status); 6568 dprintk("NFS reply free_stateid: %d\n", status);
6926 return status; 6569 return status;
6927} 6570}
@@ -7041,7 +6684,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
7041#if defined(CONFIG_NFS_V4_1) 6684#if defined(CONFIG_NFS_V4_1)
7042static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { 6685static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
7043 .minor_version = 1, 6686 .minor_version = 1,
7044 .call_sync = _nfs4_call_sync_session, 6687 .call_sync = nfs4_call_sync_sequence,
7045 .match_stateid = nfs41_match_stateid, 6688 .match_stateid = nfs41_match_stateid,
7046 .find_root_sec = nfs41_find_root_sec, 6689 .find_root_sec = nfs41_find_root_sec,
7047 .reboot_recovery_ops = &nfs41_reboot_recovery_ops, 6690 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
new file mode 100644
index 000000000000..ebda5f4a031b
--- /dev/null
+++ b/fs/nfs/nfs4session.c
@@ -0,0 +1,552 @@
1/*
2 * fs/nfs/nfs4session.c
3 *
4 * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 */
7#include <linux/kernel.h>
8#include <linux/errno.h>
9#include <linux/string.h>
10#include <linux/printk.h>
11#include <linux/slab.h>
12#include <linux/sunrpc/sched.h>
13#include <linux/sunrpc/bc_xprt.h>
14#include <linux/nfs.h>
15#include <linux/nfs4.h>
16#include <linux/nfs_fs.h>
17#include <linux/module.h>
18
19#include "nfs4_fs.h"
20#include "internal.h"
21#include "nfs4session.h"
22#include "callback.h"
23
24#define NFSDBG_FACILITY NFSDBG_STATE
25
26/*
27 * nfs4_shrink_slot_table - free retired slots from the slot table
28 */
29static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize)
30{
31 struct nfs4_slot **p;
32 if (newsize >= tbl->max_slots)
33 return;
34
35 p = &tbl->slots;
36 while (newsize--)
37 p = &(*p)->next;
38 while (*p) {
39 struct nfs4_slot *slot = *p;
40
41 *p = slot->next;
42 kfree(slot);
43 tbl->max_slots--;
44 }
45}
46
47/*
48 * nfs4_free_slot - free a slot and efficiently update slot table.
49 *
50 * freeing a slot is trivially done by clearing its respective bit
51 * in the bitmap.
52 * If the freed slotid equals highest_used_slotid we want to update it
53 * so that the server can size down the slot table if needed;
54 * otherwise we know that the highest_used_slotid is still in use.
55 * When updating highest_used_slotid there may be "holes" in the bitmap
56 * so we need to scan down from highest_used_slotid to 0 looking for the now
57 * highest slotid in use.
58 * If none found, highest_used_slotid is set to NFS4_NO_SLOT.
59 *
60 * Must be called while holding tbl->slot_tbl_lock
61 */
62void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
63{
64 u32 slotid = slot->slot_nr;
65
66 /* clear used bit in bitmap */
67 __clear_bit(slotid, tbl->used_slots);
68
69 /* update highest_used_slotid when it is freed */
70 if (slotid == tbl->highest_used_slotid) {
71 u32 new_max = find_last_bit(tbl->used_slots, slotid);
72 if (new_max < slotid)
73 tbl->highest_used_slotid = new_max;
74 else {
75 tbl->highest_used_slotid = NFS4_NO_SLOT;
76 nfs4_session_drain_complete(tbl->session, tbl);
77 }
78 }
79 dprintk("%s: slotid %u highest_used_slotid %d\n", __func__,
80 slotid, tbl->highest_used_slotid);
81}
82
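nfs4_free_slot above relies on find_last_bit() to locate the next-highest slot still in use, falling back to NFS4_NO_SLOT when the table has drained. An equivalent editor's sketch with an explicit downward scan (_demo names are hypothetical):

#include <limits.h>

#define NFS4_NO_SLOT_DEMO UINT_MAX
#define BITS_PER_LONG_DEMO (8 * sizeof(unsigned long))

static unsigned int recompute_highest_demo(const unsigned long *used,
					   unsigned int freed_slotid)
{
	unsigned int i;

	/* scan down from just below the freed slot for a set bit */
	for (i = freed_slotid; i-- > 0; )
		if (used[i / BITS_PER_LONG_DEMO] &
		    (1UL << (i % BITS_PER_LONG_DEMO)))
			return i;	/* next-highest slot in use */
	return NFS4_NO_SLOT_DEMO;	/* table fully drained */
}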
83static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl,
84 u32 slotid, u32 seq_init, gfp_t gfp_mask)
85{
86 struct nfs4_slot *slot;
87
88 slot = kzalloc(sizeof(*slot), gfp_mask);
89 if (slot) {
90 slot->table = tbl;
91 slot->slot_nr = slotid;
92 slot->seq_nr = seq_init;
93 }
94 return slot;
95}
96
97static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl,
98 u32 slotid, u32 seq_init, gfp_t gfp_mask)
99{
100 struct nfs4_slot **p, *slot;
101
102 p = &tbl->slots;
103 for (;;) {
104 if (*p == NULL) {
105 *p = nfs4_new_slot(tbl, tbl->max_slots,
106 seq_init, gfp_mask);
107 if (*p == NULL)
108 break;
109 tbl->max_slots++;
110 }
111 slot = *p;
112 if (slot->slot_nr == slotid)
113 return slot;
114 p = &slot->next;
115 }
116 return ERR_PTR(-ENOMEM);
117}
118
119/*
120 * nfs4_alloc_slot - efficiently look for a free slot
121 *
122 * nfs4_alloc_slot looks for an unset bit in the used_slots bitmap.
123 * If found, we mark the slot as used, update the highest_used_slotid,
124 * and set up the sequence operation args accordingly.
125 *
126 * Note: must be called while holding the slot_tbl_lock.
127 */
128struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl)
129{
130 struct nfs4_slot *ret = ERR_PTR(-EBUSY);
131 u32 slotid;
132
133 dprintk("--> %s used_slots=%04lx highest_used=%u max_slotid+1=%u\n",
134 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
135 tbl->max_slotid + 1);
136 slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1);
137 if (slotid > tbl->max_slotid)
138 goto out;
139 ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
140 if (IS_ERR(ret))
141 goto out;
142 __set_bit(slotid, tbl->used_slots);
143 if (slotid > tbl->highest_used_slotid ||
144 tbl->highest_used_slotid == NFS4_NO_SLOT)
145 tbl->highest_used_slotid = slotid;
146 ret->generation = tbl->generation;
147
148out:
149 dprintk("<-- %s used_slots=%04lx highest_used=%d slotid=%d\n",
150 __func__, tbl->used_slots[0], tbl->highest_used_slotid,
151 !IS_ERR(ret) ? ret->slot_nr : -1);
152 return ret;
153}
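
The two helpers above keep highest_used_slotid consistent purely from the
used_slots bitmap: allocation takes the lowest clear bit, and freeing scans
down past any "holes" left by out-of-order frees. A minimal userspace sketch
of that bookkeeping, with hypothetical helpers and a single 64-bit word
standing in for the kernel's bitmap and find_*_bit() primitives:

	#include <stdio.h>
	#include <stdint.h>

	#define MAX_SLOTS 64
	#define NO_SLOT   ((uint32_t)-1)	/* mirrors NFS4_NO_SLOT */

	static uint64_t used;			/* one bitmap word suffices */
	static uint32_t highest_used = NO_SLOT;

	static uint32_t alloc_slot(void)
	{
		uint32_t slotid;

		for (slotid = 0; slotid < MAX_SLOTS; slotid++)
			if (!(used & (1ULL << slotid)))
				break;		/* lowest clear bit */
		if (slotid == MAX_SLOTS)
			return NO_SLOT;		/* table is full */
		used |= 1ULL << slotid;
		if (highest_used == NO_SLOT || slotid > highest_used)
			highest_used = slotid;
		return slotid;
	}

	static void free_slot(uint32_t slotid)
	{
		used &= ~(1ULL << slotid);
		if (slotid != highest_used)
			return;		/* a lower slot is still the max */
		/* scan down past holes; decrementing 0 wraps to NO_SLOT */
		while (highest_used != NO_SLOT &&
		       !(used & (1ULL << highest_used)))
			highest_used--;
	}

	int main(void)
	{
		uint32_t a = alloc_slot(), b = alloc_slot(), c = alloc_slot();

		(void)a;
		free_slot(b);			/* leaves a hole at slot 1 */
		free_slot(c);			/* rescan skips the hole */
		printf("highest_used=%d (expect 0)\n", (int)highest_used);
		return 0;
	}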
154
155static int nfs4_grow_slot_table(struct nfs4_slot_table *tbl,
156 u32 max_reqs, u32 ivalue)
157{
158 if (max_reqs <= tbl->max_slots)
159 return 0;
160 if (!IS_ERR(nfs4_find_or_create_slot(tbl, max_reqs - 1, ivalue, GFP_NOFS)))
161 return 0;
162 return -ENOMEM;
163}
164
165static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl,
166 u32 server_highest_slotid,
167 u32 ivalue)
168{
169 struct nfs4_slot **p;
170
171 nfs4_shrink_slot_table(tbl, server_highest_slotid + 1);
172 p = &tbl->slots;
173 while (*p) {
174 (*p)->seq_nr = ivalue;
175 (*p)->interrupted = 0;
176 p = &(*p)->next;
177 }
178 tbl->highest_used_slotid = NFS4_NO_SLOT;
179 tbl->target_highest_slotid = server_highest_slotid;
180 tbl->server_highest_slotid = server_highest_slotid;
181 tbl->d_target_highest_slotid = 0;
182 tbl->d2_target_highest_slotid = 0;
183 tbl->max_slotid = server_highest_slotid;
184}
185
186/*
187 * (re)Initialise a slot table
188 */
189static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl,
190 u32 max_reqs, u32 ivalue)
191{
192 int ret;
193
194 dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
195 max_reqs, tbl->max_slots);
196
197 if (max_reqs > NFS4_MAX_SLOT_TABLE)
198 max_reqs = NFS4_MAX_SLOT_TABLE;
199
200 ret = nfs4_grow_slot_table(tbl, max_reqs, ivalue);
201 if (ret)
202 goto out;
203
204 spin_lock(&tbl->slot_tbl_lock);
205 nfs4_reset_slot_table(tbl, max_reqs - 1, ivalue);
206 spin_unlock(&tbl->slot_tbl_lock);
207
208 dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
209 tbl, tbl->slots, tbl->max_slots);
210out:
211 dprintk("<-- %s: return %d\n", __func__, ret);
212 return ret;
213}
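
One ordering detail worth noting in nfs4_realloc_slot_table() above: the
GFP_NOFS allocation inside nfs4_grow_slot_table() may sleep, so all slot
allocation happens before slot_tbl_lock is taken, and only the non-allocating
reset runs under the spinlock.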
214
215/* Destroy the slot table */
216static void nfs4_destroy_slot_tables(struct nfs4_session *session)
217{
218 nfs4_shrink_slot_table(&session->fc_slot_table, 0);
219 nfs4_shrink_slot_table(&session->bc_slot_table, 0);
220}
221
222static bool nfs41_assign_slot(struct rpc_task *task, void *pslot)
223{
224 struct nfs4_sequence_args *args = task->tk_msg.rpc_argp;
225 struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
226 struct nfs4_slot *slot = pslot;
227 struct nfs4_slot_table *tbl = slot->table;
228
229 if (nfs4_session_draining(tbl->session) && !args->sa_privileged)
230 return false;
231 slot->generation = tbl->generation;
232 args->sa_slot = slot;
233 res->sr_timestamp = jiffies;
234 res->sr_slot = slot;
235 res->sr_status_flags = 0;
236 res->sr_status = 1;
237 return true;
238}
239
240static bool __nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
241 struct nfs4_slot *slot)
242{
243 if (rpc_wake_up_first(&tbl->slot_tbl_waitq, nfs41_assign_slot, slot))
244 return true;
245 return false;
246}
247
248bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
249 struct nfs4_slot *slot)
250{
251 if (slot->slot_nr > tbl->max_slotid)
252 return false;
253 return __nfs41_wake_and_assign_slot(tbl, slot);
254}
255
256static bool nfs41_try_wake_next_slot_table_entry(struct nfs4_slot_table *tbl)
257{
258 struct nfs4_slot *slot = nfs4_alloc_slot(tbl);
259 if (!IS_ERR(slot)) {
260 bool ret = __nfs41_wake_and_assign_slot(tbl, slot);
261 if (ret)
262 return ret;
263 nfs4_free_slot(tbl, slot);
264 }
265 return false;
266}
267
268void nfs41_wake_slot_table(struct nfs4_slot_table *tbl)
269{
270 for (;;) {
271 if (!nfs41_try_wake_next_slot_table_entry(tbl))
272 break;
273 }
274}
275
276static void nfs41_set_max_slotid_locked(struct nfs4_slot_table *tbl,
277 u32 target_highest_slotid)
278{
279 u32 max_slotid;
280
281 max_slotid = min(NFS4_MAX_SLOT_TABLE - 1, target_highest_slotid);
282 if (max_slotid > tbl->server_highest_slotid)
283 max_slotid = tbl->server_highest_slotid;
284 if (max_slotid > tbl->target_highest_slotid)
285 max_slotid = tbl->target_highest_slotid;
286 tbl->max_slotid = max_slotid;
287 nfs41_wake_slot_table(tbl);
288}
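
Worked example of the clamp above: with NFS4_MAX_SLOT_TABLE = 1024, a
target_highest_slotid of 2000 and a server_highest_slotid of 500, max_slotid
becomes min(1023, 2000) = 1023 and is then capped by the server value to 500,
so no slot above 500 is handed out until the server raises its limit.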
289
290/* Update the client's idea of target_highest_slotid */
291static void nfs41_set_target_slotid_locked(struct nfs4_slot_table *tbl,
292 u32 target_highest_slotid)
293{
294 if (tbl->target_highest_slotid == target_highest_slotid)
295 return;
296 tbl->target_highest_slotid = target_highest_slotid;
297 tbl->generation++;
298}
299
300void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
301 u32 target_highest_slotid)
302{
303 spin_lock(&tbl->slot_tbl_lock);
304 nfs41_set_target_slotid_locked(tbl, target_highest_slotid);
305 tbl->d_target_highest_slotid = 0;
306 tbl->d2_target_highest_slotid = 0;
307 nfs41_set_max_slotid_locked(tbl, target_highest_slotid);
308 spin_unlock(&tbl->slot_tbl_lock);
309}
310
311static void nfs41_set_server_slotid_locked(struct nfs4_slot_table *tbl,
312 u32 highest_slotid)
313{
314 if (tbl->server_highest_slotid == highest_slotid)
315 return;
316 if (tbl->highest_used_slotid > highest_slotid)
317 return;
318 /* Deallocate slots */
319 nfs4_shrink_slot_table(tbl, highest_slotid + 1);
320 tbl->server_highest_slotid = highest_slotid;
321}
322
323static s32 nfs41_derivative_target_slotid(s32 s1, s32 s2)
324{
325 s1 -= s2;
326 if (s1 == 0)
327 return 0;
328 if (s1 < 0)
329 return (s1 - 1) >> 1;
330 return (s1 + 1) >> 1;
331}
332
333static int nfs41_sign_s32(s32 s1)
334{
335 if (s1 > 0)
336 return 1;
337 if (s1 < 0)
338 return -1;
339 return 0;
340}
341
342static bool nfs41_same_sign_or_zero_s32(s32 s1, s32 s2)
343{
344 if (!s1 || !s2)
345 return true;
346 return nfs41_sign_s32(s1) == nfs41_sign_s32(s2);
347}
348
349/* Try to eliminate outliers by checking for sharp changes in the
350 * derivatives and second derivatives
351 */
352static bool nfs41_is_outlier_target_slotid(struct nfs4_slot_table *tbl,
353 u32 new_target)
354{
355 s32 d_target, d2_target;
356 bool ret = true;
357
358 d_target = nfs41_derivative_target_slotid(new_target,
359 tbl->target_highest_slotid);
360 d2_target = nfs41_derivative_target_slotid(d_target,
361 tbl->d_target_highest_slotid);
362 /* Is first derivative same sign? */
363 if (nfs41_same_sign_or_zero_s32(d_target, tbl->d_target_highest_slotid))
364 ret = false;
365 /* Is second derivative same sign? */
366 if (nfs41_same_sign_or_zero_s32(d2_target, tbl->d2_target_highest_slotid))
367 ret = false;
368 tbl->d_target_highest_slotid = d_target;
369 tbl->d2_target_highest_slotid = d2_target;
370 return ret;
371}
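
The filter above rejects a new target only when both the first and the second
derivative flip sign against the stored history, so a genuine, persistent
change gets through on the second sample. A userspace harness tracing this
(hypothetical names; it folds the caller's accept step into is_outlier() and,
like the kernel code, assumes arithmetic right shift of negative values):

	#include <stdio.h>
	#include <stdbool.h>
	#include <stdint.h>

	static int32_t d_hist, d2_hist, target = 10;

	/* halved delta, rounded away from zero */
	static int32_t deriv(int32_t s1, int32_t s2)
	{
		s1 -= s2;
		if (s1 == 0)
			return 0;
		return s1 < 0 ? (s1 - 1) >> 1 : (s1 + 1) >> 1;
	}

	static int sign(int32_t s) { return (s > 0) - (s < 0); }

	static bool same_sign_or_zero(int32_t s1, int32_t s2)
	{
		return !s1 || !s2 || sign(s1) == sign(s2);
	}

	static bool is_outlier(int32_t new_target)
	{
		int32_t d = deriv(new_target, target);
		int32_t d2 = deriv(d, d_hist);
		bool ret = true;

		if (same_sign_or_zero(d, d_hist))
			ret = false;
		if (same_sign_or_zero(d2, d2_hist))
			ret = false;
		d_hist = d;		/* history updates even for outliers */
		d2_hist = d2;
		if (!ret)
			target = new_target;	/* accepted */
		return ret;
	}

	int main(void)
	{
		const int32_t samples[] = { 12, 16, 2, 2 };
		unsigned int i;

		for (i = 0; i < 4; i++)
			printf("target %d -> %s\n", (int)samples[i],
			       is_outlier(samples[i]) ? "rejected (outlier)"
						      : "accepted");
		return 0;
	}

The sharp drop from 16 to 2 is rejected once; when it repeats, the first
derivative matches the stored history and the new target is accepted.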
372
373void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
374 struct nfs4_slot *slot,
375 struct nfs4_sequence_res *res)
376{
377 spin_lock(&tbl->slot_tbl_lock);
378 if (!nfs41_is_outlier_target_slotid(tbl, res->sr_target_highest_slotid))
379 nfs41_set_target_slotid_locked(tbl, res->sr_target_highest_slotid);
380 if (tbl->generation == slot->generation)
381 nfs41_set_server_slotid_locked(tbl, res->sr_highest_slotid);
382 nfs41_set_max_slotid_locked(tbl, res->sr_target_highest_slotid);
383 spin_unlock(&tbl->slot_tbl_lock);
384}
385
386/*
387 * Initialize or reset the forechannel and backchannel tables
388 */
389int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
390{
391 struct nfs4_slot_table *tbl;
392 int status;
393
394 dprintk("--> %s\n", __func__);
395 /* Fore channel */
396 tbl = &ses->fc_slot_table;
397 tbl->session = ses;
398 status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
399 if (status) /* -ENOMEM */
400 return status;
401 /* Back channel */
402 tbl = &ses->bc_slot_table;
403 tbl->session = ses;
404 status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
405 if (status && tbl->slots == NULL)
406 /* Fore and back channel share a connection so get
407 * both slot tables or neither */
408 nfs4_destroy_slot_tables(ses);
409 return status;
410}
411
412struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
413{
414 struct nfs4_session *session;
415 struct nfs4_slot_table *tbl;
416
417 session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
418 if (!session)
419 return NULL;
420
421 tbl = &session->fc_slot_table;
422 tbl->highest_used_slotid = NFS4_NO_SLOT;
423 spin_lock_init(&tbl->slot_tbl_lock);
424 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
425 init_completion(&tbl->complete);
426
427 tbl = &session->bc_slot_table;
428 tbl->highest_used_slotid = NFS4_NO_SLOT;
429 spin_lock_init(&tbl->slot_tbl_lock);
430 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
431 init_completion(&tbl->complete);
432
433 session->session_state = 1<<NFS4_SESSION_INITING;
434
435 session->clp = clp;
436 return session;
437}
438
439void nfs4_destroy_session(struct nfs4_session *session)
440{
441 struct rpc_xprt *xprt;
442 struct rpc_cred *cred;
443
444 cred = nfs4_get_exchange_id_cred(session->clp);
445 nfs4_proc_destroy_session(session, cred);
446 if (cred)
447 put_rpccred(cred);
448
449 rcu_read_lock();
450 xprt = rcu_dereference(session->clp->cl_rpcclient->cl_xprt);
451 rcu_read_unlock();
452 dprintk("%s Destroy backchannel for xprt %p\n",
453 __func__, xprt);
454 xprt_destroy_backchannel(xprt, NFS41_BC_MIN_CALLBACKS);
455 nfs4_destroy_slot_tables(session);
456 kfree(session);
457}
458
459/*
460 * With sessions, the client is not marked ready until after a
461 * successful EXCHANGE_ID and CREATE_SESSION.
462 *
463 * Map cl_cons_state errors to EPROTONOSUPPORT to indicate that
464 * other versions of NFS can be tried.
465 */
466static int nfs41_check_session_ready(struct nfs_client *clp)
467{
468 int ret;
469
470 if (clp->cl_cons_state == NFS_CS_SESSION_INITING) {
471 ret = nfs4_client_recover_expired_lease(clp);
472 if (ret)
473 return ret;
474 }
475 if (clp->cl_cons_state < NFS_CS_READY)
476 return -EPROTONOSUPPORT;
477 smp_rmb();
478 return 0;
479}
480
481int nfs4_init_session(struct nfs_server *server)
482{
483 struct nfs_client *clp = server->nfs_client;
484 struct nfs4_session *session;
485 unsigned int target_max_rqst_sz = NFS_MAX_FILE_IO_SIZE;
486 unsigned int target_max_resp_sz = NFS_MAX_FILE_IO_SIZE;
487
488 if (!nfs4_has_session(clp))
489 return 0;
490
491 if (server->rsize != 0)
492 target_max_resp_sz = server->rsize;
493 target_max_resp_sz += nfs41_maxread_overhead;
494
495 if (server->wsize != 0)
496 target_max_rqst_sz = server->wsize;
497 target_max_rqst_sz += nfs41_maxwrite_overhead;
498
499 session = clp->cl_session;
500 spin_lock(&clp->cl_lock);
501 if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
502 /* Initialise targets and channel attributes */
503 session->fc_target_max_rqst_sz = target_max_rqst_sz;
504 session->fc_attrs.max_rqst_sz = target_max_rqst_sz;
505 session->fc_target_max_resp_sz = target_max_resp_sz;
506 session->fc_attrs.max_resp_sz = target_max_resp_sz;
507 } else {
508 /* Just adjust the targets */
509 if (target_max_rqst_sz > session->fc_target_max_rqst_sz) {
510 session->fc_target_max_rqst_sz = target_max_rqst_sz;
511 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
512 }
513 if (target_max_resp_sz > session->fc_target_max_resp_sz) {
514 session->fc_target_max_resp_sz = target_max_resp_sz;
515 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
516 }
517 }
518 spin_unlock(&clp->cl_lock);
519
520 if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
521 nfs4_schedule_lease_recovery(clp);
522
523 return nfs41_check_session_ready(clp);
524}
525
526int nfs4_init_ds_session(struct nfs_client *clp, unsigned long lease_time)
527{
528 struct nfs4_session *session = clp->cl_session;
529 int ret;
530
531 spin_lock(&clp->cl_lock);
532 if (test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) {
533 /*
534 * Do not set NFS_CS_CHECK_LEASE_TIME; instead set the
535 * DS lease equal to the MDS lease.
536 */
537 clp->cl_lease_time = lease_time;
538 clp->cl_last_renewal = jiffies;
539 }
540 spin_unlock(&clp->cl_lock);
541
542 ret = nfs41_check_session_ready(clp);
543 if (ret)
544 return ret;
545 /* Test for the DS role */
546 if (!is_ds_client(clp))
547 return -ENODEV;
548 return 0;
549}
550EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
551
552
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
new file mode 100644
index 000000000000..6f3cb39386d4
--- /dev/null
+++ b/fs/nfs/nfs4session.h
@@ -0,0 +1,142 @@
1/*
2 * fs/nfs/nfs4session.h
3 *
4 * Copyright (c) 2012 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 */
7#ifndef __LINUX_FS_NFS_NFS4SESSION_H
8#define __LINUX_FS_NFS_NFS4SESSION_H
9
10/* maximum number of slots to use */
11#define NFS4_DEF_SLOT_TABLE_SIZE (16U)
12#define NFS4_MAX_SLOT_TABLE (1024U)
13#define NFS4_NO_SLOT ((u32)-1)
14
15#if IS_ENABLED(CONFIG_NFS_V4)
16
17/* A session slot and its sequence id */
18struct nfs4_slot {
19 struct nfs4_slot_table *table;
20 struct nfs4_slot *next;
21 unsigned long generation;
22 u32 slot_nr;
23 u32 seq_nr;
24 unsigned int interrupted : 1;
25};
26
27/* Sessions */
28#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long))
29struct nfs4_slot_table {
30 struct nfs4_session *session; /* Parent session */
31 struct nfs4_slot *slots; /* seqid per slot */
32 unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
33 spinlock_t slot_tbl_lock;
34 struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */
35 u32 max_slots; /* # slots in table */
36 u32 max_slotid; /* Max allowed slotid value */
37 u32 highest_used_slotid; /* sent to server on each SEQ.
38 * op for dynamic resizing */
39 u32 target_highest_slotid; /* Server max_slot target */
40 u32 server_highest_slotid; /* Server highest slotid */
41 s32 d_target_highest_slotid; /* Derivative */
42 s32 d2_target_highest_slotid; /* 2nd derivative */
43 unsigned long generation; /* Generation counter for
44 target_highest_slotid */
45 struct completion complete;
46};
47
48/*
49 * Session related parameters
50 */
51struct nfs4_session {
52 struct nfs4_sessionid sess_id;
53 u32 flags;
54 unsigned long session_state;
55 u32 hash_alg;
56 u32 ssv_len;
57
58 /* The fore and back channel */
59 struct nfs4_channel_attrs fc_attrs;
60 struct nfs4_slot_table fc_slot_table;
61 struct nfs4_channel_attrs bc_attrs;
62 struct nfs4_slot_table bc_slot_table;
63 struct nfs_client *clp;
64 /* Create session arguments */
65 unsigned int fc_target_max_rqst_sz;
66 unsigned int fc_target_max_resp_sz;
67};
68
69enum nfs4_session_state {
70 NFS4_SESSION_INITING,
71 NFS4_SESSION_DRAINING,
72};
73
74#if defined(CONFIG_NFS_V4_1)
75extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
76extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
77
78extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
79 u32 target_highest_slotid);
80extern void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
81 struct nfs4_slot *slot,
82 struct nfs4_sequence_res *res);
83
84extern int nfs4_setup_session_slot_tables(struct nfs4_session *ses);
85
86extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
87extern void nfs4_destroy_session(struct nfs4_session *session);
88extern int nfs4_init_session(struct nfs_server *server);
89extern int nfs4_init_ds_session(struct nfs_client *, unsigned long);
90
91extern void nfs4_session_drain_complete(struct nfs4_session *session,
92 struct nfs4_slot_table *tbl);
93
94static inline bool nfs4_session_draining(struct nfs4_session *session)
95{
96 return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state);
97}
98
99bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
100 struct nfs4_slot *slot);
101void nfs41_wake_slot_table(struct nfs4_slot_table *tbl);
102
103/*
104 * Determine if sessions are in use.
105 */
106static inline int nfs4_has_session(const struct nfs_client *clp)
107{
108 if (clp->cl_session)
109 return 1;
110 return 0;
111}
112
113static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
114{
115 if (nfs4_has_session(clp))
116 return (clp->cl_session->flags & SESSION4_PERSIST);
117 return 0;
118}
119
120#else /* defined(CONFIG_NFS_V4_1) */
121
122static inline int nfs4_init_session(struct nfs_server *server)
123{
124 return 0;
125}
126
127/*
128 * Determine if sessions are in use.
129 */
130static inline int nfs4_has_session(const struct nfs_client *clp)
131{
132 return 0;
133}
134
135static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
136{
137 return 0;
138}
139
140#endif /* defined(CONFIG_NFS_V4_1) */
141#endif /* IS_ENABLED(CONFIG_NFS_V4) */
142#endif /* __LINUX_FS_NFS_NFS4SESSION_H */
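
For scale, the used_slots bitmap declared above is small. A sanity-check
sketch of the sizing arithmetic (userspace, assuming a 64-bit long):

	#include <assert.h>

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
	#define NFS4_MAX_SLOT_TABLE	(1024U)
	#define SLOT_TABLE_SZ	DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8 * sizeof(long))

	int main(void)
	{
		/* 1024 slots / 64 bits per long = 16 longs = 128 bytes */
		assert(SLOT_TABLE_SZ == 16);
		return 0;
	}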
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index c351e6b39838..e61f68d5ef21 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -57,6 +57,7 @@
57#include "callback.h" 57#include "callback.h"
58#include "delegation.h" 58#include "delegation.h"
59#include "internal.h" 59#include "internal.h"
60#include "nfs4session.h"
60#include "pnfs.h" 61#include "pnfs.h"
61#include "netns.h" 62#include "netns.h"
62 63
@@ -66,7 +67,6 @@
66 67
67const nfs4_stateid zero_stateid; 68const nfs4_stateid zero_stateid;
68static DEFINE_MUTEX(nfs_clid_init_mutex); 69static DEFINE_MUTEX(nfs_clid_init_mutex);
69static LIST_HEAD(nfs4_clientid_list);
70 70
71int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) 71int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
72{ 72{
@@ -136,16 +136,11 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
136 clp->cl_confirm = clid.confirm; 136 clp->cl_confirm = clid.confirm;
137 137
138 status = nfs40_walk_client_list(clp, result, cred); 138 status = nfs40_walk_client_list(clp, result, cred);
139 switch (status) { 139 if (status == 0) {
140 case -NFS4ERR_STALE_CLIENTID:
141 set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
142 case 0:
143 /* Sustain the lease, even if it's empty. If the clientid4 140 /* Sustain the lease, even if it's empty. If the clientid4
144 * goes stale it's of no use for trunking discovery. */ 141 * goes stale it's of no use for trunking discovery. */
145 nfs4_schedule_state_renewal(*result); 142 nfs4_schedule_state_renewal(*result);
146 break;
147 } 143 }
148
149out: 144out:
150 return status; 145 return status;
151} 146}
@@ -254,24 +249,27 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
254{ 249{
255 struct nfs4_session *ses = clp->cl_session; 250 struct nfs4_session *ses = clp->cl_session;
256 struct nfs4_slot_table *tbl; 251 struct nfs4_slot_table *tbl;
257 int max_slots;
258 252
259 if (ses == NULL) 253 if (ses == NULL)
260 return; 254 return;
261 tbl = &ses->fc_slot_table; 255 tbl = &ses->fc_slot_table;
262 if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { 256 if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
263 spin_lock(&tbl->slot_tbl_lock); 257 spin_lock(&tbl->slot_tbl_lock);
264 max_slots = tbl->max_slots; 258 nfs41_wake_slot_table(tbl);
265 while (max_slots--) {
266 if (rpc_wake_up_first(&tbl->slot_tbl_waitq,
267 nfs4_set_task_privileged,
268 NULL) == NULL)
269 break;
270 }
271 spin_unlock(&tbl->slot_tbl_lock); 259 spin_unlock(&tbl->slot_tbl_lock);
272 } 260 }
273} 261}
274 262
263/*
264 * Signal state manager thread if session fore channel is drained
265 */
266void nfs4_session_drain_complete(struct nfs4_session *session,
267 struct nfs4_slot_table *tbl)
268{
269 if (nfs4_session_draining(session))
270 complete(&tbl->complete);
271}
272
275static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) 273static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl)
276{ 274{
277 spin_lock(&tbl->slot_tbl_lock); 275 spin_lock(&tbl->slot_tbl_lock);
@@ -303,7 +301,6 @@ static void nfs41_finish_session_reset(struct nfs_client *clp)
303 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); 301 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
304 clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); 302 clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
305 /* create_session negotiated new slot table */ 303 /* create_session negotiated new slot table */
306 clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
307 clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); 304 clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
308 nfs41_setup_state_renewal(clp); 305 nfs41_setup_state_renewal(clp);
309} 306}
@@ -1086,7 +1083,6 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
1086 */ 1083 */
1087static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) 1084static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
1088{ 1085{
1089 BUG_ON(list_first_entry(&seqid->sequence->list, struct nfs_seqid, list) != seqid);
1090 switch (status) { 1086 switch (status) {
1091 case 0: 1087 case 0:
1092 break; 1088 break;
@@ -1209,6 +1205,40 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1209} 1205}
1210EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery); 1206EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
1211 1207
1208int nfs4_wait_clnt_recover(struct nfs_client *clp)
1209{
1210 int res;
1211
1212 might_sleep();
1213
1214 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
1215 nfs_wait_bit_killable, TASK_KILLABLE);
1216 if (res)
1217 return res;
1218
1219 if (clp->cl_cons_state < 0)
1220 return clp->cl_cons_state;
1221 return 0;
1222}
1223
1224int nfs4_client_recover_expired_lease(struct nfs_client *clp)
1225{
1226 unsigned int loop;
1227 int ret;
1228
1229 for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
1230 ret = nfs4_wait_clnt_recover(clp);
1231 if (ret != 0)
1232 break;
1233 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1234 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
1235 break;
1236 nfs4_schedule_state_manager(clp);
1237 ret = -EIO;
1238 }
1239 return ret;
1240}
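
The helper added above is a bounded wait-and-rekick loop: wait for the state
manager to go idle, and if the lease still needs recovery, schedule another
pass, giving up with -EIO once the NFS4_MAX_LOOP_ON_RECOVER budget is spent.
A self-contained sketch of that shape (the stubs are stand-ins that simulate
recovery succeeding after two passes):

	#include <stdio.h>
	#include <stdbool.h>
	#include <errno.h>

	static int passes_needed = 2;

	static int wait_ready(void) { return 0; }	/* manager went idle */
	static bool needs_recovery(void) { return passes_needed > 0; }
	static void kick_manager(void) { passes_needed--; }

	static int recover_expired_lease(unsigned int max_loops)
	{
		int ret = -EIO;	/* assume failure if the budget runs out */

		while (max_loops--) {
			ret = wait_ready();
			if (ret != 0)
				break;	/* fatal error while waiting */
			if (!needs_recovery())
				break;	/* lease valid again, ret == 0 */
			kick_manager();
			ret = -EIO;
		}
		return ret;
	}

	int main(void)
	{
		printf("recover: %d (expect 0)\n", recover_expired_lease(10));
		return 0;
	}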
1241
1212/* 1242/*
1213 * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN 1243 * nfs40_handle_cb_pathdown - return all delegations after NFS4ERR_CB_PATH_DOWN
1214 * @clp: client to process 1244 * @clp: client to process
@@ -1401,14 +1431,6 @@ restart:
1401 /* Mark the file as being 'closed' */ 1431 /* Mark the file as being 'closed' */
1402 state->state = 0; 1432 state->state = 0;
1403 break; 1433 break;
1404 case -EKEYEXPIRED:
1405 /*
1406 * User RPCSEC_GSS context has expired.
1407 * We cannot recover this stateid now, so
1408 * skip it and allow recovery thread to
1409 * proceed.
1410 */
1411 break;
1412 case -NFS4ERR_ADMIN_REVOKED: 1434 case -NFS4ERR_ADMIN_REVOKED:
1413 case -NFS4ERR_STALE_STATEID: 1435 case -NFS4ERR_STALE_STATEID:
1414 case -NFS4ERR_BAD_STATEID: 1436 case -NFS4ERR_BAD_STATEID:
@@ -1561,14 +1583,6 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
1561 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); 1583 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
1562} 1584}
1563 1585
1564static void nfs4_warn_keyexpired(const char *s)
1565{
1566 printk_ratelimited(KERN_WARNING "Error: state manager"
1567 " encountered RPCSEC_GSS session"
1568 " expired against NFSv4 server %s.\n",
1569 s);
1570}
1571
1572static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) 1586static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1573{ 1587{
1574 switch (error) { 1588 switch (error) {
@@ -1602,10 +1616,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1602 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1616 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1603 set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); 1617 set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state);
1604 break; 1618 break;
1605 case -EKEYEXPIRED:
1606 /* Nothing we can do */
1607 nfs4_warn_keyexpired(clp->cl_hostname);
1608 break;
1609 default: 1619 default:
1610 dprintk("%s: failed to handle error %d for server %s\n", 1620 dprintk("%s: failed to handle error %d for server %s\n",
1611 __func__, error, clp->cl_hostname); 1621 __func__, error, clp->cl_hostname);
@@ -1722,8 +1732,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
1722 dprintk("%s: exit with error %d for server %s\n", 1732 dprintk("%s: exit with error %d for server %s\n",
1723 __func__, -EPROTONOSUPPORT, clp->cl_hostname); 1733 __func__, -EPROTONOSUPPORT, clp->cl_hostname);
1724 return -EPROTONOSUPPORT; 1734 return -EPROTONOSUPPORT;
1725 case -EKEYEXPIRED:
1726 nfs4_warn_keyexpired(clp->cl_hostname);
1727 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1735 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
1728 * in nfs4_exchange_id */ 1736 * in nfs4_exchange_id */
1729 default: 1737 default:
@@ -1850,6 +1858,7 @@ again:
1850 case -ETIMEDOUT: 1858 case -ETIMEDOUT:
1851 case -EAGAIN: 1859 case -EAGAIN:
1852 ssleep(1); 1860 ssleep(1);
1861 case -NFS4ERR_STALE_CLIENTID:
1853 dprintk("NFS: %s after status %d, retrying\n", 1862 dprintk("NFS: %s after status %d, retrying\n",
1854 __func__, status); 1863 __func__, status);
1855 goto again; 1864 goto again;
@@ -1876,7 +1885,6 @@ again:
1876 break; 1885 break;
1877 1886
1878 case -EKEYEXPIRED: 1887 case -EKEYEXPIRED:
1879 nfs4_warn_keyexpired(clp->cl_hostname);
1880 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery 1888 case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery
1881 * in nfs4_exchange_id */ 1889 * in nfs4_exchange_id */
1882 status = -EKEYEXPIRED; 1890 status = -EKEYEXPIRED;
@@ -1907,14 +1915,23 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
1907} 1915}
1908EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); 1916EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
1909 1917
1910void nfs41_handle_recall_slot(struct nfs_client *clp) 1918static void nfs41_ping_server(struct nfs_client *clp)
1911{ 1919{
1912 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); 1920 /* Use CHECK_LEASE to ping the server with a SEQUENCE */
1913 dprintk("%s: scheduling slot recall for server %s\n", __func__, 1921 set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1914 clp->cl_hostname);
1915 nfs4_schedule_state_manager(clp); 1922 nfs4_schedule_state_manager(clp);
1916} 1923}
1917 1924
1925void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)
1926{
1927 nfs41_ping_server(clp);
1928}
1929
1930void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp)
1931{
1932 nfs41_ping_server(clp);
1933}
1934
1918static void nfs4_reset_all_state(struct nfs_client *clp) 1935static void nfs4_reset_all_state(struct nfs_client *clp)
1919{ 1936{
1920 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1937 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
@@ -2001,8 +2018,18 @@ static int nfs4_reset_session(struct nfs_client *clp)
2001 nfs4_begin_drain_session(clp); 2018 nfs4_begin_drain_session(clp);
2002 cred = nfs4_get_exchange_id_cred(clp); 2019 cred = nfs4_get_exchange_id_cred(clp);
2003 status = nfs4_proc_destroy_session(clp->cl_session, cred); 2020 status = nfs4_proc_destroy_session(clp->cl_session, cred);
2004 if (status && status != -NFS4ERR_BADSESSION && 2021 switch (status) {
2005 status != -NFS4ERR_DEADSESSION) { 2022 case 0:
2023 case -NFS4ERR_BADSESSION:
2024 case -NFS4ERR_DEADSESSION:
2025 break;
2026 case -NFS4ERR_BACK_CHAN_BUSY:
2027 case -NFS4ERR_DELAY:
2028 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
2029 status = 0;
2030 ssleep(1);
2031 goto out;
2032 default:
2006 status = nfs4_recovery_handle_error(clp, status); 2033 status = nfs4_recovery_handle_error(clp, status);
2007 goto out; 2034 goto out;
2008 } 2035 }
@@ -2024,35 +2051,6 @@ out:
2024 return status; 2051 return status;
2025} 2052}
2026 2053
2027static int nfs4_recall_slot(struct nfs_client *clp)
2028{
2029 struct nfs4_slot_table *fc_tbl;
2030 struct nfs4_slot *new, *old;
2031 int i;
2032
2033 if (!nfs4_has_session(clp))
2034 return 0;
2035 nfs4_begin_drain_session(clp);
2036 fc_tbl = &clp->cl_session->fc_slot_table;
2037 new = kmalloc(fc_tbl->target_max_slots * sizeof(struct nfs4_slot),
2038 GFP_NOFS);
2039 if (!new)
2040 return -ENOMEM;
2041
2042 spin_lock(&fc_tbl->slot_tbl_lock);
2043 for (i = 0; i < fc_tbl->target_max_slots; i++)
2044 new[i].seq_nr = fc_tbl->slots[i].seq_nr;
2045 old = fc_tbl->slots;
2046 fc_tbl->slots = new;
2047 fc_tbl->max_slots = fc_tbl->target_max_slots;
2048 fc_tbl->target_max_slots = 0;
2049 clp->cl_session->fc_attrs.max_reqs = fc_tbl->max_slots;
2050 spin_unlock(&fc_tbl->slot_tbl_lock);
2051
2052 kfree(old);
2053 return 0;
2054}
2055
2056static int nfs4_bind_conn_to_session(struct nfs_client *clp) 2054static int nfs4_bind_conn_to_session(struct nfs_client *clp)
2057{ 2055{
2058 struct rpc_cred *cred; 2056 struct rpc_cred *cred;
@@ -2083,7 +2081,6 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
2083#else /* CONFIG_NFS_V4_1 */ 2081#else /* CONFIG_NFS_V4_1 */
2084static int nfs4_reset_session(struct nfs_client *clp) { return 0; } 2082static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
2085static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; } 2083static int nfs4_end_drain_session(struct nfs_client *clp) { return 0; }
2086static int nfs4_recall_slot(struct nfs_client *clp) { return 0; }
2087 2084
2088static int nfs4_bind_conn_to_session(struct nfs_client *clp) 2085static int nfs4_bind_conn_to_session(struct nfs_client *clp)
2089{ 2086{
@@ -2115,15 +2112,6 @@ static void nfs4_state_manager(struct nfs_client *clp)
2115 continue; 2112 continue;
2116 } 2113 }
2117 2114
2118 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
2119 section = "check lease";
2120 status = nfs4_check_lease(clp);
2121 if (status < 0)
2122 goto out_error;
2123 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
2124 continue;
2125 }
2126
2127 /* Initialize or reset the session */ 2115 /* Initialize or reset the session */
2128 if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) { 2116 if (test_and_clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) {
2129 section = "reset session"; 2117 section = "reset session";
@@ -2144,10 +2132,9 @@ static void nfs4_state_manager(struct nfs_client *clp)
2144 continue; 2132 continue;
2145 } 2133 }
2146 2134
2147 /* Recall session slots */ 2135 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
2148 if (test_and_clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state)) { 2136 section = "check lease";
2149 section = "recall slot"; 2137 status = nfs4_check_lease(clp);
2150 status = nfs4_recall_slot(clp);
2151 if (status < 0) 2138 if (status < 0)
2152 goto out_error; 2139 goto out_error;
2153 continue; 2140 continue;
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index bd61221ad2c5..84d2e9e2f313 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -51,6 +51,7 @@ static const struct super_operations nfs4_sops = {
51 .alloc_inode = nfs_alloc_inode, 51 .alloc_inode = nfs_alloc_inode,
52 .destroy_inode = nfs_destroy_inode, 52 .destroy_inode = nfs_destroy_inode,
53 .write_inode = nfs4_write_inode, 53 .write_inode = nfs4_write_inode,
54 .drop_inode = nfs_drop_inode,
54 .put_super = nfs_put_super, 55 .put_super = nfs_put_super,
55 .statfs = nfs_statfs, 56 .statfs = nfs_statfs,
56 .evict_inode = nfs4_evict_inode, 57 .evict_inode = nfs4_evict_inode,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 40836ee5dc3a..26b143920433 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -56,6 +56,7 @@
56 56
57#include "nfs4_fs.h" 57#include "nfs4_fs.h"
58#include "internal.h" 58#include "internal.h"
59#include "nfs4session.h"
59#include "pnfs.h" 60#include "pnfs.h"
60#include "netns.h" 61#include "netns.h"
61 62
@@ -270,6 +271,8 @@ static int nfs4_stat_to_errno(int);
270 271
271#if defined(CONFIG_NFS_V4_1) 272#if defined(CONFIG_NFS_V4_1)
272#define NFS4_MAX_MACHINE_NAME_LEN (64) 273#define NFS4_MAX_MACHINE_NAME_LEN (64)
274#define IMPL_NAME_LIMIT (sizeof(utsname()->sysname) + sizeof(utsname()->release) + \
275 sizeof(utsname()->version) + sizeof(utsname()->machine) + 8)
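
Assuming the usual __NEW_UTS_LEN of 64, each utsname field is 65 bytes, so
IMPL_NAME_LIMIT works out to 4 * 65 + 8 = 268: room for the four fields, the
three separating spaces and a NUL, with a little slack. That is far smaller
than the 1024-byte NFS4_OPAQUE_LIMIT buffer it replaces on the stack in
encode_exchange_id() below.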
273 276
274#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ 277#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
275 encode_verifier_maxsz + \ 278 encode_verifier_maxsz + \
@@ -282,7 +285,7 @@ static int nfs4_stat_to_errno(int);
282 1 /* nii_domain */ + \ 285 1 /* nii_domain */ + \
283 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 286 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
284 1 /* nii_name */ + \ 287 1 /* nii_name */ + \
285 XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 288 XDR_QUADLEN(IMPL_NAME_LIMIT) + \
286 3 /* nii_date */) 289 3 /* nii_date */)
287#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \ 290#define decode_exchange_id_maxsz (op_decode_hdr_maxsz + \
288 2 /* eir_clientid */ + \ 291 2 /* eir_clientid */ + \
@@ -936,7 +939,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
936 * but this is not required as a MUST for the server to do so. */ 939 * but this is not required as a MUST for the server to do so. */
937 hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; 940 hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen;
938 941
939 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); 942 WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN);
940 encode_string(xdr, hdr->taglen, hdr->tag); 943 encode_string(xdr, hdr->taglen, hdr->tag);
941 p = reserve_space(xdr, 8); 944 p = reserve_space(xdr, 8);
942 *p++ = cpu_to_be32(hdr->minorversion); 945 *p++ = cpu_to_be32(hdr->minorversion);
@@ -955,7 +958,7 @@ static void encode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 op,
955 958
956static void encode_nops(struct compound_hdr *hdr) 959static void encode_nops(struct compound_hdr *hdr)
957{ 960{
958 BUG_ON(hdr->nops > NFS4_MAX_OPS); 961 WARN_ON_ONCE(hdr->nops > NFS4_MAX_OPS);
959 *hdr->nops_p = htonl(hdr->nops); 962 *hdr->nops_p = htonl(hdr->nops);
960} 963}
961 964
@@ -1403,7 +1406,6 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
1403 *p = cpu_to_be32(NFS4_OPEN_NOCREATE); 1406 *p = cpu_to_be32(NFS4_OPEN_NOCREATE);
1404 break; 1407 break;
1405 default: 1408 default:
1406 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
1407 *p = cpu_to_be32(NFS4_OPEN_CREATE); 1409 *p = cpu_to_be32(NFS4_OPEN_CREATE);
1408 encode_createmode(xdr, arg); 1410 encode_createmode(xdr, arg);
1409 } 1411 }
@@ -1621,7 +1623,6 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1621 p = reserve_space(xdr, 2*4); 1623 p = reserve_space(xdr, 2*4);
1622 *p++ = cpu_to_be32(1); 1624 *p++ = cpu_to_be32(1);
1623 *p = cpu_to_be32(FATTR4_WORD0_ACL); 1625 *p = cpu_to_be32(FATTR4_WORD0_ACL);
1624 BUG_ON(arg->acl_len % 4);
1625 p = reserve_space(xdr, 4); 1626 p = reserve_space(xdr, 4);
1626 *p = cpu_to_be32(arg->acl_len); 1627 *p = cpu_to_be32(arg->acl_len);
1627 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1628 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
@@ -1713,7 +1714,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1713 struct compound_hdr *hdr) 1714 struct compound_hdr *hdr)
1714{ 1715{
1715 __be32 *p; 1716 __be32 *p;
1716 char impl_name[NFS4_OPAQUE_LIMIT]; 1717 char impl_name[IMPL_NAME_LIMIT];
1717 int len = 0; 1718 int len = 0;
1718 1719
1719 encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); 1720 encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
@@ -1728,7 +1729,7 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1728 if (send_implementation_id && 1729 if (send_implementation_id &&
1729 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 && 1730 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) > 1 &&
1730 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN) 1731 sizeof(CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN)
1731 <= NFS4_OPAQUE_LIMIT + 1) 1732 <= sizeof(impl_name) + 1)
1732 len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s", 1733 len = snprintf(impl_name, sizeof(impl_name), "%s %s %s %s",
1733 utsname()->sysname, utsname()->release, 1734 utsname()->sysname, utsname()->release,
1734 utsname()->version, utsname()->machine); 1735 utsname()->version, utsname()->machine);
@@ -1835,18 +1836,16 @@ static void encode_sequence(struct xdr_stream *xdr,
1835 struct compound_hdr *hdr) 1836 struct compound_hdr *hdr)
1836{ 1837{
1837#if defined(CONFIG_NFS_V4_1) 1838#if defined(CONFIG_NFS_V4_1)
1838 struct nfs4_session *session = args->sa_session; 1839 struct nfs4_session *session;
1839 struct nfs4_slot_table *tp; 1840 struct nfs4_slot_table *tp;
1840 struct nfs4_slot *slot; 1841 struct nfs4_slot *slot = args->sa_slot;
1841 __be32 *p; 1842 __be32 *p;
1842 1843
1843 if (!session) 1844 if (slot == NULL)
1844 return; 1845 return;
1845 1846
1846 tp = &session->fc_slot_table; 1847 tp = slot->table;
1847 1848 session = tp->session;
1848 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
1849 slot = tp->slots + args->sa_slotid;
1850 1849
1851 encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr); 1850 encode_op_hdr(xdr, OP_SEQUENCE, decode_sequence_maxsz, hdr);
1852 1851
@@ -1860,12 +1859,12 @@ static void encode_sequence(struct xdr_stream *xdr,
1860 ((u32 *)session->sess_id.data)[1], 1859 ((u32 *)session->sess_id.data)[1],
1861 ((u32 *)session->sess_id.data)[2], 1860 ((u32 *)session->sess_id.data)[2],
1862 ((u32 *)session->sess_id.data)[3], 1861 ((u32 *)session->sess_id.data)[3],
1863 slot->seq_nr, args->sa_slotid, 1862 slot->seq_nr, slot->slot_nr,
1864 tp->highest_used_slotid, args->sa_cache_this); 1863 tp->highest_used_slotid, args->sa_cache_this);
1865 p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16); 1864 p = reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 16);
1866 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); 1865 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1867 *p++ = cpu_to_be32(slot->seq_nr); 1866 *p++ = cpu_to_be32(slot->seq_nr);
1868 *p++ = cpu_to_be32(args->sa_slotid); 1867 *p++ = cpu_to_be32(slot->slot_nr);
1869 *p++ = cpu_to_be32(tp->highest_used_slotid); 1868 *p++ = cpu_to_be32(tp->highest_used_slotid);
1870 *p = cpu_to_be32(args->sa_cache_this); 1869 *p = cpu_to_be32(args->sa_cache_this);
1871#endif /* CONFIG_NFS_V4_1 */ 1870#endif /* CONFIG_NFS_V4_1 */
@@ -2027,8 +2026,9 @@ static void encode_free_stateid(struct xdr_stream *xdr,
2027static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) 2026static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
2028{ 2027{
2029#if defined(CONFIG_NFS_V4_1) 2028#if defined(CONFIG_NFS_V4_1)
2030 if (args->sa_session) 2029
2031 return args->sa_session->clp->cl_mvops->minor_version; 2030 if (args->sa_slot)
2031 return args->sa_slot->table->session->clp->cl_mvops->minor_version;
2032#endif /* CONFIG_NFS_V4_1 */ 2032#endif /* CONFIG_NFS_V4_1 */
2033 return 0; 2033 return 0;
2034} 2034}
@@ -5509,12 +5509,13 @@ static int decode_sequence(struct xdr_stream *xdr,
5509 struct rpc_rqst *rqstp) 5509 struct rpc_rqst *rqstp)
5510{ 5510{
5511#if defined(CONFIG_NFS_V4_1) 5511#if defined(CONFIG_NFS_V4_1)
5512 struct nfs4_session *session;
5512 struct nfs4_sessionid id; 5513 struct nfs4_sessionid id;
5513 u32 dummy; 5514 u32 dummy;
5514 int status; 5515 int status;
5515 __be32 *p; 5516 __be32 *p;
5516 5517
5517 if (!res->sr_session) 5518 if (res->sr_slot == NULL)
5518 return 0; 5519 return 0;
5519 5520
5520 status = decode_op_hdr(xdr, OP_SEQUENCE); 5521 status = decode_op_hdr(xdr, OP_SEQUENCE);
@@ -5528,8 +5529,9 @@ static int decode_sequence(struct xdr_stream *xdr,
5528 * sequence number, the server is looney tunes. 5529 * sequence number, the server is looney tunes.
5529 */ 5530 */
5530 status = -EREMOTEIO; 5531 status = -EREMOTEIO;
5532 session = res->sr_slot->table->session;
5531 5533
5532 if (memcmp(id.data, res->sr_session->sess_id.data, 5534 if (memcmp(id.data, session->sess_id.data,
5533 NFS4_MAX_SESSIONID_LEN)) { 5535 NFS4_MAX_SESSIONID_LEN)) {
5534 dprintk("%s Invalid session id\n", __func__); 5536 dprintk("%s Invalid session id\n", __func__);
5535 goto out_err; 5537 goto out_err;
@@ -5547,14 +5549,14 @@ static int decode_sequence(struct xdr_stream *xdr,
5547 } 5549 }
5548 /* slot id */ 5550 /* slot id */
5549 dummy = be32_to_cpup(p++); 5551 dummy = be32_to_cpup(p++);
5550 if (dummy != res->sr_slot - res->sr_session->fc_slot_table.slots) { 5552 if (dummy != res->sr_slot->slot_nr) {
5551 dprintk("%s Invalid slot id\n", __func__); 5553 dprintk("%s Invalid slot id\n", __func__);
5552 goto out_err; 5554 goto out_err;
5553 } 5555 }
5554 /* highest slot id - currently not processed */ 5556 /* highest slot id */
5555 dummy = be32_to_cpup(p++); 5557 res->sr_highest_slotid = be32_to_cpup(p++);
5556 /* target highest slot id - currently not processed */ 5558 /* target highest slot id */
5557 dummy = be32_to_cpup(p++); 5559 res->sr_target_highest_slotid = be32_to_cpup(p++);
5558 /* result flags */ 5560 /* result flags */
5559 res->sr_status_flags = be32_to_cpup(p); 5561 res->sr_status_flags = be32_to_cpup(p);
5560 status = 0; 5562 status = 0;
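
With the two formerly discarded fields now captured, the SEQUENCE reply body
decoded above is five big-endian 32-bit words following the 16-byte opaque
sessionid: sequence number, slot id, highest slotid, target highest slotid,
and status flags (RFC 5661). A hypothetical userspace decoder for just that
layout, not the kernel's xdr_stream API:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <arpa/inet.h>		/* ntohl()/htonl() */

	struct seq_res {
		uint32_t seq_nr, slot_nr;
		uint32_t highest_slotid, target_highest_slotid, status_flags;
	};

	static void decode_sequence_body(const uint8_t *p, struct seq_res *res)
	{
		uint32_t w[5];

		p += 16;		/* skip the opaque sessionid */
		memcpy(w, p, sizeof(w));
		res->seq_nr = ntohl(w[0]);
		res->slot_nr = ntohl(w[1]);
		res->highest_slotid = ntohl(w[2]);
		res->target_highest_slotid = ntohl(w[3]);
		res->status_flags = ntohl(w[4]);
	}

	int main(void)
	{
		uint8_t buf[36] = { 0 };	/* sessionid + 5 words */
		const uint32_t fields[5] = { htonl(7), htonl(3), htonl(9),
					     htonl(15), htonl(0) };
		struct seq_res res;

		memcpy(buf + 16, fields, sizeof(fields));
		decode_sequence_body(buf, &res);
		printf("seq %u slot %u highest %u target %u\n",
		       (unsigned)res.seq_nr, (unsigned)res.slot_nr,
		       (unsigned)res.highest_slotid,
		       (unsigned)res.target_highest_slotid);
		return 0;
	}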
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 874613545301..a9ebd817278b 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -148,17 +148,6 @@ end_offset(u64 start, u64 len)
148 return end >= start ? end : NFS4_MAX_UINT64; 148 return end >= start ? end : NFS4_MAX_UINT64;
149} 149}
150 150
151/* last octet in a range */
152static inline u64
153last_byte_offset(u64 start, u64 len)
154{
155 u64 end;
156
157 BUG_ON(!len);
158 end = start + len;
159 return end > start ? end - 1 : NFS4_MAX_UINT64;
160}
161
162static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, 151static void _fix_verify_io_params(struct pnfs_layout_segment *lseg,
163 struct page ***p_pages, unsigned *p_pgbase, 152 struct page ***p_pages, unsigned *p_pgbase,
164 u64 offset, unsigned long count) 153 u64 offset, unsigned long count)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2878f97bd78d..d00260b08103 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -254,7 +254,7 @@ static void
254pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) 254pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
255{ 255{
256 lo->plh_retry_timestamp = jiffies; 256 lo->plh_retry_timestamp = jiffies;
257 if (test_and_set_bit(fail_bit, &lo->plh_flags)) 257 if (!test_and_set_bit(fail_bit, &lo->plh_flags))
258 atomic_inc(&lo->plh_refcount); 258 atomic_inc(&lo->plh_refcount);
259} 259}
260 260
@@ -369,17 +369,6 @@ end_offset(u64 start, u64 len)
369 return end >= start ? end : NFS4_MAX_UINT64; 369 return end >= start ? end : NFS4_MAX_UINT64;
370} 370}
371 371
372/* last octet in a range */
373static inline u64
374last_byte_offset(u64 start, u64 len)
375{
376 u64 end;
377
378 BUG_ON(!len);
379 end = start + len;
380 return end > start ? end - 1 : NFS4_MAX_UINT64;
381}
382
383/* 372/*
384 * is l2 fully contained in l1? 373 * is l2 fully contained in l1?
385 * start1 end1 374 * start1 end1
@@ -645,7 +634,6 @@ send_layoutget(struct pnfs_layout_hdr *lo,
645 634
646 dprintk("--> %s\n", __func__); 635 dprintk("--> %s\n", __func__);
647 636
648 BUG_ON(ctx == NULL);
649 lgp = kzalloc(sizeof(*lgp), gfp_flags); 637 lgp = kzalloc(sizeof(*lgp), gfp_flags);
650 if (lgp == NULL) 638 if (lgp == NULL)
651 return NULL; 639 return NULL;
@@ -1126,7 +1114,6 @@ pnfs_update_layout(struct inode *ino,
1126 * chance of a CB_LAYOUTRECALL(FILE) coming in. 1114 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1127 */ 1115 */
1128 spin_lock(&clp->cl_lock); 1116 spin_lock(&clp->cl_lock);
1129 BUG_ON(!list_empty(&lo->plh_layouts));
1130 list_add_tail(&lo->plh_layouts, &server->layouts); 1117 list_add_tail(&lo->plh_layouts, &server->layouts);
1131 spin_unlock(&clp->cl_lock); 1118 spin_unlock(&clp->cl_lock);
1132 } 1119 }
@@ -1222,7 +1209,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
1222{ 1209{
1223 u64 rd_size = req->wb_bytes; 1210 u64 rd_size = req->wb_bytes;
1224 1211
1225 BUG_ON(pgio->pg_lseg != NULL); 1212 WARN_ON_ONCE(pgio->pg_lseg != NULL);
1226 1213
1227 if (req->wb_offset != req->wb_pgbase) { 1214 if (req->wb_offset != req->wb_pgbase) {
1228 nfs_pageio_reset_read_mds(pgio); 1215 nfs_pageio_reset_read_mds(pgio);
@@ -1251,7 +1238,7 @@ void
1251pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 1238pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1252 struct nfs_page *req, u64 wb_size) 1239 struct nfs_page *req, u64 wb_size)
1253{ 1240{
1254 BUG_ON(pgio->pg_lseg != NULL); 1241 WARN_ON_ONCE(pgio->pg_lseg != NULL);
1255 1242
1256 if (req->wb_offset != req->wb_pgbase) { 1243 if (req->wb_offset != req->wb_pgbase) {
1257 nfs_pageio_reset_write_mds(pgio); 1244 nfs_pageio_reset_write_mds(pgio);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 50a88c3546ed..f084dac948e1 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -47,39 +47,6 @@
47#define NFSDBG_FACILITY NFSDBG_PROC 47#define NFSDBG_FACILITY NFSDBG_PROC
48 48
49/* 49/*
50 * wrapper to handle the -EKEYEXPIRED error message. This should generally
51 * only happen if using krb5 auth and a user's TGT expires. NFSv2 doesn't
52 * support the NFSERR_JUKEBOX error code, but we handle this situation in the
53 * same way that we handle that error with NFSv3.
54 */
55static int
56nfs_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
57{
58 int res;
59 do {
60 res = rpc_call_sync(clnt, msg, flags);
61 if (res != -EKEYEXPIRED)
62 break;
63 freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
64 res = -ERESTARTSYS;
65 } while (!fatal_signal_pending(current));
66 return res;
67}
68
69#define rpc_call_sync(clnt, msg, flags) nfs_rpc_wrapper(clnt, msg, flags)
70
71static int
72nfs_async_handle_expired_key(struct rpc_task *task)
73{
74 if (task->tk_status != -EKEYEXPIRED)
75 return 0;
76 task->tk_status = 0;
77 rpc_restart_call(task);
78 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
79 return 1;
80}
81
82/*
83 * Bare-bones access to getattr: this is for nfs_read_super. 50 * Bare-bones access to getattr: this is for nfs_read_super.
84 */ 51 */
85static int 52static int
@@ -364,8 +331,6 @@ static void nfs_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlink
364 331
365static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir) 332static int nfs_proc_unlink_done(struct rpc_task *task, struct inode *dir)
366{ 333{
367 if (nfs_async_handle_expired_key(task))
368 return 0;
369 nfs_mark_for_revalidate(dir); 334 nfs_mark_for_revalidate(dir);
370 return 1; 335 return 1;
371} 336}
@@ -385,8 +350,6 @@ static int
385nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir, 350nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
386 struct inode *new_dir) 351 struct inode *new_dir)
387{ 352{
388 if (nfs_async_handle_expired_key(task))
389 return 0;
390 nfs_mark_for_revalidate(old_dir); 353 nfs_mark_for_revalidate(old_dir);
391 nfs_mark_for_revalidate(new_dir); 354 nfs_mark_for_revalidate(new_dir);
392 return 1; 355 return 1;
@@ -642,9 +605,6 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
642{ 605{
643 struct inode *inode = data->header->inode; 606 struct inode *inode = data->header->inode;
644 607
645 if (nfs_async_handle_expired_key(task))
646 return -EAGAIN;
647
648 nfs_invalidate_atime(inode); 608 nfs_invalidate_atime(inode);
649 if (task->tk_status >= 0) { 609 if (task->tk_status >= 0) {
650 nfs_refresh_inode(inode, data->res.fattr); 610 nfs_refresh_inode(inode, data->res.fattr);
@@ -671,9 +631,6 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
671{ 631{
672 struct inode *inode = data->header->inode; 632 struct inode *inode = data->header->inode;
673 633
674 if (nfs_async_handle_expired_key(task))
675 return -EAGAIN;
676
677 if (task->tk_status >= 0) 634 if (task->tk_status >= 0)
678 nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); 635 nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
679 return 0; 636 return 0;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index b6bdb18e892c..a5e5d9899d56 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -91,12 +91,16 @@ void nfs_readdata_release(struct nfs_read_data *rdata)
91 put_nfs_open_context(rdata->args.context); 91 put_nfs_open_context(rdata->args.context);
92 if (rdata->pages.pagevec != rdata->pages.page_array) 92 if (rdata->pages.pagevec != rdata->pages.page_array)
93 kfree(rdata->pages.pagevec); 93 kfree(rdata->pages.pagevec);
94 if (rdata != &read_header->rpc_data) 94 if (rdata == &read_header->rpc_data) {
95 kfree(rdata);
96 else
97 rdata->header = NULL; 95 rdata->header = NULL;
96 rdata = NULL;
97 }
98 if (atomic_dec_and_test(&hdr->refcnt)) 98 if (atomic_dec_and_test(&hdr->refcnt))
99 hdr->completion_ops->completion(hdr); 99 hdr->completion_ops->completion(hdr);
100 /* Note: we only free the rpc_task after callbacks are done.
101 * See the comment in rpc_free_task() for why
102 */
103 kfree(rdata);
100} 104}
101EXPORT_SYMBOL_GPL(nfs_readdata_release); 105EXPORT_SYMBOL_GPL(nfs_readdata_release);
102 106
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 652d3f7176a9..b056b1628722 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -64,6 +64,7 @@
64#include "iostat.h" 64#include "iostat.h"
65#include "internal.h" 65#include "internal.h"
66#include "fscache.h" 66#include "fscache.h"
67#include "nfs4session.h"
67#include "pnfs.h" 68#include "pnfs.h"
68#include "nfs.h" 69#include "nfs.h"
69 70
@@ -307,6 +308,7 @@ const struct super_operations nfs_sops = {
307 .alloc_inode = nfs_alloc_inode, 308 .alloc_inode = nfs_alloc_inode,
308 .destroy_inode = nfs_destroy_inode, 309 .destroy_inode = nfs_destroy_inode,
309 .write_inode = nfs_write_inode, 310 .write_inode = nfs_write_inode,
311 .drop_inode = nfs_drop_inode,
310 .put_super = nfs_put_super, 312 .put_super = nfs_put_super,
311 .statfs = nfs_statfs, 313 .statfs = nfs_statfs,
312 .evict_inode = nfs_evict_inode, 314 .evict_inode = nfs_evict_inode,
@@ -1150,7 +1152,7 @@ static int nfs_get_option_str(substring_t args[], char **option)
1150{ 1152{
1151 kfree(*option); 1153 kfree(*option);
1152 *option = match_strdup(args); 1154 *option = match_strdup(args);
1153 return !option; 1155 return !*option;
1154} 1156}
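
The one-character fix above is easy to miss: the old "return !option" tested
the address of the caller's pointer, which is never NULL, so match_strdup()
failures were silently reported as success. A hypothetical userspace
reduction of the bug and the fix:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	static int get_option_str(const char *arg, char **option)
	{
		free(*option);
		*option = arg ? strdup(arg) : NULL;	/* may fail */
		return !*option;	/* the fix: 0 on success, 1 on failure */
		/* "return !option;" is always 0: &opt is never NULL */
	}

	int main(void)
	{
		char *opt = NULL;
		int ok = get_option_str("sec=krb5", &opt);
		int fail = get_option_str(NULL, &opt);

		printf("ok=%d fail=%d\n", ok, fail);
		free(opt);
		return 0;
	}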
1155 1157
1156static int nfs_get_option_ul(substring_t args[], unsigned long *option) 1158static int nfs_get_option_ul(substring_t args[], unsigned long *option)
@@ -2373,19 +2375,30 @@ static void nfs_get_cache_cookie(struct super_block *sb,
2373 struct nfs_parsed_mount_data *parsed, 2375 struct nfs_parsed_mount_data *parsed,
2374 struct nfs_clone_mount *cloned) 2376 struct nfs_clone_mount *cloned)
2375{ 2377{
2378 struct nfs_server *nfss = NFS_SB(sb);
2376 char *uniq = NULL; 2379 char *uniq = NULL;
2377 int ulen = 0; 2380 int ulen = 0;
2378 2381
2379 if (parsed && parsed->fscache_uniq) { 2382 nfss->fscache_key = NULL;
2380 uniq = parsed->fscache_uniq; 2383 nfss->fscache = NULL;
2381 ulen = strlen(parsed->fscache_uniq); 2384
2385 if (parsed) {
2386 if (!(parsed->options & NFS_OPTION_FSCACHE))
2387 return;
2388 if (parsed->fscache_uniq) {
2389 uniq = parsed->fscache_uniq;
2390 ulen = strlen(parsed->fscache_uniq);
2391 }
2382 } else if (cloned) { 2392 } else if (cloned) {
2383 struct nfs_server *mnt_s = NFS_SB(cloned->sb); 2393 struct nfs_server *mnt_s = NFS_SB(cloned->sb);
2394 if (!(mnt_s->options & NFS_OPTION_FSCACHE))
2395 return;
2384 if (mnt_s->fscache_key) { 2396 if (mnt_s->fscache_key) {
2385 uniq = mnt_s->fscache_key->key.uniquifier; 2397 uniq = mnt_s->fscache_key->key.uniquifier;
2386 ulen = mnt_s->fscache_key->key.uniq_len; 2398 ulen = mnt_s->fscache_key->key.uniq_len;
2387 }; 2399 };
2388 } 2400 } else
2401 return;
2389 2402
2390 nfs_fscache_get_super_cookie(sb, uniq, ulen); 2403 nfs_fscache_get_super_cookie(sb, uniq, ulen);
2391} 2404}
@@ -2576,27 +2589,23 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
2576 struct nfs_server *server; 2589 struct nfs_server *server;
2577 struct dentry *mntroot = ERR_PTR(-ENOMEM); 2590 struct dentry *mntroot = ERR_PTR(-ENOMEM);
2578 struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; 2591 struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod;
2579 int error;
2580 2592
2581 dprintk("--> nfs_xdev_mount_common()\n"); 2593 dprintk("--> nfs_xdev_mount()\n");
2582 2594
2583 mount_info.mntfh = mount_info.cloned->fh; 2595 mount_info.mntfh = mount_info.cloned->fh;
2584 2596
2585 /* create a new volume representation */ 2597 /* create a new volume representation */
2586 server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); 2598 server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor);
2587 if (IS_ERR(server)) {
2588 error = PTR_ERR(server);
2589 goto out_err;
2590 }
2591 2599
2592 mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, nfs_mod); 2600 if (IS_ERR(server))
2593 dprintk("<-- nfs_xdev_mount_common() = 0\n"); 2601 mntroot = ERR_CAST(server);
2594out: 2602 else
2595 return mntroot; 2603 mntroot = nfs_fs_mount_common(server, flags,
2604 dev_name, &mount_info, nfs_mod);
2596 2605
2597out_err: 2606 dprintk("<-- nfs_xdev_mount() = %ld\n",
2598 dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error); 2607 IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L);
2599 goto out; 2608 return mntroot;
2600} 2609}
2601 2610
2602#if IS_ENABLED(CONFIG_NFS_V4) 2611#if IS_ENABLED(CONFIG_NFS_V4)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9347ab7c9574..c483cc50b82e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -126,12 +126,16 @@ void nfs_writedata_release(struct nfs_write_data *wdata)
126 put_nfs_open_context(wdata->args.context); 126 put_nfs_open_context(wdata->args.context);
127 if (wdata->pages.pagevec != wdata->pages.page_array) 127 if (wdata->pages.pagevec != wdata->pages.page_array)
128 kfree(wdata->pages.pagevec); 128 kfree(wdata->pages.pagevec);
129 if (wdata != &write_header->rpc_data) 129 if (wdata == &write_header->rpc_data) {
130 kfree(wdata);
131 else
132 wdata->header = NULL; 130 wdata->header = NULL;
131 wdata = NULL;
132 }
133 if (atomic_dec_and_test(&hdr->refcnt)) 133 if (atomic_dec_and_test(&hdr->refcnt))
134 hdr->completion_ops->completion(hdr); 134 hdr->completion_ops->completion(hdr);
135 /* Note: we only free the rpc_task after callbacks are done.
136 * See the comment in rpc_free_task() for why
137 */
138 kfree(wdata);
135} 139}
136EXPORT_SYMBOL_GPL(nfs_writedata_release); 140EXPORT_SYMBOL_GPL(nfs_writedata_release);
137 141
@@ -202,7 +206,6 @@ out:
202/* A writeback failed: mark the page as bad, and invalidate the page cache */ 206/* A writeback failed: mark the page as bad, and invalidate the page cache */
203static void nfs_set_pageerror(struct page *page) 207static void nfs_set_pageerror(struct page *page)
204{ 208{
205 SetPageError(page);
206 nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); 209 nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
207} 210}
208 211
@@ -239,21 +242,18 @@ int nfs_congestion_kb;
239#define NFS_CONGESTION_OFF_THRESH \ 242#define NFS_CONGESTION_OFF_THRESH \
240 (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) 243 (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
241 244
242static int nfs_set_page_writeback(struct page *page) 245static void nfs_set_page_writeback(struct page *page)
243{ 246{
247 struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host);
244 int ret = test_set_page_writeback(page); 248 int ret = test_set_page_writeback(page);
245 249
246 if (!ret) { 250 WARN_ON_ONCE(ret != 0);
247 struct inode *inode = page_file_mapping(page)->host;
248 struct nfs_server *nfss = NFS_SERVER(inode);
249 251
250 if (atomic_long_inc_return(&nfss->writeback) > 252 if (atomic_long_inc_return(&nfss->writeback) >
251 NFS_CONGESTION_ON_THRESH) { 253 NFS_CONGESTION_ON_THRESH) {
252 set_bdi_congested(&nfss->backing_dev_info, 254 set_bdi_congested(&nfss->backing_dev_info,
253 BLK_RW_ASYNC); 255 BLK_RW_ASYNC);
254 }
255 } 256 }
256 return ret;
257} 257}
258 258
259static void nfs_end_page_writeback(struct page *page) 259static void nfs_end_page_writeback(struct page *page)
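The nfs_set_page_writeback() change above is part of a broader BUG_ON-to-WARN_ON_ONCE conversion in this series: an unexpected condition now gets one logged backtrace instead of a crash. Since test_set_page_writeback() returns the previous flag value, a nonzero result means the page was already under writeback. A sketch of the pattern:

#include <linux/bug.h>
#include <linux/page-flags.h>

static void set_writeback_sketch(struct page *page)
{
	/* WARN_ON_ONCE() evaluates to its condition, so callers can
	 * still branch on it; here only the diagnostic is wanted. */
	WARN_ON_ONCE(test_set_page_writeback(page) != 0);
}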
@@ -315,10 +315,10 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
315 if (IS_ERR(req)) 315 if (IS_ERR(req))
316 goto out; 316 goto out;
317 317
318 ret = nfs_set_page_writeback(page); 318 nfs_set_page_writeback(page);
319 BUG_ON(ret != 0); 319 WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
320 BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
321 320
321 ret = 0;
322 if (!nfs_pageio_add_request(pgio, req)) { 322 if (!nfs_pageio_add_request(pgio, req)) {
323 nfs_redirty_request(req); 323 nfs_redirty_request(req);
324 ret = pgio->pg_error; 324 ret = pgio->pg_error;
@@ -451,8 +451,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
451 struct inode *inode = req->wb_context->dentry->d_inode; 451 struct inode *inode = req->wb_context->dentry->d_inode;
452 struct nfs_inode *nfsi = NFS_I(inode); 452 struct nfs_inode *nfsi = NFS_I(inode);
453 453
454 BUG_ON (!NFS_WBACK_BUSY(req));
455
456 spin_lock(&inode->i_lock); 454 spin_lock(&inode->i_lock);
457 if (likely(!PageSwapCache(req->wb_page))) { 455 if (likely(!PageSwapCache(req->wb_page))) {
458 set_page_private(req->wb_page, 0); 456 set_page_private(req->wb_page, 0);
@@ -884,7 +882,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
884{ 882{
885 if (nfs_have_delegated_attributes(inode)) 883 if (nfs_have_delegated_attributes(inode))
886 goto out; 884 goto out;
887 if (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) 885 if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
888 return false; 886 return false;
889out: 887out:
890 return PageUptodate(page) != 0; 888 return PageUptodate(page) != 0;
@@ -1727,7 +1725,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1727 struct nfs_page *req; 1725 struct nfs_page *req;
1728 int ret = 0; 1726 int ret = 0;
1729 1727
1730 BUG_ON(!PageLocked(page));
1731 for (;;) { 1728 for (;;) {
1732 wait_on_page_writeback(page); 1729 wait_on_page_writeback(page);
1733 req = nfs_page_find_request(page); 1730 req = nfs_page_find_request(page);
@@ -1801,7 +1798,8 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1801 if (PagePrivate(page)) 1798 if (PagePrivate(page))
1802 return -EBUSY; 1799 return -EBUSY;
1803 1800
1804 nfs_fscache_release_page(page, GFP_KERNEL); 1801 if (!nfs_fscache_release_page(page, GFP_KERNEL))
1802 return -EBUSY;
1805 1803
1806 return migrate_page(mapping, newpage, page, mode); 1804 return migrate_page(mapping, newpage, page, mode);
1807} 1805}
@@ -1829,7 +1827,7 @@ int __init nfs_init_writepagecache(void)
1829 goto out_destroy_write_mempool; 1827 goto out_destroy_write_mempool;
1830 1828
1831 nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT, 1829 nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
1832 nfs_wdata_cachep); 1830 nfs_cdata_cachep);
1833 if (nfs_commit_mempool == NULL) 1831 if (nfs_commit_mempool == NULL)
1834 goto out_destroy_commit_cache; 1832 goto out_destroy_commit_cache;
1835 1833
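The last hunk above is a real bug fix: nfs_commit_mempool was backed by the write-data slab cache instead of the commit-data cache, so pool objects had the wrong size. The rule is that a slab-backed mempool must be created from the cache whose objects it will hand out. A sketch with hypothetical names:

#include <linux/mempool.h>
#include <linux/slab.h>

struct demo_commit { int placeholder; };	/* hypothetical payload */

static struct kmem_cache *demo_cachep;
static mempool_t *demo_mempool;

static int demo_pools_init(void)
{
	demo_cachep = kmem_cache_create("demo_commit",
					sizeof(struct demo_commit),
					0, SLAB_HWCACHE_ALIGN, NULL);
	if (!demo_cachep)
		return -ENOMEM;
	/* reserve a minimum of 4 objects for allocation under pressure */
	demo_mempool = mempool_create_slab_pool(4, demo_cachep);
	if (!demo_mempool) {
		kmem_cache_destroy(demo_cachep);
		return -ENOMEM;
	}
	return 0;
}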
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index e6c38159622f..e761ee95617f 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -8,61 +8,144 @@
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/nsproxy.h>
12#include <linux/sunrpc/clnt.h>
13#include <asm/uaccess.h>
11 14
12#include "state.h" 15#include "state.h"
13#include "fault_inject.h" 16#include "netns.h"
14 17
15struct nfsd_fault_inject_op { 18struct nfsd_fault_inject_op {
16 char *file; 19 char *file;
17 void (*func)(u64); 20 u64 (*forget)(struct nfs4_client *, u64);
21 u64 (*print)(struct nfs4_client *, u64);
18}; 22};
19 23
20static struct nfsd_fault_inject_op inject_ops[] = { 24static struct nfsd_fault_inject_op inject_ops[] = {
21 { 25 {
22 .file = "forget_clients", 26 .file = "forget_clients",
23 .func = nfsd_forget_clients, 27 .forget = nfsd_forget_client,
28 .print = nfsd_print_client,
24 }, 29 },
25 { 30 {
26 .file = "forget_locks", 31 .file = "forget_locks",
27 .func = nfsd_forget_locks, 32 .forget = nfsd_forget_client_locks,
33 .print = nfsd_print_client_locks,
28 }, 34 },
29 { 35 {
30 .file = "forget_openowners", 36 .file = "forget_openowners",
31 .func = nfsd_forget_openowners, 37 .forget = nfsd_forget_client_openowners,
38 .print = nfsd_print_client_openowners,
32 }, 39 },
33 { 40 {
34 .file = "forget_delegations", 41 .file = "forget_delegations",
35 .func = nfsd_forget_delegations, 42 .forget = nfsd_forget_client_delegations,
43 .print = nfsd_print_client_delegations,
36 }, 44 },
37 { 45 {
38 .file = "recall_delegations", 46 .file = "recall_delegations",
39 .func = nfsd_recall_delegations, 47 .forget = nfsd_recall_client_delegations,
48 .print = nfsd_print_client_delegations,
40 }, 49 },
41}; 50};
42 51
43static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op); 52static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
44static struct dentry *debug_dir; 53static struct dentry *debug_dir;
45 54
46static int nfsd_inject_set(void *op_ptr, u64 val) 55static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val)
47{ 56{
48 struct nfsd_fault_inject_op *op = op_ptr; 57 u64 count = 0;
49 58
50 if (val == 0) 59 if (val == 0)
51 printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file); 60 printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
52 else 61 else
53 printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val); 62 printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
54 63
55 op->func(val); 64 nfs4_lock_state();
56 return 0; 65 count = nfsd_for_n_state(val, op->forget);
66 nfs4_unlock_state();
67 printk(KERN_INFO "NFSD: %s: found %llu", op->file, count);
57} 68}
58 69
59static int nfsd_inject_get(void *data, u64 *val) 70static void nfsd_inject_set_client(struct nfsd_fault_inject_op *op,
71 struct sockaddr_storage *addr,
72 size_t addr_size)
60{ 73{
61 *val = 0; 74 char buf[INET6_ADDRSTRLEN];
62 return 0; 75 struct nfs4_client *clp;
76 u64 count;
77
78 nfs4_lock_state();
79 clp = nfsd_find_client(addr, addr_size);
80 if (clp) {
81 count = op->forget(clp, 0);
82 rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
83 printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count);
84 }
85 nfs4_unlock_state();
86}
87
88static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val)
89{
90 nfs4_lock_state();
91 *val = nfsd_for_n_state(0, op->print);
92 nfs4_unlock_state();
63} 93}
64 94
65DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n"); 95static ssize_t fault_inject_read(struct file *file, char __user *buf,
96 size_t len, loff_t *ppos)
97{
98 static u64 val;
99 char read_buf[25];
100 size_t size, ret;
101 loff_t pos = *ppos;
102
103 if (!pos)
104 nfsd_inject_get(file->f_dentry->d_inode->i_private, &val);
105 size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
106
107 if (pos < 0)
108 return -EINVAL;
109 if (pos >= size || !len)
110 return 0;
111 if (len > size - pos)
112 len = size - pos;
113 ret = copy_to_user(buf, read_buf + pos, len);
114 if (ret == len)
115 return -EFAULT;
116 len -= ret;
117 *ppos = pos + len;
118 return len;
119}
120
121static ssize_t fault_inject_write(struct file *file, const char __user *buf,
122 size_t len, loff_t *ppos)
123{
124 char write_buf[INET6_ADDRSTRLEN];
125 size_t size = min(sizeof(write_buf) - 1, len);
126 struct net *net = current->nsproxy->net_ns;
127 struct sockaddr_storage sa;
128 u64 val;
129
130 if (copy_from_user(write_buf, buf, size))
131 return -EFAULT;
132 write_buf[size] = '\0';
133
134 size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa));
135 if (size > 0)
136 nfsd_inject_set_client(file->f_dentry->d_inode->i_private, &sa, size);
137 else {
138 val = simple_strtoll(write_buf, NULL, 0);
139 nfsd_inject_set(file->f_dentry->d_inode->i_private, val);
140 }
141 return len; /* on success, claim we got the whole input */
142}
143
144static const struct file_operations fops_nfsd = {
145 .owner = THIS_MODULE,
146 .read = fault_inject_read,
147 .write = fault_inject_write,
148};
66 149
67void nfsd_fault_inject_cleanup(void) 150void nfsd_fault_inject_cleanup(void)
68{ 151{
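The rewrite above abandons DEFINE_SIMPLE_ATTRIBUTE(), which can only shuttle a u64 in each direction, for hand-rolled file_operations so the write path can also accept an IP address string. On the read side, the kernel helper simple_read_from_buffer() performs the same offset/copy_to_user bookkeeping the patch open-codes; a minimal sketch:

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/module.h>

static ssize_t demo_read(struct file *file, char __user *buf,
			 size_t len, loff_t *ppos)
{
	char tmp[32];
	int n = scnprintf(tmp, sizeof(tmp), "%llu\n", 42ULL);

	return simple_read_from_buffer(buf, len, ppos, tmp, n);
}

static const struct file_operations demo_fops = {
	.owner = THIS_MODULE,
	.read  = demo_read,
};

/* usage sketch:
 * debugfs_create_file("demo", S_IRUSR, parent_dir, private, &demo_fops);
 */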
diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h
deleted file mode 100644
index 90bd0570956c..000000000000
--- a/fs/nfsd/fault_inject.h
+++ /dev/null
@@ -1,28 +0,0 @@
1/*
2 * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
3 *
4 * Function definitions for fault injection
5 */
6
7#ifndef LINUX_NFSD_FAULT_INJECT_H
8#define LINUX_NFSD_FAULT_INJECT_H
9
10#ifdef CONFIG_NFSD_FAULT_INJECTION
11int nfsd_fault_inject_init(void);
12void nfsd_fault_inject_cleanup(void);
13void nfsd_forget_clients(u64);
14void nfsd_forget_locks(u64);
15void nfsd_forget_openowners(u64);
16void nfsd_forget_delegations(u64);
17void nfsd_recall_delegations(u64);
18#else /* CONFIG_NFSD_FAULT_INJECTION */
19static inline int nfsd_fault_inject_init(void) { return 0; }
20static inline void nfsd_fault_inject_cleanup(void) {}
21static inline void nfsd_forget_clients(u64 num) {}
22static inline void nfsd_forget_locks(u64 num) {}
23static inline void nfsd_forget_openowners(u64 num) {}
24static inline void nfsd_forget_delegations(u64 num) {}
25static inline void nfsd_recall_delegations(u64 num) {}
26#endif /* CONFIG_NFSD_FAULT_INJECTION */
27
28#endif /* LINUX_NFSD_FAULT_INJECT_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 65c2431ea32f..1051bebff1b0 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -24,7 +24,18 @@
24#include <net/net_namespace.h> 24#include <net/net_namespace.h>
25#include <net/netns/generic.h> 25#include <net/netns/generic.h>
26 26
27/* Hash tables for nfs4_clientid state */
28#define CLIENT_HASH_BITS 4
29#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
30#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
31
32#define LOCKOWNER_INO_HASH_BITS 8
33#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
34
35#define SESSION_HASH_SIZE 512
36
27struct cld_net; 37struct cld_net;
38struct nfsd4_client_tracking_ops;
28 39
29struct nfsd_net { 40struct nfsd_net {
30 struct cld_net *cld_net; 41 struct cld_net *cld_net;
@@ -38,7 +49,62 @@ struct nfsd_net {
38 struct lock_manager nfsd4_manager; 49 struct lock_manager nfsd4_manager;
39 bool grace_ended; 50 bool grace_ended;
40 time_t boot_time; 51 time_t boot_time;
52
53 /*
 54 * reclaim_str_hashtbl[] holds known client info from a previous reset/reboot,
 55 * used during reboot/reset lease grace period processing.
 56 *
 57 * conf_id_hashtbl[] and conf_name_tree hold confirmed
 58 * setclientid_confirmed info.
 59 *
 60 * unconf_id_hashtbl[] and unconf_name_tree hold unconfirmed
61 * setclientid info.
62 */
63 struct list_head *reclaim_str_hashtbl;
64 int reclaim_str_hashtbl_size;
65 struct list_head *conf_id_hashtbl;
66 struct rb_root conf_name_tree;
67 struct list_head *unconf_id_hashtbl;
68 struct rb_root unconf_name_tree;
69 struct list_head *ownerstr_hashtbl;
70 struct list_head *lockowner_ino_hashtbl;
71 struct list_head *sessionid_hashtbl;
72 /*
 73 * client_lru holds the client queue, ordered by nfs4_client.cl_time,
 74 * for lease renewal.
 75 *
 76 * close_lru holds the (open) stateowner queue, ordered by
 77 * nfs4_stateowner.so_time, for last close replay.
78 *
79 * All of the above fields are protected by the client_mutex.
80 */
81 struct list_head client_lru;
82 struct list_head close_lru;
83
84 struct delayed_work laundromat_work;
85
86 /* client_lock protects the client lru list and session hash table */
87 spinlock_t client_lock;
88
89 struct file *rec_file;
90 bool in_grace;
91 struct nfsd4_client_tracking_ops *client_tracking_ops;
92
93 time_t nfsd4_lease;
94 time_t nfsd4_grace;
95
96 bool nfsd_net_up;
97
98 /*
99 * Time of server startup
100 */
101 struct timeval nfssvc_boot;
102
103 struct svc_serv *nfsd_serv;
41}; 104};
42 105
106/* Simple check to find out if a given net was properly initialized */
107#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
108
43extern int nfsd_net_id; 109extern int nfsd_net_id;
44#endif /* __NFSD_NETNS_H__ */ 110#endif /* __NFSD_NETNS_H__ */
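The netns.h additions above move nearly all NFSv4 server state into struct nfsd_net, keyed by network namespace. The lookup pattern is standard pernet plumbing: register a pernet subsystem with an id and size, then fetch the per-namespace instance with net_generic(). A sketch with hypothetical names:

#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct demo_net {			/* hypothetical per-net state */
	bool up;
};

static int demo_net_id;

static struct pernet_operations demo_net_ops = {
	.id   = &demo_net_id,		/* filled in at registration */
	.size = sizeof(struct demo_net),	/* allocated per namespace */
};

/* after register_pernet_subsys(&demo_net_ops), each net namespace owns
 * one instance, retrieved by key: */
static struct demo_net *demo_net(struct net *net)
{
	return net_generic(net, demo_net_id);
}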
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index b314888825d5..9170861c804a 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -253,7 +253,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
253 (resp->mask & NFS_ACL) ? resp->acl_access : NULL, 253 (resp->mask & NFS_ACL) ? resp->acl_access : NULL,
254 (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); 254 (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
255 while (w > 0) { 255 while (w > 0) {
256 if (!rqstp->rq_respages[rqstp->rq_resused++]) 256 if (!*(rqstp->rq_next_page++))
257 return 0; 257 return 0;
258 w -= PAGE_SIZE; 258 w -= PAGE_SIZE;
259 } 259 }
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index a596e9d987e4..9cbc1a841f87 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -184,7 +184,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
184 (resp->mask & NFS_ACL) ? resp->acl_access : NULL, 184 (resp->mask & NFS_ACL) ? resp->acl_access : NULL,
185 (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); 185 (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
186 while (w > 0) { 186 while (w > 0) {
187 if (!rqstp->rq_respages[rqstp->rq_resused++]) 187 if (!*(rqstp->rq_next_page++))
188 return 0; 188 return 0;
189 w -= PAGE_SIZE; 189 w -= PAGE_SIZE;
190 } 190 }
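This hunk and the previous one show the rq_resused-to-rq_next_page conversion that runs through the rest of the nfsd diffs below: an integer index into rq_respages[] becomes a cursor pointer, so consuming a response page is a post-increment and "pages used" is a pointer difference. A sketch with hypothetical types:

#include <linux/types.h>

struct page;

struct demo_rqst {
	struct page **respages;		/* base of the response page array */
	struct page **next_page;	/* cursor: first unconsumed page */
};

static struct page *demo_take_page(struct demo_rqst *rq)
{
	return *(rq->next_page++);	/* was: respages[rq->resused++] */
}

static ptrdiff_t demo_pages_used(const struct demo_rqst *rq)
{
	return rq->next_page - rq->respages;
}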
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 97d90d1c8608..1fc02dfdc5c4 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -460,7 +460,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
460 __be32 nfserr; 460 __be32 nfserr;
461 int count = 0; 461 int count = 0;
462 loff_t offset; 462 loff_t offset;
463 int i; 463 struct page **p;
464 caddr_t page_addr = NULL; 464 caddr_t page_addr = NULL;
465 465
466 dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", 466 dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
@@ -484,8 +484,8 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
484 &resp->common, 484 &resp->common,
485 nfs3svc_encode_entry_plus); 485 nfs3svc_encode_entry_plus);
486 memcpy(resp->verf, argp->verf, 8); 486 memcpy(resp->verf, argp->verf, 8);
487 for (i=1; i<rqstp->rq_resused ; i++) { 487 for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
488 page_addr = page_address(rqstp->rq_respages[i]); 488 page_addr = page_address(*p);
489 489
490 if (((caddr_t)resp->buffer >= page_addr) && 490 if (((caddr_t)resp->buffer >= page_addr) &&
491 ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { 491 ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 43f46cd9edea..324c0baf7cda 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -7,8 +7,10 @@
7 */ 7 */
8 8
9#include <linux/namei.h> 9#include <linux/namei.h>
10#include <linux/sunrpc/svc_xprt.h>
10#include "xdr3.h" 11#include "xdr3.h"
11#include "auth.h" 12#include "auth.h"
13#include "netns.h"
12 14
13#define NFSDDBG_FACILITY NFSDDBG_XDR 15#define NFSDDBG_FACILITY NFSDDBG_XDR
14 16
@@ -323,7 +325,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
323 struct nfsd3_readargs *args) 325 struct nfsd3_readargs *args)
324{ 326{
325 unsigned int len; 327 unsigned int len;
326 int v,pn; 328 int v;
327 u32 max_blocksize = svc_max_payload(rqstp); 329 u32 max_blocksize = svc_max_payload(rqstp);
328 330
329 if (!(p = decode_fh(p, &args->fh))) 331 if (!(p = decode_fh(p, &args->fh)))
@@ -338,8 +340,9 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
338 /* set up the kvec */ 340 /* set up the kvec */
339 v=0; 341 v=0;
340 while (len > 0) { 342 while (len > 0) {
341 pn = rqstp->rq_resused++; 343 struct page *p = *(rqstp->rq_next_page++);
342 rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); 344
345 rqstp->rq_vec[v].iov_base = page_address(p);
343 rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE; 346 rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
344 len -= rqstp->rq_vec[v].iov_len; 347 len -= rqstp->rq_vec[v].iov_len;
345 v++; 348 v++;
@@ -461,8 +464,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
461 len = ntohl(*p++); 464 len = ntohl(*p++);
462 if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE) 465 if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)
463 return 0; 466 return 0;
464 args->tname = new = 467 args->tname = new = page_address(*(rqstp->rq_next_page++));
465 page_address(rqstp->rq_respages[rqstp->rq_resused++]);
466 args->tlen = len; 468 args->tlen = len;
467 /* first copy and check from the first page */ 469 /* first copy and check from the first page */
468 old = (char*)p; 470 old = (char*)p;
@@ -533,8 +535,7 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
533{ 535{
534 if (!(p = decode_fh(p, &args->fh))) 536 if (!(p = decode_fh(p, &args->fh)))
535 return 0; 537 return 0;
536 args->buffer = 538 args->buffer = page_address(*(rqstp->rq_next_page++));
537 page_address(rqstp->rq_respages[rqstp->rq_resused++]);
538 539
539 return xdr_argsize_check(rqstp, p); 540 return xdr_argsize_check(rqstp, p);
540} 541}
@@ -565,8 +566,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
565 if (args->count > PAGE_SIZE) 566 if (args->count > PAGE_SIZE)
566 args->count = PAGE_SIZE; 567 args->count = PAGE_SIZE;
567 568
568 args->buffer = 569 args->buffer = page_address(*(rqstp->rq_next_page++));
569 page_address(rqstp->rq_respages[rqstp->rq_resused++]);
570 570
571 return xdr_argsize_check(rqstp, p); 571 return xdr_argsize_check(rqstp, p);
572} 572}
@@ -575,7 +575,7 @@ int
575nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, 575nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
576 struct nfsd3_readdirargs *args) 576 struct nfsd3_readdirargs *args)
577{ 577{
578 int len, pn; 578 int len;
579 u32 max_blocksize = svc_max_payload(rqstp); 579 u32 max_blocksize = svc_max_payload(rqstp);
580 580
581 if (!(p = decode_fh(p, &args->fh))) 581 if (!(p = decode_fh(p, &args->fh)))
@@ -590,9 +590,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
590 args->count = len; 590 args->count = len;
591 591
592 while (len > 0) { 592 while (len > 0) {
593 pn = rqstp->rq_resused++; 593 struct page *p = *(rqstp->rq_next_page++);
594 if (!args->buffer) 594 if (!args->buffer)
595 args->buffer = page_address(rqstp->rq_respages[pn]); 595 args->buffer = page_address(p);
596 len -= PAGE_SIZE; 596 len -= PAGE_SIZE;
597 } 597 }
598 598
@@ -720,12 +720,14 @@ int
720nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p, 720nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p,
721 struct nfsd3_writeres *resp) 721 struct nfsd3_writeres *resp)
722{ 722{
723 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
724
723 p = encode_wcc_data(rqstp, p, &resp->fh); 725 p = encode_wcc_data(rqstp, p, &resp->fh);
724 if (resp->status == 0) { 726 if (resp->status == 0) {
725 *p++ = htonl(resp->count); 727 *p++ = htonl(resp->count);
726 *p++ = htonl(resp->committed); 728 *p++ = htonl(resp->committed);
727 *p++ = htonl(nfssvc_boot.tv_sec); 729 *p++ = htonl(nn->nfssvc_boot.tv_sec);
728 *p++ = htonl(nfssvc_boot.tv_usec); 730 *p++ = htonl(nn->nfssvc_boot.tv_usec);
729 } 731 }
730 return xdr_ressize_check(rqstp, p); 732 return xdr_ressize_check(rqstp, p);
731} 733}
@@ -876,7 +878,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
876 common); 878 common);
877 __be32 *p = cd->buffer; 879 __be32 *p = cd->buffer;
878 caddr_t curr_page_addr = NULL; 880 caddr_t curr_page_addr = NULL;
879 int pn; /* current page number */ 881 struct page ** page;
880 int slen; /* string (name) length */ 882 int slen; /* string (name) length */
881 int elen; /* estimated entry length in words */ 883 int elen; /* estimated entry length in words */
882 int num_entry_words = 0; /* actual number of words */ 884 int num_entry_words = 0; /* actual number of words */
@@ -913,8 +915,9 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
913 } 915 }
914 916
915 /* determine which page in rq_respages[] we are currently filling */ 917 /* determine which page in rq_respages[] we are currently filling */
916 for (pn=1; pn < cd->rqstp->rq_resused; pn++) { 918 for (page = cd->rqstp->rq_respages + 1;
917 curr_page_addr = page_address(cd->rqstp->rq_respages[pn]); 919 page < cd->rqstp->rq_next_page; page++) {
920 curr_page_addr = page_address(*page);
918 921
919 if (((caddr_t)cd->buffer >= curr_page_addr) && 922 if (((caddr_t)cd->buffer >= curr_page_addr) &&
920 ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE)) 923 ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE))
@@ -929,14 +932,14 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
929 if (plus) 932 if (plus)
930 p = encode_entryplus_baggage(cd, p, name, namlen); 933 p = encode_entryplus_baggage(cd, p, name, namlen);
931 num_entry_words = p - cd->buffer; 934 num_entry_words = p - cd->buffer;
932 } else if (cd->rqstp->rq_respages[pn+1] != NULL) { 935 } else if (*(page+1) != NULL) {
933 /* temporarily encode entry into next page, then move back to 936 /* temporarily encode entry into next page, then move back to
934 * current and next page in rq_respages[] */ 937 * current and next page in rq_respages[] */
935 __be32 *p1, *tmp; 938 __be32 *p1, *tmp;
936 int len1, len2; 939 int len1, len2;
937 940
938 /* grab next page for temporary storage of entry */ 941 /* grab next page for temporary storage of entry */
939 p1 = tmp = page_address(cd->rqstp->rq_respages[pn+1]); 942 p1 = tmp = page_address(*(page+1));
940 943
941 p1 = encode_entry_baggage(cd, p1, name, namlen, ino); 944 p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
942 945
@@ -1082,11 +1085,13 @@ int
1082nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p, 1085nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p,
1083 struct nfsd3_commitres *resp) 1086 struct nfsd3_commitres *resp)
1084{ 1087{
1088 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
1089
1085 p = encode_wcc_data(rqstp, p, &resp->fh); 1090 p = encode_wcc_data(rqstp, p, &resp->fh);
1086 /* Write verifier */ 1091 /* Write verifier */
1087 if (resp->status == 0) { 1092 if (resp->status == 0) {
1088 *p++ = htonl(nfssvc_boot.tv_sec); 1093 *p++ = htonl(nn->nfssvc_boot.tv_sec);
1089 *p++ = htonl(nfssvc_boot.tv_usec); 1094 *p++ = htonl(nn->nfssvc_boot.tv_usec);
1090 } 1095 }
1091 return xdr_ressize_check(rqstp, p); 1096 return xdr_ressize_check(rqstp, p);
1092} 1097}
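The writeres/commitres hunks above replace the global nfssvc_boot with the per-net copy, so each namespace's write verifier changes independently across restarts. An NFSv3/v4 write verifier is just an opaque 8-byte value that must differ after a server reboot; boot time is a convenient source. A sketch (cpu_to_be32() is equivalent to the htonl() used above):

#include <linux/time.h>
#include <linux/types.h>
#include <asm/byteorder.h>

static void demo_boot_verifier(__be32 verf[2], const struct timeval *boot)
{
	verf[0] = cpu_to_be32((u32)boot->tv_sec);
	verf[1] = cpu_to_be32((u32)boot->tv_usec);
}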
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index bdf29c96e4cd..99bc85ff0217 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -36,6 +36,7 @@
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include "nfsd.h" 37#include "nfsd.h"
38#include "state.h" 38#include "state.h"
39#include "netns.h"
39 40
40#define NFSDDBG_FACILITY NFSDDBG_PROC 41#define NFSDDBG_FACILITY NFSDDBG_PROC
41 42
@@ -625,20 +626,46 @@ static const struct rpc_program cb_program = {
625 .pipe_dir_name = "nfsd4_cb", 626 .pipe_dir_name = "nfsd4_cb",
626}; 627};
627 628
628static int max_cb_time(void) 629static int max_cb_time(struct net *net)
629{ 630{
630 return max(nfsd4_lease/10, (time_t)1) * HZ; 631 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
632 return max(nn->nfsd4_lease/10, (time_t)1) * HZ;
631} 633}
632 634
635static struct rpc_cred *callback_cred;
636
637int set_callback_cred(void)
638{
639 if (callback_cred)
640 return 0;
641 callback_cred = rpc_lookup_machine_cred("nfs");
642 if (!callback_cred)
643 return -ENOMEM;
644 return 0;
645}
646
647static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
648{
649 if (clp->cl_minorversion == 0) {
650 return get_rpccred(callback_cred);
651 } else {
652 struct rpc_auth *auth = client->cl_auth;
653 struct auth_cred acred = {};
654
655 acred.uid = ses->se_cb_sec.uid;
656 acred.gid = ses->se_cb_sec.gid;
657 return auth->au_ops->lookup_cred(client->cl_auth, &acred, 0);
658 }
659}
633 660
634static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) 661static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
635{ 662{
636 struct rpc_timeout timeparms = { 663 struct rpc_timeout timeparms = {
637 .to_initval = max_cb_time(), 664 .to_initval = max_cb_time(clp->net),
638 .to_retries = 0, 665 .to_retries = 0,
639 }; 666 };
640 struct rpc_create_args args = { 667 struct rpc_create_args args = {
641 .net = &init_net, 668 .net = clp->net,
642 .address = (struct sockaddr *) &conn->cb_addr, 669 .address = (struct sockaddr *) &conn->cb_addr,
643 .addrsize = conn->cb_addrlen, 670 .addrsize = conn->cb_addrlen,
644 .saddress = (struct sockaddr *) &conn->cb_saddr, 671 .saddress = (struct sockaddr *) &conn->cb_saddr,
@@ -648,6 +675,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
648 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 675 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
649 }; 676 };
650 struct rpc_clnt *client; 677 struct rpc_clnt *client;
678 struct rpc_cred *cred;
651 679
652 if (clp->cl_minorversion == 0) { 680 if (clp->cl_minorversion == 0) {
653 if (!clp->cl_cred.cr_principal && 681 if (!clp->cl_cred.cr_principal &&
@@ -666,7 +694,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
666 args.bc_xprt = conn->cb_xprt; 694 args.bc_xprt = conn->cb_xprt;
667 args.prognumber = clp->cl_cb_session->se_cb_prog; 695 args.prognumber = clp->cl_cb_session->se_cb_prog;
668 args.protocol = XPRT_TRANSPORT_BC_TCP; 696 args.protocol = XPRT_TRANSPORT_BC_TCP;
669 args.authflavor = RPC_AUTH_UNIX; 697 args.authflavor = ses->se_cb_sec.flavor;
670 } 698 }
671 /* Create RPC client */ 699 /* Create RPC client */
672 client = rpc_create(&args); 700 client = rpc_create(&args);
@@ -675,9 +703,14 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
675 PTR_ERR(client)); 703 PTR_ERR(client));
676 return PTR_ERR(client); 704 return PTR_ERR(client);
677 } 705 }
706 cred = get_backchannel_cred(clp, client, ses);
707 if (IS_ERR(cred)) {
708 rpc_shutdown_client(client);
709 return PTR_ERR(cred);
710 }
678 clp->cl_cb_client = client; 711 clp->cl_cb_client = client;
712 clp->cl_cb_cred = cred;
679 return 0; 713 return 0;
680
681} 714}
682 715
683static void warn_no_callback_path(struct nfs4_client *clp, int reason) 716static void warn_no_callback_path(struct nfs4_client *clp, int reason)
@@ -714,18 +747,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
714 .rpc_call_done = nfsd4_cb_probe_done, 747 .rpc_call_done = nfsd4_cb_probe_done,
715}; 748};
716 749
717static struct rpc_cred *callback_cred;
718
719int set_callback_cred(void)
720{
721 if (callback_cred)
722 return 0;
723 callback_cred = rpc_lookup_machine_cred("nfs");
724 if (!callback_cred)
725 return -ENOMEM;
726 return 0;
727}
728
729static struct workqueue_struct *callback_wq; 750static struct workqueue_struct *callback_wq;
730 751
731static void run_nfsd4_cb(struct nfsd4_callback *cb) 752static void run_nfsd4_cb(struct nfsd4_callback *cb)
@@ -743,7 +764,6 @@ static void do_probe_callback(struct nfs4_client *clp)
743 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL]; 764 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
744 cb->cb_msg.rpc_argp = NULL; 765 cb->cb_msg.rpc_argp = NULL;
745 cb->cb_msg.rpc_resp = NULL; 766 cb->cb_msg.rpc_resp = NULL;
746 cb->cb_msg.rpc_cred = callback_cred;
747 767
748 cb->cb_ops = &nfsd4_cb_probe_ops; 768 cb->cb_ops = &nfsd4_cb_probe_ops;
749 769
@@ -962,6 +982,8 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
962 if (clp->cl_cb_client) { 982 if (clp->cl_cb_client) {
963 rpc_shutdown_client(clp->cl_cb_client); 983 rpc_shutdown_client(clp->cl_cb_client);
964 clp->cl_cb_client = NULL; 984 clp->cl_cb_client = NULL;
985 put_rpccred(clp->cl_cb_cred);
986 clp->cl_cb_cred = NULL;
965 } 987 }
966 if (clp->cl_cb_conn.cb_xprt) { 988 if (clp->cl_cb_conn.cb_xprt) {
967 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 989 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -995,7 +1017,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
995 run_nfsd4_cb(cb); 1017 run_nfsd4_cb(cb);
996} 1018}
997 1019
998void nfsd4_do_callback_rpc(struct work_struct *w) 1020static void nfsd4_do_callback_rpc(struct work_struct *w)
999{ 1021{
1000 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work); 1022 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
1001 struct nfs4_client *clp = cb->cb_clp; 1023 struct nfs4_client *clp = cb->cb_clp;
@@ -1010,10 +1032,16 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
1010 nfsd4_release_cb(cb); 1032 nfsd4_release_cb(cb);
1011 return; 1033 return;
1012 } 1034 }
1035 cb->cb_msg.rpc_cred = clp->cl_cb_cred;
1013 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 1036 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
1014 cb->cb_ops, cb); 1037 cb->cb_ops, cb);
1015} 1038}
1016 1039
1040void nfsd4_init_callback(struct nfsd4_callback *cb)
1041{
1042 INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc);
1043}
1044
1017void nfsd4_cb_recall(struct nfs4_delegation *dp) 1045void nfsd4_cb_recall(struct nfs4_delegation *dp)
1018{ 1046{
1019 struct nfsd4_callback *cb = &dp->dl_recall; 1047 struct nfsd4_callback *cb = &dp->dl_recall;
@@ -1025,7 +1053,6 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
1025 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; 1053 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
1026 cb->cb_msg.rpc_argp = cb; 1054 cb->cb_msg.rpc_argp = cb;
1027 cb->cb_msg.rpc_resp = cb; 1055 cb->cb_msg.rpc_resp = cb;
1028 cb->cb_msg.rpc_cred = callback_cred;
1029 1056
1030 cb->cb_ops = &nfsd4_cb_recall_ops; 1057 cb->cb_ops = &nfsd4_cb_recall_ops;
1031 1058
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 6c9a4b291dba..9d1c5dba2bbb 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -40,6 +40,7 @@
40#include "xdr4.h" 40#include "xdr4.h"
41#include "vfs.h" 41#include "vfs.h"
42#include "current_stateid.h" 42#include "current_stateid.h"
43#include "netns.h"
43 44
44#define NFSDDBG_FACILITY NFSDDBG_PROC 45#define NFSDDBG_FACILITY NFSDDBG_PROC
45 46
@@ -194,6 +195,7 @@ static __be32
194do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 195do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
195{ 196{
196 struct svc_fh *resfh; 197 struct svc_fh *resfh;
198 int accmode;
197 __be32 status; 199 __be32 status;
198 200
199 resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); 201 resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
@@ -253,9 +255,10 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
253 /* set reply cache */ 255 /* set reply cache */
254 fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, 256 fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
255 &resfh->fh_handle); 257 &resfh->fh_handle);
256 if (!open->op_created) 258 accmode = NFSD_MAY_NOP;
257 status = do_open_permission(rqstp, resfh, open, 259 if (open->op_created)
258 NFSD_MAY_NOP); 260 accmode |= NFSD_MAY_OWNER_OVERRIDE;
261 status = do_open_permission(rqstp, resfh, open, accmode);
259 set_change_info(&open->op_cinfo, current_fh); 262 set_change_info(&open->op_cinfo, current_fh);
260 fh_dup2(current_fh, resfh); 263 fh_dup2(current_fh, resfh);
261out: 264out:
@@ -304,6 +307,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
304{ 307{
305 __be32 status; 308 __be32 status;
306 struct nfsd4_compoundres *resp; 309 struct nfsd4_compoundres *resp;
310 struct net *net = SVC_NET(rqstp);
311 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
307 312
308 dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", 313 dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
309 (int)open->op_fname.len, open->op_fname.data, 314 (int)open->op_fname.len, open->op_fname.data,
@@ -331,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
331 336
332 /* check seqid for replay. set nfs4_owner */ 337 /* check seqid for replay. set nfs4_owner */
333 resp = rqstp->rq_resp; 338 resp = rqstp->rq_resp;
334 status = nfsd4_process_open1(&resp->cstate, open); 339 status = nfsd4_process_open1(&resp->cstate, open, nn);
335 if (status == nfserr_replay_me) { 340 if (status == nfserr_replay_me) {
336 struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; 341 struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
337 fh_put(&cstate->current_fh); 342 fh_put(&cstate->current_fh);
@@ -354,10 +359,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
354 /* Openowner is now set, so sequence id will get bumped. Now we need 359 /* Openowner is now set, so sequence id will get bumped. Now we need
355 * these checks before we do any creates: */ 360 * these checks before we do any creates: */
356 status = nfserr_grace; 361 status = nfserr_grace;
357 if (locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) 362 if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
358 goto out; 363 goto out;
359 status = nfserr_no_grace; 364 status = nfserr_no_grace;
360 if (!locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) 365 if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
361 goto out; 366 goto out;
362 367
363 switch (open->op_claim_type) { 368 switch (open->op_claim_type) {
@@ -370,7 +375,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
370 break; 375 break;
371 case NFS4_OPEN_CLAIM_PREVIOUS: 376 case NFS4_OPEN_CLAIM_PREVIOUS:
372 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; 377 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
373 status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion); 378 status = nfs4_check_open_reclaim(&open->op_clientid,
379 cstate->minorversion,
380 nn);
374 if (status) 381 if (status)
375 goto out; 382 goto out;
376 case NFS4_OPEN_CLAIM_FH: 383 case NFS4_OPEN_CLAIM_FH:
@@ -490,12 +497,13 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
490 &access->ac_supported); 497 &access->ac_supported);
491} 498}
492 499
493static void gen_boot_verifier(nfs4_verifier *verifier) 500static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
494{ 501{
495 __be32 verf[2]; 502 __be32 verf[2];
503 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
496 504
497 verf[0] = (__be32)nfssvc_boot.tv_sec; 505 verf[0] = (__be32)nn->nfssvc_boot.tv_sec;
498 verf[1] = (__be32)nfssvc_boot.tv_usec; 506 verf[1] = (__be32)nn->nfssvc_boot.tv_usec;
499 memcpy(verifier->data, verf, sizeof(verifier->data)); 507 memcpy(verifier->data, verf, sizeof(verifier->data));
500} 508}
501 509
@@ -503,7 +511,7 @@ static __be32
503nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 511nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
504 struct nfsd4_commit *commit) 512 struct nfsd4_commit *commit)
505{ 513{
506 gen_boot_verifier(&commit->co_verf); 514 gen_boot_verifier(&commit->co_verf, SVC_NET(rqstp));
507 return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, 515 return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
508 commit->co_count); 516 commit->co_count);
509} 517}
@@ -684,6 +692,17 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
684 if (read->rd_offset >= OFFSET_MAX) 692 if (read->rd_offset >= OFFSET_MAX)
685 return nfserr_inval; 693 return nfserr_inval;
686 694
695 /*
696 * If we do a zero copy read, then a client will see read data
697 * that reflects the state of the file *after* performing the
698 * following compound.
699 *
700 * To ensure proper ordering, we therefore turn off zero copy if
701 * the client wants us to do more in this compound:
702 */
703 if (!nfsd4_last_compound_op(rqstp))
704 rqstp->rq_splice_ok = false;
705
687 nfs4_lock_state(); 706 nfs4_lock_state();
688 /* check stateid */ 707 /* check stateid */
689 if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), 708 if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
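The comment added above captures the ordering argument: a zero-copy (spliced) read hands page references to the transport, so the data is snapshotted only when the reply is sent, after any later ops in the compound have run. Clearing rq_splice_ok forces a copy at read time instead. The nfsd4_last_compound_op() helper amounts to comparing how many ops were decoded with how many have been processed; roughly:

static bool last_compound_op_sketch(struct svc_rqst *rqstp)
{
	struct nfsd4_compoundres *resp = rqstp->rq_resp;
	struct nfsd4_compoundargs *argp = rqstp->rq_argp;

	/* the current op is the last one iff no decoded ops remain */
	return argp->opcnt == resp->opcnt;
}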
@@ -876,6 +895,24 @@ out:
876 return status; 895 return status;
877} 896}
878 897
898static int fill_in_write_vector(struct kvec *vec, struct nfsd4_write *write)
899{
900 int i = 1;
901 int buflen = write->wr_buflen;
902
903 vec[0].iov_base = write->wr_head.iov_base;
904 vec[0].iov_len = min_t(int, buflen, write->wr_head.iov_len);
905 buflen -= vec[0].iov_len;
906
907 while (buflen) {
908 vec[i].iov_base = page_address(write->wr_pagelist[i - 1]);
909 vec[i].iov_len = min_t(int, PAGE_SIZE, buflen);
910 buflen -= vec[i].iov_len;
911 i++;
912 }
913 return i;
914}
915
879static __be32 916static __be32
880nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 917nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
881 struct nfsd4_write *write) 918 struct nfsd4_write *write)
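fill_in_write_vector() above rebuilds the write iovec from the decoded arguments: the first kvec entry covers whatever landed in the RPC head buffer, each subsequent entry covers at most one page, and the running byte count terminates the loop. A self-contained sketch of the same arithmetic, with the arguments flattened into parameters:

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/uio.h>

static int demo_fill_vec(struct kvec *vec, struct kvec head,
			 struct page **pages, int buflen)
{
	int i = 0;

	/* first fragment: the RPC head buffer */
	vec[i].iov_base = head.iov_base;
	vec[i].iov_len = min_t(int, buflen, head.iov_len);
	buflen -= vec[i].iov_len;
	i++;

	/* remainder: one entry per data page, at most PAGE_SIZE each */
	while (buflen > 0) {
		vec[i].iov_base = page_address(*pages++);
		vec[i].iov_len = min_t(int, PAGE_SIZE, buflen);
		buflen -= vec[i].iov_len;
		i++;
	}
	return i;	/* entries used; the caller bounds-checks vs rq_vec */
}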
@@ -884,6 +921,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
884 struct file *filp = NULL; 921 struct file *filp = NULL;
885 __be32 status = nfs_ok; 922 __be32 status = nfs_ok;
886 unsigned long cnt; 923 unsigned long cnt;
924 int nvecs;
887 925
888 /* no need to check permission - this will be done in nfsd_write() */ 926 /* no need to check permission - this will be done in nfsd_write() */
889 927
@@ -904,10 +942,13 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
904 942
905 cnt = write->wr_buflen; 943 cnt = write->wr_buflen;
906 write->wr_how_written = write->wr_stable_how; 944 write->wr_how_written = write->wr_stable_how;
907 gen_boot_verifier(&write->wr_verifier); 945 gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp));
946
947 nvecs = fill_in_write_vector(rqstp->rq_vec, write);
948 WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
908 949
909 status = nfsd_write(rqstp, &cstate->current_fh, filp, 950 status = nfsd_write(rqstp, &cstate->current_fh, filp,
910 write->wr_offset, rqstp->rq_vec, write->wr_vlen, 951 write->wr_offset, rqstp->rq_vec, nvecs,
911 &cnt, &write->wr_how_written); 952 &cnt, &write->wr_how_written);
912 if (filp) 953 if (filp)
913 fput(filp); 954 fput(filp);
@@ -1666,6 +1707,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
1666 .op_name = "OP_EXCHANGE_ID", 1707 .op_name = "OP_EXCHANGE_ID",
1667 .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize, 1708 .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,
1668 }, 1709 },
1710 [OP_BACKCHANNEL_CTL] = {
1711 .op_func = (nfsd4op_func)nfsd4_backchannel_ctl,
1712 .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
1713 .op_name = "OP_BACKCHANNEL_CTL",
1714 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1715 },
1669 [OP_BIND_CONN_TO_SESSION] = { 1716 [OP_BIND_CONN_TO_SESSION] = {
1670 .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, 1717 .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
1671 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP 1718 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
@@ -1719,6 +1766,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
1719 .op_func = (nfsd4op_func)nfsd4_free_stateid, 1766 .op_func = (nfsd4op_func)nfsd4_free_stateid,
1720 .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, 1767 .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
1721 .op_name = "OP_FREE_STATEID", 1768 .op_name = "OP_FREE_STATEID",
1769 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
1722 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, 1770 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1723 }, 1771 },
1724}; 1772};
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 43295d45cc2b..ba6fdd4a0455 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,13 +58,11 @@ struct nfsd4_client_tracking_ops {
58 void (*create)(struct nfs4_client *); 58 void (*create)(struct nfs4_client *);
59 void (*remove)(struct nfs4_client *); 59 void (*remove)(struct nfs4_client *);
60 int (*check)(struct nfs4_client *); 60 int (*check)(struct nfs4_client *);
61 void (*grace_done)(struct net *, time_t); 61 void (*grace_done)(struct nfsd_net *, time_t);
62}; 62};
63 63
64/* Globals */ 64/* Globals */
65static struct file *rec_file;
66static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; 65static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
67static struct nfsd4_client_tracking_ops *client_tracking_ops;
68 66
69static int 67static int
70nfs4_save_creds(const struct cred **original_creds) 68nfs4_save_creds(const struct cred **original_creds)
@@ -102,33 +100,39 @@ md5_to_hex(char *out, char *md5)
102 *out = '\0'; 100 *out = '\0';
103} 101}
104 102
105__be32 103static int
106nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) 104nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
107{ 105{
108 struct xdr_netobj cksum; 106 struct xdr_netobj cksum;
109 struct hash_desc desc; 107 struct hash_desc desc;
110 struct scatterlist sg; 108 struct scatterlist sg;
111 __be32 status = nfserr_jukebox; 109 int status;
112 110
113 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", 111 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
114 clname->len, clname->data); 112 clname->len, clname->data);
115 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 113 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
116 desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); 114 desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
117 if (IS_ERR(desc.tfm)) 115 if (IS_ERR(desc.tfm)) {
116 status = PTR_ERR(desc.tfm);
118 goto out_no_tfm; 117 goto out_no_tfm;
118 }
119
119 cksum.len = crypto_hash_digestsize(desc.tfm); 120 cksum.len = crypto_hash_digestsize(desc.tfm);
120 cksum.data = kmalloc(cksum.len, GFP_KERNEL); 121 cksum.data = kmalloc(cksum.len, GFP_KERNEL);
121 if (cksum.data == NULL) 122 if (cksum.data == NULL) {
123 status = -ENOMEM;
122 goto out; 124 goto out;
125 }
123 126
124 sg_init_one(&sg, clname->data, clname->len); 127 sg_init_one(&sg, clname->data, clname->len);
125 128
126 if (crypto_hash_digest(&desc, &sg, sg.length, cksum.data)) 129 status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data);
130 if (status)
127 goto out; 131 goto out;
128 132
129 md5_to_hex(dname, cksum.data); 133 md5_to_hex(dname, cksum.data);
130 134
131 status = nfs_ok; 135 status = 0;
132out: 136out:
133 kfree(cksum.data); 137 kfree(cksum.data);
134 crypto_free_hash(desc.tfm); 138 crypto_free_hash(desc.tfm);
@@ -136,29 +140,61 @@ out_no_tfm:
136 return status; 140 return status;
137} 141}
138 142
143/*
144 * If we had an error generating the recdir name for the legacy tracker
145 * then warn the admin. If the error doesn't appear to be transient,
146 * then disable recovery tracking.
147 */
148static void
149legacy_recdir_name_error(int error)
150{
151 printk(KERN_ERR "NFSD: unable to generate recoverydir "
152 "name (%d).\n", error);
153
154 /*
155 * if the algorithm just doesn't exist, then disable the recovery
156 * tracker altogether. The crypto libs will generally return this if
157 * FIPS is enabled as well.
158 */
159 if (error == -ENOENT) {
160 printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
161 "Reboot recovery will not function correctly!\n");
162
163 /* the argument is ignored by the legacy exit function */
164 nfsd4_client_tracking_exit(NULL);
165 }
166}
167
139static void 168static void
140nfsd4_create_clid_dir(struct nfs4_client *clp) 169nfsd4_create_clid_dir(struct nfs4_client *clp)
141{ 170{
142 const struct cred *original_cred; 171 const struct cred *original_cred;
143 char *dname = clp->cl_recdir; 172 char dname[HEXDIR_LEN];
144 struct dentry *dir, *dentry; 173 struct dentry *dir, *dentry;
174 struct nfs4_client_reclaim *crp;
145 int status; 175 int status;
176 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
146 177
147 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); 178 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
148 179
149 if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) 180 if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
150 return; 181 return;
151 if (!rec_file) 182 if (!nn->rec_file)
152 return; 183 return;
184
185 status = nfs4_make_rec_clidname(dname, &clp->cl_name);
186 if (status)
187 return legacy_recdir_name_error(status);
188
153 status = nfs4_save_creds(&original_cred); 189 status = nfs4_save_creds(&original_cred);
154 if (status < 0) 190 if (status < 0)
155 return; 191 return;
156 192
157 status = mnt_want_write_file(rec_file); 193 status = mnt_want_write_file(nn->rec_file);
158 if (status) 194 if (status)
159 return; 195 return;
160 196
161 dir = rec_file->f_path.dentry; 197 dir = nn->rec_file->f_path.dentry;
162 /* lock the parent */ 198 /* lock the parent */
163 mutex_lock(&dir->d_inode->i_mutex); 199 mutex_lock(&dir->d_inode->i_mutex);
164 200
@@ -182,18 +218,24 @@ out_put:
182 dput(dentry); 218 dput(dentry);
183out_unlock: 219out_unlock:
184 mutex_unlock(&dir->d_inode->i_mutex); 220 mutex_unlock(&dir->d_inode->i_mutex);
185 if (status == 0) 221 if (status == 0) {
186 vfs_fsync(rec_file, 0); 222 if (nn->in_grace) {
187 else 223 crp = nfs4_client_to_reclaim(dname, nn);
224 if (crp)
225 crp->cr_clp = clp;
226 }
227 vfs_fsync(nn->rec_file, 0);
228 } else {
188 printk(KERN_ERR "NFSD: failed to write recovery record" 229 printk(KERN_ERR "NFSD: failed to write recovery record"
189 " (err %d); please check that %s exists" 230 " (err %d); please check that %s exists"
190 " and is writeable", status, 231 " and is writeable", status,
191 user_recovery_dirname); 232 user_recovery_dirname);
192 mnt_drop_write_file(rec_file); 233 }
234 mnt_drop_write_file(nn->rec_file);
193 nfs4_reset_creds(original_cred); 235 nfs4_reset_creds(original_cred);
194} 236}
195 237
196typedef int (recdir_func)(struct dentry *, struct dentry *); 238typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *);
197 239
198struct name_list { 240struct name_list {
199 char name[HEXDIR_LEN]; 241 char name[HEXDIR_LEN];
@@ -219,10 +261,10 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,
219} 261}
220 262
221static int 263static int
222nfsd4_list_rec_dir(recdir_func *f) 264nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
223{ 265{
224 const struct cred *original_cred; 266 const struct cred *original_cred;
225 struct dentry *dir = rec_file->f_path.dentry; 267 struct dentry *dir = nn->rec_file->f_path.dentry;
226 LIST_HEAD(names); 268 LIST_HEAD(names);
227 int status; 269 int status;
228 270
@@ -230,13 +272,13 @@ nfsd4_list_rec_dir(recdir_func *f)
230 if (status < 0) 272 if (status < 0)
231 return status; 273 return status;
232 274
233 status = vfs_llseek(rec_file, 0, SEEK_SET); 275 status = vfs_llseek(nn->rec_file, 0, SEEK_SET);
234 if (status < 0) { 276 if (status < 0) {
235 nfs4_reset_creds(original_cred); 277 nfs4_reset_creds(original_cred);
236 return status; 278 return status;
237 } 279 }
238 280
239 status = vfs_readdir(rec_file, nfsd4_build_namelist, &names); 281 status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names);
240 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 282 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
241 while (!list_empty(&names)) { 283 while (!list_empty(&names)) {
242 struct name_list *entry; 284 struct name_list *entry;
@@ -248,7 +290,7 @@ nfsd4_list_rec_dir(recdir_func *f)
248 status = PTR_ERR(dentry); 290 status = PTR_ERR(dentry);
249 break; 291 break;
250 } 292 }
251 status = f(dir, dentry); 293 status = f(dir, dentry, nn);
252 dput(dentry); 294 dput(dentry);
253 } 295 }
254 list_del(&entry->list); 296 list_del(&entry->list);
@@ -260,14 +302,14 @@ nfsd4_list_rec_dir(recdir_func *f)
260} 302}
261 303
262static int 304static int
263nfsd4_unlink_clid_dir(char *name, int namlen) 305nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
264{ 306{
265 struct dentry *dir, *dentry; 307 struct dentry *dir, *dentry;
266 int status; 308 int status;
267 309
268 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); 310 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
269 311
270 dir = rec_file->f_path.dentry; 312 dir = nn->rec_file->f_path.dentry;
271 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 313 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
272 dentry = lookup_one_len(name, dir, namlen); 314 dentry = lookup_one_len(name, dir, namlen);
273 if (IS_ERR(dentry)) { 315 if (IS_ERR(dentry)) {
@@ -289,37 +331,52 @@ static void
289nfsd4_remove_clid_dir(struct nfs4_client *clp) 331nfsd4_remove_clid_dir(struct nfs4_client *clp)
290{ 332{
291 const struct cred *original_cred; 333 const struct cred *original_cred;
334 struct nfs4_client_reclaim *crp;
335 char dname[HEXDIR_LEN];
292 int status; 336 int status;
337 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
293 338
294 if (!rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) 339 if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
295 return; 340 return;
296 341
297 status = mnt_want_write_file(rec_file); 342 status = nfs4_make_rec_clidname(dname, &clp->cl_name);
343 if (status)
344 return legacy_recdir_name_error(status);
345
346 status = mnt_want_write_file(nn->rec_file);
298 if (status) 347 if (status)
299 goto out; 348 goto out;
300 clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); 349 clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
301 350
302 status = nfs4_save_creds(&original_cred); 351 status = nfs4_save_creds(&original_cred);
303 if (status < 0) 352 if (status < 0)
304 goto out; 353 goto out_drop_write;
305 354
306 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); 355 status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn);
307 nfs4_reset_creds(original_cred); 356 nfs4_reset_creds(original_cred);
308 if (status == 0) 357 if (status == 0) {
309 vfs_fsync(rec_file, 0); 358 vfs_fsync(nn->rec_file, 0);
310 mnt_drop_write_file(rec_file); 359 if (nn->in_grace) {
360 /* remove reclaim record */
361 crp = nfsd4_find_reclaim_client(dname, nn);
362 if (crp)
363 nfs4_remove_reclaim_record(crp, nn);
364 }
365 }
366out_drop_write:
367 mnt_drop_write_file(nn->rec_file);
311out: 368out:
312 if (status) 369 if (status)
313 printk("NFSD: Failed to remove expired client state directory" 370 printk("NFSD: Failed to remove expired client state directory"
314 " %.*s\n", HEXDIR_LEN, clp->cl_recdir); 371 " %.*s\n", HEXDIR_LEN, dname);
315} 372}
316 373
317static int 374static int
318purge_old(struct dentry *parent, struct dentry *child) 375purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
319{ 376{
320 int status; 377 int status;
321 378
322 if (nfs4_has_reclaimed_state(child->d_name.name, false)) 379 if (nfs4_has_reclaimed_state(child->d_name.name, nn))
323 return 0; 380 return 0;
324 381
325 status = vfs_rmdir(parent->d_inode, child); 382 status = vfs_rmdir(parent->d_inode, child);
@@ -331,27 +388,29 @@ purge_old(struct dentry *parent, struct dentry *child)
331} 388}
332 389
333static void 390static void
334nfsd4_recdir_purge_old(struct net *net, time_t boot_time) 391nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time)
335{ 392{
336 int status; 393 int status;
337 394
338 if (!rec_file) 395 nn->in_grace = false;
396 if (!nn->rec_file)
339 return; 397 return;
340 status = mnt_want_write_file(rec_file); 398 status = mnt_want_write_file(nn->rec_file);
341 if (status) 399 if (status)
342 goto out; 400 goto out;
343 status = nfsd4_list_rec_dir(purge_old); 401 status = nfsd4_list_rec_dir(purge_old, nn);
344 if (status == 0) 402 if (status == 0)
345 vfs_fsync(rec_file, 0); 403 vfs_fsync(nn->rec_file, 0);
346 mnt_drop_write_file(rec_file); 404 mnt_drop_write_file(nn->rec_file);
347out: 405out:
406 nfs4_release_reclaim(nn);
348 if (status) 407 if (status)
349 printk("nfsd4: failed to purge old clients from recovery" 408 printk("nfsd4: failed to purge old clients from recovery"
350 " directory %s\n", rec_file->f_path.dentry->d_name.name); 409 " directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
351} 410}
352 411
 static int
-load_recdir(struct dentry *parent, struct dentry *child)
+load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 {
 	if (child->d_name.len != HEXDIR_LEN - 1) {
 		printk("nfsd4: illegal name %s in recovery directory\n",
@@ -359,21 +418,22 @@ load_recdir(struct dentry *parent, struct dentry *child)
 		/* Keep trying; maybe the others are OK: */
 		return 0;
 	}
-	nfs4_client_to_reclaim(child->d_name.name);
+	nfs4_client_to_reclaim(child->d_name.name, nn);
 	return 0;
 }
 
 static int
-nfsd4_recdir_load(void) {
+nfsd4_recdir_load(struct net *net) {
 	int status;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
-	if (!rec_file)
+	if (!nn->rec_file)
 		return 0;
 
-	status = nfsd4_list_rec_dir(load_recdir);
+	status = nfsd4_list_rec_dir(load_recdir, nn);
 	if (status)
 		printk("nfsd4: failed loading clients from recovery"
-			" directory %s\n", rec_file->f_path.dentry->d_name.name);
+			" directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
 	return status;
 }
 
@@ -382,15 +442,16 @@ nfsd4_recdir_load(void) {
  */
 
 static int
-nfsd4_init_recdir(void)
+nfsd4_init_recdir(struct net *net)
 {
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	const struct cred *original_cred;
 	int status;
 
 	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
 			user_recovery_dirname);
 
-	BUG_ON(rec_file);
+	BUG_ON(nn->rec_file);
 
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0) {
@@ -400,23 +461,65 @@ nfsd4_init_recdir(void)
 		return status;
 	}
 
-	rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
-	if (IS_ERR(rec_file)) {
+	nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
+	if (IS_ERR(nn->rec_file)) {
 		printk("NFSD: unable to find recovery directory %s\n",
 				user_recovery_dirname);
-		status = PTR_ERR(rec_file);
-		rec_file = NULL;
+		status = PTR_ERR(nn->rec_file);
+		nn->rec_file = NULL;
 	}
 
 	nfs4_reset_creds(original_cred);
+	if (!status)
+		nn->in_grace = true;
 	return status;
 }
 
+
+static int
+nfs4_legacy_state_init(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	int i;
+
+	nn->reclaim_str_hashtbl = kmalloc(sizeof(struct list_head) *
+				CLIENT_HASH_SIZE, GFP_KERNEL);
+	if (!nn->reclaim_str_hashtbl)
+		return -ENOMEM;
+
+	for (i = 0; i < CLIENT_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
+	nn->reclaim_str_hashtbl_size = 0;
+
+	return 0;
+}
+
+static void
+nfs4_legacy_state_shutdown(struct net *net)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	kfree(nn->reclaim_str_hashtbl);
+}
+
 static int
 nfsd4_load_reboot_recovery_data(struct net *net)
 {
 	int status;
 
+	status = nfsd4_init_recdir(net);
+	if (!status)
+		status = nfsd4_recdir_load(net);
+	if (status)
+		printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
+	return status;
+}
+
+static int
+nfsd4_legacy_tracking_init(struct net *net)
+{
+	int status;
+
 	/* XXX: The legacy code won't work in a container */
 	if (net != &init_net) {
 		WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client "
@@ -424,30 +527,37 @@ nfsd4_load_reboot_recovery_data(struct net *net)
 		return -EINVAL;
 	}
 
-	nfs4_lock_state();
-	status = nfsd4_init_recdir();
-	if (!status)
-		status = nfsd4_recdir_load();
-	nfs4_unlock_state();
+	status = nfs4_legacy_state_init(net);
 	if (status)
-		printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
+		return status;
+
+	status = nfsd4_load_reboot_recovery_data(net);
+	if (status)
+		goto err;
+	return 0;
+
+err:
+	nfs4_legacy_state_shutdown(net);
 	return status;
 }
 
 static void
-nfsd4_shutdown_recdir(void)
+nfsd4_shutdown_recdir(struct nfsd_net *nn)
 {
-	if (!rec_file)
+	if (!nn->rec_file)
 		return;
-	fput(rec_file);
-	rec_file = NULL;
+	fput(nn->rec_file);
+	nn->rec_file = NULL;
 }
 
 static void
nfsd4_legacy_tracking_exit(struct net *net)
 {
-	nfs4_release_reclaim();
-	nfsd4_shutdown_recdir();
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	nfs4_release_reclaim(nn);
+	nfsd4_shutdown_recdir(nn);
+	nfs4_legacy_state_shutdown(net);
 }
 
 /*
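
A note on the pattern used throughout this series: every converted function fetches its state from the per-net area with net_generic(net, nfsd_net_id). As a minimal sketch of the pernet machinery this relies on (standard kernel API; the demo_* names below are invented for illustration and are not part of the patch):

    #include <net/net_namespace.h>
    #include <net/netns/generic.h>

    static int demo_net_id;

    struct demo_net {			/* one instance per struct net */
    	struct file *rec_file;
    	bool in_grace;
    };

    static __net_init int demo_net_init(struct net *net)
    {
    	/* the core allocated and zeroed .size bytes for this net already */
    	struct demo_net *dn = net_generic(net, demo_net_id);

    	dn->rec_file = NULL;
    	return 0;
    }

    static __net_exit void demo_net_exit(struct net *net)
    {
    	/* per-net teardown would go here */
    }

    static struct pernet_operations demo_net_ops = {
    	.init = demo_net_init,
    	.exit = demo_net_exit,
    	.id   = &demo_net_id,
    	.size = sizeof(struct demo_net),
    };

    /* module init would call register_pernet_subsys(&demo_net_ops),
     * module exit unregister_pernet_subsys(&demo_net_ops) */

With .id and .size set, the core allocates and zeroes the per-net area on namespace creation, and net_generic() simply returns a pointer into it.
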
@@ -480,13 +590,26 @@ nfs4_recoverydir(void)
 static int
 nfsd4_check_legacy_client(struct nfs4_client *clp)
 {
+	int status;
+	char dname[HEXDIR_LEN];
+	struct nfs4_client_reclaim *crp;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
 	/* did we already find that this client is stable? */
 	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
 		return 0;
 
+	status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+	if (status) {
+		legacy_recdir_name_error(status);
+		return status;
+	}
+
 	/* look for it in the reclaim hashtable otherwise */
-	if (nfsd4_find_reclaim_client(clp)) {
+	crp = nfsd4_find_reclaim_client(dname, nn);
+	if (crp) {
 		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+		crp->cr_clp = clp;
 		return 0;
 	}
 
@@ -494,7 +617,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
 }
 
 static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
-	.init		= nfsd4_load_reboot_recovery_data,
+	.init		= nfsd4_legacy_tracking_init,
 	.exit		= nfsd4_legacy_tracking_exit,
 	.create		= nfsd4_create_clid_dir,
 	.remove		= nfsd4_remove_clid_dir,
@@ -785,8 +908,7 @@ nfsd4_cld_create(struct nfs4_client *clp)
 {
 	int ret;
 	struct cld_upcall *cup;
-	/* FIXME: determine net from clp */
-	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 	struct cld_net *cn = nn->cld_net;
 
 	/* Don't upcall if it's already stored */
@@ -823,8 +945,7 @@ nfsd4_cld_remove(struct nfs4_client *clp)
 {
 	int ret;
 	struct cld_upcall *cup;
-	/* FIXME: determine net from clp */
-	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 	struct cld_net *cn = nn->cld_net;
 
 	/* Don't upcall if it's already removed */
@@ -861,8 +982,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
 {
 	int ret;
 	struct cld_upcall *cup;
-	/* FIXME: determine net from clp */
-	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 	struct cld_net *cn = nn->cld_net;
 
 	/* Don't upcall if one was already stored during this grace pd */
@@ -892,11 +1012,10 @@ nfsd4_cld_check(struct nfs4_client *clp)
 }
 
 static void
-nfsd4_cld_grace_done(struct net *net, time_t boot_time)
+nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
 {
 	int ret;
 	struct cld_upcall *cup;
-	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	struct cld_net *cn = nn->cld_net;
 
 	cup = alloc_cld_upcall(cn);
@@ -926,28 +1045,261 @@ static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
 	.grace_done	= nfsd4_cld_grace_done,
 };
 
+/* upcall via usermodehelper */
+static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack";
+module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog),
+			S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program");
+
+static bool cltrack_legacy_disable;
+module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cltrack_legacy_disable,
+		"Disable legacy recoverydir conversion. Default: false");
+
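Both knobs above are declared with S_IWUSR, so they stay writable at runtime; assuming the standard sysfs layout for module parameters, root can change them on a running system, for example:

    echo /usr/sbin/nfsdcltrack > /sys/module/nfsd/parameters/cltrack_prog

(the helper's install path is distro-dependent). This is the mechanism the "admin can re-enable it on the fly by using sysfs" comment further down refers to.
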
+#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
+#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
+
+static char *
+nfsd4_cltrack_legacy_topdir(void)
+{
+	int copied;
+	size_t len;
+	char *result;
+
+	if (cltrack_legacy_disable)
+		return NULL;
+
+	len = strlen(LEGACY_TOPDIR_ENV_PREFIX) +
+		strlen(nfs4_recoverydir()) + 1;
+
+	result = kmalloc(len, GFP_KERNEL);
+	if (!result)
+		return result;
+
+	copied = snprintf(result, len, LEGACY_TOPDIR_ENV_PREFIX "%s",
+				nfs4_recoverydir());
+	if (copied >= len) {
+		/* just return nothing if output was truncated */
+		kfree(result);
+		return NULL;
+	}
+
+	return result;
+}
+
+static char *
+nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
+{
+	int copied;
+	size_t len;
+	char *result;
+
+	if (cltrack_legacy_disable)
+		return NULL;
+
+	/* +1 is for '/' between "topdir" and "recdir" */
+	len = strlen(LEGACY_RECDIR_ENV_PREFIX) +
+		strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN;
+
+	result = kmalloc(len, GFP_KERNEL);
+	if (!result)
+		return result;
+
+	copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/",
+				nfs4_recoverydir());
+	if (copied > (len - HEXDIR_LEN)) {
+		/* just return nothing if output will be truncated */
+		kfree(result);
+		return NULL;
+	}
+
+	copied = nfs4_make_rec_clidname(result + copied, name);
+	if (copied) {
+		kfree(result);
+		return NULL;
+	}
+
+	return result;
+}
+
+static int
+nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
+{
+	char *envp[2];
+	char *argv[4];
+	int ret;
+
+	if (unlikely(!cltrack_prog[0])) {
+		dprintk("%s: cltrack_prog is disabled\n", __func__);
+		return -EACCES;
+	}
+
+	dprintk("%s: cmd: %s\n", __func__, cmd);
+	dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
+	dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)");
+
+	envp[0] = legacy;
+	envp[1] = NULL;
+
+	argv[0] = (char *)cltrack_prog;
+	argv[1] = cmd;
+	argv[2] = arg;
+	argv[3] = NULL;
+
+	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	/*
+	 * Disable the upcall mechanism if we're getting an ENOENT or EACCES
+	 * error. The admin can re-enable it on the fly by using sysfs
+	 * once the problem has been fixed.
+	 */
+	if (ret == -ENOENT || ret == -EACCES) {
+		dprintk("NFSD: %s was not found or isn't executable (%d). "
+			"Setting cltrack_prog to blank string!",
+			cltrack_prog, ret);
+		cltrack_prog[0] = '\0';
+	}
+	dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret);
+
+	return ret;
+}
+
+static char *
+bin_to_hex_dup(const unsigned char *src, int srclen)
+{
+	int i;
+	char *buf, *hex;
+
+	/* +1 for terminating NULL */
+	buf = kmalloc((srclen * 2) + 1, GFP_KERNEL);
+	if (!buf)
+		return buf;
+
+	hex = buf;
+	for (i = 0; i < srclen; i++) {
+		sprintf(hex, "%2.2x", *src++);
+		hex += 2;
+	}
+	return buf;
+}
+
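A quick worked example of bin_to_hex_dup() (illustrative values, not from the patch): src = {0xde, 0xad} with srclen = 2 returns the kmalloc'd string "dead". The buffer is srclen * 2 + 1 bytes; each sprintf() writes two hex digits plus a NUL, so the last iteration's terminating NUL lands exactly in the final byte. The caller owns the result and must kfree() it, as the callers below do.
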
+static int
+nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net)
+{
+	return nfsd4_umh_cltrack_upcall("init", NULL, NULL);
+}
+
+static void
+nfsd4_umh_cltrack_create(struct nfs4_client *clp)
+{
+	char *hexid;
+
+	hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+	if (!hexid) {
+		dprintk("%s: can't allocate memory for upcall!\n", __func__);
+		return;
+	}
+	nfsd4_umh_cltrack_upcall("create", hexid, NULL);
+	kfree(hexid);
+}
+
+static void
+nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
+{
+	char *hexid;
+
+	hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+	if (!hexid) {
+		dprintk("%s: can't allocate memory for upcall!\n", __func__);
+		return;
+	}
+	nfsd4_umh_cltrack_upcall("remove", hexid, NULL);
+	kfree(hexid);
+}
+
+static int
+nfsd4_umh_cltrack_check(struct nfs4_client *clp)
+{
+	int ret;
+	char *hexid, *legacy;
+
+	hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+	if (!hexid) {
+		dprintk("%s: can't allocate memory for upcall!\n", __func__);
+		return -ENOMEM;
+	}
+	legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
+	ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy);
+	kfree(legacy);
+	kfree(hexid);
+	return ret;
+}
+
+static void
+nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn,
+				time_t boot_time)
+{
+	char *legacy;
+	char timestr[22]; /* FIXME: better way to determine max size? */
+
+	sprintf(timestr, "%ld", boot_time);
+	legacy = nfsd4_cltrack_legacy_topdir();
+	nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy);
+	kfree(legacy);
+}
+
+static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
+	.init		= nfsd4_umh_cltrack_init,
+	.exit		= NULL,
+	.create		= nfsd4_umh_cltrack_create,
+	.remove		= nfsd4_umh_cltrack_remove,
+	.check		= nfsd4_umh_cltrack_check,
+	.grace_done	= nfsd4_umh_cltrack_grace_done,
+};
+
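Putting the argv/envp construction together, a check upcall for a client whose name hex-encodes to 0a0b0c (a made-up value) amounts to roughly the following, assuming the default recovery directory (commonly /var/lib/nfs/v4recovery; the actual path comes from nfs4_recoverydir()):

    NFSDCLTRACK_LEGACY_RECDIR=/var/lib/nfs/v4recovery/<recdir-hash> \
        /sbin/nfsdcltrack check 0a0b0c

The gracedone upcall instead passes the boot time in seconds as its only argument, with NFSDCLTRACK_LEGACY_TOPDIR in the environment.
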
 int
 nfsd4_client_tracking_init(struct net *net)
 {
 	int status;
 	struct path path;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
-	if (!client_tracking_ops) {
-		client_tracking_ops = &nfsd4_cld_tracking_ops;
-		status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
-		if (!status) {
-			if (S_ISDIR(path.dentry->d_inode->i_mode))
-				client_tracking_ops =
-					&nfsd4_legacy_tracking_ops;
-			path_put(&path);
-		}
+	/* just run the init if the method is already decided */
+	if (nn->client_tracking_ops)
+		goto do_init;
+
+	/*
+	 * First, try a UMH upcall. It should succeed or fail quickly, so
+	 * there's little harm in trying that first.
+	 */
+	nn->client_tracking_ops = &nfsd4_umh_tracking_ops;
+	status = nn->client_tracking_ops->init(net);
+	if (!status)
+		return status;
+
+	/*
+	 * See if the recoverydir exists and is a directory. If it is,
+	 * then use the legacy ops.
+	 */
+	nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
+	status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
+	if (!status) {
+		status = S_ISDIR(path.dentry->d_inode->i_mode);
+		path_put(&path);
+		if (status)
+			goto do_init;
 	}
 
-	status = client_tracking_ops->init(net);
+	/* Finally, try to use nfsdcld */
+	nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+	printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
+		"removed in 3.10. Please transition to using "
+		"nfsdcltrack.\n");
+do_init:
+	status = nn->client_tracking_ops->init(net);
 	if (status) {
 		printk(KERN_WARNING "NFSD: Unable to initialize client "
 			"recovery tracking! (%d)\n", status);
-		client_tracking_ops = NULL;
+		nn->client_tracking_ops = NULL;
 	}
 	return status;
 }
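
So the selection order ends up: try the nfsdcltrack usermode helper first; fall back to the legacy recoverydir ops if the recovery directory exists and is a directory; and only then fall back to the nfsdcld pipe upcall, which is the one path that triggers the deprecation warning printed just above.
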
@@ -955,40 +1307,49 @@ nfsd4_client_tracking_init(struct net *net)
 void
 nfsd4_client_tracking_exit(struct net *net)
 {
-	if (client_tracking_ops) {
-		client_tracking_ops->exit(net);
-		client_tracking_ops = NULL;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+	if (nn->client_tracking_ops) {
+		if (nn->client_tracking_ops->exit)
+			nn->client_tracking_ops->exit(net);
+		nn->client_tracking_ops = NULL;
 	}
 }
 
 void
 nfsd4_client_record_create(struct nfs4_client *clp)
 {
-	if (client_tracking_ops)
-		client_tracking_ops->create(clp);
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (nn->client_tracking_ops)
+		nn->client_tracking_ops->create(clp);
 }
 
 void
 nfsd4_client_record_remove(struct nfs4_client *clp)
 {
-	if (client_tracking_ops)
-		client_tracking_ops->remove(clp);
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (nn->client_tracking_ops)
+		nn->client_tracking_ops->remove(clp);
 }
 
 int
 nfsd4_client_record_check(struct nfs4_client *clp)
 {
-	if (client_tracking_ops)
-		return client_tracking_ops->check(clp);
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (nn->client_tracking_ops)
+		return nn->client_tracking_ops->check(clp);
 
 	return -EOPNOTSUPP;
 }
 
 void
-nfsd4_record_grace_done(struct net *net, time_t boot_time)
+nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time)
 {
-	if (client_tracking_ops)
-		client_tracking_ops->grace_done(net, boot_time);
+	if (nn->client_tracking_ops)
+		nn->client_tracking_ops->grace_done(nn, boot_time);
 }
 
 static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d0237f872cc4..ac8ed96c4199 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -44,16 +44,11 @@
 #include "xdr4.h"
 #include "vfs.h"
 #include "current_stateid.h"
-#include "fault_inject.h"
 
 #include "netns.h"
 
 #define NFSDDBG_FACILITY	NFSDDBG_PROC
 
-/* Globals */
-time_t nfsd4_lease = 90;	/* default lease time */
-time_t nfsd4_grace = 90;
-
 #define all_ones {{~0,~0},~0}
 static const stateid_t one_stateid = {
 	.si_generation = ~0,
@@ -176,8 +171,6 @@ static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
 	return ret & OWNER_HASH_MASK;
 }
 
-static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
-
 /* hash table for nfs4_file */
 #define FILE_HASH_BITS	8
 #define FILE_HASH_SIZE	(1 << FILE_HASH_BITS)
@@ -192,7 +185,7 @@ static struct list_head file_hashtbl[FILE_HASH_SIZE];
 
 static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
 {
-	BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
+	WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
 	atomic_inc(&fp->fi_access[oflag]);
 }
 
@@ -251,7 +244,7 @@ static inline int get_new_stid(struct nfs4_stid *stid)
 	 * preallocations that can exist at a time, but the state lock
 	 * prevents anyone from using ours before we get here:
 	 */
-	BUG_ON(error);
+	WARN_ON_ONCE(error);
 	/*
 	 * It shouldn't be a problem to reuse an opaque stateid value.
 	 * I don't think it is for 4.1. But with 4.0 I worry that, for
@@ -340,7 +333,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
 	fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
 	dp->dl_time = 0;
 	atomic_set(&dp->dl_count, 1);
-	INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
+	nfsd4_init_callback(&dp->dl_recall);
 	return dp;
 }
 
@@ -390,14 +383,6 @@ unhash_delegation(struct nfs4_delegation *dp)
  * SETCLIENTID state
  */
 
-/* client_lock protects the client lru list and session hash table */
-static DEFINE_SPINLOCK(client_lock);
-
-/* Hash tables for nfs4_clientid state */
-#define CLIENT_HASH_BITS	4
-#define CLIENT_HASH_SIZE	(1 << CLIENT_HASH_BITS)
-#define CLIENT_HASH_MASK	(CLIENT_HASH_SIZE - 1)
-
 static unsigned int clientid_hashval(u32 id)
 {
 	return id & CLIENT_HASH_MASK;
@@ -409,31 +394,6 @@ static unsigned int clientstr_hashval(const char *name)
 }
 
 /*
- * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
- * used in reboot/reset lease grace period processing
- *
- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
- * setclientid_confirmed info.
- *
- * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed
- * setclientid info.
- *
- * client_lru holds client queue ordered by nfs4_client.cl_time
- * for lease renewal.
- *
- * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
- * for last close replay.
- */
-static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE];
-static int reclaim_str_hashtbl_size = 0;
-static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head client_lru;
-static struct list_head close_lru;
-
-/*
  * We store the NONE, READ, WRITE, and BOTH bits separately in the
  * st_{access,deny}_bmap field of the stateid, in order to track not
  * only what share bits are currently in force, but also what
@@ -526,7 +486,8 @@ static int nfs4_access_to_omode(u32 access)
 	case NFS4_SHARE_ACCESS_BOTH:
 		return O_RDWR;
 	}
-	BUG();
+	WARN_ON_ONCE(1);
+	return O_RDONLY;
 }
 
 /* release all access and file references for a given stateid */
@@ -652,9 +613,6 @@ static void release_openowner(struct nfs4_openowner *oo)
 	nfs4_free_openowner(oo);
 }
 
-#define SESSION_HASH_SIZE	512
-static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
-
 static inline int
 hash_sessionid(struct nfs4_sessionid *sessionid)
 {
@@ -785,9 +743,12 @@ out_free:
 	return NULL;
 }
 
-static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
+static void init_forechannel_attrs(struct nfsd4_channel_attrs *new,
+				   struct nfsd4_channel_attrs *req,
+				   int numslots, int slotsize,
+				   struct nfsd_net *nn)
 {
-	u32 maxrpc = nfsd_serv->sv_max_mesg;
+	u32 maxrpc = nn->nfsd_serv->sv_max_mesg;
 
 	new->maxreqs = numslots;
 	new->maxresp_cached = min_t(u32, req->maxresp_cached,
@@ -906,21 +867,27 @@ static void __free_session(struct nfsd4_session *ses)
 static void free_session(struct kref *kref)
 {
 	struct nfsd4_session *ses;
+	struct nfsd_net *nn;
 
-	lockdep_assert_held(&client_lock);
 	ses = container_of(kref, struct nfsd4_session, se_ref);
+	nn = net_generic(ses->se_client->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
 	nfsd4_del_conns(ses);
 	__free_session(ses);
 }
 
 void nfsd4_put_session(struct nfsd4_session *ses)
 {
-	spin_lock(&client_lock);
+	struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
+
+	spin_lock(&nn->client_lock);
 	nfsd4_put_session_locked(ses);
-	spin_unlock(&client_lock);
+	spin_unlock(&nn->client_lock);
 }
 
-static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)
+static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan,
+					   struct nfsd_net *nn)
 {
 	struct nfsd4_session *new;
 	int numslots, slotsize;
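
free_session() above now documents its locking contract with lockdep_assert_held(&nn->client_lock): with CONFIG_LOCKDEP enabled this warns if the calling context does not hold the lock, and it compiles away otherwise. A minimal sketch of the idiom (demo function, not from the patch):

    static void demo_locked_op(struct nfsd_net *nn)
    {
    	lockdep_assert_held(&nn->client_lock); /* splat if caller forgot the lock */
    	/* ... touch state guarded by client_lock ... */
    }
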
@@ -941,13 +908,14 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)
 		nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
 		return NULL;
 	}
-	init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
+	init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn);
 	return new;
 }
 
-static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
+static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
 {
 	int idx;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
 	new->se_client = clp;
 	gen_sessionid(new);
@@ -957,14 +925,15 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s
 	new->se_cb_seq_nr = 1;
 	new->se_flags = cses->flags;
 	new->se_cb_prog = cses->callback_prog;
+	new->se_cb_sec = cses->cb_sec;
 	kref_init(&new->se_ref);
 	idx = hash_sessionid(&new->se_sessionid);
-	spin_lock(&client_lock);
-	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+	spin_lock(&nn->client_lock);
+	list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
 	spin_lock(&clp->cl_lock);
 	list_add(&new->se_perclnt, &clp->cl_sessions);
 	spin_unlock(&clp->cl_lock);
-	spin_unlock(&client_lock);
+	spin_unlock(&nn->client_lock);
 
 	if (cses->flags & SESSION4_BACK_CHAN) {
 		struct sockaddr *sa = svc_addr(rqstp);
@@ -978,20 +947,20 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s
 		rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
 		clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
 	}
-	return new;
 }
 
 /* caller must hold client_lock */
 static struct nfsd4_session *
-find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
 {
 	struct nfsd4_session *elem;
 	int idx;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	dump_sessionid(__func__, sessionid);
 	idx = hash_sessionid(sessionid);
 	/* Search in the appropriate list */
-	list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
+	list_for_each_entry(elem, &nn->sessionid_hashtbl[idx], se_hash) {
 		if (!memcmp(elem->se_sessionid.data, sessionid->data,
 			    NFS4_MAX_SESSIONID_LEN)) {
 			return elem;
@@ -1016,6 +985,8 @@ unhash_session(struct nfsd4_session *ses)
 static inline void
 renew_client_locked(struct nfs4_client *clp)
 {
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
 	if (is_client_expired(clp)) {
 		WARN_ON(1);
 		printk("%s: client (clientid %08x/%08x) already expired\n",
@@ -1028,16 +999,18 @@ renew_client_locked(struct nfs4_client *clp)
 	dprintk("renewing client (clientid %08x/%08x)\n",
 			clp->cl_clientid.cl_boot,
 			clp->cl_clientid.cl_id);
-	list_move_tail(&clp->cl_lru, &client_lru);
+	list_move_tail(&clp->cl_lru, &nn->client_lru);
 	clp->cl_time = get_seconds();
 }
 
 static inline void
 renew_client(struct nfs4_client *clp)
 {
-	spin_lock(&client_lock);
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	spin_lock(&nn->client_lock);
 	renew_client_locked(clp);
-	spin_unlock(&client_lock);
+	spin_unlock(&nn->client_lock);
 }
 
 /* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
@@ -1075,7 +1048,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 static inline void
 free_client(struct nfs4_client *clp)
 {
-	lockdep_assert_held(&client_lock);
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	lockdep_assert_held(&nn->client_lock);
 	while (!list_empty(&clp->cl_sessions)) {
 		struct nfsd4_session *ses;
 		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
@@ -1092,15 +1067,16 @@ void
 release_session_client(struct nfsd4_session *session)
 {
 	struct nfs4_client *clp = session->se_client;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
-	if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
+	if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
 		return;
 	if (is_client_expired(clp)) {
 		free_client(clp);
 		session->se_client = NULL;
 	} else
 		renew_client_locked(clp);
-	spin_unlock(&client_lock);
+	spin_unlock(&nn->client_lock);
 }
 
 /* must be called under the client_lock */
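
release_session_client() in the hunk above leans on atomic_dec_and_lock() semantics: it decrements cl_refcount, and only when the count drops to zero does it acquire nn->client_lock and return true; otherwise it returns false with the lock not held. That is why the early return performs no unlock, while the free/renew path runs under client_lock and pairs with the spin_unlock() at the end.
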
@@ -1123,6 +1099,7 @@ destroy_client(struct nfs4_client *clp)
 	struct nfs4_openowner *oo;
 	struct nfs4_delegation *dp;
 	struct list_head reaplist;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
@@ -1144,12 +1121,15 @@ destroy_client(struct nfs4_client *clp)
 	if (clp->cl_cb_conn.cb_xprt)
 		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
 	list_del(&clp->cl_idhash);
-	list_del(&clp->cl_strhash);
-	spin_lock(&client_lock);
+	if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
+		rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
+	else
+		rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+	spin_lock(&nn->client_lock);
 	unhash_client_locked(clp);
 	if (atomic_read(&clp->cl_refcount) == 0)
 		free_client(clp);
-	spin_unlock(&client_lock);
+	spin_unlock(&nn->client_lock);
 }
 
 static void expire_client(struct nfs4_client *clp)
@@ -1187,6 +1167,17 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
 	return 0;
 }
 
+static long long
+compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2)
+{
+	long long res;
+
+	res = o1->len - o2->len;
+	if (res)
+		return res;
+	return (long long)memcmp(o1->data, o2->data, o1->len);
+}
+
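compare_blob() orders the opaque client names by length first and by memcmp() second; for two equal-length names (say "abc" and "abd", invented values) the memcmp() result decides. Both the rbtree insert and lookup helpers introduced below apply this comparison with the in-tree node as the first argument, so placement during insert and the path taken during search always agree.
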
 static int same_name(const char *n1, const char *n2)
 {
 	return 0 == memcmp(n1, n2, HEXDIR_LEN);
@@ -1247,10 +1238,9 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
 	return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
 }
 
-static void gen_clid(struct nfs4_client *clp)
+static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
 {
 	static u32 current_clientid = 1;
-	struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
 
 	clp->cl_clientid.cl_boot = nn->boot_time;
 	clp->cl_clientid.cl_id = current_clientid++;
@@ -1283,12 +1273,14 @@ static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t
 	return NULL;
 }
 
-static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
+static struct nfs4_client *create_client(struct xdr_netobj name,
 		struct svc_rqst *rqstp, nfs4_verifier *verf)
 {
 	struct nfs4_client *clp;
 	struct sockaddr *sa = svc_addr(rqstp);
 	int ret;
+	struct net *net = SVC_NET(rqstp);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	clp = alloc_client(name);
 	if (clp == NULL)
@@ -1297,23 +1289,21 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	INIT_LIST_HEAD(&clp->cl_sessions);
 	ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
 	if (ret) {
-		spin_lock(&client_lock);
+		spin_lock(&nn->client_lock);
 		free_client(clp);
-		spin_unlock(&client_lock);
+		spin_unlock(&nn->client_lock);
 		return NULL;
 	}
 	idr_init(&clp->cl_stateids);
-	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
 	atomic_set(&clp->cl_refcount, 0);
 	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
 	INIT_LIST_HEAD(&clp->cl_idhash);
-	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_lru);
 	INIT_LIST_HEAD(&clp->cl_callbacks);
 	spin_lock_init(&clp->cl_lock);
-	INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
+	nfsd4_init_callback(&clp->cl_cb_null);
 	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -1321,17 +1311,60 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 	rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
 	gen_confirm(clp);
 	clp->cl_cb_session = NULL;
+	clp->net = net;
 	return clp;
 }
 
 static void
-add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
+add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct nfs4_client *clp;
+
+	while (*new) {
+		clp = rb_entry(*new, struct nfs4_client, cl_namenode);
+		parent = *new;
+
+		if (compare_blob(&clp->cl_name, &new_clp->cl_name) > 0)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	rb_link_node(&new_clp->cl_namenode, parent, new);
+	rb_insert_color(&new_clp->cl_namenode, root);
+}
+
+static struct nfs4_client *
+find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
+{
+	long long cmp;
+	struct rb_node *node = root->rb_node;
+	struct nfs4_client *clp;
+
+	while (node) {
+		clp = rb_entry(node, struct nfs4_client, cl_namenode);
+		cmp = compare_blob(&clp->cl_name, name);
+		if (cmp > 0)
+			node = node->rb_left;
+		else if (cmp < 0)
+			node = node->rb_right;
+		else
+			return clp;
+	}
+	return NULL;
+}
+
+static void
+add_to_unconfirmed(struct nfs4_client *clp)
 {
 	unsigned int idhashval;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
-	list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]);
+	clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
+	add_clp_to_name_tree(clp, &nn->unconf_name_tree);
 	idhashval = clientid_hashval(clp->cl_clientid.cl_id);
-	list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]);
+	list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]);
 	renew_client(clp);
 }
 
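Note the pairing with destroy_client() earlier in this file: add_to_unconfirmed() clears NFSD4_CLIENT_CONFIRMED and move_to_confirmed() (next hunk) sets it, which is exactly the bit destroy_client() tests to decide whether to rb_erase() the client from conf_name_tree or unconf_name_tree.
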
@@ -1339,22 +1372,23 @@ static void
 move_to_confirmed(struct nfs4_client *clp)
 {
 	unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id);
-	unsigned int strhashval;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
 	dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
-	list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
-	strhashval = clientstr_hashval(clp->cl_recdir);
-	list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
+	list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);
+	rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+	add_clp_to_name_tree(clp, &nn->conf_name_tree);
+	set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
 	renew_client(clp);
 }
 
 static struct nfs4_client *
-find_confirmed_client(clientid_t *clid, bool sessions)
+find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 {
 	struct nfs4_client *clp;
 	unsigned int idhashval = clientid_hashval(clid->cl_id);
 
-	list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
+	list_for_each_entry(clp, &nn->conf_id_hashtbl[idhashval], cl_idhash) {
 		if (same_clid(&clp->cl_clientid, clid)) {
 			if ((bool)clp->cl_minorversion != sessions)
 				return NULL;
@@ -1366,12 +1400,12 @@ find_confirmed_client(clientid_t *clid, bool sessions)
 }
 
 static struct nfs4_client *
-find_unconfirmed_client(clientid_t *clid, bool sessions)
+find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
 {
 	struct nfs4_client *clp;
 	unsigned int idhashval = clientid_hashval(clid->cl_id);
 
-	list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) {
+	list_for_each_entry(clp, &nn->unconf_id_hashtbl[idhashval], cl_idhash) {
 		if (same_clid(&clp->cl_clientid, clid)) {
 			if ((bool)clp->cl_minorversion != sessions)
 				return NULL;
@@ -1387,27 +1421,15 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)
 }
 
 static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval)
+find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
 {
-	struct nfs4_client *clp;
-
-	list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname))
-			return clp;
-	}
-	return NULL;
+	return find_clp_in_name_tree(name, &nn->conf_name_tree);
 }
 
 static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
+find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
 {
-	struct nfs4_client *clp;
-
-	list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname))
-			return clp;
-	}
-	return NULL;
+	return find_clp_in_name_tree(name, &nn->unconf_name_tree);
 }
 
 static void
@@ -1428,7 +1450,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
 	else
 		goto out_err;
 
-	conn->cb_addrlen = rpc_uaddr2sockaddr(&init_net, se->se_callback_addr_val,
+	conn->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val,
 					    se->se_callback_addr_len,
 					    (struct sockaddr *)&conn->cb_addr,
 					    sizeof(conn->cb_addr));
@@ -1572,12 +1594,11 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 {
 	struct nfs4_client *unconf, *conf, *new;
 	__be32 status;
-	unsigned int strhashval;
-	char dname[HEXDIR_LEN];
 	char addr_str[INET6_ADDRSTRLEN];
 	nfs4_verifier verf = exid->verifier;
 	struct sockaddr *sa = svc_addr(rqstp);
 	bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
 	rpc_ntop(sa, addr_str, sizeof(addr_str));
 	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
@@ -1592,24 +1613,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 	switch (exid->spa_how) {
 	case SP4_NONE:
 		break;
+	default:				/* checked by xdr code */
+		WARN_ON_ONCE(1);
 	case SP4_SSV:
-		return nfserr_serverfault;
-	default:
-		BUG();				/* checked by xdr code */
 	case SP4_MACH_CRED:
 		return nfserr_serverfault;	/* no excuse :-/ */
 	}
 
-	status = nfs4_make_rec_clidname(dname, &exid->clname);
-
-	if (status)
-		return status;
-
-	strhashval = clientstr_hashval(dname);
-
 	/* Cases below refer to rfc 5661 section 18.35.4: */
 	nfs4_lock_state();
-	conf = find_confirmed_client_by_str(dname, strhashval);
+	conf = find_confirmed_client_by_name(&exid->clname, nn);
 	if (conf) {
 		bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
 		bool verfs_match = same_verf(&verf, &conf->cl_verifier);
@@ -1654,21 +1667,21 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 		goto out;
 	}
 
-	unconf = find_unconfirmed_client_by_str(dname, strhashval);
+	unconf = find_unconfirmed_client_by_name(&exid->clname, nn);
 	if (unconf) /* case 4, possible retry or client restart */
 		expire_client(unconf);
 
 	/* case 1 (normal case) */
out_new:
-	new = create_client(exid->clname, dname, rqstp, &verf);
+	new = create_client(exid->clname, rqstp, &verf);
 	if (new == NULL) {
 		status = nfserr_jukebox;
 		goto out;
 	}
 	new->cl_minorversion = 1;
 
-	gen_clid(new);
-	add_to_unconfirmed(new, strhashval);
+	gen_clid(new, nn);
+	add_to_unconfirmed(new);
out_copy:
 	exid->clientid.cl_boot = new->cl_clientid.cl_boot;
 	exid->clientid.cl_id = new->cl_clientid.cl_id;
@@ -1761,12 +1774,13 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	struct nfsd4_conn *conn;
 	struct nfsd4_clid_slot *cs_slot = NULL;
 	__be32 status = 0;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
 	if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
 		return nfserr_inval;
 	if (check_forechannel_attrs(cr_ses->fore_channel))
 		return nfserr_toosmall;
-	new = alloc_session(&cr_ses->fore_channel);
+	new = alloc_session(&cr_ses->fore_channel, nn);
 	if (!new)
 		return nfserr_jukebox;
 	status = nfserr_jukebox;
@@ -1775,8 +1789,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		goto out_free_session;
 
 	nfs4_lock_state();
-	unconf = find_unconfirmed_client(&cr_ses->clientid, true);
-	conf = find_confirmed_client(&cr_ses->clientid, true);
+	unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
+	conf = find_confirmed_client(&cr_ses->clientid, true, nn);
 
 	if (conf) {
 		cs_slot = &conf->cl_cs_slot;
@@ -1789,7 +1803,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 			goto out_free_conn;
 		}
 	} else if (unconf) {
-		unsigned int hash;
 		struct nfs4_client *old;
 		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
 		    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1803,8 +1816,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 			status = nfserr_seq_misordered;
 			goto out_free_conn;
 		}
-		hash = clientstr_hashval(unconf->cl_recdir);
-		old = find_confirmed_client_by_str(unconf->cl_recdir, hash);
+		old = find_confirmed_client_by_name(&unconf->cl_name, nn);
 		if (old)
 			expire_client(old);
 		move_to_confirmed(unconf);
@@ -1843,14 +1855,6 @@ out_free_session:
 	goto out;
 }
 
-static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
-{
-	struct nfsd4_compoundres *resp = rqstp->rq_resp;
-	struct nfsd4_compoundargs *argp = rqstp->rq_argp;
-
-	return argp->opcnt == resp->opcnt;
-}
-
 static __be32 nfsd4_map_bcts_dir(u32 *dir)
 {
 	switch (*dir) {
@@ -1865,24 +1869,40 @@ static __be32 nfsd4_map_bcts_dir(u32 *dir)
1865 return nfserr_inval; 1869 return nfserr_inval;
1866} 1870}
1867 1871
1872__be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_backchannel_ctl *bc)
1873{
1874 struct nfsd4_session *session = cstate->session;
1875 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
1876
1877 spin_lock(&nn->client_lock);
1878 session->se_cb_prog = bc->bc_cb_program;
1879 session->se_cb_sec = bc->bc_cb_sec;
1880 spin_unlock(&nn->client_lock);
1881
1882 nfsd4_probe_callback(session->se_client);
1883
1884 return nfs_ok;
1885}
1886
1868__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, 1887__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1869 struct nfsd4_compound_state *cstate, 1888 struct nfsd4_compound_state *cstate,
1870 struct nfsd4_bind_conn_to_session *bcts) 1889 struct nfsd4_bind_conn_to_session *bcts)
1871{ 1890{
1872 __be32 status; 1891 __be32 status;
1873 struct nfsd4_conn *conn; 1892 struct nfsd4_conn *conn;
1893 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
1874 1894
1875 if (!nfsd4_last_compound_op(rqstp)) 1895 if (!nfsd4_last_compound_op(rqstp))
1876 return nfserr_not_only_op; 1896 return nfserr_not_only_op;
1877 spin_lock(&client_lock); 1897 spin_lock(&nn->client_lock);
1878 cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid); 1898 cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
1879 /* Sorta weird: we only need the refcnt'ing because new_conn acquires 1899 /* Sorta weird: we only need the refcnt'ing because new_conn acquires
1880 * client_lock iself: */ 1900 * client_lock iself: */
1881 if (cstate->session) { 1901 if (cstate->session) {
1882 nfsd4_get_session(cstate->session); 1902 nfsd4_get_session(cstate->session);
1883 atomic_inc(&cstate->session->se_client->cl_refcount); 1903 atomic_inc(&cstate->session->se_client->cl_refcount);
1884 } 1904 }
1885 spin_unlock(&client_lock); 1905 spin_unlock(&nn->client_lock);
1886 if (!cstate->session) 1906 if (!cstate->session)
1887 return nfserr_badsession; 1907 return nfserr_badsession;
1888 1908
@@ -1910,6 +1930,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
1910{ 1930{
1911 struct nfsd4_session *ses; 1931 struct nfsd4_session *ses;
1912 __be32 status = nfserr_badsession; 1932 __be32 status = nfserr_badsession;
1933 struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
1913 1934
1914 /* Notes: 1935 /* Notes:
1915 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid 1936 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
@@ -1923,24 +1944,24 @@ nfsd4_destroy_session(struct svc_rqst *r,
1923 return nfserr_not_only_op; 1944 return nfserr_not_only_op;
1924 } 1945 }
1925 dump_sessionid(__func__, &sessionid->sessionid); 1946 dump_sessionid(__func__, &sessionid->sessionid);
1926 spin_lock(&client_lock); 1947 spin_lock(&nn->client_lock);
1927 ses = find_in_sessionid_hashtbl(&sessionid->sessionid); 1948 ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r));
1928 if (!ses) { 1949 if (!ses) {
1929 spin_unlock(&client_lock); 1950 spin_unlock(&nn->client_lock);
1930 goto out; 1951 goto out;
1931 } 1952 }
1932 1953
1933 unhash_session(ses); 1954 unhash_session(ses);
1934 spin_unlock(&client_lock); 1955 spin_unlock(&nn->client_lock);
1935 1956
1936 nfs4_lock_state(); 1957 nfs4_lock_state();
1937 nfsd4_probe_callback_sync(ses->se_client); 1958 nfsd4_probe_callback_sync(ses->se_client);
1938 nfs4_unlock_state(); 1959 nfs4_unlock_state();
1939 1960
1940 spin_lock(&client_lock); 1961 spin_lock(&nn->client_lock);
1941 nfsd4_del_conns(ses); 1962 nfsd4_del_conns(ses);
1942 nfsd4_put_session_locked(ses); 1963 nfsd4_put_session_locked(ses);
1943 spin_unlock(&client_lock); 1964 spin_unlock(&nn->client_lock);
1944 status = nfs_ok; 1965 status = nfs_ok;
1945out: 1966out:
1946 dprintk("%s returns %d\n", __func__, ntohl(status)); 1967 dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -2006,6 +2027,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2006 struct nfsd4_slot *slot; 2027 struct nfsd4_slot *slot;
2007 struct nfsd4_conn *conn; 2028 struct nfsd4_conn *conn;
2008 __be32 status; 2029 __be32 status;
2030 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2009 2031
2010 if (resp->opcnt != 1) 2032 if (resp->opcnt != 1)
2011 return nfserr_sequence_pos; 2033 return nfserr_sequence_pos;
@@ -2018,9 +2040,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2018 if (!conn) 2040 if (!conn)
2019 return nfserr_jukebox; 2041 return nfserr_jukebox;
2020 2042
2021 spin_lock(&client_lock); 2043 spin_lock(&nn->client_lock);
2022 status = nfserr_badsession; 2044 status = nfserr_badsession;
2023 session = find_in_sessionid_hashtbl(&seq->sessionid); 2045 session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));
2024 if (!session) 2046 if (!session)
2025 goto out; 2047 goto out;
2026 2048
@@ -2094,7 +2116,7 @@ out:
2094 } 2116 }
2095 } 2117 }
2096 kfree(conn); 2118 kfree(conn);
2097 spin_unlock(&client_lock); 2119 spin_unlock(&nn->client_lock);
2098 dprintk("%s: return %d\n", __func__, ntohl(status)); 2120 dprintk("%s: return %d\n", __func__, ntohl(status));
2099 return status; 2121 return status;
2100} 2122}
@@ -2104,10 +2126,11 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2104{ 2126{
2105 struct nfs4_client *conf, *unconf, *clp; 2127 struct nfs4_client *conf, *unconf, *clp;
2106 __be32 status = 0; 2128 __be32 status = 0;
2129 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2107 2130
2108 nfs4_lock_state(); 2131 nfs4_lock_state();
2109 unconf = find_unconfirmed_client(&dc->clientid, true); 2132 unconf = find_unconfirmed_client(&dc->clientid, true, nn);
2110 conf = find_confirmed_client(&dc->clientid, true); 2133 conf = find_confirmed_client(&dc->clientid, true, nn);
2111 2134
2112 if (conf) { 2135 if (conf) {
2113 clp = conf; 2136 clp = conf;
@@ -2181,20 +2204,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2181{ 2204{
2182 struct xdr_netobj clname = setclid->se_name; 2205 struct xdr_netobj clname = setclid->se_name;
2183 nfs4_verifier clverifier = setclid->se_verf; 2206 nfs4_verifier clverifier = setclid->se_verf;
2184 unsigned int strhashval;
2185 struct nfs4_client *conf, *unconf, *new; 2207 struct nfs4_client *conf, *unconf, *new;
2186 __be32 status; 2208 __be32 status;
2187 char dname[HEXDIR_LEN]; 2209 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2188
2189 status = nfs4_make_rec_clidname(dname, &clname);
2190 if (status)
2191 return status;
2192
2193 strhashval = clientstr_hashval(dname);
2194 2210
2195 /* Cases below refer to rfc 3530 section 14.2.33: */ 2211 /* Cases below refer to rfc 3530 section 14.2.33: */
2196 nfs4_lock_state(); 2212 nfs4_lock_state();
2197 conf = find_confirmed_client_by_str(dname, strhashval); 2213 conf = find_confirmed_client_by_name(&clname, nn);
2198 if (conf) { 2214 if (conf) {
2199 /* case 0: */ 2215 /* case 0: */
2200 status = nfserr_clid_inuse; 2216 status = nfserr_clid_inuse;
@@ -2209,21 +2225,21 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2209 goto out; 2225 goto out;
2210 } 2226 }
2211 } 2227 }
2212 unconf = find_unconfirmed_client_by_str(dname, strhashval); 2228 unconf = find_unconfirmed_client_by_name(&clname, nn);
2213 if (unconf) 2229 if (unconf)
2214 expire_client(unconf); 2230 expire_client(unconf);
2215 status = nfserr_jukebox; 2231 status = nfserr_jukebox;
2216 new = create_client(clname, dname, rqstp, &clverifier); 2232 new = create_client(clname, rqstp, &clverifier);
2217 if (new == NULL) 2233 if (new == NULL)
2218 goto out; 2234 goto out;
2219 if (conf && same_verf(&conf->cl_verifier, &clverifier)) 2235 if (conf && same_verf(&conf->cl_verifier, &clverifier))
2220 /* case 1: probable callback update */ 2236 /* case 1: probable callback update */
2221 copy_clid(new, conf); 2237 copy_clid(new, conf);
2222 else /* case 4 (new client) or cases 2, 3 (client reboot): */ 2238 else /* case 4 (new client) or cases 2, 3 (client reboot): */
2223 gen_clid(new); 2239 gen_clid(new, nn);
2224 new->cl_minorversion = 0; 2240 new->cl_minorversion = 0;
2225 gen_callback(new, setclid, rqstp); 2241 gen_callback(new, setclid, rqstp);
2226 add_to_unconfirmed(new, strhashval); 2242 add_to_unconfirmed(new);
2227 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 2243 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
2228 setclid->se_clientid.cl_id = new->cl_clientid.cl_id; 2244 setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
2229 memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); 2245 memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
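/*
 * Note on the hunk above: nfsd4_setclientid() no longer derives a recdir
 * string and hashes it; clients are now looked up per-namespace by raw
 * name. Given the nn->conf_name_tree / nn->unconf_name_tree roots set up
 * in nfs4_state_create_net() further down, such a name lookup plausibly
 * walks an rb-tree keyed on the xdr_netobj. A minimal sketch -- the
 * helper name and comparison details are assumptions, not part of this
 * commit:
 */
static struct nfs4_client *
name_tree_lookup(struct rb_root *root, struct xdr_netobj *name)
{
	struct rb_node *node = root->rb_node;

	while (node) {
		struct nfs4_client *clp;
		int cmp;

		clp = rb_entry(node, struct nfs4_client, cl_namenode);
		cmp = memcmp(name->data, clp->cl_name.data,
			     min(name->len, clp->cl_name.len));
		if (cmp == 0)
			cmp = (int)name->len - (int)clp->cl_name.len;
		if (cmp < 0)
			node = node->rb_left;
		else if (cmp > 0)
			node = node->rb_right;
		else
			return clp;	/* exact name match */
	}
	return NULL;
}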
@@ -2243,14 +2259,14 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2243 nfs4_verifier confirm = setclientid_confirm->sc_confirm; 2259 nfs4_verifier confirm = setclientid_confirm->sc_confirm;
2244 clientid_t * clid = &setclientid_confirm->sc_clientid; 2260 clientid_t * clid = &setclientid_confirm->sc_clientid;
2245 __be32 status; 2261 __be32 status;
2246 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 2262 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2247 2263
2248 if (STALE_CLIENTID(clid, nn)) 2264 if (STALE_CLIENTID(clid, nn))
2249 return nfserr_stale_clientid; 2265 return nfserr_stale_clientid;
2250 nfs4_lock_state(); 2266 nfs4_lock_state();
2251 2267
2252 conf = find_confirmed_client(clid, false); 2268 conf = find_confirmed_client(clid, false, nn);
2253 unconf = find_unconfirmed_client(clid, false); 2269 unconf = find_unconfirmed_client(clid, false, nn);
2254 /* 2270 /*
2255 * We try hard to give out unique clientid's, so if we get an 2271 * We try hard to give out unique clientid's, so if we get an
2256 * attempt to confirm the same clientid with a different cred, 2272 * attempt to confirm the same clientid with a different cred,
@@ -2276,9 +2292,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2276 nfsd4_probe_callback(conf); 2292 nfsd4_probe_callback(conf);
2277 expire_client(unconf); 2293 expire_client(unconf);
2278 } else { /* case 3: normal case; new or rebooted client */ 2294 } else { /* case 3: normal case; new or rebooted client */
2279 unsigned int hash = clientstr_hashval(unconf->cl_recdir); 2295 conf = find_confirmed_client_by_name(&unconf->cl_name, nn);
2280
2281 conf = find_confirmed_client_by_str(unconf->cl_recdir, hash);
2282 if (conf) 2296 if (conf)
2283 expire_client(conf); 2297 expire_client(conf);
2284 move_to_confirmed(unconf); 2298 move_to_confirmed(unconf);
@@ -2340,7 +2354,7 @@ nfsd4_init_slabs(void)
2340 if (openowner_slab == NULL) 2354 if (openowner_slab == NULL)
2341 goto out_nomem; 2355 goto out_nomem;
2342 lockowner_slab = kmem_cache_create("nfsd4_lockowners", 2356 lockowner_slab = kmem_cache_create("nfsd4_lockowners",
2343 sizeof(struct nfs4_openowner), 0, 0, NULL); 2357 sizeof(struct nfs4_lockowner), 0, 0, NULL);
2344 if (lockowner_slab == NULL) 2358 if (lockowner_slab == NULL)
2345 goto out_nomem; 2359 goto out_nomem;
2346 file_slab = kmem_cache_create("nfsd4_files", 2360 file_slab = kmem_cache_create("nfsd4_files",
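/*
 * Note on the hunk above: a straight bug fix folded into this series.
 * The "nfsd4_lockowners" cache was created with
 * sizeof(struct nfs4_openowner), so lockowner objects came from a slab
 * sized for the wrong type; that only works for as long as the two
 * structs happen not to diverge. The fix sizes the cache with
 * sizeof(struct nfs4_lockowner).
 */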
@@ -2404,7 +2418,9 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
2404 2418
2405static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) 2419static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
2406{ 2420{
2407 list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); 2421 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
2422
2423 list_add(&oo->oo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
2408 list_add(&oo->oo_perclient, &clp->cl_openowners); 2424 list_add(&oo->oo_perclient, &clp->cl_openowners);
2409} 2425}
2410 2426
@@ -2444,11 +2460,13 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
2444} 2460}
2445 2461
2446static void 2462static void
2447move_to_close_lru(struct nfs4_openowner *oo) 2463move_to_close_lru(struct nfs4_openowner *oo, struct net *net)
2448{ 2464{
2465 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
2466
2449 dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); 2467 dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
2450 2468
2451 list_move_tail(&oo->oo_close_lru, &close_lru); 2469 list_move_tail(&oo->oo_close_lru, &nn->close_lru);
2452 oo->oo_time = get_seconds(); 2470 oo->oo_time = get_seconds();
2453} 2471}
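/*
 * The conversion pattern that recurs throughout this commit, in
 * miniature: functions that used file-scope globals now take (or can
 * reach) a struct net and derive the per-namespace state from it. A
 * sketch with a hypothetical function name:
 */
static void example_per_net_use(struct net *net)
{
	struct nfsd_net *nn = net_generic(net, nfsd_net_id);

	spin_lock(&nn->client_lock);	/* was: global client_lock */
	/* ... operate on nn->client_lru, nn->close_lru, ... */
	spin_unlock(&nn->client_lock);
}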
2454 2472
@@ -2462,13 +2480,14 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
2462} 2480}
2463 2481
2464static struct nfs4_openowner * 2482static struct nfs4_openowner *
2465find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, bool sessions) 2483find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
2484 bool sessions, struct nfsd_net *nn)
2466{ 2485{
2467 struct nfs4_stateowner *so; 2486 struct nfs4_stateowner *so;
2468 struct nfs4_openowner *oo; 2487 struct nfs4_openowner *oo;
2469 struct nfs4_client *clp; 2488 struct nfs4_client *clp;
2470 2489
2471 list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { 2490 list_for_each_entry(so, &nn->ownerstr_hashtbl[hashval], so_strhash) {
2472 if (!so->so_is_open_owner) 2491 if (!so->so_is_open_owner)
2473 continue; 2492 continue;
2474 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { 2493 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
@@ -2555,9 +2574,14 @@ static void nfsd_break_deleg_cb(struct file_lock *fl)
2555 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; 2574 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
2556 struct nfs4_delegation *dp; 2575 struct nfs4_delegation *dp;
2557 2576
2558 BUG_ON(!fp); 2577 if (!fp) {
2559 /* We assume break_lease is only called once per lease: */ 2578 WARN(1, "(%p)->fl_owner NULL\n", fl);
2560 BUG_ON(fp->fi_had_conflict); 2579 return;
2580 }
2581 if (fp->fi_had_conflict) {
2582 WARN(1, "duplicate break on %p\n", fp);
2583 return;
2584 }
2561 /* 2585 /*
2562 * We don't want the locks code to timeout the lease for us; 2586 * We don't want the locks code to timeout the lease for us;
2563 * we'll remove it ourself if a delegation isn't returned 2587 * we'll remove it ourself if a delegation isn't returned
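/*
 * The hunk above is one instance of a pattern applied throughout this
 * commit: BUG_ON() assertions reachable from lease callbacks become
 * warnings plus a soft failure, so a broken invariant logs the offending
 * pointer instead of oopsing the machine. Where no message argument is
 * needed, the equivalent idiom is simply:
 *
 *	if (WARN_ON_ONCE(!fp))
 *		return;
 */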
@@ -2599,14 +2623,13 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4
2599 2623
2600__be32 2624__be32
2601nfsd4_process_open1(struct nfsd4_compound_state *cstate, 2625nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2602 struct nfsd4_open *open) 2626 struct nfsd4_open *open, struct nfsd_net *nn)
2603{ 2627{
2604 clientid_t *clientid = &open->op_clientid; 2628 clientid_t *clientid = &open->op_clientid;
2605 struct nfs4_client *clp = NULL; 2629 struct nfs4_client *clp = NULL;
2606 unsigned int strhashval; 2630 unsigned int strhashval;
2607 struct nfs4_openowner *oo = NULL; 2631 struct nfs4_openowner *oo = NULL;
2608 __be32 status; 2632 __be32 status;
2609 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
2610 2633
2611 if (STALE_CLIENTID(&open->op_clientid, nn)) 2634 if (STALE_CLIENTID(&open->op_clientid, nn))
2612 return nfserr_stale_clientid; 2635 return nfserr_stale_clientid;
@@ -2619,10 +2642,11 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2619 return nfserr_jukebox; 2642 return nfserr_jukebox;
2620 2643
2621 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); 2644 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
2622 oo = find_openstateowner_str(strhashval, open, cstate->minorversion); 2645 oo = find_openstateowner_str(strhashval, open, cstate->minorversion, nn);
2623 open->op_openowner = oo; 2646 open->op_openowner = oo;
2624 if (!oo) { 2647 if (!oo) {
2625 clp = find_confirmed_client(clientid, cstate->minorversion); 2648 clp = find_confirmed_client(clientid, cstate->minorversion,
2649 nn);
2626 if (clp == NULL) 2650 if (clp == NULL)
2627 return nfserr_expired; 2651 return nfserr_expired;
2628 goto new_owner; 2652 goto new_owner;
@@ -2891,7 +2915,7 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
2891 open->op_why_no_deleg = WND4_CANCELLED; 2915 open->op_why_no_deleg = WND4_CANCELLED;
2892 break; 2916 break;
2893 case NFS4_SHARE_WANT_NO_DELEG: 2917 case NFS4_SHARE_WANT_NO_DELEG:
2894 BUG(); /* not supposed to get here */ 2918 WARN_ON_ONCE(1);
2895 } 2919 }
2896 } 2920 }
2897} 2921}
@@ -2959,6 +2983,7 @@ out:
2959 } 2983 }
2960 return; 2984 return;
2961out_free: 2985out_free:
2986 unhash_stid(&dp->dl_stid);
2962 nfs4_put_delegation(dp); 2987 nfs4_put_delegation(dp);
2963out_no_deleg: 2988out_no_deleg:
2964 flag = NFS4_OPEN_DELEGATE_NONE; 2989 flag = NFS4_OPEN_DELEGATE_NONE;
@@ -3104,27 +3129,32 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
3104 free_generic_stateid(open->op_stp); 3129 free_generic_stateid(open->op_stp);
3105} 3130}
3106 3131
3132static __be32 lookup_clientid(clientid_t *clid, bool session, struct nfsd_net *nn, struct nfs4_client **clp)
3133{
3134 struct nfs4_client *found;
3135
3136 if (STALE_CLIENTID(clid, nn))
3137 return nfserr_stale_clientid;
3138 found = find_confirmed_client(clid, session, nn);
3139 if (clp)
3140 *clp = found;
3141 return found ? nfs_ok : nfserr_expired;
3142}
3143
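/*
 * lookup_clientid() folds the STALE_CLIENTID() check and the
 * confirmed-client lookup into one helper; callers that only need the
 * status pass a NULL clp out-pointer. This is exactly how nfsd4_renew()
 * below (and nfsd4_lockt() / nfsd4_release_lockowner() further down)
 * now use it:
 *
 *	status = lookup_clientid(clid, cstate->minorversion, nn, NULL);
 *	if (status)
 *		goto out;
 */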
3107__be32 3144__be32
3108nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3145nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3109 clientid_t *clid) 3146 clientid_t *clid)
3110{ 3147{
3111 struct nfs4_client *clp; 3148 struct nfs4_client *clp;
3112 __be32 status; 3149 __be32 status;
3113 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 3150 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
3114 3151
3115 nfs4_lock_state(); 3152 nfs4_lock_state();
3116 dprintk("process_renew(%08x/%08x): starting\n", 3153 dprintk("process_renew(%08x/%08x): starting\n",
3117 clid->cl_boot, clid->cl_id); 3154 clid->cl_boot, clid->cl_id);
3118 status = nfserr_stale_clientid; 3155 status = lookup_clientid(clid, cstate->minorversion, nn, &clp);
3119 if (STALE_CLIENTID(clid, nn)) 3156 if (status)
3120 goto out;
3121 clp = find_confirmed_client(clid, cstate->minorversion);
3122 status = nfserr_expired;
3123 if (clp == NULL) {
3124 /* We assume the client took too long to RENEW. */
3125 dprintk("nfsd4_renew: clientid not found!\n");
3126 goto out; 3157 goto out;
3127 }
3128 status = nfserr_cb_path_down; 3158 status = nfserr_cb_path_down;
3129 if (!list_empty(&clp->cl_delegations) 3159 if (!list_empty(&clp->cl_delegations)
3130 && clp->cl_cb_state != NFSD4_CB_UP) 3160 && clp->cl_cb_state != NFSD4_CB_UP)
@@ -3136,44 +3166,42 @@ out:
3136} 3166}
3137 3167
3138static void 3168static void
3139nfsd4_end_grace(struct net *net) 3169nfsd4_end_grace(struct nfsd_net *nn)
3140{ 3170{
3141 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
3142
3143 /* do nothing if grace period already ended */ 3171 /* do nothing if grace period already ended */
3144 if (nn->grace_ended) 3172 if (nn->grace_ended)
3145 return; 3173 return;
3146 3174
3147 dprintk("NFSD: end of grace period\n"); 3175 dprintk("NFSD: end of grace period\n");
3148 nn->grace_ended = true; 3176 nn->grace_ended = true;
3149 nfsd4_record_grace_done(net, nn->boot_time); 3177 nfsd4_record_grace_done(nn, nn->boot_time);
3150 locks_end_grace(&nn->nfsd4_manager); 3178 locks_end_grace(&nn->nfsd4_manager);
3151 /* 3179 /*
3152 * Now that every NFSv4 client has had the chance to recover and 3180 * Now that every NFSv4 client has had the chance to recover and
3153 * to see the (possibly new, possibly shorter) lease time, we 3181 * to see the (possibly new, possibly shorter) lease time, we
3154 * can safely set the next grace time to the current lease time: 3182 * can safely set the next grace time to the current lease time:
3155 */ 3183 */
3156 nfsd4_grace = nfsd4_lease; 3184 nn->nfsd4_grace = nn->nfsd4_lease;
3157} 3185}
3158 3186
3159static time_t 3187static time_t
3160nfs4_laundromat(void) 3188nfs4_laundromat(struct nfsd_net *nn)
3161{ 3189{
3162 struct nfs4_client *clp; 3190 struct nfs4_client *clp;
3163 struct nfs4_openowner *oo; 3191 struct nfs4_openowner *oo;
3164 struct nfs4_delegation *dp; 3192 struct nfs4_delegation *dp;
3165 struct list_head *pos, *next, reaplist; 3193 struct list_head *pos, *next, reaplist;
3166 time_t cutoff = get_seconds() - nfsd4_lease; 3194 time_t cutoff = get_seconds() - nn->nfsd4_lease;
3167 time_t t, clientid_val = nfsd4_lease; 3195 time_t t, clientid_val = nn->nfsd4_lease;
3168 time_t u, test_val = nfsd4_lease; 3196 time_t u, test_val = nn->nfsd4_lease;
3169 3197
3170 nfs4_lock_state(); 3198 nfs4_lock_state();
3171 3199
3172 dprintk("NFSD: laundromat service - starting\n"); 3200 dprintk("NFSD: laundromat service - starting\n");
3173 nfsd4_end_grace(&init_net); 3201 nfsd4_end_grace(nn);
3174 INIT_LIST_HEAD(&reaplist); 3202 INIT_LIST_HEAD(&reaplist);
3175 spin_lock(&client_lock); 3203 spin_lock(&nn->client_lock);
3176 list_for_each_safe(pos, next, &client_lru) { 3204 list_for_each_safe(pos, next, &nn->client_lru) {
3177 clp = list_entry(pos, struct nfs4_client, cl_lru); 3205 clp = list_entry(pos, struct nfs4_client, cl_lru);
3178 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { 3206 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
3179 t = clp->cl_time - cutoff; 3207 t = clp->cl_time - cutoff;
@@ -3189,7 +3217,7 @@ nfs4_laundromat(void)
3189 unhash_client_locked(clp); 3217 unhash_client_locked(clp);
3190 list_add(&clp->cl_lru, &reaplist); 3218 list_add(&clp->cl_lru, &reaplist);
3191 } 3219 }
3192 spin_unlock(&client_lock); 3220 spin_unlock(&nn->client_lock);
3193 list_for_each_safe(pos, next, &reaplist) { 3221 list_for_each_safe(pos, next, &reaplist) {
3194 clp = list_entry(pos, struct nfs4_client, cl_lru); 3222 clp = list_entry(pos, struct nfs4_client, cl_lru);
3195 dprintk("NFSD: purging unused client (clientid %08x)\n", 3223 dprintk("NFSD: purging unused client (clientid %08x)\n",
@@ -3199,6 +3227,8 @@ nfs4_laundromat(void)
3199 spin_lock(&recall_lock); 3227 spin_lock(&recall_lock);
3200 list_for_each_safe(pos, next, &del_recall_lru) { 3228 list_for_each_safe(pos, next, &del_recall_lru) {
3201 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 3229 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
3230 if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
3231 continue;
3202 if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { 3232 if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) {
3203 u = dp->dl_time - cutoff; 3233 u = dp->dl_time - cutoff;
3204 if (test_val > u) 3234 if (test_val > u)
@@ -3212,8 +3242,8 @@ nfs4_laundromat(void)
3212 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 3242 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
3213 unhash_delegation(dp); 3243 unhash_delegation(dp);
3214 } 3244 }
3215 test_val = nfsd4_lease; 3245 test_val = nn->nfsd4_lease;
3216 list_for_each_safe(pos, next, &close_lru) { 3246 list_for_each_safe(pos, next, &nn->close_lru) {
3217 oo = container_of(pos, struct nfs4_openowner, oo_close_lru); 3247 oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
3218 if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { 3248 if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
3219 u = oo->oo_time - cutoff; 3249 u = oo->oo_time - cutoff;
@@ -3231,16 +3261,19 @@ nfs4_laundromat(void)
3231 3261
3232static struct workqueue_struct *laundry_wq; 3262static struct workqueue_struct *laundry_wq;
3233static void laundromat_main(struct work_struct *); 3263static void laundromat_main(struct work_struct *);
3234static DECLARE_DELAYED_WORK(laundromat_work, laundromat_main);
3235 3264
3236static void 3265static void
3237laundromat_main(struct work_struct *not_used) 3266laundromat_main(struct work_struct *laundry)
3238{ 3267{
3239 time_t t; 3268 time_t t;
3269 struct delayed_work *dwork = container_of(laundry, struct delayed_work,
3270 work);
3271 struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
3272 laundromat_work);
3240 3273
3241 t = nfs4_laundromat(); 3274 t = nfs4_laundromat(nn);
3242 dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t); 3275 dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t);
3243 queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); 3276 queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);
3244} 3277}
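/*
 * Why the container_of() chain above works: the static
 * DECLARE_DELAYED_WORK() is gone, and each nfsd_net instead embeds its
 * own laundromat_work (set up via INIT_DELAYED_WORK() in
 * nfs4_state_create_net() below). The callback receives the work_struct
 * embedded in that delayed_work, so two container_of() steps recover
 * the owning namespace. The first step could equally use the stock
 * workqueue helper:
 *
 *	struct delayed_work *dwork = to_delayed_work(laundry);
 */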
3245 3278
3246static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) 3279static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
@@ -3385,16 +3418,17 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
3385 return nfs_ok; 3418 return nfs_ok;
3386} 3419}
3387 3420
3388static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, bool sessions) 3421static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
3422 struct nfs4_stid **s, bool sessions,
3423 struct nfsd_net *nn)
3389{ 3424{
3390 struct nfs4_client *cl; 3425 struct nfs4_client *cl;
3391 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
3392 3426
3393 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 3427 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3394 return nfserr_bad_stateid; 3428 return nfserr_bad_stateid;
3395 if (STALE_STATEID(stateid, nn)) 3429 if (STALE_STATEID(stateid, nn))
3396 return nfserr_stale_stateid; 3430 return nfserr_stale_stateid;
3397 cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions); 3431 cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions, nn);
3398 if (!cl) 3432 if (!cl)
3399 return nfserr_expired; 3433 return nfserr_expired;
3400 *s = find_stateid_by_type(cl, stateid, typemask); 3434 *s = find_stateid_by_type(cl, stateid, typemask);
@@ -3416,6 +3450,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3416 struct nfs4_delegation *dp = NULL; 3450 struct nfs4_delegation *dp = NULL;
3417 struct svc_fh *current_fh = &cstate->current_fh; 3451 struct svc_fh *current_fh = &cstate->current_fh;
3418 struct inode *ino = current_fh->fh_dentry->d_inode; 3452 struct inode *ino = current_fh->fh_dentry->d_inode;
3453 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
3419 __be32 status; 3454 __be32 status;
3420 3455
3421 if (filpp) 3456 if (filpp)
@@ -3427,7 +3462,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3427 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 3462 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3428 return check_special_stateids(net, current_fh, stateid, flags); 3463 return check_special_stateids(net, current_fh, stateid, flags);
3429 3464
3430 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion); 3465 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
3466 &s, cstate->minorversion, nn);
3431 if (status) 3467 if (status)
3432 return status; 3468 return status;
3433 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); 3469 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
@@ -3441,7 +3477,11 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3441 goto out; 3477 goto out;
3442 if (filpp) { 3478 if (filpp) {
3443 *filpp = dp->dl_file->fi_deleg_file; 3479 *filpp = dp->dl_file->fi_deleg_file;
3444 BUG_ON(!*filpp); 3480 if (!*filpp) {
3481 WARN_ON_ONCE(1);
3482 status = nfserr_serverfault;
3483 goto out;
3484 }
3445 } 3485 }
3446 break; 3486 break;
3447 case NFS4_OPEN_STID: 3487 case NFS4_OPEN_STID:
@@ -3568,7 +3608,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
3568static __be32 3608static __be32
3569nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, 3609nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3570 stateid_t *stateid, char typemask, 3610 stateid_t *stateid, char typemask,
3571 struct nfs4_ol_stateid **stpp) 3611 struct nfs4_ol_stateid **stpp,
3612 struct nfsd_net *nn)
3572{ 3613{
3573 __be32 status; 3614 __be32 status;
3574 struct nfs4_stid *s; 3615 struct nfs4_stid *s;
@@ -3577,7 +3618,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3577 seqid, STATEID_VAL(stateid)); 3618 seqid, STATEID_VAL(stateid));
3578 3619
3579 *stpp = NULL; 3620 *stpp = NULL;
3580 status = nfsd4_lookup_stateid(stateid, typemask, &s, cstate->minorversion); 3621 status = nfsd4_lookup_stateid(stateid, typemask, &s,
3622 cstate->minorversion, nn);
3581 if (status) 3623 if (status)
3582 return status; 3624 return status;
3583 *stpp = openlockstateid(s); 3625 *stpp = openlockstateid(s);
@@ -3586,13 +3628,14 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3586 return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); 3628 return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
3587} 3629}
3588 3630
3589static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp) 3631static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3632 stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn)
3590{ 3633{
3591 __be32 status; 3634 __be32 status;
3592 struct nfs4_openowner *oo; 3635 struct nfs4_openowner *oo;
3593 3636
3594 status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, 3637 status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
3595 NFS4_OPEN_STID, stpp); 3638 NFS4_OPEN_STID, stpp, nn);
3596 if (status) 3639 if (status)
3597 return status; 3640 return status;
3598 oo = openowner((*stpp)->st_stateowner); 3641 oo = openowner((*stpp)->st_stateowner);
@@ -3608,6 +3651,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3608 __be32 status; 3651 __be32 status;
3609 struct nfs4_openowner *oo; 3652 struct nfs4_openowner *oo;
3610 struct nfs4_ol_stateid *stp; 3653 struct nfs4_ol_stateid *stp;
3654 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
3611 3655
3612 dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", 3656 dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
3613 (int)cstate->current_fh.fh_dentry->d_name.len, 3657 (int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3621,7 +3665,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3621 3665
3622 status = nfs4_preprocess_seqid_op(cstate, 3666 status = nfs4_preprocess_seqid_op(cstate,
3623 oc->oc_seqid, &oc->oc_req_stateid, 3667 oc->oc_seqid, &oc->oc_req_stateid,
3624 NFS4_OPEN_STID, &stp); 3668 NFS4_OPEN_STID, &stp, nn);
3625 if (status) 3669 if (status)
3626 goto out; 3670 goto out;
3627 oo = openowner(stp->st_stateowner); 3671 oo = openowner(stp->st_stateowner);
@@ -3664,7 +3708,7 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac
3664 case NFS4_SHARE_ACCESS_BOTH: 3708 case NFS4_SHARE_ACCESS_BOTH:
3665 break; 3709 break;
3666 default: 3710 default:
3667 BUG(); 3711 WARN_ON_ONCE(1);
3668 } 3712 }
3669} 3713}
3670 3714
@@ -3685,6 +3729,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
3685{ 3729{
3686 __be32 status; 3730 __be32 status;
3687 struct nfs4_ol_stateid *stp; 3731 struct nfs4_ol_stateid *stp;
3732 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
3688 3733
3689 dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", 3734 dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n",
3690 (int)cstate->current_fh.fh_dentry->d_name.len, 3735 (int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3697,7 +3742,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
3697 3742
3698 nfs4_lock_state(); 3743 nfs4_lock_state();
3699 status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, 3744 status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
3700 &od->od_stateid, &stp); 3745 &od->od_stateid, &stp, nn);
3701 if (status) 3746 if (status)
3702 goto out; 3747 goto out;
3703 status = nfserr_inval; 3748 status = nfserr_inval;
@@ -3760,6 +3805,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3760 __be32 status; 3805 __be32 status;
3761 struct nfs4_openowner *oo; 3806 struct nfs4_openowner *oo;
3762 struct nfs4_ol_stateid *stp; 3807 struct nfs4_ol_stateid *stp;
3808 struct net *net = SVC_NET(rqstp);
3809 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
3763 3810
3764 dprintk("NFSD: nfsd4_close on file %.*s\n", 3811 dprintk("NFSD: nfsd4_close on file %.*s\n",
3765 (int)cstate->current_fh.fh_dentry->d_name.len, 3812 (int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3769,7 +3816,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3769 status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, 3816 status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
3770 &close->cl_stateid, 3817 &close->cl_stateid,
3771 NFS4_OPEN_STID|NFS4_CLOSED_STID, 3818 NFS4_OPEN_STID|NFS4_CLOSED_STID,
3772 &stp); 3819 &stp, nn);
3773 if (status) 3820 if (status)
3774 goto out; 3821 goto out;
3775 oo = openowner(stp->st_stateowner); 3822 oo = openowner(stp->st_stateowner);
@@ -3791,7 +3838,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3791 * little while to handle CLOSE replay. 3838 * little while to handle CLOSE replay.
3792 */ 3839 */
3793 if (list_empty(&oo->oo_owner.so_stateids)) 3840 if (list_empty(&oo->oo_owner.so_stateids))
3794 move_to_close_lru(oo); 3841 move_to_close_lru(oo, SVC_NET(rqstp));
3795 } 3842 }
3796 } 3843 }
3797out: 3844out:
@@ -3807,15 +3854,15 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3807 struct nfs4_delegation *dp; 3854 struct nfs4_delegation *dp;
3808 stateid_t *stateid = &dr->dr_stateid; 3855 stateid_t *stateid = &dr->dr_stateid;
3809 struct nfs4_stid *s; 3856 struct nfs4_stid *s;
3810 struct inode *inode;
3811 __be32 status; 3857 __be32 status;
3858 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
3812 3859
3813 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 3860 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
3814 return status; 3861 return status;
3815 inode = cstate->current_fh.fh_dentry->d_inode;
3816 3862
3817 nfs4_lock_state(); 3863 nfs4_lock_state();
3818 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, cstate->minorversion); 3864 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s,
3865 cstate->minorversion, nn);
3819 if (status) 3866 if (status)
3820 goto out; 3867 goto out;
3821 dp = delegstateid(s); 3868 dp = delegstateid(s);
@@ -3833,8 +3880,6 @@ out:
3833 3880
3834#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) 3881#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
3835 3882
3836#define LOCKOWNER_INO_HASH_BITS 8
3837#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
3838#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1) 3883#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
3839 3884
3840static inline u64 3885static inline u64
@@ -3852,7 +3897,7 @@ last_byte_offset(u64 start, u64 len)
3852{ 3897{
3853 u64 end; 3898 u64 end;
3854 3899
3855 BUG_ON(!len); 3900 WARN_ON_ONCE(!len);
3856 end = start + len; 3901 end = start + len;
3857 return end > start ? end - 1: NFS4_MAX_UINT64; 3902 return end > start ? end - 1: NFS4_MAX_UINT64;
3858} 3903}
@@ -3864,8 +3909,6 @@ static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct
3864 & LOCKOWNER_INO_HASH_MASK; 3909 & LOCKOWNER_INO_HASH_MASK;
3865} 3910}
3866 3911
3867static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE];
3868
3869/* 3912/*
3870 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that 3913 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
3871 * we can't properly handle lock requests that go beyond the (2^63 - 1)-th 3914 * we can't properly handle lock requests that go beyond the (2^63 - 1)-th
@@ -3931,12 +3974,12 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c
3931 3974
3932static struct nfs4_lockowner * 3975static struct nfs4_lockowner *
3933find_lockowner_str(struct inode *inode, clientid_t *clid, 3976find_lockowner_str(struct inode *inode, clientid_t *clid,
3934 struct xdr_netobj *owner) 3977 struct xdr_netobj *owner, struct nfsd_net *nn)
3935{ 3978{
3936 unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner); 3979 unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
3937 struct nfs4_lockowner *lo; 3980 struct nfs4_lockowner *lo;
3938 3981
3939 list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) { 3982 list_for_each_entry(lo, &nn->lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
3940 if (same_lockowner_ino(lo, inode, clid, owner)) 3983 if (same_lockowner_ino(lo, inode, clid, owner))
3941 return lo; 3984 return lo;
3942 } 3985 }
@@ -3948,9 +3991,10 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s
3948 struct inode *inode = open_stp->st_file->fi_inode; 3991 struct inode *inode = open_stp->st_file->fi_inode;
3949 unsigned int inohash = lockowner_ino_hashval(inode, 3992 unsigned int inohash = lockowner_ino_hashval(inode,
3950 clp->cl_clientid.cl_id, &lo->lo_owner.so_owner); 3993 clp->cl_clientid.cl_id, &lo->lo_owner.so_owner);
3994 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
3951 3995
3952 list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); 3996 list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
3953 list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]); 3997 list_add(&lo->lo_owner_ino_hash, &nn->lockowner_ino_hashtbl[inohash]);
3954 list_add(&lo->lo_perstateid, &open_stp->st_lockowners); 3998 list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
3955} 3999}
3956 4000
@@ -4024,8 +4068,10 @@ static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, s
4024 struct nfs4_client *cl = oo->oo_owner.so_client; 4068 struct nfs4_client *cl = oo->oo_owner.so_client;
4025 struct nfs4_lockowner *lo; 4069 struct nfs4_lockowner *lo;
4026 unsigned int strhashval; 4070 unsigned int strhashval;
4071 struct nfsd_net *nn = net_generic(cl->net, nfsd_net_id);
4027 4072
4028 lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner); 4073 lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid,
4074 &lock->v.new.owner, nn);
4029 if (lo) { 4075 if (lo) {
4030 if (!cstate->minorversion) 4076 if (!cstate->minorversion)
4031 return nfserr_bad_seqid; 4077 return nfserr_bad_seqid;
@@ -4065,7 +4111,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4065 bool new_state = false; 4111 bool new_state = false;
4066 int lkflg; 4112 int lkflg;
4067 int err; 4113 int err;
4068 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 4114 struct net *net = SVC_NET(rqstp);
4115 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4069 4116
4070 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", 4117 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
4071 (long long) lock->lk_offset, 4118 (long long) lock->lk_offset,
@@ -4099,7 +4146,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4099 status = nfs4_preprocess_confirmed_seqid_op(cstate, 4146 status = nfs4_preprocess_confirmed_seqid_op(cstate,
4100 lock->lk_new_open_seqid, 4147 lock->lk_new_open_seqid,
4101 &lock->lk_new_open_stateid, 4148 &lock->lk_new_open_stateid,
4102 &open_stp); 4149 &open_stp, nn);
4103 if (status) 4150 if (status)
4104 goto out; 4151 goto out;
4105 open_sop = openowner(open_stp->st_stateowner); 4152 open_sop = openowner(open_stp->st_stateowner);
@@ -4113,7 +4160,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4113 status = nfs4_preprocess_seqid_op(cstate, 4160 status = nfs4_preprocess_seqid_op(cstate,
4114 lock->lk_old_lock_seqid, 4161 lock->lk_old_lock_seqid,
4115 &lock->lk_old_lock_stateid, 4162 &lock->lk_old_lock_stateid,
4116 NFS4_LOCK_STID, &lock_stp); 4163 NFS4_LOCK_STID, &lock_stp, nn);
4117 if (status) 4164 if (status)
4118 goto out; 4165 goto out;
4119 lock_sop = lockowner(lock_stp->st_stateowner); 4166 lock_sop = lockowner(lock_stp->st_stateowner);
@@ -4124,10 +4171,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4124 goto out; 4171 goto out;
4125 4172
4126 status = nfserr_grace; 4173 status = nfserr_grace;
4127 if (locks_in_grace(SVC_NET(rqstp)) && !lock->lk_reclaim) 4174 if (locks_in_grace(net) && !lock->lk_reclaim)
4128 goto out; 4175 goto out;
4129 status = nfserr_no_grace; 4176 status = nfserr_no_grace;
4130 if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim) 4177 if (!locks_in_grace(net) && lock->lk_reclaim)
4131 goto out; 4178 goto out;
4132 4179
4133 file_lock = locks_alloc_lock(); 4180 file_lock = locks_alloc_lock();
@@ -4238,7 +4285,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4238 struct file_lock *file_lock = NULL; 4285 struct file_lock *file_lock = NULL;
4239 struct nfs4_lockowner *lo; 4286 struct nfs4_lockowner *lo;
4240 __be32 status; 4287 __be32 status;
4241 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 4288 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
4242 4289
4243 if (locks_in_grace(SVC_NET(rqstp))) 4290 if (locks_in_grace(SVC_NET(rqstp)))
4244 return nfserr_grace; 4291 return nfserr_grace;
@@ -4248,9 +4295,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4248 4295
4249 nfs4_lock_state(); 4296 nfs4_lock_state();
4250 4297
4251 status = nfserr_stale_clientid; 4298 if (!nfsd4_has_session(cstate)) {
4252 if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid, nn)) 4299 status = lookup_clientid(&lockt->lt_clientid, false, nn, NULL);
4253 goto out; 4300 if (status)
4301 goto out;
4302 }
4254 4303
4255 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 4304 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
4256 goto out; 4305 goto out;
@@ -4278,7 +4327,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4278 goto out; 4327 goto out;
4279 } 4328 }
4280 4329
4281 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); 4330 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner, nn);
4282 if (lo) 4331 if (lo)
4283 file_lock->fl_owner = (fl_owner_t)lo; 4332 file_lock->fl_owner = (fl_owner_t)lo;
4284 file_lock->fl_pid = current->tgid; 4333 file_lock->fl_pid = current->tgid;
@@ -4313,7 +4362,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4313 struct file_lock *file_lock = NULL; 4362 struct file_lock *file_lock = NULL;
4314 __be32 status; 4363 __be32 status;
4315 int err; 4364 int err;
4316 4365 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
4366
4317 dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n", 4367 dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",
4318 (long long) locku->lu_offset, 4368 (long long) locku->lu_offset,
4319 (long long) locku->lu_length); 4369 (long long) locku->lu_length);
@@ -4324,7 +4374,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4324 nfs4_lock_state(); 4374 nfs4_lock_state();
4325 4375
4326 status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, 4376 status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
4327 &locku->lu_stateid, NFS4_LOCK_STID, &stp); 4377 &locku->lu_stateid, NFS4_LOCK_STID,
4378 &stp, nn);
4328 if (status) 4379 if (status)
4329 goto out; 4380 goto out;
4330 filp = find_any_file(stp->st_file); 4381 filp = find_any_file(stp->st_file);
@@ -4414,23 +4465,21 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4414 struct list_head matches; 4465 struct list_head matches;
4415 unsigned int hashval = ownerstr_hashval(clid->cl_id, owner); 4466 unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
4416 __be32 status; 4467 __be32 status;
4417 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 4468 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
4418 4469
4419 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", 4470 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
4420 clid->cl_boot, clid->cl_id); 4471 clid->cl_boot, clid->cl_id);
4421 4472
4422 /* XXX check for lease expiration */
4423
4424 status = nfserr_stale_clientid;
4425 if (STALE_CLIENTID(clid, nn))
4426 return status;
4427
4428 nfs4_lock_state(); 4473 nfs4_lock_state();
4429 4474
4475 status = lookup_clientid(clid, cstate->minorversion, nn, NULL);
4476 if (status)
4477 goto out;
4478
4430 status = nfserr_locks_held; 4479 status = nfserr_locks_held;
4431 INIT_LIST_HEAD(&matches); 4480 INIT_LIST_HEAD(&matches);
4432 4481
4433 list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) { 4482 list_for_each_entry(sop, &nn->ownerstr_hashtbl[hashval], so_strhash) {
4434 if (sop->so_is_open_owner) 4483 if (sop->so_is_open_owner)
4435 continue; 4484 continue;
4436 if (!same_owner_str(sop, owner, clid)) 4485 if (!same_owner_str(sop, owner, clid))
@@ -4466,73 +4515,74 @@ alloc_reclaim(void)
4466 return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL); 4515 return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);
4467} 4516}
4468 4517
4469int 4518bool
4470nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) 4519nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn)
4471{ 4520{
4472 unsigned int strhashval = clientstr_hashval(name); 4521 struct nfs4_client_reclaim *crp;
4473 struct nfs4_client *clp;
4474 4522
4475 clp = find_confirmed_client_by_str(name, strhashval); 4523 crp = nfsd4_find_reclaim_client(name, nn);
4476 if (!clp) 4524 return (crp && crp->cr_clp);
4477 return 0;
4478 return test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
4479} 4525}
4480 4526
4481/* 4527/*
4482 * failure => all reset bets are off, nfserr_no_grace... 4528 * failure => all reset bets are off, nfserr_no_grace...
4483 */ 4529 */
4484int 4530struct nfs4_client_reclaim *
4485nfs4_client_to_reclaim(const char *name) 4531nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn)
4486{ 4532{
4487 unsigned int strhashval; 4533 unsigned int strhashval;
4488 struct nfs4_client_reclaim *crp = NULL; 4534 struct nfs4_client_reclaim *crp;
4489 4535
4490 dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name); 4536 dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name);
4491 crp = alloc_reclaim(); 4537 crp = alloc_reclaim();
4492 if (!crp) 4538 if (crp) {
4493 return 0; 4539 strhashval = clientstr_hashval(name);
4494 strhashval = clientstr_hashval(name); 4540 INIT_LIST_HEAD(&crp->cr_strhash);
4495 INIT_LIST_HEAD(&crp->cr_strhash); 4541 list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]);
4496 list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); 4542 memcpy(crp->cr_recdir, name, HEXDIR_LEN);
4497 memcpy(crp->cr_recdir, name, HEXDIR_LEN); 4543 crp->cr_clp = NULL;
4498 reclaim_str_hashtbl_size++; 4544 nn->reclaim_str_hashtbl_size++;
4499 return 1; 4545 }
4546 return crp;
4547}
4548
4549void
4550nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn)
4551{
4552 list_del(&crp->cr_strhash);
4553 kfree(crp);
4554 nn->reclaim_str_hashtbl_size--;
4500} 4555}
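/*
 * Lifecycle of the reworked reclaim records, sketched (the cr_clp
 * assignment site lies outside these hunks and is assumed):
 *
 *	crp = nfs4_client_to_reclaim(recdir, nn); // boot: load stable storage
 *	...
 *	crp->cr_clp = clp;                        // client reclaims: link it
 *	...
 *	nfs4_remove_reclaim_record(crp, nn);      // grace over: unhash + free
 *
 * nfs4_has_reclaimed_state() above now just tests crp->cr_clp instead of
 * looking up a client and poking at NFSD4_CLIENT_STABLE.
 */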
4501 4556
4502void 4557void
4503nfs4_release_reclaim(void) 4558nfs4_release_reclaim(struct nfsd_net *nn)
4504{ 4559{
4505 struct nfs4_client_reclaim *crp = NULL; 4560 struct nfs4_client_reclaim *crp = NULL;
4506 int i; 4561 int i;
4507 4562
4508 for (i = 0; i < CLIENT_HASH_SIZE; i++) { 4563 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4509 while (!list_empty(&reclaim_str_hashtbl[i])) { 4564 while (!list_empty(&nn->reclaim_str_hashtbl[i])) {
4510 crp = list_entry(reclaim_str_hashtbl[i].next, 4565 crp = list_entry(nn->reclaim_str_hashtbl[i].next,
4511 struct nfs4_client_reclaim, cr_strhash); 4566 struct nfs4_client_reclaim, cr_strhash);
4512 list_del(&crp->cr_strhash); 4567 nfs4_remove_reclaim_record(crp, nn);
4513 kfree(crp);
4514 reclaim_str_hashtbl_size--;
4515 } 4568 }
4516 } 4569 }
4517 BUG_ON(reclaim_str_hashtbl_size); 4570 WARN_ON_ONCE(nn->reclaim_str_hashtbl_size);
4518} 4571}
4519 4572
4520/* 4573/*
4521 * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ 4574 * called from OPEN, CLAIM_PREVIOUS with a new clientid. */
4522struct nfs4_client_reclaim * 4575struct nfs4_client_reclaim *
4523nfsd4_find_reclaim_client(struct nfs4_client *clp) 4576nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn)
4524{ 4577{
4525 unsigned int strhashval; 4578 unsigned int strhashval;
4526 struct nfs4_client_reclaim *crp = NULL; 4579 struct nfs4_client_reclaim *crp = NULL;
4527 4580
4528 dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n", 4581 dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir);
4529 clp->cl_name.len, clp->cl_name.data,
4530 clp->cl_recdir);
4531 4582
4532 /* find clp->cl_name in reclaim_str_hashtbl */ 4583 strhashval = clientstr_hashval(recdir);
4533 strhashval = clientstr_hashval(clp->cl_recdir); 4584 list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) {
4534 list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) { 4585 if (same_name(crp->cr_recdir, recdir)) {
4535 if (same_name(crp->cr_recdir, clp->cl_recdir)) {
4536 return crp; 4586 return crp;
4537 } 4587 }
4538 } 4588 }
@@ -4543,12 +4593,12 @@ nfsd4_find_reclaim_client(struct nfs4_client *clp)
4543* Called from OPEN. Look for clientid in reclaim list. 4593* Called from OPEN. Look for clientid in reclaim list.
4544*/ 4594*/
4545__be32 4595__be32
4546nfs4_check_open_reclaim(clientid_t *clid, bool sessions) 4596nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn)
4547{ 4597{
4548 struct nfs4_client *clp; 4598 struct nfs4_client *clp;
4549 4599
4550 /* find clientid in conf_id_hashtbl */ 4600 /* find clientid in conf_id_hashtbl */
4551 clp = find_confirmed_client(clid, sessions); 4601 clp = find_confirmed_client(clid, sessions, nn);
4552 if (clp == NULL) 4602 if (clp == NULL)
4553 return nfserr_reclaim_bad; 4603 return nfserr_reclaim_bad;
4554 4604
@@ -4557,124 +4607,177 @@ nfs4_check_open_reclaim(clientid_t *clid, bool sessions)
4557 4607
4558#ifdef CONFIG_NFSD_FAULT_INJECTION 4608#ifdef CONFIG_NFSD_FAULT_INJECTION
4559 4609
4560void nfsd_forget_clients(u64 num) 4610u64 nfsd_forget_client(struct nfs4_client *clp, u64 max)
4561{ 4611{
4562 struct nfs4_client *clp, *next; 4612 expire_client(clp);
4563 int count = 0; 4613 return 1;
4564
4565 nfs4_lock_state();
4566 list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
4567 expire_client(clp);
4568 if (++count == num)
4569 break;
4570 }
4571 nfs4_unlock_state();
4572
4573 printk(KERN_INFO "NFSD: Forgot %d clients", count);
4574} 4614}
4575 4615
4576static void release_lockowner_sop(struct nfs4_stateowner *sop) 4616u64 nfsd_print_client(struct nfs4_client *clp, u64 num)
4577{ 4617{
4578 release_lockowner(lockowner(sop)); 4618 char buf[INET6_ADDRSTRLEN];
4619 rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
4620 printk(KERN_INFO "NFS Client: %s\n", buf);
4621 return 1;
4579} 4622}
4580 4623
4581static void release_openowner_sop(struct nfs4_stateowner *sop) 4624static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
4625 const char *type)
4582{ 4626{
4583 release_openowner(openowner(sop)); 4627 char buf[INET6_ADDRSTRLEN];
4628 rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
4629 printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type);
4584} 4630}
4585 4631
4586static int nfsd_release_n_owners(u64 num, bool is_open_owner, 4632static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *))
4587 void (*release_sop)(struct nfs4_stateowner *))
4588{ 4633{
4589 int i, count = 0; 4634 struct nfs4_openowner *oop;
4590 struct nfs4_stateowner *sop, *next; 4635 struct nfs4_lockowner *lop, *lo_next;
4636 struct nfs4_ol_stateid *stp, *st_next;
4637 u64 count = 0;
4591 4638
4592 for (i = 0; i < OWNER_HASH_SIZE; i++) { 4639 list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) {
4593 list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) { 4640 list_for_each_entry_safe(stp, st_next, &oop->oo_owner.so_stateids, st_perstateowner) {
4594 if (sop->so_is_open_owner != is_open_owner) 4641 list_for_each_entry_safe(lop, lo_next, &stp->st_lockowners, lo_perstateid) {
4595 continue; 4642 if (func)
4596 release_sop(sop); 4643 func(lop);
4597 if (++count == num) 4644 if (++count == max)
4598 return count; 4645 return count;
4646 }
4599 } 4647 }
4600 } 4648 }
4649
4601 return count; 4650 return count;
4602} 4651}
4603 4652
4604void nfsd_forget_locks(u64 num) 4653u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max)
4605{ 4654{
4606 int count; 4655 return nfsd_foreach_client_lock(clp, max, release_lockowner);
4607 4656}
4608 nfs4_lock_state();
4609 count = nfsd_release_n_owners(num, false, release_lockowner_sop);
4610 nfs4_unlock_state();
4611 4657
4612 printk(KERN_INFO "NFSD: Forgot %d locks", count); 4658u64 nfsd_print_client_locks(struct nfs4_client *clp, u64 max)
4659{
4660 u64 count = nfsd_foreach_client_lock(clp, max, NULL);
4661 nfsd_print_count(clp, count, "locked files");
4662 return count;
4613} 4663}
4614 4664
4615void nfsd_forget_openowners(u64 num) 4665static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *))
4616{ 4666{
4617 int count; 4667 struct nfs4_openowner *oop, *next;
4668 u64 count = 0;
4618 4669
4619 nfs4_lock_state(); 4670 list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) {
4620 count = nfsd_release_n_owners(num, true, release_openowner_sop); 4671 if (func)
4621 nfs4_unlock_state(); 4672 func(oop);
4673 if (++count == max)
4674 break;
4675 }
4622 4676
4623 printk(KERN_INFO "NFSD: Forgot %d open owners", count); 4677 return count;
4624} 4678}
4625 4679
4626static int nfsd_process_n_delegations(u64 num, struct list_head *list) 4680u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max)
4627{ 4681{
4628 int i, count = 0; 4682 return nfsd_foreach_client_open(clp, max, release_openowner);
4629 struct nfs4_file *fp, *fnext; 4683}
4630 struct nfs4_delegation *dp, *dnext;
4631 4684
4632 for (i = 0; i < FILE_HASH_SIZE; i++) { 4685u64 nfsd_print_client_openowners(struct nfs4_client *clp, u64 max)
4633 list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) { 4686{
4634 list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) { 4687 u64 count = nfsd_foreach_client_open(clp, max, NULL);
4635 list_move(&dp->dl_recall_lru, list); 4688 nfsd_print_count(clp, count, "open files");
4636 if (++count == num) 4689 return count;
4637 return count; 4690}
4638 } 4691
4639 } 4692static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
4640 } 4693 struct list_head *victims)
4694{
4695 struct nfs4_delegation *dp, *next;
4696 u64 count = 0;
4641 4697
4698 list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
4699 if (victims)
4700 list_move(&dp->dl_recall_lru, victims);
4701 if (++count == max)
4702 break;
4703 }
4642 return count; 4704 return count;
4643} 4705}
4644 4706
4645void nfsd_forget_delegations(u64 num) 4707u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max)
4646{ 4708{
4647 unsigned int count; 4709 struct nfs4_delegation *dp, *next;
4648 LIST_HEAD(victims); 4710 LIST_HEAD(victims);
4649 struct nfs4_delegation *dp, *dnext; 4711 u64 count;
4650 4712
4651 spin_lock(&recall_lock); 4713 spin_lock(&recall_lock);
4652 count = nfsd_process_n_delegations(num, &victims); 4714 count = nfsd_find_all_delegations(clp, max, &victims);
4653 spin_unlock(&recall_lock); 4715 spin_unlock(&recall_lock);
4654 4716
4655 nfs4_lock_state(); 4717 list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
4656 list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru)
4657 unhash_delegation(dp); 4718 unhash_delegation(dp);
4658 nfs4_unlock_state();
4659 4719
4660 printk(KERN_INFO "NFSD: Forgot %d delegations", count); 4720 return count;
4661} 4721}
4662 4722
4663void nfsd_recall_delegations(u64 num) 4723u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max)
4664{ 4724{
4665 unsigned int count; 4725 struct nfs4_delegation *dp, *next;
4666 LIST_HEAD(victims); 4726 LIST_HEAD(victims);
4667 struct nfs4_delegation *dp, *dnext; 4727 u64 count;
4668 4728
4669 spin_lock(&recall_lock); 4729 spin_lock(&recall_lock);
4670 count = nfsd_process_n_delegations(num, &victims); 4730 count = nfsd_find_all_delegations(clp, max, &victims);
4671 list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) { 4731 list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
4672 list_del(&dp->dl_recall_lru);
4673 nfsd_break_one_deleg(dp); 4732 nfsd_break_one_deleg(dp);
4674 }
4675 spin_unlock(&recall_lock); 4733 spin_unlock(&recall_lock);
4676 4734
4677 printk(KERN_INFO "NFSD: Recalled %d delegations", count); 4735 return count;
4736}
4737
4738u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max)
4739{
4740 u64 count = 0;
4741
4742 spin_lock(&recall_lock);
4743 count = nfsd_find_all_delegations(clp, max, NULL);
4744 spin_unlock(&recall_lock);
4745
4746 nfsd_print_count(clp, count, "delegations");
4747 return count;
4748}
4749
4750u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64))
4751{
4752 struct nfs4_client *clp, *next;
4753 u64 count = 0;
4754 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id);
4755
4756 if (!nfsd_netns_ready(nn))
4757 return 0;
4758
4759 list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
4760 count += func(clp, max - count);
4761 if ((max != 0) && (count >= max))
4762 break;
4763 }
4764
4765 return count;
4766}
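/*
 * Usage sketch: the fault-injection entry points compose
 * nfsd_for_n_state() with the per-client helpers above (the call sites
 * live outside this file and are assumed here):
 *
 *	forgotten = nfsd_for_n_state(max, nfsd_forget_client);
 *	printed   = nfsd_for_n_state(0, nfsd_print_client);  // 0 == no cap
 */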
4767
4768struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size)
4769{
4770 struct nfs4_client *clp;
4771 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id);
4772
4773 if (!nfsd_netns_ready(nn))
4774 return NULL;
4775
4776 list_for_each_entry(clp, &nn->client_lru, cl_lru) {
4777 if (memcmp(&clp->cl_addr, addr, addr_size) == 0)
4778 return clp;
4779 }
4780 return NULL;
4678} 4781}
4679 4782
4680#endif /* CONFIG_NFSD_FAULT_INJECTION */ 4783#endif /* CONFIG_NFSD_FAULT_INJECTION */
@@ -4686,27 +4789,10 @@ nfs4_state_init(void)
4686{ 4789{
4687 int i; 4790 int i;
4688 4791
4689 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4690 INIT_LIST_HEAD(&conf_id_hashtbl[i]);
4691 INIT_LIST_HEAD(&conf_str_hashtbl[i]);
4692 INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
4693 INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
4694 INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
4695 }
4696 for (i = 0; i < SESSION_HASH_SIZE; i++)
4697 INIT_LIST_HEAD(&sessionid_hashtbl[i]);
4698 for (i = 0; i < FILE_HASH_SIZE; i++) { 4792 for (i = 0; i < FILE_HASH_SIZE; i++) {
4699 INIT_LIST_HEAD(&file_hashtbl[i]); 4793 INIT_LIST_HEAD(&file_hashtbl[i]);
4700 } 4794 }
4701 for (i = 0; i < OWNER_HASH_SIZE; i++) {
4702 INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
4703 }
4704 for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
4705 INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]);
4706 INIT_LIST_HEAD(&close_lru);
4707 INIT_LIST_HEAD(&client_lru);
4708 INIT_LIST_HEAD(&del_recall_lru); 4795 INIT_LIST_HEAD(&del_recall_lru);
4709 reclaim_str_hashtbl_size = 0;
4710} 4796}
4711 4797
4712/* 4798/*
@@ -4730,12 +4816,100 @@ set_max_delegations(void)
4730 max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT); 4816 max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT);
4731} 4817}
4732 4818
4733/* initialization to perform when the nfsd service is started: */ 4819static int nfs4_state_create_net(struct net *net)
4820{
4821 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4822 int i;
4823
4824 nn->conf_id_hashtbl = kmalloc(sizeof(struct list_head) *
4825 CLIENT_HASH_SIZE, GFP_KERNEL);
4826 if (!nn->conf_id_hashtbl)
4827 goto err;
4828 nn->unconf_id_hashtbl = kmalloc(sizeof(struct list_head) *
4829 CLIENT_HASH_SIZE, GFP_KERNEL);
4830 if (!nn->unconf_id_hashtbl)
4831 goto err_unconf_id;
4832 nn->ownerstr_hashtbl = kmalloc(sizeof(struct list_head) *
4833 OWNER_HASH_SIZE, GFP_KERNEL);
4834 if (!nn->ownerstr_hashtbl)
4835 goto err_ownerstr;
4836 nn->lockowner_ino_hashtbl = kmalloc(sizeof(struct list_head) *
4837 LOCKOWNER_INO_HASH_SIZE, GFP_KERNEL);
4838 if (!nn->lockowner_ino_hashtbl)
4839 goto err_lockowner_ino;
4840 nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) *
4841 SESSION_HASH_SIZE, GFP_KERNEL);
4842 if (!nn->sessionid_hashtbl)
4843 goto err_sessionid;
4844
4845 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4846 INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]);
4847 INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]);
4848 }
4849 for (i = 0; i < OWNER_HASH_SIZE; i++)
4850 INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]);
4851 for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
4852 INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[i]);
4853 for (i = 0; i < SESSION_HASH_SIZE; i++)
4854 INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]);
4855 nn->conf_name_tree = RB_ROOT;
4856 nn->unconf_name_tree = RB_ROOT;
4857 INIT_LIST_HEAD(&nn->client_lru);
4858 INIT_LIST_HEAD(&nn->close_lru);
4859 spin_lock_init(&nn->client_lock);
4860
4861 INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
4862 get_net(net);
4863
4864 return 0;
4865
4866err_sessionid:
4867 kfree(nn->lockowner_ino_hashtbl);
4868err_lockowner_ino:
4869 kfree(nn->ownerstr_hashtbl);
4870err_ownerstr:
4871 kfree(nn->unconf_id_hashtbl);
4872err_unconf_id:
4873 kfree(nn->conf_id_hashtbl);
4874err:
4875 return -ENOMEM;
4876}
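/*
 * nfs4_state_create_net() uses the standard kernel goto-unwind shape:
 * each allocation gets a label that frees exactly what was allocated
 * before it, so a mid-sequence failure releases the right subset in
 * reverse order. The shape, reduced to two allocations (SZ_A and SZ_B
 * are placeholder sizes; on success the allocations are kept for the
 * matching destroy path to free):
 */
static int example_create(void)
{
	void *a, *b;

	a = kmalloc(SZ_A, GFP_KERNEL);
	if (!a)
		goto err;
	b = kmalloc(SZ_B, GFP_KERNEL);
	if (!b)
		goto err_b;
	/* ... further allocations, each with its own err_* label ... */
	return 0;
err_b:
	kfree(a);
err:
	return -ENOMEM;
}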
4877
4878static void
4879nfs4_state_destroy_net(struct net *net)
4880{
4881 int i;
4882 struct nfs4_client *clp = NULL;
4883 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4884 struct rb_node *node, *tmp;
4885
4886 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4887 while (!list_empty(&nn->conf_id_hashtbl[i])) {
4888 clp = list_entry(nn->conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
4889 destroy_client(clp);
4890 }
4891 }
4892
4893 node = rb_first(&nn->unconf_name_tree);
4894 while (node != NULL) {
4895 tmp = node;
4896 node = rb_next(tmp);
4897 clp = rb_entry(tmp, struct nfs4_client, cl_namenode);
4898 rb_erase(tmp, &nn->unconf_name_tree);
4899 destroy_client(clp);
4900 }
4901
4902 kfree(nn->sessionid_hashtbl);
4903 kfree(nn->lockowner_ino_hashtbl);
4904 kfree(nn->ownerstr_hashtbl);
4905 kfree(nn->unconf_id_hashtbl);
4906 kfree(nn->conf_id_hashtbl);
4907 put_net(net);
4908}
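/*
 * Teardown idiom in nfs4_state_destroy_net() above: for the unconfirmed
 * name tree, fetch rb_next() *before* rb_erase(), because erasing
 * invalidates the current node's links; destroy_client() then frees the
 * containing nfs4_client. Confirmed clients are instead walked through
 * conf_id_hashtbl, since every confirmed client is also hashed by id.
 */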
4734 4909
4735int 4910int
4736nfs4_state_start(void) 4911nfs4_state_start_net(struct net *net)
4737{ 4912{
4738 struct net *net = &init_net;
4739 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 4913 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4740 int ret; 4914 int ret;
4741 4915
@@ -4746,18 +4920,32 @@ nfs4_state_start(void)
4746 * to that instead and then do most of the rest of this on a per-net 4920 * to that instead and then do most of the rest of this on a per-net
4747 * basis. 4921 * basis.
4748 */ 4922 */
4749 get_net(net); 4923 if (net != &init_net)
4924 return -EINVAL;
4925
4926 ret = nfs4_state_create_net(net);
4927 if (ret)
4928 return ret;
4750 nfsd4_client_tracking_init(net); 4929 nfsd4_client_tracking_init(net);
4751 nn->boot_time = get_seconds(); 4930 nn->boot_time = get_seconds();
4752 locks_start_grace(net, &nn->nfsd4_manager); 4931 locks_start_grace(net, &nn->nfsd4_manager);
4753 nn->grace_ended = false; 4932 nn->grace_ended = false;
4754 printk(KERN_INFO "NFSD: starting %ld-second grace period\n", 4933 printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
4755 nfsd4_grace); 4934 nn->nfsd4_grace, net);
4935 queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
4936 return 0;
4937}
4938
4939/* initialization to perform when the nfsd service is started: */
4940
4941int
4942nfs4_state_start(void)
4943{
4944 int ret;
4945
4756 ret = set_callback_cred(); 4946 ret = set_callback_cred();
4757 if (ret) { 4947 if (ret)
4758 ret = -ENOMEM; 4948 return -ENOMEM;
4759 goto out_recovery;
4760 }
4761 laundry_wq = create_singlethread_workqueue("nfsd4"); 4949 laundry_wq = create_singlethread_workqueue("nfsd4");
4762 if (laundry_wq == NULL) { 4950 if (laundry_wq == NULL) {
4763 ret = -ENOMEM; 4951 ret = -ENOMEM;
@@ -4766,39 +4954,34 @@ nfs4_state_start(void)
4766 ret = nfsd4_create_callback_queue(); 4954 ret = nfsd4_create_callback_queue();
4767 if (ret) 4955 if (ret)
4768 goto out_free_laundry; 4956 goto out_free_laundry;
4769 queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ); 4957
4770 set_max_delegations(); 4958 set_max_delegations();
4959
4771 return 0; 4960 return 0;
4961
4772out_free_laundry: 4962out_free_laundry:
4773 destroy_workqueue(laundry_wq); 4963 destroy_workqueue(laundry_wq);
4774out_recovery: 4964out_recovery:
4775 nfsd4_client_tracking_exit(net);
4776 put_net(net);
4777 return ret; 4965 return ret;
4778} 4966}
4779 4967
4780static void 4968/* should be called with the state lock held */
4781__nfs4_state_shutdown(void) 4969void
4970nfs4_state_shutdown_net(struct net *net)
4782{ 4971{
4783 int i;
4784 struct nfs4_client *clp = NULL;
4785 struct nfs4_delegation *dp = NULL; 4972 struct nfs4_delegation *dp = NULL;
4786 struct list_head *pos, *next, reaplist; 4973 struct list_head *pos, *next, reaplist;
4974 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4975
4976 cancel_delayed_work_sync(&nn->laundromat_work);
4977 locks_end_grace(&nn->nfsd4_manager);
4787 4978
4788 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4789 while (!list_empty(&conf_id_hashtbl[i])) {
4790 clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
4791 destroy_client(clp);
4792 }
4793 while (!list_empty(&unconf_str_hashtbl[i])) {
4794 clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash);
4795 destroy_client(clp);
4796 }
4797 }
4798 INIT_LIST_HEAD(&reaplist); 4979 INIT_LIST_HEAD(&reaplist);
4799 spin_lock(&recall_lock); 4980 spin_lock(&recall_lock);
4800 list_for_each_safe(pos, next, &del_recall_lru) { 4981 list_for_each_safe(pos, next, &del_recall_lru) {
4801 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 4982 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
4983 if (dp->dl_stid.sc_client->net != net)
4984 continue;
4802 list_move(&dp->dl_recall_lru, &reaplist); 4985 list_move(&dp->dl_recall_lru, &reaplist);
4803 } 4986 }
4804 spin_unlock(&recall_lock); 4987 spin_unlock(&recall_lock);
@@ -4807,22 +4990,14 @@ __nfs4_state_shutdown(void)
4807 unhash_delegation(dp); 4990 unhash_delegation(dp);
4808 } 4991 }
4809 4992
4810 nfsd4_client_tracking_exit(&init_net); 4993 nfsd4_client_tracking_exit(net);
4811 put_net(&init_net); 4994 nfs4_state_destroy_net(net);
4812} 4995}
4813 4996
4814void 4997void
4815nfs4_state_shutdown(void) 4998nfs4_state_shutdown(void)
4816{ 4999{
4817 struct net *net = &init_net;
4818 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4819
4820 cancel_delayed_work_sync(&laundromat_work);
4821 destroy_workqueue(laundry_wq); 5000 destroy_workqueue(laundry_wq);
4822 locks_end_grace(&nn->nfsd4_manager);
4823 nfs4_lock_state();
4824 __nfs4_state_shutdown();
4825 nfs4_unlock_state();
4826 nfsd4_destroy_callback_queue(); 5001 nfsd4_destroy_callback_queue();
4827} 5002}
4828 5003
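
nfs4_state_create_net() above unwinds its hash-table allocations with the usual kernel goto ladder: each label frees everything allocated before the failing step, in reverse order. A minimal userspace sketch of that idiom, with illustrative table names standing in for the nfsd hash tables:

/*
 * Sketch of the goto-unwind pattern in nfs4_state_create_net(); the
 * struct and table names here are illustrative, not the kernel's.
 */
#include <stdlib.h>
#include <errno.h>

struct per_net_state {
	void *conf_id_tbl;
	void *unconf_id_tbl;
	void *ownerstr_tbl;
	void *sessionid_tbl;
};

static int state_create(struct per_net_state *nn)
{
	nn->conf_id_tbl = malloc(64);
	if (!nn->conf_id_tbl)
		goto err;
	nn->unconf_id_tbl = malloc(64);
	if (!nn->unconf_id_tbl)
		goto err_unconf_id;
	nn->ownerstr_tbl = malloc(64);
	if (!nn->ownerstr_tbl)
		goto err_ownerstr;
	nn->sessionid_tbl = malloc(64);
	if (!nn->sessionid_tbl)
		goto err_sessionid;
	return 0;

err_sessionid:		/* each label frees the steps that did succeed */
	free(nn->ownerstr_tbl);
err_ownerstr:
	free(nn->unconf_id_tbl);
err_unconf_id:
	free(nn->conf_id_tbl);
err:
	return -ENOMEM;
}
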
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index fd548d155088..0dc11586682f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -53,6 +53,7 @@
53#include "vfs.h" 53#include "vfs.h"
54#include "state.h" 54#include "state.h"
55#include "cache.h" 55#include "cache.h"
56#include "netns.h"
56 57
57#define NFSDDBG_FACILITY NFSDDBG_XDR 58#define NFSDDBG_FACILITY NFSDDBG_XDR
58 59
@@ -65,17 +66,17 @@
65#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL 66#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL
66 67
67static __be32 68static __be32
68check_filename(char *str, int len, __be32 err) 69check_filename(char *str, int len)
69{ 70{
70 int i; 71 int i;
71 72
72 if (len == 0) 73 if (len == 0)
73 return nfserr_inval; 74 return nfserr_inval;
74 if (isdotent(str, len)) 75 if (isdotent(str, len))
75 return err; 76 return nfserr_badname;
76 for (i = 0; i < len; i++) 77 for (i = 0; i < len; i++)
77 if (str[i] == '/') 78 if (str[i] == '/')
78 return err; 79 return nfserr_badname;
79 return 0; 80 return 0;
80} 81}
81 82
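
The reworked check_filename() drops the caller-supplied error and always answers nfserr_badname for dot entries and embedded slashes, while an empty name stays nfserr_inval. A userspace analogue, with stand-in values since the on-wire nfserr_* codes are not reproduced here:

/*
 * Userspace analogue of the simplified check_filename(); the error
 * values are stand-ins for the kernel's nfserr_* codes.
 */
#define ERR_INVAL	1
#define ERR_BADNAME	2

/* same test the kernel's isdotent() macro performs */
static int isdotent(const char *n, int l)
{
	return l < 3 && n[0] == '.' && (l == 1 || n[1] == '.');
}

static int check_filename(const char *str, int len)
{
	int i;

	if (len == 0)
		return ERR_INVAL;
	if (isdotent(str, len))
		return ERR_BADNAME;
	for (i = 0; i < len; i++)
		if (str[i] == '/')
			return ERR_BADNAME;
	return 0;
}
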
@@ -422,6 +423,86 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
422 DECODE_TAIL; 423 DECODE_TAIL;
423} 424}
424 425
426static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
427{
428 DECODE_HEAD;
429 u32 dummy, uid, gid;
430 char *machine_name;
431 int i;
432 int nr_secflavs;
433
434 /* callback_sec_params4 */
435 READ_BUF(4);
436 READ32(nr_secflavs);
437 cbs->flavor = (u32)(-1);
438 for (i = 0; i < nr_secflavs; ++i) {
439 READ_BUF(4);
440 READ32(dummy);
441 switch (dummy) {
442 case RPC_AUTH_NULL:
443 /* Nothing to read */
444 if (cbs->flavor == (u32)(-1))
445 cbs->flavor = RPC_AUTH_NULL;
446 break;
447 case RPC_AUTH_UNIX:
448 READ_BUF(8);
449 /* stamp */
450 READ32(dummy);
451
452 /* machine name */
453 READ32(dummy);
454 READ_BUF(dummy);
455 SAVEMEM(machine_name, dummy);
456
457 /* uid, gid */
458 READ_BUF(8);
459 READ32(uid);
460 READ32(gid);
461
462 /* more gids */
463 READ_BUF(4);
464 READ32(dummy);
465 READ_BUF(dummy * 4);
466 if (cbs->flavor == (u32)(-1)) {
467 cbs->uid = uid;
468 cbs->gid = gid;
469 cbs->flavor = RPC_AUTH_UNIX;
470 }
471 break;
472 case RPC_AUTH_GSS:
473 dprintk("RPC_AUTH_GSS callback secflavor "
474 "not supported!\n");
475 READ_BUF(8);
476 /* gcbp_service */
477 READ32(dummy);
478 /* gcbp_handle_from_server */
479 READ32(dummy);
480 READ_BUF(dummy);
481 p += XDR_QUADLEN(dummy);
482 /* gcbp_handle_from_client */
483 READ_BUF(4);
484 READ32(dummy);
485 READ_BUF(dummy);
486 break;
487 default:
488 dprintk("Illegal callback secflavor\n");
489 return nfserr_inval;
490 }
491 }
492 DECODE_TAIL;
493}
494
495static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
496{
497 DECODE_HEAD;
498
499 READ_BUF(4);
500 READ32(bc->bc_cb_program);
501 nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
502
503 DECODE_TAIL;
504}
505
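
Both new decoders follow the XDR counted-array shape: read a 4-byte big-endian count, then loop over that many variable-size entries, bounds-checking before every read. A sketch of that shape with a plain pointer pair in place of the kernel's READ_BUF/READ32 machinery:

/*
 * Counted-array XDR decode in miniature: a 4-byte big-endian count,
 * then per-entry decoding with a bounds check before every read.
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct xdr_cursor {
	const uint8_t *p, *end;
};

static int xdr_read_u32(struct xdr_cursor *c, uint32_t *v)
{
	uint32_t raw;

	if (c->end - c->p < 4)
		return -1;	/* short buffer: reject, never overrun */
	memcpy(&raw, c->p, 4);
	*v = ntohl(raw);
	c->p += 4;
	return 0;
}

static int decode_flavor_list(struct xdr_cursor *c)
{
	uint32_t nr, flavor, i;

	if (xdr_read_u32(c, &nr))
		return -1;
	for (i = 0; i < nr; i++) {
		if (xdr_read_u32(c, &flavor))
			return -1;
		/* flavor-specific payload would be decoded here */
	}
	return 0;
}
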
425static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) 506static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
426{ 507{
427 DECODE_HEAD; 508 DECODE_HEAD;
@@ -490,7 +571,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
490 READ32(create->cr_namelen); 571 READ32(create->cr_namelen);
491 READ_BUF(create->cr_namelen); 572 READ_BUF(create->cr_namelen);
492 SAVEMEM(create->cr_name, create->cr_namelen); 573 SAVEMEM(create->cr_name, create->cr_namelen);
493 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) 574 if ((status = check_filename(create->cr_name, create->cr_namelen)))
494 return status; 575 return status;
495 576
496 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, 577 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
@@ -522,7 +603,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
522 READ32(link->li_namelen); 603 READ32(link->li_namelen);
523 READ_BUF(link->li_namelen); 604 READ_BUF(link->li_namelen);
524 SAVEMEM(link->li_name, link->li_namelen); 605 SAVEMEM(link->li_name, link->li_namelen);
525 if ((status = check_filename(link->li_name, link->li_namelen, nfserr_inval))) 606 if ((status = check_filename(link->li_name, link->li_namelen)))
526 return status; 607 return status;
527 608
528 DECODE_TAIL; 609 DECODE_TAIL;
@@ -616,7 +697,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
616 READ32(lookup->lo_len); 697 READ32(lookup->lo_len);
617 READ_BUF(lookup->lo_len); 698 READ_BUF(lookup->lo_len);
618 SAVEMEM(lookup->lo_name, lookup->lo_len); 699 SAVEMEM(lookup->lo_name, lookup->lo_len);
619 if ((status = check_filename(lookup->lo_name, lookup->lo_len, nfserr_noent))) 700 if ((status = check_filename(lookup->lo_name, lookup->lo_len)))
620 return status; 701 return status;
621 702
622 DECODE_TAIL; 703 DECODE_TAIL;
@@ -780,7 +861,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
780 READ32(open->op_fname.len); 861 READ32(open->op_fname.len);
781 READ_BUF(open->op_fname.len); 862 READ_BUF(open->op_fname.len);
782 SAVEMEM(open->op_fname.data, open->op_fname.len); 863 SAVEMEM(open->op_fname.data, open->op_fname.len);
783 if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) 864 if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
784 return status; 865 return status;
785 break; 866 break;
786 case NFS4_OPEN_CLAIM_PREVIOUS: 867 case NFS4_OPEN_CLAIM_PREVIOUS:
@@ -795,7 +876,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
795 READ32(open->op_fname.len); 876 READ32(open->op_fname.len);
796 READ_BUF(open->op_fname.len); 877 READ_BUF(open->op_fname.len);
797 SAVEMEM(open->op_fname.data, open->op_fname.len); 878 SAVEMEM(open->op_fname.data, open->op_fname.len);
798 if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) 879 if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
799 return status; 880 return status;
800 break; 881 break;
801 case NFS4_OPEN_CLAIM_FH: 882 case NFS4_OPEN_CLAIM_FH:
@@ -907,7 +988,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove
907 READ32(remove->rm_namelen); 988 READ32(remove->rm_namelen);
908 READ_BUF(remove->rm_namelen); 989 READ_BUF(remove->rm_namelen);
909 SAVEMEM(remove->rm_name, remove->rm_namelen); 990 SAVEMEM(remove->rm_name, remove->rm_namelen);
910 if ((status = check_filename(remove->rm_name, remove->rm_namelen, nfserr_noent))) 991 if ((status = check_filename(remove->rm_name, remove->rm_namelen)))
911 return status; 992 return status;
912 993
913 DECODE_TAIL; 994 DECODE_TAIL;
@@ -925,9 +1006,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
925 READ32(rename->rn_tnamelen); 1006 READ32(rename->rn_tnamelen);
926 READ_BUF(rename->rn_tnamelen); 1007 READ_BUF(rename->rn_tnamelen);
927 SAVEMEM(rename->rn_tname, rename->rn_tnamelen); 1008 SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
928 if ((status = check_filename(rename->rn_sname, rename->rn_snamelen, nfserr_noent))) 1009 if ((status = check_filename(rename->rn_sname, rename->rn_snamelen)))
929 return status; 1010 return status;
930 if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen, nfserr_inval))) 1011 if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen)))
931 return status; 1012 return status;
932 1013
933 DECODE_TAIL; 1014 DECODE_TAIL;
@@ -954,8 +1035,7 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
954 READ32(secinfo->si_namelen); 1035 READ32(secinfo->si_namelen);
955 READ_BUF(secinfo->si_namelen); 1036 READ_BUF(secinfo->si_namelen);
956 SAVEMEM(secinfo->si_name, secinfo->si_namelen); 1037 SAVEMEM(secinfo->si_name, secinfo->si_namelen);
957 status = check_filename(secinfo->si_name, secinfo->si_namelen, 1038 status = check_filename(secinfo->si_name, secinfo->si_namelen);
958 nfserr_noent);
959 if (status) 1039 if (status)
960 return status; 1040 return status;
961 DECODE_TAIL; 1041 DECODE_TAIL;
@@ -1026,31 +1106,14 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
1026static __be32 1106static __be32
1027nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) 1107nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
1028{ 1108{
1029#if 0
1030 struct nfsd4_compoundargs save = {
1031 .p = argp->p,
1032 .end = argp->end,
1033 .rqstp = argp->rqstp,
1034 };
1035 u32 ve_bmval[2];
1036 struct iattr ve_iattr; /* request */
1037 struct nfs4_acl *ve_acl; /* request */
1038#endif
1039 DECODE_HEAD; 1109 DECODE_HEAD;
1040 1110
1041 if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval))) 1111 if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval)))
1042 goto out; 1112 goto out;
1043 1113
1044 /* For convenience's sake, we compare raw xdr'd attributes in 1114 /* For convenience's sake, we compare raw xdr'd attributes in
1045 * nfsd4_proc_verify; however we still decode here just to return 1115 * nfsd4_proc_verify */
1046 * correct error in case of bad xdr. */ 1116
1047#if 0
1048 status = nfsd4_decode_fattr(ve_bmval, &ve_iattr, &ve_acl);
1049 if (status == nfserr_inval) {
1050 status = nfserrno(status);
1051 goto out;
1052 }
1053#endif
1054 READ_BUF(4); 1117 READ_BUF(4);
1055 READ32(verify->ve_attrlen); 1118 READ32(verify->ve_attrlen);
1056 READ_BUF(verify->ve_attrlen); 1119 READ_BUF(verify->ve_attrlen);
@@ -1063,7 +1126,6 @@ static __be32
1063nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) 1126nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
1064{ 1127{
1065 int avail; 1128 int avail;
1066 int v;
1067 int len; 1129 int len;
1068 DECODE_HEAD; 1130 DECODE_HEAD;
1069 1131
@@ -1087,27 +1149,26 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
1087 __FILE__, __LINE__); 1149 __FILE__, __LINE__);
1088 goto xdr_error; 1150 goto xdr_error;
1089 } 1151 }
1090 argp->rqstp->rq_vec[0].iov_base = p; 1152 write->wr_head.iov_base = p;
1091 argp->rqstp->rq_vec[0].iov_len = avail; 1153 write->wr_head.iov_len = avail;
1092 v = 0; 1154 WARN_ON(avail != (XDR_QUADLEN(avail) << 2));
1093 len = write->wr_buflen; 1155 write->wr_pagelist = argp->pagelist;
1094 while (len > argp->rqstp->rq_vec[v].iov_len) { 1156
1095 len -= argp->rqstp->rq_vec[v].iov_len; 1157 len = XDR_QUADLEN(write->wr_buflen) << 2;
1096 v++; 1158 if (len >= avail) {
1097 argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]); 1159 int pages;
1098 argp->pagelist++; 1160
1099 if (argp->pagelen >= PAGE_SIZE) { 1161 len -= avail;
1100 argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE; 1162
1101 argp->pagelen -= PAGE_SIZE; 1163 pages = len >> PAGE_SHIFT;
1102 } else { 1164 argp->pagelist += pages;
1103 argp->rqstp->rq_vec[v].iov_len = argp->pagelen; 1165 argp->pagelen -= pages * PAGE_SIZE;
1104 argp->pagelen -= len; 1166 len -= pages * PAGE_SIZE;
1105 } 1167
1168 argp->p = (__be32 *)page_address(argp->pagelist[0]);
1169 argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE);
1106 } 1170 }
1107 argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len); 1171 argp->p += XDR_QUADLEN(len);
1108 argp->p = (__be32*) (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2));
1109 argp->rqstp->rq_vec[v].iov_len = len;
1110 write->wr_vlen = v+1;
1111 1172
1112 DECODE_TAIL; 1173 DECODE_TAIL;
1113} 1174}
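
The rewritten write decoder leans on XDR quad alignment: XDR_QUADLEN rounds a byte count up to whole 4-byte words, which lets the payload be skipped page by page instead of building an iovec entry per page. The arithmetic in isolation, assuming the common 4K page layout:

/*
 * XDR_QUADLEN rounds bytes up to 4-byte words; <<2 converts back to a
 * padded byte count. PAGE_SIZE/PAGE_SHIFT assume a 4K page here.
 */
#include <stdio.h>

#define XDR_QUADLEN(l)	(((l) + 3) >> 2)
#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long buflen = 10000;	/* WRITE payload bytes */
	unsigned long avail = 500;	/* bytes left in the head buffer */
	unsigned long len = XDR_QUADLEN(buflen) << 2; /* pad to words */

	if (len >= avail) {
		unsigned long pages;

		len -= avail;
		pages = len >> PAGE_SHIFT;	/* whole pages to skip */
		len -= pages * PAGE_SIZE;	/* residue in last page */
		printf("skip %lu pages, then %lu bytes\n", pages, len);
	}
	return 0;
}
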
@@ -1237,11 +1298,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1237 struct nfsd4_create_session *sess) 1298 struct nfsd4_create_session *sess)
1238{ 1299{
1239 DECODE_HEAD; 1300 DECODE_HEAD;
1240
1241 u32 dummy; 1301 u32 dummy;
1242 char *machine_name;
1243 int i;
1244 int nr_secflavs;
1245 1302
1246 READ_BUF(16); 1303 READ_BUF(16);
1247 COPYMEM(&sess->clientid, 8); 1304 COPYMEM(&sess->clientid, 8);
@@ -1282,58 +1339,9 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1282 goto xdr_error; 1339 goto xdr_error;
1283 } 1340 }
1284 1341
1285 READ_BUF(8); 1342 READ_BUF(4);
1286 READ32(sess->callback_prog); 1343 READ32(sess->callback_prog);
1287 1344 nfsd4_decode_cb_sec(argp, &sess->cb_sec);
1288 /* callback_sec_params4 */
1289 READ32(nr_secflavs);
1290 for (i = 0; i < nr_secflavs; ++i) {
1291 READ_BUF(4);
1292 READ32(dummy);
1293 switch (dummy) {
1294 case RPC_AUTH_NULL:
1295 /* Nothing to read */
1296 break;
1297 case RPC_AUTH_UNIX:
1298 READ_BUF(8);
1299 /* stamp */
1300 READ32(dummy);
1301
1302 /* machine name */
1303 READ32(dummy);
1304 READ_BUF(dummy);
1305 SAVEMEM(machine_name, dummy);
1306
1307 /* uid, gid */
1308 READ_BUF(8);
1309 READ32(sess->uid);
1310 READ32(sess->gid);
1311
1312 /* more gids */
1313 READ_BUF(4);
1314 READ32(dummy);
1315 READ_BUF(dummy * 4);
1316 break;
1317 case RPC_AUTH_GSS:
1318 dprintk("RPC_AUTH_GSS callback secflavor "
1319 "not supported!\n");
1320 READ_BUF(8);
1321 /* gcbp_service */
1322 READ32(dummy);
1323 /* gcbp_handle_from_server */
1324 READ32(dummy);
1325 READ_BUF(dummy);
1326 p += XDR_QUADLEN(dummy);
1327 /* gcbp_handle_from_client */
1328 READ_BUF(4);
1329 READ32(dummy);
1330 READ_BUF(dummy);
1331 break;
1332 default:
1333 dprintk("Illegal callback secflavor\n");
1334 return nfserr_inval;
1335 }
1336 }
1337 DECODE_TAIL; 1345 DECODE_TAIL;
1338} 1346}
1339 1347
@@ -1528,7 +1536,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1528 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp, 1536 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp,
1529 1537
1530 /* new operations for NFSv4.1 */ 1538 /* new operations for NFSv4.1 */
1531 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, 1539 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl,
1532 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, 1540 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
1533 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, 1541 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
1534 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, 1542 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
@@ -1568,12 +1576,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1568 bool cachethis = false; 1576 bool cachethis = false;
1569 int i; 1577 int i;
1570 1578
1571 /*
1572 * XXX: According to spec, we should check the tag
1573 * for UTF-8 compliance. I'm postponing this for
1574 * now because it seems that some clients do use
1575 * binary tags.
1576 */
1577 READ_BUF(4); 1579 READ_BUF(4);
1578 READ32(argp->taglen); 1580 READ32(argp->taglen);
1579 READ_BUF(argp->taglen + 8); 1581 READ_BUF(argp->taglen + 8);
@@ -1603,38 +1605,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1603 op = &argp->ops[i]; 1605 op = &argp->ops[i];
1604 op->replay = NULL; 1606 op->replay = NULL;
1605 1607
1606 /* 1608 READ_BUF(4);
1607 * We can't use READ_BUF() here because we need to handle 1609 READ32(op->opnum);
1608 * a missing opcode as an OP_WRITE + 1. So we need to check
1609 * to see if we're truly at the end of our buffer or if there
1610 * is another page we need to flip to.
1611 */
1612
1613 if (argp->p == argp->end) {
1614 if (argp->pagelen < 4) {
1615 /* There isn't an opcode still on the wire */
1616 op->opnum = OP_WRITE + 1;
1617 op->status = nfserr_bad_xdr;
1618 argp->opcnt = i+1;
1619 break;
1620 }
1621
1622 /*
1623 * False alarm. We just hit a page boundary, but there
1624 * is still data available. Move pointer across page
1625 * boundary. *snip from READ_BUF*
1626 */
1627 argp->p = page_address(argp->pagelist[0]);
1628 argp->pagelist++;
1629 if (argp->pagelen < PAGE_SIZE) {
1630 argp->end = argp->p + (argp->pagelen>>2);
1631 argp->pagelen = 0;
1632 } else {
1633 argp->end = argp->p + (PAGE_SIZE>>2);
1634 argp->pagelen -= PAGE_SIZE;
1635 }
1636 }
1637 op->opnum = ntohl(*argp->p++);
1638 1610
1639 if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP) 1611 if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
1640 op->status = ops->decoders[op->opnum](argp, &op->u); 1612 op->status = ops->decoders[op->opnum](argp, &op->u);
@@ -2014,6 +1986,22 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
2014 return 0; 1986 return 0;
2015} 1987}
2016 1988
1989
1990static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
1991{
1992 struct path path = exp->ex_path;
1993 int err;
1994
1995 path_get(&path);
1996 while (follow_up(&path)) {
1997 if (path.dentry != path.mnt->mnt_root)
1998 break;
1999 }
2000 err = vfs_getattr(path.mnt, path.dentry, stat);
2001 path_put(&path);
2002 return err;
2003}
2004
2017/* 2005/*
2018 * Note: @fhp can be NULL; in this case, we might have to compose the filehandle 2006 * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
2019 * ourselves. 2007 * ourselves.
@@ -2048,6 +2036,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2048 .mnt = exp->ex_path.mnt, 2036 .mnt = exp->ex_path.mnt,
2049 .dentry = dentry, 2037 .dentry = dentry,
2050 }; 2038 };
2039 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2051 2040
2052 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); 2041 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
2053 BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); 2042 BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
@@ -2208,7 +2197,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2208 if (bmval0 & FATTR4_WORD0_LEASE_TIME) { 2197 if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
2209 if ((buflen -= 4) < 0) 2198 if ((buflen -= 4) < 0)
2210 goto out_resource; 2199 goto out_resource;
2211 WRITE32(nfsd4_lease); 2200 WRITE32(nn->nfsd4_lease);
2212 } 2201 }
2213 if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { 2202 if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
2214 if ((buflen -= 4) < 0) 2203 if ((buflen -= 4) < 0)
@@ -2430,18 +2419,8 @@ out_acl:
2430 * and this is the root of a cross-mounted filesystem. 2419 * and this is the root of a cross-mounted filesystem.
2431 */ 2420 */
2432 if (ignore_crossmnt == 0 && 2421 if (ignore_crossmnt == 0 &&
2433 dentry == exp->ex_path.mnt->mnt_root) { 2422 dentry == exp->ex_path.mnt->mnt_root)
2434 struct path path = exp->ex_path; 2423 get_parent_attributes(exp, &stat);
2435 path_get(&path);
2436 while (follow_up(&path)) {
2437 if (path.dentry != path.mnt->mnt_root)
2438 break;
2439 }
2440 err = vfs_getattr(path.mnt, path.dentry, &stat);
2441 path_put(&path);
2442 if (err)
2443 goto out_nfserr;
2444 }
2445 WRITE64(stat.ino); 2424 WRITE64(stat.ino);
2446 } 2425 }
2447 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { 2426 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
@@ -2927,7 +2906,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
2927 struct nfsd4_read *read) 2906 struct nfsd4_read *read)
2928{ 2907{
2929 u32 eof; 2908 u32 eof;
2930 int v, pn; 2909 int v;
2910 struct page *page;
2931 unsigned long maxcount; 2911 unsigned long maxcount;
2932 long len; 2912 long len;
2933 __be32 *p; 2913 __be32 *p;
@@ -2946,11 +2926,15 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
2946 len = maxcount; 2926 len = maxcount;
2947 v = 0; 2927 v = 0;
2948 while (len > 0) { 2928 while (len > 0) {
2949 pn = resp->rqstp->rq_resused++; 2929 page = *(resp->rqstp->rq_next_page);
2950 resp->rqstp->rq_vec[v].iov_base = 2930 if (!page) { /* ran out of pages */
2951 page_address(resp->rqstp->rq_respages[pn]); 2931 maxcount -= len;
2932 break;
2933 }
2934 resp->rqstp->rq_vec[v].iov_base = page_address(page);
2952 resp->rqstp->rq_vec[v].iov_len = 2935 resp->rqstp->rq_vec[v].iov_len =
2953 len < PAGE_SIZE ? len : PAGE_SIZE; 2936 len < PAGE_SIZE ? len : PAGE_SIZE;
2937 resp->rqstp->rq_next_page++;
2954 v++; 2938 v++;
2955 len -= PAGE_SIZE; 2939 len -= PAGE_SIZE;
2956 } 2940 }
@@ -2996,8 +2980,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
2996 return nfserr; 2980 return nfserr;
2997 if (resp->xbuf->page_len) 2981 if (resp->xbuf->page_len)
2998 return nfserr_resource; 2982 return nfserr_resource;
2983 if (!*resp->rqstp->rq_next_page)
2984 return nfserr_resource;
2999 2985
3000 page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); 2986 page = page_address(*(resp->rqstp->rq_next_page++));
3001 2987
3002 maxcount = PAGE_SIZE; 2988 maxcount = PAGE_SIZE;
3003 RESERVE_SPACE(4); 2989 RESERVE_SPACE(4);
@@ -3045,6 +3031,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
3045 return nfserr; 3031 return nfserr;
3046 if (resp->xbuf->page_len) 3032 if (resp->xbuf->page_len)
3047 return nfserr_resource; 3033 return nfserr_resource;
3034 if (!*resp->rqstp->rq_next_page)
3035 return nfserr_resource;
3048 3036
3049 RESERVE_SPACE(NFS4_VERIFIER_SIZE); 3037 RESERVE_SPACE(NFS4_VERIFIER_SIZE);
3050 savep = p; 3038 savep = p;
@@ -3071,7 +3059,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
3071 goto err_no_verf; 3059 goto err_no_verf;
3072 } 3060 }
3073 3061
3074 page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); 3062 page = page_address(*(resp->rqstp->rq_next_page++));
3075 readdir->common.err = 0; 3063 readdir->common.err = 0;
3076 readdir->buflen = maxcount; 3064 readdir->buflen = maxcount;
3077 readdir->buffer = page; 3065 readdir->buffer = page;
@@ -3094,8 +3082,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
3094 p = readdir->buffer; 3082 p = readdir->buffer;
3095 *p++ = 0; /* no more entries */ 3083 *p++ = 0; /* no more entries */
3096 *p++ = htonl(readdir->common.err == nfserr_eof); 3084 *p++ = htonl(readdir->common.err == nfserr_eof);
3097 resp->xbuf->page_len = ((char*)p) - (char*)page_address( 3085 resp->xbuf->page_len = ((char*)p) -
3098 resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); 3086 (char*)page_address(*(resp->rqstp->rq_next_page-1));
3099 3087
3100 /* Use rest of head for padding and remaining ops: */ 3088 /* Use rest of head for padding and remaining ops: */
3101 resp->xbuf->tail[0].iov_base = tailbase; 3089 resp->xbuf->tail[0].iov_base = tailbase;
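
The encode-side read/readlink/readdir changes replace rq_resused indexing with an rq_next_page cursor that walks a NULL-terminated array of page pointers and can detect running out of pages. A sketch of that cursor style over plain buffers:

/*
 * Cursor-style page consumption: advance a pointer through a
 * NULL-terminated array of buffers (stand-ins for struct page *),
 * stopping early when the pages run out.
 */
static void **consume_pages(void **next_page, long len, long page_size)
{
	while (len > 0) {
		if (!*next_page)	/* ran out of pages */
			break;
		/* ... map *next_page into an iovec here ... */
		next_page++;
		len -= page_size;
	}
	return next_page;	/* caller stores the advanced cursor */
}
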
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index dab350dfc376..74934284d9a7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -19,7 +19,7 @@
19#include "idmap.h" 19#include "idmap.h"
20#include "nfsd.h" 20#include "nfsd.h"
21#include "cache.h" 21#include "cache.h"
22#include "fault_inject.h" 22#include "state.h"
23#include "netns.h" 23#include "netns.h"
24 24
25/* 25/*
@@ -186,9 +186,6 @@ static struct file_operations supported_enctypes_ops = {
186}; 186};
187#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ 187#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
188 188
189extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
190extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
191
192static const struct file_operations pool_stats_operations = { 189static const struct file_operations pool_stats_operations = {
193 .open = nfsd_pool_stats_open, 190 .open = nfsd_pool_stats_open,
194 .read = seq_read, 191 .read = seq_read,
@@ -399,6 +396,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
399{ 396{
400 char *mesg = buf; 397 char *mesg = buf;
401 int rv; 398 int rv;
399 struct net *net = &init_net;
400
402 if (size > 0) { 401 if (size > 0) {
403 int newthreads; 402 int newthreads;
404 rv = get_int(&mesg, &newthreads); 403 rv = get_int(&mesg, &newthreads);
@@ -406,11 +405,11 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
406 return rv; 405 return rv;
407 if (newthreads < 0) 406 if (newthreads < 0)
408 return -EINVAL; 407 return -EINVAL;
409 rv = nfsd_svc(newthreads); 408 rv = nfsd_svc(newthreads, net);
410 if (rv < 0) 409 if (rv < 0)
411 return rv; 410 return rv;
412 } else 411 } else
413 rv = nfsd_nrthreads(); 412 rv = nfsd_nrthreads(net);
414 413
415 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv); 414 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv);
416} 415}
@@ -448,9 +447,10 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
448 int len; 447 int len;
449 int npools; 448 int npools;
450 int *nthreads; 449 int *nthreads;
450 struct net *net = &init_net;
451 451
452 mutex_lock(&nfsd_mutex); 452 mutex_lock(&nfsd_mutex);
453 npools = nfsd_nrpools(); 453 npools = nfsd_nrpools(net);
454 if (npools == 0) { 454 if (npools == 0) {
455 /* 455 /*
456 * NFS is shut down. The admin can start it by 456 * NFS is shut down. The admin can start it by
@@ -478,12 +478,12 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
478 if (nthreads[i] < 0) 478 if (nthreads[i] < 0)
479 goto out_free; 479 goto out_free;
480 } 480 }
481 rv = nfsd_set_nrthreads(i, nthreads); 481 rv = nfsd_set_nrthreads(i, nthreads, net);
482 if (rv) 482 if (rv)
483 goto out_free; 483 goto out_free;
484 } 484 }
485 485
486 rv = nfsd_get_nrthreads(npools, nthreads); 486 rv = nfsd_get_nrthreads(npools, nthreads, net);
487 if (rv) 487 if (rv)
488 goto out_free; 488 goto out_free;
489 489
@@ -510,11 +510,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
510 unsigned minor; 510 unsigned minor;
511 ssize_t tlen = 0; 511 ssize_t tlen = 0;
512 char *sep; 512 char *sep;
513 struct net *net = &init_net;
514 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
513 515
514 if (size>0) { 516 if (size>0) {
515 if (nfsd_serv) 517 if (nn->nfsd_serv)
516 /* Cannot change versions without updating 518 /* Cannot change versions without updating
517 * nfsd_serv->sv_xdrsize, and reallocing 519 * nn->nfsd_serv->sv_xdrsize, and reallocing
518 * rq_argp and rq_resp 520 * rq_argp and rq_resp
519 */ 521 */
520 return -EBUSY; 522 return -EBUSY;
@@ -645,11 +647,13 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
645 * Zero-length write. Return a list of NFSD's current listener 647 * Zero-length write. Return a list of NFSD's current listener
646 * transports. 648 * transports.
647 */ 649 */
648static ssize_t __write_ports_names(char *buf) 650static ssize_t __write_ports_names(char *buf, struct net *net)
649{ 651{
650 if (nfsd_serv == NULL) 652 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
653
654 if (nn->nfsd_serv == NULL)
651 return 0; 655 return 0;
652 return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT); 656 return svc_xprt_names(nn->nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
653} 657}
654 658
655/* 659/*
@@ -657,28 +661,28 @@ static ssize_t __write_ports_names(char *buf)
657 * a socket of a supported family/protocol, and we use it as an 661 * a socket of a supported family/protocol, and we use it as an
658 * nfsd listener. 662 * nfsd listener.
659 */ 663 */
660static ssize_t __write_ports_addfd(char *buf) 664static ssize_t __write_ports_addfd(char *buf, struct net *net)
661{ 665{
662 char *mesg = buf; 666 char *mesg = buf;
663 int fd, err; 667 int fd, err;
664 struct net *net = &init_net; 668 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
665 669
666 err = get_int(&mesg, &fd); 670 err = get_int(&mesg, &fd);
667 if (err != 0 || fd < 0) 671 if (err != 0 || fd < 0)
668 return -EINVAL; 672 return -EINVAL;
669 673
670 err = nfsd_create_serv(); 674 err = nfsd_create_serv(net);
671 if (err != 0) 675 if (err != 0)
672 return err; 676 return err;
673 677
674 err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); 678 err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
675 if (err < 0) { 679 if (err < 0) {
676 nfsd_destroy(net); 680 nfsd_destroy(net);
677 return err; 681 return err;
678 } 682 }
679 683
680 /* Decrease the count, but don't shut down the service */ 684 /* Decrease the count, but don't shut down the service */
681 nfsd_serv->sv_nrthreads--; 685 nn->nfsd_serv->sv_nrthreads--;
682 return err; 686 return err;
683} 687}
684 688
@@ -686,12 +690,12 @@ static ssize_t __write_ports_addfd(char *buf)
686 * A transport listener is added by writing its transport name and 690
687 * a port number. 691 * a port number.
688 */ 692 */
689static ssize_t __write_ports_addxprt(char *buf) 693static ssize_t __write_ports_addxprt(char *buf, struct net *net)
690{ 694{
691 char transport[16]; 695 char transport[16];
692 struct svc_xprt *xprt; 696 struct svc_xprt *xprt;
693 int port, err; 697 int port, err;
694 struct net *net = &init_net; 698 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
695 699
696 if (sscanf(buf, "%15s %5u", transport, &port) != 2) 700 if (sscanf(buf, "%15s %5u", transport, &port) != 2)
697 return -EINVAL; 701 return -EINVAL;
@@ -699,25 +703,25 @@ static ssize_t __write_ports_addxprt(char *buf)
699 if (port < 1 || port > USHRT_MAX) 703 if (port < 1 || port > USHRT_MAX)
700 return -EINVAL; 704 return -EINVAL;
701 705
702 err = nfsd_create_serv(); 706 err = nfsd_create_serv(net);
703 if (err != 0) 707 if (err != 0)
704 return err; 708 return err;
705 709
706 err = svc_create_xprt(nfsd_serv, transport, net, 710 err = svc_create_xprt(nn->nfsd_serv, transport, net,
707 PF_INET, port, SVC_SOCK_ANONYMOUS); 711 PF_INET, port, SVC_SOCK_ANONYMOUS);
708 if (err < 0) 712 if (err < 0)
709 goto out_err; 713 goto out_err;
710 714
711 err = svc_create_xprt(nfsd_serv, transport, net, 715 err = svc_create_xprt(nn->nfsd_serv, transport, net,
712 PF_INET6, port, SVC_SOCK_ANONYMOUS); 716 PF_INET6, port, SVC_SOCK_ANONYMOUS);
713 if (err < 0 && err != -EAFNOSUPPORT) 717 if (err < 0 && err != -EAFNOSUPPORT)
714 goto out_close; 718 goto out_close;
715 719
716 /* Decrease the count, but don't shut down the service */ 720 /* Decrease the count, but don't shut down the service */
717 nfsd_serv->sv_nrthreads--; 721 nn->nfsd_serv->sv_nrthreads--;
718 return 0; 722 return 0;
719out_close: 723out_close:
720 xprt = svc_find_xprt(nfsd_serv, transport, net, PF_INET, port); 724 xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
721 if (xprt != NULL) { 725 if (xprt != NULL) {
722 svc_close_xprt(xprt); 726 svc_close_xprt(xprt);
723 svc_xprt_put(xprt); 727 svc_xprt_put(xprt);
@@ -727,16 +731,17 @@ out_err:
727 return err; 731 return err;
728} 732}
729 733
730static ssize_t __write_ports(struct file *file, char *buf, size_t size) 734static ssize_t __write_ports(struct file *file, char *buf, size_t size,
735 struct net *net)
731{ 736{
732 if (size == 0) 737 if (size == 0)
733 return __write_ports_names(buf); 738 return __write_ports_names(buf, net);
734 739
735 if (isdigit(buf[0])) 740 if (isdigit(buf[0]))
736 return __write_ports_addfd(buf); 741 return __write_ports_addfd(buf, net);
737 742
738 if (isalpha(buf[0])) 743 if (isalpha(buf[0]))
739 return __write_ports_addxprt(buf); 744 return __write_ports_addxprt(buf, net);
740 745
741 return -EINVAL; 746 return -EINVAL;
742} 747}
@@ -787,9 +792,10 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
787static ssize_t write_ports(struct file *file, char *buf, size_t size) 792static ssize_t write_ports(struct file *file, char *buf, size_t size)
788{ 793{
789 ssize_t rv; 794 ssize_t rv;
795 struct net *net = &init_net;
790 796
791 mutex_lock(&nfsd_mutex); 797 mutex_lock(&nfsd_mutex);
792 rv = __write_ports(file, buf, size); 798 rv = __write_ports(file, buf, size, net);
793 mutex_unlock(&nfsd_mutex); 799 mutex_unlock(&nfsd_mutex);
794 return rv; 800 return rv;
795} 801}
@@ -821,6 +827,9 @@ int nfsd_max_blksize;
821static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) 827static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
822{ 828{
823 char *mesg = buf; 829 char *mesg = buf;
830 struct net *net = &init_net;
831 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
832
824 if (size > 0) { 833 if (size > 0) {
825 int bsize; 834 int bsize;
826 int rv = get_int(&mesg, &bsize); 835 int rv = get_int(&mesg, &bsize);
@@ -835,7 +844,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
835 bsize = NFSSVC_MAXBLKSIZE; 844 bsize = NFSSVC_MAXBLKSIZE;
836 bsize &= ~(1024-1); 845 bsize &= ~(1024-1);
837 mutex_lock(&nfsd_mutex); 846 mutex_lock(&nfsd_mutex);
838 if (nfsd_serv) { 847 if (nn->nfsd_serv) {
839 mutex_unlock(&nfsd_mutex); 848 mutex_unlock(&nfsd_mutex);
840 return -EBUSY; 849 return -EBUSY;
841 } 850 }
@@ -848,13 +857,14 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
848} 857}
849 858
850#ifdef CONFIG_NFSD_V4 859#ifdef CONFIG_NFSD_V4
851static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) 860static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
861 time_t *time, struct nfsd_net *nn)
852{ 862{
853 char *mesg = buf; 863 char *mesg = buf;
854 int rv, i; 864 int rv, i;
855 865
856 if (size > 0) { 866 if (size > 0) {
857 if (nfsd_serv) 867 if (nn->nfsd_serv)
858 return -EBUSY; 868 return -EBUSY;
859 rv = get_int(&mesg, &i); 869 rv = get_int(&mesg, &i);
860 if (rv) 870 if (rv)
@@ -879,12 +889,13 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, tim
879 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time); 889 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
880} 890}
881 891
882static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) 892static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size,
893 time_t *time, struct nfsd_net *nn)
883{ 894{
884 ssize_t rv; 895 ssize_t rv;
885 896
886 mutex_lock(&nfsd_mutex); 897 mutex_lock(&nfsd_mutex);
887 rv = __nfsd4_write_time(file, buf, size, time); 898 rv = __nfsd4_write_time(file, buf, size, time, nn);
888 mutex_unlock(&nfsd_mutex); 899 mutex_unlock(&nfsd_mutex);
889 return rv; 900 return rv;
890} 901}
@@ -912,7 +923,8 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_
912 */ 923 */
913static ssize_t write_leasetime(struct file *file, char *buf, size_t size) 924static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
914{ 925{
915 return nfsd4_write_time(file, buf, size, &nfsd4_lease); 926 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
927 return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn);
916} 928}
917 929
918/** 930/**
@@ -927,17 +939,19 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
927 */ 939 */
928static ssize_t write_gracetime(struct file *file, char *buf, size_t size) 940static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
929{ 941{
930 return nfsd4_write_time(file, buf, size, &nfsd4_grace); 942 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
943 return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn);
931} 944}
932 945
933static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) 946static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
947 struct nfsd_net *nn)
934{ 948{
935 char *mesg = buf; 949 char *mesg = buf;
936 char *recdir; 950 char *recdir;
937 int len, status; 951 int len, status;
938 952
939 if (size > 0) { 953 if (size > 0) {
940 if (nfsd_serv) 954 if (nn->nfsd_serv)
941 return -EBUSY; 955 return -EBUSY;
942 if (size > PATH_MAX || buf[size-1] != '\n') 956 if (size > PATH_MAX || buf[size-1] != '\n')
943 return -EINVAL; 957 return -EINVAL;
@@ -981,9 +995,10 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
981static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) 995static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
982{ 996{
983 ssize_t rv; 997 ssize_t rv;
998 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
984 999
985 mutex_lock(&nfsd_mutex); 1000 mutex_lock(&nfsd_mutex);
986 rv = __write_recoverydir(file, buf, size); 1001 rv = __write_recoverydir(file, buf, size, nn);
987 mutex_unlock(&nfsd_mutex); 1002 mutex_unlock(&nfsd_mutex);
988 return rv; 1003 return rv;
989} 1004}
@@ -1063,6 +1078,7 @@ int nfsd_net_id;
1063static __net_init int nfsd_init_net(struct net *net) 1078static __net_init int nfsd_init_net(struct net *net)
1064{ 1079{
1065 int retval; 1080 int retval;
1081 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1066 1082
1067 retval = nfsd_export_init(net); 1083 retval = nfsd_export_init(net);
1068 if (retval) 1084 if (retval)
@@ -1070,6 +1086,8 @@ static __net_init int nfsd_init_net(struct net *net)
1070 retval = nfsd_idmap_init(net); 1086 retval = nfsd_idmap_init(net);
1071 if (retval) 1087 if (retval)
1072 goto out_idmap_error; 1088 goto out_idmap_error;
1089 nn->nfsd4_lease = 90; /* default lease time */
1090 nn->nfsd4_grace = 90;
1073 return 0; 1091 return 0;
1074 1092
1075out_idmap_error: 1093out_idmap_error:
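
Most of the nfsctl changes reduce to one pattern: look up the per-namespace nfsd_net with net_generic(net, nfsd_net_id) and use its fields where globals used to live. A rough userspace model of that slot-array lookup, with illustrative types and sizes:

/*
 * Rough model of net_generic(): every namespace carries an array of
 * subsystem-private pointers indexed by a registration-time id.
 */
#include <stdlib.h>

struct net {
	void *gen[8];		/* per-subsystem private data slots */
};

static int nfsd_net_id;		/* assigned once at registration */

struct nfsd_net {
	long nfsd4_lease;
	long nfsd4_grace;
};

static void *net_generic(struct net *net, int id)
{
	return net->gen[id];
}

static int nfsd_init_net(struct net *net)
{
	struct nfsd_net *nn = calloc(1, sizeof(*nn));

	if (!nn)
		return -1;
	nn->nfsd4_lease = 90;	/* per-net defaults, as in the patch */
	nn->nfsd4_grace = 90;
	net->gen[nfsd_net_id] = nn;
	return 0;
}
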
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 80d5ce40aadb..de23db255c69 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -55,7 +55,6 @@ extern struct svc_version nfsd_version2, nfsd_version3,
55 nfsd_version4; 55 nfsd_version4;
56extern u32 nfsd_supported_minorversion; 56extern u32 nfsd_supported_minorversion;
57extern struct mutex nfsd_mutex; 57extern struct mutex nfsd_mutex;
58extern struct svc_serv *nfsd_serv;
59extern spinlock_t nfsd_drc_lock; 58extern spinlock_t nfsd_drc_lock;
60extern unsigned int nfsd_drc_max_mem; 59extern unsigned int nfsd_drc_max_mem;
61extern unsigned int nfsd_drc_mem_used; 60extern unsigned int nfsd_drc_mem_used;
@@ -65,26 +64,17 @@ extern const struct seq_operations nfs_exports_op;
65/* 64/*
66 * Function prototypes. 65 * Function prototypes.
67 */ 66 */
68int nfsd_svc(int nrservs); 67int nfsd_svc(int nrservs, struct net *net);
69int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); 68int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
70 69
71int nfsd_nrthreads(void); 70int nfsd_nrthreads(struct net *);
72int nfsd_nrpools(void); 71int nfsd_nrpools(struct net *);
73int nfsd_get_nrthreads(int n, int *); 72int nfsd_get_nrthreads(int n, int *, struct net *);
74int nfsd_set_nrthreads(int n, int *); 73int nfsd_set_nrthreads(int n, int *, struct net *);
75int nfsd_pool_stats_open(struct inode *, struct file *); 74int nfsd_pool_stats_open(struct inode *, struct file *);
76int nfsd_pool_stats_release(struct inode *, struct file *); 75int nfsd_pool_stats_release(struct inode *, struct file *);
77 76
78static inline void nfsd_destroy(struct net *net) 77void nfsd_destroy(struct net *net);
79{
80 int destroy = (nfsd_serv->sv_nrthreads == 1);
81
82 if (destroy)
83 svc_shutdown_net(nfsd_serv, net);
84 svc_destroy(nfsd_serv);
85 if (destroy)
86 nfsd_serv = NULL;
87}
88 78
89#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) 79#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
90#ifdef CONFIG_NFSD_V2_ACL 80#ifdef CONFIG_NFSD_V2_ACL
@@ -103,7 +93,7 @@ enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };
103int nfsd_vers(int vers, enum vers_op change); 93int nfsd_vers(int vers, enum vers_op change);
104int nfsd_minorversion(u32 minorversion, enum vers_op change); 94int nfsd_minorversion(u32 minorversion, enum vers_op change);
105void nfsd_reset_versions(void); 95void nfsd_reset_versions(void);
106int nfsd_create_serv(void); 96int nfsd_create_serv(struct net *net);
107 97
108extern int nfsd_max_blksize; 98extern int nfsd_max_blksize;
109 99
@@ -121,7 +111,9 @@ void nfs4_state_init(void);
121int nfsd4_init_slabs(void); 111int nfsd4_init_slabs(void);
122void nfsd4_free_slabs(void); 112void nfsd4_free_slabs(void);
123int nfs4_state_start(void); 113int nfs4_state_start(void);
114int nfs4_state_start_net(struct net *net);
124void nfs4_state_shutdown(void); 115void nfs4_state_shutdown(void);
116void nfs4_state_shutdown_net(struct net *net);
125void nfs4_reset_lease(time_t leasetime); 117void nfs4_reset_lease(time_t leasetime);
126int nfs4_reset_recoverydir(char *recdir); 118int nfs4_reset_recoverydir(char *recdir);
127char * nfs4_recoverydir(void); 119char * nfs4_recoverydir(void);
@@ -130,7 +122,9 @@ static inline void nfs4_state_init(void) { }
130static inline int nfsd4_init_slabs(void) { return 0; } 122static inline int nfsd4_init_slabs(void) { return 0; }
131static inline void nfsd4_free_slabs(void) { } 123static inline void nfsd4_free_slabs(void) { }
132static inline int nfs4_state_start(void) { return 0; } 124static inline int nfs4_state_start(void) { return 0; }
125static inline int nfs4_state_start_net(struct net *net) { return 0; }
133static inline void nfs4_state_shutdown(void) { } 126static inline void nfs4_state_shutdown(void) { }
127static inline void nfs4_state_shutdown_net(struct net *net) { }
134static inline void nfs4_reset_lease(time_t leasetime) { } 128static inline void nfs4_reset_lease(time_t leasetime) { }
135static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } 129static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
136static inline char * nfs4_recoverydir(void) {return NULL; } 130static inline char * nfs4_recoverydir(void) {return NULL; }
@@ -265,16 +259,8 @@ void nfsd_lockd_shutdown(void);
265/* Check for dir entries '.' and '..' */ 259/* Check for dir entries '.' and '..' */
266#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) 260#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.'))
267 261
268/*
269 * Time of server startup
270 */
271extern struct timeval nfssvc_boot;
272
273#ifdef CONFIG_NFSD_V4 262#ifdef CONFIG_NFSD_V4
274 263
275extern time_t nfsd4_lease;
276extern time_t nfsd4_grace;
277
278/* before processing a COMPOUND operation, we have to check that there 264/* before processing a COMPOUND operation, we have to check that there
279 * is enough space in the buffer for XDR encode to succeed. otherwise, 265 * is enough space in the buffer for XDR encode to succeed. otherwise,
280 * we might process an operation with side effects, and be unable to 266 * we might process an operation with side effects, and be unable to
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 032af381b3aa..814afaa4458a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -572,7 +572,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
572 572
573 if (inode) 573 if (inode)
574 _fh_update(fhp, exp, dentry); 574 _fh_update(fhp, exp, dentry);
575 if (fhp->fh_handle.fh_fileid_type == 255) { 575 if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
576 fh_put(fhp); 576 fh_put(fhp);
577 return nfserr_opnotsupp; 577 return nfserr_opnotsupp;
578 } 578 }
@@ -603,7 +603,7 @@ fh_update(struct svc_fh *fhp)
603 goto out; 603 goto out;
604 604
605 _fh_update(fhp, fhp->fh_export, dentry); 605 _fh_update(fhp, fhp->fh_export, dentry);
606 if (fhp->fh_handle.fh_fileid_type == 255) 606 if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
607 return nfserr_opnotsupp; 607 return nfserr_opnotsupp;
608 } 608 }
609out: 609out:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2013aa001dab..cee62ab9d4a3 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -11,7 +11,6 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/fs_struct.h> 12#include <linux/fs_struct.h>
13#include <linux/swap.h> 13#include <linux/swap.h>
14#include <linux/nsproxy.h>
15 14
16#include <linux/sunrpc/stats.h> 15#include <linux/sunrpc/stats.h>
17#include <linux/sunrpc/svcsock.h> 16#include <linux/sunrpc/svcsock.h>
@@ -22,19 +21,19 @@
22#include "nfsd.h" 21#include "nfsd.h"
23#include "cache.h" 22#include "cache.h"
24#include "vfs.h" 23#include "vfs.h"
24#include "netns.h"
25 25
26#define NFSDDBG_FACILITY NFSDDBG_SVC 26#define NFSDDBG_FACILITY NFSDDBG_SVC
27 27
28extern struct svc_program nfsd_program; 28extern struct svc_program nfsd_program;
29static int nfsd(void *vrqstp); 29static int nfsd(void *vrqstp);
30struct timeval nfssvc_boot;
31 30
32/* 31/*
33 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members 32 * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members
34 * of the svc_serv struct. In particular, ->sv_nrthreads but also to some 33 * of the svc_serv struct. In particular, ->sv_nrthreads but also to some
35 * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt 34 * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt
36 * 35 *
37 * If (outside the lock) nfsd_serv is non-NULL, then it must point to a 36 * If (outside the lock) nn->nfsd_serv is non-NULL, then it must point to a
38 * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number 37 * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number
39 * of nfsd threads must exist and each must be listed in ->sp_all_threads in each 38 * of nfsd threads must exist and each must be listed in ->sp_all_threads in each
40 * entry of ->sv_pools[]. 39 * entry of ->sv_pools[].
@@ -52,7 +51,6 @@ struct timeval nfssvc_boot;
52 * nfsd_versions 51 * nfsd_versions
53 */ 52 */
54DEFINE_MUTEX(nfsd_mutex); 53DEFINE_MUTEX(nfsd_mutex);
55struct svc_serv *nfsd_serv;
56 54
57/* 55/*
58 * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. 56 * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
@@ -173,28 +171,32 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
173 */ 171 */
174#define NFSD_MAXSERVS 8192 172#define NFSD_MAXSERVS 8192
175 173
176int nfsd_nrthreads(void) 174int nfsd_nrthreads(struct net *net)
177{ 175{
178 int rv = 0; 176 int rv = 0;
177 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
178
179 mutex_lock(&nfsd_mutex); 179 mutex_lock(&nfsd_mutex);
180 if (nfsd_serv) 180 if (nn->nfsd_serv)
181 rv = nfsd_serv->sv_nrthreads; 181 rv = nn->nfsd_serv->sv_nrthreads;
182 mutex_unlock(&nfsd_mutex); 182 mutex_unlock(&nfsd_mutex);
183 return rv; 183 return rv;
184} 184}
185 185
186static int nfsd_init_socks(void) 186static int nfsd_init_socks(struct net *net)
187{ 187{
188 int error; 188 int error;
189 if (!list_empty(&nfsd_serv->sv_permsocks)) 189 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
190
191 if (!list_empty(&nn->nfsd_serv->sv_permsocks))
190 return 0; 192 return 0;
191 193
192 error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, NFS_PORT, 194 error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
193 SVC_SOCK_DEFAULTS); 195 SVC_SOCK_DEFAULTS);
194 if (error < 0) 196 if (error < 0)
195 return error; 197 return error;
196 198
197 error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, NFS_PORT, 199 error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
198 SVC_SOCK_DEFAULTS); 200 SVC_SOCK_DEFAULTS);
199 if (error < 0) 201 if (error < 0)
200 return error; 202 return error;
@@ -202,14 +204,15 @@ static int nfsd_init_socks(void)
202 return 0; 204 return 0;
203} 205}
204 206
205static bool nfsd_up = false; 207static int nfsd_users = 0;
206 208
207static int nfsd_startup(int nrservs) 209static int nfsd_startup_generic(int nrservs)
208{ 210{
209 int ret; 211 int ret;
210 212
211 if (nfsd_up) 213 if (nfsd_users++)
212 return 0; 214 return 0;
215
213 /* 216 /*
214 * Readahead param cache - will no-op if it already exists. 217 * Readahead param cache - will no-op if it already exists.
215 * (Note therefore results will be suboptimal if number of 218 * (Note therefore results will be suboptimal if number of
@@ -218,43 +221,79 @@ static int nfsd_startup(int nrservs)
218 ret = nfsd_racache_init(2*nrservs); 221 ret = nfsd_racache_init(2*nrservs);
219 if (ret) 222 if (ret)
220 return ret; 223 return ret;
221 ret = nfsd_init_socks(); 224 ret = nfs4_state_start();
222 if (ret) 225 if (ret)
223 goto out_racache; 226 goto out_racache;
224 ret = lockd_up(&init_net); 227 return 0;
228
229out_racache:
230 nfsd_racache_shutdown();
231 return ret;
232}
233
234static void nfsd_shutdown_generic(void)
235{
236 if (--nfsd_users)
237 return;
238
239 nfs4_state_shutdown();
240 nfsd_racache_shutdown();
241}
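
nfsd_startup_generic()/nfsd_shutdown_generic() guard the remaining global state with a plain use counter: only the first caller initializes, only the last tears down. The same shape in isolation (an external lock is assumed, as nfsd_mutex is held at these call sites):

static int users;		/* protected by an external mutex */

static int startup_generic(void)
{
	if (users++)		/* already up: just take a reference */
		return 0;
	/* first user: bring up the shared caches/state here */
	return 0;
}

static void shutdown_generic(void)
{
	if (--users)		/* other users remain: keep running */
		return;
	/* last user: tear the shared caches/state down */
}
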
242
243static int nfsd_startup_net(int nrservs, struct net *net)
244{
245 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
246 int ret;
247
248 if (nn->nfsd_net_up)
249 return 0;
250
251 ret = nfsd_startup_generic(nrservs);
225 if (ret) 252 if (ret)
226 goto out_racache; 253 return ret;
227 ret = nfs4_state_start(); 254 ret = nfsd_init_socks(net);
255 if (ret)
256 goto out_socks;
257 ret = lockd_up(net);
258 if (ret)
259 goto out_socks;
260 ret = nfs4_state_start_net(net);
228 if (ret) 261 if (ret)
229 goto out_lockd; 262 goto out_lockd;
230 nfsd_up = true; 263
264 nn->nfsd_net_up = true;
231 return 0; 265 return 0;
266
232out_lockd: 267out_lockd:
233 lockd_down(&init_net); 268 lockd_down(net);
234out_racache: 269out_socks:
235 nfsd_racache_shutdown(); 270 nfsd_shutdown_generic();
236 return ret; 271 return ret;
237} 272}
238 273
239static void nfsd_shutdown(void) 274static void nfsd_shutdown_net(struct net *net)
240{ 275{
276 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
277
278 nfs4_state_shutdown_net(net);
279 lockd_down(net);
280 nn->nfsd_net_up = false;
281 nfsd_shutdown_generic();
282}
283
284static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
285{
286 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
287
241 /* 288 /*
242 * write_ports can create the server without actually starting 289 * write_ports can create the server without actually starting
243 * any threads--if we get shut down before any threads are 290 * any threads--if we get shut down before any threads are
244 * started, then nfsd_last_thread will be run before any of this 291 * started, then nfsd_last_thread will be run before any of this
245 * other initialization has been done. 292 * other initialization has been done.
246 */ 293 */
247 if (!nfsd_up) 294 if (!nn->nfsd_net_up)
248 return; 295 return;
249 nfs4_state_shutdown(); 296 nfsd_shutdown_net(net);
250 lockd_down(&init_net);
251 nfsd_racache_shutdown();
252 nfsd_up = false;
253}
254
255static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
256{
257 nfsd_shutdown();
258 297
259 svc_rpcb_cleanup(serv, net); 298 svc_rpcb_cleanup(serv, net);
260 299
@@ -327,69 +366,84 @@ static int nfsd_get_default_max_blksize(void)
327 return ret; 366 return ret;
328} 367}
329 368
330int nfsd_create_serv(void) 369int nfsd_create_serv(struct net *net)
331{ 370{
332 int error; 371 int error;
333 struct net *net = current->nsproxy->net_ns; 372 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
334 373
335 WARN_ON(!mutex_is_locked(&nfsd_mutex)); 374 WARN_ON(!mutex_is_locked(&nfsd_mutex));
336 if (nfsd_serv) { 375 if (nn->nfsd_serv) {
337 svc_get(nfsd_serv); 376 svc_get(nn->nfsd_serv);
338 return 0; 377 return 0;
339 } 378 }
340 if (nfsd_max_blksize == 0) 379 if (nfsd_max_blksize == 0)
341 nfsd_max_blksize = nfsd_get_default_max_blksize(); 380 nfsd_max_blksize = nfsd_get_default_max_blksize();
342 nfsd_reset_versions(); 381 nfsd_reset_versions();
343 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 382 nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
344 nfsd_last_thread, nfsd, THIS_MODULE); 383 nfsd_last_thread, nfsd, THIS_MODULE);
345 if (nfsd_serv == NULL) 384 if (nn->nfsd_serv == NULL)
346 return -ENOMEM; 385 return -ENOMEM;
347 386
348 error = svc_bind(nfsd_serv, net); 387 error = svc_bind(nn->nfsd_serv, net);
349 if (error < 0) { 388 if (error < 0) {
350 svc_destroy(nfsd_serv); 389 svc_destroy(nn->nfsd_serv);
351 return error; 390 return error;
352 } 391 }
353 392
354 set_max_drc(); 393 set_max_drc();
355 do_gettimeofday(&nfssvc_boot); /* record boot time */ 394 do_gettimeofday(&nn->nfssvc_boot); /* record boot time */
356 return 0; 395 return 0;
357} 396}
358 397
359int nfsd_nrpools(void) 398int nfsd_nrpools(struct net *net)
360{ 399{
361 if (nfsd_serv == NULL) 400 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
401
402 if (nn->nfsd_serv == NULL)
362 return 0; 403 return 0;
363 else 404 else
364 return nfsd_serv->sv_nrpools; 405 return nn->nfsd_serv->sv_nrpools;
365} 406}
366 407
367int nfsd_get_nrthreads(int n, int *nthreads) 408int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
368{ 409{
369 int i = 0; 410 int i = 0;
411 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
370 412
371 if (nfsd_serv != NULL) { 413 if (nn->nfsd_serv != NULL) {
372 for (i = 0; i < nfsd_serv->sv_nrpools && i < n; i++) 414 for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++)
373 nthreads[i] = nfsd_serv->sv_pools[i].sp_nrthreads; 415 nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads;
374 } 416 }
375 417
376 return 0; 418 return 0;
377} 419}
378 420
379int nfsd_set_nrthreads(int n, int *nthreads) 421void nfsd_destroy(struct net *net)
422{
423 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
424 int destroy = (nn->nfsd_serv->sv_nrthreads == 1);
425
426 if (destroy)
427 svc_shutdown_net(nn->nfsd_serv, net);
428 svc_destroy(nn->nfsd_serv);
429 if (destroy)
430 nn->nfsd_serv = NULL;
431}
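
nfsd_destroy() is now out of line and per-net: it shuts the service's transports down while the last reference is still held, drops the reference, and clears the per-net pointer only on that last put. A sketch of the ordering, with a bare refcounted object in place of struct svc_serv:

#include <stdlib.h>

struct serv {
	int nrthreads;		/* reference count, >= 1 while alive */
};

static void serv_put(struct serv **slot)
{
	struct serv *sv = *slot;
	int last = (sv->nrthreads == 1);

	if (last) {
		/* shut transports down while the object still exists */
	}
	if (--sv->nrthreads == 0)
		free(sv);
	if (last)
		*slot = NULL;	/* clear the per-net pointer */
}
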
432
433int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
380{ 434{
381 int i = 0; 435 int i = 0;
382 int tot = 0; 436 int tot = 0;
383 int err = 0; 437 int err = 0;
384 struct net *net = &init_net; 438 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
385 439
386 WARN_ON(!mutex_is_locked(&nfsd_mutex)); 440 WARN_ON(!mutex_is_locked(&nfsd_mutex));
387 441
388 if (nfsd_serv == NULL || n <= 0) 442 if (nn->nfsd_serv == NULL || n <= 0)
389 return 0; 443 return 0;
390 444
391 if (n > nfsd_serv->sv_nrpools) 445 if (n > nn->nfsd_serv->sv_nrpools)
392 n = nfsd_serv->sv_nrpools; 446 n = nn->nfsd_serv->sv_nrpools;
393 447
394 /* enforce a global maximum number of threads */ 448 /* enforce a global maximum number of threads */
395 tot = 0; 449 tot = 0;
@@ -419,9 +473,9 @@ int nfsd_set_nrthreads(int n, int *nthreads)
419 nthreads[0] = 1; 473 nthreads[0] = 1;
420 474
421 /* apply the new numbers */ 475 /* apply the new numbers */
422 svc_get(nfsd_serv); 476 svc_get(nn->nfsd_serv);
423 for (i = 0; i < n; i++) { 477 for (i = 0; i < n; i++) {
424 err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i], 478 err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i],
425 nthreads[i]); 479 nthreads[i]);
426 if (err) 480 if (err)
427 break; 481 break;
@@ -436,11 +490,11 @@ int nfsd_set_nrthreads(int n, int *nthreads)
436 * this is the first time nrservs is nonzero. 490 * this is the first time nrservs is nonzero.
437 */ 491 */
438int 492int
439nfsd_svc(int nrservs) 493nfsd_svc(int nrservs, struct net *net)
440{ 494{
441 int error; 495 int error;
442 bool nfsd_up_before; 496 bool nfsd_up_before;
443 struct net *net = &init_net; 497 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
444 498
445 mutex_lock(&nfsd_mutex); 499 mutex_lock(&nfsd_mutex);
446 dprintk("nfsd: creating service\n"); 500 dprintk("nfsd: creating service\n");
@@ -449,29 +503,29 @@ nfsd_svc(int nrservs)
449 if (nrservs > NFSD_MAXSERVS) 503 if (nrservs > NFSD_MAXSERVS)
450 nrservs = NFSD_MAXSERVS; 504 nrservs = NFSD_MAXSERVS;
451 error = 0; 505 error = 0;
452 if (nrservs == 0 && nfsd_serv == NULL) 506 if (nrservs == 0 && nn->nfsd_serv == NULL)
453 goto out; 507 goto out;
454 508
455 error = nfsd_create_serv(); 509 error = nfsd_create_serv(net);
456 if (error) 510 if (error)
457 goto out; 511 goto out;
458 512
459 nfsd_up_before = nfsd_up; 513 nfsd_up_before = nn->nfsd_net_up;
460 514
461 error = nfsd_startup(nrservs); 515 error = nfsd_startup_net(nrservs, net);
462 if (error) 516 if (error)
463 goto out_destroy; 517 goto out_destroy;
464 error = svc_set_num_threads(nfsd_serv, NULL, nrservs); 518 error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs);
465 if (error) 519 if (error)
466 goto out_shutdown; 520 goto out_shutdown;
467 /* We are holding a reference to nfsd_serv which 521 /* We are holding a reference to nn->nfsd_serv which
468 * we don't want to count in the return value, 522 * we don't want to count in the return value,
469 * so subtract 1 523 * so subtract 1
470 */ 524 */
471 error = nfsd_serv->sv_nrthreads - 1; 525 error = nn->nfsd_serv->sv_nrthreads - 1;
472out_shutdown: 526out_shutdown:
473 if (error < 0 && !nfsd_up_before) 527 if (error < 0 && !nfsd_up_before)
474 nfsd_shutdown(); 528 nfsd_shutdown_net(net);
475out_destroy: 529out_destroy:
476 nfsd_destroy(net); /* Release server */ 530 nfsd_destroy(net); /* Release server */
477out: 531out:
@@ -487,6 +541,8 @@ static int
487nfsd(void *vrqstp) 541nfsd(void *vrqstp)
488{ 542{
489 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; 543 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
544 struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
545 struct net *net = perm_sock->xpt_net;
490 int err; 546 int err;
491 547
492 /* Lock module and set up kernel thread */ 548 /* Lock module and set up kernel thread */
@@ -551,7 +607,7 @@ out:
551 /* Release the thread */ 607 /* Release the thread */
552 svc_exit_thread(rqstp); 608 svc_exit_thread(rqstp);
553 609
554 nfsd_destroy(&init_net); 610 nfsd_destroy(net);
555 611
556 /* Release module */ 612 /* Release module */
557 mutex_unlock(&nfsd_mutex); 613 mutex_unlock(&nfsd_mutex);
@@ -640,21 +696,24 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
640 } 696 }
641 697
642 /* Store reply in cache. */ 698 /* Store reply in cache. */
643 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); 699 nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
644 return 1; 700 return 1;
645} 701}
646 702
647int nfsd_pool_stats_open(struct inode *inode, struct file *file) 703int nfsd_pool_stats_open(struct inode *inode, struct file *file)
648{ 704{
649 int ret; 705 int ret;
706 struct net *net = &init_net;
707 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
708
650 mutex_lock(&nfsd_mutex); 709 mutex_lock(&nfsd_mutex);
651 if (nfsd_serv == NULL) { 710 if (nn->nfsd_serv == NULL) {
652 mutex_unlock(&nfsd_mutex); 711 mutex_unlock(&nfsd_mutex);
653 return -ENODEV; 712 return -ENODEV;
654 } 713 }
655 /* bump up the pseudo refcount while traversing */ 714 /* bump up the pseudo refcount while traversing */
656 svc_get(nfsd_serv); 715 svc_get(nn->nfsd_serv);
657 ret = svc_pool_stats_open(nfsd_serv, file); 716 ret = svc_pool_stats_open(nn->nfsd_serv, file);
658 mutex_unlock(&nfsd_mutex); 717 mutex_unlock(&nfsd_mutex);
659 return ret; 718 return ret;
660} 719}
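
The nfssvc.c hunks above convert nfsd's single global service pointer into per-network-namespace state reached through net_generic(net, nfsd_net_id), so each container gets its own nfsd_serv and boot time. Below is a minimal userspace C sketch of that containment pattern; ns_state, ns_lookup and the fields are illustrative stand-ins, not kernel APIs.

    #include <stdio.h>
    #include <stdlib.h>

    /* Each namespace owns one private slot, as nfsd_net hangs off struct net. */
    struct ns_state {
        int serv_threads;          /* stands in for nn->nfsd_serv */
        long boot_time;            /* stands in for nn->nfssvc_boot */
    };

    struct ns {
        int id;
        struct ns_state *generic;  /* models net_generic(net, nfsd_net_id) */
    };

    /* Look up the per-namespace state instead of touching a global. */
    static struct ns_state *ns_lookup(struct ns *net)
    {
        return net->generic;
    }

    int main(void)
    {
        struct ns a = { 1, calloc(1, sizeof(struct ns_state)) };
        struct ns b = { 2, calloc(1, sizeof(struct ns_state)) };

        ns_lookup(&a)->serv_threads = 8;   /* start "nfsd" in one namespace */
        ns_lookup(&b)->serv_threads = 0;   /* the other is untouched */

        printf("ns1 threads=%d ns2 threads=%d\n",
               ns_lookup(&a)->serv_threads, ns_lookup(&b)->serv_threads);
        free(a.generic);
        free(b.generic);
        return 0;
    }

The point of the pattern is that every access goes through the lookup, so no code path can reach another namespace's server state by accident.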
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 65ec595e2226..979b42106979 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -246,7 +246,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
246 struct nfsd_readargs *args) 246 struct nfsd_readargs *args)
247{ 247{
248 unsigned int len; 248 unsigned int len;
249 int v,pn; 249 int v;
250 if (!(p = decode_fh(p, &args->fh))) 250 if (!(p = decode_fh(p, &args->fh)))
251 return 0; 251 return 0;
252 252
@@ -262,8 +262,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
262 */ 262 */
263 v=0; 263 v=0;
264 while (len > 0) { 264 while (len > 0) {
265 pn = rqstp->rq_resused++; 265 struct page *p = *(rqstp->rq_next_page++);
266 rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); 266
267 rqstp->rq_vec[v].iov_base = page_address(p);
267 rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE; 268 rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
268 len -= rqstp->rq_vec[v].iov_len; 269 len -= rqstp->rq_vec[v].iov_len;
269 v++; 270 v++;
@@ -355,7 +356,7 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli
355{ 356{
356 if (!(p = decode_fh(p, &args->fh))) 357 if (!(p = decode_fh(p, &args->fh)))
357 return 0; 358 return 0;
358 args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); 359 args->buffer = page_address(*(rqstp->rq_next_page++));
359 360
360 return xdr_argsize_check(rqstp, p); 361 return xdr_argsize_check(rqstp, p);
361} 362}
@@ -396,7 +397,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
396 if (args->count > PAGE_SIZE) 397 if (args->count > PAGE_SIZE)
397 args->count = PAGE_SIZE; 398 args->count = PAGE_SIZE;
398 399
399 args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); 400 args->buffer = page_address(*(rqstp->rq_next_page++));
400 401
401 return xdr_argsize_check(rqstp, p); 402 return xdr_argsize_check(rqstp, p);
402} 403}
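
The nfsxdr.c hunks replace the rq_resused index into rq_respages with a moving rq_next_page cursor. A small standalone C sketch of the same cursor-over-page-array idiom follows; the page contents and byte counts are made up for illustration.

    #include <stdio.h>

    #define NPAGES 4
    #define PAGE_SIZE 4096

    int main(void)
    {
        char *pages[NPAGES] = { "p0", "p1", "p2", "p3" };
        char **next_page = pages;          /* cursor, like rqstp->rq_next_page */
        unsigned int len = 3 * PAGE_SIZE + 100;
        int v = 0;

        while (len > 0) {
            char *p = *(next_page++);      /* consume one reply page */
            unsigned int chunk = len < PAGE_SIZE ? len : PAGE_SIZE;

            printf("vec[%d] -> %s, %u bytes\n", v++, p, chunk);
            len -= chunk;
        }
        printf("pages consumed: %td\n", next_page - pages);
        return 0;
    }

A cursor cannot silently skip or reuse a slot the way a manually maintained index can, which is what the conversion buys.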
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e036894bce57..d1c229feed52 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -150,6 +150,12 @@ struct nfsd4_channel_attrs {
150 u32 rdma_attrs; 150 u32 rdma_attrs;
151}; 151};
152 152
153struct nfsd4_cb_sec {
154 u32 flavor; /* (u32)(-1) used to mean "no valid flavor" */
155 u32 uid;
156 u32 gid;
157};
158
153struct nfsd4_create_session { 159struct nfsd4_create_session {
154 clientid_t clientid; 160 clientid_t clientid;
155 struct nfs4_sessionid sessionid; 161 struct nfs4_sessionid sessionid;
@@ -158,8 +164,12 @@ struct nfsd4_create_session {
158 struct nfsd4_channel_attrs fore_channel; 164 struct nfsd4_channel_attrs fore_channel;
159 struct nfsd4_channel_attrs back_channel; 165 struct nfsd4_channel_attrs back_channel;
160 u32 callback_prog; 166 u32 callback_prog;
161 u32 uid; 167 struct nfsd4_cb_sec cb_sec;
162 u32 gid; 168};
169
170struct nfsd4_backchannel_ctl {
171 u32 bc_cb_program;
172 struct nfsd4_cb_sec bc_cb_sec;
163}; 173};
164 174
165struct nfsd4_bind_conn_to_session { 175struct nfsd4_bind_conn_to_session {
@@ -192,6 +202,7 @@ struct nfsd4_session {
192 struct nfs4_sessionid se_sessionid; 202 struct nfs4_sessionid se_sessionid;
193 struct nfsd4_channel_attrs se_fchannel; 203 struct nfsd4_channel_attrs se_fchannel;
194 struct nfsd4_channel_attrs se_bchannel; 204 struct nfsd4_channel_attrs se_bchannel;
205 struct nfsd4_cb_sec se_cb_sec;
195 struct list_head se_conns; 206 struct list_head se_conns;
196 u32 se_cb_prog; 207 u32 se_cb_prog;
197 u32 se_cb_seq_nr; 208 u32 se_cb_seq_nr;
@@ -221,13 +232,12 @@ struct nfsd4_sessionid {
221 */ 232 */
222struct nfs4_client { 233struct nfs4_client {
223 struct list_head cl_idhash; /* hash by cl_clientid.id */ 234 struct list_head cl_idhash; /* hash by cl_clientid.id */
224 struct list_head cl_strhash; /* hash by cl_name */ 235 struct rb_node cl_namenode; /* link into by-name trees */
225 struct list_head cl_openowners; 236 struct list_head cl_openowners;
226 struct idr cl_stateids; /* stateid lookup */ 237 struct idr cl_stateids; /* stateid lookup */
227 struct list_head cl_delegations; 238 struct list_head cl_delegations;
228 struct list_head cl_lru; /* tail queue */ 239 struct list_head cl_lru; /* tail queue */
229 struct xdr_netobj cl_name; /* id generated by client */ 240 struct xdr_netobj cl_name; /* id generated by client */
230 char cl_recdir[HEXDIR_LEN]; /* recovery dir */
231 nfs4_verifier cl_verifier; /* generated by client */ 241 nfs4_verifier cl_verifier; /* generated by client */
232 time_t cl_time; /* time of last lease renewal */ 242 time_t cl_time; /* time of last lease renewal */
233 struct sockaddr_storage cl_addr; /* client ipaddress */ 243 struct sockaddr_storage cl_addr; /* client ipaddress */
@@ -242,9 +252,11 @@ struct nfs4_client {
242#define NFSD4_CLIENT_CB_KILL (1) 252#define NFSD4_CLIENT_CB_KILL (1)
243#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */ 253#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */
244#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */ 254#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */
255#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */
245#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 256#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
246 1 << NFSD4_CLIENT_CB_KILL) 257 1 << NFSD4_CLIENT_CB_KILL)
247 unsigned long cl_flags; 258 unsigned long cl_flags;
259 struct rpc_cred *cl_cb_cred;
248 struct rpc_clnt *cl_cb_client; 260 struct rpc_clnt *cl_cb_client;
249 u32 cl_cb_ident; 261 u32 cl_cb_ident;
250#define NFSD4_CB_UP 0 262#define NFSD4_CB_UP 0
@@ -271,6 +283,7 @@ struct nfs4_client {
271 unsigned long cl_cb_slot_busy; 283 unsigned long cl_cb_slot_busy;
272 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ 284 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
273 /* wait here for slots */ 285 /* wait here for slots */
286 struct net *net;
274}; 287};
275 288
276static inline void 289static inline void
@@ -292,6 +305,7 @@ is_client_expired(struct nfs4_client *clp)
292 */ 305 */
293struct nfs4_client_reclaim { 306struct nfs4_client_reclaim {
294 struct list_head cr_strhash; /* hash by cr_name */ 307 struct list_head cr_strhash; /* hash by cr_name */
308 struct nfs4_client *cr_clp; /* pointer to associated clp */
295 char cr_recdir[HEXDIR_LEN]; /* recover dir */ 309 char cr_recdir[HEXDIR_LEN]; /* recover dir */
296}; 310};
297 311
@@ -452,25 +466,26 @@ extern __be32 nfs4_preprocess_stateid_op(struct net *net,
452 stateid_t *stateid, int flags, struct file **filp); 466 stateid_t *stateid, int flags, struct file **filp);
453extern void nfs4_lock_state(void); 467extern void nfs4_lock_state(void);
454extern void nfs4_unlock_state(void); 468extern void nfs4_unlock_state(void);
455extern int nfs4_in_grace(void); 469void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
456extern void nfs4_release_reclaim(void); 470extern void nfs4_release_reclaim(struct nfsd_net *);
457extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp); 471extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
458extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions); 472 struct nfsd_net *nn);
473extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn);
459extern void nfs4_free_openowner(struct nfs4_openowner *); 474extern void nfs4_free_openowner(struct nfs4_openowner *);
460extern void nfs4_free_lockowner(struct nfs4_lockowner *); 475extern void nfs4_free_lockowner(struct nfs4_lockowner *);
461extern int set_callback_cred(void); 476extern int set_callback_cred(void);
477extern void nfsd4_init_callback(struct nfsd4_callback *);
462extern void nfsd4_probe_callback(struct nfs4_client *clp); 478extern void nfsd4_probe_callback(struct nfs4_client *clp);
463extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); 479extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
464extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); 480extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
465extern void nfsd4_do_callback_rpc(struct work_struct *);
466extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 481extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
467extern int nfsd4_create_callback_queue(void); 482extern int nfsd4_create_callback_queue(void);
468extern void nfsd4_destroy_callback_queue(void); 483extern void nfsd4_destroy_callback_queue(void);
469extern void nfsd4_shutdown_callback(struct nfs4_client *); 484extern void nfsd4_shutdown_callback(struct nfs4_client *);
470extern void nfs4_put_delegation(struct nfs4_delegation *dp); 485extern void nfs4_put_delegation(struct nfs4_delegation *dp);
471extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); 486extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
472extern int nfs4_client_to_reclaim(const char *name); 487 struct nfsd_net *nn);
473extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); 488extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
474extern void release_session_client(struct nfsd4_session *); 489extern void release_session_client(struct nfsd4_session *);
475extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); 490extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
476 491
@@ -480,5 +495,28 @@ extern void nfsd4_client_tracking_exit(struct net *net);
480extern void nfsd4_client_record_create(struct nfs4_client *clp); 495extern void nfsd4_client_record_create(struct nfs4_client *clp);
481extern void nfsd4_client_record_remove(struct nfs4_client *clp); 496extern void nfsd4_client_record_remove(struct nfs4_client *clp);
482extern int nfsd4_client_record_check(struct nfs4_client *clp); 497extern int nfsd4_client_record_check(struct nfs4_client *clp);
483extern void nfsd4_record_grace_done(struct net *net, time_t boot_time); 498extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
499
500/* nfs fault injection functions */
501#ifdef CONFIG_NFSD_FAULT_INJECTION
502int nfsd_fault_inject_init(void);
503void nfsd_fault_inject_cleanup(void);
504u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64));
505struct nfs4_client *nfsd_find_client(struct sockaddr_storage *, size_t);
506
507u64 nfsd_forget_client(struct nfs4_client *, u64);
508u64 nfsd_forget_client_locks(struct nfs4_client*, u64);
509u64 nfsd_forget_client_openowners(struct nfs4_client *, u64);
510u64 nfsd_forget_client_delegations(struct nfs4_client *, u64);
511u64 nfsd_recall_client_delegations(struct nfs4_client *, u64);
512
513u64 nfsd_print_client(struct nfs4_client *, u64);
514u64 nfsd_print_client_locks(struct nfs4_client *, u64);
515u64 nfsd_print_client_openowners(struct nfs4_client *, u64);
516u64 nfsd_print_client_delegations(struct nfs4_client *, u64);
517#else /* CONFIG_NFSD_FAULT_INJECTION */
518static inline int nfsd_fault_inject_init(void) { return 0; }
519static inline void nfsd_fault_inject_cleanup(void) {}
520#endif /* CONFIG_NFSD_FAULT_INJECTION */
521
484#endif /* NFSD4_STATE_H */ 522#endif /* NFSD4_STATE_H */
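
The state.h changes bundle the callback security parameters that create_session used to carry as loose uid/gid fields into struct nfsd4_cb_sec, with (u32)(-1) as the "no valid flavor" sentinel per the comment. A compilable sketch of that sentinel-guarded grouping; cb_sec_valid is illustrative, and AUTH_UNIX's value of 1 is the standard RPC flavor number, not something defined by this patch.

    #include <stdio.h>
    #include <stdint.h>

    /* Groups callback security parameters the way nfsd4_cb_sec does;
     * (uint32_t)-1 marks "no valid flavor". */
    struct cb_sec {
        uint32_t flavor;
        uint32_t uid;
        uint32_t gid;
    };

    #define CB_SEC_INVALID ((uint32_t)-1)

    static int cb_sec_valid(const struct cb_sec *s)
    {
        return s->flavor != CB_SEC_INVALID;
    }

    int main(void)
    {
        struct cb_sec none = { CB_SEC_INVALID, 0, 0 };
        struct cb_sec unix_cred = { 1 /* AUTH_UNIX */, 1000, 1000 };

        printf("none valid? %d\n", cb_sec_valid(&none));
        printf("unix valid? %d (uid=%u gid=%u)\n",
               cb_sec_valid(&unix_cred), unix_cred.uid, unix_cred.gid);
        return 0;
    }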
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c120b48ec305..d586117fa94a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -886,7 +886,7 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
886 struct splice_desc *sd) 886 struct splice_desc *sd)
887{ 887{
888 struct svc_rqst *rqstp = sd->u.data; 888 struct svc_rqst *rqstp = sd->u.data;
889 struct page **pp = rqstp->rq_respages + rqstp->rq_resused; 889 struct page **pp = rqstp->rq_next_page;
890 struct page *page = buf->page; 890 struct page *page = buf->page;
891 size_t size; 891 size_t size;
892 892
@@ -894,17 +894,15 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
894 894
895 if (rqstp->rq_res.page_len == 0) { 895 if (rqstp->rq_res.page_len == 0) {
896 get_page(page); 896 get_page(page);
897 put_page(*pp); 897 put_page(*rqstp->rq_next_page);
898 *pp = page; 898 *(rqstp->rq_next_page++) = page;
899 rqstp->rq_resused++;
900 rqstp->rq_res.page_base = buf->offset; 899 rqstp->rq_res.page_base = buf->offset;
901 rqstp->rq_res.page_len = size; 900 rqstp->rq_res.page_len = size;
902 } else if (page != pp[-1]) { 901 } else if (page != pp[-1]) {
903 get_page(page); 902 get_page(page);
904 if (*pp) 903 if (*rqstp->rq_next_page)
905 put_page(*pp); 904 put_page(*rqstp->rq_next_page);
906 *pp = page; 905 *(rqstp->rq_next_page++) = page;
907 rqstp->rq_resused++;
908 rqstp->rq_res.page_len += size; 906 rqstp->rq_res.page_len += size;
909 } else 907 } else
910 rqstp->rq_res.page_len += size; 908 rqstp->rq_res.page_len += size;
@@ -936,7 +934,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
936 .u.data = rqstp, 934 .u.data = rqstp,
937 }; 935 };
938 936
939 rqstp->rq_resused = 1; 937 rqstp->rq_next_page = rqstp->rq_respages + 1;
940 host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); 938 host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
941 } else { 939 } else {
942 oldfs = get_fs(); 940 oldfs = get_fs();
@@ -1020,28 +1018,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1020 inode = dentry->d_inode; 1018 inode = dentry->d_inode;
1021 exp = fhp->fh_export; 1019 exp = fhp->fh_export;
1022 1020
1023 /*
1024 * Request sync writes if
1025 * - the sync export option has been set, or
1026 * - the client requested O_SYNC behavior (NFSv3 feature).
1027 * - The file system doesn't support fsync().
1028 * When NFSv2 gathered writes have been configured for this volume,
1029 * flushing the data to disk is handled separately below.
1030 */
1031 use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); 1021 use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
1032 1022
1033 if (!file->f_op->fsync) {/* COMMIT3 cannot work */
1034 stable = 2;
1035 *stablep = 2; /* FILE_SYNC */
1036 }
1037
1038 if (!EX_ISSYNC(exp)) 1023 if (!EX_ISSYNC(exp))
1039 stable = 0; 1024 stable = 0;
1040 if (stable && !use_wgather) {
1041 spin_lock(&file->f_lock);
1042 file->f_flags |= O_SYNC;
1043 spin_unlock(&file->f_lock);
1044 }
1045 1025
1046 /* Write the data. */ 1026 /* Write the data. */
1047 oldfs = get_fs(); set_fs(KERNEL_DS); 1027 oldfs = get_fs(); set_fs(KERNEL_DS);
@@ -1057,8 +1037,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1057 if (inode->i_mode & (S_ISUID | S_ISGID)) 1037 if (inode->i_mode & (S_ISUID | S_ISGID))
1058 kill_suid(dentry); 1038 kill_suid(dentry);
1059 1039
1060 if (stable && use_wgather) 1040 if (stable) {
1061 host_err = wait_for_concurrent_writes(file); 1041 if (use_wgather)
1042 host_err = wait_for_concurrent_writes(file);
1043 else
1044 host_err = vfs_fsync_range(file, offset, offset+*cnt, 0);
1045 }
1062 1046
1063out_nfserr: 1047out_nfserr:
1064 dprintk("nfsd: write complete host_err=%d\n", host_err); 1048 dprintk("nfsd: write complete host_err=%d\n", host_err);
@@ -1485,13 +1469,19 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1485 case NFS3_CREATE_EXCLUSIVE: 1469 case NFS3_CREATE_EXCLUSIVE:
1486 if ( dchild->d_inode->i_mtime.tv_sec == v_mtime 1470 if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
1487 && dchild->d_inode->i_atime.tv_sec == v_atime 1471 && dchild->d_inode->i_atime.tv_sec == v_atime
1488 && dchild->d_inode->i_size == 0 ) 1472 && dchild->d_inode->i_size == 0 ) {
1473 if (created)
1474 *created = 1;
1489 break; 1475 break;
1476 }
1490 case NFS4_CREATE_EXCLUSIVE4_1: 1477 case NFS4_CREATE_EXCLUSIVE4_1:
1491 if ( dchild->d_inode->i_mtime.tv_sec == v_mtime 1478 if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
1492 && dchild->d_inode->i_atime.tv_sec == v_atime 1479 && dchild->d_inode->i_atime.tv_sec == v_atime
1493 && dchild->d_inode->i_size == 0 ) 1480 && dchild->d_inode->i_size == 0 ) {
1481 if (created)
1482 *created = 1;
1494 goto set_attr; 1483 goto set_attr;
1484 }
1495 /* fallthru */ 1485 /* fallthru */
1496 case NFS3_CREATE_GUARDED: 1486 case NFS3_CREATE_GUARDED:
1497 err = nfserr_exist; 1487 err = nfserr_exist;
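
In the vfs.c hunks, stable writes stop toggling O_SYNC on the file and instead flush explicitly after the write (vfs_fsync_range in the kernel). The userspace analogue below uses plain fsync() on a scratch file to show the write-then-flush ordering; the filename is arbitrary.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Write a buffer, then flush just that write, mirroring the move from
     * forcing O_SYNC on the file to an explicit post-write sync. */
    int main(void)
    {
        const char buf[] = "stable write\n";
        int fd = open("stable_demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);

        if (fd < 0) { perror("open"); return 1; }
        if (write(fd, buf, strlen(buf)) != (ssize_t)strlen(buf)) {
            perror("write");
            close(fd);
            return 1;
        }
        if (fsync(fd) < 0) {     /* flush data and metadata for this file */
            perror("fsync");
            close(fd);
            return 1;
        }
        close(fd);
        puts("data is durable before replying");
        return 0;
    }

Flushing per request also means a later unstable write on the same open file is not penalized, which the O_SYNC flag flipping could not guarantee.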
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index acd127d4ee82..0889bfb43dc9 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -385,7 +385,8 @@ struct nfsd4_write {
385 u64 wr_offset; /* request */ 385 u64 wr_offset; /* request */
386 u32 wr_stable_how; /* request */ 386 u32 wr_stable_how; /* request */
387 u32 wr_buflen; /* request */ 387 u32 wr_buflen; /* request */
388 int wr_vlen; 388 struct kvec wr_head;
389 struct page ** wr_pagelist; /* request */
389 390
390 u32 wr_bytes_written; /* response */ 391 u32 wr_bytes_written; /* response */
391 u32 wr_how_written; /* response */ 392 u32 wr_how_written; /* response */
@@ -462,6 +463,7 @@ struct nfsd4_op {
462 463
463 /* NFSv4.1 */ 464 /* NFSv4.1 */
464 struct nfsd4_exchange_id exchange_id; 465 struct nfsd4_exchange_id exchange_id;
466 struct nfsd4_backchannel_ctl backchannel_ctl;
465 struct nfsd4_bind_conn_to_session bind_conn_to_session; 467 struct nfsd4_bind_conn_to_session bind_conn_to_session;
466 struct nfsd4_create_session create_session; 468 struct nfsd4_create_session create_session;
467 struct nfsd4_destroy_session destroy_session; 469 struct nfsd4_destroy_session destroy_session;
@@ -526,6 +528,14 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
526 || nfsd4_is_solo_sequence(resp); 528 || nfsd4_is_solo_sequence(resp);
527} 529}
528 530
531static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
532{
533 struct nfsd4_compoundres *resp = rqstp->rq_resp;
534 struct nfsd4_compoundargs *argp = rqstp->rq_argp;
535
536 return argp->opcnt == resp->opcnt;
537}
538
529#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) 539#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs)
530 540
531static inline void 541static inline void
@@ -566,6 +576,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
566 struct nfsd4_sequence *seq); 576 struct nfsd4_sequence *seq);
567extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 577extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
568 struct nfsd4_compound_state *, struct nfsd4_exchange_id *); 578 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
579extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
569extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *); 580extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
570extern __be32 nfsd4_create_session(struct svc_rqst *, 581extern __be32 nfsd4_create_session(struct svc_rqst *,
571 struct nfsd4_compound_state *, 582 struct nfsd4_compound_state *,
@@ -579,7 +590,7 @@ extern __be32 nfsd4_destroy_session(struct svc_rqst *,
579extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *); 590extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *);
580__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); 591__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
581extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, 592extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
582 struct nfsd4_open *open); 593 struct nfsd4_open *open, struct nfsd_net *nn);
583extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, 594extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
584 struct svc_fh *current_fh, struct nfsd4_open *open); 595 struct svc_fh *current_fh, struct nfsd4_open *open);
585extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); 596extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status);
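
The xdr4.h change describes an NFSv4 write payload as one head kvec plus a page list rather than a flat iovec count. A self-contained sketch of that two-part layout; struct write_args and the sizes here are illustrative, not the kernel structures.

    #include <stdio.h>
    #include <string.h>
    #include <sys/uio.h>

    #define PAGE_SIZE 4096

    /* Models the xdr4.h shape: a write described by one head kvec
     * plus a list of whole pages, instead of a counted iovec array. */
    struct write_args {
        struct iovec head;     /* first partial chunk, like wr_head */
        char **pagelist;       /* remaining data, like wr_pagelist */
        unsigned int buflen;   /* total bytes, like wr_buflen */
    };

    int main(void)
    {
        static char page0[PAGE_SIZE], page1[PAGE_SIZE];
        char *pages[] = { page0, page1 };
        char hdr[] = "xdr head bytes";
        struct write_args wr = {
            .head = { hdr, strlen(hdr) },
            .pagelist = pages,
            .buflen = (unsigned int)strlen(hdr) + 2 * PAGE_SIZE,
        };
        unsigned int from_pages = wr.buflen - (unsigned int)wr.head.iov_len;

        printf("head=%zu bytes, %u bytes across %u pages\n",
               wr.head.iov_len, from_pages,
               (from_pages + PAGE_SIZE - 1) / PAGE_SIZE);
        return 0;
    }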
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 16f35f7423c5..61946883025c 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -167,7 +167,6 @@ const struct file_operations nilfs_file_operations = {
167}; 167};
168 168
169const struct inode_operations nilfs_file_inode_operations = { 169const struct inode_operations nilfs_file_inode_operations = {
170 .truncate = nilfs_truncate,
171 .setattr = nilfs_setattr, 170 .setattr = nilfs_setattr,
172 .permission = nilfs_permission, 171 .permission = nilfs_permission,
173 .fiemap = nilfs_fiemap, 172 .fiemap = nilfs_fiemap,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 4d31d2cca7fd..6b49f14eac8c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -213,6 +213,16 @@ static int nilfs_set_page_dirty(struct page *page)
213 return ret; 213 return ret;
214} 214}
215 215
216void nilfs_write_failed(struct address_space *mapping, loff_t to)
217{
218 struct inode *inode = mapping->host;
219
220 if (to > inode->i_size) {
221 truncate_pagecache(inode, to, inode->i_size);
222 nilfs_truncate(inode);
223 }
224}
225
216static int nilfs_write_begin(struct file *file, struct address_space *mapping, 226static int nilfs_write_begin(struct file *file, struct address_space *mapping,
217 loff_t pos, unsigned len, unsigned flags, 227 loff_t pos, unsigned len, unsigned flags,
218 struct page **pagep, void **fsdata) 228 struct page **pagep, void **fsdata)
@@ -227,10 +237,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
227 err = block_write_begin(mapping, pos, len, flags, pagep, 237 err = block_write_begin(mapping, pos, len, flags, pagep,
228 nilfs_get_block); 238 nilfs_get_block);
229 if (unlikely(err)) { 239 if (unlikely(err)) {
230 loff_t isize = mapping->host->i_size; 240 nilfs_write_failed(mapping, pos + len);
231 if (pos + len > isize)
232 vmtruncate(mapping->host, isize);
233
234 nilfs_transaction_abort(inode->i_sb); 241 nilfs_transaction_abort(inode->i_sb);
235 } 242 }
236 return err; 243 return err;
@@ -259,6 +266,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
259 loff_t offset, unsigned long nr_segs) 266 loff_t offset, unsigned long nr_segs)
260{ 267{
261 struct file *file = iocb->ki_filp; 268 struct file *file = iocb->ki_filp;
269 struct address_space *mapping = file->f_mapping;
262 struct inode *inode = file->f_mapping->host; 270 struct inode *inode = file->f_mapping->host;
263 ssize_t size; 271 ssize_t size;
264 272
@@ -278,7 +286,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
278 loff_t end = offset + iov_length(iov, nr_segs); 286 loff_t end = offset + iov_length(iov, nr_segs);
279 287
280 if (end > isize) 288 if (end > isize)
281 vmtruncate(inode, isize); 289 nilfs_write_failed(mapping, end);
282 } 290 }
283 291
284 return size; 292 return size;
@@ -786,10 +794,8 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
786 if ((iattr->ia_valid & ATTR_SIZE) && 794 if ((iattr->ia_valid & ATTR_SIZE) &&
787 iattr->ia_size != i_size_read(inode)) { 795 iattr->ia_size != i_size_read(inode)) {
788 inode_dio_wait(inode); 796 inode_dio_wait(inode);
789 797 truncate_setsize(inode, iattr->ia_size);
790 err = vmtruncate(inode, iattr->ia_size); 798 nilfs_truncate(inode);
791 if (unlikely(err))
792 goto out_err;
793 } 799 }
794 800
795 setattr_copy(inode, iattr); 801 setattr_copy(inode, iattr);
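
The nilfs2 hunks replace vmtruncate() on failed or short extending writes with nilfs_write_failed(), which trims anything instantiated beyond the old i_size. A userspace model of the same rollback, using ftruncate() on a scratch file (the "failed" write is simulated):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* If extending a file fails partway, roll the size back to what it
     * was, the way nilfs_write_failed() drops blocks instantiated
     * beyond i_size by a failed write_begin. */
    static void write_failed(int fd, off_t old_size, off_t attempted_end)
    {
        if (attempted_end > old_size)
            ftruncate(fd, old_size);   /* drop the partially written tail */
    }

    int main(void)
    {
        int fd = open("rollback_demo.txt", O_RDWR | O_CREAT | O_TRUNC, 0644);
        struct stat st;

        if (fd < 0) { perror("open"); return 1; }
        write(fd, "committed", 9);
        write(fd, "garbage-tail", 12);   /* pretend this write failed */
        write_failed(fd, 9, 9 + 12);

        fstat(fd, &st);
        printf("size after rollback: %lld\n", (long long)st.st_size);
        close(fd);
        return 0;
    }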
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index fdb180769485..f3859354e41a 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -664,8 +664,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
664 if (ret < 0) 664 if (ret < 0)
665 printk(KERN_ERR "NILFS: GC failed during preparation: " 665 printk(KERN_ERR "NILFS: GC failed during preparation: "
666 "cannot read source blocks: err=%d\n", ret); 666 "cannot read source blocks: err=%d\n", ret);
667 else 667 else {
668 if (nilfs_sb_need_update(nilfs))
669 set_nilfs_discontinued(nilfs);
668 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); 670 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
671 }
669 672
670 nilfs_remove_all_gcinodes(nilfs); 673 nilfs_remove_all_gcinodes(nilfs);
671 clear_nilfs_gc_running(nilfs); 674 clear_nilfs_gc_running(nilfs);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 74cece80e9a3..9bc72dec3fa6 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -277,6 +277,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
277extern void nilfs_truncate(struct inode *); 277extern void nilfs_truncate(struct inode *);
278extern void nilfs_evict_inode(struct inode *); 278extern void nilfs_evict_inode(struct inode *);
279extern int nilfs_setattr(struct dentry *, struct iattr *); 279extern int nilfs_setattr(struct dentry *, struct iattr *);
280extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
280int nilfs_permission(struct inode *inode, int mask); 281int nilfs_permission(struct inode *inode, int mask);
281int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); 282int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
282extern int nilfs_inode_dirty(struct inode *); 283extern int nilfs_inode_dirty(struct inode *);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 3e7b2a0dc0c8..07f76db04ec7 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -431,7 +431,7 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
431 mapping->host = inode; 431 mapping->host = inode;
432 mapping->flags = 0; 432 mapping->flags = 0;
433 mapping_set_gfp_mask(mapping, GFP_NOFS); 433 mapping_set_gfp_mask(mapping, GFP_NOFS);
434 mapping->assoc_mapping = NULL; 434 mapping->private_data = NULL;
435 mapping->backing_dev_info = bdi; 435 mapping->backing_dev_info = bdi;
436 mapping->a_ops = &empty_aops; 436 mapping->a_ops = &empty_aops;
437} 437}
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index f1626f5011c5..ff00a0b7acb9 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -527,7 +527,8 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
527 if (unlikely(err)) { 527 if (unlikely(err)) {
528 loff_t isize = inode->i_size; 528 loff_t isize = inode->i_size;
529 if (pos + blocksize > isize) 529 if (pos + blocksize > isize)
530 vmtruncate(inode, isize); 530 nilfs_write_failed(inode->i_mapping,
531 pos + blocksize);
531 goto failed_inode; 532 goto failed_inode;
532 } 533 }
533 534
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index ae5f33a6d868..96d3420d0242 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,5 +1,5 @@
1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \ 1obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \
2 mark.o vfsmount_mark.o 2 mark.o vfsmount_mark.o fdinfo.o
3 3
4obj-y += dnotify/ 4obj-y += dnotify/
5obj-y += inotify/ 5obj-y += inotify/
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 3344bdd5506e..08b886f119ce 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -201,7 +201,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
201 201
202 /* nothing else could have found us thanks to the dnotify_mark_mutex */ 202 /* nothing else could have found us thanks to the dnotify_mark_mutex */
203 if (dn_mark->dn == NULL) 203 if (dn_mark->dn == NULL)
204 fsnotify_destroy_mark(fsn_mark); 204 fsnotify_destroy_mark(fsn_mark, dnotify_group);
205 205
206 mutex_unlock(&dnotify_mark_mutex); 206 mutex_unlock(&dnotify_mark_mutex);
207 207
@@ -385,7 +385,7 @@ out:
385 spin_unlock(&fsn_mark->lock); 385 spin_unlock(&fsn_mark->lock);
386 386
387 if (destroy) 387 if (destroy)
388 fsnotify_destroy_mark(fsn_mark); 388 fsnotify_destroy_mark(fsn_mark, dnotify_group);
389 389
390 mutex_unlock(&dnotify_mark_mutex); 390 mutex_unlock(&dnotify_mark_mutex);
391 fsnotify_put_mark(fsn_mark); 391 fsnotify_put_mark(fsn_mark);
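
The dnotify hunks show fsnotify_destroy_mark() growing an explicit group argument: callers pin the owning group, and the destroy path serializes on group state instead of re-deriving the group from the mark. A toy model of the explicit-owner calling convention; the struct names and mutex discipline here are illustrative, not the fsnotify API.

    #include <pthread.h>
    #include <stdio.h>

    struct group {
        pthread_mutex_t mark_mutex;
        int nr_marks;
    };

    struct mark {
        struct group *group;   /* back pointer, as in fsnotify_mark */
        int alive;
    };

    /* The caller names the owner, so the callee can take the group's
     * mutex up front rather than peeking at m->group under m's lock. */
    static void destroy_mark(struct mark *m, struct group *g)
    {
        pthread_mutex_lock(&g->mark_mutex);
        if (m->alive) {
            m->alive = 0;
            g->nr_marks--;
        }
        pthread_mutex_unlock(&g->mark_mutex);
    }

    int main(void)
    {
        struct group g = { PTHREAD_MUTEX_INITIALIZER, 1 };
        struct mark m = { &g, 1 };

        destroy_mark(&m, &g);
        destroy_mark(&m, &g);    /* second call is a harmless no-op */
        printf("marks left: %d\n", g.nr_marks);
        return 0;
    }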
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 7dceff005a67..e5f911bd80d2 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -4,7 +4,7 @@ config FANOTIFY
4 select ANON_INODES 4 select ANON_INODES
5 default n 5 default n
6 ---help--- 6 ---help---
7 Say Y here to enable fanotify suport. fanotify is a file access 7 Say Y here to enable fanotify support. fanotify is a file access
8 notification system which differs from inotify in that it sends 8 notification system which differs from inotify in that it sends
9 an open file descriptor to the userspace listener along with 9 an open file descriptor to the userspace listener along with
10 the event. 10 the event.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index a50636025364..0c2f9122b262 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -18,6 +18,12 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
18 old->tgid == new->tgid) { 18 old->tgid == new->tgid) {
19 switch (old->data_type) { 19 switch (old->data_type) {
20 case (FSNOTIFY_EVENT_PATH): 20 case (FSNOTIFY_EVENT_PATH):
21#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
22 /* don't merge two permission events */
23 if ((old->mask & FAN_ALL_PERM_EVENTS) &&
24 (new->mask & FAN_ALL_PERM_EVENTS))
25 return false;
26#endif
21 if ((old->path.mnt == new->path.mnt) && 27 if ((old->path.mnt == new->path.mnt) &&
22 (old->path.dentry == new->path.dentry)) 28 (old->path.dentry == new->path.dentry))
23 return true; 29 return true;
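
The fanotify.c hunk above refuses to merge two permission events, since each one needs its own allow/deny answer from userspace. A tiny predicate modeling that rule; the mask bits are placeholders for FAN_ALL_PERM_EVENTS, and the same-object check is elided.

    #include <stdbool.h>
    #include <stdio.h>

    #define EV_OPEN  0x1u
    #define EV_PERM  0x10000u   /* stands in for FAN_ALL_PERM_EVENTS */

    struct event { unsigned int mask; };

    /* Two permission events must stay distinct so each can collect its
     * own userspace response; plain notifications may coalesce. */
    static bool should_merge(const struct event *old, const struct event *new)
    {
        if ((old->mask & EV_PERM) && (new->mask & EV_PERM))
            return false;
        return true;
    }

    int main(void)
    {
        struct event n1 = { EV_OPEN }, n2 = { EV_OPEN };
        struct event p1 = { EV_PERM }, p2 = { EV_PERM };

        printf("notify+notify merge: %d\n", should_merge(&n1, &n2));
        printf("perm+perm merge:     %d\n", should_merge(&p1, &p2));
        return 0;
    }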
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 6fcaeb8c902e..9ff4a5ee6e20 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -17,6 +17,7 @@
17#include <asm/ioctls.h> 17#include <asm/ioctls.h>
18 18
19#include "../../mount.h" 19#include "../../mount.h"
20#include "../fdinfo.h"
20 21
21#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 22#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
22#define FANOTIFY_DEFAULT_MAX_MARKS 8192 23#define FANOTIFY_DEFAULT_MAX_MARKS 8192
@@ -396,8 +397,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
396 397
397 wake_up(&group->fanotify_data.access_waitq); 398 wake_up(&group->fanotify_data.access_waitq);
398#endif 399#endif
400
401 if (file->f_flags & FASYNC)
402 fsnotify_fasync(-1, file, 0);
403
399 /* matches the fanotify_init->fsnotify_alloc_group */ 404 /* matches the fanotify_init->fsnotify_alloc_group */
400 fsnotify_put_group(group); 405 fsnotify_destroy_group(group);
401 406
402 return 0; 407 return 0;
403} 408}
@@ -428,6 +433,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
428} 433}
429 434
430static const struct file_operations fanotify_fops = { 435static const struct file_operations fanotify_fops = {
436 .show_fdinfo = fanotify_show_fdinfo,
431 .poll = fanotify_poll, 437 .poll = fanotify_poll,
432 .read = fanotify_read, 438 .read = fanotify_read,
433 .write = fanotify_write, 439 .write = fanotify_write,
@@ -491,7 +497,8 @@ out:
491 497
492static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, 498static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
493 __u32 mask, 499 __u32 mask,
494 unsigned int flags) 500 unsigned int flags,
501 int *destroy)
495{ 502{
496 __u32 oldmask; 503 __u32 oldmask;
497 504
@@ -505,8 +512,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
505 } 512 }
506 spin_unlock(&fsn_mark->lock); 513 spin_unlock(&fsn_mark->lock);
507 514
508 if (!(oldmask & ~mask)) 515 *destroy = !(oldmask & ~mask);
509 fsnotify_destroy_mark(fsn_mark);
510 516
511 return mask & oldmask; 517 return mask & oldmask;
512} 518}
@@ -517,12 +523,17 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
517{ 523{
518 struct fsnotify_mark *fsn_mark = NULL; 524 struct fsnotify_mark *fsn_mark = NULL;
519 __u32 removed; 525 __u32 removed;
526 int destroy_mark;
520 527
521 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); 528 fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
522 if (!fsn_mark) 529 if (!fsn_mark)
523 return -ENOENT; 530 return -ENOENT;
524 531
525 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags); 532 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
533 &destroy_mark);
534 if (destroy_mark)
535 fsnotify_destroy_mark(fsn_mark, group);
536
526 fsnotify_put_mark(fsn_mark); 537 fsnotify_put_mark(fsn_mark);
527 if (removed & real_mount(mnt)->mnt_fsnotify_mask) 538 if (removed & real_mount(mnt)->mnt_fsnotify_mask)
528 fsnotify_recalc_vfsmount_mask(mnt); 539 fsnotify_recalc_vfsmount_mask(mnt);
@@ -536,12 +547,16 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
536{ 547{
537 struct fsnotify_mark *fsn_mark = NULL; 548 struct fsnotify_mark *fsn_mark = NULL;
538 __u32 removed; 549 __u32 removed;
550 int destroy_mark;
539 551
540 fsn_mark = fsnotify_find_inode_mark(group, inode); 552 fsn_mark = fsnotify_find_inode_mark(group, inode);
541 if (!fsn_mark) 553 if (!fsn_mark)
542 return -ENOENT; 554 return -ENOENT;
543 555
544 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags); 556 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
557 &destroy_mark);
558 if (destroy_mark)
559 fsnotify_destroy_mark(fsn_mark, group);
545 /* matches the fsnotify_find_inode_mark() */ 560 /* matches the fsnotify_find_inode_mark() */
546 fsnotify_put_mark(fsn_mark); 561 fsnotify_put_mark(fsn_mark);
547 if (removed & inode->i_fsnotify_mask) 562 if (removed & inode->i_fsnotify_mask)
@@ -708,13 +723,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
708 break; 723 break;
709 default: 724 default:
710 fd = -EINVAL; 725 fd = -EINVAL;
711 goto out_put_group; 726 goto out_destroy_group;
712 } 727 }
713 728
714 if (flags & FAN_UNLIMITED_QUEUE) { 729 if (flags & FAN_UNLIMITED_QUEUE) {
715 fd = -EPERM; 730 fd = -EPERM;
716 if (!capable(CAP_SYS_ADMIN)) 731 if (!capable(CAP_SYS_ADMIN))
717 goto out_put_group; 732 goto out_destroy_group;
718 group->max_events = UINT_MAX; 733 group->max_events = UINT_MAX;
719 } else { 734 } else {
720 group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS; 735 group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
@@ -723,7 +738,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
723 if (flags & FAN_UNLIMITED_MARKS) { 738 if (flags & FAN_UNLIMITED_MARKS) {
724 fd = -EPERM; 739 fd = -EPERM;
725 if (!capable(CAP_SYS_ADMIN)) 740 if (!capable(CAP_SYS_ADMIN))
726 goto out_put_group; 741 goto out_destroy_group;
727 group->fanotify_data.max_marks = UINT_MAX; 742 group->fanotify_data.max_marks = UINT_MAX;
728 } else { 743 } else {
729 group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; 744 group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
@@ -731,12 +746,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
731 746
732 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); 747 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
733 if (fd < 0) 748 if (fd < 0)
734 goto out_put_group; 749 goto out_destroy_group;
735 750
736 return fd; 751 return fd;
737 752
738out_put_group: 753out_destroy_group:
739 fsnotify_put_group(group); 754 fsnotify_destroy_group(group);
740 return fd; 755 return fd;
741} 756}
742 757
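
fanotify_init()'s error paths now unwind through a full group destroy rather than a bare reference put, so partially initialized state is torn down before the last reference drops. A compact sketch of that goto-unwind shape; grp_alloc/grp_destroy are illustrative, not the fsnotify API.

    #include <stdio.h>
    #include <stdlib.h>

    struct grp { int refs; int queued; };

    static struct grp *grp_alloc(void)
    {
        struct grp *g = calloc(1, sizeof(*g));
        if (g)
            g->refs = 1;
        return g;
    }

    /* Full teardown: drain queued state first, then drop the reference. */
    static void grp_destroy(struct grp *g)
    {
        g->queued = 0;
        if (--g->refs == 0)
            free(g);
    }

    static int init(int bad_flags)
    {
        struct grp *g = grp_alloc();

        if (!g)
            return -1;
        g->queued = 3;           /* pretend events piled up during setup */
        if (bad_flags)
            goto out_destroy;    /* a bare put here would strand g->queued */
        return 42;               /* success: reference now owned by the fd
                                    (intentionally not freed in this toy) */
    out_destroy:
        grp_destroy(g);
        return -1;
    }

    int main(void)
    {
        printf("good init -> %d, bad init -> %d\n", init(0), init(1));
        return 0;
    }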
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
new file mode 100644
index 000000000000..238a5930cb3c
--- /dev/null
+++ b/fs/notify/fdinfo.c
@@ -0,0 +1,179 @@
1#include <linux/file.h>
2#include <linux/fs.h>
3#include <linux/fsnotify_backend.h>
4#include <linux/idr.h>
5#include <linux/init.h>
6#include <linux/inotify.h>
7#include <linux/fanotify.h>
8#include <linux/kernel.h>
9#include <linux/namei.h>
10#include <linux/sched.h>
11#include <linux/types.h>
12#include <linux/seq_file.h>
13#include <linux/proc_fs.h>
14#include <linux/exportfs.h>
15
16#include "inotify/inotify.h"
17#include "../fs/mount.h"
18
19#if defined(CONFIG_PROC_FS)
20
21#if defined(CONFIG_INOTIFY_USER) || defined(CONFIG_FANOTIFY)
22
23static int show_fdinfo(struct seq_file *m, struct file *f,
24 int (*show)(struct seq_file *m, struct fsnotify_mark *mark))
25{
26 struct fsnotify_group *group = f->private_data;
27 struct fsnotify_mark *mark;
28 int ret = 0;
29
30 mutex_lock(&group->mark_mutex);
31 list_for_each_entry(mark, &group->marks_list, g_list) {
32 ret = show(m, mark);
33 if (ret)
34 break;
35 }
36 mutex_unlock(&group->mark_mutex);
37 return ret;
38}
39
40#if defined(CONFIG_EXPORTFS)
41static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
42{
43 struct {
44 struct file_handle handle;
45 u8 pad[64];
46 } f;
47 int size, ret, i;
48
49 f.handle.handle_bytes = sizeof(f.pad);
50 size = f.handle.handle_bytes >> 2;
51
52 ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
53 if ((ret == 255) || (ret == -ENOSPC)) {
54 WARN_ONCE(1, "Can't encode file handle for inotify: %d\n", ret);
55 return 0;
56 }
57
58 f.handle.handle_type = ret;
59 f.handle.handle_bytes = size * sizeof(u32);
60
61 ret = seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
62 f.handle.handle_bytes, f.handle.handle_type);
63
64 for (i = 0; i < f.handle.handle_bytes; i++)
65 ret |= seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
66
67 return ret;
68}
69#else
70static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
71{
72 return 0;
73}
74#endif
75
76#ifdef CONFIG_INOTIFY_USER
77
78static int inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
79{
80 struct inotify_inode_mark *inode_mark;
81 struct inode *inode;
82 int ret = 0;
83
84 if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE)))
85 return 0;
86
87 inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
88 inode = igrab(mark->i.inode);
89 if (inode) {
90 ret = seq_printf(m, "inotify wd:%x ino:%lx sdev:%x "
91 "mask:%x ignored_mask:%x ",
92 inode_mark->wd, inode->i_ino,
93 inode->i_sb->s_dev,
94 mark->mask, mark->ignored_mask);
95 ret |= show_mark_fhandle(m, inode);
96 ret |= seq_putc(m, '\n');
97 iput(inode);
98 }
99
100 return ret;
101}
102
103int inotify_show_fdinfo(struct seq_file *m, struct file *f)
104{
105 return show_fdinfo(m, f, inotify_fdinfo);
106}
107
108#endif /* CONFIG_INOTIFY_USER */
109
110#ifdef CONFIG_FANOTIFY
111
112static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
113{
114 unsigned int mflags = 0;
115 struct inode *inode;
116 int ret = 0;
117
118 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE))
119 return 0;
120
121 if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
122 mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
123
124 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
125 inode = igrab(mark->i.inode);
126 if (!inode)
127 goto out;
128 ret = seq_printf(m, "fanotify ino:%lx sdev:%x "
129 "mflags:%x mask:%x ignored_mask:%x ",
130 inode->i_ino, inode->i_sb->s_dev,
131 mflags, mark->mask, mark->ignored_mask);
132 ret |= show_mark_fhandle(m, inode);
133 ret |= seq_putc(m, '\n');
134 iput(inode);
135 } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) {
136 struct mount *mnt = real_mount(mark->m.mnt);
137
138 ret = seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x "
139 "ignored_mask:%x\n", mnt->mnt_id, mflags,
140 mark->mask, mark->ignored_mask);
141 }
142out:
143 return ret;
144}
145
146int fanotify_show_fdinfo(struct seq_file *m, struct file *f)
147{
148 struct fsnotify_group *group = f->private_data;
149 unsigned int flags = 0;
150
151 switch (group->priority) {
152 case FS_PRIO_0:
153 flags |= FAN_CLASS_NOTIF;
154 break;
155 case FS_PRIO_1:
156 flags |= FAN_CLASS_CONTENT;
157 break;
158 case FS_PRIO_2:
159 flags |= FAN_CLASS_PRE_CONTENT;
160 break;
161 }
162
163 if (group->max_events == UINT_MAX)
164 flags |= FAN_UNLIMITED_QUEUE;
165
166 if (group->fanotify_data.max_marks == UINT_MAX)
167 flags |= FAN_UNLIMITED_MARKS;
168
169 seq_printf(m, "fanotify flags:%x event-flags:%x\n",
170 flags, group->fanotify_data.f_flags);
171
172 return show_fdinfo(m, f, fanotify_fdinfo);
173}
174
175#endif /* CONFIG_FANOTIFY */
176
177#endif /* CONFIG_INOTIFY_USER || CONFIG_FANOTIFY */
178
179#endif /* CONFIG_PROC_FS */
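
The new fdinfo.c encodes a file handle into a fixed pad and hex-dumps it. Userspace can produce the same kind of handle with name_to_handle_at(2); the sketch below mirrors the fhandle-bytes/fhandle-type/f_handle output format (the path is arbitrary, and the program is Linux-only).

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>

    int main(void)
    {
        struct {
            struct file_handle handle;
            unsigned char pad[64];           /* room for the handle bytes */
        } f = { .handle = { .handle_bytes = 64 } };
        int mount_id, i;

        if (name_to_handle_at(AT_FDCWD, "/etc/hostname",
                              &f.handle, &mount_id, 0) < 0) {
            perror("name_to_handle_at");
            return 1;
        }
        /* Same layout as the fdinfo line emitted by show_mark_fhandle(). */
        printf("fhandle-bytes:%x fhandle-type:%x f_handle:",
               f.handle.handle_bytes, (unsigned)f.handle.handle_type);
        for (i = 0; i < (int)f.handle.handle_bytes; i++)
            printf("%02x", f.handle.f_handle[i]);
        putchar('\n');
        return 0;
    }

The fixed pad behind the flexible f_handle array is the same trick fdinfo.c uses to bound the encoded size.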
diff --git a/fs/notify/fdinfo.h b/fs/notify/fdinfo.h
new file mode 100644
index 000000000000..556afda990e9
--- /dev/null
+++ b/fs/notify/fdinfo.h
@@ -0,0 +1,27 @@
1#ifndef __FSNOTIFY_FDINFO_H__
2#define __FSNOTIFY_FDINFO_H__
3
4#include <linux/errno.h>
5#include <linux/proc_fs.h>
6
7struct seq_file;
8struct file;
9
10#ifdef CONFIG_PROC_FS
11
12#ifdef CONFIG_INOTIFY_USER
13extern int inotify_show_fdinfo(struct seq_file *m, struct file *f);
14#endif
15
16#ifdef CONFIG_FANOTIFY
17extern int fanotify_show_fdinfo(struct seq_file *m, struct file *f);
18#endif
19
20#else /* CONFIG_PROC_FS */
21
22#define inotify_show_fdinfo NULL
23#define fanotify_show_fdinfo NULL
24
25#endif /* CONFIG_PROC_FS */
26
27#endif /* __FSNOTIFY_FDINFO_H__ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 63fc294a4692..bd2625bd88b4 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -33,9 +33,6 @@
33 */ 33 */
34void fsnotify_final_destroy_group(struct fsnotify_group *group) 34void fsnotify_final_destroy_group(struct fsnotify_group *group)
35{ 35{
36 /* clear the notification queue of all events */
37 fsnotify_flush_notify(group);
38
39 if (group->ops->free_group_priv) 36 if (group->ops->free_group_priv)
40 group->ops->free_group_priv(group); 37 group->ops->free_group_priv(group);
41 38
@@ -43,23 +40,30 @@ void fsnotify_final_destroy_group(struct fsnotify_group *group)
43} 40}
44 41
45/* 42/*
46 * Trying to get rid of a group. We need to first get rid of any outstanding 43 * Trying to get rid of a group. Remove all marks, flush all events and release
47 * allocations and then free the group. Remember that fsnotify_clear_marks_by_group 44 * the group reference.
48 * could miss marks that are being freed by inode and those marks could still 45 * Note that another thread calling fsnotify_clear_marks_by_group() may still
49 * hold a reference to this group (via group->num_marks) If we get into that 46 * hold a ref to the group.
50 * situtation, the fsnotify_final_destroy_group will get called when that final
51 * mark is freed.
52 */ 47 */
53static void fsnotify_destroy_group(struct fsnotify_group *group) 48void fsnotify_destroy_group(struct fsnotify_group *group)
54{ 49{
55 /* clear all inode marks for this group */ 50 /* clear all inode marks for this group */
56 fsnotify_clear_marks_by_group(group); 51 fsnotify_clear_marks_by_group(group);
57 52
58 synchronize_srcu(&fsnotify_mark_srcu); 53 synchronize_srcu(&fsnotify_mark_srcu);
59 54
60 /* past the point of no return, matches the initial value of 1 */ 55 /* clear the notification queue of all events */
61 if (atomic_dec_and_test(&group->num_marks)) 56 fsnotify_flush_notify(group);
62 fsnotify_final_destroy_group(group); 57
58 fsnotify_put_group(group);
59}
60
61/*
62 * Get reference to a group.
63 */
64void fsnotify_get_group(struct fsnotify_group *group)
65{
66 atomic_inc(&group->refcnt);
63} 67}
64 68
65/* 69/*
@@ -68,7 +72,7 @@ static void fsnotify_destroy_group(struct fsnotify_group *group)
68void fsnotify_put_group(struct fsnotify_group *group) 72void fsnotify_put_group(struct fsnotify_group *group)
69{ 73{
70 if (atomic_dec_and_test(&group->refcnt)) 74 if (atomic_dec_and_test(&group->refcnt))
71 fsnotify_destroy_group(group); 75 fsnotify_final_destroy_group(group);
72} 76}
73 77
74/* 78/*
@@ -84,21 +88,24 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
84 88
85 /* set to 0 when there are no external references to this group */ 89 /* set to 0 when there are no external references to this group */
86 atomic_set(&group->refcnt, 1); 90 atomic_set(&group->refcnt, 1);
87 /* 91 atomic_set(&group->num_marks, 0);
88 * hits 0 when there are no external references AND no marks for
89 * this group
90 */
91 atomic_set(&group->num_marks, 1);
92 92
93 mutex_init(&group->notification_mutex); 93 mutex_init(&group->notification_mutex);
94 INIT_LIST_HEAD(&group->notification_list); 94 INIT_LIST_HEAD(&group->notification_list);
95 init_waitqueue_head(&group->notification_waitq); 95 init_waitqueue_head(&group->notification_waitq);
96 group->max_events = UINT_MAX; 96 group->max_events = UINT_MAX;
97 97
98 spin_lock_init(&group->mark_lock); 98 mutex_init(&group->mark_mutex);
99 INIT_LIST_HEAD(&group->marks_list); 99 INIT_LIST_HEAD(&group->marks_list);
100 100
101 group->ops = ops; 101 group->ops = ops;
102 102
103 return group; 103 return group;
104} 104}
105
106int fsnotify_fasync(int fd, struct file *file, int on)
107{
108 struct fsnotify_group *group = file->private_data;
109
110 return fasync_helper(fd, file, on, &group->fsn_fa) >= 0 ? 0 : -EIO;
111}
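
group.c reworks group lifetime: fsnotify_put_group() now only frees on the last reference, while fsnotify_destroy_group() first clears marks and flushes the event queue, then drops the initial reference. A runnable model of that split; C11 atomics stand in for the kernel's atomic_t, and the field names are illustrative.

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct group {
        atomic_int refcnt;
        int marks, events;
    };

    static struct group *group_alloc(void)
    {
        struct group *g = calloc(1, sizeof(*g));
        if (g)
            atomic_init(&g->refcnt, 1);
        return g;
    }

    static void group_get(struct group *g) { atomic_fetch_add(&g->refcnt, 1); }

    /* put: free only when the last reference goes away */
    static void group_put(struct group *g)
    {
        if (atomic_fetch_sub(&g->refcnt, 1) == 1) {
            printf("final free (marks=%d events=%d)\n", g->marks, g->events);
            free(g);
        }
    }

    /* destroy: detach marks, flush events, then drop the initial ref */
    static void group_destroy(struct group *g)
    {
        g->marks = 0;    /* models fsnotify_clear_marks_by_group() */
        g->events = 0;   /* models fsnotify_flush_notify() */
        group_put(g);
    }

    int main(void)
    {
        struct group *g = group_alloc();

        group_get(g);        /* e.g. an event still points at the group */
        group_destroy(g);    /* listener closes its fd */
        group_put(g);        /* event consumed: now the memory goes away */
        return 0;
    }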
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index b13c00ac48eb..f31e90fc050d 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -63,8 +63,8 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
63{ 63{
64 struct inode *inode = mark->i.inode; 64 struct inode *inode = mark->i.inode;
65 65
66 BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
66 assert_spin_locked(&mark->lock); 67 assert_spin_locked(&mark->lock);
67 assert_spin_locked(&mark->group->mark_lock);
68 68
69 spin_lock(&inode->i_lock); 69 spin_lock(&inode->i_lock);
70 70
@@ -99,8 +99,16 @@ void fsnotify_clear_marks_by_inode(struct inode *inode)
99 spin_unlock(&inode->i_lock); 99 spin_unlock(&inode->i_lock);
100 100
101 list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) { 101 list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) {
102 fsnotify_destroy_mark(mark); 102 struct fsnotify_group *group;
103
104 spin_lock(&mark->lock);
105 fsnotify_get_group(mark->group);
106 group = mark->group;
107 spin_unlock(&mark->lock);
108
109 fsnotify_destroy_mark(mark, group);
103 fsnotify_put_mark(mark); 110 fsnotify_put_mark(mark);
111 fsnotify_put_group(group);
104 } 112 }
105} 113}
106 114
@@ -116,8 +124,9 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
116 * given a group and inode, find the mark associated with that combination. 124 * given a group and inode, find the mark associated with that combination.
117 * if found take a reference to that mark and return it, else return NULL 125 * if found take a reference to that mark and return it, else return NULL
118 */ 126 */
119struct fsnotify_mark *fsnotify_find_inode_mark_locked(struct fsnotify_group *group, 127static struct fsnotify_mark *fsnotify_find_inode_mark_locked(
120 struct inode *inode) 128 struct fsnotify_group *group,
129 struct inode *inode)
121{ 130{
122 struct fsnotify_mark *mark; 131 struct fsnotify_mark *mark;
123 struct hlist_node *pos; 132 struct hlist_node *pos;
@@ -191,8 +200,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
191 200
192 mark->flags |= FSNOTIFY_MARK_FLAG_INODE; 201 mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
193 202
203 BUG_ON(!mutex_is_locked(&group->mark_mutex));
194 assert_spin_locked(&mark->lock); 204 assert_spin_locked(&mark->lock);
195 assert_spin_locked(&group->mark_lock);
196 205
197 spin_lock(&inode->i_lock); 206 spin_lock(&inode->i_lock);
198 207
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index e3cbd746f64a..871569c7d609 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -118,6 +118,7 @@ static int inotify_handle_event(struct fsnotify_group *group,
118 118
119 fsn_event_priv = &event_priv->fsnotify_event_priv_data; 119 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
120 120
121 fsnotify_get_group(group);
121 fsn_event_priv->group = group; 122 fsn_event_priv->group = group;
122 event_priv->wd = wd; 123 event_priv->wd = wd;
123 124
@@ -131,7 +132,7 @@ static int inotify_handle_event(struct fsnotify_group *group,
131 } 132 }
132 133
133 if (inode_mark->mask & IN_ONESHOT) 134 if (inode_mark->mask & IN_ONESHOT)
134 fsnotify_destroy_mark(inode_mark); 135 fsnotify_destroy_mark(inode_mark, group);
135 136
136 return ret; 137 return ret;
137} 138}
@@ -210,6 +211,7 @@ void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
210 event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, 211 event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
211 fsnotify_event_priv_data); 212 fsnotify_event_priv_data);
212 213
214 fsnotify_put_group(fsn_event_priv->group);
213 kmem_cache_free(event_priv_cachep, event_priv); 215 kmem_cache_free(event_priv_cachep, event_priv);
214} 216}
215 217
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index c311dda054a3..228a2c2ad8d7 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -40,6 +40,7 @@
40#include <linux/wait.h> 40#include <linux/wait.h>
41 41
42#include "inotify.h" 42#include "inotify.h"
43#include "../fdinfo.h"
43 44
44#include <asm/ioctls.h> 45#include <asm/ioctls.h>
45 46
@@ -264,7 +265,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
264 ret = -EAGAIN; 265 ret = -EAGAIN;
265 if (file->f_flags & O_NONBLOCK) 266 if (file->f_flags & O_NONBLOCK)
266 break; 267 break;
267 ret = -EINTR; 268 ret = -ERESTARTSYS;
268 if (signal_pending(current)) 269 if (signal_pending(current))
269 break; 270 break;
270 271
@@ -280,23 +281,17 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
280 return ret; 281 return ret;
281} 282}
282 283
283static int inotify_fasync(int fd, struct file *file, int on)
284{
285 struct fsnotify_group *group = file->private_data;
286
287 return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO;
288}
289
290static int inotify_release(struct inode *ignored, struct file *file) 284static int inotify_release(struct inode *ignored, struct file *file)
291{ 285{
292 struct fsnotify_group *group = file->private_data; 286 struct fsnotify_group *group = file->private_data;
293 287
294 pr_debug("%s: group=%p\n", __func__, group); 288 pr_debug("%s: group=%p\n", __func__, group);
295 289
296 fsnotify_clear_marks_by_group(group); 290 if (file->f_flags & FASYNC)
291 fsnotify_fasync(-1, file, 0);
297 292
298 /* free this group, matching get was inotify_init->fsnotify_obtain_group */ 293 /* free this group, matching get was inotify_init->fsnotify_obtain_group */
299 fsnotify_put_group(group); 294 fsnotify_destroy_group(group);
300 295
301 return 0; 296 return 0;
302} 297}
@@ -335,9 +330,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
335} 330}
336 331
337static const struct file_operations inotify_fops = { 332static const struct file_operations inotify_fops = {
333 .show_fdinfo = inotify_show_fdinfo,
338 .poll = inotify_poll, 334 .poll = inotify_poll,
339 .read = inotify_read, 335 .read = inotify_read,
340 .fasync = inotify_fasync, 336 .fasync = fsnotify_fasync,
341 .release = inotify_release, 337 .release = inotify_release,
342 .unlocked_ioctl = inotify_ioctl, 338 .unlocked_ioctl = inotify_ioctl,
343 .compat_ioctl = inotify_ioctl, 339 .compat_ioctl = inotify_ioctl,
@@ -519,13 +515,13 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
519 struct fsnotify_event_private_data *fsn_event_priv; 515 struct fsnotify_event_private_data *fsn_event_priv;
520 int ret; 516 int ret;
521 517
518 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
519
522 ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, 520 ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
523 FSNOTIFY_EVENT_NONE, NULL, 0, 521 FSNOTIFY_EVENT_NONE, NULL, 0,
524 GFP_NOFS); 522 GFP_NOFS);
525 if (!ignored_event) 523 if (!ignored_event)
526 return; 524 goto skip_send_ignore;
527
528 i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
529 525
530 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); 526 event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
531 if (unlikely(!event_priv)) 527 if (unlikely(!event_priv))
@@ -533,6 +529,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
533 529
534 fsn_event_priv = &event_priv->fsnotify_event_priv_data; 530 fsn_event_priv = &event_priv->fsnotify_event_priv_data;
535 531
532 fsnotify_get_group(group);
536 fsn_event_priv->group = group; 533 fsn_event_priv->group = group;
537 event_priv->wd = i_mark->wd; 534 event_priv->wd = i_mark->wd;
538 535
@@ -546,9 +543,9 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
546 } 543 }
547 544
548skip_send_ignore: 545skip_send_ignore:
549
550 /* matches the reference taken when the event was created */ 546 /* matches the reference taken when the event was created */
551 fsnotify_put_event(ignored_event); 547 if (ignored_event)
548 fsnotify_put_event(ignored_event);
552 549
553 /* remove this mark from the idr */ 550 /* remove this mark from the idr */
554 inotify_remove_from_idr(group, i_mark); 551 inotify_remove_from_idr(group, i_mark);
@@ -707,12 +704,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
707 spin_lock_init(&group->inotify_data.idr_lock); 704 spin_lock_init(&group->inotify_data.idr_lock);
708 idr_init(&group->inotify_data.idr); 705 idr_init(&group->inotify_data.idr);
709 group->inotify_data.last_wd = 0; 706 group->inotify_data.last_wd = 0;
710 group->inotify_data.fa = NULL;
711 group->inotify_data.user = get_current_user(); 707 group->inotify_data.user = get_current_user();
712 708
713 if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > 709 if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
714 inotify_max_user_instances) { 710 inotify_max_user_instances) {
715 fsnotify_put_group(group); 711 fsnotify_destroy_group(group);
716 return ERR_PTR(-EMFILE); 712 return ERR_PTR(-EMFILE);
717 } 713 }
718 714
@@ -741,7 +737,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
741 ret = anon_inode_getfd("inotify", &inotify_fops, group, 737 ret = anon_inode_getfd("inotify", &inotify_fops, group,
742 O_RDONLY | flags); 738 O_RDONLY | flags);
743 if (ret < 0) 739 if (ret < 0)
744 fsnotify_put_group(group); 740 fsnotify_destroy_group(group);
745 741
746 return ret; 742 return ret;
747} 743}
@@ -817,7 +813,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
817 813
818 ret = 0; 814 ret = 0;
819 815
820 fsnotify_destroy_mark(&i_mark->fsn_mark); 816 fsnotify_destroy_mark(&i_mark->fsn_mark, group);
821 817
822 /* match ref taken by inotify_idr_find */ 818 /* match ref taken by inotify_idr_find */
823 fsnotify_put_mark(&i_mark->fsn_mark); 819 fsnotify_put_mark(&i_mark->fsn_mark);
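
The inotify hunks above swap the private fasync bookkeeping for the shared fsnotify helper, return -ERESTARTSYS instead of -EINTR on signals, and route group teardown through fsnotify_destroy_group(). The userspace contract is untouched: removing a watch still queues an IN_IGNORED event before the wd can be recycled, which is the path inotify_ignored_and_remove_idr() serves. A minimal userspace sketch of that round trip (plain inotify API, nothing patch-specific):

/* Minimal userspace sketch: add a watch, drop it, and read back the
 * IN_IGNORED event the kernel queues when the watch dies.
 * Build with: cc -o inotify-demo inotify-demo.c */
#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
        char buf[4096];
        int fd = inotify_init1(IN_CLOEXEC);
        if (fd < 0) { perror("inotify_init1"); return 1; }

        int wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
        if (wd < 0) { perror("inotify_add_watch"); return 1; }

        /* Removing the watch queues IN_IGNORED for this wd. */
        inotify_rm_watch(fd, wd);

        ssize_t len = read(fd, buf, sizeof(buf));
        for (char *p = buf; len > 0 && p < buf + len; ) {
                struct inotify_event *ev = (struct inotify_event *)p;
                if (ev->mask & IN_IGNORED)
                        printf("wd %d: IN_IGNORED (watch is gone)\n", ev->wd);
                p += sizeof(*ev) + ev->len;
        }
        close(fd);
        return 0;
}
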
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index f104d565b682..fc6b49bf7360 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -109,8 +109,11 @@ void fsnotify_get_mark(struct fsnotify_mark *mark)
109 109
110void fsnotify_put_mark(struct fsnotify_mark *mark) 110void fsnotify_put_mark(struct fsnotify_mark *mark)
111{ 111{
112 if (atomic_dec_and_test(&mark->refcnt)) 112 if (atomic_dec_and_test(&mark->refcnt)) {
113 if (mark->group)
114 fsnotify_put_group(mark->group);
113 mark->free_mark(mark); 115 mark->free_mark(mark);
116 }
114} 117}
115 118
116/* 119/*
@@ -118,14 +121,14 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
118 * The caller had better be holding a reference to this mark so we don't actually 121 * The caller had better be holding a reference to this mark so we don't actually
119 * do the final put under the mark->lock 122 * do the final put under the mark->lock
120 */ 123 */
121void fsnotify_destroy_mark(struct fsnotify_mark *mark) 124void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
125 struct fsnotify_group *group)
122{ 126{
123 struct fsnotify_group *group;
124 struct inode *inode = NULL; 127 struct inode *inode = NULL;
125 128
126 spin_lock(&mark->lock); 129 BUG_ON(!mutex_is_locked(&group->mark_mutex));
127 130
128 group = mark->group; 131 spin_lock(&mark->lock);
129 132
130 /* something else already called this function on this mark */ 133 /* something else already called this function on this mark */
131 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { 134 if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
@@ -135,8 +138,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
135 138
136 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; 139 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
137 140
138 spin_lock(&group->mark_lock);
139
140 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { 141 if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
141 inode = mark->i.inode; 142 inode = mark->i.inode;
142 fsnotify_destroy_inode_mark(mark); 143 fsnotify_destroy_inode_mark(mark);
@@ -147,13 +148,22 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
147 148
148 list_del_init(&mark->g_list); 149 list_del_init(&mark->g_list);
149 150
150 spin_unlock(&group->mark_lock);
151 spin_unlock(&mark->lock); 151 spin_unlock(&mark->lock);
152 152
153 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
154 iput(inode);
155 /* release lock temporarily */
156 mutex_unlock(&group->mark_mutex);
157
153 spin_lock(&destroy_lock); 158 spin_lock(&destroy_lock);
154 list_add(&mark->destroy_list, &destroy_list); 159 list_add(&mark->destroy_list, &destroy_list);
155 spin_unlock(&destroy_lock); 160 spin_unlock(&destroy_lock);
156 wake_up(&destroy_waitq); 161 wake_up(&destroy_waitq);
162 /*
163 * We don't necessarily have a ref on mark from caller so the above destroy
164 * may have actually freed it, unless this group provides a 'freeing_mark'
165 * function which must be holding a reference.
166 */
157 167
158 /* 168 /*
159 * Some groups like to know that marks are being freed. This is a 169 * Some groups like to know that marks are being freed. This is a
@@ -175,21 +185,17 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
175 * is just a lazy update (and could be a perf win...) 185 * is just a lazy update (and could be a perf win...)
176 */ 186 */
177 187
178 if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) 188 atomic_dec(&group->num_marks);
179 iput(inode);
180 189
181 /* 190 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
182 * We don't necessarily have a ref on mark from caller so the above iput 191}
183 * may have already destroyed it. Don't touch from now on.
184 */
185 192
186 /* 193void fsnotify_destroy_mark(struct fsnotify_mark *mark,
187 * it's possible that this group tried to destroy itself, but this 194 struct fsnotify_group *group)
188 * this mark was simultaneously being freed by inode. If that's the 195{
189 * case, we finish freeing the group here. 196 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
190 */ 197 fsnotify_destroy_mark_locked(mark, group);
191 if (unlikely(atomic_dec_and_test(&group->num_marks))) 198 mutex_unlock(&group->mark_mutex);
192 fsnotify_final_destroy_group(group);
193} 199}
194 200
195void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) 201void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
@@ -214,26 +220,26 @@ void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mas
214 * These marks may be used for the fsnotify backend to determine which 220 * These marks may be used for the fsnotify backend to determine which
215 * event types should be delivered to which group. 221 * event types should be delivered to which group.
216 */ 222 */
217int fsnotify_add_mark(struct fsnotify_mark *mark, 223int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
218 struct fsnotify_group *group, struct inode *inode, 224 struct fsnotify_group *group, struct inode *inode,
219 struct vfsmount *mnt, int allow_dups) 225 struct vfsmount *mnt, int allow_dups)
220{ 226{
221 int ret = 0; 227 int ret = 0;
222 228
223 BUG_ON(inode && mnt); 229 BUG_ON(inode && mnt);
224 BUG_ON(!inode && !mnt); 230 BUG_ON(!inode && !mnt);
231 BUG_ON(!mutex_is_locked(&group->mark_mutex));
225 232
226 /* 233 /*
227 * LOCKING ORDER!!!! 234 * LOCKING ORDER!!!!
235 * group->mark_mutex
228 * mark->lock 236 * mark->lock
229 * group->mark_lock
230 * inode->i_lock 237 * inode->i_lock
231 */ 238 */
232 spin_lock(&mark->lock); 239 spin_lock(&mark->lock);
233 spin_lock(&group->mark_lock);
234
235 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; 240 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
236 241
242 fsnotify_get_group(group);
237 mark->group = group; 243 mark->group = group;
238 list_add(&mark->g_list, &group->marks_list); 244 list_add(&mark->g_list, &group->marks_list);
239 atomic_inc(&group->num_marks); 245 atomic_inc(&group->num_marks);
@@ -251,11 +257,8 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,
251 BUG(); 257 BUG();
252 } 258 }
253 259
254 spin_unlock(&group->mark_lock);
255
256 /* this will pin the object if appropriate */ 260 /* this will pin the object if appropriate */
257 fsnotify_set_mark_mask_locked(mark, mark->mask); 261 fsnotify_set_mark_mask_locked(mark, mark->mask);
258
259 spin_unlock(&mark->lock); 262 spin_unlock(&mark->lock);
260 263
261 if (inode) 264 if (inode)
@@ -265,10 +268,10 @@ int fsnotify_add_mark(struct fsnotify_mark *mark,
265err: 268err:
266 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; 269 mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
267 list_del_init(&mark->g_list); 270 list_del_init(&mark->g_list);
271 fsnotify_put_group(group);
268 mark->group = NULL; 272 mark->group = NULL;
269 atomic_dec(&group->num_marks); 273 atomic_dec(&group->num_marks);
270 274
271 spin_unlock(&group->mark_lock);
272 spin_unlock(&mark->lock); 275 spin_unlock(&mark->lock);
273 276
274 spin_lock(&destroy_lock); 277 spin_lock(&destroy_lock);
@@ -279,6 +282,16 @@ err:
279 return ret; 282 return ret;
280} 283}
281 284
285int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
286 struct inode *inode, struct vfsmount *mnt, int allow_dups)
287{
288 int ret;
289 mutex_lock(&group->mark_mutex);
290 ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups);
291 mutex_unlock(&group->mark_mutex);
292 return ret;
293}
294
282/* 295/*
283 * clear any marks in a group in which mark->flags & flags is true 296 * clear any marks in a group in which mark->flags & flags is true
284 */ 297 */
@@ -286,22 +299,16 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
286 unsigned int flags) 299 unsigned int flags)
287{ 300{
288 struct fsnotify_mark *lmark, *mark; 301 struct fsnotify_mark *lmark, *mark;
289 LIST_HEAD(free_list);
290 302
291 spin_lock(&group->mark_lock); 303 mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
292 list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { 304 list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
293 if (mark->flags & flags) { 305 if (mark->flags & flags) {
294 list_add(&mark->free_g_list, &free_list);
295 list_del_init(&mark->g_list);
296 fsnotify_get_mark(mark); 306 fsnotify_get_mark(mark);
307 fsnotify_destroy_mark_locked(mark, group);
308 fsnotify_put_mark(mark);
297 } 309 }
298 } 310 }
299 spin_unlock(&group->mark_lock); 311 mutex_unlock(&group->mark_mutex);
300
301 list_for_each_entry_safe(mark, lmark, &free_list, free_g_list) {
302 fsnotify_destroy_mark(mark);
303 fsnotify_put_mark(mark);
304 }
305} 312}
306 313
307/* 314/*
@@ -317,6 +324,8 @@ void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *ol
317 assert_spin_locked(&old->lock); 324 assert_spin_locked(&old->lock);
318 new->i.inode = old->i.inode; 325 new->i.inode = old->i.inode;
319 new->m.mnt = old->m.mnt; 326 new->m.mnt = old->m.mnt;
327 if (old->group)
328 fsnotify_get_group(old->group);
320 new->group = old->group; 329 new->group = old->group;
321 new->mask = old->mask; 330 new->mask = old->mask;
322 new->free_mark = old->free_mark; 331 new->free_mark = old->free_mark;
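
The mark.c rework trades the group->mark_lock spinlock for a mark_mutex and splits the add/destroy entry points into a _locked core plus a thin wrapper, so callers that already hold the mutex (fsnotify_clear_marks_by_group_flags() above) can call straight into the core. A generic sketch of that split, with pthreads standing in for kernel primitives and hypothetical names throughout:

/* Sketch of the "_locked core + locking wrapper" split used above;
 * pthread_mutex_t stands in for group->mark_mutex, names are hypothetical. */
#include <pthread.h>

struct grp {
        pthread_mutex_t mark_mutex;
        /* ... list of marks ... */
};

/* Core: caller must already hold g->mark_mutex
 * (mirrors fsnotify_destroy_mark_locked()). */
static void destroy_mark_locked(struct grp *g)
{
        /* unlink the mark, drop object pins, queue it for freeing ... */
}

/* Wrapper: takes the lock around the core (mirrors fsnotify_destroy_mark()). */
static void destroy_mark(struct grp *g)
{
        pthread_mutex_lock(&g->mark_mutex);
        destroy_mark_locked(g);
        pthread_mutex_unlock(&g->mark_mutex);
}

The kernel version additionally drops the mutex around the destroy-list handoff and retakes it with mutex_lock_nested(..., SINGLE_DEPTH_NESTING), which is why the clear-by-flags loop above can call the locked core directly.
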
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index c887b1378f7e..7b51b05f160c 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -18,7 +18,7 @@
18 18
19/* 19/*
20 * Basic idea behind the notification queue: An fsnotify group (like inotify) 20 * Basic idea behind the notification queue: An fsnotify group (like inotify)
21 * sends the userspace notification about events asyncronously some time after 21 * sends the userspace notification about events asynchronously some time after
22 * the event happened. When inotify gets an event it will need to add that 22 * the event happened. When inotify gets an event it will need to add that
23 * event to the group notify queue. Since a single event might need to be on 23 * event to the group notify queue. Since a single event might need to be on
24 * multiple group's notification queues we can't add the event directly to each 24 * multiple group's notification queues we can't add the event directly to each
@@ -225,6 +225,7 @@ alloc_holder:
225 mutex_unlock(&group->notification_mutex); 225 mutex_unlock(&group->notification_mutex);
226 226
227 wake_up(&group->notification_waitq); 227 wake_up(&group->notification_waitq);
228 kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);
228 return return_event; 229 return return_event;
229} 230}
230 231
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index b7b4b0e8554f..4df58b8ea64a 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -46,8 +46,16 @@ void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
46 spin_unlock(&mnt->mnt_root->d_lock); 46 spin_unlock(&mnt->mnt_root->d_lock);
47 47
48 list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) { 48 list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) {
49 fsnotify_destroy_mark(mark); 49 struct fsnotify_group *group;
50
51 spin_lock(&mark->lock);
52 fsnotify_get_group(mark->group);
53 group = mark->group;
54 spin_unlock(&mark->lock);
55
56 fsnotify_destroy_mark(mark, group);
50 fsnotify_put_mark(mark); 57 fsnotify_put_mark(mark);
58 fsnotify_put_group(group);
51 } 59 }
52} 60}
53 61
@@ -88,8 +96,8 @@ void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
88{ 96{
89 struct vfsmount *mnt = mark->m.mnt; 97 struct vfsmount *mnt = mark->m.mnt;
90 98
99 BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
91 assert_spin_locked(&mark->lock); 100 assert_spin_locked(&mark->lock);
92 assert_spin_locked(&mark->group->mark_lock);
93 101
94 spin_lock(&mnt->mnt_root->d_lock); 102 spin_lock(&mnt->mnt_root->d_lock);
95 103
@@ -151,8 +159,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
151 159
152 mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; 160 mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
153 161
162 BUG_ON(!mutex_is_locked(&group->mark_mutex));
154 assert_spin_locked(&mark->lock); 163 assert_spin_locked(&mark->lock);
155 assert_spin_locked(&group->mark_lock);
156 164
157 spin_lock(&mnt->mnt_root->d_lock); 165 spin_lock(&mnt->mnt_root->d_lock);
158 166
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 1ecf46448f85..5b2d4f0853ac 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1762,6 +1762,16 @@ err_out:
1762 return err; 1762 return err;
1763} 1763}
1764 1764
1765static void ntfs_write_failed(struct address_space *mapping, loff_t to)
1766{
1767 struct inode *inode = mapping->host;
1768
1769 if (to > inode->i_size) {
1770 truncate_pagecache(inode, to, inode->i_size);
1771 ntfs_truncate_vfs(inode);
1772 }
1773}
1774
1765/** 1775/**
1766 * ntfs_file_buffered_write - 1776 * ntfs_file_buffered_write -
1767 * 1777 *
@@ -2022,8 +2032,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2022 * allocated space, which is not a disaster. 2032 * allocated space, which is not a disaster.
2023 */ 2033 */
2024 i_size = i_size_read(vi); 2034 i_size = i_size_read(vi);
2025 if (pos + bytes > i_size) 2035 if (pos + bytes > i_size) {
2026 vmtruncate(vi, i_size); 2036 ntfs_write_failed(mapping, pos + bytes);
2037 }
2027 break; 2038 break;
2028 } 2039 }
2029 } 2040 }
@@ -2227,7 +2238,6 @@ const struct file_operations ntfs_file_ops = {
2227 2238
2228const struct inode_operations ntfs_file_inode_ops = { 2239const struct inode_operations ntfs_file_inode_ops = {
2229#ifdef NTFS_RW 2240#ifdef NTFS_RW
2230 .truncate = ntfs_truncate_vfs,
2231 .setattr = ntfs_setattr, 2241 .setattr = ntfs_setattr,
2232#endif /* NTFS_RW */ 2242#endif /* NTFS_RW */
2233}; 2243};
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 1d27331e6fc9..d3e118cc6ffa 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2866,9 +2866,11 @@ conv_err_out:
2866 * 2866 *
2867 * See ntfs_truncate() description above for details. 2867 * See ntfs_truncate() description above for details.
2868 */ 2868 */
2869#ifdef NTFS_RW
2869void ntfs_truncate_vfs(struct inode *vi) { 2870void ntfs_truncate_vfs(struct inode *vi) {
2870 ntfs_truncate(vi); 2871 ntfs_truncate(vi);
2871} 2872}
2873#endif
2872 2874
2873/** 2875/**
2874 * ntfs_setattr - called from notify_change() when an attribute is being changed 2876 * ntfs_setattr - called from notify_change() when an attribute is being changed
@@ -2914,8 +2916,10 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
2914 NInoCompressed(ni) ? 2916 NInoCompressed(ni) ?
2915 "compressed" : "encrypted"); 2917 "compressed" : "encrypted");
2916 err = -EOPNOTSUPP; 2918 err = -EOPNOTSUPP;
2917 } else 2919 } else {
2918 err = vmtruncate(vi, attr->ia_size); 2920 truncate_setsize(vi, attr->ia_size);
2921 ntfs_truncate_vfs(vi);
2922 }
2919 if (err || ia_valid == ATTR_SIZE) 2923 if (err || ia_valid == ATTR_SIZE)
2920 goto out; 2924 goto out;
2921 } else { 2925 } else {
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index db29695f845c..76b6cfb579d7 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -316,6 +316,10 @@ static inline void ntfs_commit_inode(struct inode *vi)
316 return; 316 return;
317} 317}
318 318
319#else
320
321static inline void ntfs_truncate_vfs(struct inode *vi) {}
322
319#endif /* NTFS_RW */ 323#endif /* NTFS_RW */
320 324
321#endif /* _LINUX_NTFS_INODE_H */ 325#endif /* _LINUX_NTFS_INODE_H */
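
The inode.h change supplies an empty ntfs_truncate_vfs() for builds without NTFS_RW, the usual stub idiom that keeps callers (here the new ntfs_write_failed()) free of #ifdef clutter. The idiom in miniature, with FOO_RW as a hypothetical config symbol:

/* The conditional-stub idiom in miniature; FOO_RW stands in for the
 * real config symbol (NTFS_RW above). */
struct inode;

#ifdef FOO_RW
extern void foo_truncate_vfs(struct inode *vi);           /* real version */
#else
static inline void foo_truncate_vfs(struct inode *vi) {}  /* compiles away */
#endif
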
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 70b5863a2d64..f487aa343442 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -832,7 +832,7 @@ out:
832 return ret; 832 return ret;
833} 833}
834 834
835int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) 835int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
836{ 836{
837 struct inode *inode = file->f_mapping->host; 837 struct inode *inode = file->f_mapping->host;
838 int ret; 838 int ret;
@@ -843,7 +843,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
843 struct buffer_head *di_bh = NULL; 843 struct buffer_head *di_bh = NULL;
844 struct ocfs2_extent_rec rec; 844 struct ocfs2_extent_rec rec;
845 845
846 BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); 846 BUG_ON(whence != SEEK_DATA && whence != SEEK_HOLE);
847 847
848 ret = ocfs2_inode_lock(inode, &di_bh, 0); 848 ret = ocfs2_inode_lock(inode, &di_bh, 0);
849 if (ret) { 849 if (ret) {
@@ -859,7 +859,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
859 } 859 }
860 860
861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
862 if (origin == SEEK_HOLE) 862 if (whence == SEEK_HOLE)
863 *offset = inode->i_size; 863 *offset = inode->i_size;
864 goto out_unlock; 864 goto out_unlock;
865 } 865 }
@@ -888,8 +888,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1; 888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
889 } 889 }
890 890
891 if ((!is_data && origin == SEEK_HOLE) || 891 if ((!is_data && whence == SEEK_HOLE) ||
892 (is_data && origin == SEEK_DATA)) { 892 (is_data && whence == SEEK_DATA)) {
893 if (extoff > *offset) 893 if (extoff > *offset)
894 *offset = extoff; 894 *offset = extoff;
895 goto out_unlock; 895 goto out_unlock;
@@ -899,7 +899,7 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
899 cpos += clen; 899 cpos += clen;
900 } 900 }
901 901
902 if (origin == SEEK_HOLE) { 902 if (whence == SEEK_HOLE) {
903 extoff = cpos; 903 extoff = cpos;
904 extoff <<= cs_bits; 904 extoff <<= cs_bits;
905 extlen = clen; 905 extlen = clen;
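
The extent_map.c hunk is purely a rename of the llseek argument from origin to whence, matching the lseek(2) terminology; SEEK_DATA/SEEK_HOLE semantics are unchanged. For reference, what those two whence values do from userspace:

/* Userspace sketch: walk a file's data extents with lseek().
 * SEEK_DATA/SEEK_HOLE are the whence values the ocfs2 code serves. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd = open(argc > 1 ? argv[1] : "sparse.img", O_RDONLY);
        if (fd < 0) { perror("open"); return 1; }

        off_t data = 0, hole;
        while ((data = lseek(fd, data, SEEK_DATA)) != (off_t)-1) {
                hole = lseek(fd, data, SEEK_HOLE);  /* end of this extent */
                printf("data: [%lld, %lld)\n", (long long)data, (long long)hole);
                data = hole;
        }
        close(fd);                                  /* loop ends on ENXIO */
        return 0;
}
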
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5a4ee77cec51..37d313ede159 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1218,24 +1218,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1218 } 1218 }
1219 } 1219 }
1220 1220
1221 /*
1222 * This will intentionally not wind up calling truncate_setsize(),
1223 * since all the work for a size change has been done above.
1224 * Otherwise, we could get into problems with truncate as
1225 * ip_alloc_sem is used there to protect against i_size
1226 * changes.
1227 *
1228 * XXX: this means the conditional below can probably be removed.
1229 */
1230 if ((attr->ia_valid & ATTR_SIZE) &&
1231 attr->ia_size != i_size_read(inode)) {
1232 status = vmtruncate(inode, attr->ia_size);
1233 if (status) {
1234 mlog_errno(status);
1235 goto bail_commit;
1236 }
1237 }
1238
1239 setattr_copy(inode, attr); 1221 setattr_copy(inode, attr);
1240 mark_inode_dirty(inode); 1222 mark_inode_dirty(inode);
1241 1223
@@ -2513,18 +2495,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2513 ret = sd.num_spliced; 2495 ret = sd.num_spliced;
2514 2496
2515 if (ret > 0) { 2497 if (ret > 0) {
2516 unsigned long nr_pages;
2517 int err; 2498 int err;
2518 2499
2519 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2520
2521 err = generic_write_sync(out, *ppos, ret); 2500 err = generic_write_sync(out, *ppos, ret);
2522 if (err) 2501 if (err)
2523 ret = err; 2502 ret = err;
2524 else 2503 else
2525 *ppos += ret; 2504 *ppos += ret;
2526 2505
2527 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2506 balance_dirty_pages_ratelimited(mapping);
2528 } 2507 }
2529 2508
2530 return ret; 2509 return ret;
@@ -2640,14 +2619,14 @@ bail:
2640} 2619}
2641 2620
2642/* Refer generic_file_llseek_unlocked() */ 2621/* Refer generic_file_llseek_unlocked() */
2643static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) 2622static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2644{ 2623{
2645 struct inode *inode = file->f_mapping->host; 2624 struct inode *inode = file->f_mapping->host;
2646 int ret = 0; 2625 int ret = 0;
2647 2626
2648 mutex_lock(&inode->i_mutex); 2627 mutex_lock(&inode->i_mutex);
2649 2628
2650 switch (origin) { 2629 switch (whence) {
2651 case SEEK_SET: 2630 case SEEK_SET:
2652 break; 2631 break;
2653 case SEEK_END: 2632 case SEEK_END:
@@ -2662,7 +2641,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
2662 break; 2641 break;
2663 case SEEK_DATA: 2642 case SEEK_DATA:
2664 case SEEK_HOLE: 2643 case SEEK_HOLE:
2665 ret = ocfs2_seek_data_hole_offset(file, &offset, origin); 2644 ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
2666 if (ret) 2645 if (ret)
2667 goto out; 2646 goto out;
2668 break; 2647 break;
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 77e3cb2962b4..e0d9b3e722bd 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -306,6 +306,16 @@ omfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
306 return mpage_writepages(mapping, wbc, omfs_get_block); 306 return mpage_writepages(mapping, wbc, omfs_get_block);
307} 307}
308 308
309static void omfs_write_failed(struct address_space *mapping, loff_t to)
310{
311 struct inode *inode = mapping->host;
312
313 if (to > inode->i_size) {
314 truncate_pagecache(inode, to, inode->i_size);
315 omfs_truncate(inode);
316 }
317}
318
309static int omfs_write_begin(struct file *file, struct address_space *mapping, 319static int omfs_write_begin(struct file *file, struct address_space *mapping,
310 loff_t pos, unsigned len, unsigned flags, 320 loff_t pos, unsigned len, unsigned flags,
311 struct page **pagep, void **fsdata) 321 struct page **pagep, void **fsdata)
@@ -314,11 +324,8 @@ static int omfs_write_begin(struct file *file, struct address_space *mapping,
314 324
315 ret = block_write_begin(mapping, pos, len, flags, pagep, 325 ret = block_write_begin(mapping, pos, len, flags, pagep,
316 omfs_get_block); 326 omfs_get_block);
317 if (unlikely(ret)) { 327 if (unlikely(ret))
318 loff_t isize = mapping->host->i_size; 328 omfs_write_failed(mapping, pos + len);
319 if (pos + len > isize)
320 vmtruncate(mapping->host, isize);
321 }
322 329
323 return ret; 330 return ret;
324} 331}
@@ -350,9 +357,11 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
350 357
351 if ((attr->ia_valid & ATTR_SIZE) && 358 if ((attr->ia_valid & ATTR_SIZE) &&
352 attr->ia_size != i_size_read(inode)) { 359 attr->ia_size != i_size_read(inode)) {
353 error = vmtruncate(inode, attr->ia_size); 360 error = inode_newsize_ok(inode, attr->ia_size);
354 if (error) 361 if (error)
355 return error; 362 return error;
363 truncate_setsize(inode, attr->ia_size);
364 omfs_truncate(inode);
356 } 365 }
357 366
358 setattr_copy(inode, attr); 367 setattr_copy(inode, attr);
@@ -362,7 +371,6 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
362 371
363const struct inode_operations omfs_file_inops = { 372const struct inode_operations omfs_file_inops = {
364 .setattr = omfs_setattr, 373 .setattr = omfs_setattr,
365 .truncate = omfs_truncate
366}; 374};
367 375
368const struct address_space_operations omfs_aops = { 376const struct address_space_operations omfs_aops = {
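
ntfs and omfs follow the same conversion away from the legacy ->truncate inode operation: a *_write_failed() helper trims pagecache beyond i_size after a short or failed write, and setattr performs inode_newsize_ok(), truncate_setsize(), then the filesystem's own truncate. A condensed kernel-style sketch of the resulting setattr shape, with foo_truncate() as a stand-in for the per-fs hook:

/* Sketch of the post-vmtruncate setattr pattern seen in omfs_setattr()
 * and ntfs_setattr(); foo_truncate() is a stand-in for the fs hook. */
static int foo_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        int error = inode_change_ok(inode, attr);
        if (error)
                return error;

        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
                error = inode_newsize_ok(inode, attr->ia_size);
                if (error)
                        return error;
                truncate_setsize(inode, attr->ia_size); /* i_size + pagecache */
                foo_truncate(inode);                    /* free fs blocks     */
        }

        setattr_copy(inode, attr);
        mark_inode_dirty(inode);
        return 0;
}
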
diff --git a/fs/open.c b/fs/open.c
index 59071f55bf7f..9b33c0cbfacf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -61,33 +61,22 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
61 return ret; 61 return ret;
62} 62}
63 63
64static long do_sys_truncate(const char __user *pathname, loff_t length) 64long vfs_truncate(struct path *path, loff_t length)
65{ 65{
66 struct path path;
67 struct inode *inode; 66 struct inode *inode;
68 int error; 67 long error;
69
70 error = -EINVAL;
71 if (length < 0) /* sorry, but loff_t says... */
72 goto out;
73 68
74 error = user_path(pathname, &path); 69 inode = path->dentry->d_inode;
75 if (error)
76 goto out;
77 inode = path.dentry->d_inode;
78 70
79 /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ 71 /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
80 error = -EISDIR;
81 if (S_ISDIR(inode->i_mode)) 72 if (S_ISDIR(inode->i_mode))
82 goto dput_and_out; 73 return -EISDIR;
83
84 error = -EINVAL;
85 if (!S_ISREG(inode->i_mode)) 74 if (!S_ISREG(inode->i_mode))
86 goto dput_and_out; 75 return -EINVAL;
87 76
88 error = mnt_want_write(path.mnt); 77 error = mnt_want_write(path->mnt);
89 if (error) 78 if (error)
90 goto dput_and_out; 79 goto out;
91 80
92 error = inode_permission(inode, MAY_WRITE); 81 error = inode_permission(inode, MAY_WRITE);
93 if (error) 82 if (error)
@@ -111,19 +100,40 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
111 100
112 error = locks_verify_truncate(inode, NULL, length); 101 error = locks_verify_truncate(inode, NULL, length);
113 if (!error) 102 if (!error)
114 error = security_path_truncate(&path); 103 error = security_path_truncate(path);
115 if (!error) 104 if (!error)
116 error = do_truncate(path.dentry, length, 0, NULL); 105 error = do_truncate(path->dentry, length, 0, NULL);
117 106
118put_write_and_out: 107put_write_and_out:
119 put_write_access(inode); 108 put_write_access(inode);
120mnt_drop_write_and_out: 109mnt_drop_write_and_out:
121 mnt_drop_write(path.mnt); 110 mnt_drop_write(path->mnt);
122dput_and_out:
123 path_put(&path);
124out: 111out:
125 return error; 112 return error;
126} 113}
114EXPORT_SYMBOL_GPL(vfs_truncate);
115
116static long do_sys_truncate(const char __user *pathname, loff_t length)
117{
118 unsigned int lookup_flags = LOOKUP_FOLLOW;
119 struct path path;
120 int error;
121
122 if (length < 0) /* sorry, but loff_t says... */
123 return -EINVAL;
124
125retry:
126 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
127 if (!error) {
128 error = vfs_truncate(&path, length);
129 path_put(&path);
130 }
131 if (retry_estale(error, lookup_flags)) {
132 lookup_flags |= LOOKUP_REVAL;
133 goto retry;
134 }
135 return error;
136}
127 137
128SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) 138SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
129{ 139{
@@ -306,6 +316,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
306 struct path path; 316 struct path path;
307 struct inode *inode; 317 struct inode *inode;
308 int res; 318 int res;
319 unsigned int lookup_flags = LOOKUP_FOLLOW;
309 320
310 if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ 321 if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
311 return -EINVAL; 322 return -EINVAL;
@@ -328,8 +339,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
328 } 339 }
329 340
330 old_cred = override_creds(override_cred); 341 old_cred = override_creds(override_cred);
331 342retry:
332 res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); 343 res = user_path_at(dfd, filename, lookup_flags, &path);
333 if (res) 344 if (res)
334 goto out; 345 goto out;
335 346
@@ -364,6 +375,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
364 375
365out_path_release: 376out_path_release:
366 path_put(&path); 377 path_put(&path);
378 if (retry_estale(res, lookup_flags)) {
379 lookup_flags |= LOOKUP_REVAL;
380 goto retry;
381 }
367out: 382out:
368 revert_creds(old_cred); 383 revert_creds(old_cred);
369 put_cred(override_cred); 384 put_cred(override_cred);
@@ -379,8 +394,9 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
379{ 394{
380 struct path path; 395 struct path path;
381 int error; 396 int error;
382 397 unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
383 error = user_path_dir(filename, &path); 398retry:
399 error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
384 if (error) 400 if (error)
385 goto out; 401 goto out;
386 402
@@ -392,6 +408,10 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
392 408
393dput_and_out: 409dput_and_out:
394 path_put(&path); 410 path_put(&path);
411 if (retry_estale(error, lookup_flags)) {
412 lookup_flags |= LOOKUP_REVAL;
413 goto retry;
414 }
395out: 415out:
396 return error; 416 return error;
397} 417}
@@ -425,8 +445,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
425{ 445{
426 struct path path; 446 struct path path;
427 int error; 447 int error;
428 448 unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
429 error = user_path_dir(filename, &path); 449retry:
450 error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
430 if (error) 451 if (error)
431 goto out; 452 goto out;
432 453
@@ -435,7 +456,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
435 goto dput_and_out; 456 goto dput_and_out;
436 457
437 error = -EPERM; 458 error = -EPERM;
438 if (!capable(CAP_SYS_CHROOT)) 459 if (!nsown_capable(CAP_SYS_CHROOT))
439 goto dput_and_out; 460 goto dput_and_out;
440 error = security_path_chroot(&path); 461 error = security_path_chroot(&path);
441 if (error) 462 if (error)
@@ -445,6 +466,10 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
445 error = 0; 466 error = 0;
446dput_and_out: 467dput_and_out:
447 path_put(&path); 468 path_put(&path);
469 if (retry_estale(error, lookup_flags)) {
470 lookup_flags |= LOOKUP_REVAL;
471 goto retry;
472 }
448out: 473out:
449 return error; 474 return error;
450} 475}
@@ -489,11 +514,16 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode
489{ 514{
490 struct path path; 515 struct path path;
491 int error; 516 int error;
492 517 unsigned int lookup_flags = LOOKUP_FOLLOW;
493 error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); 518retry:
519 error = user_path_at(dfd, filename, lookup_flags, &path);
494 if (!error) { 520 if (!error) {
495 error = chmod_common(&path, mode); 521 error = chmod_common(&path, mode);
496 path_put(&path); 522 path_put(&path);
523 if (retry_estale(error, lookup_flags)) {
524 lookup_flags |= LOOKUP_REVAL;
525 goto retry;
526 }
497 } 527 }
498 return error; 528 return error;
499} 529}
@@ -552,6 +582,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
552 lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; 582 lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
553 if (flag & AT_EMPTY_PATH) 583 if (flag & AT_EMPTY_PATH)
554 lookup_flags |= LOOKUP_EMPTY; 584 lookup_flags |= LOOKUP_EMPTY;
585retry:
555 error = user_path_at(dfd, filename, lookup_flags, &path); 586 error = user_path_at(dfd, filename, lookup_flags, &path);
556 if (error) 587 if (error)
557 goto out; 588 goto out;
@@ -562,6 +593,10 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
562 mnt_drop_write(path.mnt); 593 mnt_drop_write(path.mnt);
563out_release: 594out_release:
564 path_put(&path); 595 path_put(&path);
596 if (retry_estale(error, lookup_flags)) {
597 lookup_flags |= LOOKUP_REVAL;
598 goto retry;
599 }
565out: 600out:
566 return error; 601 return error;
567} 602}
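
Two independent changes thread through the open.c hunks: vfs_truncate() is split out and exported (EXPORT_SYMBOL_GPL above), and each path-based syscall gains an ESTALE retry that repeats the lookup with LOOKUP_REVAL when NFS hands back a stale file handle. The retry idiom reduced to a skeleton, with do_the_work() as a hypothetical per-syscall payload:

/* Skeleton of the ESTALE retry idiom added to the syscalls above;
 * do_the_work() is a hypothetical stand-in for the per-syscall body. */
static long path_op_with_estale_retry(int dfd, const char __user *filename)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        struct path path;
        int error;
retry:
        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (!error) {
                error = do_the_work(&path);
                path_put(&path);
        }
        /* retry_estale() is true only for -ESTALE with LOOKUP_REVAL unset,
         * so the loop runs at most twice. */
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        return error;
}
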
diff --git a/fs/pnode.h b/fs/pnode.h
index 65c60979d541..19b853a3445c 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -22,6 +22,7 @@
22#define CL_COPY_ALL 0x04 22#define CL_COPY_ALL 0x04
23#define CL_MAKE_SHARED 0x08 23#define CL_MAKE_SHARED 0x08
24#define CL_PRIVATE 0x10 24#define CL_PRIVATE 0x10
25#define CL_SHARED_TO_SLAVE 0x20
25 26
26static inline void set_mnt_shared(struct mount *mnt) 27static inline void set_mnt_shared(struct mount *mnt)
27{ 28{
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 99349efbbc2b..981b05601931 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -21,6 +21,7 @@ proc-y += uptime.o
21proc-y += version.o 21proc-y += version.o
22proc-y += softirqs.o 22proc-y += softirqs.o
23proc-y += namespaces.o 23proc-y += namespaces.o
24proc-y += self.o
24proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 25proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
25proc-$(CONFIG_NET) += proc_net.o 26proc-$(CONFIG_NET) += proc_net.o
26proc-$(CONFIG_PROC_KCORE) += kcore.o 27proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index c1c207c36cae..6a91e6ffbcbd 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -162,7 +162,7 @@ static inline const char *get_task_state(struct task_struct *tsk)
162static inline void task_state(struct seq_file *m, struct pid_namespace *ns, 162static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
163 struct pid *pid, struct task_struct *p) 163 struct pid *pid, struct task_struct *p)
164{ 164{
165 struct user_namespace *user_ns = current_user_ns(); 165 struct user_namespace *user_ns = seq_user_ns(m);
166 struct group_info *group_info; 166 struct group_info *group_info;
167 int g; 167 int g;
168 struct fdtable *fdt = NULL; 168 struct fdtable *fdt = NULL;
@@ -212,7 +212,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
212 group_info = cred->group_info; 212 group_info = cred->group_info;
213 task_unlock(p); 213 task_unlock(p);
214 214
215 for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) 215 for (g = 0; g < group_info->ngroups; g++)
216 seq_printf(m, "%d ", 216 seq_printf(m, "%d ",
217 from_kgid_munged(user_ns, GROUP_AT(group_info, g))); 217 from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
218 put_cred(cred); 218 put_cred(cred);
@@ -220,7 +220,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
220 seq_putc(m, '\n'); 220 seq_putc(m, '\n');
221} 221}
222 222
223static void render_sigset_t(struct seq_file *m, const char *header, 223void render_sigset_t(struct seq_file *m, const char *header,
224 sigset_t *set) 224 sigset_t *set)
225{ 225{
226 int i; 226 int i;
@@ -308,6 +308,10 @@ static void render_cap_t(struct seq_file *m, const char *header,
308 seq_putc(m, '\n'); 308 seq_putc(m, '\n');
309} 309}
310 310
311/* Remove non-existent capabilities */
312#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \
313 CAP_TO_MASK(CAP_LAST_CAP + 1) - 1)
314
311static inline void task_cap(struct seq_file *m, struct task_struct *p) 315static inline void task_cap(struct seq_file *m, struct task_struct *p)
312{ 316{
313 const struct cred *cred; 317 const struct cred *cred;
@@ -321,12 +325,24 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
321 cap_bset = cred->cap_bset; 325 cap_bset = cred->cap_bset;
322 rcu_read_unlock(); 326 rcu_read_unlock();
323 327
328 NORM_CAPS(cap_inheritable);
329 NORM_CAPS(cap_permitted);
330 NORM_CAPS(cap_effective);
331 NORM_CAPS(cap_bset);
332
324 render_cap_t(m, "CapInh:\t", &cap_inheritable); 333 render_cap_t(m, "CapInh:\t", &cap_inheritable);
325 render_cap_t(m, "CapPrm:\t", &cap_permitted); 334 render_cap_t(m, "CapPrm:\t", &cap_permitted);
326 render_cap_t(m, "CapEff:\t", &cap_effective); 335 render_cap_t(m, "CapEff:\t", &cap_effective);
327 render_cap_t(m, "CapBnd:\t", &cap_bset); 336 render_cap_t(m, "CapBnd:\t", &cap_bset);
328} 337}
329 338
339static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
340{
341#ifdef CONFIG_SECCOMP
342 seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode);
343#endif
344}
345
330static inline void task_context_switch_counts(struct seq_file *m, 346static inline void task_context_switch_counts(struct seq_file *m,
331 struct task_struct *p) 347 struct task_struct *p)
332{ 348{
@@ -360,6 +376,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
360 } 376 }
361 task_sig(m, task); 377 task_sig(m, task);
362 task_cap(m, task); 378 task_cap(m, task);
379 task_seccomp(m, task);
363 task_cpus_allowed(m, task); 380 task_cpus_allowed(m, task);
364 cpuset_task_status_allowed(m, task); 381 cpuset_task_status_allowed(m, task);
365 task_context_switch_counts(m, task); 382 task_context_switch_counts(m, task);
@@ -438,7 +455,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
438 455
439 min_flt += sig->min_flt; 456 min_flt += sig->min_flt;
440 maj_flt += sig->maj_flt; 457 maj_flt += sig->maj_flt;
441 thread_group_times(task, &utime, &stime); 458 thread_group_cputime_adjusted(task, &utime, &stime);
442 gtime += sig->gtime; 459 gtime += sig->gtime;
443 } 460 }
444 461
@@ -454,7 +471,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
454 if (!whole) { 471 if (!whole) {
455 min_flt = task->min_flt; 472 min_flt = task->min_flt;
456 maj_flt = task->maj_flt; 473 maj_flt = task->maj_flt;
457 task_times(task, &utime, &stime); 474 task_cputime_adjusted(task, &utime, &stime);
458 gtime = task->gtime; 475 gtime = task->gtime;
459 } 476 }
460 477
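
array.c now prints the full supplementary group list instead of capping at NGROUPS_SMALL, masks capability words above CAP_LAST_CAP via NORM_CAPS, and adds a Seccomp: line when CONFIG_SECCOMP is set. Reading the new field is a plain scan of /proc/PID/status:

/* Userspace sketch: print the Seccomp: field task_seccomp() emits
 * (0 = disabled, 1 = strict, 2 = filter). */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/self/status", "r");
        if (!f) { perror("fopen"); return 1; }
        while (fgets(line, sizeof(line), f))
                if (!strncmp(line, "Seccomp:", 8))
                        fputs(line, stdout);
        fclose(f);
        return 0;
}
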
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9e28356a959a..9b43ff77a51e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -542,13 +542,6 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
542 if (error) 542 if (error)
543 return error; 543 return error;
544 544
545 if ((attr->ia_valid & ATTR_SIZE) &&
546 attr->ia_size != i_size_read(inode)) {
547 error = vmtruncate(inode, attr->ia_size);
548 if (error)
549 return error;
550 }
551
552 setattr_copy(inode, attr); 545 setattr_copy(inode, attr);
553 mark_inode_dirty(inode); 546 mark_inode_dirty(inode);
554 return 0; 547 return 0;
@@ -985,7 +978,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
985{ 978{
986 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 979 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
987 char buffer[PROC_NUMBUF]; 980 char buffer[PROC_NUMBUF];
988 int oom_score_adj = OOM_SCORE_ADJ_MIN; 981 short oom_score_adj = OOM_SCORE_ADJ_MIN;
989 unsigned long flags; 982 unsigned long flags;
990 size_t len; 983 size_t len;
991 984
@@ -996,7 +989,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
996 unlock_task_sighand(task, &flags); 989 unlock_task_sighand(task, &flags);
997 } 990 }
998 put_task_struct(task); 991 put_task_struct(task);
999 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); 992 len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
1000 return simple_read_from_buffer(buf, count, ppos, buffer, len); 993 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1001} 994}
1002 995
@@ -1043,15 +1036,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1043 goto err_task_lock; 1036 goto err_task_lock;
1044 } 1037 }
1045 1038
1046 if (oom_score_adj < task->signal->oom_score_adj_min && 1039 if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
1047 !capable(CAP_SYS_RESOURCE)) { 1040 !capable(CAP_SYS_RESOURCE)) {
1048 err = -EACCES; 1041 err = -EACCES;
1049 goto err_sighand; 1042 goto err_sighand;
1050 } 1043 }
1051 1044
1052 task->signal->oom_score_adj = oom_score_adj; 1045 task->signal->oom_score_adj = (short)oom_score_adj;
1053 if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) 1046 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1054 task->signal->oom_score_adj_min = oom_score_adj; 1047 task->signal->oom_score_adj_min = (short)oom_score_adj;
1055 trace_oom_score_adj_update(task); 1048 trace_oom_score_adj_update(task);
1056 1049
1057err_sighand: 1050err_sighand:
@@ -2345,146 +2338,6 @@ static const struct file_operations proc_coredump_filter_operations = {
2345}; 2338};
2346#endif 2339#endif
2347 2340
2348/*
2349 * /proc/self:
2350 */
2351static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
2352 int buflen)
2353{
2354 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2355 pid_t tgid = task_tgid_nr_ns(current, ns);
2356 char tmp[PROC_NUMBUF];
2357 if (!tgid)
2358 return -ENOENT;
2359 sprintf(tmp, "%d", tgid);
2360 return vfs_readlink(dentry,buffer,buflen,tmp);
2361}
2362
2363static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2364{
2365 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2366 pid_t tgid = task_tgid_nr_ns(current, ns);
2367 char *name = ERR_PTR(-ENOENT);
2368 if (tgid) {
2369 /* 11 for max length of signed int in decimal + NULL term */
2370 name = kmalloc(12, GFP_KERNEL);
2371 if (!name)
2372 name = ERR_PTR(-ENOMEM);
2373 else
2374 sprintf(name, "%d", tgid);
2375 }
2376 nd_set_link(nd, name);
2377 return NULL;
2378}
2379
2380static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
2381 void *cookie)
2382{
2383 char *s = nd_get_link(nd);
2384 if (!IS_ERR(s))
2385 kfree(s);
2386}
2387
2388static const struct inode_operations proc_self_inode_operations = {
2389 .readlink = proc_self_readlink,
2390 .follow_link = proc_self_follow_link,
2391 .put_link = proc_self_put_link,
2392};
2393
2394/*
2395 * proc base
2396 *
2397 * These are the directory entries in the root directory of /proc
2398 * that properly belong to the /proc filesystem, as they describe
2399 * describe something that is process related.
2400 */
2401static const struct pid_entry proc_base_stuff[] = {
2402 NOD("self", S_IFLNK|S_IRWXUGO,
2403 &proc_self_inode_operations, NULL, {}),
2404};
2405
2406static struct dentry *proc_base_instantiate(struct inode *dir,
2407 struct dentry *dentry, struct task_struct *task, const void *ptr)
2408{
2409 const struct pid_entry *p = ptr;
2410 struct inode *inode;
2411 struct proc_inode *ei;
2412 struct dentry *error;
2413
2414 /* Allocate the inode */
2415 error = ERR_PTR(-ENOMEM);
2416 inode = new_inode(dir->i_sb);
2417 if (!inode)
2418 goto out;
2419
2420 /* Initialize the inode */
2421 ei = PROC_I(inode);
2422 inode->i_ino = get_next_ino();
2423 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2424
2425 /*
2426 * grab the reference to the task.
2427 */
2428 ei->pid = get_task_pid(task, PIDTYPE_PID);
2429 if (!ei->pid)
2430 goto out_iput;
2431
2432 inode->i_mode = p->mode;
2433 if (S_ISDIR(inode->i_mode))
2434 set_nlink(inode, 2);
2435 if (S_ISLNK(inode->i_mode))
2436 inode->i_size = 64;
2437 if (p->iop)
2438 inode->i_op = p->iop;
2439 if (p->fop)
2440 inode->i_fop = p->fop;
2441 ei->op = p->op;
2442 d_add(dentry, inode);
2443 error = NULL;
2444out:
2445 return error;
2446out_iput:
2447 iput(inode);
2448 goto out;
2449}
2450
2451static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
2452{
2453 struct dentry *error;
2454 struct task_struct *task = get_proc_task(dir);
2455 const struct pid_entry *p, *last;
2456
2457 error = ERR_PTR(-ENOENT);
2458
2459 if (!task)
2460 goto out_no_task;
2461
2462 /* Lookup the directory entry */
2463 last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
2464 for (p = proc_base_stuff; p <= last; p++) {
2465 if (p->len != dentry->d_name.len)
2466 continue;
2467 if (!memcmp(dentry->d_name.name, p->name, p->len))
2468 break;
2469 }
2470 if (p > last)
2471 goto out;
2472
2473 error = proc_base_instantiate(dir, dentry, task, p);
2474
2475out:
2476 put_task_struct(task);
2477out_no_task:
2478 return error;
2479}
2480
2481static int proc_base_fill_cache(struct file *filp, void *dirent,
2482 filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2483{
2484 return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2485 proc_base_instantiate, task, p);
2486}
2487
2488#ifdef CONFIG_TASK_IO_ACCOUNTING 2341#ifdef CONFIG_TASK_IO_ACCOUNTING
2489static int do_io_accounting(struct task_struct *task, char *buffer, int whole) 2342static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2490{ 2343{
@@ -2839,10 +2692,6 @@ void proc_flush_task(struct task_struct *task)
2839 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, 2692 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
2840 tgid->numbers[i].nr); 2693 tgid->numbers[i].nr);
2841 } 2694 }
2842
2843 upid = &pid->numbers[pid->level];
2844 if (upid->nr == 1)
2845 pid_ns_release_proc(upid->ns);
2846} 2695}
2847 2696
2848static struct dentry *proc_pid_instantiate(struct inode *dir, 2697static struct dentry *proc_pid_instantiate(struct inode *dir,
@@ -2876,15 +2725,11 @@ out:
2876 2725
2877struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2726struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
2878{ 2727{
2879 struct dentry *result; 2728 struct dentry *result = NULL;
2880 struct task_struct *task; 2729 struct task_struct *task;
2881 unsigned tgid; 2730 unsigned tgid;
2882 struct pid_namespace *ns; 2731 struct pid_namespace *ns;
2883 2732
2884 result = proc_base_lookup(dir, dentry);
2885 if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
2886 goto out;
2887
2888 tgid = name_to_int(dentry); 2733 tgid = name_to_int(dentry);
2889 if (tgid == ~0U) 2734 if (tgid == ~0U)
2890 goto out; 2735 goto out;
@@ -2947,7 +2792,7 @@ retry:
2947 return iter; 2792 return iter;
2948} 2793}
2949 2794
2950#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) 2795#define TGID_OFFSET (FIRST_PROCESS_ENTRY)
2951 2796
2952static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 2797static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2953 struct tgid_iter iter) 2798 struct tgid_iter iter)
@@ -2967,25 +2812,12 @@ static int fake_filldir(void *buf, const char *name, int namelen,
2967/* for the /proc/ directory itself, after non-process stuff has been done */ 2812/* for the /proc/ directory itself, after non-process stuff has been done */
2968int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 2813int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2969{ 2814{
2970 unsigned int nr;
2971 struct task_struct *reaper;
2972 struct tgid_iter iter; 2815 struct tgid_iter iter;
2973 struct pid_namespace *ns; 2816 struct pid_namespace *ns;
2974 filldir_t __filldir; 2817 filldir_t __filldir;
2975 2818
2976 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) 2819 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
2977 goto out_no_task; 2820 goto out;
2978 nr = filp->f_pos - FIRST_PROCESS_ENTRY;
2979
2980 reaper = get_proc_task(filp->f_path.dentry->d_inode);
2981 if (!reaper)
2982 goto out_no_task;
2983
2984 for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
2985 const struct pid_entry *p = &proc_base_stuff[nr];
2986 if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
2987 goto out;
2988 }
2989 2821
2990 ns = filp->f_dentry->d_sb->s_fs_info; 2822 ns = filp->f_dentry->d_sb->s_fs_info;
2991 iter.task = NULL; 2823 iter.task = NULL;
@@ -3006,8 +2838,6 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
3006 } 2838 }
3007 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; 2839 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
3008out: 2840out:
3009 put_task_struct(reaper);
3010out_no_task:
3011 return 0; 2841 return 0;
3012} 2842}
3013 2843
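
The big deletion in base.c retires the proc_base_stuff machinery; /proc/self becomes an ordinary entry implemented in the new fs/proc/self.c (added to the Makefile above), and pid lookup/readdir no longer consult a per-task base table. Userspace sees no difference:

/* Userspace sketch: /proc/self still resolves to the caller's tgid. */
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[PATH_MAX];
        ssize_t n = readlink("/proc/self", buf, sizeof(buf) - 1);
        if (n < 0) { perror("readlink"); return 1; }
        buf[n] = '\0';
        printf("/proc/self -> %s (pid %d)\n", buf, getpid());
        return 0;
}
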
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index f28a875f8779..d7a4a28ef630 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -50,6 +50,8 @@ static int seq_show(struct seq_file *m, void *v)
50 if (!ret) { 50 if (!ret) {
51 seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", 51 seq_printf(m, "pos:\t%lli\nflags:\t0%o\n",
52 (long long)file->f_pos, f_flags); 52 (long long)file->f_pos, f_flags);
53 if (file->f_op->show_fdinfo)
54 ret = file->f_op->show_fdinfo(m, file);
53 fput(file); 55 fput(file);
54 } 56 }
55 57
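
seq_show() in fd.c now calls the new file_operations->show_fdinfo hook after the generic pos:/flags: lines, which is what lets inotify_show_fdinfo() (wired into inotify_fops earlier in this diff) expose per-watch state. From userspace the extra lines simply appear in /proc/PID/fdinfo/FD:

/* Userspace sketch: dump fdinfo for an inotify fd; with the hook above,
 * per-watch "inotify wd:..." lines follow the generic pos:/flags: ones. */
#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
        char path[64], line[256];
        int fd = inotify_init1(0);
        if (fd < 0 || inotify_add_watch(fd, "/tmp", IN_CREATE) < 0)
                return 1;

        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
        FILE *f = fopen(path, "r");
        if (!f) { perror("fopen"); return 1; }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        close(fd);
        return 0;
}
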
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 0d80cef4cfb9..76ddae83daa5 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -261,16 +261,9 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
261 if (error) 261 if (error)
262 return error; 262 return error;
263 263
264 if ((iattr->ia_valid & ATTR_SIZE) &&
265 iattr->ia_size != i_size_read(inode)) {
266 error = vmtruncate(inode, iattr->ia_size);
267 if (error)
268 return error;
269 }
270
271 setattr_copy(inode, iattr); 264 setattr_copy(inode, iattr);
272 mark_inode_dirty(inode); 265 mark_inode_dirty(inode);
273 266
274 de->uid = inode->i_uid; 267 de->uid = inode->i_uid;
275 de->gid = inode->i_gid; 268 de->gid = inode->i_gid;
276 de->mode = inode->i_mode; 269 de->mode = inode->i_mode;
@@ -350,37 +343,39 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
350 * Return an inode number between PROC_DYNAMIC_FIRST and 343 * Return an inode number between PROC_DYNAMIC_FIRST and
351 * 0xffffffff, or zero on failure. 344 * 0xffffffff, or zero on failure.
352 */ 345 */
353static unsigned int get_inode_number(void) 346int proc_alloc_inum(unsigned int *inum)
354{ 347{
355 unsigned int i; 348 unsigned int i;
356 int error; 349 int error;
357 350
358retry: 351retry:
359 if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0) 352 if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))
360 return 0; 353 return -ENOMEM;
361 354
362 spin_lock(&proc_inum_lock); 355 spin_lock_irq(&proc_inum_lock);
363 error = ida_get_new(&proc_inum_ida, &i); 356 error = ida_get_new(&proc_inum_ida, &i);
364 spin_unlock(&proc_inum_lock); 357 spin_unlock_irq(&proc_inum_lock);
365 if (error == -EAGAIN) 358 if (error == -EAGAIN)
366 goto retry; 359 goto retry;
367 else if (error) 360 else if (error)
368 return 0; 361 return error;
369 362
370 if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { 363 if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
371 spin_lock(&proc_inum_lock); 364 spin_lock_irq(&proc_inum_lock);
372 ida_remove(&proc_inum_ida, i); 365 ida_remove(&proc_inum_ida, i);
373 spin_unlock(&proc_inum_lock); 366 spin_unlock_irq(&proc_inum_lock);
374 return 0; 367 return -ENOSPC;
375 } 368 }
376 return PROC_DYNAMIC_FIRST + i; 369 *inum = PROC_DYNAMIC_FIRST + i;
370 return 0;
377} 371}
378 372
379static void release_inode_number(unsigned int inum) 373void proc_free_inum(unsigned int inum)
380{ 374{
381 spin_lock(&proc_inum_lock); 375 unsigned long flags;
376 spin_lock_irqsave(&proc_inum_lock, flags);
382 ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); 377 ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
383 spin_unlock(&proc_inum_lock); 378 spin_unlock_irqrestore(&proc_inum_lock, flags);
384} 379}
385 380
386static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) 381static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
@@ -554,13 +549,12 @@ static const struct inode_operations proc_dir_inode_operations = {
554 549
555static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) 550static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
556{ 551{
557 unsigned int i;
558 struct proc_dir_entry *tmp; 552 struct proc_dir_entry *tmp;
553 int ret;
559 554
560 i = get_inode_number(); 555 ret = proc_alloc_inum(&dp->low_ino);
561 if (i == 0) 556 if (ret)
562 return -EAGAIN; 557 return ret;
563 dp->low_ino = i;
564 558
565 if (S_ISDIR(dp->mode)) { 559 if (S_ISDIR(dp->mode)) {
566 if (dp->proc_iops == NULL) { 560 if (dp->proc_iops == NULL) {
@@ -764,7 +758,7 @@ EXPORT_SYMBOL(proc_create_data);
764 758
765static void free_proc_entry(struct proc_dir_entry *de) 759static void free_proc_entry(struct proc_dir_entry *de)
766{ 760{
767 release_inode_number(de->low_ino); 761 proc_free_inum(de->low_ino);
768 762
769 if (S_ISLNK(de->mode)) 763 if (S_ISLNK(de->mode))
770 kfree(de->data); 764 kfree(de->data);
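
generic.c converts the proc inode-number allocator from the old "zero means failure" convention to the usual kernel shape, returning -errno and handing the result back through a pointer (proc_alloc_inum()/proc_free_inum()), and switches the lock to IRQ-safe variants so freeing can happen from atomic context. The calling-convention change in miniature, with alloc_inum() as a stand-in:

/* The calling-convention change in miniature; alloc_inum() is a
 * stand-in for proc_alloc_inum() above. */
static int register_entry(struct proc_dir_entry *dp)
{
        int ret = alloc_inum(&dp->low_ino);  /* 0, or -ENOMEM / -ENOSPC */
        if (ret)
                return ret;                  /* propagate the real errno */
        /* ... rest of registration, as in proc_register() ... */
        return 0;
}
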
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3b22bbdee9ec..439ae6886507 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -31,6 +31,7 @@ static void proc_evict_inode(struct inode *inode)
31 struct proc_dir_entry *de; 31 struct proc_dir_entry *de;
32 struct ctl_table_header *head; 32 struct ctl_table_header *head;
33 const struct proc_ns_operations *ns_ops; 33 const struct proc_ns_operations *ns_ops;
34 void *ns;
34 35
35 truncate_inode_pages(&inode->i_data, 0); 36 truncate_inode_pages(&inode->i_data, 0);
36 clear_inode(inode); 37 clear_inode(inode);
@@ -49,8 +50,9 @@ static void proc_evict_inode(struct inode *inode)
49 } 50 }
50 /* Release any associated namespace */ 51 /* Release any associated namespace */
51 ns_ops = PROC_I(inode)->ns_ops; 52 ns_ops = PROC_I(inode)->ns_ops;
52 if (ns_ops && ns_ops->put) 53 ns = PROC_I(inode)->ns;
53 ns_ops->put(PROC_I(inode)->ns); 54 if (ns_ops && ns)
55 ns_ops->put(ns);
54} 56}
55 57
56static struct kmem_cache * proc_inode_cachep; 58static struct kmem_cache * proc_inode_cachep;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 43973b084abf..252544c05207 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -15,6 +15,7 @@ struct ctl_table_header;
15struct mempolicy; 15struct mempolicy;
16 16
17extern struct proc_dir_entry proc_root; 17extern struct proc_dir_entry proc_root;
18extern void proc_self_init(void);
18#ifdef CONFIG_PROC_SYSCTL 19#ifdef CONFIG_PROC_SYSCTL
19extern int proc_sys_init(void); 20extern int proc_sys_init(void);
20extern void sysctl_head_put(struct ctl_table_header *head); 21extern void sysctl_head_put(struct ctl_table_header *head);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 86c67eee439f..e96d4f18ca3a 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -249,7 +249,7 @@ static int kcore_update_ram(void)
249 /* Not initialized....update now */ 249 /* Not initialized....update now */
250 /* find out "max pfn" */ 250 /* find out "max pfn" */
251 end_pfn = 0; 251 end_pfn = 0;
252 for_each_node_state(nid, N_HIGH_MEMORY) { 252 for_each_node_state(nid, N_MEMORY) {
253 unsigned long node_end; 253 unsigned long node_end;
254 node_end = NODE_DATA(nid)->node_start_pfn + 254 node_end = NODE_DATA(nid)->node_start_pfn +
255 NODE_DATA(nid)->node_spanned_pages; 255 NODE_DATA(nid)->node_spanned_pages;
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index b178ed733c36..b7a47196c8c3 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -11,6 +11,7 @@
11#include <net/net_namespace.h> 11#include <net/net_namespace.h>
12#include <linux/ipc_namespace.h> 12#include <linux/ipc_namespace.h>
13#include <linux/pid_namespace.h> 13#include <linux/pid_namespace.h>
14#include <linux/user_namespace.h>
14#include "internal.h" 15#include "internal.h"
15 16
16 17
@@ -24,12 +25,168 @@ static const struct proc_ns_operations *ns_entries[] = {
24#ifdef CONFIG_IPC_NS 25#ifdef CONFIG_IPC_NS
25 &ipcns_operations, 26 &ipcns_operations,
26#endif 27#endif
28#ifdef CONFIG_PID_NS
29 &pidns_operations,
30#endif
31#ifdef CONFIG_USER_NS
32 &userns_operations,
33#endif
34 &mntns_operations,
27}; 35};
28 36
29static const struct file_operations ns_file_operations = { 37static const struct file_operations ns_file_operations = {
30 .llseek = no_llseek, 38 .llseek = no_llseek,
31}; 39};
32 40
41static const struct inode_operations ns_inode_operations = {
42 .setattr = proc_setattr,
43};
44
45static int ns_delete_dentry(const struct dentry *dentry)
46{
47 /* Don't cache namespace inodes when not in use */
48 return 1;
49}
50
51static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
52{
53 struct inode *inode = dentry->d_inode;
54 const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
55
56 return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
57 ns_ops->name, inode->i_ino);
58}
59
60const struct dentry_operations ns_dentry_operations =
61{
62 .d_delete = ns_delete_dentry,
63 .d_dname = ns_dname,
64};
65
66static struct dentry *proc_ns_get_dentry(struct super_block *sb,
67 struct task_struct *task, const struct proc_ns_operations *ns_ops)
68{
69 struct dentry *dentry, *result;
70 struct inode *inode;
71 struct proc_inode *ei;
72 struct qstr qname = { .name = "", };
73 void *ns;
74
75 ns = ns_ops->get(task);
76 if (!ns)
77 return ERR_PTR(-ENOENT);
78
79 dentry = d_alloc_pseudo(sb, &qname);
80 if (!dentry) {
81 ns_ops->put(ns);
82 return ERR_PTR(-ENOMEM);
83 }
84
85 inode = iget_locked(sb, ns_ops->inum(ns));
86 if (!inode) {
87 dput(dentry);
88 ns_ops->put(ns);
89 return ERR_PTR(-ENOMEM);
90 }
91
92 ei = PROC_I(inode);
93 if (inode->i_state & I_NEW) {
94 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
95 inode->i_op = &ns_inode_operations;
96 inode->i_mode = S_IFREG | S_IRUGO;
97 inode->i_fop = &ns_file_operations;
98 ei->ns_ops = ns_ops;
99 ei->ns = ns;
100 unlock_new_inode(inode);
101 } else {
102 ns_ops->put(ns);
103 }
104
105 d_set_d_op(dentry, &ns_dentry_operations);
106 result = d_instantiate_unique(dentry, inode);
107 if (result) {
108 dput(dentry);
109 dentry = result;
110 }
111
112 return dentry;
113}
114
115static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
116{
117 struct inode *inode = dentry->d_inode;
118 struct super_block *sb = inode->i_sb;
119 struct proc_inode *ei = PROC_I(inode);
120 struct task_struct *task;
121 struct dentry *ns_dentry;
122 void *error = ERR_PTR(-EACCES);
123
124 task = get_proc_task(inode);
125 if (!task)
126 goto out;
127
128 if (!ptrace_may_access(task, PTRACE_MODE_READ))
129 goto out_put_task;
130
131 ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops);
132 if (IS_ERR(ns_dentry)) {
133 error = ERR_CAST(ns_dentry);
134 goto out_put_task;
135 }
136
137 dput(nd->path.dentry);
138 nd->path.dentry = ns_dentry;
139 error = NULL;
140
141out_put_task:
142 put_task_struct(task);
143out:
144 return error;
145}
146
147static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
148{
149 struct inode *inode = dentry->d_inode;
150 struct proc_inode *ei = PROC_I(inode);
151 const struct proc_ns_operations *ns_ops = ei->ns_ops;
152 struct task_struct *task;
153 void *ns;
154 char name[50];
155 int len = -EACCES;
156
157 task = get_proc_task(inode);
158 if (!task)
159 goto out;
160
161 if (!ptrace_may_access(task, PTRACE_MODE_READ))
162 goto out_put_task;
163
164 len = -ENOENT;
165 ns = ns_ops->get(task);
166 if (!ns)
167 goto out_put_task;
168
169 snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
170 len = strlen(name);
171
172 if (len > buflen)
173 len = buflen;
174 if (copy_to_user(buffer, name, len))
175 len = -EFAULT;
176
177 ns_ops->put(ns);
178out_put_task:
179 put_task_struct(task);
180out:
181 return len;
182}
183
184static const struct inode_operations proc_ns_link_inode_operations = {
185 .readlink = proc_ns_readlink,
186 .follow_link = proc_ns_follow_link,
187 .setattr = proc_setattr,
188};
189
33static struct dentry *proc_ns_instantiate(struct inode *dir, 190static struct dentry *proc_ns_instantiate(struct inode *dir,
34 struct dentry *dentry, struct task_struct *task, const void *ptr) 191 struct dentry *dentry, struct task_struct *task, const void *ptr)
35{ 192{
@@ -37,21 +194,15 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
37 struct inode *inode; 194 struct inode *inode;
38 struct proc_inode *ei; 195 struct proc_inode *ei;
39 struct dentry *error = ERR_PTR(-ENOENT); 196 struct dentry *error = ERR_PTR(-ENOENT);
40 void *ns;
41 197
42 inode = proc_pid_make_inode(dir->i_sb, task); 198 inode = proc_pid_make_inode(dir->i_sb, task);
43 if (!inode) 199 if (!inode)
44 goto out; 200 goto out;
45 201
46 ns = ns_ops->get(task);
47 if (!ns)
48 goto out_iput;
49
50 ei = PROC_I(inode); 202 ei = PROC_I(inode);
51 inode->i_mode = S_IFREG|S_IRUSR; 203 inode->i_mode = S_IFLNK|S_IRWXUGO;
52 inode->i_fop = &ns_file_operations; 204 inode->i_op = &proc_ns_link_inode_operations;
53 ei->ns_ops = ns_ops; 205 ei->ns_ops = ns_ops;
54 ei->ns = ns;
55 206
56 d_set_d_op(dentry, &pid_dentry_operations); 207 d_set_d_op(dentry, &pid_dentry_operations);
57 d_add(dentry, inode); 208 d_add(dentry, inode);
@@ -60,9 +211,6 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
60 error = NULL; 211 error = NULL;
61out: 212out:
62 return error; 213 return error;
63out_iput:
64 iput(inode);
65 goto out;
66} 214}
67 215
68static int proc_ns_fill_cache(struct file *filp, void *dirent, 216static int proc_ns_fill_cache(struct file *filp, void *dirent,
@@ -89,10 +237,6 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent,
89 if (!task) 237 if (!task)
90 goto out_no_task; 238 goto out_no_task;
91 239
92 ret = -EPERM;
93 if (!ptrace_may_access(task, PTRACE_MODE_READ))
94 goto out;
95
96 ret = 0; 240 ret = 0;
97 i = filp->f_pos; 241 i = filp->f_pos;
98 switch (i) { 242 switch (i) {
@@ -152,10 +296,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
152 if (!task) 296 if (!task)
153 goto out_no_task; 297 goto out_no_task;
154 298
155 error = ERR_PTR(-EPERM);
156 if (!ptrace_may_access(task, PTRACE_MODE_READ))
157 goto out;
158
159 last = &ns_entries[ARRAY_SIZE(ns_entries)]; 299 last = &ns_entries[ARRAY_SIZE(ns_entries)];
160 for (entry = ns_entries; entry < last; entry++) { 300 for (entry = ns_entries; entry < last; entry++) {
161 if (strlen((*entry)->name) != len) 301 if (strlen((*entry)->name) != len)
@@ -163,7 +303,6 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
163 if (!memcmp(dentry->d_name.name, (*entry)->name, len)) 303 if (!memcmp(dentry->d_name.name, (*entry)->name, len))
164 break; 304 break;
165 } 305 }
166 error = ERR_PTR(-ENOENT);
167 if (entry == last) 306 if (entry == last)
168 goto out; 307 goto out;
169 308
@@ -198,3 +337,7 @@ out_invalid:
198 return ERR_PTR(-EINVAL); 337 return ERR_PTR(-EINVAL);
199} 338}
200 339
340bool proc_ns_inode(struct inode *inode)
341{
342 return inode->i_fop == &ns_file_operations;
343}
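
With this rework each /proc/PID/ns entry resolves like a symlink whose target encodes the namespace type and inode number ("mnt:[4026531840]"), so two tasks share a namespace exactly when the targets match. A minimal userspace sketch, not part of the patch; the path and target format follow ns_dname() above:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[64];
            ssize_t n = readlink("/proc/self/ns/mnt", buf, sizeof(buf) - 1);
            if (n < 0) {
                    perror("readlink");
                    return 1;
            }
            buf[n] = '\0';
            /* prints e.g. "mount namespace identity: mnt:[4026531840]" */
            printf("mount namespace identity: %s\n", buf);
            return 0;
    }
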
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index df7dd08d4391..de20ec480fa0 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -195,11 +195,7 @@ void proc_device_tree_add_node(struct device_node *np,
195 set_node_proc_entry(np, de); 195 set_node_proc_entry(np, de);
196 for (child = NULL; (child = of_get_next_child(np, child));) { 196 for (child = NULL; (child = of_get_next_child(np, child));) {
197 /* Use everything after the last slash, or the full name */ 197 /* Use everything after the last slash, or the full name */
198 p = strrchr(child->full_name, '/'); 198 p = kbasename(child->full_name);
199 if (!p)
200 p = child->full_name;
201 else
202 ++p;
203 199
204 if (duplicate_name(de, p)) 200 if (duplicate_name(de, p))
205 p = fixup_name(np, de, p); 201 p = fixup_name(np, de, p);
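
The open-coded strrchr()-plus-fallback dance is replaced by kbasename(). A hedged sketch of the semantics that helper provides (the real one lives in <linux/string.h>):

    #include <string.h>

    /* The component after the last '/', or the whole string when
     * there is no slash at all. */
    static const char *kbasename_sketch(const char *path)
    {
            const char *tail = strrchr(path, '/');
            return tail ? tail + 1 : path;
    }
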
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index a781bdf06694..1827d88ad58b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -378,12 +378,13 @@ static int test_perm(int mode, int op)
378 return -EACCES; 378 return -EACCES;
379} 379}
380 380
381static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) 381static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
382{ 382{
383 struct ctl_table_root *root = head->root;
383 int mode; 384 int mode;
384 385
385 if (root->permissions) 386 if (root->permissions)
386 mode = root->permissions(root, current->nsproxy, table); 387 mode = root->permissions(head, table);
387 else 388 else
388 mode = table->mode; 389 mode = table->mode;
389 390
@@ -491,7 +492,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
491 * and won't be until we finish. 492 * and won't be until we finish.
492 */ 493 */
493 error = -EPERM; 494 error = -EPERM;
494 if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ)) 495 if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ))
495 goto out; 496 goto out;
496 497
497 /* if that can happen at all, it should be -EINVAL, not -EISDIR */ 498 /* if that can happen at all, it should be -EINVAL, not -EISDIR */
@@ -717,7 +718,7 @@ static int proc_sys_permission(struct inode *inode, int mask)
717 if (!table) /* global root - r-xr-xr-x */ 718 if (!table) /* global root - r-xr-xr-x */
718 error = mask & MAY_WRITE ? -EACCES : 0; 719 error = mask & MAY_WRITE ? -EACCES : 0;
719 else /* Use the permissions on the sysctl table entry */ 720 else /* Use the permissions on the sysctl table entry */
720 error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK); 721 error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);
721 722
722 sysctl_head_finish(head); 723 sysctl_head_finish(head);
723 return error; 724 return error;
@@ -735,13 +736,6 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
735 if (error) 736 if (error)
736 return error; 737 return error;
737 738
738 if ((attr->ia_valid & ATTR_SIZE) &&
739 attr->ia_size != i_size_read(inode)) {
740 error = vmtruncate(inode, attr->ia_size);
741 if (error)
742 return error;
743 }
744
745 setattr_copy(inode, attr); 739 setattr_copy(inode, attr);
746 mark_inode_dirty(inode); 740 mark_inode_dirty(inode);
747 return 0; 741 return 0;
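
sysctl_perm() now receives the ctl_table_header rather than the bare root; the root stays reachable as head->root, and the ->permissions hook gains the header's context. A hedged kernel-side sketch of a hook under the new signature (example_permissions is illustrative, not from the patch):

    /* Kernel-side sketch only; types come from the sysctl core. */
    static int example_permissions(struct ctl_table_header *head,
                                   struct ctl_table *table)
    {
            struct ctl_table_root *root = head->root; /* still reachable */

            (void)root; /* a real hook would consult its namespace here */
            return table->mode; /* default: the entry's own mode bits */
    }
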
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9889a92d2e01..c6e9fac26bac 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -100,14 +100,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
100 int err; 100 int err;
101 struct super_block *sb; 101 struct super_block *sb;
102 struct pid_namespace *ns; 102 struct pid_namespace *ns;
103 struct proc_inode *ei;
104 char *options; 103 char *options;
105 104
106 if (flags & MS_KERNMOUNT) { 105 if (flags & MS_KERNMOUNT) {
107 ns = (struct pid_namespace *)data; 106 ns = (struct pid_namespace *)data;
108 options = NULL; 107 options = NULL;
109 } else { 108 } else {
110 ns = current->nsproxy->pid_ns; 109 ns = task_active_pid_ns(current);
111 options = data; 110 options = data;
112 } 111 }
113 112
@@ -130,13 +129,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
130 sb->s_flags |= MS_ACTIVE; 129 sb->s_flags |= MS_ACTIVE;
131 } 130 }
132 131
133 ei = PROC_I(sb->s_root->d_inode);
134 if (!ei->pid) {
135 rcu_read_lock();
136 ei->pid = get_pid(find_pid_ns(1, ns));
137 rcu_read_unlock();
138 }
139
140 return dget(sb->s_root); 132 return dget(sb->s_root);
141} 133}
142 134
@@ -153,6 +145,7 @@ static struct file_system_type proc_fs_type = {
153 .name = "proc", 145 .name = "proc",
154 .mount = proc_mount, 146 .mount = proc_mount,
155 .kill_sb = proc_kill_sb, 147 .kill_sb = proc_kill_sb,
148 .fs_flags = FS_USERNS_MOUNT,
156}; 149};
157 150
158void __init proc_root_init(void) 151void __init proc_root_init(void)
@@ -163,12 +156,8 @@ void __init proc_root_init(void)
163 err = register_filesystem(&proc_fs_type); 156 err = register_filesystem(&proc_fs_type);
164 if (err) 157 if (err)
165 return; 158 return;
166 err = pid_ns_prepare_proc(&init_pid_ns);
167 if (err) {
168 unregister_filesystem(&proc_fs_type);
169 return;
170 }
171 159
160 proc_self_init();
172 proc_symlink("mounts", NULL, "self/mounts"); 161 proc_symlink("mounts", NULL, "self/mounts");
173 162
174 proc_net_init(); 163 proc_net_init();
diff --git a/fs/proc/self.c b/fs/proc/self.c
new file mode 100644
index 000000000000..aa5cc3bff140
--- /dev/null
+++ b/fs/proc/self.c
@@ -0,0 +1,59 @@
1#include <linux/proc_fs.h>
2#include <linux/sched.h>
3#include <linux/namei.h>
4
5/*
6 * /proc/self:
7 */
8static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
9 int buflen)
10{
11 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
12 pid_t tgid = task_tgid_nr_ns(current, ns);
13 char tmp[PROC_NUMBUF];
14 if (!tgid)
15 return -ENOENT;
16 sprintf(tmp, "%d", tgid);
17 return vfs_readlink(dentry,buffer,buflen,tmp);
18}
19
20static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
21{
22 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
23 pid_t tgid = task_tgid_nr_ns(current, ns);
24 char *name = ERR_PTR(-ENOENT);
25 if (tgid) {
26 /* 11 for max length of signed int in decimal + NULL term */
27 name = kmalloc(12, GFP_KERNEL);
28 if (!name)
29 name = ERR_PTR(-ENOMEM);
30 else
31 sprintf(name, "%d", tgid);
32 }
33 nd_set_link(nd, name);
34 return NULL;
35}
36
37static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
38 void *cookie)
39{
40 char *s = nd_get_link(nd);
41 if (!IS_ERR(s))
42 kfree(s);
43}
44
45static const struct inode_operations proc_self_inode_operations = {
46 .readlink = proc_self_readlink,
47 .follow_link = proc_self_follow_link,
48 .put_link = proc_self_put_link,
49};
50
51void __init proc_self_init(void)
52{
53 struct proc_dir_entry *proc_self_symlink;
54 mode_t mode;
55
56 mode = S_IFLNK | S_IRWXUGO;
57 proc_self_symlink = proc_create("self", mode, NULL, NULL);
58 proc_self_symlink->proc_iops = &proc_self_inode_operations;
59}
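
Once /proc/self lives in its own file, resolving it from userspace is a plain readlink(). A minimal demo, not part of the patch:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[32];
            ssize_t n = readlink("/proc/self", buf, sizeof(buf) - 1);
            if (n < 0)
                    return 1;
            buf[n] = '\0';
            /* the link target is the caller's TGID in the mounting
             * PID namespace, per proc_self_readlink() above */
            printf("/proc/self -> %s (getpid() == %d)\n", buf, (int)getpid());
            return 0;
    }
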
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 90c63f9392a5..ca5ce7f9f800 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -526,6 +526,57 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
526 return 0; 526 return 0;
527} 527}
528 528
529static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
530{
531 /*
532 * Don't forget to update Documentation/ on changes.
533 */
534 static const char mnemonics[BITS_PER_LONG][2] = {
535 /*
536 * In case we meet a flag we don't know about.
537 */
538 [0 ... (BITS_PER_LONG-1)] = "??",
539
540 [ilog2(VM_READ)] = "rd",
541 [ilog2(VM_WRITE)] = "wr",
542 [ilog2(VM_EXEC)] = "ex",
543 [ilog2(VM_SHARED)] = "sh",
544 [ilog2(VM_MAYREAD)] = "mr",
545 [ilog2(VM_MAYWRITE)] = "mw",
546 [ilog2(VM_MAYEXEC)] = "me",
547 [ilog2(VM_MAYSHARE)] = "ms",
548 [ilog2(VM_GROWSDOWN)] = "gd",
549 [ilog2(VM_PFNMAP)] = "pf",
550 [ilog2(VM_DENYWRITE)] = "dw",
551 [ilog2(VM_LOCKED)] = "lo",
552 [ilog2(VM_IO)] = "io",
553 [ilog2(VM_SEQ_READ)] = "sr",
554 [ilog2(VM_RAND_READ)] = "rr",
555 [ilog2(VM_DONTCOPY)] = "dc",
556 [ilog2(VM_DONTEXPAND)] = "de",
557 [ilog2(VM_ACCOUNT)] = "ac",
558 [ilog2(VM_NORESERVE)] = "nr",
559 [ilog2(VM_HUGETLB)] = "ht",
560 [ilog2(VM_NONLINEAR)] = "nl",
561 [ilog2(VM_ARCH_1)] = "ar",
562 [ilog2(VM_DONTDUMP)] = "dd",
563 [ilog2(VM_MIXEDMAP)] = "mm",
564 [ilog2(VM_HUGEPAGE)] = "hg",
565 [ilog2(VM_NOHUGEPAGE)] = "nh",
566 [ilog2(VM_MERGEABLE)] = "mg",
567 };
568 size_t i;
569
570 seq_puts(m, "VmFlags: ");
571 for (i = 0; i < BITS_PER_LONG; i++) {
572 if (vma->vm_flags & (1UL << i)) {
573 seq_printf(m, "%c%c ",
574 mnemonics[i][0], mnemonics[i][1]);
575 }
576 }
577 seq_putc(m, '\n');
578}
579
529static int show_smap(struct seq_file *m, void *v, int is_pid) 580static int show_smap(struct seq_file *m, void *v, int is_pid)
530{ 581{
531 struct proc_maps_private *priv = m->private; 582 struct proc_maps_private *priv = m->private;
@@ -581,6 +632,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
581 seq_printf(m, "Nonlinear: %8lu kB\n", 632 seq_printf(m, "Nonlinear: %8lu kB\n",
582 mss.nonlinear >> 10); 633 mss.nonlinear >> 10);
583 634
635 show_smap_vma_flags(m, vma);
636
584 if (m->count < m->size) /* vma is copied successfully */ 637 if (m->count < m->size) /* vma is copied successfully */
585 m->version = (vma != get_gate_vma(task->mm)) 638 m->version = (vma != get_gate_vma(task->mm))
586 ? vma->vm_start : 0; 639 ? vma->vm_start : 0;
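
The new VmFlags line prints one two-letter mnemonic per set VM_* bit. A short userspace reader, illustrative only:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[512];
            FILE *f = fopen("/proc/self/smaps", "r");

            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f))
                    if (strncmp(line, "VmFlags:", 8) == 0)
                            /* e.g. "VmFlags: rd wr mr mw me ac " */
                            fputs(line, stdout);
            fclose(f);
            return 0;
    }
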
@@ -643,7 +696,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
643 spinlock_t *ptl; 696 spinlock_t *ptl;
644 struct page *page; 697 struct page *page;
645 698
646 split_huge_page_pmd(walk->mm, pmd); 699 split_huge_page_pmd(vma, addr, pmd);
647 if (pmd_trans_unstable(pmd)) 700 if (pmd_trans_unstable(pmd))
648 return 0; 701 return 0;
649 702
@@ -1126,7 +1179,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1126 return NULL; 1179 return NULL;
1127 1180
1128 nid = page_to_nid(page); 1181 nid = page_to_nid(page);
1129 if (!node_isset(nid, node_states[N_HIGH_MEMORY])) 1182 if (!node_isset(nid, node_states[N_MEMORY]))
1130 return NULL; 1183 return NULL;
1131 1184
1132 return page; 1185 return page;
@@ -1225,7 +1278,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1225 walk.mm = mm; 1278 walk.mm = mm;
1226 1279
1227 pol = get_vma_policy(task, vma, vma->vm_start); 1280 pol = get_vma_policy(task, vma, vma->vm_start);
1228 mpol_to_str(buffer, sizeof(buffer), pol, 0); 1281 mpol_to_str(buffer, sizeof(buffer), pol);
1229 mpol_cond_put(pol); 1282 mpol_cond_put(pol);
1230 1283
1231 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1284 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
@@ -1279,7 +1332,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1279 if (md->writeback) 1332 if (md->writeback)
1280 seq_printf(m, " writeback=%lu", md->writeback); 1333 seq_printf(m, " writeback=%lu", md->writeback);
1281 1334
1282 for_each_node_state(n, N_HIGH_MEMORY) 1335 for_each_node_state(n, N_MEMORY)
1283 if (md->node[n]) 1336 if (md->node[n])
1284 seq_printf(m, " N%d=%lu", n, md->node[n]); 1337 seq_printf(m, " N%d=%lu", n, md->node[n]);
1285out: 1338out:
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 2d57e1ac0115..43b12807a51d 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -28,7 +28,9 @@
28#include "internal.h" 28#include "internal.h"
29 29
30static void notrace pstore_ftrace_call(unsigned long ip, 30static void notrace pstore_ftrace_call(unsigned long ip,
31 unsigned long parent_ip) 31 unsigned long parent_ip,
32 struct ftrace_ops *op,
33 struct pt_regs *regs)
32{ 34{
33 unsigned long flags; 35 unsigned long flags;
34 struct pstore_ftrace_record rec = {}; 36 struct pstore_ftrace_record rec = {};
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 4ab572e6d277..67de74ca85f4 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -49,6 +49,7 @@ struct pstore_private {
49 struct pstore_info *psi; 49 struct pstore_info *psi;
50 enum pstore_type_id type; 50 enum pstore_type_id type;
51 u64 id; 51 u64 id;
52 int count;
52 ssize_t size; 53 ssize_t size;
53 char data[]; 54 char data[];
54}; 55};
@@ -150,13 +151,13 @@ static int pstore_file_open(struct inode *inode, struct file *file)
150 return 0; 151 return 0;
151} 152}
152 153
153static loff_t pstore_file_llseek(struct file *file, loff_t off, int origin) 154static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence)
154{ 155{
155 struct seq_file *sf = file->private_data; 156 struct seq_file *sf = file->private_data;
156 157
157 if (sf->op) 158 if (sf->op)
158 return seq_lseek(file, off, origin); 159 return seq_lseek(file, off, whence);
159 return default_llseek(file, off, origin); 160 return default_llseek(file, off, whence);
160} 161}
161 162
162static const struct file_operations pstore_file_operations = { 163static const struct file_operations pstore_file_operations = {
@@ -175,7 +176,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
175 struct pstore_private *p = dentry->d_inode->i_private; 176 struct pstore_private *p = dentry->d_inode->i_private;
176 177
177 if (p->psi->erase) 178 if (p->psi->erase)
178 p->psi->erase(p->type, p->id, p->psi); 179 p->psi->erase(p->type, p->id, p->count,
180 dentry->d_inode->i_ctime, p->psi);
179 181
180 return simple_unlink(dir, dentry); 182 return simple_unlink(dir, dentry);
181} 183}
@@ -270,7 +272,7 @@ int pstore_is_mounted(void)
270 * Load it up with "size" bytes of data from "buf". 272 * Load it up with "size" bytes of data from "buf".
271 * Set the mtime & ctime to the date that this record was originally stored. 273 * Set the mtime & ctime to the date that this record was originally stored.
272 */ 274 */
273int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, 275int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
274 char *data, size_t size, struct timespec time, 276 char *data, size_t size, struct timespec time,
275 struct pstore_info *psi) 277 struct pstore_info *psi)
276{ 278{
@@ -306,6 +308,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
306 goto fail_alloc; 308 goto fail_alloc;
307 private->type = type; 309 private->type = type;
308 private->id = id; 310 private->id = id;
311 private->count = count;
309 private->psi = psi; 312 private->psi = psi;
310 313
311 switch (type) { 314 switch (type) {
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 4847f588b7d5..937d820f273c 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -50,7 +50,7 @@ extern struct pstore_info *psinfo;
50extern void pstore_set_kmsg_bytes(int); 50extern void pstore_set_kmsg_bytes(int);
51extern void pstore_get_records(int); 51extern void pstore_get_records(int);
52extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, 52extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
53 char *data, size_t size, 53 int count, char *data, size_t size,
54 struct timespec time, struct pstore_info *psi); 54 struct timespec time, struct pstore_info *psi);
55extern int pstore_is_mounted(void); 55extern int pstore_is_mounted(void);
56 56
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 947fbe06c3b1..5ea2e77ff023 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -136,7 +136,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
136 break; 136 break;
137 137
138 ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, 138 ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
139 hsize + len, psinfo); 139 oopscount, hsize + len, psinfo);
140 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) 140 if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
141 pstore_new_entry = 1; 141 pstore_new_entry = 1;
142 142
@@ -173,7 +173,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
173 spin_lock_irqsave(&psinfo->buf_lock, flags); 173 spin_lock_irqsave(&psinfo->buf_lock, flags);
174 } 174 }
175 memcpy(psinfo->buf, s, c); 175 memcpy(psinfo->buf, s, c);
176 psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, c, psinfo); 176 psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo);
177 spin_unlock_irqrestore(&psinfo->buf_lock, flags); 177 spin_unlock_irqrestore(&psinfo->buf_lock, flags);
178 s += c; 178 s += c;
179 c = e - s; 179 c = e - s;
@@ -197,7 +197,7 @@ static void pstore_register_console(void) {}
197 197
198static int pstore_write_compat(enum pstore_type_id type, 198static int pstore_write_compat(enum pstore_type_id type,
199 enum kmsg_dump_reason reason, 199 enum kmsg_dump_reason reason,
200 u64 *id, unsigned int part, 200 u64 *id, unsigned int part, int count,
201 size_t size, struct pstore_info *psi) 201 size_t size, struct pstore_info *psi)
202{ 202{
203 return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi); 203 return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi);
@@ -267,6 +267,7 @@ void pstore_get_records(int quiet)
267 char *buf = NULL; 267 char *buf = NULL;
268 ssize_t size; 268 ssize_t size;
269 u64 id; 269 u64 id;
270 int count;
270 enum pstore_type_id type; 271 enum pstore_type_id type;
271 struct timespec time; 272 struct timespec time;
272 int failed = 0, rc; 273 int failed = 0, rc;
@@ -278,9 +279,9 @@ void pstore_get_records(int quiet)
278 if (psi->open && psi->open(psi)) 279 if (psi->open && psi->open(psi))
279 goto out; 280 goto out;
280 281
281 while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) { 282 while ((size = psi->read(&id, &type, &count, &time, &buf, psi)) > 0) {
282 rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size, 283 rc = pstore_mkfile(type, psi->name, id, count, buf,
283 time, psi); 284 (size_t)size, time, psi);
284 kfree(buf); 285 kfree(buf);
285 buf = NULL; 286 buf = NULL;
286 if (rc && (rc != -EEXIST || !quiet)) 287 if (rc && (rc != -EEXIST || !quiet))
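
Taken together, the pstore hunks thread an oops "count" through every backend callback. Collected from the hunks above (the authoritative declarations live in <linux/pstore.h>), the reworked prototypes are:

    ssize_t (*read)(u64 *id, enum pstore_type_id *type, int *count,
                    struct timespec *time, char **buf,
                    struct pstore_info *psi);
    int (*write)(enum pstore_type_id type, enum kmsg_dump_reason reason,
                    u64 *id, unsigned int part, int count, size_t size,
                    struct pstore_info *psi);
    int (*erase)(enum pstore_type_id type, u64 id, int count,
                    struct timespec time, struct pstore_info *psi);
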
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 1a4f6da58eab..7003e5266f25 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -132,9 +132,8 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
132} 132}
133 133
134static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, 134static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
135 struct timespec *time, 135 int *count, struct timespec *time,
136 char **buf, 136 char **buf, struct pstore_info *psi)
137 struct pstore_info *psi)
138{ 137{
139 ssize_t size; 138 ssize_t size;
140 struct ramoops_context *cxt = psi->data; 139 struct ramoops_context *cxt = psi->data;
@@ -189,7 +188,7 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
189 struct pstore_info *psi) 188 struct pstore_info *psi)
190{ 189{
191 struct ramoops_context *cxt = psi->data; 190 struct ramoops_context *cxt = psi->data;
192 struct persistent_ram_zone *prz = cxt->przs[cxt->dump_write_cnt]; 191 struct persistent_ram_zone *prz;
193 size_t hlen; 192 size_t hlen;
194 193
195 if (type == PSTORE_TYPE_CONSOLE) { 194 if (type == PSTORE_TYPE_CONSOLE) {
@@ -226,6 +225,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
226 if (part != 1) 225 if (part != 1)
227 return -ENOSPC; 226 return -ENOSPC;
228 227
228 if (!cxt->przs)
229 return -ENOSPC;
230
231 prz = cxt->przs[cxt->dump_write_cnt];
232
229 hlen = ramoops_write_kmsg_hdr(prz); 233 hlen = ramoops_write_kmsg_hdr(prz);
230 if (size + hlen > prz->buffer_size) 234 if (size + hlen > prz->buffer_size)
231 size = prz->buffer_size - hlen; 235 size = prz->buffer_size - hlen;
@@ -236,8 +240,8 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
236 return 0; 240 return 0;
237} 241}
238 242
239static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, 243static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
240 struct pstore_info *psi) 244 struct timespec time, struct pstore_info *psi)
241{ 245{
242 struct ramoops_context *cxt = psi->data; 246 struct ramoops_context *cxt = psi->data;
243 struct persistent_ram_zone *prz; 247 struct persistent_ram_zone *prz;
@@ -288,7 +292,7 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
288} 292}
289 293
290static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, 294static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
291 phys_addr_t *paddr, size_t dump_mem_sz) 295 phys_addr_t *paddr, size_t dump_mem_sz)
292{ 296{
293 int err = -ENOMEM; 297 int err = -ENOMEM;
294 int i; 298 int i;
@@ -296,6 +300,11 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
296 if (!cxt->record_size) 300 if (!cxt->record_size)
297 return 0; 301 return 0;
298 302
303 if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) {
304 dev_err(dev, "no room for dumps\n");
305 return -ENOMEM;
306 }
307
299 cxt->max_dump_cnt = dump_mem_sz / cxt->record_size; 308 cxt->max_dump_cnt = dump_mem_sz / cxt->record_size;
300 if (!cxt->max_dump_cnt) 309 if (!cxt->max_dump_cnt)
301 return -ENOMEM; 310 return -ENOMEM;
@@ -333,8 +342,12 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
333 if (!sz) 342 if (!sz)
334 return 0; 343 return 0;
335 344
336 if (*paddr + sz > *paddr + cxt->size) 345 if (*paddr + sz - cxt->phys_addr > cxt->size) {
346 dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n",
347 sz, (unsigned long long)*paddr,
348 cxt->size, (unsigned long long)cxt->phys_addr);
337 return -ENOMEM; 349 return -ENOMEM;
350 }
338 351
339 *prz = persistent_ram_new(*paddr, sz, sig, cxt->ecc_size); 352 *prz = persistent_ram_new(*paddr, sz, sig, cxt->ecc_size);
340 if (IS_ERR(*prz)) { 353 if (IS_ERR(*prz)) {
@@ -352,7 +365,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt,
352 return 0; 365 return 0;
353} 366}
354 367
355static int __devinit ramoops_probe(struct platform_device *pdev) 368static int ramoops_probe(struct platform_device *pdev)
356{ 369{
357 struct device *dev = &pdev->dev; 370 struct device *dev = &pdev->dev;
358 struct ramoops_platform_data *pdata = pdev->dev.platform_data; 371 struct ramoops_platform_data *pdata = pdev->dev.platform_data;
@@ -374,10 +387,14 @@ static int __devinit ramoops_probe(struct platform_device *pdev)
374 goto fail_out; 387 goto fail_out;
375 } 388 }
376 389
377 pdata->mem_size = rounddown_pow_of_two(pdata->mem_size); 390 if (!is_power_of_2(pdata->mem_size))
378 pdata->record_size = rounddown_pow_of_two(pdata->record_size); 391 pdata->mem_size = rounddown_pow_of_two(pdata->mem_size);
379 pdata->console_size = rounddown_pow_of_two(pdata->console_size); 392 if (!is_power_of_2(pdata->record_size))
380 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); 393 pdata->record_size = rounddown_pow_of_two(pdata->record_size);
394 if (!is_power_of_2(pdata->console_size))
395 pdata->console_size = rounddown_pow_of_two(pdata->console_size);
396 if (!is_power_of_2(pdata->ftrace_size))
397 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
381 398
382 cxt->dump_read_cnt = 0; 399 cxt->dump_read_cnt = 0;
383 cxt->size = pdata->mem_size; 400 cxt->size = pdata->mem_size;
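
The corrected bounds test is worth spelling out: the old form *paddr + sz > *paddr + cxt->size cancels to sz > cxt->size and ignores where the cursor already sits, while the new form measures the end of the request from the region base. A runnable check with made-up numbers:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            /* hypothetical region: 0x1000 bytes reserved at 0x100000 */
            uint64_t phys_addr = 0x100000, size = 0x1000;
            /* cursor near the end, plus a request that would overrun */
            uint64_t paddr = 0x100c00, sz = 0x800;

            assert(!(paddr + sz > paddr + size));  /* old test: passes, wrongly */
            assert(paddr + sz - phys_addr > size); /* new test: rejects */
            return 0;
    }
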
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index eecd2a8a84dd..0306303be372 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -390,8 +390,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
390 return 0; 390 return 0;
391} 391}
392 392
393static int __devinit persistent_ram_post_init(struct persistent_ram_zone *prz, 393static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
394 u32 sig, int ecc_size) 394 int ecc_size)
395{ 395{
396 int ret; 396 int ret;
397 397
@@ -443,9 +443,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
443 kfree(prz); 443 kfree(prz);
444} 444}
445 445
446struct persistent_ram_zone * __devinit persistent_ram_new(phys_addr_t start, 446struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
447 size_t size, u32 sig, 447 u32 sig, int ecc_size)
448 int ecc_size)
449{ 448{
450 struct persistent_ram_zone *prz; 449 struct persistent_ram_zone *prz;
451 int ret = -ENOMEM; 450 int ret = -ENOMEM;
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index af1661f7a54f..c7314f1771f5 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -307,6 +307,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
307 } 307 }
308} 308}
309 309
310#ifdef CONFIG_BLOCK
311
310/* Return 1 if 'cmd' will block on a frozen filesystem */ 312
311static int quotactl_cmd_write(int cmd) 313static int quotactl_cmd_write(int cmd)
312{ 314{
@@ -322,6 +324,8 @@ static int quotactl_cmd_write(int cmd)
322 return 1; 324 return 1;
323} 325}
324 326
327#endif /* CONFIG_BLOCK */
328
325/* 329/*
326 * look up a superblock on which quota ops will be performed 330 * look up a superblock on which quota ops will be performed
327 * - use the name of a block device to find the superblock thereon 331 * - use the name of a block device to find the superblock thereon
diff --git a/fs/read_write.c b/fs/read_write.c
index d06534857e9e..bb34af315280 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -54,7 +54,7 @@ static loff_t lseek_execute(struct file *file, struct inode *inode,
54 * generic_file_llseek_size - generic llseek implementation for regular files 54 * generic_file_llseek_size - generic llseek implementation for regular files
55 * @file: file structure to seek on 55 * @file: file structure to seek on
56 * @offset: file offset to seek to 56 * @offset: file offset to seek to
57 * @origin: type of seek 57 * @whence: type of seek
58 * @size: max size of this file in file system 58 * @size: max size of this file in file system
59 * @eof: offset used for SEEK_END position 59 * @eof: offset used for SEEK_END position
60 * 60 *
@@ -67,12 +67,12 @@ static loff_t lseek_execute(struct file *file, struct inode *inode,
67 * read/writes behave like SEEK_SET against seeks. 67 * read/writes behave like SEEK_SET against seeks.
68 */ 68 */
69loff_t 69loff_t
70generic_file_llseek_size(struct file *file, loff_t offset, int origin, 70generic_file_llseek_size(struct file *file, loff_t offset, int whence,
71 loff_t maxsize, loff_t eof) 71 loff_t maxsize, loff_t eof)
72{ 72{
73 struct inode *inode = file->f_mapping->host; 73 struct inode *inode = file->f_mapping->host;
74 74
75 switch (origin) { 75 switch (whence) {
76 case SEEK_END: 76 case SEEK_END:
77 offset += eof; 77 offset += eof;
78 break; 78 break;
@@ -122,17 +122,17 @@ EXPORT_SYMBOL(generic_file_llseek_size);
122 * generic_file_llseek - generic llseek implementation for regular files 122 * generic_file_llseek - generic llseek implementation for regular files
123 * @file: file structure to seek on 123 * @file: file structure to seek on
124 * @offset: file offset to seek to 124 * @offset: file offset to seek to
125 * @origin: type of seek 125 * @whence: type of seek
126 * 126 *
127 * This is a generic implementation of ->llseek usable for all normal local 127 * This is a generic implementation of ->llseek usable for all normal local
128 * filesystems. It just updates the file offset to the value specified by 128 * filesystems. It just updates the file offset to the value specified by
129 * @offset and @origin under i_mutex. 129 * @offset and @whence under i_mutex.
130 */ 130 */
131loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) 131loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
132{ 132{
133 struct inode *inode = file->f_mapping->host; 133 struct inode *inode = file->f_mapping->host;
134 134
135 return generic_file_llseek_size(file, offset, origin, 135 return generic_file_llseek_size(file, offset, whence,
136 inode->i_sb->s_maxbytes, 136 inode->i_sb->s_maxbytes,
137 i_size_read(inode)); 137 i_size_read(inode));
138} 138}
@@ -142,32 +142,32 @@ EXPORT_SYMBOL(generic_file_llseek);
142 * noop_llseek - No Operation Performed llseek implementation 142 * noop_llseek - No Operation Performed llseek implementation
143 * @file: file structure to seek on 143 * @file: file structure to seek on
144 * @offset: file offset to seek to 144 * @offset: file offset to seek to
145 * @origin: type of seek 145 * @whence: type of seek
146 * 146 *
147 * This is an implementation of ->llseek usable for the rare special case when 147 * This is an implementation of ->llseek usable for the rare special case when
148 * userspace expects the seek to succeed but the (device) file is actually not 148 * userspace expects the seek to succeed but the (device) file is actually not
149 * able to perform the seek. In this case you use noop_llseek() instead of 149 * able to perform the seek. In this case you use noop_llseek() instead of
150 * falling back to the default implementation of ->llseek. 150 * falling back to the default implementation of ->llseek.
151 */ 151 */
152loff_t noop_llseek(struct file *file, loff_t offset, int origin) 152loff_t noop_llseek(struct file *file, loff_t offset, int whence)
153{ 153{
154 return file->f_pos; 154 return file->f_pos;
155} 155}
156EXPORT_SYMBOL(noop_llseek); 156EXPORT_SYMBOL(noop_llseek);
157 157
158loff_t no_llseek(struct file *file, loff_t offset, int origin) 158loff_t no_llseek(struct file *file, loff_t offset, int whence)
159{ 159{
160 return -ESPIPE; 160 return -ESPIPE;
161} 161}
162EXPORT_SYMBOL(no_llseek); 162EXPORT_SYMBOL(no_llseek);
163 163
164loff_t default_llseek(struct file *file, loff_t offset, int origin) 164loff_t default_llseek(struct file *file, loff_t offset, int whence)
165{ 165{
166 struct inode *inode = file->f_path.dentry->d_inode; 166 struct inode *inode = file->f_path.dentry->d_inode;
167 loff_t retval; 167 loff_t retval;
168 168
169 mutex_lock(&inode->i_mutex); 169 mutex_lock(&inode->i_mutex);
170 switch (origin) { 170 switch (whence) {
171 case SEEK_END: 171 case SEEK_END:
172 offset += i_size_read(inode); 172 offset += i_size_read(inode);
173 break; 173 break;
@@ -216,7 +216,7 @@ out:
216} 216}
217EXPORT_SYMBOL(default_llseek); 217EXPORT_SYMBOL(default_llseek);
218 218
219loff_t vfs_llseek(struct file *file, loff_t offset, int origin) 219loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
220{ 220{
221 loff_t (*fn)(struct file *, loff_t, int); 221 loff_t (*fn)(struct file *, loff_t, int);
222 222
@@ -225,11 +225,11 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
225 if (file->f_op && file->f_op->llseek) 225 if (file->f_op && file->f_op->llseek)
226 fn = file->f_op->llseek; 226 fn = file->f_op->llseek;
227 } 227 }
228 return fn(file, offset, origin); 228 return fn(file, offset, whence);
229} 229}
230EXPORT_SYMBOL(vfs_llseek); 230EXPORT_SYMBOL(vfs_llseek);
231 231
232SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin) 232SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
233{ 233{
234 off_t retval; 234 off_t retval;
235 struct fd f = fdget(fd); 235 struct fd f = fdget(fd);
@@ -237,8 +237,8 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
237 return -EBADF; 237 return -EBADF;
238 238
239 retval = -EINVAL; 239 retval = -EINVAL;
240 if (origin <= SEEK_MAX) { 240 if (whence <= SEEK_MAX) {
241 loff_t res = vfs_llseek(f.file, offset, origin); 241 loff_t res = vfs_llseek(f.file, offset, whence);
242 retval = res; 242 retval = res;
243 if (res != (loff_t)retval) 243 if (res != (loff_t)retval)
244 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ 244 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
@@ -250,7 +250,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
250#ifdef __ARCH_WANT_SYS_LLSEEK 250#ifdef __ARCH_WANT_SYS_LLSEEK
251SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, 251SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
252 unsigned long, offset_low, loff_t __user *, result, 252 unsigned long, offset_low, loff_t __user *, result,
253 unsigned int, origin) 253 unsigned int, whence)
254{ 254{
255 int retval; 255 int retval;
256 struct fd f = fdget(fd); 256 struct fd f = fdget(fd);
@@ -260,11 +260,11 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
260 return -EBADF; 260 return -EBADF;
261 261
262 retval = -EINVAL; 262 retval = -EINVAL;
263 if (origin > SEEK_MAX) 263 if (whence > SEEK_MAX)
264 goto out_putf; 264 goto out_putf;
265 265
266 offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, 266 offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
267 origin); 267 whence);
268 268
269 retval = (int)offset; 269 retval = (int)offset;
270 if (offset >= 0) { 270 if (offset >= 0) {
@@ -935,6 +935,8 @@ ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
935 if (retval > 0) { 935 if (retval > 0) {
936 add_rchar(current, retval); 936 add_rchar(current, retval);
937 add_wchar(current, retval); 937 add_wchar(current, retval);
938 fsnotify_access(in.file);
939 fsnotify_modify(out.file);
938 } 940 }
939 941
940 inc_syscr(current); 942 inc_syscr(current);
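
The origin -> whence rename aligns the VFS with the terminology of the lseek(2) man page. For reference, the three classic whence values in a small userspace demo (any readable file works; /etc/hostname is just an example):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/etc/hostname", O_RDONLY);

            if (fd < 0)
                    return 1;
            printf("SEEK_SET 0  -> %lld\n", (long long)lseek(fd, 0, SEEK_SET));
            printf("SEEK_CUR +2 -> %lld\n", (long long)lseek(fd, 2, SEEK_CUR));
            printf("SEEK_END 0  -> %lld (the file size)\n",
                   (long long)lseek(fd, 0, SEEK_END));
            close(fd);
            return 0;
    }
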
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 8375c922c0d5..50302d6f8895 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -126,7 +126,7 @@ static int reiserfs_file_open(struct inode *inode, struct file *file)
126 return err; 126 return err;
127} 127}
128 128
129static void reiserfs_vfs_truncate_file(struct inode *inode) 129void reiserfs_vfs_truncate_file(struct inode *inode)
130{ 130{
131 mutex_lock(&(REISERFS_I(inode)->tailpack)); 131 mutex_lock(&(REISERFS_I(inode)->tailpack));
132 reiserfs_truncate_file(inode, 1); 132 reiserfs_truncate_file(inode, 1);
@@ -312,7 +312,6 @@ const struct file_operations reiserfs_file_operations = {
312}; 312};
313 313
314const struct inode_operations reiserfs_file_inode_operations = { 314const struct inode_operations reiserfs_file_inode_operations = {
315 .truncate = reiserfs_vfs_truncate_file,
316 .setattr = reiserfs_setattr, 315 .setattr = reiserfs_setattr,
317 .setxattr = reiserfs_setxattr, 316 .setxattr = reiserfs_setxattr,
318 .getxattr = reiserfs_getxattr, 317 .getxattr = reiserfs_getxattr,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d83736fbc26c..95d7680ead47 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3085,8 +3085,10 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
3085 loff_t isize = i_size_read(inode); 3085 loff_t isize = i_size_read(inode);
3086 loff_t end = offset + iov_length(iov, nr_segs); 3086 loff_t end = offset + iov_length(iov, nr_segs);
3087 3087
3088 if (end > isize) 3088 if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
3089 vmtruncate(inode, isize); 3089 truncate_setsize(inode, isize);
3090 reiserfs_vfs_truncate_file(inode);
3091 }
3090 } 3092 }
3091 3093
3092 return ret; 3094 return ret;
@@ -3200,8 +3202,13 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3200 */ 3202 */
3201 reiserfs_write_unlock_once(inode->i_sb, depth); 3203 reiserfs_write_unlock_once(inode->i_sb, depth);
3202 if ((attr->ia_valid & ATTR_SIZE) && 3204 if ((attr->ia_valid & ATTR_SIZE) &&
3203 attr->ia_size != i_size_read(inode)) 3205 attr->ia_size != i_size_read(inode)) {
3204 error = vmtruncate(inode, attr->ia_size); 3206 error = inode_newsize_ok(inode, attr->ia_size);
3207 if (!error) {
3208 truncate_setsize(inode, attr->ia_size);
3209 reiserfs_vfs_truncate_file(inode);
3210 }
3211 }
3205 3212
3206 if (!error) { 3213 if (!error) {
3207 setattr_copy(inode, attr); 3214 setattr_copy(inode, attr);
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 33215f57ea06..157e474ab303 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2455,6 +2455,7 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
2455 *, 2455 *,
2456 int count); 2456 int count);
2457int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *); 2457int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
2458void reiserfs_vfs_truncate_file(struct inode *inode);
2458int reiserfs_commit_page(struct inode *inode, struct page *page, 2459int reiserfs_commit_page(struct inode *inode, struct page *page,
2459 unsigned from, unsigned to); 2460 unsigned from, unsigned to);
2460void reiserfs_flush_old_commits(struct super_block *); 2461void reiserfs_flush_old_commits(struct super_block *);
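
With ->truncate dropped from the inode operations, each filesystem open-codes what vmtruncate() used to bundle. A hedged kernel-side sketch of the sequence the reiserfs hunks (and the sysv/ufs hunks further down) converge on; example_setattr_size is illustrative, not from the patch:

    /* Validate the new size, shrink i_size and the pagecache, then
     * run the filesystem's own on-disk truncation. */
    static int example_setattr_size(struct inode *inode, struct iattr *attr)
    {
            int error = inode_newsize_ok(inode, attr->ia_size);
            if (error)
                    return error;
            truncate_setsize(inode, attr->ia_size);
            reiserfs_vfs_truncate_file(inode);
            return 0;
    }
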
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 99dffab4c4e4..f2bc3dfd0b88 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -296,18 +296,18 @@ EXPORT_SYMBOL(seq_read);
296 * seq_lseek - ->llseek() method for sequential files. 296 * seq_lseek - ->llseek() method for sequential files.
297 * @file: the file in question 297 * @file: the file in question
298 * @offset: new position 298 * @offset: new position
299 * @origin: 0 for absolute, 1 for relative position 299 * @whence: 0 for absolute, 1 for relative position
300 * 300 *
301 * Ready-made ->f_op->llseek() 301 * Ready-made ->f_op->llseek()
302 */ 302 */
303loff_t seq_lseek(struct file *file, loff_t offset, int origin) 303loff_t seq_lseek(struct file *file, loff_t offset, int whence)
304{ 304{
305 struct seq_file *m = file->private_data; 305 struct seq_file *m = file->private_data;
306 loff_t retval = -EINVAL; 306 loff_t retval = -EINVAL;
307 307
308 mutex_lock(&m->lock); 308 mutex_lock(&m->lock);
309 m->version = file->f_version; 309 m->version = file->f_version;
310 switch (origin) { 310 switch (whence) {
311 case 1: 311 case 1:
312 offset += file->f_pos; 312 offset += file->f_pos;
313 case 0: 313 case 0:
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 8bee4e570911..b53486961735 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -29,6 +29,7 @@
29#include <linux/anon_inodes.h> 29#include <linux/anon_inodes.h>
30#include <linux/signalfd.h> 30#include <linux/signalfd.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/proc_fs.h>
32 33
33void signalfd_cleanup(struct sighand_struct *sighand) 34void signalfd_cleanup(struct sighand_struct *sighand)
34{ 35{
@@ -227,7 +228,24 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
227 return total ? total: ret; 228 return total ? total: ret;
228} 229}
229 230
231#ifdef CONFIG_PROC_FS
232static int signalfd_show_fdinfo(struct seq_file *m, struct file *f)
233{
234 struct signalfd_ctx *ctx = f->private_data;
235 sigset_t sigmask;
236
237 sigmask = ctx->sigmask;
238 signotset(&sigmask);
239 render_sigset_t(m, "sigmask:\t", &sigmask);
240
241 return 0;
242}
243#endif
244
230static const struct file_operations signalfd_fops = { 245static const struct file_operations signalfd_fops = {
246#ifdef CONFIG_PROC_FS
247 .show_fdinfo = signalfd_show_fdinfo,
248#endif
231 .release = signalfd_release, 249 .release = signalfd_release,
232 .poll = signalfd_poll, 250 .poll = signalfd_poll,
233 .read = signalfd_read, 251 .read = signalfd_read,
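
With the show_fdinfo hook wired up, /proc/self/fdinfo/<fd> of a signalfd grows a "sigmask:" line. A small demo (illustrative; the exact layout is whatever render_sigset_t emits):

    #include <signal.h>
    #include <stdio.h>
    #include <sys/signalfd.h>
    #include <unistd.h>

    int main(void)
    {
            sigset_t mask;
            char path[64], line[128];
            FILE *f;
            int fd;

            sigemptyset(&mask);
            sigaddset(&mask, SIGUSR1);
            fd = signalfd(-1, &mask, 0);
            if (fd < 0)
                    return 1;
            snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
            f = fopen(path, "r");
            if (f) {
                    while (fgets(line, sizeof(line), f))
                            fputs(line, stdout);
                    fclose(f);
            }
            close(fd);
            return 0;
    }
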
diff --git a/fs/splice.c b/fs/splice.c
index 13e5b4776e7a..6909d89d0da5 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -696,8 +696,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
696 return -EINVAL; 696 return -EINVAL;
697 697
698 more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; 698 more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
699 if (sd->len < sd->total_len) 699
700 if (sd->len < sd->total_len && pipe->nrbufs > 1)
700 more |= MSG_SENDPAGE_NOTLAST; 701 more |= MSG_SENDPAGE_NOTLAST;
702
701 return file->f_op->sendpage(file, buf->page, buf->offset, 703 return file->f_op->sendpage(file, buf->page, buf->offset,
702 sd->len, &pos, more); 704 sd->len, &pos, more);
703} 705}
@@ -1024,17 +1026,14 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
1024 ret = sd.num_spliced; 1026 ret = sd.num_spliced;
1025 1027
1026 if (ret > 0) { 1028 if (ret > 0) {
1027 unsigned long nr_pages;
1028 int err; 1029 int err;
1029 1030
1030 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1031
1032 err = generic_write_sync(out, *ppos, ret); 1031 err = generic_write_sync(out, *ppos, ret);
1033 if (err) 1032 if (err)
1034 ret = err; 1033 ret = err;
1035 else 1034 else
1036 *ppos += ret; 1035 *ppos += ret;
1037 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 1036 balance_dirty_pages_ratelimited(mapping);
1038 } 1037 }
1039 sb_end_write(inode->i_sb); 1038 sb_end_write(inode->i_sb);
1040 1039
diff --git a/fs/stat.c b/fs/stat.c
index eae494630a36..14f45459c83d 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -74,7 +74,7 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
74{ 74{
75 struct path path; 75 struct path path;
76 int error = -EINVAL; 76 int error = -EINVAL;
77 int lookup_flags = 0; 77 unsigned int lookup_flags = 0;
78 78
79 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | 79 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
80 AT_EMPTY_PATH)) != 0) 80 AT_EMPTY_PATH)) != 0)
@@ -84,13 +84,17 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
84 lookup_flags |= LOOKUP_FOLLOW; 84 lookup_flags |= LOOKUP_FOLLOW;
85 if (flag & AT_EMPTY_PATH) 85 if (flag & AT_EMPTY_PATH)
86 lookup_flags |= LOOKUP_EMPTY; 86 lookup_flags |= LOOKUP_EMPTY;
87 87retry:
88 error = user_path_at(dfd, filename, lookup_flags, &path); 88 error = user_path_at(dfd, filename, lookup_flags, &path);
89 if (error) 89 if (error)
90 goto out; 90 goto out;
91 91
92 error = vfs_getattr(path.mnt, path.dentry, stat); 92 error = vfs_getattr(path.mnt, path.dentry, stat);
93 path_put(&path); 93 path_put(&path);
94 if (retry_estale(error, lookup_flags)) {
95 lookup_flags |= LOOKUP_REVAL;
96 goto retry;
97 }
94out: 98out:
95 return error; 99 return error;
96} 100}
@@ -296,11 +300,13 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
296 struct path path; 300 struct path path;
297 int error; 301 int error;
298 int empty = 0; 302 int empty = 0;
303 unsigned int lookup_flags = LOOKUP_EMPTY;
299 304
300 if (bufsiz <= 0) 305 if (bufsiz <= 0)
301 return -EINVAL; 306 return -EINVAL;
302 307
303 error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty); 308retry:
309 error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);
304 if (!error) { 310 if (!error) {
305 struct inode *inode = path.dentry->d_inode; 311 struct inode *inode = path.dentry->d_inode;
306 312
@@ -314,6 +320,10 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
314 } 320 }
315 } 321 }
316 path_put(&path); 322 path_put(&path);
323 if (retry_estale(error, lookup_flags)) {
324 lookup_flags |= LOOKUP_REVAL;
325 goto retry;
326 }
317 } 327 }
318 return error; 328 return error;
319} 329}
diff --git a/fs/statfs.c b/fs/statfs.c
index f8e832e6f0a2..c219e733f553 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -77,10 +77,17 @@ EXPORT_SYMBOL(vfs_statfs);
77int user_statfs(const char __user *pathname, struct kstatfs *st) 77int user_statfs(const char __user *pathname, struct kstatfs *st)
78{ 78{
79 struct path path; 79 struct path path;
80 int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); 80 int error;
81 unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT;
82retry:
83 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
81 if (!error) { 84 if (!error) {
82 error = vfs_statfs(&path, st); 85 error = vfs_statfs(&path, st);
83 path_put(&path); 86 path_put(&path);
87 if (retry_estale(error, lookup_flags)) {
88 lookup_flags |= LOOKUP_REVAL;
89 goto retry;
90 }
84 } 91 }
85 return error; 92 return error;
86} 93}
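
The retry loops added to vfs_fstatat(), readlinkat() and user_statfs() all hinge on one predicate: retry a failed walk exactly once, with forced revalidation, when it died on a stale (typically NFS) handle. A hedged sketch of that helper; the real declaration sits next to the LOOKUP_* flags:

    static inline bool retry_estale(long error, unsigned int flags)
    {
            /* retry only for -ESTALE, and only if we have not
             * already walked with LOOKUP_REVAL */
            return error == -ESTALE && !(flags & LOOKUP_REVAL);
    }
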
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 00012e31829d..602f56db0442 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -485,8 +485,8 @@ const struct file_operations sysfs_file_operations = {
485 .poll = sysfs_poll, 485 .poll = sysfs_poll,
486}; 486};
487 487
488int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, 488static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
489 const void **pns) 489 const void **pns)
490{ 490{
491 struct sysfs_dirent *dir_sd = kobj->sd; 491 struct sysfs_dirent *dir_sd = kobj->sd;
492 const struct sysfs_ops *ops; 492 const struct sysfs_ops *ops;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 71eb7e253927..db940a9be045 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -149,6 +149,7 @@ static struct file_system_type sysfs_fs_type = {
149 .name = "sysfs", 149 .name = "sysfs",
150 .mount = sysfs_mount, 150 .mount = sysfs_mount,
151 .kill_sb = sysfs_kill_sb, 151 .kill_sb = sysfs_kill_sb,
152 .fs_flags = FS_USERNS_MOUNT,
152}; 153};
153 154
154int __init sysfs_init(void) 155int __init sysfs_init(void)
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 0a65939508e9..9d4dc6831792 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -41,9 +41,11 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
41 41
42 if ((attr->ia_valid & ATTR_SIZE) && 42 if ((attr->ia_valid & ATTR_SIZE) &&
43 attr->ia_size != i_size_read(inode)) { 43 attr->ia_size != i_size_read(inode)) {
44 error = vmtruncate(inode, attr->ia_size); 44 error = inode_newsize_ok(inode, attr->ia_size);
45 if (error) 45 if (error)
46 return error; 46 return error;
47 truncate_setsize(inode, attr->ia_size);
48 sysv_truncate(inode);
47 } 49 }
48 50
49 setattr_copy(inode, attr); 51 setattr_copy(inode, attr);
@@ -52,7 +54,6 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
52} 54}
53 55
54const struct inode_operations sysv_file_inode_operations = { 56const struct inode_operations sysv_file_inode_operations = {
55 .truncate = sysv_truncate,
56 .setattr = sysv_setattr, 57 .setattr = sysv_setattr,
57 .getattr = sysv_getattr, 58 .getattr = sysv_getattr,
58}; 59};
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 90b54b438789..c1a591a4725b 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -464,6 +464,16 @@ int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
464 return __block_write_begin(page, pos, len, get_block); 464 return __block_write_begin(page, pos, len, get_block);
465} 465}
466 466
467static void sysv_write_failed(struct address_space *mapping, loff_t to)
468{
469 struct inode *inode = mapping->host;
470
471 if (to > inode->i_size) {
472 truncate_pagecache(inode, to, inode->i_size);
473 sysv_truncate(inode);
474 }
475}
476
467static int sysv_write_begin(struct file *file, struct address_space *mapping, 477static int sysv_write_begin(struct file *file, struct address_space *mapping,
468 loff_t pos, unsigned len, unsigned flags, 478 loff_t pos, unsigned len, unsigned flags,
469 struct page **pagep, void **fsdata) 479 struct page **pagep, void **fsdata)
@@ -471,11 +481,8 @@ static int sysv_write_begin(struct file *file, struct address_space *mapping,
471 int ret; 481 int ret;
472 482
473 ret = block_write_begin(mapping, pos, len, flags, pagep, get_block); 483 ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
474 if (unlikely(ret)) { 484 if (unlikely(ret))
475 loff_t isize = mapping->host->i_size; 485 sysv_write_failed(mapping, pos + len);
476 if (pos + len > isize)
477 vmtruncate(mapping->host, isize);
478 }
479 486
480 return ret; 487 return ret;
481} 488}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 62911637e12f..12817ffc7345 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2560,7 +2560,7 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
2560static int corrupt_data(const struct ubifs_info *c, const void *buf, 2560static int corrupt_data(const struct ubifs_info *c, const void *buf,
2561 unsigned int len) 2561 unsigned int len)
2562{ 2562{
2563 unsigned int from, to, i, ffs = chance(1, 2); 2563 unsigned int from, to, ffs = chance(1, 2);
2564 unsigned char *p = (void *)buf; 2564 unsigned char *p = (void *)buf;
2565 2565
2566 from = random32() % (len + 1); 2566 from = random32() % (len + 1);
@@ -2571,11 +2571,9 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
2571 ffs ? "0xFFs" : "random data"); 2571 ffs ? "0xFFs" : "random data");
2572 2572
2573 if (ffs) 2573 if (ffs)
2574 for (i = from; i < to; i++) 2574 memset(p + from, 0xFF, to - from);
2575 p[i] = 0xFF;
2576 else 2575 else
2577 for (i = from; i < to; i++) 2576 prandom_bytes(p + from, to - from);
2578 p[i] = random32() % 0x100;
2579 2577
2580 return to; 2578 return to;
2581} 2579}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index e271fba1651b..8a574776a493 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -453,11 +453,11 @@ out:
453} 453}
454 454
455/* If someone seeks on a directory, we have to free saved readdir() state */ 455/* If someone seeks on a directory, we have to free saved readdir() state */
456static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin) 456static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence)
457{ 457{
458 kfree(file->private_data); 458 kfree(file->private_data);
459 file->private_data = NULL; 459 file->private_data = NULL;
460 return generic_file_llseek(file, offset, origin); 460 return generic_file_llseek(file, offset, whence);
461} 461}
462 462
463/* Free saved readdir() state when the directory is closed */ 463/* Free saved readdir() state when the directory is closed */
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index df88b957ccf0..cbae1ed0b7c1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -587,7 +587,6 @@ out:
587static sector_t inode_getblk(struct inode *inode, sector_t block, 587static sector_t inode_getblk(struct inode *inode, sector_t block,
588 int *err, int *new) 588 int *err, int *new)
589{ 589{
590 static sector_t last_block;
591 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; 590 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
592 struct extent_position prev_epos, cur_epos, next_epos; 591 struct extent_position prev_epos, cur_epos, next_epos;
593 int count = 0, startnum = 0, endnum = 0; 592 int count = 0, startnum = 0, endnum = 0;
@@ -601,6 +600,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
601 struct udf_inode_info *iinfo = UDF_I(inode); 600 struct udf_inode_info *iinfo = UDF_I(inode);
602 int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; 601 int goal = 0, pgoal = iinfo->i_location.logicalBlockNum;
603 int lastblock = 0; 602 int lastblock = 0;
603 bool isBeyondEOF;
604 604
605 *err = 0; 605 *err = 0;
606 *new = 0; 606 *new = 0;
@@ -676,11 +676,10 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
676 return newblock; 676 return newblock;
677 } 677 }
678 678
679 last_block = block;
680 /* Are we beyond EOF? */ 679 /* Are we beyond EOF? */
681 if (etype == -1) { 680 if (etype == -1) {
682 int ret; 681 int ret;
683 682 isBeyondEOF = 1;
684 if (count) { 683 if (count) {
685 if (c) 684 if (c)
686 laarr[0] = laarr[1]; 685 laarr[0] = laarr[1];
@@ -718,11 +717,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
718 memset(&laarr[c].extLocation, 0x00, 717 memset(&laarr[c].extLocation, 0x00,
719 sizeof(struct kernel_lb_addr)); 718 sizeof(struct kernel_lb_addr));
720 count++; 719 count++;
721 endnum++;
722 } 720 }
723 endnum = c + 1; 721 endnum = c + 1;
724 lastblock = 1; 722 lastblock = 1;
725 } else { 723 } else {
724 isBeyondEOF = 0;
726 endnum = startnum = ((count > 2) ? 2 : count); 725 endnum = startnum = ((count > 2) ? 2 : count);
727 726
728 /* if the current extent is in position 0, 727 /* if the current extent is in position 0,
@@ -765,10 +764,13 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
765 goal, err); 764 goal, err);
766 if (!newblocknum) { 765 if (!newblocknum) {
767 brelse(prev_epos.bh); 766 brelse(prev_epos.bh);
767 brelse(cur_epos.bh);
768 brelse(next_epos.bh);
768 *err = -ENOSPC; 769 *err = -ENOSPC;
769 return 0; 770 return 0;
770 } 771 }
771 iinfo->i_lenExtents += inode->i_sb->s_blocksize; 772 if (isBeyondEOF)
773 iinfo->i_lenExtents += inode->i_sb->s_blocksize;
772 } 774 }
773 775
774 /* if the extent the requested block is located in contains multiple 776 /* if the extent the requested block is located in contains multiple
@@ -795,6 +797,8 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
795 udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); 797 udf_update_extents(inode, laarr, startnum, endnum, &prev_epos);
796 798
797 brelse(prev_epos.bh); 799 brelse(prev_epos.bh);
800 brelse(cur_epos.bh);
801 brelse(next_epos.bh);
798 802
799 newblock = udf_get_pblock(inode->i_sb, newblocknum, 803 newblock = udf_get_pblock(inode->i_sb, newblocknum,
800 iinfo->i_location.partitionReferenceNum, 0); 804 iinfo->i_location.partitionReferenceNum, 0);
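
The inode_getblk() fix above releases all three extent-position buffers on the -ENOSPC path as well as on the normal exit, and only grows i_lenExtents when the new block really lies beyond EOF. A compilable sketch of the balanced-cleanup shape, with heap blocks standing in for buffer heads:

#include <stdlib.h>

/* Balanced cleanup: the three "positions" (heap blocks here, buffer
 * heads in UDF) are released on the failure path and the normal path
 * alike, mirroring the added brelse(cur_epos.bh)/brelse(next_epos.bh). */
static int map_block(int alloc_fails)
{
	char *prev = malloc(64), *cur = malloc(64), *next = malloc(64);
	int err = 0;

	if (alloc_fails) {
		err = -28;		/* -ENOSPC; previously cur/next leaked here */
		goto out;
	}
	/* ... update extents using all three positions ... */
out:
	free(prev);
	free(cur);
	free(next);
	return err;
}
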
diff --git a/fs/udf/super.c b/fs/udf/super.c
index d44fb568abe1..e9be396a558d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -307,7 +307,8 @@ static void udf_sb_free_partitions(struct super_block *sb)
307{ 307{
308 struct udf_sb_info *sbi = UDF_SB(sb); 308 struct udf_sb_info *sbi = UDF_SB(sb);
309 int i; 309 int i;
310 310 if (sbi->s_partmaps == NULL)
311 return;
311 for (i = 0; i < sbi->s_partitions; i++) 312 for (i = 0; i < sbi->s_partitions; i++)
312 udf_free_partition(&sbi->s_partmaps[i]); 313 udf_free_partition(&sbi->s_partmaps[i]);
313 kfree(sbi->s_partmaps); 314 kfree(sbi->s_partmaps);
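
The added NULL check makes udf_sb_free_partitions() safe to call when mount setup failed before the partition map was allocated. The same defensive teardown shape in a standalone sketch, with invented names:

#include <stdlib.h>

struct pmap;
struct sb_info {
	struct pmap *maps;
	int count;
};

/* Safe even when setup never allocated maps; clearing the pointer also
 * makes an accidental second call harmless. */
static void free_partitions(struct sb_info *sbi)
{
	if (sbi->maps == NULL)
		return;
	/* ... per-partition cleanup loop would go here ... */
	free(sbi->maps);
	sbi->maps = NULL;
}
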
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index eb6d0b7dc879..ff24e4449ece 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -526,6 +526,14 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
526 return __block_write_begin(page, pos, len, ufs_getfrag_block); 526 return __block_write_begin(page, pos, len, ufs_getfrag_block);
527} 527}
528 528
529static void ufs_write_failed(struct address_space *mapping, loff_t to)
530{
531 struct inode *inode = mapping->host;
532
533 if (to > inode->i_size)
534 truncate_pagecache(inode, to, inode->i_size);
535}
536
529static int ufs_write_begin(struct file *file, struct address_space *mapping, 537static int ufs_write_begin(struct file *file, struct address_space *mapping,
530 loff_t pos, unsigned len, unsigned flags, 538 loff_t pos, unsigned len, unsigned flags,
531 struct page **pagep, void **fsdata) 539 struct page **pagep, void **fsdata)
@@ -534,11 +542,8 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,
534 542
535 ret = block_write_begin(mapping, pos, len, flags, pagep, 543 ret = block_write_begin(mapping, pos, len, flags, pagep,
536 ufs_getfrag_block); 544 ufs_getfrag_block);
537 if (unlikely(ret)) { 545 if (unlikely(ret))
538 loff_t isize = mapping->host->i_size; 546 ufs_write_failed(mapping, pos + len);
539 if (pos + len > isize)
540 vmtruncate(mapping->host, isize);
541 }
542 547
543 return ret; 548 return ret;
544} 549}
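
ufs_write_failed() replaces the removed open-coded vmtruncate(): when block allocation for a write past EOF fails, any pagecache instantiated beyond i_size is trimmed back so no phantom tail survives. A userspace analogue of the same undo-on-failure idea, using ftruncate() purely for illustration:

#include <sys/types.h>
#include <unistd.h>

/* If an extending write failed partway, shrink the file back to the
 * last known-good size rather than exposing a half-written tail. */
static void write_failed(int fd, off_t good_size, off_t attempted_end)
{
	if (attempted_end > good_size)
		(void)ftruncate(fd, good_size);
}
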
diff --git a/fs/utimes.c b/fs/utimes.c
index bb0696a41735..f4fb7eca10e8 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -158,13 +158,17 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,
158 158
159 if (!(flags & AT_SYMLINK_NOFOLLOW)) 159 if (!(flags & AT_SYMLINK_NOFOLLOW))
160 lookup_flags |= LOOKUP_FOLLOW; 160 lookup_flags |= LOOKUP_FOLLOW;
161 161retry:
162 error = user_path_at(dfd, filename, lookup_flags, &path); 162 error = user_path_at(dfd, filename, lookup_flags, &path);
163 if (error) 163 if (error)
164 goto out; 164 goto out;
165 165
166 error = utimes_common(&path, times); 166 error = utimes_common(&path, times);
167 path_put(&path); 167 path_put(&path);
168 if (retry_estale(error, lookup_flags)) {
169 lookup_flags |= LOOKUP_REVAL;
170 goto retry;
171 }
168 } 172 }
169 173
170out: 174out:
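
This retry: label plus retry_estale() shape recurs in every syscall touched by the series (the xattr hunks below follow it verbatim). The first lookup may be satisfied from cached dentries that a server-side rename or delete has made stale; if the operation then fails with ESTALE, the syscall retries exactly once with LOOKUP_REVAL set, forcing every path component to be revalidated. Because retry_estale() refuses once LOOKUP_REVAL is already set, the loop cannot run more than twice. A sketch of the skeleton, with do_lookup()/do_op() as hypothetical stand-ins for user_path_at() and the per-syscall body:

#define LOOKUP_FOLLOW	0x0001
#define LOOKUP_REVAL	0x0020
#define ESTALE		116

extern long do_lookup(const char *name, unsigned int flags);
extern long do_op(const char *name);

/* Retry only on ESTALE, and only if revalidation was not yet forced. */
static inline int retry_estale(long error, unsigned int flags)
{
	return error == -ESTALE && !(flags & LOOKUP_REVAL);
}

long estale_retry_skeleton(const char *name)
{
	unsigned int lookup_flags = LOOKUP_FOLLOW;
	long error;
retry:
	error = do_lookup(name, lookup_flags);
	if (error)
		return error;
	error = do_op(name);
	if (retry_estale(error, lookup_flags)) {	/* false once REVAL set */
		lookup_flags |= LOOKUP_REVAL;		/* second pass only */
		goto retry;
	}
	return error;
}
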
diff --git a/fs/xattr.c b/fs/xattr.c
index e21c119f4f99..3377dff18404 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -370,8 +370,9 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
370{ 370{
371 struct path path; 371 struct path path;
372 int error; 372 int error;
373 373 unsigned int lookup_flags = LOOKUP_FOLLOW;
374 error = user_path(pathname, &path); 374retry:
375 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
375 if (error) 376 if (error)
376 return error; 377 return error;
377 error = mnt_want_write(path.mnt); 378 error = mnt_want_write(path.mnt);
@@ -380,6 +381,10 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
380 mnt_drop_write(path.mnt); 381 mnt_drop_write(path.mnt);
381 } 382 }
382 path_put(&path); 383 path_put(&path);
384 if (retry_estale(error, lookup_flags)) {
385 lookup_flags |= LOOKUP_REVAL;
386 goto retry;
387 }
383 return error; 388 return error;
384} 389}
385 390
@@ -389,8 +394,9 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
389{ 394{
390 struct path path; 395 struct path path;
391 int error; 396 int error;
392 397 unsigned int lookup_flags = 0;
393 error = user_lpath(pathname, &path); 398retry:
399 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
394 if (error) 400 if (error)
395 return error; 401 return error;
396 error = mnt_want_write(path.mnt); 402 error = mnt_want_write(path.mnt);
@@ -399,6 +405,10 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
399 mnt_drop_write(path.mnt); 405 mnt_drop_write(path.mnt);
400 } 406 }
401 path_put(&path); 407 path_put(&path);
408 if (retry_estale(error, lookup_flags)) {
409 lookup_flags |= LOOKUP_REVAL;
410 goto retry;
411 }
402 return error; 412 return error;
403} 413}
404 414
@@ -476,12 +486,17 @@ SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
476{ 486{
477 struct path path; 487 struct path path;
478 ssize_t error; 488 ssize_t error;
479 489 unsigned int lookup_flags = LOOKUP_FOLLOW;
480 error = user_path(pathname, &path); 490retry:
491 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
481 if (error) 492 if (error)
482 return error; 493 return error;
483 error = getxattr(path.dentry, name, value, size); 494 error = getxattr(path.dentry, name, value, size);
484 path_put(&path); 495 path_put(&path);
496 if (retry_estale(error, lookup_flags)) {
497 lookup_flags |= LOOKUP_REVAL;
498 goto retry;
499 }
485 return error; 500 return error;
486} 501}
487 502
@@ -490,12 +505,17 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
490{ 505{
491 struct path path; 506 struct path path;
492 ssize_t error; 507 ssize_t error;
493 508 unsigned int lookup_flags = 0;
494 error = user_lpath(pathname, &path); 509retry:
510 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
495 if (error) 511 if (error)
496 return error; 512 return error;
497 error = getxattr(path.dentry, name, value, size); 513 error = getxattr(path.dentry, name, value, size);
498 path_put(&path); 514 path_put(&path);
515 if (retry_estale(error, lookup_flags)) {
516 lookup_flags |= LOOKUP_REVAL;
517 goto retry;
518 }
499 return error; 519 return error;
500} 520}
501 521
@@ -556,12 +576,17 @@ SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
556{ 576{
557 struct path path; 577 struct path path;
558 ssize_t error; 578 ssize_t error;
559 579 unsigned int lookup_flags = LOOKUP_FOLLOW;
560 error = user_path(pathname, &path); 580retry:
581 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
561 if (error) 582 if (error)
562 return error; 583 return error;
563 error = listxattr(path.dentry, list, size); 584 error = listxattr(path.dentry, list, size);
564 path_put(&path); 585 path_put(&path);
586 if (retry_estale(error, lookup_flags)) {
587 lookup_flags |= LOOKUP_REVAL;
588 goto retry;
589 }
565 return error; 590 return error;
566} 591}
567 592
@@ -570,12 +595,17 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
570{ 595{
571 struct path path; 596 struct path path;
572 ssize_t error; 597 ssize_t error;
573 598 unsigned int lookup_flags = 0;
574 error = user_lpath(pathname, &path); 599retry:
600 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
575 if (error) 601 if (error)
576 return error; 602 return error;
577 error = listxattr(path.dentry, list, size); 603 error = listxattr(path.dentry, list, size);
578 path_put(&path); 604 path_put(&path);
605 if (retry_estale(error, lookup_flags)) {
606 lookup_flags |= LOOKUP_REVAL;
607 goto retry;
608 }
579 return error; 609 return error;
580} 610}
581 611
@@ -615,8 +645,9 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
615{ 645{
616 struct path path; 646 struct path path;
617 int error; 647 int error;
618 648 unsigned int lookup_flags = LOOKUP_FOLLOW;
619 error = user_path(pathname, &path); 649retry:
650 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
620 if (error) 651 if (error)
621 return error; 652 return error;
622 error = mnt_want_write(path.mnt); 653 error = mnt_want_write(path.mnt);
@@ -625,6 +656,10 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
625 mnt_drop_write(path.mnt); 656 mnt_drop_write(path.mnt);
626 } 657 }
627 path_put(&path); 658 path_put(&path);
659 if (retry_estale(error, lookup_flags)) {
660 lookup_flags |= LOOKUP_REVAL;
661 goto retry;
662 }
628 return error; 663 return error;
629} 664}
630 665
@@ -633,8 +668,9 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
633{ 668{
634 struct path path; 669 struct path path;
635 int error; 670 int error;
636 671 unsigned int lookup_flags = 0;
637 error = user_lpath(pathname, &path); 672retry:
673 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
638 if (error) 674 if (error)
639 return error; 675 return error;
640 error = mnt_want_write(path.mnt); 676 error = mnt_want_write(path.mnt);
@@ -643,6 +679,10 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
643 mnt_drop_write(path.mnt); 679 mnt_drop_write(path.mnt);
644 } 680 }
645 path_put(&path); 681 path_put(&path);
682 if (retry_estale(error, lookup_flags)) {
683 lookup_flags |= LOOKUP_REVAL;
684 goto retry;
685 }
646 return error; 686 return error;
647} 687}
648 688
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 6100ec0fa1d4..5a7ffe54f5d5 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -2,6 +2,7 @@ config XFS_FS
2 tristate "XFS filesystem support" 2 tristate "XFS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 select EXPORTFS 4 select EXPORTFS
5 select LIBCRC32C
5 help 6 help
6 XFS is a high performance journaling filesystem which originated 7 XFS is a high performance journaling filesystem which originated
7 on the SGI IRIX platform. It is completely multi-threaded, can 8 on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d2bf974b1a2f..d02201df855b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -37,9 +37,8 @@ xfs-y += xfs_aops.o \
37 xfs_file.o \ 37 xfs_file.o \
38 xfs_filestream.o \ 38 xfs_filestream.o \
39 xfs_fsops.o \ 39 xfs_fsops.o \
40 xfs_fs_subr.o \
41 xfs_globals.o \ 40 xfs_globals.o \
42 xfs_iget.o \ 41 xfs_icache.o \
43 xfs_ioctl.o \ 42 xfs_ioctl.o \
44 xfs_iomap.o \ 43 xfs_iomap.o \
45 xfs_iops.o \ 44 xfs_iops.o \
@@ -47,7 +46,6 @@ xfs-y += xfs_aops.o \
47 xfs_message.o \ 46 xfs_message.o \
48 xfs_mru_cache.o \ 47 xfs_mru_cache.o \
49 xfs_super.o \ 48 xfs_super.o \
50 xfs_sync.o \
51 xfs_xattr.o \ 49 xfs_xattr.o \
52 xfs_rename.o \ 50 xfs_rename.o \
53 xfs_utils.o \ 51 xfs_utils.o \
diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h
index 4732d71262cc..104db0f3bed6 100644
--- a/fs/xfs/uuid.h
+++ b/fs/xfs/uuid.h
@@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid);
26extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); 26extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
27extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); 27extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
28 28
29static inline void
30uuid_copy(uuid_t *dst, uuid_t *src)
31{
32 memcpy(dst, src, sizeof(uuid_t));
33}
34
29#endif /* __XFS_SUPPORT_UUID_H__ */ 35#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 44d65c1533c0..f2aeedb6a579 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -108,6 +108,8 @@ typedef struct xfs_agf {
108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, 108extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); 109 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
110 110
111extern const struct xfs_buf_ops xfs_agf_buf_ops;
112
111/* 113/*
112 * Size of the unlinked inode hash table in the agi. 114 * Size of the unlinked inode hash table in the agi.
113 */ 115 */
@@ -161,6 +163,8 @@ typedef struct xfs_agi {
161extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, 163extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
162 xfs_agnumber_t agno, struct xfs_buf **bpp); 164 xfs_agnumber_t agno, struct xfs_buf **bpp);
163 165
166extern const struct xfs_buf_ops xfs_agi_buf_ops;
167
164/* 168/*
165 * The third a.g. block contains the a.g. freelist, an array 169 * The third a.g. block contains the a.g. freelist, an array
166 * of block pointers to blocks owned by the allocation btree code. 170 * of block pointers to blocks owned by the allocation btree code.
@@ -233,6 +237,7 @@ typedef struct xfs_perag {
233#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup 237#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
234 in xfs_inode_ag_iterator */ 238 in xfs_inode_ag_iterator */
235#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ 239#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
240#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
236 241
237#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) 242#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
238#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ 243#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 335206a9c698..393055fe3aef 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -430,6 +430,60 @@ xfs_alloc_fixup_trees(
430 return 0; 430 return 0;
431} 431}
432 432
433static void
434xfs_agfl_verify(
435 struct xfs_buf *bp)
436{
437#ifdef WHEN_CRCS_COME_ALONG
438 /*
439 * we cannot actually do any verification of the AGFL because mkfs does
440 * not initialise the AGFL to zero or NULL. Hence the only valid part of
441 * the AGFL is what the AGF says is active. We can't get to the AGF, so
442 * we can't verify just those entries are valid.
443 *
444 * This problem goes away when the CRC format change comes along as that
445 * requires the AGFL to be initialised by mkfs. At that point, we can
446 * verify the blocks in the agfl -active or not- lie within the bounds
447 * of the AG. Until then, just leave this check ifdef'd out.
448 */
449 struct xfs_mount *mp = bp->b_target->bt_mount;
450 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
451 int agfl_ok = 1;
452
453 int i;
454
455 for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
456 if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
457 be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
458 agfl_ok = 0;
459 }
460
461 if (!agfl_ok) {
462 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
463 xfs_buf_ioerror(bp, EFSCORRUPTED);
464 }
465#endif
466}
467
468static void
469xfs_agfl_write_verify(
470 struct xfs_buf *bp)
471{
472 xfs_agfl_verify(bp);
473}
474
475static void
476xfs_agfl_read_verify(
477 struct xfs_buf *bp)
478{
479 xfs_agfl_verify(bp);
480}
481
482const struct xfs_buf_ops xfs_agfl_buf_ops = {
483 .verify_read = xfs_agfl_read_verify,
484 .verify_write = xfs_agfl_write_verify,
485};
486
433/* 487/*
434 * Read in the allocation group free block array. 488 * Read in the allocation group free block array.
435 */ 489 */
@@ -447,7 +501,7 @@ xfs_alloc_read_agfl(
447 error = xfs_trans_read_buf( 501 error = xfs_trans_read_buf(
448 mp, tp, mp->m_ddev_targp, 502 mp, tp, mp->m_ddev_targp,
449 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), 503 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
450 XFS_FSS_TO_BB(mp, 1), 0, &bp); 504 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
451 if (error) 505 if (error)
452 return error; 506 return error;
453 ASSERT(!xfs_buf_geterror(bp)); 507 ASSERT(!xfs_buf_geterror(bp));
@@ -2091,6 +2145,63 @@ xfs_alloc_put_freelist(
2091 return 0; 2145 return 0;
2092} 2146}
2093 2147
2148static void
2149xfs_agf_verify(
2150 struct xfs_buf *bp)
2151 {
2152 struct xfs_mount *mp = bp->b_target->bt_mount;
2153 struct xfs_agf *agf;
2154 int agf_ok;
2155
2156 agf = XFS_BUF_TO_AGF(bp);
2157
2158 agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
2159 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2160 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2161 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2162 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2163 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
2164
2165 /*
2166 * during growfs operations, the perag is not fully initialised,
2167 * so we can't use it for any useful checking. growfs ensures we can't
2168 * use it by using uncached buffers that don't have the perag attached
2169 * so we can detect and avoid this problem.
2170 */
2171 if (bp->b_pag)
2172 agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
2173 bp->b_pag->pag_agno;
2174
2175 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
2176 agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
2177 be32_to_cpu(agf->agf_length);
2178
2179 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2180 XFS_RANDOM_ALLOC_READ_AGF))) {
2181 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
2182 xfs_buf_ioerror(bp, EFSCORRUPTED);
2183 }
2184}
2185
2186static void
2187xfs_agf_read_verify(
2188 struct xfs_buf *bp)
2189{
2190 xfs_agf_verify(bp);
2191}
2192
2193static void
2194xfs_agf_write_verify(
2195 struct xfs_buf *bp)
2196{
2197 xfs_agf_verify(bp);
2198}
2199
2200const struct xfs_buf_ops xfs_agf_buf_ops = {
2201 .verify_read = xfs_agf_read_verify,
2202 .verify_write = xfs_agf_write_verify,
2203};
2204
2094/* 2205/*
2095 * Read in the allocation group header (free/alloc section). 2206 * Read in the allocation group header (free/alloc section).
2096 */ 2207 */
@@ -2102,44 +2213,19 @@ xfs_read_agf(
2102 int flags, /* XFS_BUF_ */ 2213 int flags, /* XFS_BUF_ */
2103 struct xfs_buf **bpp) /* buffer for the ag freelist header */ 2214 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2104{ 2215{
2105 struct xfs_agf *agf; /* ag freelist header */
2106 int agf_ok; /* set if agf is consistent */
2107 int error; 2216 int error;
2108 2217
2109 ASSERT(agno != NULLAGNUMBER); 2218 ASSERT(agno != NULLAGNUMBER);
2110 error = xfs_trans_read_buf( 2219 error = xfs_trans_read_buf(
2111 mp, tp, mp->m_ddev_targp, 2220 mp, tp, mp->m_ddev_targp,
2112 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 2221 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2113 XFS_FSS_TO_BB(mp, 1), flags, bpp); 2222 XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
2114 if (error) 2223 if (error)
2115 return error; 2224 return error;
2116 if (!*bpp) 2225 if (!*bpp)
2117 return 0; 2226 return 0;
2118 2227
2119 ASSERT(!(*bpp)->b_error); 2228 ASSERT(!(*bpp)->b_error);
2120 agf = XFS_BUF_TO_AGF(*bpp);
2121
2122 /*
2123 * Validate the magic number of the agf block.
2124 */
2125 agf_ok =
2126 agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
2127 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2128 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2129 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2130 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2131 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
2132 be32_to_cpu(agf->agf_seqno) == agno;
2133 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
2134 agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
2135 be32_to_cpu(agf->agf_length);
2136 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2137 XFS_RANDOM_ALLOC_READ_AGF))) {
2138 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
2139 XFS_ERRLEVEL_LOW, mp, agf);
2140 xfs_trans_brelse(tp, *bpp);
2141 return XFS_ERROR(EFSCORRUPTED);
2142 }
2143 xfs_buf_set_ref(*bpp, XFS_AGF_REF); 2229 xfs_buf_set_ref(*bpp, XFS_AGF_REF);
2144 return 0; 2230 return 0;
2145} 2231}
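
The pattern above, repeated across the xfs files in this series, moves validation out of xfs_read_agf() into a const xfs_buf_ops table attached when the buffer is read, so the same checks run on both the read and the write path instead of being open-coded after each xfs_trans_read_buf() call. A compilable sketch of the ops-table shape, with simplified stand-in types:

#include <stdint.h>

#define EFSCORRUPTED	117
#define AGF_MAGIC	0x58414746u	/* "XAGF" */

struct buf;
struct buf_ops {
	void (*verify_read)(struct buf *bp);
	void (*verify_write)(struct buf *bp);
};
struct buf {
	uint32_t magic;
	int b_error;
	const struct buf_ops *b_ops;
};

/* One verifier shared by both directions: corruption is flagged on the
 * buffer rather than returned, as xfs_buf_ioerror() does. */
static void agf_verify(struct buf *bp)
{
	if (bp->magic != AGF_MAGIC)
		bp->b_error = EFSCORRUPTED;
}

static void agf_read_verify(struct buf *bp)  { agf_verify(bp); }
static void agf_write_verify(struct buf *bp) { agf_verify(bp); }

const struct buf_ops agf_buf_ops = {
	.verify_read	= agf_read_verify,
	.verify_write	= agf_write_verify,
};

A read routine then takes &agf_buf_ops once, and every subsequent I/O on that buffer re-runs the check.
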
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index feacb061bab7..99d0a6101558 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -231,4 +231,7 @@ xfs_alloc_get_rec(
231 xfs_extlen_t *len, /* output: length of extent */ 231 xfs_extlen_t *len, /* output: length of extent */
232 int *stat); /* output: success/failure */ 232 int *stat); /* output: success/failure */
233 233
234extern const struct xfs_buf_ops xfs_agf_buf_ops;
235extern const struct xfs_buf_ops xfs_agfl_buf_ops;
236
234#endif /* __XFS_ALLOC_H__ */ 237#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index f7876c6d6165..b1ddef6b2689 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -272,6 +272,82 @@ xfs_allocbt_key_diff(
272 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; 272 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
273} 273}
274 274
275static void
276xfs_allocbt_verify(
277 struct xfs_buf *bp)
278{
279 struct xfs_mount *mp = bp->b_target->bt_mount;
280 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
281 struct xfs_perag *pag = bp->b_pag;
282 unsigned int level;
283 int sblock_ok; /* block passes checks */
284
285 /*
286 * magic number and level verification
287 *
288 * During growfs operations, we can't verify the exact level as the
289 * perag is not fully initialised and hence not attached to the buffer.
290 * In this case, check against the maximum tree depth.
291 */
292 level = be16_to_cpu(block->bb_level);
293 switch (block->bb_magic) {
294 case cpu_to_be32(XFS_ABTB_MAGIC):
295 if (pag)
296 sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
297 else
298 sblock_ok = level < mp->m_ag_maxlevels;
299 break;
300 case cpu_to_be32(XFS_ABTC_MAGIC):
301 if (pag)
302 sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
303 else
304 sblock_ok = level < mp->m_ag_maxlevels;
305 break;
306 default:
307 sblock_ok = 0;
308 break;
309 }
310
311 /* numrecs verification */
312 sblock_ok = sblock_ok &&
313 be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
314
315 /* sibling pointer verification */
316 sblock_ok = sblock_ok &&
317 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
318 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
319 block->bb_u.s.bb_leftsib &&
320 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
321 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
322 block->bb_u.s.bb_rightsib;
323
324 if (!sblock_ok) {
325 trace_xfs_btree_corrupt(bp, _RET_IP_);
326 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
327 xfs_buf_ioerror(bp, EFSCORRUPTED);
328 }
329}
330
331static void
332xfs_allocbt_read_verify(
333 struct xfs_buf *bp)
334{
335 xfs_allocbt_verify(bp);
336}
337
338static void
339xfs_allocbt_write_verify(
340 struct xfs_buf *bp)
341{
342 xfs_allocbt_verify(bp);
343}
344
345const struct xfs_buf_ops xfs_allocbt_buf_ops = {
346 .verify_read = xfs_allocbt_read_verify,
347 .verify_write = xfs_allocbt_write_verify,
348};
349
350
275#ifdef DEBUG 351#ifdef DEBUG
276STATIC int 352STATIC int
277xfs_allocbt_keys_inorder( 353xfs_allocbt_keys_inorder(
@@ -327,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
327 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, 403 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
328 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, 404 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
329 .key_diff = xfs_allocbt_key_diff, 405 .key_diff = xfs_allocbt_key_diff,
406 .buf_ops = &xfs_allocbt_buf_ops,
330#ifdef DEBUG 407#ifdef DEBUG
331 .keys_inorder = xfs_allocbt_keys_inorder, 408 .keys_inorder = xfs_allocbt_keys_inorder,
332 .recs_inorder = xfs_allocbt_recs_inorder, 409 .recs_inorder = xfs_allocbt_recs_inorder,
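
xfs_allocbt_verify() accumulates its independent sanity checks (magic and level, numrecs bound, sibling pointers within the AG) into one boolean and reports corruption once. A condensed sketch of that accumulate-then-report style, with simplified types and bounds that would come from the mount in the real code:

#include <stdint.h>

struct btblock {
	uint32_t magic;
	uint16_t level, numrecs;
	uint32_t leftsib, rightsib;
};

#define BT_MAGIC	0x41425442u	/* "ABTB" */
#define NULLBLOCK	0xFFFFFFFFu

/* Nonzero when every independent check passes; the caller reports
 * corruption exactly once instead of per-check. */
static int block_ok(const struct btblock *b, unsigned int max_level,
		    unsigned int max_recs, uint32_t ag_blocks)
{
	int ok = b->magic == BT_MAGIC && b->level < max_level;

	ok = ok && b->numrecs <= max_recs;
	ok = ok && (b->leftsib == NULLBLOCK || b->leftsib < ag_blocks);
	ok = ok && (b->rightsib == NULLBLOCK || b->rightsib < ag_blocks);
	return ok;
}
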
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 359fb86ed876..7e89a2b429dd 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
93 xfs_agnumber_t, xfs_btnum_t); 93 xfs_agnumber_t, xfs_btnum_t);
94extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); 94extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
95 95
96extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
97
96#endif /* __XFS_ALLOC_BTREE_H__ */ 98#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e57e2daa357c..5f707e537171 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -86,11 +86,11 @@ xfs_destroy_ioend(
86 } 86 }
87 87
88 if (ioend->io_iocb) { 88 if (ioend->io_iocb) {
89 inode_dio_done(ioend->io_inode);
89 if (ioend->io_isasync) { 90 if (ioend->io_isasync) {
90 aio_complete(ioend->io_iocb, ioend->io_error ? 91 aio_complete(ioend->io_iocb, ioend->io_error ?
91 ioend->io_error : ioend->io_result, 0); 92 ioend->io_error : ioend->io_result, 0);
92 } 93 }
93 inode_dio_done(ioend->io_inode);
94 } 94 }
95 95
96 mempool_free(ioend, xfs_ioend_pool); 96 mempool_free(ioend, xfs_ioend_pool);
@@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc(
124 ioend->io_append_trans = tp; 124 ioend->io_append_trans = tp;
125 125
126 /* 126 /*
127 * We will pass freeze protection with a transaction. So tell lockdep 127 * We may pass freeze protection with a transaction. So tell lockdep
128 * we released it. 128 * we released it.
129 */ 129 */
130 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], 130 rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
@@ -149,11 +149,13 @@ xfs_setfilesize(
149 xfs_fsize_t isize; 149 xfs_fsize_t isize;
150 150
151 /* 151 /*
152 * The transaction was allocated in the I/O submission thread, 152 * The transaction may have been allocated in the I/O submission thread,
153 * thus we need to mark ourselves as being in a transaction 153 * thus we need to mark ourselves as being in a transaction manually.
154 * manually. 154 * Similarly for freeze protection.
155 */ 155 */
156 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); 156 current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
157 rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
158 0, 1, _THIS_IP_);
157 159
158 xfs_ilock(ip, XFS_ILOCK_EXCL); 160 xfs_ilock(ip, XFS_ILOCK_EXCL);
159 isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); 161 isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -187,7 +189,8 @@ xfs_finish_ioend(
187 189
188 if (ioend->io_type == XFS_IO_UNWRITTEN) 190 if (ioend->io_type == XFS_IO_UNWRITTEN)
189 queue_work(mp->m_unwritten_workqueue, &ioend->io_work); 191 queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
190 else if (ioend->io_append_trans) 192 else if (ioend->io_append_trans ||
193 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
191 queue_work(mp->m_data_workqueue, &ioend->io_work); 194 queue_work(mp->m_data_workqueue, &ioend->io_work);
192 else 195 else
193 xfs_destroy_ioend(ioend); 196 xfs_destroy_ioend(ioend);
@@ -205,15 +208,6 @@ xfs_end_io(
205 struct xfs_inode *ip = XFS_I(ioend->io_inode); 208 struct xfs_inode *ip = XFS_I(ioend->io_inode);
206 int error = 0; 209 int error = 0;
207 210
208 if (ioend->io_append_trans) {
209 /*
210 * We've got freeze protection passed with the transaction.
211 * Tell lockdep about it.
212 */
213 rwsem_acquire_read(
214 &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
215 0, 1, _THIS_IP_);
216 }
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 211 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
218 ioend->io_error = -EIO; 212 ioend->io_error = -EIO;
219 goto done; 213 goto done;
@@ -226,35 +220,31 @@ xfs_end_io(
226 * range to normal written extents after the data I/O has finished. 220 * range to normal written extents after the data I/O has finished.
227 */ 221 */
228 if (ioend->io_type == XFS_IO_UNWRITTEN) { 222 if (ioend->io_type == XFS_IO_UNWRITTEN) {
223 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
224 ioend->io_size);
225 } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
229 /* 226 /*
230 * For buffered I/O we never preallocate a transaction when 227 * For direct I/O we do not know if we need to allocate blocks
231 * doing the unwritten extent conversion, but for direct I/O 228 * or not so we can't preallocate an append transaction as that
232 * we do not know if we are converting an unwritten extent 229 * results in nested reservations and log space deadlocks. Hence
233 * or not at the point where we preallocate the transaction. 230 * allocate the transaction here. While this is sub-optimal and
231 * can block IO completion for some time, we're stuck with doing
232 * it this way until we can pass the ioend to the direct IO
233 * allocation callbacks and avoid nesting that way.
234 */ 234 */
235 if (ioend->io_append_trans) { 235 error = xfs_setfilesize_trans_alloc(ioend);
236 ASSERT(ioend->io_isdirect); 236 if (error)
237
238 current_set_flags_nested(
239 &ioend->io_append_trans->t_pflags, PF_FSTRANS);
240 xfs_trans_cancel(ioend->io_append_trans, 0);
241 }
242
243 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
244 ioend->io_size);
245 if (error) {
246 ioend->io_error = -error;
247 goto done; 237 goto done;
248 } 238 error = xfs_setfilesize(ioend);
249 } else if (ioend->io_append_trans) { 239 } else if (ioend->io_append_trans) {
250 error = xfs_setfilesize(ioend); 240 error = xfs_setfilesize(ioend);
251 if (error)
252 ioend->io_error = -error;
253 } else { 241 } else {
254 ASSERT(!xfs_ioend_is_append(ioend)); 242 ASSERT(!xfs_ioend_is_append(ioend));
255 } 243 }
256 244
257done: 245done:
246 if (error)
247 ioend->io_error = -error;
258 xfs_destroy_ioend(ioend); 248 xfs_destroy_ioend(ioend);
259} 249}
260 250
@@ -1432,25 +1422,21 @@ xfs_vm_direct_IO(
1432 size_t size = iov_length(iov, nr_segs); 1422 size_t size = iov_length(iov, nr_segs);
1433 1423
1434 /* 1424 /*
1435 * We need to preallocate a transaction for a size update 1425 * We cannot preallocate a size update transaction here as we
1436 * here. In the case that this write both updates the size 1426 * don't know whether allocation is necessary or not. Hence we
1437 * and converts at least one unwritten extent we will cancel 1427 * can only tell IO completion that one is necessary if we are
1438 * the still clean transaction after the I/O has finished. 1428 * not doing unwritten extent conversion.
1439 */ 1429 */
1440 iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); 1430 iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
1441 if (offset + size > XFS_I(inode)->i_d.di_size) { 1431 if (offset + size > XFS_I(inode)->i_d.di_size)
1442 ret = xfs_setfilesize_trans_alloc(ioend);
1443 if (ret)
1444 goto out_destroy_ioend;
1445 ioend->io_isdirect = 1; 1432 ioend->io_isdirect = 1;
1446 }
1447 1433
1448 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1434 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1449 offset, nr_segs, 1435 offset, nr_segs,
1450 xfs_get_blocks_direct, 1436 xfs_get_blocks_direct,
1451 xfs_end_io_direct_write, NULL, 0); 1437 xfs_end_io_direct_write, NULL, 0);
1452 if (ret != -EIOCBQUEUED && iocb->private) 1438 if (ret != -EIOCBQUEUED && iocb->private)
1453 goto out_trans_cancel; 1439 goto out_destroy_ioend;
1454 } else { 1440 } else {
1455 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, 1441 ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
1456 offset, nr_segs, 1442 offset, nr_segs,
@@ -1460,15 +1446,6 @@ xfs_vm_direct_IO(
1460 1446
1461 return ret; 1447 return ret;
1462 1448
1463out_trans_cancel:
1464 if (ioend->io_append_trans) {
1465 current_set_flags_nested(&ioend->io_append_trans->t_pflags,
1466 PF_FSTRANS);
1467 rwsem_acquire_read(
1468 &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
1469 0, 1, _THIS_IP_);
1470 xfs_trans_cancel(ioend->io_append_trans, 0);
1471 }
1472out_destroy_ioend: 1449out_destroy_ioend:
1473 xfs_destroy_ioend(ioend); 1450 xfs_destroy_ioend(ioend);
1474 return ret; 1451 return ret;
@@ -1641,7 +1618,7 @@ xfs_vm_bmap(
1641 1618
1642 trace_xfs_vm_bmap(XFS_I(inode)); 1619 trace_xfs_vm_bmap(XFS_I(inode));
1643 xfs_ilock(ip, XFS_IOLOCK_SHARED); 1620 xfs_ilock(ip, XFS_IOLOCK_SHARED);
1644 xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); 1621 filemap_write_and_wait(mapping);
1645 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 1622 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1646 return generic_block_bmap(mapping, block, xfs_get_blocks); 1623 return generic_block_bmap(mapping, block, xfs_get_blocks);
1647} 1624}
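
The aops rework above defers the size-update transaction for direct I/O from submission to completion: at submission time it is unknown whether unwritten-extent conversion will happen, and preallocating would nest log reservations. Submission now only marks the ioend (io_isdirect), and completion allocates the transaction if the write actually appended. A sketch of that decide-at-completion shape, with every name a simplified stand-in:

struct ioend {
	int		isdirect;
	int		error;
	long long	offset, size, isize;
};

extern int alloc_size_update_trans(struct ioend *io);	/* may sleep */
extern int commit_size_update(struct ioend *io);
extern void destroy_ioend(struct ioend *io);

/* Did this I/O extend the file past the on-disk size? */
static int io_is_append(struct ioend *io)
{
	return io->offset + io->size > io->isize;
}

/* Submission: record intent only, allocate nothing. */
static void dio_submit(struct ioend *io)
{
	io->isdirect = 1;
}

/* Completion (runs from a workqueue): allocate the transaction late,
 * and only when the write really appended. */
static void dio_complete(struct ioend *io)
{
	if (io->isdirect && io_is_append(io)) {
		io->error = alloc_size_update_trans(io);
		if (!io->error)
			io->error = commit_size_update(io);
	}
	destroy_ioend(io);
}
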
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 0ca1f0be62d2..aaf472532b3c 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
903 */ 903 */
904 dp = args->dp; 904 dp = args->dp;
905 args->blkno = 0; 905 args->blkno = 0;
906 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 906 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
907 XFS_ATTR_FORK);
908 if (error) 907 if (error)
909 return(error); 908 return error;
910 ASSERT(bp != NULL);
911 909
912 /* 910 /*
913 * Look up the given attribute in the leaf block. Figure out if 911 * Look up the given attribute in the leaf block. Figure out if
@@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
1031 * Read in the block containing the "old" attr, then 1029 * Read in the block containing the "old" attr, then
1032 * remove the "old" attr from that block (neat, huh!) 1030 * remove the "old" attr from that block (neat, huh!)
1033 */ 1031 */
1034 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, 1032 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
1035 &bp, XFS_ATTR_FORK); 1033 -1, &bp);
1036 if (error) 1034 if (error)
1037 return(error); 1035 return error;
1038 ASSERT(bp != NULL); 1036
1039 (void)xfs_attr_leaf_remove(bp, args); 1037 xfs_attr_leaf_remove(bp, args);
1040 1038
1041 /* 1039 /*
1042 * If the result is small enough, shrink it all into the inode. 1040 * If the result is small enough, shrink it all into the inode.
@@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
1100 */ 1098 */
1101 dp = args->dp; 1099 dp = args->dp;
1102 args->blkno = 0; 1100 args->blkno = 0;
1103 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 1101 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
1104 XFS_ATTR_FORK); 1102 if (error)
1105 if (error) { 1103 return error;
1106 return(error);
1107 }
1108 1104
1109 ASSERT(bp != NULL);
1110 error = xfs_attr_leaf_lookup_int(bp, args); 1105 error = xfs_attr_leaf_lookup_int(bp, args);
1111 if (error == ENOATTR) { 1106 if (error == ENOATTR) {
1112 xfs_trans_brelse(args->trans, bp); 1107 xfs_trans_brelse(args->trans, bp);
1113 return(error); 1108 return(error);
1114 } 1109 }
1115 1110
1116 (void)xfs_attr_leaf_remove(bp, args); 1111 xfs_attr_leaf_remove(bp, args);
1117 1112
1118 /* 1113 /*
1119 * If the result is small enough, shrink it all into the inode. 1114 * If the result is small enough, shrink it all into the inode.
@@ -1155,12 +1150,12 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
1155 struct xfs_buf *bp; 1150 struct xfs_buf *bp;
1156 int error; 1151 int error;
1157 1152
1153 trace_xfs_attr_leaf_get(args);
1154
1158 args->blkno = 0; 1155 args->blkno = 0;
1159 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 1156 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
1160 XFS_ATTR_FORK);
1161 if (error) 1157 if (error)
1162 return(error); 1158 return error;
1163 ASSERT(bp != NULL);
1164 1159
1165 error = xfs_attr_leaf_lookup_int(bp, args); 1160 error = xfs_attr_leaf_lookup_int(bp, args);
1166 if (error != EEXIST) { 1161 if (error != EEXIST) {
@@ -1181,22 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
1181STATIC int 1176STATIC int
1182xfs_attr_leaf_list(xfs_attr_list_context_t *context) 1177xfs_attr_leaf_list(xfs_attr_list_context_t *context)
1183{ 1178{
1184 xfs_attr_leafblock_t *leaf;
1185 int error; 1179 int error;
1186 struct xfs_buf *bp; 1180 struct xfs_buf *bp;
1187 1181
1182 trace_xfs_attr_leaf_list(context);
1183
1188 context->cursor->blkno = 0; 1184 context->cursor->blkno = 0;
1189 error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); 1185 error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp);
1190 if (error) 1186 if (error)
1191 return XFS_ERROR(error); 1187 return XFS_ERROR(error);
1192 ASSERT(bp != NULL);
1193 leaf = bp->b_addr;
1194 if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
1195 XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
1196 context->dp->i_mount, leaf);
1197 xfs_trans_brelse(NULL, bp);
1198 return XFS_ERROR(EFSCORRUPTED);
1199 }
1200 1188
1201 error = xfs_attr_leaf_list_int(bp, context); 1189 error = xfs_attr_leaf_list_int(bp, context);
1202 xfs_trans_brelse(NULL, bp); 1190 xfs_trans_brelse(NULL, bp);
@@ -1600,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1600 ASSERT(state->path.blk[0].bp); 1588 ASSERT(state->path.blk[0].bp);
1601 state->path.blk[0].bp = NULL; 1589 state->path.blk[0].bp = NULL;
1602 1590
1603 error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, 1591 error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp);
1604 XFS_ATTR_FORK);
1605 if (error) 1592 if (error)
1606 goto out; 1593 goto out;
1607 ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) ==
1608 cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1609 1594
1610 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { 1595 if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
1611 xfs_bmap_init(args->flist, args->firstblock); 1596 xfs_bmap_init(args->flist, args->firstblock);
@@ -1653,6 +1638,8 @@ xfs_attr_fillstate(xfs_da_state_t *state)
1653 xfs_da_state_blk_t *blk; 1638 xfs_da_state_blk_t *blk;
1654 int level; 1639 int level;
1655 1640
1641 trace_xfs_attr_fillstate(state->args);
1642
1656 /* 1643 /*
1657 * Roll down the "path" in the state structure, storing the on-disk 1644 * Roll down the "path" in the state structure, storing the on-disk
1658 * block number for those buffers in the "path". 1645 * block number for those buffers in the "path".
@@ -1699,6 +1686,8 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1699 xfs_da_state_blk_t *blk; 1686 xfs_da_state_blk_t *blk;
1700 int level, error; 1687 int level, error;
1701 1688
1689 trace_xfs_attr_refillstate(state->args);
1690
1702 /* 1691 /*
1703 * Roll down the "path" in the state structure, storing the on-disk 1692 * Roll down the "path" in the state structure, storing the on-disk
1704 * block number for those buffers in the "path". 1693 * block number for those buffers in the "path".
@@ -1707,7 +1696,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1707 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); 1696 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1708 for (blk = path->blk, level = 0; level < path->active; blk++, level++) { 1697 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1709 if (blk->disk_blkno) { 1698 if (blk->disk_blkno) {
1710 error = xfs_da_read_buf(state->args->trans, 1699 error = xfs_da_node_read(state->args->trans,
1711 state->args->dp, 1700 state->args->dp,
1712 blk->blkno, blk->disk_blkno, 1701 blk->blkno, blk->disk_blkno,
1713 &blk->bp, XFS_ATTR_FORK); 1702 &blk->bp, XFS_ATTR_FORK);
@@ -1726,7 +1715,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1726 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); 1715 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1727 for (blk = path->blk, level = 0; level < path->active; blk++, level++) { 1716 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1728 if (blk->disk_blkno) { 1717 if (blk->disk_blkno) {
1729 error = xfs_da_read_buf(state->args->trans, 1718 error = xfs_da_node_read(state->args->trans,
1730 state->args->dp, 1719 state->args->dp,
1731 blk->blkno, blk->disk_blkno, 1720 blk->blkno, blk->disk_blkno,
1732 &blk->bp, XFS_ATTR_FORK); 1721 &blk->bp, XFS_ATTR_FORK);
@@ -1755,6 +1744,8 @@ xfs_attr_node_get(xfs_da_args_t *args)
1755 int error, retval; 1744 int error, retval;
1756 int i; 1745 int i;
1757 1746
1747 trace_xfs_attr_node_get(args);
1748
1758 state = xfs_da_state_alloc(); 1749 state = xfs_da_state_alloc();
1759 state->args = args; 1750 state->args = args;
1760 state->mp = args->dp->i_mount; 1751 state->mp = args->dp->i_mount;
@@ -1804,6 +1795,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1804 int error, i; 1795 int error, i;
1805 struct xfs_buf *bp; 1796 struct xfs_buf *bp;
1806 1797
1798 trace_xfs_attr_node_list(context);
1799
1807 cursor = context->cursor; 1800 cursor = context->cursor;
1808 cursor->initted = 1; 1801 cursor->initted = 1;
1809 1802
@@ -1814,7 +1807,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1814 */ 1807 */
1815 bp = NULL; 1808 bp = NULL;
1816 if (cursor->blkno > 0) { 1809 if (cursor->blkno > 0) {
1817 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, 1810 error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1,
1818 &bp, XFS_ATTR_FORK); 1811 &bp, XFS_ATTR_FORK);
1819 if ((error != 0) && (error != EFSCORRUPTED)) 1812 if ((error != 0) && (error != EFSCORRUPTED))
1820 return(error); 1813 return(error);
@@ -1856,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1856 if (bp == NULL) { 1849 if (bp == NULL) {
1857 cursor->blkno = 0; 1850 cursor->blkno = 0;
1858 for (;;) { 1851 for (;;) {
1859 error = xfs_da_read_buf(NULL, context->dp, 1852 error = xfs_da_node_read(NULL, context->dp,
1860 cursor->blkno, -1, &bp, 1853 cursor->blkno, -1, &bp,
1861 XFS_ATTR_FORK); 1854 XFS_ATTR_FORK);
1862 if (error) 1855 if (error)
1863 return(error); 1856 return(error);
1864 if (unlikely(bp == NULL)) {
1865 XFS_ERROR_REPORT("xfs_attr_node_list(2)",
1866 XFS_ERRLEVEL_LOW,
1867 context->dp->i_mount);
1868 return(XFS_ERROR(EFSCORRUPTED));
1869 }
1870 node = bp->b_addr; 1857 node = bp->b_addr;
1871 if (node->hdr.info.magic == 1858 if (node->hdr.info.magic ==
1872 cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) 1859 cpu_to_be16(XFS_ATTR_LEAF_MAGIC))
@@ -1907,14 +1894,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1907 */ 1894 */
1908 for (;;) { 1895 for (;;) {
1909 leaf = bp->b_addr; 1896 leaf = bp->b_addr;
1910 if (unlikely(leaf->hdr.info.magic !=
1911 cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
1912 XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
1913 XFS_ERRLEVEL_LOW,
1914 context->dp->i_mount, leaf);
1915 xfs_trans_brelse(NULL, bp);
1916 return(XFS_ERROR(EFSCORRUPTED));
1917 }
1918 error = xfs_attr_leaf_list_int(bp, context); 1897 error = xfs_attr_leaf_list_int(bp, context);
1919 if (error) { 1898 if (error) {
1920 xfs_trans_brelse(NULL, bp); 1899 xfs_trans_brelse(NULL, bp);
@@ -1924,16 +1903,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
1924 break; 1903 break;
1925 cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); 1904 cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
1926 xfs_trans_brelse(NULL, bp); 1905 xfs_trans_brelse(NULL, bp);
1927 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, 1906 error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1,
1928 &bp, XFS_ATTR_FORK); 1907 &bp);
1929 if (error) 1908 if (error)
1930 return(error); 1909 return error;
1931 if (unlikely((bp == NULL))) {
1932 XFS_ERROR_REPORT("xfs_attr_node_list(5)",
1933 XFS_ERRLEVEL_LOW,
1934 context->dp->i_mount);
1935 return(XFS_ERROR(EFSCORRUPTED));
1936 }
1937 } 1910 }
1938 xfs_trans_brelse(NULL, bp); 1911 xfs_trans_brelse(NULL, bp);
1939 return(0); 1912 return(0);
@@ -1959,6 +1932,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1959 int nmap, error, tmp, valuelen, blkcnt, i; 1932 int nmap, error, tmp, valuelen, blkcnt, i;
1960 xfs_dablk_t lblkno; 1933 xfs_dablk_t lblkno;
1961 1934
1935 trace_xfs_attr_rmtval_get(args);
1936
1962 ASSERT(!(args->flags & ATTR_KERNOVAL)); 1937 ASSERT(!(args->flags & ATTR_KERNOVAL));
1963 1938
1964 mp = args->dp->i_mount; 1939 mp = args->dp->i_mount;
@@ -1980,7 +1955,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
1980 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); 1955 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
1981 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); 1956 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
1982 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 1957 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1983 dblkno, blkcnt, 0, &bp); 1958 dblkno, blkcnt, 0, &bp, NULL);
1984 if (error) 1959 if (error)
1985 return(error); 1960 return(error);
1986 1961
@@ -2014,6 +1989,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
2014 xfs_dablk_t lblkno; 1989 xfs_dablk_t lblkno;
2015 int blkcnt, valuelen, nmap, error, tmp, committed; 1990 int blkcnt, valuelen, nmap, error, tmp, committed;
2016 1991
1992 trace_xfs_attr_rmtval_set(args);
1993
2017 dp = args->dp; 1994 dp = args->dp;
2018 mp = dp->i_mount; 1995 mp = dp->i_mount;
2019 src = args->value; 1996 src = args->value;
@@ -2143,6 +2120,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
2143 xfs_dablk_t lblkno; 2120 xfs_dablk_t lblkno;
2144 int valuelen, blkcnt, nmap, error, done, committed; 2121 int valuelen, blkcnt, nmap, error, done, committed;
2145 2122
2123 trace_xfs_attr_rmtval_remove(args);
2124
2146 mp = args->dp->i_mount; 2125 mp = args->dp->i_mount;
2147 2126
2148 /* 2127 /*
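
Each call site above trades an xfs_da_read_buf(..., XFS_ATTR_FORK) plus its trailing ASSERT and magic-number check for xfs_attr_leaf_read(), the thin wrapper defined in the xfs_attr_leaf.c hunk below: the fork argument is fixed and the leaf verifier attached, so the magic check runs once per I/O instead of once per caller. The shape of such a wrapper in sketch form, with invented generic names:

struct trans;
struct inode;
struct buf;
struct buf_ops;

extern const struct buf_ops leaf_ops;
extern int generic_read(struct trans *tp, struct inode *dp, long bno,
			long mappedbno, struct buf **bpp, int whichfork,
			const struct buf_ops *ops);

#define ATTR_FORK	1

/* Fixes the fork and the verifier in one place; callers just check the
 * return value and can drop their per-call magic-number ASSERTs. */
static int attr_leaf_read(struct trans *tp, struct inode *dp, long bno,
			  long mappedbno, struct buf **bpp)
{
	return generic_read(tp, dp, bno, mappedbno, bpp, ATTR_FORK,
			    &leaf_ops);
}
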
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 70eec1829776..ee24993c7d12 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
57 struct xfs_buf **bpp); 57 struct xfs_buf **bpp);
58STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, 58STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer,
59 xfs_da_args_t *args, int freemap_index); 59 xfs_da_args_t *args, int freemap_index);
60STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer); 60STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args,
61 struct xfs_buf *leaf_buffer);
61STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, 62STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
62 xfs_da_state_blk_t *blk1, 63 xfs_da_state_blk_t *blk1,
63 xfs_da_state_blk_t *blk2); 64 xfs_da_state_blk_t *blk2);
@@ -87,6 +88,52 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
87 xfs_mount_t *mp); 88 xfs_mount_t *mp);
88STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); 89STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
89 90
91static void
92xfs_attr_leaf_verify(
93 struct xfs_buf *bp)
94{
95 struct xfs_mount *mp = bp->b_target->bt_mount;
96 struct xfs_attr_leaf_hdr *hdr = bp->b_addr;
97 int block_ok = 0;
98
99 block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
100 if (!block_ok) {
101 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
102 xfs_buf_ioerror(bp, EFSCORRUPTED);
103 }
104}
105
106static void
107xfs_attr_leaf_read_verify(
108 struct xfs_buf *bp)
109{
110 xfs_attr_leaf_verify(bp);
111}
112
113static void
114xfs_attr_leaf_write_verify(
115 struct xfs_buf *bp)
116{
117 xfs_attr_leaf_verify(bp);
118}
119
120const struct xfs_buf_ops xfs_attr_leaf_buf_ops = {
121 .verify_read = xfs_attr_leaf_read_verify,
122 .verify_write = xfs_attr_leaf_write_verify,
123};
124
125int
126xfs_attr_leaf_read(
127 struct xfs_trans *tp,
128 struct xfs_inode *dp,
129 xfs_dablk_t bno,
130 xfs_daddr_t mappedbno,
131 struct xfs_buf **bpp)
132{
133 return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
134 XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
135}
136
90/*======================================================================== 137/*========================================================================
91 * Namespace helper routines 138 * Namespace helper routines
92 *========================================================================*/ 139 *========================================================================*/
@@ -869,17 +916,16 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
869 error = xfs_da_grow_inode(args, &blkno); 916 error = xfs_da_grow_inode(args, &blkno);
870 if (error) 917 if (error)
871 goto out; 918 goto out;
872 error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, 919 error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1);
873 XFS_ATTR_FORK);
874 if (error) 920 if (error)
875 goto out; 921 goto out;
876 ASSERT(bp1 != NULL); 922
877 bp2 = NULL; 923 bp2 = NULL;
878 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, 924 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
879 XFS_ATTR_FORK); 925 XFS_ATTR_FORK);
880 if (error) 926 if (error)
881 goto out; 927 goto out;
882 ASSERT(bp2 != NULL); 928 bp2->b_ops = bp1->b_ops;
883 memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); 929 memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
884 bp1 = NULL; 930 bp1 = NULL;
885 xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); 931 xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
@@ -933,7 +979,7 @@ xfs_attr_leaf_create(
933 XFS_ATTR_FORK); 979 XFS_ATTR_FORK);
934 if (error) 980 if (error)
935 return(error); 981 return(error);
936 ASSERT(bp != NULL); 982 bp->b_ops = &xfs_attr_leaf_buf_ops;
937 leaf = bp->b_addr; 983 leaf = bp->b_addr;
938 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); 984 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
939 hdr = &leaf->hdr; 985 hdr = &leaf->hdr;
@@ -1071,7 +1117,7 @@ xfs_attr_leaf_add(
1071 * Compact the entries to coalesce free space. 1117 * Compact the entries to coalesce free space.
1072 * This may change the hdr->count via dropping INCOMPLETE entries. 1118 * This may change the hdr->count via dropping INCOMPLETE entries.
1073 */ 1119 */
1074 xfs_attr_leaf_compact(args->trans, bp); 1120 xfs_attr_leaf_compact(args, bp);
1075 1121
1076 /* 1122 /*
1077 * After compaction, the block is guaranteed to have only one 1123 * After compaction, the block is guaranteed to have only one
@@ -1102,6 +1148,8 @@ xfs_attr_leaf_add_work(
1102 xfs_mount_t *mp; 1148 xfs_mount_t *mp;
1103 int tmp, i; 1149 int tmp, i;
1104 1150
1151 trace_xfs_attr_leaf_add_work(args);
1152
1105 leaf = bp->b_addr; 1153 leaf = bp->b_addr;
1106 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1154 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1107 hdr = &leaf->hdr; 1155 hdr = &leaf->hdr;
@@ -1214,15 +1262,17 @@ xfs_attr_leaf_add_work(
1214 */ 1262 */
1215STATIC void 1263STATIC void
1216xfs_attr_leaf_compact( 1264xfs_attr_leaf_compact(
1217 struct xfs_trans *trans, 1265 struct xfs_da_args *args,
1218 struct xfs_buf *bp) 1266 struct xfs_buf *bp)
1219{ 1267{
1220 xfs_attr_leafblock_t *leaf_s, *leaf_d; 1268 xfs_attr_leafblock_t *leaf_s, *leaf_d;
1221 xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; 1269 xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
1222 xfs_mount_t *mp; 1270 struct xfs_trans *trans = args->trans;
1223 char *tmpbuffer; 1271 struct xfs_mount *mp = trans->t_mountp;
1272 char *tmpbuffer;
1273
1274 trace_xfs_attr_leaf_compact(args);
1224 1275
1225 mp = trans->t_mountp;
1226 tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); 1276 tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
1227 ASSERT(tmpbuffer != NULL); 1277 ASSERT(tmpbuffer != NULL);
1228 memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); 1278 memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
@@ -1345,9 +1395,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1345 max = be16_to_cpu(hdr2->firstused) 1395 max = be16_to_cpu(hdr2->firstused)
1346 - sizeof(xfs_attr_leaf_hdr_t); 1396 - sizeof(xfs_attr_leaf_hdr_t);
1347 max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); 1397 max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
1348 if (space > max) { 1398 if (space > max)
1349 xfs_attr_leaf_compact(args->trans, blk2->bp); 1399 xfs_attr_leaf_compact(args, blk2->bp);
1350 }
1351 1400
1352 /* 1401 /*
1353 * Move high entries from leaf1 to low end of leaf2. 1402 * Move high entries from leaf1 to low end of leaf2.
@@ -1378,9 +1427,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1378 max = be16_to_cpu(hdr1->firstused) 1427 max = be16_to_cpu(hdr1->firstused)
1379 - sizeof(xfs_attr_leaf_hdr_t); 1428 - sizeof(xfs_attr_leaf_hdr_t);
1380 max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); 1429 max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
1381 if (space > max) { 1430 if (space > max)
1382 xfs_attr_leaf_compact(args->trans, blk1->bp); 1431 xfs_attr_leaf_compact(args, blk1->bp);
1383 }
1384 1432
1385 /* 1433 /*
1386 * Move low entries from leaf2 to high end of leaf1. 1434 * Move low entries from leaf2 to high end of leaf1.
@@ -1577,6 +1625,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
1577 xfs_dablk_t blkno; 1625 xfs_dablk_t blkno;
1578 struct xfs_buf *bp; 1626 struct xfs_buf *bp;
1579 1627
1628 trace_xfs_attr_leaf_toosmall(state->args);
1629
1580 /* 1630 /*
1581 * Check for the degenerate case of the block being over 50% full. 1631 * Check for the degenerate case of the block being over 50% full.
1582 * If so, it's not worth even looking to see if we might be able 1632 * If so, it's not worth even looking to see if we might be able
@@ -1636,18 +1686,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
1636 blkno = be32_to_cpu(info->back); 1686 blkno = be32_to_cpu(info->back);
1637 if (blkno == 0) 1687 if (blkno == 0)
1638 continue; 1688 continue;
1639 error = xfs_da_read_buf(state->args->trans, state->args->dp, 1689 error = xfs_attr_leaf_read(state->args->trans, state->args->dp,
1640 blkno, -1, &bp, XFS_ATTR_FORK); 1690 blkno, -1, &bp);
1641 if (error) 1691 if (error)
1642 return(error); 1692 return(error);
1643 ASSERT(bp != NULL);
1644 1693
1645 leaf = (xfs_attr_leafblock_t *)info; 1694 leaf = (xfs_attr_leafblock_t *)info;
1646 count = be16_to_cpu(leaf->hdr.count); 1695 count = be16_to_cpu(leaf->hdr.count);
1647 bytes = state->blocksize - (state->blocksize>>2); 1696 bytes = state->blocksize - (state->blocksize>>2);
1648 bytes -= be16_to_cpu(leaf->hdr.usedbytes); 1697 bytes -= be16_to_cpu(leaf->hdr.usedbytes);
1649 leaf = bp->b_addr; 1698 leaf = bp->b_addr;
1650 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1651 count += be16_to_cpu(leaf->hdr.count); 1699 count += be16_to_cpu(leaf->hdr.count);
1652 bytes -= be16_to_cpu(leaf->hdr.usedbytes); 1700 bytes -= be16_to_cpu(leaf->hdr.usedbytes);
1653 bytes -= count * sizeof(xfs_attr_leaf_entry_t); 1701 bytes -= count * sizeof(xfs_attr_leaf_entry_t);
@@ -1702,6 +1750,8 @@ xfs_attr_leaf_remove(
1702 int tablesize, tmp, i; 1750 int tablesize, tmp, i;
1703 xfs_mount_t *mp; 1751 xfs_mount_t *mp;
1704 1752
1753 trace_xfs_attr_leaf_remove(args);
1754
1705 leaf = bp->b_addr; 1755 leaf = bp->b_addr;
1706 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); 1756 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
1707 hdr = &leaf->hdr; 1757 hdr = &leaf->hdr;
@@ -2511,15 +2561,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
2511 /* 2561 /*
2512 * Set up the operation. 2562 * Set up the operation.
2513 */ 2563 */
2514 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 2564 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2515 XFS_ATTR_FORK); 2565 if (error)
2516 if (error) {
2517 return(error); 2566 return(error);
2518 }
2519 ASSERT(bp != NULL);
2520 2567
2521 leaf = bp->b_addr; 2568 leaf = bp->b_addr;
2522 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2523 ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); 2569 ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
2524 ASSERT(args->index >= 0); 2570 ASSERT(args->index >= 0);
2525 entry = &leaf->entries[ args->index ]; 2571 entry = &leaf->entries[ args->index ];
@@ -2576,15 +2622,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
2576 /* 2622 /*
2577 * Set up the operation. 2623 * Set up the operation.
2578 */ 2624 */
2579 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, 2625 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2580 XFS_ATTR_FORK); 2626 if (error)
2581 if (error) {
2582 return(error); 2627 return(error);
2583 }
2584 ASSERT(bp != NULL);
2585 2628
2586 leaf = bp->b_addr; 2629 leaf = bp->b_addr;
2587 ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2588 ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); 2630 ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
2589 ASSERT(args->index >= 0); 2631 ASSERT(args->index >= 0);
2590 entry = &leaf->entries[ args->index ]; 2632 entry = &leaf->entries[ args->index ];
@@ -2633,35 +2675,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2633 /* 2675 /*
2634 * Read the block containing the "old" attr 2676 * Read the block containing the "old" attr
2635 */ 2677 */
2636 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, 2678 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
2637 XFS_ATTR_FORK); 2679 if (error)
2638 if (error) { 2680 return error;
2639 return(error);
2640 }
2641 ASSERT(bp1 != NULL);
2642 2681
2643 /* 2682 /*
2644 * Read the block containing the "new" attr, if it is different 2683 * Read the block containing the "new" attr, if it is different
2645 */ 2684 */
2646 if (args->blkno2 != args->blkno) { 2685 if (args->blkno2 != args->blkno) {
2647 error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, 2686 error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2,
2648 -1, &bp2, XFS_ATTR_FORK); 2687 -1, &bp2);
2649 if (error) { 2688 if (error)
2650 return(error); 2689 return error;
2651 }
2652 ASSERT(bp2 != NULL);
2653 } else { 2690 } else {
2654 bp2 = bp1; 2691 bp2 = bp1;
2655 } 2692 }
2656 2693
2657 leaf1 = bp1->b_addr; 2694 leaf1 = bp1->b_addr;
2658 ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2659 ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); 2695 ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
2660 ASSERT(args->index >= 0); 2696 ASSERT(args->index >= 0);
2661 entry1 = &leaf1->entries[ args->index ]; 2697 entry1 = &leaf1->entries[ args->index ];
2662 2698
2663 leaf2 = bp2->b_addr; 2699 leaf2 = bp2->b_addr;
2664 ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
2665 ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); 2700 ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
2666 ASSERT(args->index2 >= 0); 2701 ASSERT(args->index2 >= 0);
2667 entry2 = &leaf2->entries[ args->index2 ]; 2702 entry2 = &leaf2->entries[ args->index2 ];
@@ -2746,7 +2781,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
2746 * the extents in reverse order the extent containing 2781 * the extents in reverse order the extent containing
2747 * block 0 must still be there. 2782 * block 0 must still be there.
2748 */ 2783 */
2749 error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); 2784 error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
2750 if (error) 2785 if (error)
2751 return(error); 2786 return(error);
2752 blkno = XFS_BUF_ADDR(bp); 2787 blkno = XFS_BUF_ADDR(bp);
@@ -2831,7 +2866,7 @@ xfs_attr_node_inactive(
2831 * traversal of the tree so we may deal with many blocks 2866 * traversal of the tree so we may deal with many blocks
2832 * before we come back to this one. 2867 * before we come back to this one.
2833 */ 2868 */
2834 error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, 2869 error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp,
2835 XFS_ATTR_FORK); 2870 XFS_ATTR_FORK);
2836 if (error) 2871 if (error)
2837 return(error); 2872 return(error);
@@ -2872,8 +2907,8 @@ xfs_attr_node_inactive(
2872 * child block number. 2907 * child block number.
2873 */ 2908 */
2874 if ((i+1) < count) { 2909 if ((i+1) < count) {
2875 error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, 2910 error = xfs_da_node_read(*trans, dp, 0, parent_blkno,
2876 &bp, XFS_ATTR_FORK); 2911 &bp, XFS_ATTR_FORK);
2877 if (error) 2912 if (error)
2878 return(error); 2913 return(error);
2879 child_fsb = be32_to_cpu(node->btree[i+1].before); 2914 child_fsb = be32_to_cpu(node->btree[i+1].before);
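The xfs_attr_leaf_read() wrapper called in the hunks above is introduced by this patch outside the excerpted hunks; its declaration appears in xfs_attr_leaf.h below. A minimal sketch of the wrapper, assuming it simply binds the attr leaf verifier to the common da-btree read path, mirroring the xfs_da_node_read() wrapper shown later in this diff:

int
xfs_attr_leaf_read(
	struct xfs_trans	*tp,
	struct xfs_inode	*dp,
	xfs_dablk_t		bno,
	xfs_daddr_t		mappedbno,
	struct xfs_buf		**bpp)
{
	/* Delegate to the generic da-btree read, attaching the leaf
	 * verifier so the magic number is checked once at I/O
	 * completion rather than via ASSERTs at every call site. */
	return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
				XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
}

This is why the ASSERT(bp != NULL) and magic-number ASSERTs disappear from the callers: the verifier fails the read with EFSCORRUPTED before the buffer is ever handed back.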
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index dea17722945e..77de139a58f0 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -261,4 +261,10 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
261 struct xfs_buf *leaf2_bp); 261 struct xfs_buf *leaf2_bp);
262int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, 262int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
263 int *local); 263 int *local);
264int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
265 xfs_dablk_t bno, xfs_daddr_t mappedbno,
266 struct xfs_buf **bpp);
267
268extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops;
269
264#endif /* __XFS_ATTR_LEAF_H__ */ 270#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 83d0cf3df930..cdb2d3348583 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2662,8 +2662,9 @@ xfs_bmap_btree_to_extents(
2662 if ((error = xfs_btree_check_lptr(cur, cbno, 1))) 2662 if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
2663 return error; 2663 return error;
2664#endif 2664#endif
2665 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, 2665 error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
2666 XFS_BMAP_BTREE_REF))) 2666 &xfs_bmbt_buf_ops);
2667 if (error)
2667 return error; 2668 return error;
2668 cblock = XFS_BUF_TO_BLOCK(cbp); 2669 cblock = XFS_BUF_TO_BLOCK(cbp);
2669 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) 2670 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
@@ -3123,6 +3124,7 @@ xfs_bmap_extents_to_btree(
3123 /* 3124 /*
3124 * Fill in the child block. 3125 * Fill in the child block.
3125 */ 3126 */
3127 abp->b_ops = &xfs_bmbt_buf_ops;
3126 ablock = XFS_BUF_TO_BLOCK(abp); 3128 ablock = XFS_BUF_TO_BLOCK(abp);
3127 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); 3129 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
3128 ablock->bb_level = 0; 3130 ablock->bb_level = 0;
@@ -3269,6 +3271,7 @@ xfs_bmap_local_to_extents(
3269 ASSERT(args.len == 1); 3271 ASSERT(args.len == 1);
3270 *firstblock = args.fsbno; 3272 *firstblock = args.fsbno;
3271 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); 3273 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
3274 bp->b_ops = &xfs_bmbt_buf_ops;
3272 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); 3275 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
3273 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); 3276 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
3274 xfs_bmap_forkoff_reset(args.mp, ip, whichfork); 3277 xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
@@ -4078,8 +4081,9 @@ xfs_bmap_read_extents(
4078 * pointer (leftmost) at each level. 4081 * pointer (leftmost) at each level.
4079 */ 4082 */
4080 while (level-- > 0) { 4083 while (level-- > 0) {
4081 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4084 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4082 XFS_BMAP_BTREE_REF))) 4085 XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
4086 if (error)
4083 return error; 4087 return error;
4084 block = XFS_BUF_TO_BLOCK(bp); 4088 block = XFS_BUF_TO_BLOCK(bp);
4085 XFS_WANT_CORRUPTED_GOTO( 4089 XFS_WANT_CORRUPTED_GOTO(
@@ -4124,7 +4128,8 @@ xfs_bmap_read_extents(
4124 */ 4128 */
4125 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 4129 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
4126 if (nextbno != NULLFSBLOCK) 4130 if (nextbno != NULLFSBLOCK)
4127 xfs_btree_reada_bufl(mp, nextbno, 1); 4131 xfs_btree_reada_bufl(mp, nextbno, 1,
4132 &xfs_bmbt_buf_ops);
4128 /* 4133 /*
4129 * Copy records into the extent records. 4134 * Copy records into the extent records.
4130 */ 4135 */
@@ -4156,8 +4161,9 @@ xfs_bmap_read_extents(
4156 */ 4161 */
4157 if (bno == NULLFSBLOCK) 4162 if (bno == NULLFSBLOCK)
4158 break; 4163 break;
4159 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4164 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4160 XFS_BMAP_BTREE_REF))) 4165 XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
4166 if (error)
4161 return error; 4167 return error;
4162 block = XFS_BUF_TO_BLOCK(bp); 4168 block = XFS_BUF_TO_BLOCK(bp);
4163 } 4169 }
@@ -4674,9 +4680,6 @@ __xfs_bmapi_allocate(
4674 return error; 4680 return error;
4675 } 4681 }
4676 4682
4677 if (bma->flags & XFS_BMAPI_STACK_SWITCH)
4678 bma->stack_switch = 1;
4679
4680 error = xfs_bmap_alloc(bma); 4683 error = xfs_bmap_alloc(bma);
4681 if (error) 4684 if (error)
4682 return error; 4685 return error;
@@ -4950,6 +4953,9 @@ xfs_bmapi_write(
4950 bma.flist = flist; 4953 bma.flist = flist;
4951 bma.firstblock = firstblock; 4954 bma.firstblock = firstblock;
4952 4955
4956 if (flags & XFS_BMAPI_STACK_SWITCH)
4957 bma.stack_switch = 1;
4958
4953 while (bno < end && n < *nmap) { 4959 while (bno < end && n < *nmap) {
4954 inhole = eof || bma.got.br_startoff > bno; 4960 inhole = eof || bma.got.br_startoff > bno;
4955 wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); 4961 wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
@@ -5599,7 +5605,7 @@ xfs_getbmap(
5599 xfs_ilock(ip, XFS_IOLOCK_SHARED); 5605 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5600 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { 5606 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
5601 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { 5607 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
5602 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); 5608 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
5603 if (error) 5609 if (error)
5604 goto out_unlock_iolock; 5610 goto out_unlock_iolock;
5605 } 5611 }
@@ -5868,15 +5874,16 @@ xfs_bmap_check_leaf_extents(
5868 */ 5874 */
5869 while (level-- > 0) { 5875 while (level-- > 0) {
5870 /* See if buf is in cur first */ 5876 /* See if buf is in cur first */
5877 bp_release = 0;
5871 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); 5878 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
5872 if (bp) { 5879 if (!bp) {
5873 bp_release = 0;
5874 } else {
5875 bp_release = 1; 5880 bp_release = 1;
5881 error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5882 XFS_BMAP_BTREE_REF,
5883 &xfs_bmbt_buf_ops);
5884 if (error)
5885 goto error_norelse;
5876 } 5886 }
5877 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5878 XFS_BMAP_BTREE_REF)))
5879 goto error_norelse;
5880 block = XFS_BUF_TO_BLOCK(bp); 5887 block = XFS_BUF_TO_BLOCK(bp);
5881 XFS_WANT_CORRUPTED_GOTO( 5888 XFS_WANT_CORRUPTED_GOTO(
5882 xfs_bmap_sanity_check(mp, bp, level), 5889 xfs_bmap_sanity_check(mp, bp, level),
@@ -5953,15 +5960,16 @@ xfs_bmap_check_leaf_extents(
5953 if (bno == NULLFSBLOCK) 5960 if (bno == NULLFSBLOCK)
5954 break; 5961 break;
5955 5962
5963 bp_release = 0;
5956 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); 5964 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
5957 if (bp) { 5965 if (!bp) {
5958 bp_release = 0;
5959 } else {
5960 bp_release = 1; 5966 bp_release = 1;
5967 error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5968 XFS_BMAP_BTREE_REF,
5969 &xfs_bmbt_buf_ops);
5970 if (error)
5971 goto error_norelse;
5961 } 5972 }
5962 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5963 XFS_BMAP_BTREE_REF)))
5964 goto error_norelse;
5965 block = XFS_BUF_TO_BLOCK(bp); 5973 block = XFS_BUF_TO_BLOCK(bp);
5966 } 5974 }
5967 if (bp_release) { 5975 if (bp_release) {
@@ -6052,7 +6060,9 @@ xfs_bmap_count_tree(
6052 struct xfs_btree_block *block, *nextblock; 6060 struct xfs_btree_block *block, *nextblock;
6053 int numrecs; 6061 int numrecs;
6054 6062
6055 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) 6063 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
6064 &xfs_bmbt_buf_ops);
6065 if (error)
6056 return error; 6066 return error;
6057 *count += 1; 6067 *count += 1;
6058 block = XFS_BUF_TO_BLOCK(bp); 6068 block = XFS_BUF_TO_BLOCK(bp);
@@ -6061,8 +6071,10 @@ xfs_bmap_count_tree(
6061 /* Not at node above leaves, count this level of nodes */ 6071 /* Not at node above leaves, count this level of nodes */
6062 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); 6072 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6063 while (nextbno != NULLFSBLOCK) { 6073 while (nextbno != NULLFSBLOCK) {
6064 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6074 error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
6065 0, &nbp, XFS_BMAP_BTREE_REF))) 6075 XFS_BMAP_BTREE_REF,
6076 &xfs_bmbt_buf_ops);
6077 if (error)
6066 return error; 6078 return error;
6067 *count += 1; 6079 *count += 1;
6068 nextblock = XFS_BUF_TO_BLOCK(nbp); 6080 nextblock = XFS_BUF_TO_BLOCK(nbp);
@@ -6091,8 +6103,10 @@ xfs_bmap_count_tree(
6091 if (nextbno == NULLFSBLOCK) 6103 if (nextbno == NULLFSBLOCK)
6092 break; 6104 break;
6093 bno = nextbno; 6105 bno = nextbno;
6094 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 6106 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
6095 XFS_BMAP_BTREE_REF))) 6107 XFS_BMAP_BTREE_REF,
6108 &xfs_bmbt_buf_ops);
6109 if (error)
6096 return error; 6110 return error;
6097 *count += 1; 6111 *count += 1;
6098 block = XFS_BUF_TO_BLOCK(bp); 6112 block = XFS_BUF_TO_BLOCK(bp);
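Every xfs_btree_read_bufl() and xfs_btree_reada_bufl() call in xfs_bmap.c now names the verifier it expects. A caller-side sketch of the new convention (bno/nextbno illustrative; error handling as in the hunks above):

	struct xfs_buf	*bp;
	int		error;

	/* Read a bmap btree block; xfs_bmbt_verify() runs in the I/O
	 * completion path, so a successful return implies the block
	 * already passed its structural checks. */
	error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
				    XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
	if (error)
		return error;

	/* Readahead variant: no buffer is returned, but the verifier
	 * is attached now so the block is still checked when the
	 * speculative read completes. */
	xfs_btree_reada_bufl(mp, nextbno, 1, &xfs_bmbt_buf_ops);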
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 862084a47a7e..061b45cbe614 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -36,6 +36,7 @@
36#include "xfs_bmap.h" 36#include "xfs_bmap.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_quota.h" 38#include "xfs_quota.h"
39#include "xfs_trace.h"
39 40
40/* 41/*
41 * Determine the extent state. 42 * Determine the extent state.
@@ -707,6 +708,67 @@ xfs_bmbt_key_diff(
707 cur->bc_rec.b.br_startoff; 708 cur->bc_rec.b.br_startoff;
708} 709}
709 710
711static void
712xfs_bmbt_verify(
713 struct xfs_buf *bp)
714{
715 struct xfs_mount *mp = bp->b_target->bt_mount;
716 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
717 unsigned int level;
718 int lblock_ok; /* block passes checks */
719
720 /* magic number and level verification.
721 *
 722 * We don't know what fork we belong to, so just verify that the level
723 * is less than the maximum of the two. Later checks will be more
724 * precise.
725 */
726 level = be16_to_cpu(block->bb_level);
727 lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
728 level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);
729
730 /* numrecs verification */
731 lblock_ok = lblock_ok &&
732 be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
733
734 /* sibling pointer verification */
735 lblock_ok = lblock_ok &&
736 block->bb_u.l.bb_leftsib &&
737 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
738 XFS_FSB_SANITY_CHECK(mp,
739 be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
740 block->bb_u.l.bb_rightsib &&
741 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
742 XFS_FSB_SANITY_CHECK(mp,
743 be64_to_cpu(block->bb_u.l.bb_rightsib)));
744
745 if (!lblock_ok) {
746 trace_xfs_btree_corrupt(bp, _RET_IP_);
747 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
748 xfs_buf_ioerror(bp, EFSCORRUPTED);
749 }
750}
751
752static void
753xfs_bmbt_read_verify(
754 struct xfs_buf *bp)
755{
756 xfs_bmbt_verify(bp);
757}
758
759static void
760xfs_bmbt_write_verify(
761 struct xfs_buf *bp)
762{
763 xfs_bmbt_verify(bp);
764}
765
766const struct xfs_buf_ops xfs_bmbt_buf_ops = {
767 .verify_read = xfs_bmbt_read_verify,
768 .verify_write = xfs_bmbt_write_verify,
769};
770
771
710#ifdef DEBUG 772#ifdef DEBUG
711STATIC int 773STATIC int
712xfs_bmbt_keys_inorder( 774xfs_bmbt_keys_inorder(
@@ -746,6 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
746 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, 808 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
747 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, 809 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
748 .key_diff = xfs_bmbt_key_diff, 810 .key_diff = xfs_bmbt_key_diff,
811 .buf_ops = &xfs_bmbt_buf_ops,
749#ifdef DEBUG 812#ifdef DEBUG
750 .keys_inorder = xfs_bmbt_keys_inorder, 813 .keys_inorder = xfs_bmbt_keys_inorder,
751 .recs_inorder = xfs_bmbt_recs_inorder, 814 .recs_inorder = xfs_bmbt_recs_inorder,
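The read and write verifiers above are thin wrappers around a single structural check, a pattern this series repeats for each block type. A sketch of the idiom for a hypothetical block type (xfs_foo_*, XFS_FOO_MAGIC and struct xfs_foo_block are illustrative names, not part of this patch):

static void
xfs_foo_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;
	struct xfs_foo_block	*foo = bp->b_addr;	/* assumed layout */

	/* same error path as xfs_bmbt_verify() above: mark the buffer
	 * corrupt and let the completion code report the failure */
	if (foo->magic != cpu_to_be32(XFS_FOO_MAGIC)) {
		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, foo);
		xfs_buf_ioerror(bp, EFSCORRUPTED);
	}
}

static void
xfs_foo_read_verify(
	struct xfs_buf		*bp)
{
	xfs_foo_verify(bp);
}

static void
xfs_foo_write_verify(
	struct xfs_buf		*bp)
{
	xfs_foo_verify(bp);
}

const struct xfs_buf_ops xfs_foo_buf_ops = {
	.verify_read = xfs_foo_read_verify,
	.verify_write = xfs_foo_write_verify,
};

Separate read and write entry points are kept even when identical; the split leaves room for asymmetric work, such as recalculating a CRC on write but only verifying it on read (cf. the xfs_cksum.h helpers added below).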
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e66c4ea0f85..88469ca08696 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -236,5 +236,6 @@ extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
236extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, 236extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
237 struct xfs_trans *, struct xfs_inode *, int); 237 struct xfs_trans *, struct xfs_inode *, int);
238 238
239extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
239 240
240#endif /* __XFS_BMAP_BTREE_H__ */ 241#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e53e317b1582..db010408d701 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -266,9 +266,13 @@ xfs_btree_dup_cursor(
266 for (i = 0; i < new->bc_nlevels; i++) { 266 for (i = 0; i < new->bc_nlevels; i++) {
267 new->bc_ptrs[i] = cur->bc_ptrs[i]; 267 new->bc_ptrs[i] = cur->bc_ptrs[i];
268 new->bc_ra[i] = cur->bc_ra[i]; 268 new->bc_ra[i] = cur->bc_ra[i];
269 if ((bp = cur->bc_bufs[i])) { 269 bp = cur->bc_bufs[i];
270 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 270 if (bp) {
271 XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { 271 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
272 XFS_BUF_ADDR(bp), mp->m_bsize,
273 0, &bp,
274 cur->bc_ops->buf_ops);
275 if (error) {
272 xfs_btree_del_cursor(new, error); 276 xfs_btree_del_cursor(new, error);
273 *ncur = NULL; 277 *ncur = NULL;
274 return error; 278 return error;
@@ -609,25 +613,26 @@ xfs_btree_offsets(
609 * Get a buffer for the block, return it read in. 613 * Get a buffer for the block, return it read in.
610 * Long-form addressing. 614 * Long-form addressing.
611 */ 615 */
612int /* error */ 616int
613xfs_btree_read_bufl( 617xfs_btree_read_bufl(
614 xfs_mount_t *mp, /* file system mount point */ 618 struct xfs_mount *mp, /* file system mount point */
615 xfs_trans_t *tp, /* transaction pointer */ 619 struct xfs_trans *tp, /* transaction pointer */
616 xfs_fsblock_t fsbno, /* file system block number */ 620 xfs_fsblock_t fsbno, /* file system block number */
617 uint lock, /* lock flags for read_buf */ 621 uint lock, /* lock flags for read_buf */
618 xfs_buf_t **bpp, /* buffer for fsbno */ 622 struct xfs_buf **bpp, /* buffer for fsbno */
619 int refval) /* ref count value for buffer */ 623 int refval, /* ref count value for buffer */
620{ 624 const struct xfs_buf_ops *ops)
621 xfs_buf_t *bp; /* return value */ 625{
626 struct xfs_buf *bp; /* return value */
622 xfs_daddr_t d; /* real disk block address */ 627 xfs_daddr_t d; /* real disk block address */
623 int error; 628 int error;
624 629
625 ASSERT(fsbno != NULLFSBLOCK); 630 ASSERT(fsbno != NULLFSBLOCK);
626 d = XFS_FSB_TO_DADDR(mp, fsbno); 631 d = XFS_FSB_TO_DADDR(mp, fsbno);
627 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 632 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
628 mp->m_bsize, lock, &bp))) { 633 mp->m_bsize, lock, &bp, ops);
634 if (error)
629 return error; 635 return error;
630 }
631 ASSERT(!xfs_buf_geterror(bp)); 636 ASSERT(!xfs_buf_geterror(bp));
632 if (bp) 637 if (bp)
633 xfs_buf_set_ref(bp, refval); 638 xfs_buf_set_ref(bp, refval);
@@ -642,15 +647,16 @@ xfs_btree_read_bufl(
642/* ARGSUSED */ 647/* ARGSUSED */
643void 648void
644xfs_btree_reada_bufl( 649xfs_btree_reada_bufl(
645 xfs_mount_t *mp, /* file system mount point */ 650 struct xfs_mount *mp, /* file system mount point */
646 xfs_fsblock_t fsbno, /* file system block number */ 651 xfs_fsblock_t fsbno, /* file system block number */
647 xfs_extlen_t count) /* count of filesystem blocks */ 652 xfs_extlen_t count, /* count of filesystem blocks */
653 const struct xfs_buf_ops *ops)
648{ 654{
649 xfs_daddr_t d; 655 xfs_daddr_t d;
650 656
651 ASSERT(fsbno != NULLFSBLOCK); 657 ASSERT(fsbno != NULLFSBLOCK);
652 d = XFS_FSB_TO_DADDR(mp, fsbno); 658 d = XFS_FSB_TO_DADDR(mp, fsbno);
653 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); 659 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
654} 660}
655 661
656/* 662/*
@@ -660,17 +666,18 @@ xfs_btree_reada_bufl(
660/* ARGSUSED */ 666/* ARGSUSED */
661void 667void
662xfs_btree_reada_bufs( 668xfs_btree_reada_bufs(
663 xfs_mount_t *mp, /* file system mount point */ 669 struct xfs_mount *mp, /* file system mount point */
664 xfs_agnumber_t agno, /* allocation group number */ 670 xfs_agnumber_t agno, /* allocation group number */
665 xfs_agblock_t agbno, /* allocation group block number */ 671 xfs_agblock_t agbno, /* allocation group block number */
666 xfs_extlen_t count) /* count of filesystem blocks */ 672 xfs_extlen_t count, /* count of filesystem blocks */
673 const struct xfs_buf_ops *ops)
667{ 674{
668 xfs_daddr_t d; 675 xfs_daddr_t d;
669 676
670 ASSERT(agno != NULLAGNUMBER); 677 ASSERT(agno != NULLAGNUMBER);
671 ASSERT(agbno != NULLAGBLOCK); 678 ASSERT(agbno != NULLAGBLOCK);
672 d = XFS_AGB_TO_DADDR(mp, agno, agbno); 679 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
673 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); 680 xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
674} 681}
675 682
676STATIC int 683STATIC int
@@ -684,12 +691,14 @@ xfs_btree_readahead_lblock(
684 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 691 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
685 692
686 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { 693 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
687 xfs_btree_reada_bufl(cur->bc_mp, left, 1); 694 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
695 cur->bc_ops->buf_ops);
688 rval++; 696 rval++;
689 } 697 }
690 698
691 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { 699 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
692 xfs_btree_reada_bufl(cur->bc_mp, right, 1); 700 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
701 cur->bc_ops->buf_ops);
693 rval++; 702 rval++;
694 } 703 }
695 704
@@ -709,13 +718,13 @@ xfs_btree_readahead_sblock(
709 718
710 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { 719 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
711 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, 720 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
712 left, 1); 721 left, 1, cur->bc_ops->buf_ops);
713 rval++; 722 rval++;
714 } 723 }
715 724
716 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { 725 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
717 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, 726 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
718 right, 1); 727 right, 1, cur->bc_ops->buf_ops);
719 rval++; 728 rval++;
720 } 729 }
721 730
@@ -853,18 +862,22 @@ xfs_btree_set_sibling(
853 } 862 }
854} 863}
855 864
856STATIC void 865void
857xfs_btree_init_block( 866xfs_btree_init_block(
858 struct xfs_btree_cur *cur, 867 struct xfs_mount *mp,
859 int level, 868 struct xfs_buf *bp,
860 int numrecs, 869 __u32 magic,
861 struct xfs_btree_block *new) /* new block */ 870 __u16 level,
871 __u16 numrecs,
872 unsigned int flags)
862{ 873{
863 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); 874 struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp);
875
876 new->bb_magic = cpu_to_be32(magic);
864 new->bb_level = cpu_to_be16(level); 877 new->bb_level = cpu_to_be16(level);
865 new->bb_numrecs = cpu_to_be16(numrecs); 878 new->bb_numrecs = cpu_to_be16(numrecs);
866 879
867 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 880 if (flags & XFS_BTREE_LONG_PTRS) {
868 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); 881 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
869 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); 882 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
870 } else { 883 } else {
@@ -873,6 +886,17 @@ xfs_btree_init_block(
873 } 886 }
874} 887}
875 888
889STATIC void
890xfs_btree_init_block_cur(
891 struct xfs_btree_cur *cur,
892 int level,
893 int numrecs,
894 struct xfs_buf *bp)
895{
896 xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
897 level, numrecs, cur->bc_flags);
898}
899
876/* 900/*
877 * Return true if ptr is the last record in the btree and 901 * Return true if ptr is the last record in the btree and
878 * we need to track updates to this record. The decision 902 * we need to track updates to this record. The decision
@@ -972,6 +996,7 @@ xfs_btree_get_buf_block(
972 if (!*bpp) 996 if (!*bpp)
973 return ENOMEM; 997 return ENOMEM;
974 998
999 (*bpp)->b_ops = cur->bc_ops->buf_ops;
975 *block = XFS_BUF_TO_BLOCK(*bpp); 1000 *block = XFS_BUF_TO_BLOCK(*bpp);
976 return 0; 1001 return 0;
977} 1002}
@@ -998,19 +1023,15 @@ xfs_btree_read_buf_block(
998 1023
999 d = xfs_btree_ptr_to_daddr(cur, ptr); 1024 d = xfs_btree_ptr_to_daddr(cur, ptr);
1000 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, 1025 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1001 mp->m_bsize, flags, bpp); 1026 mp->m_bsize, flags, bpp,
1027 cur->bc_ops->buf_ops);
1002 if (error) 1028 if (error)
1003 return error; 1029 return error;
1004 1030
1005 ASSERT(!xfs_buf_geterror(*bpp)); 1031 ASSERT(!xfs_buf_geterror(*bpp));
1006
1007 xfs_btree_set_refs(cur, *bpp); 1032 xfs_btree_set_refs(cur, *bpp);
1008 *block = XFS_BUF_TO_BLOCK(*bpp); 1033 *block = XFS_BUF_TO_BLOCK(*bpp);
1009 1034 return 0;
1010 error = xfs_btree_check_block(cur, *block, level, *bpp);
1011 if (error)
1012 xfs_trans_brelse(cur->bc_tp, *bpp);
1013 return error;
1014} 1035}
1015 1036
1016/* 1037/*
@@ -2183,7 +2204,7 @@ xfs_btree_split(
2183 goto error0; 2204 goto error0;
2184 2205
2185 /* Fill in the btree header for the new right block. */ 2206 /* Fill in the btree header for the new right block. */
2186 xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); 2207 xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
2187 2208
2188 /* 2209 /*
2189 * Split the entries between the old and the new block evenly. 2210 * Split the entries between the old and the new block evenly.
@@ -2492,7 +2513,7 @@ xfs_btree_new_root(
2492 nptr = 2; 2513 nptr = 2;
2493 } 2514 }
2494 /* Fill in the new block's btree header and log it. */ 2515 /* Fill in the new block's btree header and log it. */
2495 xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); 2516 xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
2496 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); 2517 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2497 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && 2518 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2498 !xfs_btree_ptr_is_null(cur, &rptr)); 2519 !xfs_btree_ptr_is_null(cur, &rptr));
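xfs_btree_init_block() is now callable without a cursor, so a block header can be stamped wherever a buffer, a magic number and the pointer-format flags are at hand. A sketch of a cursorless call (the values are illustrative):

	/* stamp an empty long-format btree block header directly;
	 * before this change the static helper required a cursor */
	xfs_btree_init_block(mp, bp, XFS_BMAP_MAGIC, 0, 0,
			     XFS_BTREE_LONG_PTRS);

Cursor-based callers go through the new xfs_btree_init_block_cur() wrapper, which derives the magic and flags from the cursor exactly as the old static helper did.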
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 5b240de104c0..f932897194eb 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -188,6 +188,8 @@ struct xfs_btree_ops {
188 __int64_t (*key_diff)(struct xfs_btree_cur *cur, 188 __int64_t (*key_diff)(struct xfs_btree_cur *cur,
189 union xfs_btree_key *key); 189 union xfs_btree_key *key);
190 190
191 const struct xfs_buf_ops *buf_ops;
192
191#ifdef DEBUG 193#ifdef DEBUG
192 /* check that k1 is lower than k2 */ 194 /* check that k1 is lower than k2 */
193 int (*keys_inorder)(struct xfs_btree_cur *cur, 195 int (*keys_inorder)(struct xfs_btree_cur *cur,
@@ -355,7 +357,8 @@ xfs_btree_read_bufl(
355 xfs_fsblock_t fsbno, /* file system block number */ 357 xfs_fsblock_t fsbno, /* file system block number */
356 uint lock, /* lock flags for read_buf */ 358 uint lock, /* lock flags for read_buf */
357 struct xfs_buf **bpp, /* buffer for fsbno */ 359 struct xfs_buf **bpp, /* buffer for fsbno */
358 int refval);/* ref count value for buffer */ 360 int refval, /* ref count value for buffer */
361 const struct xfs_buf_ops *ops);
359 362
360/* 363/*
361 * Read-ahead the block, don't wait for it, don't return a buffer. 364 * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -365,7 +368,8 @@ void /* error */
365xfs_btree_reada_bufl( 368xfs_btree_reada_bufl(
366 struct xfs_mount *mp, /* file system mount point */ 369 struct xfs_mount *mp, /* file system mount point */
367 xfs_fsblock_t fsbno, /* file system block number */ 370 xfs_fsblock_t fsbno, /* file system block number */
368 xfs_extlen_t count); /* count of filesystem blocks */ 371 xfs_extlen_t count, /* count of filesystem blocks */
372 const struct xfs_buf_ops *ops);
369 373
370/* 374/*
371 * Read-ahead the block, don't wait for it, don't return a buffer. 375 * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -376,8 +380,20 @@ xfs_btree_reada_bufs(
376 struct xfs_mount *mp, /* file system mount point */ 380 struct xfs_mount *mp, /* file system mount point */
377 xfs_agnumber_t agno, /* allocation group number */ 381 xfs_agnumber_t agno, /* allocation group number */
378 xfs_agblock_t agbno, /* allocation group block number */ 382 xfs_agblock_t agbno, /* allocation group block number */
379 xfs_extlen_t count); /* count of filesystem blocks */ 383 xfs_extlen_t count, /* count of filesystem blocks */
384 const struct xfs_buf_ops *ops);
380 385
386/*
387 * Initialise a new btree block header
388 */
389void
390xfs_btree_init_block(
391 struct xfs_mount *mp,
392 struct xfs_buf *bp,
393 __u32 magic,
394 __u16 level,
395 __u16 numrecs,
396 unsigned int flags);
381 397
382/* 398/*
383 * Common btree core entry points. 399 * Common btree core entry points.
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4b0b8dd1b7b0..fbbb9eb92e32 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -175,7 +175,7 @@ xfs_buf_get_maps(
175 bp->b_map_count = map_count; 175 bp->b_map_count = map_count;
176 176
177 if (map_count == 1) { 177 if (map_count == 1) {
178 bp->b_maps = &bp->b_map; 178 bp->b_maps = &bp->__b_map;
179 return 0; 179 return 0;
180 } 180 }
181 181
@@ -193,7 +193,7 @@ static void
193xfs_buf_free_maps( 193xfs_buf_free_maps(
194 struct xfs_buf *bp) 194 struct xfs_buf *bp)
195{ 195{
196 if (bp->b_maps != &bp->b_map) { 196 if (bp->b_maps != &bp->__b_map) {
197 kmem_free(bp->b_maps); 197 kmem_free(bp->b_maps);
198 bp->b_maps = NULL; 198 bp->b_maps = NULL;
199 } 199 }
@@ -377,8 +377,8 @@ xfs_buf_allocate_memory(
377 } 377 }
378 378
379use_alloc_page: 379use_alloc_page:
380 start = BBTOB(bp->b_map.bm_bn) >> PAGE_SHIFT; 380 start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
381 end = (BBTOB(bp->b_map.bm_bn + bp->b_length) + PAGE_SIZE - 1) 381 end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
382 >> PAGE_SHIFT; 382 >> PAGE_SHIFT;
383 page_count = end - start; 383 page_count = end - start;
384 error = _xfs_buf_get_pages(bp, page_count, flags); 384 error = _xfs_buf_get_pages(bp, page_count, flags);
@@ -487,6 +487,7 @@ _xfs_buf_find(
487 struct rb_node *parent; 487 struct rb_node *parent;
488 xfs_buf_t *bp; 488 xfs_buf_t *bp;
489 xfs_daddr_t blkno = map[0].bm_bn; 489 xfs_daddr_t blkno = map[0].bm_bn;
490 xfs_daddr_t eofs;
490 int numblks = 0; 491 int numblks = 0;
491 int i; 492 int i;
492 493
@@ -498,6 +499,23 @@ _xfs_buf_find(
498 ASSERT(!(numbytes < (1 << btp->bt_sshift))); 499 ASSERT(!(numbytes < (1 << btp->bt_sshift)));
499 ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); 500 ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
500 501
502 /*
503 * Corrupted block numbers can get through to here, unfortunately, so we
504 * have to check that the buffer falls within the filesystem bounds.
505 */
506 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
507 if (blkno >= eofs) {
508 /*
509 * XXX (dgc): we should really be returning EFSCORRUPTED here,
510 * but none of the higher level infrastructure supports
511 * returning a specific error on buffer lookup failures.
512 */
513 xfs_alert(btp->bt_mount,
514 "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
515 __func__, blkno, eofs);
516 return NULL;
517 }
518
501 /* get tree root */ 519 /* get tree root */
502 pag = xfs_perag_get(btp->bt_mount, 520 pag = xfs_perag_get(btp->bt_mount,
503 xfs_daddr_to_agno(btp->bt_mount, blkno)); 521 xfs_daddr_to_agno(btp->bt_mount, blkno));
@@ -569,7 +587,9 @@ found:
569 */ 587 */
570 if (bp->b_flags & XBF_STALE) { 588 if (bp->b_flags & XBF_STALE) {
571 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 589 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
590 ASSERT(bp->b_iodone == NULL);
572 bp->b_flags &= _XBF_KMEM | _XBF_PAGES; 591 bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
592 bp->b_ops = NULL;
573 } 593 }
574 594
575 trace_xfs_buf_find(bp, flags, _RET_IP_); 595 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -638,7 +658,7 @@ _xfs_buf_read(
638 xfs_buf_flags_t flags) 658 xfs_buf_flags_t flags)
639{ 659{
640 ASSERT(!(flags & XBF_WRITE)); 660 ASSERT(!(flags & XBF_WRITE));
641 ASSERT(bp->b_map.bm_bn != XFS_BUF_DADDR_NULL); 661 ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
642 662
643 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); 663 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
644 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); 664 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
@@ -654,7 +674,8 @@ xfs_buf_read_map(
654 struct xfs_buftarg *target, 674 struct xfs_buftarg *target,
655 struct xfs_buf_map *map, 675 struct xfs_buf_map *map,
656 int nmaps, 676 int nmaps,
657 xfs_buf_flags_t flags) 677 xfs_buf_flags_t flags,
678 const struct xfs_buf_ops *ops)
658{ 679{
659 struct xfs_buf *bp; 680 struct xfs_buf *bp;
660 681
@@ -666,6 +687,7 @@ xfs_buf_read_map(
666 687
667 if (!XFS_BUF_ISDONE(bp)) { 688 if (!XFS_BUF_ISDONE(bp)) {
668 XFS_STATS_INC(xb_get_read); 689 XFS_STATS_INC(xb_get_read);
690 bp->b_ops = ops;
669 _xfs_buf_read(bp, flags); 691 _xfs_buf_read(bp, flags);
670 } else if (flags & XBF_ASYNC) { 692 } else if (flags & XBF_ASYNC) {
671 /* 693 /*
@@ -691,13 +713,14 @@ void
691xfs_buf_readahead_map( 713xfs_buf_readahead_map(
692 struct xfs_buftarg *target, 714 struct xfs_buftarg *target,
693 struct xfs_buf_map *map, 715 struct xfs_buf_map *map,
694 int nmaps) 716 int nmaps,
717 const struct xfs_buf_ops *ops)
695{ 718{
696 if (bdi_read_congested(target->bt_bdi)) 719 if (bdi_read_congested(target->bt_bdi))
697 return; 720 return;
698 721
699 xfs_buf_read_map(target, map, nmaps, 722 xfs_buf_read_map(target, map, nmaps,
700 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 723 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
701} 724}
702 725
703/* 726/*
@@ -709,10 +732,10 @@ xfs_buf_read_uncached(
709 struct xfs_buftarg *target, 732 struct xfs_buftarg *target,
710 xfs_daddr_t daddr, 733 xfs_daddr_t daddr,
711 size_t numblks, 734 size_t numblks,
712 int flags) 735 int flags,
736 const struct xfs_buf_ops *ops)
713{ 737{
714 xfs_buf_t *bp; 738 struct xfs_buf *bp;
715 int error;
716 739
717 bp = xfs_buf_get_uncached(target, numblks, flags); 740 bp = xfs_buf_get_uncached(target, numblks, flags);
718 if (!bp) 741 if (!bp)
@@ -723,13 +746,10 @@ xfs_buf_read_uncached(
723 bp->b_bn = daddr; 746 bp->b_bn = daddr;
724 bp->b_maps[0].bm_bn = daddr; 747 bp->b_maps[0].bm_bn = daddr;
725 bp->b_flags |= XBF_READ; 748 bp->b_flags |= XBF_READ;
749 bp->b_ops = ops;
726 750
727 xfsbdstrat(target->bt_mount, bp); 751 xfsbdstrat(target->bt_mount, bp);
728 error = xfs_buf_iowait(bp); 752 xfs_buf_iowait(bp);
729 if (error) {
730 xfs_buf_relse(bp);
731 return NULL;
732 }
733 return bp; 753 return bp;
734} 754}
735 755
@@ -999,27 +1019,37 @@ STATIC void
999xfs_buf_iodone_work( 1019xfs_buf_iodone_work(
1000 struct work_struct *work) 1020 struct work_struct *work)
1001{ 1021{
1002 xfs_buf_t *bp = 1022 struct xfs_buf *bp =
1003 container_of(work, xfs_buf_t, b_iodone_work); 1023 container_of(work, xfs_buf_t, b_iodone_work);
1024 bool read = !!(bp->b_flags & XBF_READ);
1025
1026 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1027 if (read && bp->b_ops)
1028 bp->b_ops->verify_read(bp);
1004 1029
1005 if (bp->b_iodone) 1030 if (bp->b_iodone)
1006 (*(bp->b_iodone))(bp); 1031 (*(bp->b_iodone))(bp);
1007 else if (bp->b_flags & XBF_ASYNC) 1032 else if (bp->b_flags & XBF_ASYNC)
1008 xfs_buf_relse(bp); 1033 xfs_buf_relse(bp);
1034 else {
1035 ASSERT(read && bp->b_ops);
1036 complete(&bp->b_iowait);
1037 }
1009} 1038}
1010 1039
1011void 1040void
1012xfs_buf_ioend( 1041xfs_buf_ioend(
1013 xfs_buf_t *bp, 1042 struct xfs_buf *bp,
1014 int schedule) 1043 int schedule)
1015{ 1044{
1045 bool read = !!(bp->b_flags & XBF_READ);
1046
1016 trace_xfs_buf_iodone(bp, _RET_IP_); 1047 trace_xfs_buf_iodone(bp, _RET_IP_);
1017 1048
1018 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1019 if (bp->b_error == 0) 1049 if (bp->b_error == 0)
1020 bp->b_flags |= XBF_DONE; 1050 bp->b_flags |= XBF_DONE;
1021 1051
1022 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { 1052 if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
1023 if (schedule) { 1053 if (schedule) {
1024 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); 1054 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
1025 queue_work(xfslogd_workqueue, &bp->b_iodone_work); 1055 queue_work(xfslogd_workqueue, &bp->b_iodone_work);
@@ -1027,6 +1057,7 @@ xfs_buf_ioend(
1027 xfs_buf_iodone_work(&bp->b_iodone_work); 1057 xfs_buf_iodone_work(&bp->b_iodone_work);
1028 } 1058 }
1029 } else { 1059 } else {
1060 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1030 complete(&bp->b_iowait); 1061 complete(&bp->b_iowait);
1031 } 1062 }
1032} 1063}
@@ -1314,6 +1345,20 @@ _xfs_buf_ioapply(
1314 rw |= REQ_FUA; 1345 rw |= REQ_FUA;
1315 if (bp->b_flags & XBF_FLUSH) 1346 if (bp->b_flags & XBF_FLUSH)
1316 rw |= REQ_FLUSH; 1347 rw |= REQ_FLUSH;
1348
1349 /*
1350 * Run the write verifier callback function if it exists. If
1351 * this function fails it will mark the buffer with an error and
1352 * the IO should not be dispatched.
1353 */
1354 if (bp->b_ops) {
1355 bp->b_ops->verify_write(bp);
1356 if (bp->b_error) {
1357 xfs_force_shutdown(bp->b_target->bt_mount,
1358 SHUTDOWN_CORRUPT_INCORE);
1359 return;
1360 }
1361 }
1317 } else if (bp->b_flags & XBF_READ_AHEAD) { 1362 } else if (bp->b_flags & XBF_READ_AHEAD) {
1318 rw = READA; 1363 rw = READA;
1319 } else { 1364 } else {
@@ -1460,6 +1505,8 @@ restart:
1460 while (!list_empty(&btp->bt_lru)) { 1505 while (!list_empty(&btp->bt_lru)) {
1461 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); 1506 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1462 if (atomic_read(&bp->b_hold) > 1) { 1507 if (atomic_read(&bp->b_hold) > 1) {
1508 trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
1509 list_move_tail(&bp->b_lru, &btp->bt_lru);
1463 spin_unlock(&btp->bt_lru_lock); 1510 spin_unlock(&btp->bt_lru_lock);
1464 delay(100); 1511 delay(100);
1465 goto restart; 1512 goto restart;
@@ -1682,7 +1729,7 @@ xfs_buf_cmp(
1682 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); 1729 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
1683 xfs_daddr_t diff; 1730 xfs_daddr_t diff;
1684 1731
1685 diff = ap->b_map.bm_bn - bp->b_map.bm_bn; 1732 diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
1686 if (diff < 0) 1733 if (diff < 0)
1687 return -1; 1734 return -1;
1688 if (diff > 0) 1735 if (diff > 0)
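Note the changed contract of xfs_buf_read_uncached() in this file: it now returns the buffer even when the I/O failed, instead of releasing it and returning NULL. A hedged caller sketch under the new contract, following the pattern converted callers in this series use (daddr/numblks illustrative):

	struct xfs_buf	*bp;
	int		error;

	/* NULL ops skips verification; when a verifier is supplied it
	 * runs before xfs_buf_iowait() completes inside this call, so
	 * corruption surfaces as bp->b_error rather than a NULL return */
	bp = xfs_buf_read_uncached(mp->m_ddev_targp, daddr, numblks, 0, NULL);
	if (!bp)
		return ENOMEM;
	if (bp->b_error) {
		error = bp->b_error;
		xfs_buf_relse(bp);
		return error;
	}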
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 7c0b6a0a1557..433a12ed7b17 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -100,6 +100,7 @@ typedef struct xfs_buftarg {
100struct xfs_buf; 100struct xfs_buf;
101typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 101typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
102 102
103
103#define XB_PAGES 2 104#define XB_PAGES 2
104 105
105struct xfs_buf_map { 106struct xfs_buf_map {
@@ -110,6 +111,11 @@ struct xfs_buf_map {
110#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ 111#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
111 struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; 112 struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
112 113
114struct xfs_buf_ops {
115 void (*verify_read)(struct xfs_buf *);
116 void (*verify_write)(struct xfs_buf *);
117};
118
113typedef struct xfs_buf { 119typedef struct xfs_buf {
114 /* 120 /*
115 * first cacheline holds all the fields needed for an uncontended cache 121 * first cacheline holds all the fields needed for an uncontended cache
@@ -145,7 +151,7 @@ typedef struct xfs_buf {
145 struct page **b_pages; /* array of page pointers */ 151 struct page **b_pages; /* array of page pointers */
146 struct page *b_page_array[XB_PAGES]; /* inline pages */ 152 struct page *b_page_array[XB_PAGES]; /* inline pages */
147 struct xfs_buf_map *b_maps; /* compound buffer map */ 153 struct xfs_buf_map *b_maps; /* compound buffer map */
148 struct xfs_buf_map b_map; /* inline compound buffer map */ 154 struct xfs_buf_map __b_map; /* inline compound buffer map */
149 int b_map_count; 155 int b_map_count;
150 int b_io_length; /* IO size in BBs */ 156 int b_io_length; /* IO size in BBs */
151 atomic_t b_pin_count; /* pin count */ 157 atomic_t b_pin_count; /* pin count */
@@ -153,13 +159,13 @@ typedef struct xfs_buf {
153 unsigned int b_page_count; /* size of page array */ 159 unsigned int b_page_count; /* size of page array */
154 unsigned int b_offset; /* page offset in first page */ 160 unsigned int b_offset; /* page offset in first page */
155 unsigned short b_error; /* error code on I/O */ 161 unsigned short b_error; /* error code on I/O */
162 const struct xfs_buf_ops *b_ops;
156 163
157#ifdef XFS_BUF_LOCK_TRACKING 164#ifdef XFS_BUF_LOCK_TRACKING
158 int b_last_holder; 165 int b_last_holder;
159#endif 166#endif
160} xfs_buf_t; 167} xfs_buf_t;
161 168
162
163/* Finding and Reading Buffers */ 169/* Finding and Reading Buffers */
164struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, 170struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
165 struct xfs_buf_map *map, int nmaps, 171 struct xfs_buf_map *map, int nmaps,
@@ -196,9 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
196 xfs_buf_flags_t flags); 202 xfs_buf_flags_t flags);
197struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, 203struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
198 struct xfs_buf_map *map, int nmaps, 204 struct xfs_buf_map *map, int nmaps,
199 xfs_buf_flags_t flags); 205 xfs_buf_flags_t flags,
206 const struct xfs_buf_ops *ops);
200void xfs_buf_readahead_map(struct xfs_buftarg *target, 207void xfs_buf_readahead_map(struct xfs_buftarg *target,
201 struct xfs_buf_map *map, int nmaps); 208 struct xfs_buf_map *map, int nmaps,
209 const struct xfs_buf_ops *ops);
202 210
203static inline struct xfs_buf * 211static inline struct xfs_buf *
204xfs_buf_get( 212xfs_buf_get(
@@ -216,20 +224,22 @@ xfs_buf_read(
216 struct xfs_buftarg *target, 224 struct xfs_buftarg *target,
217 xfs_daddr_t blkno, 225 xfs_daddr_t blkno,
218 size_t numblks, 226 size_t numblks,
219 xfs_buf_flags_t flags) 227 xfs_buf_flags_t flags,
228 const struct xfs_buf_ops *ops)
220{ 229{
221 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); 230 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
222 return xfs_buf_read_map(target, &map, 1, flags); 231 return xfs_buf_read_map(target, &map, 1, flags, ops);
223} 232}
224 233
225static inline void 234static inline void
226xfs_buf_readahead( 235xfs_buf_readahead(
227 struct xfs_buftarg *target, 236 struct xfs_buftarg *target,
228 xfs_daddr_t blkno, 237 xfs_daddr_t blkno,
229 size_t numblks) 238 size_t numblks,
239 const struct xfs_buf_ops *ops)
230{ 240{
231 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); 241 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
232 return xfs_buf_readahead_map(target, &map, 1); 242 return xfs_buf_readahead_map(target, &map, 1, ops);
233} 243}
234 244
235struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); 245struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
@@ -239,7 +249,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
239struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, 249struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
240 int flags); 250 int flags);
241struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, 251struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
242 xfs_daddr_t daddr, size_t numblks, int flags); 252 xfs_daddr_t daddr, size_t numblks, int flags,
253 const struct xfs_buf_ops *ops);
243void xfs_buf_hold(struct xfs_buf *bp); 254void xfs_buf_hold(struct xfs_buf *bp);
244 255
245/* Releasing Buffers */ 256/* Releasing Buffers */
@@ -319,8 +330,8 @@ void xfs_buf_stale(struct xfs_buf *bp);
319 * In future, uncached buffers will pass the block number directly to the io 330 * In future, uncached buffers will pass the block number directly to the io
320 * request function and hence these macros will go away at that point. 331 * request function and hence these macros will go away at that point.
321 */ 332 */
322#define XFS_BUF_ADDR(bp) ((bp)->b_map.bm_bn) 333#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn)
323#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_map.bm_bn = (xfs_daddr_t)(bno)) 334#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno))
324 335
325static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) 336static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
326{ 337{
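With the extra ops argument threaded through these inline wrappers, each cached read names its verifier at the call site, and passing NULL preserves the old unverified behaviour for not-yet-converted callers. A short usage sketch (blkno/numblks illustrative):

	/* verified read: the ops' verify_read callback runs at I/O
	 * completion and can fail the buffer with EFSCORRUPTED */
	bp = xfs_buf_read(target, blkno, numblks, 0, &xfs_bmbt_buf_ops);

	/* unverified read: NULL ops means no callback is attached */
	bp = xfs_buf_read(target, blkno, numblks, 0, NULL);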
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index becf4a97efc6..3f9949fee391 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -71,7 +71,7 @@ xfs_buf_item_log_debug(
71 chunk_num = byte >> XFS_BLF_SHIFT; 71 chunk_num = byte >> XFS_BLF_SHIFT;
72 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 72 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
73 bit_num = chunk_num & (NBWORD - 1); 73 bit_num = chunk_num & (NBWORD - 1);
74 wordp = &(bip->bli_format.blf_data_map[word_num]); 74 wordp = &(bip->__bli_format.blf_data_map[word_num]);
75 bit_set = *wordp & (1 << bit_num); 75 bit_set = *wordp & (1 << bit_num);
76 ASSERT(bit_set); 76 ASSERT(bit_set);
77 byte++; 77 byte++;
@@ -237,7 +237,7 @@ xfs_buf_item_size(
237 * cancel flag in it. 237 * cancel flag in it.
238 */ 238 */
239 trace_xfs_buf_item_size_stale(bip); 239 trace_xfs_buf_item_size_stale(bip);
240 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); 240 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
241 return bip->bli_format_count; 241 return bip->bli_format_count;
242 } 242 }
243 243
@@ -278,7 +278,7 @@ xfs_buf_item_format_segment(
278 uint buffer_offset; 278 uint buffer_offset;
279 279
280 /* copy the flags across from the base format item */ 280 /* copy the flags across from the base format item */
281 blfp->blf_flags = bip->bli_format.blf_flags; 281 blfp->blf_flags = bip->__bli_format.blf_flags;
282 282
283 /* 283 /*
284 * Base size is the actual size of the ondisk structure - it reflects 284 * Base size is the actual size of the ondisk structure - it reflects
@@ -287,6 +287,17 @@ xfs_buf_item_format_segment(
287 */ 287 */
288 base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + 288 base_size = offsetof(struct xfs_buf_log_format, blf_data_map) +
289 (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); 289 (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
290
291 nvecs = 0;
292 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
293 if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
294 /*
 295 * If the map is not dirty in the transaction, mark
296 * the size as zero and do not advance the vector pointer.
297 */
298 goto out;
299 }
300
290 vecp->i_addr = blfp; 301 vecp->i_addr = blfp;
291 vecp->i_len = base_size; 302 vecp->i_len = base_size;
292 vecp->i_type = XLOG_REG_TYPE_BFORMAT; 303 vecp->i_type = XLOG_REG_TYPE_BFORMAT;
@@ -301,15 +312,13 @@ xfs_buf_item_format_segment(
301 */ 312 */
302 trace_xfs_buf_item_format_stale(bip); 313 trace_xfs_buf_item_format_stale(bip);
303 ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); 314 ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
304 blfp->blf_size = nvecs; 315 goto out;
305 return vecp;
306 } 316 }
307 317
308 /* 318 /*
309 * Fill in an iovec for each set of contiguous chunks. 319 * Fill in an iovec for each set of contiguous chunks.
310 */ 320 */
311 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); 321
312 ASSERT(first_bit != -1);
313 last_bit = first_bit; 322 last_bit = first_bit;
314 nbits = 1; 323 nbits = 1;
315 for (;;) { 324 for (;;) {
@@ -371,7 +380,8 @@ xfs_buf_item_format_segment(
371 nbits++; 380 nbits++;
372 } 381 }
373 } 382 }
374 bip->bli_format.blf_size = nvecs; 383out:
384 blfp->blf_size = nvecs;
375 return vecp; 385 return vecp;
376} 386}
377 387
@@ -405,7 +415,7 @@ xfs_buf_item_format(
405 if (bip->bli_flags & XFS_BLI_INODE_BUF) { 415 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
406 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 416 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
407 xfs_log_item_in_current_chkpt(lip))) 417 xfs_log_item_in_current_chkpt(lip)))
408 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; 418 bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
409 bip->bli_flags &= ~XFS_BLI_INODE_BUF; 419 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
410 } 420 }
411 421
@@ -485,7 +495,7 @@ xfs_buf_item_unpin(
485 ASSERT(bip->bli_flags & XFS_BLI_STALE); 495 ASSERT(bip->bli_flags & XFS_BLI_STALE);
486 ASSERT(xfs_buf_islocked(bp)); 496 ASSERT(xfs_buf_islocked(bp));
487 ASSERT(XFS_BUF_ISSTALE(bp)); 497 ASSERT(XFS_BUF_ISSTALE(bp));
488 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); 498 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
489 499
490 trace_xfs_buf_item_unpin_stale(bip); 500 trace_xfs_buf_item_unpin_stale(bip);
491 501
@@ -601,7 +611,7 @@ xfs_buf_item_unlock(
601{ 611{
602 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 612 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
603 struct xfs_buf *bp = bip->bli_buf; 613 struct xfs_buf *bp = bip->bli_buf;
604 int aborted; 614 int aborted, clean, i;
605 uint hold; 615 uint hold;
606 616
607 /* Clear the buffer's association with this transaction. */ 617 /* Clear the buffer's association with this transaction. */
@@ -631,7 +641,7 @@ xfs_buf_item_unlock(
631 */ 641 */
632 if (bip->bli_flags & XFS_BLI_STALE) { 642 if (bip->bli_flags & XFS_BLI_STALE) {
633 trace_xfs_buf_item_unlock_stale(bip); 643 trace_xfs_buf_item_unlock_stale(bip);
634 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); 644 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
635 if (!aborted) { 645 if (!aborted) {
636 atomic_dec(&bip->bli_refcount); 646 atomic_dec(&bip->bli_refcount);
637 return; 647 return;
@@ -642,12 +652,27 @@ xfs_buf_item_unlock(
642 652
643 /* 653 /*
644 * If the buf item isn't tracking any data, free it, otherwise drop the 654 * If the buf item isn't tracking any data, free it, otherwise drop the
645 * reference we hold to it. 655 * reference we hold to it. If we are aborting the transaction, this may
656 * be the only reference to the buf item, so we free it anyway
657 * regardless of whether it is dirty or not. A dirty abort implies a
658 * shutdown, anyway.
646 */ 659 */
647 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 660 clean = 1;
648 bip->bli_format.blf_map_size)) 661 for (i = 0; i < bip->bli_format_count; i++) {
662 if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
663 bip->bli_formats[i].blf_map_size)) {
664 clean = 0;
665 break;
666 }
667 }
668 if (clean)
649 xfs_buf_item_relse(bp); 669 xfs_buf_item_relse(bp);
650 else 670 else if (aborted) {
671 if (atomic_dec_and_test(&bip->bli_refcount)) {
672 ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
673 xfs_buf_item_relse(bp);
674 }
675 } else
651 atomic_dec(&bip->bli_refcount); 676 atomic_dec(&bip->bli_refcount);
652 677
653 if (!hold) 678 if (!hold)
@@ -716,7 +741,7 @@ xfs_buf_item_get_format(
716 bip->bli_format_count = count; 741 bip->bli_format_count = count;
717 742
718 if (count == 1) { 743 if (count == 1) {
719 bip->bli_formats = &bip->bli_format; 744 bip->bli_formats = &bip->__bli_format;
720 return 0; 745 return 0;
721 } 746 }
722 747
@@ -731,7 +756,7 @@ STATIC void
731xfs_buf_item_free_format( 756xfs_buf_item_free_format(
732 struct xfs_buf_log_item *bip) 757 struct xfs_buf_log_item *bip)
733{ 758{
734 if (bip->bli_formats != &bip->bli_format) { 759 if (bip->bli_formats != &bip->__bli_format) {
735 kmem_free(bip->bli_formats); 760 kmem_free(bip->bli_formats);
736 bip->bli_formats = NULL; 761 bip->bli_formats = NULL;
737 } 762 }
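The unlock path now declares a multi-segment buf item clean only when every segment's dirty bitmap is empty. A standalone sketch of that invariant in plain C (toy types, not the kernel structures):

#include <stdbool.h>
#include <stddef.h>

/* toy stand-in for one xfs_buf_log_format segment */
struct seg {
	const unsigned int	*data_map;
	size_t			map_words;	/* words in data_map */
};

/* mirrors xfs_bitmap_empty(): true if no bit is set in the map */
static bool
bitmap_empty(const unsigned int *map, size_t words)
{
	size_t i;

	for (i = 0; i < words; i++)
		if (map[i])
			return false;
	return true;
}

/* clean only if every segment is clean: a single dirty word in any
 * segment keeps the whole buf item alive, which is exactly the loop
 * xfs_buf_item_unlock() gains above */
static bool
buf_item_clean(const struct seg *segs, int count)
{
	int i;

	for (i = 0; i < count; i++)
		if (!bitmap_empty(segs[i].data_map, segs[i].map_words))
			return false;
	return true;
}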
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 6850f49f4af3..16def435944a 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -104,7 +104,7 @@ typedef struct xfs_buf_log_item {
104#endif 104#endif
105 int bli_format_count; /* count of headers */ 105 int bli_format_count; /* count of headers */
106 struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ 106 struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */
107 struct xfs_buf_log_format bli_format; /* embedded in-log header */ 107 struct xfs_buf_log_format __bli_format; /* embedded in-log header */
108} xfs_buf_log_item_t; 108} xfs_buf_log_item_t;
109 109
110void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); 110void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h
new file mode 100644
index 000000000000..fad1676ad8cd
--- /dev/null
+++ b/fs/xfs/xfs_cksum.h
@@ -0,0 +1,63 @@
1#ifndef _XFS_CKSUM_H
2#define _XFS_CKSUM_H 1
3
4#define XFS_CRC_SEED (~(__uint32_t)0)
5
6/*
7 * Calculate the intermediate checksum for a buffer that has the CRC field
8 * inside it. The offset of the 32bit crc fields is passed as the
9 * cksum_offset parameter.
10 */
11static inline __uint32_t
12xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
13{
14 __uint32_t zero = 0;
15 __uint32_t crc;
16
17 /* Calculate CRC up to the checksum. */
18 crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
19
20 /* Skip checksum field */
21 crc = crc32c(crc, &zero, sizeof(__u32));
22
23 /* Calculate the rest of the CRC. */
24 return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
25 length - (cksum_offset + sizeof(__be32)));
26}
27
28/*
29 * Convert the intermediate checksum to the final ondisk format.
30 *
31 * The CRC32c calculation uses LE format even on BE machines, but returns the
32 * result in host endian format. Hence we need to byte swap it back to LE format
33 * so that it is consistent on disk.
34 */
35static inline __le32
36xfs_end_cksum(__uint32_t crc)
37{
38 return ~cpu_to_le32(crc);
39}
40
41/*
42 * Helper to generate the checksum for a buffer.
43 */
44static inline void
45xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
46{
47 __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
48
49 *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
50}
51
52/*
53 * Helper to verify the checksum for a buffer.
54 */
55static inline int
56xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
57{
58 __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
59
60 return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
61}
62
63#endif /* _XFS_CKSUM_H */
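Splitting the calculation into xfs_start_cksum()/xfs_end_cksum() lets a caller fold further data into the CRC before finalising, but the common case is the one-shot pair. A usage sketch (the 512-byte block and the CRC field at offset 4 are illustrative, not from this patch):

	char	block[512];

	/* before write: stamp the on-disk LE CRC over the whole block,
	 * treating the 32-bit field at offset 4 as zero while hashing */
	xfs_update_cksum(block, sizeof(block), 4);

	/* after read: recompute the same way and compare against the
	 * stored value; a mismatch means on-disk corruption */
	if (!xfs_verify_cksum(block, sizeof(block), 4))
		return EFSCORRUPTED;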
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 7bfb7dd334fc..4d7696a02418 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -91,6 +91,84 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state,
91 xfs_da_state_blk_t *save_blk); 91 xfs_da_state_blk_t *save_blk);
92STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); 92STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
93 93
94static void
95xfs_da_node_verify(
96 struct xfs_buf *bp)
97{
98 struct xfs_mount *mp = bp->b_target->bt_mount;
99 struct xfs_da_node_hdr *hdr = bp->b_addr;
100 int block_ok = 0;
101
102 block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC);
103 block_ok = block_ok &&
104 be16_to_cpu(hdr->level) > 0 &&
 105 be16_to_cpu(hdr->count) > 0;
106 if (!block_ok) {
107 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
108 xfs_buf_ioerror(bp, EFSCORRUPTED);
109 }
110
111}
112
113static void
114xfs_da_node_write_verify(
115 struct xfs_buf *bp)
116{
117 xfs_da_node_verify(bp);
118}
119
120/*
121 * leaf/node format detection on trees is sketchy, so a node read can be done on
122 * leaf level blocks when detection identifies the tree as a node format tree
123 * incorrectly. In this case, we need to swap the verifier to match the correct
124 * format of the block being read.
125 */
126static void
127xfs_da_node_read_verify(
128 struct xfs_buf *bp)
129{
130 struct xfs_mount *mp = bp->b_target->bt_mount;
131 struct xfs_da_blkinfo *info = bp->b_addr;
132
133 switch (be16_to_cpu(info->magic)) {
134 case XFS_DA_NODE_MAGIC:
135 xfs_da_node_verify(bp);
136 break;
137 case XFS_ATTR_LEAF_MAGIC:
138 bp->b_ops = &xfs_attr_leaf_buf_ops;
139 bp->b_ops->verify_read(bp);
140 return;
141 case XFS_DIR2_LEAFN_MAGIC:
142 bp->b_ops = &xfs_dir2_leafn_buf_ops;
143 bp->b_ops->verify_read(bp);
144 return;
145 default:
146 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
147 mp, info);
148 xfs_buf_ioerror(bp, EFSCORRUPTED);
149 break;
150 }
151}
152
153const struct xfs_buf_ops xfs_da_node_buf_ops = {
154 .verify_read = xfs_da_node_read_verify,
155 .verify_write = xfs_da_node_write_verify,
156};
157
158
159int
160xfs_da_node_read(
161 struct xfs_trans *tp,
162 struct xfs_inode *dp,
163 xfs_dablk_t bno,
164 xfs_daddr_t mappedbno,
165 struct xfs_buf **bpp,
166 int which_fork)
167{
168 return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
169 which_fork, &xfs_da_node_buf_ops);
170}
171
94/*======================================================================== 172/*========================================================================
95 * Routines used for growing the Btree. 173 * Routines used for growing the Btree.
96 *========================================================================*/ 174 *========================================================================*/
@@ -125,6 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
125 xfs_trans_log_buf(tp, bp, 203 xfs_trans_log_buf(tp, bp,
126 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); 204 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
127 205
206 bp->b_ops = &xfs_da_node_buf_ops;
128 *bpp = bp; 207 *bpp = bp;
129 return(0); 208 return(0);
130} 209}
@@ -324,6 +403,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
324 } 403 }
325 memcpy(node, oldroot, size); 404 memcpy(node, oldroot, size);
326 xfs_trans_log_buf(tp, bp, 0, size - 1); 405 xfs_trans_log_buf(tp, bp, 0, size - 1);
406
407 bp->b_ops = blk1->bp->b_ops;
327 blk1->bp = bp; 408 blk1->bp = bp;
328 blk1->blkno = blkno; 409 blk1->blkno = blkno;
329 410
@@ -746,7 +827,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
746 */ 827 */
747 child = be32_to_cpu(oldroot->btree[0].before); 828 child = be32_to_cpu(oldroot->btree[0].before);
748 ASSERT(child != 0); 829 ASSERT(child != 0);
749 error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, 830 error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp,
750 args->whichfork); 831 args->whichfork);
751 if (error) 832 if (error)
752 return(error); 833 return(error);
@@ -754,7 +835,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
754 xfs_da_blkinfo_onlychild_validate(bp->b_addr, 835 xfs_da_blkinfo_onlychild_validate(bp->b_addr,
755 be16_to_cpu(oldroot->hdr.level)); 836 be16_to_cpu(oldroot->hdr.level));
756 837
838 /*
839 * This could be copying a leaf back into the root block in the case of
840 * there only being a single leaf block left in the tree. Hence we have
841 * to update the b_ops pointer as well to match the buffer type change
842 * that could occur.
843 */
757 memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); 844 memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
845 root_blk->bp->b_ops = bp->b_ops;
758 xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); 846 xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
759 error = xfs_da_shrink_inode(args, child, bp); 847 error = xfs_da_shrink_inode(args, child, bp);
760 return(error); 848 return(error);
@@ -779,6 +867,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
779 xfs_dablk_t blkno; 867 xfs_dablk_t blkno;
780 struct xfs_buf *bp; 868 struct xfs_buf *bp;
781 869
870 trace_xfs_da_node_toosmall(state->args);
871
782 /* 872 /*
783 * Check for the degenerate case of the block being over 50% full. 873 * Check for the degenerate case of the block being over 50% full.
784 * If so, it's not worth even looking to see if we might be able 874 * If so, it's not worth even looking to see if we might be able
@@ -835,7 +925,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
835 blkno = be32_to_cpu(info->back); 925 blkno = be32_to_cpu(info->back);
836 if (blkno == 0) 926 if (blkno == 0)
837 continue; 927 continue;
838 error = xfs_da_read_buf(state->args->trans, state->args->dp, 928 error = xfs_da_node_read(state->args->trans, state->args->dp,
839 blkno, -1, &bp, state->args->whichfork); 929 blkno, -1, &bp, state->args->whichfork);
840 if (error) 930 if (error)
841 return(error); 931 return(error);
@@ -900,6 +990,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
900 xfs_dahash_t lasthash=0; 990 xfs_dahash_t lasthash=0;
901 int level, count; 991 int level, count;
902 992
993 trace_xfs_da_fixhashpath(state->args);
994
903 level = path->active-1; 995 level = path->active-1;
904 blk = &path->blk[ level ]; 996 blk = &path->blk[ level ];
905 switch (blk->magic) { 997 switch (blk->magic) {
@@ -1079,7 +1171,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1079 * Read the next node down in the tree. 1171 * Read the next node down in the tree.
1080 */ 1172 */
1081 blk->blkno = blkno; 1173 blk->blkno = blkno;
1082 error = xfs_da_read_buf(args->trans, args->dp, blkno, 1174 error = xfs_da_node_read(args->trans, args->dp, blkno,
1083 -1, &blk->bp, args->whichfork); 1175 -1, &blk->bp, args->whichfork);
1084 if (error) { 1176 if (error) {
1085 blk->blkno = 0; 1177 blk->blkno = 0;
@@ -1241,7 +1333,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1241 new_info->forw = cpu_to_be32(old_blk->blkno); 1333 new_info->forw = cpu_to_be32(old_blk->blkno);
1242 new_info->back = old_info->back; 1334 new_info->back = old_info->back;
1243 if (old_info->back) { 1335 if (old_info->back) {
1244 error = xfs_da_read_buf(args->trans, args->dp, 1336 error = xfs_da_node_read(args->trans, args->dp,
1245 be32_to_cpu(old_info->back), 1337 be32_to_cpu(old_info->back),
1246 -1, &bp, args->whichfork); 1338 -1, &bp, args->whichfork);
1247 if (error) 1339 if (error)
@@ -1262,7 +1354,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1262 new_info->forw = old_info->forw; 1354 new_info->forw = old_info->forw;
1263 new_info->back = cpu_to_be32(old_blk->blkno); 1355 new_info->back = cpu_to_be32(old_blk->blkno);
1264 if (old_info->forw) { 1356 if (old_info->forw) {
1265 error = xfs_da_read_buf(args->trans, args->dp, 1357 error = xfs_da_node_read(args->trans, args->dp,
1266 be32_to_cpu(old_info->forw), 1358 be32_to_cpu(old_info->forw),
1267 -1, &bp, args->whichfork); 1359 -1, &bp, args->whichfork);
1268 if (error) 1360 if (error)
@@ -1362,7 +1454,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1362 trace_xfs_da_unlink_back(args); 1454 trace_xfs_da_unlink_back(args);
1363 save_info->back = drop_info->back; 1455 save_info->back = drop_info->back;
1364 if (drop_info->back) { 1456 if (drop_info->back) {
1365 error = xfs_da_read_buf(args->trans, args->dp, 1457 error = xfs_da_node_read(args->trans, args->dp,
1366 be32_to_cpu(drop_info->back), 1458 be32_to_cpu(drop_info->back),
1367 -1, &bp, args->whichfork); 1459 -1, &bp, args->whichfork);
1368 if (error) 1460 if (error)
@@ -1379,7 +1471,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1379 trace_xfs_da_unlink_forward(args); 1471 trace_xfs_da_unlink_forward(args);
1380 save_info->forw = drop_info->forw; 1472 save_info->forw = drop_info->forw;
1381 if (drop_info->forw) { 1473 if (drop_info->forw) {
1382 error = xfs_da_read_buf(args->trans, args->dp, 1474 error = xfs_da_node_read(args->trans, args->dp,
1383 be32_to_cpu(drop_info->forw), 1475 be32_to_cpu(drop_info->forw),
1384 -1, &bp, args->whichfork); 1476 -1, &bp, args->whichfork);
1385 if (error) 1477 if (error)
@@ -1417,6 +1509,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1417 xfs_dablk_t blkno=0; 1509 xfs_dablk_t blkno=0;
1418 int level, error; 1510 int level, error;
1419 1511
1512 trace_xfs_da_path_shift(state->args);
1513
1420 /* 1514 /*
1421 * Roll up the Btree looking for the first block where our 1515 * Roll up the Btree looking for the first block where our
1422 * current index is not at the edge of the block. Note that 1516 * current index is not at the edge of the block. Note that
@@ -1463,8 +1557,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1463 * Read the next child block. 1557 * Read the next child block.
1464 */ 1558 */
1465 blk->blkno = blkno; 1559 blk->blkno = blkno;
1466 error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, 1560 error = xfs_da_node_read(args->trans, args->dp, blkno, -1,
1467 &blk->bp, args->whichfork); 1561 &blk->bp, args->whichfork);
1468 if (error) 1562 if (error)
1469 return(error); 1563 return(error);
1470 ASSERT(blk->bp != NULL); 1564 ASSERT(blk->bp != NULL);
@@ -1727,7 +1821,8 @@ xfs_da_swap_lastblock(
1727 * Read the last block in the btree space. 1821 * Read the last block in the btree space.
1728 */ 1822 */
1729 last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; 1823 last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
1730 if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) 1824 error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w);
1825 if (error)
1731 return error; 1826 return error;
1732 /* 1827 /*
1733 * Copy the last block into the dead buffer and log it. 1828 * Copy the last block into the dead buffer and log it.
@@ -1753,7 +1848,8 @@ xfs_da_swap_lastblock(
1753 * If the moved block has a left sibling, fix up the pointers. 1848 * If the moved block has a left sibling, fix up the pointers.
1754 */ 1849 */
1755 if ((sib_blkno = be32_to_cpu(dead_info->back))) { 1850 if ((sib_blkno = be32_to_cpu(dead_info->back))) {
1756 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) 1851 error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
1852 if (error)
1757 goto done; 1853 goto done;
1758 sib_info = sib_buf->b_addr; 1854 sib_info = sib_buf->b_addr;
1759 if (unlikely( 1855 if (unlikely(
@@ -1774,7 +1870,8 @@ xfs_da_swap_lastblock(
1774 * If the moved block has a right sibling, fix up the pointers. 1870 * If the moved block has a right sibling, fix up the pointers.
1775 */ 1871 */
1776 if ((sib_blkno = be32_to_cpu(dead_info->forw))) { 1872 if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
1777 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) 1873 error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
1874 if (error)
1778 goto done; 1875 goto done;
1779 sib_info = sib_buf->b_addr; 1876 sib_info = sib_buf->b_addr;
1780 if (unlikely( 1877 if (unlikely(
@@ -1797,7 +1894,8 @@ xfs_da_swap_lastblock(
1797 * Walk down the tree looking for the parent of the moved block. 1894 * Walk down the tree looking for the parent of the moved block.
1798 */ 1895 */
1799 for (;;) { 1896 for (;;) {
1800 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) 1897 error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
1898 if (error)
1801 goto done; 1899 goto done;
1802 par_node = par_buf->b_addr; 1900 par_node = par_buf->b_addr;
1803 if (unlikely(par_node->hdr.info.magic != 1901 if (unlikely(par_node->hdr.info.magic !=
@@ -1847,7 +1945,8 @@ xfs_da_swap_lastblock(
1847 error = XFS_ERROR(EFSCORRUPTED); 1945 error = XFS_ERROR(EFSCORRUPTED);
1848 goto done; 1946 goto done;
1849 } 1947 }
1850 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) 1948 error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
1949 if (error)
1851 goto done; 1950 goto done;
1852 par_node = par_buf->b_addr; 1951 par_node = par_buf->b_addr;
1853 if (unlikely( 1952 if (unlikely(
@@ -2133,7 +2232,8 @@ xfs_da_read_buf(
2133 xfs_dablk_t bno, 2232 xfs_dablk_t bno,
2134 xfs_daddr_t mappedbno, 2233 xfs_daddr_t mappedbno,
2135 struct xfs_buf **bpp, 2234 struct xfs_buf **bpp,
2136 int whichfork) 2235 int whichfork,
2236 const struct xfs_buf_ops *ops)
2137{ 2237{
2138 struct xfs_buf *bp; 2238 struct xfs_buf *bp;
2139 struct xfs_buf_map map; 2239 struct xfs_buf_map map;
@@ -2155,7 +2255,7 @@ xfs_da_read_buf(
2155 2255
2156 error = xfs_trans_read_buf_map(dp->i_mount, trans, 2256 error = xfs_trans_read_buf_map(dp->i_mount, trans,
2157 dp->i_mount->m_ddev_targp, 2257 dp->i_mount->m_ddev_targp,
2158 mapp, nmap, 0, &bp); 2258 mapp, nmap, 0, &bp, ops);
2159 if (error) 2259 if (error)
2160 goto out_free; 2260 goto out_free;
2161 2261
@@ -2211,9 +2311,10 @@ xfs_da_reada_buf(
2211 struct xfs_trans *trans, 2311 struct xfs_trans *trans,
2212 struct xfs_inode *dp, 2312 struct xfs_inode *dp,
2213 xfs_dablk_t bno, 2313 xfs_dablk_t bno,
2214 int whichfork) 2314 xfs_daddr_t mappedbno,
2315 int whichfork,
2316 const struct xfs_buf_ops *ops)
2215{ 2317{
2216 xfs_daddr_t mappedbno = -1;
2217 struct xfs_buf_map map; 2318 struct xfs_buf_map map;
2218 struct xfs_buf_map *mapp; 2319 struct xfs_buf_map *mapp;
2219 int nmap; 2320 int nmap;
@@ -2221,7 +2322,7 @@ xfs_da_reada_buf(
2221 2322
2222 mapp = &map; 2323 mapp = &map;
2223 nmap = 1; 2324 nmap = 1;
2224 error = xfs_dabuf_map(trans, dp, bno, -1, whichfork, 2325 error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
2225 &mapp, &nmap); 2326 &mapp, &nmap);
2226 if (error) { 2327 if (error) {
2227 /* mapping a hole is not an error, but we don't continue */ 2328 /* mapping a hole is not an error, but we don't continue */
@@ -2231,7 +2332,7 @@ xfs_da_reada_buf(
2231 } 2332 }
2232 2333
2233 mappedbno = mapp[0].bm_bn; 2334 mappedbno = mapp[0].bm_bn;
2234 xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); 2335 xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
2235 2336
2236out_free: 2337out_free:
2237 if (mapp != &map) 2338 if (mapp != &map)
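
The xfs_da_node_read_verify() hunk above shows the dispatch-and-retarget idiom: the read verifier switches on the on-disk magic and, when format detection guessed wrong, points the buffer at the ops table for the actual block type before re-verifying. A self-contained sketch of that idiom, with invented types and magic values rather than the kernel's, looks like this:

#include <stdint.h>
#include <stdio.h>

#define NODE_MAGIC	0xfebe
#define LEAF_MAGIC	0xd2f1

struct buf;

struct buf_ops {
	void (*verify_read)(struct buf *bp);
};

struct buf {
	uint16_t		magic;	/* first field of the on-disk header */
	const struct buf_ops	*ops;
	int			error;
};

static void leaf_read_verify(struct buf *bp)
{
	if (bp->magic != LEAF_MAGIC)
		bp->error = -1;
}

static const struct buf_ops leaf_buf_ops = {
	.verify_read = leaf_read_verify,
};

/*
 * Format detection can misidentify a leaf block as a node block, so the
 * node read verifier re-dispatches on the actual magic and swaps the
 * buffer's ops to the matching type before verifying.
 */
static void node_read_verify(struct buf *bp)
{
	switch (bp->magic) {
	case NODE_MAGIC:
		break;			/* node-specific checks would go here */
	case LEAF_MAGIC:
		bp->ops = &leaf_buf_ops;
		bp->ops->verify_read(bp);
		return;
	default:
		bp->error = -1;		/* EFSCORRUPTED in the kernel */
		break;
	}
}

static const struct buf_ops node_buf_ops = {
	.verify_read = node_read_verify,
};

int main(void)
{
	struct buf bp = { .magic = LEAF_MAGIC, .ops = &node_buf_ops };

	bp.ops->verify_read(&bp);	/* swaps to leaf_buf_ops on the fly */
	printf("ops swapped: %s, error: %d\n",
	       bp.ops == &leaf_buf_ops ? "yes" : "no", bp.error);
	return 0;
}
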
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 132adafb041e..ee5170c46ae1 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_DA_BTREE_H__ 18#ifndef __XFS_DA_BTREE_H__
19#define __XFS_DA_BTREE_H__ 19#define __XFS_DA_BTREE_H__
20 20
21struct xfs_buf;
22struct xfs_bmap_free; 21struct xfs_bmap_free;
23struct xfs_inode; 22struct xfs_inode;
24struct xfs_mount; 23struct xfs_mount;
@@ -214,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
214 */ 213 */
215int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, 214int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
216 xfs_da_state_blk_t *new_blk); 215 xfs_da_state_blk_t *new_blk);
216int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
217 xfs_dablk_t bno, xfs_daddr_t mappedbno,
218 struct xfs_buf **bpp, int which_fork);
217 219
218/* 220/*
219 * Utility routines. 221 * Utility routines.
@@ -226,9 +228,11 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
226 struct xfs_buf **bp, int whichfork); 228 struct xfs_buf **bp, int whichfork);
227int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, 229int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
228 xfs_dablk_t bno, xfs_daddr_t mappedbno, 230 xfs_dablk_t bno, xfs_daddr_t mappedbno,
229 struct xfs_buf **bpp, int whichfork); 231 struct xfs_buf **bpp, int whichfork,
232 const struct xfs_buf_ops *ops);
230xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, 233xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
231 xfs_dablk_t bno, int whichfork); 234 xfs_dablk_t bno, xfs_daddr_t mapped_bno,
235 int whichfork, const struct xfs_buf_ops *ops);
232int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, 236int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
233 struct xfs_buf *dead_buf); 237 struct xfs_buf *dead_buf);
234 238
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b9b8646e62db..a8bd26b82ecb 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -246,12 +246,10 @@ xfs_swap_extents(
246 goto out_unlock; 246 goto out_unlock;
247 } 247 }
248 248
249 if (VN_CACHED(VFS_I(tip)) != 0) { 249 error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
250 error = xfs_flushinval_pages(tip, 0, -1, 250 if (error)
251 FI_REMAPF_LOCKED); 251 goto out_unlock;
252 if (error) 252 truncate_pagecache_range(VFS_I(tip), 0, -1);
253 goto out_unlock;
254 }
255 253
256 /* Verify O_DIRECT for ftmp */ 254 /* Verify O_DIRECT for ftmp */
257 if (VN_CACHED(VFS_I(tip)) != 0) { 255 if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -315,8 +313,7 @@ xfs_swap_extents(
315 * are safe. We don't really care if non-io related 313 * are safe. We don't really care if non-io related
316 * fields change. 314 * fields change.
317 */ 315 */
318 316 truncate_pagecache_range(VFS_I(ip), 0, -1);
319 xfs_tosspages(ip, 0, -1, FI_REMAPF);
320 317
321 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); 318 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
322 if ((error = xfs_trans_reserve(tp, 0, 319 if ((error = xfs_trans_reserve(tp, 0,
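
The xfs_swap_extents() hunk above replaces the XFS-private flush/invalidate helpers with the generic sequence: write dirty pages back, then strip the page cache so the remapped extents cannot be shadowed by stale pages. There is no exact userspace equivalent, but a loose analogue of "flush, then drop the cache" using fsync(2) and posix_fadvise(2) looks like:

#define _POSIX_C_SOURCE 200112L
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Write back any dirty pages for fd, then ask the kernel to drop the
 * file's clean page cache: a loose userspace echo of
 * filemap_write_and_wait() followed by truncate_pagecache_range().
 */
static int flush_and_drop_cache(int fd)
{
	if (fsync(fd) < 0)
		return -1;	/* errno is set by fsync() */
	/* posix_fadvise() returns the error number directly, not via errno */
	return posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
}

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (flush_and_drop_cache(fd) != 0)
		fprintf(stderr, "flush_and_drop_cache failed\n");
	close(fd);
	return 0;
}
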
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e93ca8f054f4..12afe07a91d7 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -56,6 +56,214 @@ xfs_dir_startup(void)
56 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); 56 xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
57} 57}
58 58
59static void
60xfs_dir2_block_verify(
61 struct xfs_buf *bp)
62{
63 struct xfs_mount *mp = bp->b_target->bt_mount;
64 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
65 int block_ok = 0;
66
67 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
68 block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
69
70 if (!block_ok) {
71 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
72 xfs_buf_ioerror(bp, EFSCORRUPTED);
73 }
74}
75
76static void
77xfs_dir2_block_read_verify(
78 struct xfs_buf *bp)
79{
80 xfs_dir2_block_verify(bp);
81}
82
83static void
84xfs_dir2_block_write_verify(
85 struct xfs_buf *bp)
86{
87 xfs_dir2_block_verify(bp);
88}
89
90const struct xfs_buf_ops xfs_dir2_block_buf_ops = {
91 .verify_read = xfs_dir2_block_read_verify,
92 .verify_write = xfs_dir2_block_write_verify,
93};
94
95static int
96xfs_dir2_block_read(
97 struct xfs_trans *tp,
98 struct xfs_inode *dp,
99 struct xfs_buf **bpp)
100{
101 struct xfs_mount *mp = dp->i_mount;
102
103 return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
104 XFS_DATA_FORK, &xfs_dir2_block_buf_ops);
105}
106
107static void
108xfs_dir2_block_need_space(
109 struct xfs_dir2_data_hdr *hdr,
110 struct xfs_dir2_block_tail *btp,
111 struct xfs_dir2_leaf_entry *blp,
112 __be16 **tagpp,
113 struct xfs_dir2_data_unused **dupp,
114 struct xfs_dir2_data_unused **enddupp,
115 int *compact,
116 int len)
117{
118 struct xfs_dir2_data_free *bf;
119 __be16 *tagp = NULL;
120 struct xfs_dir2_data_unused *dup = NULL;
121 struct xfs_dir2_data_unused *enddup = NULL;
122
123 *compact = 0;
124 bf = hdr->bestfree;
125
126 /*
127 * If there are stale entries we'll use one for the leaf.
128 */
129 if (btp->stale) {
130 if (be16_to_cpu(bf[0].length) >= len) {
131 /*
132 * The biggest entry is big enough to avoid compaction.
133 */
134 dup = (xfs_dir2_data_unused_t *)
135 ((char *)hdr + be16_to_cpu(bf[0].offset));
136 goto out;
137 }
138
139 /*
140 * Will need to compact to make this work.
141 * Tag just before the first leaf entry.
142 */
143 *compact = 1;
144 tagp = (__be16 *)blp - 1;
145
146 /* Data object just before the first leaf entry. */
147 dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
148
149 /*
150 * If it's not free then the data will go where the
151 * leaf data starts now, if it works at all.
152 */
153 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
154 if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
155 (uint)sizeof(*blp) < len)
156 dup = NULL;
157 } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
158 dup = NULL;
159 else
160 dup = (xfs_dir2_data_unused_t *)blp;
161 goto out;
162 }
163
164 /*
165 * No stale entries, so just use free space.
166 * Tag just before the first leaf entry.
167 */
168 tagp = (__be16 *)blp - 1;
169
170 /* Data object just before the first leaf entry. */
171 enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
172
173 /*
174 * If it's not free then we can't do this add without cleaning up:
175 * the space before the first leaf entry needs to be free so it
176 * can be expanded to hold the pointer to the new entry.
177 */
178 if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
179 /*
180 * Check out the biggest freespace and see if it's the same one.
181 */
182 dup = (xfs_dir2_data_unused_t *)
183 ((char *)hdr + be16_to_cpu(bf[0].offset));
184 if (dup != enddup) {
185 /*
186 * Not the same free entry, just check its length.
187 */
188 if (be16_to_cpu(dup->length) < len)
189 dup = NULL;
190 goto out;
191 }
192
193 /*
194 * It is the biggest freespace, can it hold the leaf too?
195 */
196 if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
197 /*
198 * No, so use the second-largest entry instead if it works.
199 */
200 if (be16_to_cpu(bf[1].length) >= len)
201 dup = (xfs_dir2_data_unused_t *)
202 ((char *)hdr + be16_to_cpu(bf[1].offset));
203 else
204 dup = NULL;
205 }
206 }
207out:
208 *tagpp = tagp;
209 *dupp = dup;
210 *enddupp = enddup;
211}
212
213/*
214 * Compact the leaf entries.
215 * Leave the highest-numbered stale entry stale.
216 * XXX should be the one closest to mid but mid is not yet computed.
217 */
218static void
219xfs_dir2_block_compact(
220 struct xfs_trans *tp,
221 struct xfs_buf *bp,
222 struct xfs_dir2_data_hdr *hdr,
223 struct xfs_dir2_block_tail *btp,
224 struct xfs_dir2_leaf_entry *blp,
225 int *needlog,
226 int *lfloghigh,
227 int *lfloglow)
228{
229 int fromidx; /* source leaf index */
230 int toidx; /* target leaf index */
231 int needscan = 0;
232 int highstale; /* high stale index */
233
234 fromidx = toidx = be32_to_cpu(btp->count) - 1;
235 highstale = *lfloghigh = -1;
236 for (; fromidx >= 0; fromidx--) {
237 if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
238 if (highstale == -1)
239 highstale = toidx;
240 else {
241 if (*lfloghigh == -1)
242 *lfloghigh = toidx;
243 continue;
244 }
245 }
246 if (fromidx < toidx)
247 blp[toidx] = blp[fromidx];
248 toidx--;
249 }
250 *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
251 *lfloghigh -= be32_to_cpu(btp->stale) - 1;
252 be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
253 xfs_dir2_data_make_free(tp, bp,
254 (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
255 (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
256 needlog, &needscan);
257 blp += be32_to_cpu(btp->stale) - 1;
258 btp->stale = cpu_to_be32(1);
259 /*
260 * If we now need to rebuild the bestfree map, do so.
261 * This needs to happen before the next call to use_free.
262 */
263 if (needscan)
264 xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog);
265}
266
59/* 267/*
60 * Add an entry to a block directory. 268 * Add an entry to a block directory.
61 */ 269 */
@@ -63,7 +271,6 @@ int /* error */
63xfs_dir2_block_addname( 271xfs_dir2_block_addname(
64 xfs_da_args_t *args) /* directory op arguments */ 272 xfs_da_args_t *args) /* directory op arguments */
65{ 273{
66 xfs_dir2_data_free_t *bf; /* bestfree table in block */
67 xfs_dir2_data_hdr_t *hdr; /* block header */ 274 xfs_dir2_data_hdr_t *hdr; /* block header */
68 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ 275 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
69 struct xfs_buf *bp; /* buffer for block */ 276 struct xfs_buf *bp; /* buffer for block */
@@ -94,134 +301,44 @@ xfs_dir2_block_addname(
94 dp = args->dp; 301 dp = args->dp;
95 tp = args->trans; 302 tp = args->trans;
96 mp = dp->i_mount; 303 mp = dp->i_mount;
97 /* 304
98 * Read the (one and only) directory block into dabuf bp. 305 /* Read the (one and only) directory block into bp. */
99 */ 306 error = xfs_dir2_block_read(tp, dp, &bp);
100 if ((error = 307 if (error)
101 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
102 return error; 308 return error;
103 } 309
104 ASSERT(bp != NULL);
105 hdr = bp->b_addr;
106 /*
107 * Check the magic number, corrupted if wrong.
108 */
109 if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) {
110 XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
111 XFS_ERRLEVEL_LOW, mp, hdr);
112 xfs_trans_brelse(tp, bp);
113 return XFS_ERROR(EFSCORRUPTED);
114 }
115 len = xfs_dir2_data_entsize(args->namelen); 310 len = xfs_dir2_data_entsize(args->namelen);
311
116 /* 312 /*
117 * Set up pointers to parts of the block. 313 * Set up pointers to parts of the block.
118 */ 314 */
119 bf = hdr->bestfree; 315 hdr = bp->b_addr;
120 btp = xfs_dir2_block_tail_p(mp, hdr); 316 btp = xfs_dir2_block_tail_p(mp, hdr);
121 blp = xfs_dir2_block_leaf_p(btp); 317 blp = xfs_dir2_block_leaf_p(btp);
318
122 /* 319 /*
123 * No stale entries? Need space for entry and new leaf. 320 * Find out if we can reuse stale entries or whether we need extra
124 */ 321 * space for entry and new leaf.
125 if (!btp->stale) {
126 /*
127 * Tag just before the first leaf entry.
128 */
129 tagp = (__be16 *)blp - 1;
130 /*
131 * Data object just before the first leaf entry.
132 */
133 enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
134 /*
135 * If it's not free then can't do this add without cleaning up:
136 * the space before the first leaf entry needs to be free so it
137 * can be expanded to hold the pointer to the new entry.
138 */
139 if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG)
140 dup = enddup = NULL;
141 /*
142 * Check out the biggest freespace and see if it's the same one.
143 */
144 else {
145 dup = (xfs_dir2_data_unused_t *)
146 ((char *)hdr + be16_to_cpu(bf[0].offset));
147 if (dup == enddup) {
148 /*
149 * It is the biggest freespace, is it too small
150 * to hold the new leaf too?
151 */
152 if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
153 /*
154 * Yes, we use the second-largest
155 * entry instead if it works.
156 */
157 if (be16_to_cpu(bf[1].length) >= len)
158 dup = (xfs_dir2_data_unused_t *)
159 ((char *)hdr +
160 be16_to_cpu(bf[1].offset));
161 else
162 dup = NULL;
163 }
164 } else {
165 /*
166 * Not the same free entry,
167 * just check its length.
168 */
169 if (be16_to_cpu(dup->length) < len) {
170 dup = NULL;
171 }
172 }
173 }
174 compact = 0;
175 }
176 /*
177 * If there are stale entries we'll use one for the leaf.
178 * Is the biggest entry enough to avoid compaction?
179 */
180 else if (be16_to_cpu(bf[0].length) >= len) {
181 dup = (xfs_dir2_data_unused_t *)
182 ((char *)hdr + be16_to_cpu(bf[0].offset));
183 compact = 0;
184 }
185 /*
186 * Will need to compact to make this work.
187 */ 322 */
188 else { 323 xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
189 /* 324 &enddup, &compact, len);
190 * Tag just before the first leaf entry. 325
191 */
192 tagp = (__be16 *)blp - 1;
193 /*
194 * Data object just before the first leaf entry.
195 */
196 dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
197 /*
198 * If it's not free then the data will go where the
199 * leaf data starts now, if it works at all.
200 */
201 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
202 if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
203 (uint)sizeof(*blp) < len)
204 dup = NULL;
205 } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
206 dup = NULL;
207 else
208 dup = (xfs_dir2_data_unused_t *)blp;
209 compact = 1;
210 }
211 /* 326 /*
212 * If this isn't a real add, we're done with the buffer. 327 * Done everything we need for a space check now.
213 */ 328 */
214 if (args->op_flags & XFS_DA_OP_JUSTCHECK) 329 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
215 xfs_trans_brelse(tp, bp); 330 xfs_trans_brelse(tp, bp);
331 if (!dup)
332 return XFS_ERROR(ENOSPC);
333 return 0;
334 }
335
216 /* 336 /*
217 * If we don't have space for the new entry & leaf ... 337 * If we don't have space for the new entry & leaf ...
218 */ 338 */
219 if (!dup) { 339 if (!dup) {
220 /* 340 /* Don't have a space reservation: return no-space. */
221 * Not trying to actually do anything, or don't have 341 if (args->total == 0)
222 * a space reservation: return no-space.
223 */
224 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
225 return XFS_ERROR(ENOSPC); 342 return XFS_ERROR(ENOSPC);
226 /* 343 /*
227 * Convert to the next larger format. 344 * Convert to the next larger format.
@@ -232,65 +349,26 @@ xfs_dir2_block_addname(
232 return error; 349 return error;
233 return xfs_dir2_leaf_addname(args); 350 return xfs_dir2_leaf_addname(args);
234 } 351 }
235 /* 352
236 * Just checking, and it would work, so say so.
237 */
238 if (args->op_flags & XFS_DA_OP_JUSTCHECK)
239 return 0;
240 needlog = needscan = 0; 353 needlog = needscan = 0;
354
241 /* 355 /*
242 * If need to compact the leaf entries, do it now. 356 * If need to compact the leaf entries, do it now.
243 * Leave the highest-numbered stale entry stale.
244 * XXX should be the one closest to mid but mid is not yet computed.
245 */ 357 */
246 if (compact) { 358 if (compact) {
247 int fromidx; /* source leaf index */ 359 xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
248 int toidx; /* target leaf index */ 360 &lfloghigh, &lfloglow);
249 361 /* recalculate blp post-compaction */
250 for (fromidx = toidx = be32_to_cpu(btp->count) - 1, 362 blp = xfs_dir2_block_leaf_p(btp);
251 highstale = lfloghigh = -1; 363 } else if (btp->stale) {
252 fromidx >= 0;
253 fromidx--) {
254 if (blp[fromidx].address ==
255 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
256 if (highstale == -1)
257 highstale = toidx;
258 else {
259 if (lfloghigh == -1)
260 lfloghigh = toidx;
261 continue;
262 }
263 }
264 if (fromidx < toidx)
265 blp[toidx] = blp[fromidx];
266 toidx--;
267 }
268 lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
269 lfloghigh -= be32_to_cpu(btp->stale) - 1;
270 be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
271 xfs_dir2_data_make_free(tp, bp,
272 (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
273 (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
274 &needlog, &needscan);
275 blp += be32_to_cpu(btp->stale) - 1;
276 btp->stale = cpu_to_be32(1);
277 /* 364 /*
278 * If we now need to rebuild the bestfree map, do so. 365 * Set leaf logging boundaries to impossible state.
279 * This needs to happen before the next call to use_free. 366 * For the no-stale case they're set explicitly.
280 */ 367 */
281 if (needscan) {
282 xfs_dir2_data_freescan(mp, hdr, &needlog);
283 needscan = 0;
284 }
285 }
286 /*
287 * Set leaf logging boundaries to impossible state.
288 * For the no-stale case they're set explicitly.
289 */
290 else if (btp->stale) {
291 lfloglow = be32_to_cpu(btp->count); 368 lfloglow = be32_to_cpu(btp->count);
292 lfloghigh = -1; 369 lfloghigh = -1;
293 } 370 }
371
294 /* 372 /*
295 * Find the slot that's first lower than our hash value, -1 if none. 373 * Find the slot that's first lower than our hash value, -1 if none.
296 */ 374 */
@@ -450,18 +528,13 @@ xfs_dir2_block_getdents(
450 /* 528 /*
451 * If the block number in the offset is out of range, we're done. 529 * If the block number in the offset is out of range, we're done.
452 */ 530 */
453 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { 531 if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
454 return 0; 532 return 0;
455 } 533
456 /* 534 error = xfs_dir2_block_read(NULL, dp, &bp);
457 * Can't read the block, give up, else get dabuf in bp.
458 */
459 error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1,
460 &bp, XFS_DATA_FORK);
461 if (error) 535 if (error)
462 return error; 536 return error;
463 537
464 ASSERT(bp != NULL);
465 /* 538 /*
466 * Extract the byte offset we start at from the seek pointer. 539 * Extract the byte offset we start at from the seek pointer.
467 * We'll skip entries before this. 540 * We'll skip entries before this.
@@ -637,14 +710,11 @@ xfs_dir2_block_lookup_int(
637 dp = args->dp; 710 dp = args->dp;
638 tp = args->trans; 711 tp = args->trans;
639 mp = dp->i_mount; 712 mp = dp->i_mount;
640 /* 713
641 * Read the buffer, return error if we can't get it. 714 error = xfs_dir2_block_read(tp, dp, &bp);
642 */ 715 if (error)
643 if ((error =
644 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
645 return error; 716 return error;
646 } 717
647 ASSERT(bp != NULL);
648 hdr = bp->b_addr; 718 hdr = bp->b_addr;
649 xfs_dir2_data_check(dp, bp); 719 xfs_dir2_data_check(dp, bp);
650 btp = xfs_dir2_block_tail_p(mp, hdr); 720 btp = xfs_dir2_block_tail_p(mp, hdr);
@@ -917,10 +987,10 @@ xfs_dir2_leaf_to_block(
917 /* 987 /*
918 * Read the data block if we don't already have it, give up if it fails. 988 * Read the data block if we don't already have it, give up if it fails.
919 */ 989 */
920 if (dbp == NULL && 990 if (!dbp) {
921 (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, 991 error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
922 XFS_DATA_FORK))) { 992 if (error)
923 return error; 993 return error;
924 } 994 }
925 hdr = dbp->b_addr; 995 hdr = dbp->b_addr;
926 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); 996 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
@@ -944,6 +1014,7 @@ xfs_dir2_leaf_to_block(
944 /* 1014 /*
945 * Start converting it to block form. 1015 * Start converting it to block form.
946 */ 1016 */
1017 dbp->b_ops = &xfs_dir2_block_buf_ops;
947 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); 1018 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
948 needlog = 1; 1019 needlog = 1;
949 needscan = 0; 1020 needscan = 0;
@@ -1073,6 +1144,7 @@ xfs_dir2_sf_to_block(
1073 kmem_free(sfp); 1144 kmem_free(sfp);
1074 return error; 1145 return error;
1075 } 1146 }
1147 bp->b_ops = &xfs_dir2_block_buf_ops;
1076 hdr = bp->b_addr; 1148 hdr = bp->b_addr;
1077 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); 1149 hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
1078 /* 1150 /*
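
The compaction loop factored into xfs_dir2_block_compact() above walks the leaf array from the top down, copying live entries (plus one surviving stale entry, the highest-numbered) toward the high end so that free space opens up below them. The index dance is easier to see with a plain int array standing in for the leaf entries; this sketch illustrates only the walk, not the kernel structure handling:

#include <stdio.h>

#define STALE	0	/* stands in for XFS_DIR2_NULL_DATAPTR */

/*
 * Compact the array in place toward the high end, keeping exactly one
 * stale entry (the highest-indexed one) and preserving the relative
 * order of everything kept.  Returns the index of the first kept entry.
 */
static int compact_keep_one_stale(int *ent, int count)
{
	int fromidx, toidx;
	int highstale = -1;

	for (fromidx = toidx = count - 1; fromidx >= 0; fromidx--) {
		if (ent[fromidx] == STALE) {
			if (highstale == -1)
				highstale = toidx;	/* keep one sentinel */
			else
				continue;		/* squeeze out the rest */
		}
		if (fromidx < toidx)
			ent[toidx] = ent[fromidx];
		toidx--;
	}
	return toidx + 1;	/* compacted entries start here */
}

int main(void)
{
	int ent[] = { 7, STALE, 3, STALE, STALE, 9, 4 };
	int n = sizeof(ent) / sizeof(ent[0]);
	int first = compact_keep_one_stale(ent, n);

	for (int i = first; i < n; i++)
		printf("%d ", ent[i]);	/* prints: 7 3 0 9 4 */
	printf("\n");
	return 0;
}
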
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 44ffd4d6bc91..ffcf1774152e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -34,14 +34,13 @@
34STATIC xfs_dir2_data_free_t * 34STATIC xfs_dir2_data_free_t *
35xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); 35xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
36 36
37#ifdef DEBUG
38/* 37/*
39 * Check the consistency of the data block. 38 * Check the consistency of the data block.
40 * The input can also be a block-format directory. 39 * The input can also be a block-format directory.
41 * Pop an assert if we find anything bad. 40 * Return 0 if the buffer is good, otherwise an error.
42 */ 41 */
43void 42int
44xfs_dir2_data_check( 43__xfs_dir2_data_check(
45 struct xfs_inode *dp, /* incore inode pointer */ 44 struct xfs_inode *dp, /* incore inode pointer */
46 struct xfs_buf *bp) /* data block's buffer */ 45 struct xfs_buf *bp) /* data block's buffer */
47{ 46{
@@ -64,18 +63,23 @@ xfs_dir2_data_check(
64 int stale; /* count of stale leaves */ 63 int stale; /* count of stale leaves */
65 struct xfs_name name; 64 struct xfs_name name;
66 65
67 mp = dp->i_mount; 66 mp = bp->b_target->bt_mount;
68 hdr = bp->b_addr; 67 hdr = bp->b_addr;
69 bf = hdr->bestfree; 68 bf = hdr->bestfree;
70 p = (char *)(hdr + 1); 69 p = (char *)(hdr + 1);
71 70
72 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 71 switch (hdr->magic) {
72 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
73 btp = xfs_dir2_block_tail_p(mp, hdr); 73 btp = xfs_dir2_block_tail_p(mp, hdr);
74 lep = xfs_dir2_block_leaf_p(btp); 74 lep = xfs_dir2_block_leaf_p(btp);
75 endp = (char *)lep; 75 endp = (char *)lep;
76 } else { 76 break;
77 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); 77 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
78 endp = (char *)hdr + mp->m_dirblksize; 78 endp = (char *)hdr + mp->m_dirblksize;
79 break;
80 default:
81 XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
82 return EFSCORRUPTED;
79 } 83 }
80 84
81 count = lastfree = freeseen = 0; 85 count = lastfree = freeseen = 0;
@@ -83,19 +87,22 @@ xfs_dir2_data_check(
83 * Account for zero bestfree entries. 87 * Account for zero bestfree entries.
84 */ 88 */
85 if (!bf[0].length) { 89 if (!bf[0].length) {
86 ASSERT(!bf[0].offset); 90 XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
87 freeseen |= 1 << 0; 91 freeseen |= 1 << 0;
88 } 92 }
89 if (!bf[1].length) { 93 if (!bf[1].length) {
90 ASSERT(!bf[1].offset); 94 XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
91 freeseen |= 1 << 1; 95 freeseen |= 1 << 1;
92 } 96 }
93 if (!bf[2].length) { 97 if (!bf[2].length) {
94 ASSERT(!bf[2].offset); 98 XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
95 freeseen |= 1 << 2; 99 freeseen |= 1 << 2;
96 } 100 }
97 ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); 101
98 ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); 102 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
103 be16_to_cpu(bf[1].length));
104 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
105 be16_to_cpu(bf[2].length));
99 /* 106 /*
100 * Loop over the data/unused entries. 107 * Loop over the data/unused entries.
101 */ 108 */
@@ -107,17 +114,20 @@ xfs_dir2_data_check(
107 * doesn't need to be there. 114 * doesn't need to be there.
108 */ 115 */
109 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 116 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
110 ASSERT(lastfree == 0); 117 XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
111 ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == 118 XFS_WANT_CORRUPTED_RETURN(
112 (char *)dup - (char *)hdr); 119 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
120 (char *)dup - (char *)hdr);
113 dfp = xfs_dir2_data_freefind(hdr, dup); 121 dfp = xfs_dir2_data_freefind(hdr, dup);
114 if (dfp) { 122 if (dfp) {
115 i = (int)(dfp - bf); 123 i = (int)(dfp - bf);
116 ASSERT((freeseen & (1 << i)) == 0); 124 XFS_WANT_CORRUPTED_RETURN(
125 (freeseen & (1 << i)) == 0);
117 freeseen |= 1 << i; 126 freeseen |= 1 << i;
118 } else { 127 } else {
119 ASSERT(be16_to_cpu(dup->length) <= 128 XFS_WANT_CORRUPTED_RETURN(
120 be16_to_cpu(bf[2].length)); 129 be16_to_cpu(dup->length) <=
130 be16_to_cpu(bf[2].length));
121 } 131 }
122 p += be16_to_cpu(dup->length); 132 p += be16_to_cpu(dup->length);
123 lastfree = 1; 133 lastfree = 1;
@@ -130,10 +140,12 @@ xfs_dir2_data_check(
130 * The linear search is crude but this is DEBUG code. 140 * The linear search is crude but this is DEBUG code.
131 */ 141 */
132 dep = (xfs_dir2_data_entry_t *)p; 142 dep = (xfs_dir2_data_entry_t *)p;
133 ASSERT(dep->namelen != 0); 143 XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
134 ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); 144 XFS_WANT_CORRUPTED_RETURN(
135 ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == 145 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
136 (char *)dep - (char *)hdr); 146 XFS_WANT_CORRUPTED_RETURN(
147 be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
148 (char *)dep - (char *)hdr);
137 count++; 149 count++;
138 lastfree = 0; 150 lastfree = 0;
139 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 151 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
@@ -148,27 +160,122 @@ xfs_dir2_data_check(
148 be32_to_cpu(lep[i].hashval) == hash) 160 be32_to_cpu(lep[i].hashval) == hash)
149 break; 161 break;
150 } 162 }
151 ASSERT(i < be32_to_cpu(btp->count)); 163 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
152 } 164 }
153 p += xfs_dir2_data_entsize(dep->namelen); 165 p += xfs_dir2_data_entsize(dep->namelen);
154 } 166 }
155 /* 167 /*
156 * Need to have seen all the entries and all the bestfree slots. 168 * Need to have seen all the entries and all the bestfree slots.
157 */ 169 */
158 ASSERT(freeseen == 7); 170 XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
159 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { 171 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
160 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { 172 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
161 if (lep[i].address == 173 if (lep[i].address ==
162 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) 174 cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
163 stale++; 175 stale++;
164 if (i > 0) 176 if (i > 0)
165 ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); 177 XFS_WANT_CORRUPTED_RETURN(
178 be32_to_cpu(lep[i].hashval) >=
179 be32_to_cpu(lep[i - 1].hashval));
166 } 180 }
167 ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); 181 XFS_WANT_CORRUPTED_RETURN(count ==
168 ASSERT(stale == be32_to_cpu(btp->stale)); 182 be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
183 XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
169 } 184 }
185 return 0;
186}
187
188static void
189xfs_dir2_data_verify(
190 struct xfs_buf *bp)
191{
192 struct xfs_mount *mp = bp->b_target->bt_mount;
193 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
194 int block_ok = 0;
195
196 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
197 block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
198
199 if (!block_ok) {
200 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
201 xfs_buf_ioerror(bp, EFSCORRUPTED);
202 }
203}
204
205/*
206 * Readahead of the first block of the directory when it is opened is completely
207 * oblivious to the format of the directory. Hence we can either get a block
208 * format buffer or a data format buffer on readahead.
209 */
210static void
211xfs_dir2_data_reada_verify(
212 struct xfs_buf *bp)
213{
214 struct xfs_mount *mp = bp->b_target->bt_mount;
215 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
216
217 switch (hdr->magic) {
218 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
219 bp->b_ops = &xfs_dir2_block_buf_ops;
220 bp->b_ops->verify_read(bp);
221 return;
222 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
223 xfs_dir2_data_verify(bp);
224 return;
225 default:
226 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
227 xfs_buf_ioerror(bp, EFSCORRUPTED);
228 break;
229 }
230}
231
232static void
233xfs_dir2_data_read_verify(
234 struct xfs_buf *bp)
235{
236 xfs_dir2_data_verify(bp);
237}
238
239static void
240xfs_dir2_data_write_verify(
241 struct xfs_buf *bp)
242{
243 xfs_dir2_data_verify(bp);
244}
245
246const struct xfs_buf_ops xfs_dir2_data_buf_ops = {
247 .verify_read = xfs_dir2_data_read_verify,
248 .verify_write = xfs_dir2_data_write_verify,
249};
250
251static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = {
252 .verify_read = xfs_dir2_data_reada_verify,
253 .verify_write = xfs_dir2_data_write_verify,
254};
255
256
257int
258xfs_dir2_data_read(
259 struct xfs_trans *tp,
260 struct xfs_inode *dp,
261 xfs_dablk_t bno,
262 xfs_daddr_t mapped_bno,
263 struct xfs_buf **bpp)
264{
265 return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
266 XFS_DATA_FORK, &xfs_dir2_data_buf_ops);
267}
268
269int
270xfs_dir2_data_readahead(
271 struct xfs_trans *tp,
272 struct xfs_inode *dp,
273 xfs_dablk_t bno,
274 xfs_daddr_t mapped_bno)
275{
276 return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
277 XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops);
170} 278}
171#endif
172 279
173/* 280/*
174 * Given a data block and an unused entry from that block, 281 * Given a data block and an unused entry from that block,
@@ -409,10 +516,9 @@ xfs_dir2_data_init(
409 */ 516 */
410 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, 517 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
411 XFS_DATA_FORK); 518 XFS_DATA_FORK);
412 if (error) { 519 if (error)
413 return error; 520 return error;
414 } 521 bp->b_ops = &xfs_dir2_data_buf_ops;
415 ASSERT(bp != NULL);
416 522
417 /* 523 /*
418 * Initialize the header. 524 * Initialize the header.
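
The __xfs_dir2_data_check() hunks above convert DEBUG-only ASSERT()s into checks that return EFSCORRUPTED, so the same validator can run inside the production read and write verifiers. The XFS_WANT_CORRUPTED_RETURN-style macro is easy to reproduce; WANT_OK, the magic value, and the header layout below are invented for illustration:

#include <stdint.h>
#include <stdio.h>

#define EFSCORRUPTED	117

/*
 * On a failed check, report once and make the enclosing function return
 * EFSCORRUPTED -- the error-returning replacement for a bare ASSERT().
 */
#define WANT_OK(expr)						\
	do {							\
		if (!(expr)) {					\
			fprintf(stderr, "corruption: %s:%d %s\n",\
				__FILE__, __LINE__, #expr);	\
			return EFSCORRUPTED;			\
		}						\
	} while (0)

struct hdr {
	uint32_t magic;
	uint16_t best[3];	/* must be sorted descending */
};

static int check_hdr(const struct hdr *h)
{
	WANT_OK(h->magic == 0x58443242);	/* made-up magic */
	WANT_OK(h->best[0] >= h->best[1]);
	WANT_OK(h->best[1] >= h->best[2]);
	return 0;
}

int main(void)
{
	struct hdr good = { 0x58443242, { 30, 20, 10 } };
	struct hdr bad  = { 0x58443242, { 10, 20, 30 } };

	printf("good: %d, bad: %d\n", check_hdr(&good), check_hdr(&bad));
	return 0;
}
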
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 0b296253bd01..60cd2fa4e047 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -48,6 +48,83 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
48 int first, int last); 48 int first, int last);
49static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); 49static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
50 50
51static void
52xfs_dir2_leaf_verify(
53 struct xfs_buf *bp,
54 __be16 magic)
55{
56 struct xfs_mount *mp = bp->b_target->bt_mount;
57 struct xfs_dir2_leaf_hdr *hdr = bp->b_addr;
58 int block_ok = 0;
59
60 block_ok = hdr->info.magic == magic;
61 if (!block_ok) {
62 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
63 xfs_buf_ioerror(bp, EFSCORRUPTED);
64 }
65}
66
67static void
68xfs_dir2_leaf1_read_verify(
69 struct xfs_buf *bp)
70{
71 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
72}
73
74static void
75xfs_dir2_leaf1_write_verify(
76 struct xfs_buf *bp)
77{
78 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
79}
80
81void
82xfs_dir2_leafn_read_verify(
83 struct xfs_buf *bp)
84{
85 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
86}
87
88void
89xfs_dir2_leafn_write_verify(
90 struct xfs_buf *bp)
91{
92 xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
93}
94
95static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = {
96 .verify_read = xfs_dir2_leaf1_read_verify,
97 .verify_write = xfs_dir2_leaf1_write_verify,
98};
99
100const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = {
101 .verify_read = xfs_dir2_leafn_read_verify,
102 .verify_write = xfs_dir2_leafn_write_verify,
103};
104
105static int
106xfs_dir2_leaf_read(
107 struct xfs_trans *tp,
108 struct xfs_inode *dp,
109 xfs_dablk_t fbno,
110 xfs_daddr_t mappedbno,
111 struct xfs_buf **bpp)
112{
113 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
114 XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops);
115}
116
117int
118xfs_dir2_leafn_read(
119 struct xfs_trans *tp,
120 struct xfs_inode *dp,
121 xfs_dablk_t fbno,
122 xfs_daddr_t mappedbno,
123 struct xfs_buf **bpp)
124{
125 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
126 XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops);
127}
51 128
52/* 129/*
53 * Convert a block form directory to a leaf form directory. 130 * Convert a block form directory to a leaf form directory.
@@ -125,6 +202,7 @@ xfs_dir2_block_to_leaf(
125 /* 202 /*
126 * Fix up the block header, make it a data block. 203 * Fix up the block header, make it a data block.
127 */ 204 */
205 dbp->b_ops = &xfs_dir2_data_buf_ops;
128 hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); 206 hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
129 if (needscan) 207 if (needscan)
130 xfs_dir2_data_freescan(mp, hdr, &needlog); 208 xfs_dir2_data_freescan(mp, hdr, &needlog);
@@ -311,15 +389,11 @@ xfs_dir2_leaf_addname(
311 dp = args->dp; 389 dp = args->dp;
312 tp = args->trans; 390 tp = args->trans;
313 mp = dp->i_mount; 391 mp = dp->i_mount;
314 /* 392
315 * Read the leaf block. 393 error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
316 */ 394 if (error)
317 error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
318 XFS_DATA_FORK);
319 if (error) {
320 return error; 395 return error;
321 } 396
322 ASSERT(lbp != NULL);
323 /* 397 /*
324 * Look up the entry by hash value and name. 398 * Look up the entry by hash value and name.
325 * We know it's not there, our caller has already done a lookup. 399 * We know it's not there, our caller has already done a lookup.
@@ -494,22 +568,21 @@ xfs_dir2_leaf_addname(
494 hdr = dbp->b_addr; 568 hdr = dbp->b_addr;
495 bestsp[use_block] = hdr->bestfree[0].length; 569 bestsp[use_block] = hdr->bestfree[0].length;
496 grown = 1; 570 grown = 1;
497 } 571 } else {
498 /* 572 /*
499 * Already had space in some data block. 573 * Already had space in some data block.
500 * Just read that one in. 574 * Just read that one in.
501 */ 575 */
502 else { 576 error = xfs_dir2_data_read(tp, dp,
503 if ((error = 577 xfs_dir2_db_to_da(mp, use_block),
504 xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), 578 -1, &dbp);
505 -1, &dbp, XFS_DATA_FORK))) { 579 if (error) {
506 xfs_trans_brelse(tp, lbp); 580 xfs_trans_brelse(tp, lbp);
507 return error; 581 return error;
508 } 582 }
509 hdr = dbp->b_addr; 583 hdr = dbp->b_addr;
510 grown = 0; 584 grown = 0;
511 } 585 }
512 xfs_dir2_data_check(dp, dbp);
513 /* 586 /*
514 * Point to the biggest freespace in our data block. 587 * Point to the biggest freespace in our data block.
515 */ 588 */
@@ -892,10 +965,9 @@ xfs_dir2_leaf_readbuf(
892 * Read the directory block starting at the first mapping. 965 * Read the directory block starting at the first mapping.
893 */ 966 */
894 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); 967 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
895 error = xfs_da_read_buf(NULL, dp, map->br_startoff, 968 error = xfs_dir2_data_read(NULL, dp, map->br_startoff,
896 map->br_blockcount >= mp->m_dirblkfsbs ? 969 map->br_blockcount >= mp->m_dirblkfsbs ?
897 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, 970 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
898 &bp, XFS_DATA_FORK);
899 971
900 /* 972 /*
901 * Should just skip over the data block instead of giving up. 973 * Should just skip over the data block instead of giving up.
@@ -922,11 +994,11 @@ xfs_dir2_leaf_readbuf(
922 */ 994 */
923 if (i > mip->ra_current && 995 if (i > mip->ra_current &&
924 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { 996 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
925 xfs_buf_readahead(mp->m_ddev_targp, 997 xfs_dir2_data_readahead(NULL, dp,
998 map[mip->ra_index].br_startoff + mip->ra_offset,
926 XFS_FSB_TO_DADDR(mp, 999 XFS_FSB_TO_DADDR(mp,
927 map[mip->ra_index].br_startblock + 1000 map[mip->ra_index].br_startblock +
928 mip->ra_offset), 1001 mip->ra_offset));
929 (int)BTOBB(mp->m_dirblksize));
930 mip->ra_current = i; 1002 mip->ra_current = i;
931 } 1003 }
932 1004
@@ -935,10 +1007,9 @@ xfs_dir2_leaf_readbuf(
935 * use our mapping, but this is a very rare case. 1007 * use our mapping, but this is a very rare case.
936 */ 1008 */
937 else if (i > mip->ra_current) { 1009 else if (i > mip->ra_current) {
938 xfs_da_reada_buf(NULL, dp, 1010 xfs_dir2_data_readahead(NULL, dp,
939 map[mip->ra_index].br_startoff + 1011 map[mip->ra_index].br_startoff +
940 mip->ra_offset, 1012 mip->ra_offset, -1);
941 XFS_DATA_FORK);
942 mip->ra_current = i; 1013 mip->ra_current = i;
943 } 1014 }
944 1015
@@ -1177,15 +1248,14 @@ xfs_dir2_leaf_init(
1177 * Get the buffer for the block. 1248 * Get the buffer for the block.
1178 */ 1249 */
1179 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, 1250 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
1180 XFS_DATA_FORK); 1251 XFS_DATA_FORK);
1181 if (error) { 1252 if (error)
1182 return error; 1253 return error;
1183 } 1254
1184 ASSERT(bp != NULL);
1185 leaf = bp->b_addr;
1186 /* 1255 /*
1187 * Initialize the header. 1256 * Initialize the header.
1188 */ 1257 */
1258 leaf = bp->b_addr;
1189 leaf->hdr.info.magic = cpu_to_be16(magic); 1259 leaf->hdr.info.magic = cpu_to_be16(magic);
1190 leaf->hdr.info.forw = 0; 1260 leaf->hdr.info.forw = 0;
1191 leaf->hdr.info.back = 0; 1261 leaf->hdr.info.back = 0;
@@ -1198,10 +1268,12 @@ xfs_dir2_leaf_init(
1198 * the block. 1268 * the block.
1199 */ 1269 */
1200 if (magic == XFS_DIR2_LEAF1_MAGIC) { 1270 if (magic == XFS_DIR2_LEAF1_MAGIC) {
1271 bp->b_ops = &xfs_dir2_leaf1_buf_ops;
1201 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 1272 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
1202 ltp->bestcount = 0; 1273 ltp->bestcount = 0;
1203 xfs_dir2_leaf_log_tail(tp, bp); 1274 xfs_dir2_leaf_log_tail(tp, bp);
1204 } 1275 } else
1276 bp->b_ops = &xfs_dir2_leafn_buf_ops;
1205 *bpp = bp; 1277 *bpp = bp;
1206 return 0; 1278 return 0;
1207} 1279}
@@ -1372,13 +1444,11 @@ xfs_dir2_leaf_lookup_int(
1372 dp = args->dp; 1444 dp = args->dp;
1373 tp = args->trans; 1445 tp = args->trans;
1374 mp = dp->i_mount; 1446 mp = dp->i_mount;
1375 /* 1447
1376 * Read the leaf block into the buffer. 1448 error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
1377 */
1378 error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
1379 XFS_DATA_FORK);
1380 if (error) 1449 if (error)
1381 return error; 1450 return error;
1451
1382 *lbpp = lbp; 1452 *lbpp = lbp;
1383 leaf = lbp->b_addr; 1453 leaf = lbp->b_addr;
1384 xfs_dir2_leaf_check(dp, lbp); 1454 xfs_dir2_leaf_check(dp, lbp);
@@ -1409,14 +1479,13 @@ xfs_dir2_leaf_lookup_int(
1409 if (newdb != curdb) { 1479 if (newdb != curdb) {
1410 if (dbp) 1480 if (dbp)
1411 xfs_trans_brelse(tp, dbp); 1481 xfs_trans_brelse(tp, dbp);
1412 error = xfs_da_read_buf(tp, dp, 1482 error = xfs_dir2_data_read(tp, dp,
1413 xfs_dir2_db_to_da(mp, newdb), 1483 xfs_dir2_db_to_da(mp, newdb),
1414 -1, &dbp, XFS_DATA_FORK); 1484 -1, &dbp);
1415 if (error) { 1485 if (error) {
1416 xfs_trans_brelse(tp, lbp); 1486 xfs_trans_brelse(tp, lbp);
1417 return error; 1487 return error;
1418 } 1488 }
1419 xfs_dir2_data_check(dp, dbp);
1420 curdb = newdb; 1489 curdb = newdb;
1421 } 1490 }
1422 /* 1491 /*
@@ -1451,9 +1520,9 @@ xfs_dir2_leaf_lookup_int(
1451 ASSERT(cidb != -1); 1520 ASSERT(cidb != -1);
1452 if (cidb != curdb) { 1521 if (cidb != curdb) {
1453 xfs_trans_brelse(tp, dbp); 1522 xfs_trans_brelse(tp, dbp);
1454 error = xfs_da_read_buf(tp, dp, 1523 error = xfs_dir2_data_read(tp, dp,
1455 xfs_dir2_db_to_da(mp, cidb), 1524 xfs_dir2_db_to_da(mp, cidb),
1456 -1, &dbp, XFS_DATA_FORK); 1525 -1, &dbp);
1457 if (error) { 1526 if (error) {
1458 xfs_trans_brelse(tp, lbp); 1527 xfs_trans_brelse(tp, lbp);
1459 return error; 1528 return error;
@@ -1738,10 +1807,9 @@ xfs_dir2_leaf_trim_data(
1738 /* 1807 /*
1739 * Read the offending data block. We need its buffer. 1808 * Read the offending data block. We need its buffer.
1740 */ 1809 */
1741 if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, 1810 error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
1742 XFS_DATA_FORK))) { 1811 if (error)
1743 return error; 1812 return error;
1744 }
1745 1813
1746 leaf = lbp->b_addr; 1814 leaf = lbp->b_addr;
1747 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 1815 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -1864,10 +1932,9 @@ xfs_dir2_node_to_leaf(
1864 /* 1932 /*
1865 * Read the freespace block. 1933 * Read the freespace block.
1866 */ 1934 */
1867 if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, 1935 error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp);
1868 XFS_DATA_FORK))) { 1936 if (error)
1869 return error; 1937 return error;
1870 }
1871 free = fbp->b_addr; 1938 free = fbp->b_addr;
1872 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 1939 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
1873 ASSERT(!free->hdr.firstdb); 1940 ASSERT(!free->hdr.firstdb);
@@ -1890,7 +1957,10 @@ xfs_dir2_node_to_leaf(
1890 xfs_dir2_leaf_compact(args, lbp); 1957 xfs_dir2_leaf_compact(args, lbp);
1891 else 1958 else
1892 xfs_dir2_leaf_log_header(tp, lbp); 1959 xfs_dir2_leaf_log_header(tp, lbp);
1960
1961 lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
1893 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); 1962 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
1963
1894 /* 1964 /*
1895 * Set up the leaf tail from the freespace block. 1965 * Set up the leaf tail from the freespace block.
1896 */ 1966 */
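
xfs_dir2_leaf_verify() above takes the expected magic as an argument so that one check body can back both the leaf1 and leafn ops tables through thin wrappers. The shape of that reuse, with made-up magic values and a minimal buffer type:

#include <stdint.h>
#include <stdio.h>

#define LEAF1_MAGIC	0xd2f1
#define LEAFN_MAGIC	0xd2ff

struct buf {
	uint16_t magic;
	int	 error;
};

/* One check body, parameterised by the magic the caller expects. */
static void leaf_verify(struct buf *bp, uint16_t magic)
{
	if (bp->magic != magic)
		bp->error = 117;	/* EFSCORRUPTED stand-in */
}

/* Thin per-type wrappers, one pair per ops table in the real code. */
static void leaf1_read_verify(struct buf *bp) { leaf_verify(bp, LEAF1_MAGIC); }
static void leafn_read_verify(struct buf *bp) { leaf_verify(bp, LEAFN_MAGIC); }

int main(void)
{
	struct buf bp = { .magic = LEAFN_MAGIC };

	leaf1_read_verify(&bp);		/* wrong type: flags corruption */
	printf("as leaf1: error %d\n", bp.error);

	bp.error = 0;
	leafn_read_verify(&bp);		/* right type: clean */
	printf("as leafn: error %d\n", bp.error);
	return 0;
}
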
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 6c7052406605..5980f9b7fa9b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -55,6 +55,74 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
55static int xfs_dir2_node_addname_int(xfs_da_args_t *args, 55static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
56 xfs_da_state_blk_t *fblk); 56 xfs_da_state_blk_t *fblk);
57 57
58static void
59xfs_dir2_free_verify(
60 struct xfs_buf *bp)
61{
62 struct xfs_mount *mp = bp->b_target->bt_mount;
63 struct xfs_dir2_free_hdr *hdr = bp->b_addr;
64 int block_ok = 0;
65
66 block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
67 if (!block_ok) {
68 XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic",
69 XFS_ERRLEVEL_LOW, mp, hdr);
70 xfs_buf_ioerror(bp, EFSCORRUPTED);
71 }
72}
73
74static void
75xfs_dir2_free_read_verify(
76 struct xfs_buf *bp)
77{
78 xfs_dir2_free_verify(bp);
79}
80
81static void
82xfs_dir2_free_write_verify(
83 struct xfs_buf *bp)
84{
85 xfs_dir2_free_verify(bp);
86}
87
88static const struct xfs_buf_ops xfs_dir2_free_buf_ops = {
89 .verify_read = xfs_dir2_free_read_verify,
90 .verify_write = xfs_dir2_free_write_verify,
91};
92
93
94static int
95__xfs_dir2_free_read(
96 struct xfs_trans *tp,
97 struct xfs_inode *dp,
98 xfs_dablk_t fbno,
99 xfs_daddr_t mappedbno,
100 struct xfs_buf **bpp)
101{
102 return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
103 XFS_DATA_FORK, &xfs_dir2_free_buf_ops);
104}
105
106int
107xfs_dir2_free_read(
108 struct xfs_trans *tp,
109 struct xfs_inode *dp,
110 xfs_dablk_t fbno,
111 struct xfs_buf **bpp)
112{
113 return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp);
114}
115
116static int
117xfs_dir2_free_try_read(
118 struct xfs_trans *tp,
119 struct xfs_inode *dp,
120 xfs_dablk_t fbno,
121 struct xfs_buf **bpp)
122{
123 return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp);
124}
125
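
__xfs_dir2_free_read() above encodes the read policy in the mappedbno argument: -1 means the block must exist, while -2 (the try-read wrapper) means a hole is not an error and simply yields no buffer. A sketch of that sentinel-parameter idiom, with a fake mapper standing in for the real block lookup:

#include <stdio.h>

#define MAP_MUST_EXIST	(-1L)	/* like mappedbno == -1: a hole is an error */
#define MAP_TRY		(-2L)	/* like mappedbno == -2: a hole returns no buffer */
#define ERR_NOENT	2

/* Pretend block mapper: even block numbers exist, odd ones are holes. */
static int read_block(long bno, long policy, int *got)
{
	if (bno % 2 == 0) {
		*got = 1;		/* "buffer" handed back */
		return 0;
	}
	*got = 0;
	return policy == MAP_TRY ? 0 : ERR_NOENT;
}

/* Thin wrappers mirroring xfs_dir2_free_read() and xfs_dir2_free_try_read(). */
static int free_read(long bno, int *got)
{
	return read_block(bno, MAP_MUST_EXIST, got);
}

static int free_try_read(long bno, int *got)
{
	return read_block(bno, MAP_TRY, got);
}

int main(void)
{
	int got, err;

	err = free_read(3, &got);
	printf("read of a hole:     err=%d\n", err);		/* err=2 */
	err = free_try_read(3, &got);
	printf("try-read of a hole: err=%d got=%d\n", err, got);/* err=0 got=0 */
	return 0;
}
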
58/* 126/*
59 * Log entries from a freespace block. 127 * Log entries from a freespace block.
60 */ 128 */
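
The block added above is the template every buffer type in this series follows: one _verify() routine that checks the on-disk structure and marks the buffer EFSCORRUPTED, two trivial wrappers so the same check runs on read completion and on write submission, and a const xfs_buf_ops table tying them together. A self-contained sketch for a hypothetical block type (example_hdr and EXAMPLE_MAGIC are made up; the XFS calls are the ones used above):

	struct example_hdr {
		__be32			magic;
	};
	#define EXAMPLE_MAGIC	0x45584d50	/* "EXMP", hypothetical */

	static void
	example_verify(
		struct xfs_buf		*bp)
	{
		struct xfs_mount	*mp = bp->b_target->bt_mount;
		struct example_hdr	*hdr = bp->b_addr;

		/* structure check; flag corruption rather than returning it */
		if (hdr->magic != cpu_to_be32(EXAMPLE_MAGIC)) {
			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
					     mp, hdr);
			xfs_buf_ioerror(bp, EFSCORRUPTED);
		}
	}

	static void
	example_read_verify(
		struct xfs_buf		*bp)
	{
		example_verify(bp);
	}

	static void
	example_write_verify(
		struct xfs_buf		*bp)
	{
		example_verify(bp);
	}

	static const struct xfs_buf_ops example_buf_ops = {
		.verify_read	= example_read_verify,
		.verify_write	= example_write_verify,
	};
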
@@ -131,11 +199,12 @@ xfs_dir2_leaf_to_node(
131 /* 199 /*
132 * Get the buffer for the new freespace block. 200 * Get the buffer for the new freespace block.
133 */ 201 */
134 if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, 202 error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
135 XFS_DATA_FORK))) { 203 XFS_DATA_FORK);
204 if (error)
136 return error; 205 return error;
137 } 206 fbp->b_ops = &xfs_dir2_free_buf_ops;
138 ASSERT(fbp != NULL); 207
139 free = fbp->b_addr; 208 free = fbp->b_addr;
140 leaf = lbp->b_addr; 209 leaf = lbp->b_addr;
141 ltp = xfs_dir2_leaf_tail_p(mp, leaf); 210 ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -157,7 +226,10 @@ xfs_dir2_leaf_to_node(
157 *to = cpu_to_be16(off); 226 *to = cpu_to_be16(off);
158 } 227 }
159 free->hdr.nused = cpu_to_be32(n); 228 free->hdr.nused = cpu_to_be32(n);
229
230 lbp->b_ops = &xfs_dir2_leafn_buf_ops;
160 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); 231 leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
232
161 /* 233 /*
162 * Log everything. 234 * Log everything.
163 */ 235 */
@@ -394,12 +466,10 @@ xfs_dir2_leafn_lookup_for_addname(
394 */ 466 */
395 if (curbp) 467 if (curbp)
396 xfs_trans_brelse(tp, curbp); 468 xfs_trans_brelse(tp, curbp);
397 /* 469
398 * Read the free block. 470 error = xfs_dir2_free_read(tp, dp,
399 */
400 error = xfs_da_read_buf(tp, dp,
401 xfs_dir2_db_to_da(mp, newfdb), 471 xfs_dir2_db_to_da(mp, newfdb),
402 -1, &curbp, XFS_DATA_FORK); 472 &curbp);
403 if (error) 473 if (error)
404 return error; 474 return error;
405 free = curbp->b_addr; 475 free = curbp->b_addr;
@@ -534,9 +604,9 @@ xfs_dir2_leafn_lookup_for_entry(
534 ASSERT(state->extravalid); 604 ASSERT(state->extravalid);
535 curbp = state->extrablk.bp; 605 curbp = state->extrablk.bp;
536 } else { 606 } else {
537 error = xfs_da_read_buf(tp, dp, 607 error = xfs_dir2_data_read(tp, dp,
538 xfs_dir2_db_to_da(mp, newdb), 608 xfs_dir2_db_to_da(mp, newdb),
539 -1, &curbp, XFS_DATA_FORK); 609 -1, &curbp);
540 if (error) 610 if (error)
541 return error; 611 return error;
542 } 612 }
@@ -568,6 +638,7 @@ xfs_dir2_leafn_lookup_for_entry(
568 state->extrablk.index = (int)((char *)dep - 638 state->extrablk.index = (int)((char *)dep -
569 (char *)curbp->b_addr); 639 (char *)curbp->b_addr);
570 state->extrablk.magic = XFS_DIR2_DATA_MAGIC; 640 state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
641 curbp->b_ops = &xfs_dir2_data_buf_ops;
571 if (cmp == XFS_CMP_EXACT) 642 if (cmp == XFS_CMP_EXACT)
572 return XFS_ERROR(EEXIST); 643 return XFS_ERROR(EEXIST);
573 } 644 }
@@ -582,6 +653,7 @@ xfs_dir2_leafn_lookup_for_entry(
582 state->extrablk.index = -1; 653 state->extrablk.index = -1;
583 state->extrablk.blkno = curdb; 654 state->extrablk.blkno = curdb;
584 state->extrablk.magic = XFS_DIR2_DATA_MAGIC; 655 state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
656 curbp->b_ops = &xfs_dir2_data_buf_ops;
585 } else { 657 } else {
586 /* If the curbp is not the CI match block, drop it */ 658 /* If the curbp is not the CI match block, drop it */
587 if (state->extrablk.bp != curbp) 659 if (state->extrablk.bp != curbp)
@@ -825,6 +897,77 @@ xfs_dir2_leafn_rebalance(
825 } 897 }
826} 898}
827 899
900static int
901xfs_dir2_data_block_free(
902 xfs_da_args_t *args,
903 struct xfs_dir2_data_hdr *hdr,
904 struct xfs_dir2_free *free,
905 xfs_dir2_db_t fdb,
906 int findex,
907 struct xfs_buf *fbp,
908 int longest)
909{
910 struct xfs_trans *tp = args->trans;
911 int logfree = 0;
912
913 if (!hdr) {
914 /* One less used entry in the free table. */
915 be32_add_cpu(&free->hdr.nused, -1);
916 xfs_dir2_free_log_header(tp, fbp);
917
918 /*
919 * If this was the last entry in the table, we can trim the
920 * table size back. There might be other entries at the end
921 * referring to non-existent data blocks, get those too.
922 */
923 if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
924 int i; /* free entry index */
925
926 for (i = findex - 1; i >= 0; i--) {
927 if (free->bests[i] != cpu_to_be16(NULLDATAOFF))
928 break;
929 }
930 free->hdr.nvalid = cpu_to_be32(i + 1);
931 logfree = 0;
932 } else {
933 /* Not the last entry, just punch it out. */
934 free->bests[findex] = cpu_to_be16(NULLDATAOFF);
935 logfree = 1;
936 }
937 /*
938 * If there are no useful entries left in the block,
939 * get rid of the block if we can.
940 */
941 if (!free->hdr.nused) {
942 int error;
943
944 error = xfs_dir2_shrink_inode(args, fdb, fbp);
945 if (error == 0) {
946 fbp = NULL;
947 logfree = 0;
948 } else if (error != ENOSPC || args->total != 0)
949 return error;
950 /*
951 * It's possible to get ENOSPC if there is no
952 * space reservation. In this case someone
953 * else will eventually get rid of this block.
954 */
955 }
956 } else {
957 /*
958 * Data block is not empty, just set the free entry to the new
959 * value.
960 */
961 free->bests[findex] = cpu_to_be16(longest);
962 logfree = 1;
963 }
964
965 /* Log the free entry that changed, unless we got rid of it. */
966 if (logfree)
967 xfs_dir2_free_log_bests(tp, fbp, findex, findex);
968 return 0;
969}
970
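
The tail-trimming logic in xfs_dir2_data_block_free() above is easy to misread, so here it is in isolation: once the last valid entry is freed, the table walks backwards past any other trailing NULLDATAOFF entries so nvalid ends just beyond the last entry still referring to a data block. A standalone sketch with plain arguments (bests[] and NULLDATAOFF as in the patch):

	static int
	example_trim_nvalid(
		__be16			*bests,
		int			findex)
	{
		int			i;

		for (i = findex - 1; i >= 0; i--) {
			if (bests[i] != cpu_to_be16(NULLDATAOFF))
				break;
		}
		return i + 1;		/* new nvalid; 0 if the table is empty */
	}
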
828/* 971/*
829 * Remove an entry from a node directory. 972 * Remove an entry from a node directory.
830 * This removes the leaf entry and the data entry, 973 * This removes the leaf entry and the data entry,
@@ -908,17 +1051,16 @@ xfs_dir2_leafn_remove(
908 xfs_dir2_db_t fdb; /* freeblock block number */ 1051 xfs_dir2_db_t fdb; /* freeblock block number */
909 int findex; /* index in freeblock entries */ 1052 int findex; /* index in freeblock entries */
910 xfs_dir2_free_t *free; /* freeblock structure */ 1053 xfs_dir2_free_t *free; /* freeblock structure */
911 int logfree; /* need to log free entry */
912 1054
913 /* 1055 /*
914 * Convert the data block number to a free block, 1056 * Convert the data block number to a free block,
915 * read in the free block. 1057 * read in the free block.
916 */ 1058 */
917 fdb = xfs_dir2_db_to_fdb(mp, db); 1059 fdb = xfs_dir2_db_to_fdb(mp, db);
918 if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), 1060 error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
919 -1, &fbp, XFS_DATA_FORK))) { 1061 &fbp);
1062 if (error)
920 return error; 1063 return error;
921 }
922 free = fbp->b_addr; 1064 free = fbp->b_addr;
923 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 1065 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
924 ASSERT(be32_to_cpu(free->hdr.firstdb) == 1066 ASSERT(be32_to_cpu(free->hdr.firstdb) ==
@@ -954,68 +1096,12 @@ xfs_dir2_leafn_remove(
954 * If we got rid of the data block, we can eliminate that entry 1096 * If we got rid of the data block, we can eliminate that entry
955 * in the free block. 1097 * in the free block.
956 */ 1098 */
957 if (hdr == NULL) { 1099 error = xfs_dir2_data_block_free(args, hdr, free,
958 /* 1100 fdb, findex, fbp, longest);
959 * One less used entry in the free table. 1101 if (error)
960 */ 1102 return error;
961 be32_add_cpu(&free->hdr.nused, -1);
962 xfs_dir2_free_log_header(tp, fbp);
963 /*
964 * If this was the last entry in the table, we can
965 * trim the table size back. There might be other
966 * entries at the end referring to non-existent
967 * data blocks, get those too.
968 */
969 if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
970 int i; /* free entry index */
971
972 for (i = findex - 1;
973 i >= 0 &&
974 free->bests[i] == cpu_to_be16(NULLDATAOFF);
975 i--)
976 continue;
977 free->hdr.nvalid = cpu_to_be32(i + 1);
978 logfree = 0;
979 }
980 /*
981 * Not the last entry, just punch it out.
982 */
983 else {
984 free->bests[findex] = cpu_to_be16(NULLDATAOFF);
985 logfree = 1;
986 }
987 /*
988 * If there are no useful entries left in the block,
989 * get rid of the block if we can.
990 */
991 if (!free->hdr.nused) {
992 error = xfs_dir2_shrink_inode(args, fdb, fbp);
993 if (error == 0) {
994 fbp = NULL;
995 logfree = 0;
996 } else if (error != ENOSPC || args->total != 0)
997 return error;
998 /*
999 * It's possible to get ENOSPC if there is no
1000 * space reservation. In this case some one
1001 * else will eventually get rid of this block.
1002 */
1003 }
1004 }
1005 /*
1006 * Data block is not empty, just set the free entry to
1007 * the new value.
1008 */
1009 else {
1010 free->bests[findex] = cpu_to_be16(longest);
1011 logfree = 1;
1012 }
1013 /*
1014 * Log the free entry that changed, unless we got rid of it.
1015 */
1016 if (logfree)
1017 xfs_dir2_free_log_bests(tp, fbp, findex, findex);
1018 } 1103 }
1104
1019 xfs_dir2_leafn_check(dp, bp); 1105 xfs_dir2_leafn_check(dp, bp);
1020 /* 1106 /*
1021 * Return indication of whether this leaf block is empty enough 1107 * Return indication of whether this leaf block is empty enough
@@ -1169,12 +1255,11 @@ xfs_dir2_leafn_toosmall(
1169 /* 1255 /*
1170 * Read the sibling leaf block. 1256 * Read the sibling leaf block.
1171 */ 1257 */
1172 if ((error = 1258 error = xfs_dir2_leafn_read(state->args->trans, state->args->dp,
1173 xfs_da_read_buf(state->args->trans, state->args->dp, blkno, 1259 blkno, -1, &bp);
1174 -1, &bp, XFS_DATA_FORK))) { 1260 if (error)
1175 return error; 1261 return error;
1176 } 1262
1177 ASSERT(bp != NULL);
1178 /* 1263 /*
1179 * Count bytes in the two blocks combined. 1264 * Count bytes in the two blocks combined.
1180 */ 1265 */
@@ -1454,14 +1539,13 @@ xfs_dir2_node_addname_int(
1454 * This should be really rare, so there's no reason 1539 * This should be really rare, so there's no reason
1455 * to avoid it. 1540 * to avoid it.
1456 */ 1541 */
1457 if ((error = xfs_da_read_buf(tp, dp, 1542 error = xfs_dir2_free_try_read(tp, dp,
1458 xfs_dir2_db_to_da(mp, fbno), -2, &fbp, 1543 xfs_dir2_db_to_da(mp, fbno),
1459 XFS_DATA_FORK))) { 1544 &fbp);
1545 if (error)
1460 return error; 1546 return error;
1461 } 1547 if (!fbp)
1462 if (unlikely(fbp == NULL)) {
1463 continue; 1548 continue;
1464 }
1465 free = fbp->b_addr; 1549 free = fbp->b_addr;
1466 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 1550 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
1467 findex = 0; 1551 findex = 0;
@@ -1520,9 +1604,10 @@ xfs_dir2_node_addname_int(
1520 * that was just allocated. 1604 * that was just allocated.
1521 */ 1605 */
1522 fbno = xfs_dir2_db_to_fdb(mp, dbno); 1606 fbno = xfs_dir2_db_to_fdb(mp, dbno);
1523 if (unlikely(error = xfs_da_read_buf(tp, dp, 1607 error = xfs_dir2_free_try_read(tp, dp,
1524 xfs_dir2_db_to_da(mp, fbno), -2, &fbp, 1608 xfs_dir2_db_to_da(mp, fbno),
1525 XFS_DATA_FORK))) 1609 &fbp);
1610 if (error)
1526 return error; 1611 return error;
1527 1612
1528 /* 1613 /*
@@ -1561,12 +1646,12 @@ xfs_dir2_node_addname_int(
1561 /* 1646 /*
1562 * Get a buffer for the new block. 1647 * Get a buffer for the new block.
1563 */ 1648 */
1564 if ((error = xfs_da_get_buf(tp, dp, 1649 error = xfs_da_get_buf(tp, dp,
1565 xfs_dir2_db_to_da(mp, fbno), 1650 xfs_dir2_db_to_da(mp, fbno),
1566 -1, &fbp, XFS_DATA_FORK))) { 1651 -1, &fbp, XFS_DATA_FORK);
1652 if (error)
1567 return error; 1653 return error;
1568 } 1654 fbp->b_ops = &xfs_dir2_free_buf_ops;
1569 ASSERT(fbp != NULL);
1570 1655
1571 /* 1656 /*
1572 * Initialize the new block to be empty, and remember 1657 * Initialize the new block to be empty, and remember
@@ -1630,8 +1715,8 @@ xfs_dir2_node_addname_int(
1630 /* 1715 /*
1631 * Read the data block in. 1716 * Read the data block in.
1632 */ 1717 */
1633 error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), 1718 error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
1634 -1, &dbp, XFS_DATA_FORK); 1719 -1, &dbp);
1635 if (error) 1720 if (error)
1636 return error; 1721 return error;
1637 hdr = dbp->b_addr; 1722 hdr = dbp->b_addr;
@@ -1917,18 +2002,15 @@ xfs_dir2_node_trim_free(
1917 /* 2002 /*
1918 * Read the freespace block. 2003 * Read the freespace block.
1919 */ 2004 */
1920 if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, 2005 error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
1921 XFS_DATA_FORK))) { 2006 if (error)
1922 return error; 2007 return error;
1923 }
1924
1925 /* 2008 /*
1926 * There can be holes in freespace. If fo is a hole, there's 2009 * There can be holes in freespace. If fo is a hole, there's
1927 * nothing to do. 2010 * nothing to do.
1928 */ 2011 */
1929 if (bp == NULL) { 2012 if (!bp)
1930 return 0; 2013 return 0;
1931 }
1932 free = bp->b_addr; 2014 free = bp->b_addr;
1933 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); 2015 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
1934 /* 2016 /*
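
Two read flavours recur through this file: xfs_dir2_free_read() passes a mappedbno of -1 and fails if the block does not exist, while xfs_dir2_free_try_read() passes -2 and succeeds with a NULL buffer when the block is a hole, which xfs_dir2_node_trim_free() above relies on. A hypothetical caller showing the distinction (xfs_dir2_free_try_read is static in this patch, so an out-of-file caller like this is illustrative only):

	static int
	example_scan_free_block(
		struct xfs_trans	*tp,
		struct xfs_inode	*dp,
		xfs_dablk_t		fbno)
	{
		struct xfs_buf		*bp;
		int			error;

		error = xfs_dir2_free_try_read(tp, dp, fbno, &bp);
		if (error)
			return error;	/* real I/O or verifier failure */
		if (!bp)
			return 0;	/* hole in the freespace: nothing to do */

		/* ... inspect bp->b_addr ... */
		xfs_trans_brelse(tp, bp);
		return 0;
	}
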
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 3523d3e15aa8..7da79f6515fd 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
30 const unsigned char *name, int len); 30 const unsigned char *name, int len);
31 31
32/* xfs_dir2_block.c */ 32/* xfs_dir2_block.c */
33extern const struct xfs_buf_ops xfs_dir2_block_buf_ops;
34
33extern int xfs_dir2_block_addname(struct xfs_da_args *args); 35extern int xfs_dir2_block_addname(struct xfs_da_args *args);
34extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, 36extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
35 xfs_off_t *offset, filldir_t filldir); 37 xfs_off_t *offset, filldir_t filldir);
@@ -41,10 +43,19 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
41 43
42/* xfs_dir2_data.c */ 44/* xfs_dir2_data.c */
43#ifdef DEBUG 45#ifdef DEBUG
44extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); 46#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp)
45#else 47#else
46#define xfs_dir2_data_check(dp,bp) 48#define xfs_dir2_data_check(dp,bp)
47#endif 49#endif
50
51extern const struct xfs_buf_ops xfs_dir2_data_buf_ops;
52
53extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
54extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
55 xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
56extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
57 xfs_dablk_t bno, xfs_daddr_t mapped_bno);
58
48extern struct xfs_dir2_data_free * 59extern struct xfs_dir2_data_free *
49xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, 60xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
50 struct xfs_dir2_data_unused *dup, int *loghead); 61 struct xfs_dir2_data_unused *dup, int *loghead);
@@ -66,6 +77,10 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
66 xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); 77 xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
67 78
68/* xfs_dir2_leaf.c */ 79/* xfs_dir2_leaf.c */
80extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops;
81
82extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
83 xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
69extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, 84extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
70 struct xfs_buf *dbp); 85 struct xfs_buf *dbp);
71extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); 86extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
@@ -115,6 +130,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args);
115extern int xfs_dir2_node_replace(struct xfs_da_args *args); 130extern int xfs_dir2_node_replace(struct xfs_da_args *args);
116extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, 131extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
117 int *rvalp); 132 int *rvalp);
133extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
134 xfs_dablk_t fbno, struct xfs_buf **bpp);
118 135
119/* xfs_dir2_sf.c */ 136/* xfs_dir2_sf.c */
120extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); 137extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index bf27fcca4843..9e1bf5294c91 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -248,7 +248,59 @@ xfs_qm_init_dquot_blk(
248 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); 248 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
249} 249}
250 250
251static void
252xfs_dquot_buf_verify(
253 struct xfs_buf *bp)
254{
255 struct xfs_mount *mp = bp->b_target->bt_mount;
256 struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
257 struct xfs_disk_dquot *ddq;
258 xfs_dqid_t id = 0;
259 int i;
260
261 /*
262 * On the first read of the buffer, verify that each dquot is valid.
263 * We don't know what the id of the dquot is supposed to be, just that
264 * they should be increasing monotonically within the buffer. If the
265 * first id is corrupt, then it will fail on the second dquot in the
266 * buffer so corruptions could point to the wrong dquot in this case.
267 */
268 for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
269 int error;
270
271 ddq = &d[i].dd_diskdq;
272
273 if (i == 0)
274 id = be32_to_cpu(ddq->d_id);
275
276 error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
277 "xfs_dquot_read_verify");
278 if (error) {
279 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
280 xfs_buf_ioerror(bp, EFSCORRUPTED);
281 break;
282 }
283 }
284}
285
286static void
287xfs_dquot_buf_read_verify(
288 struct xfs_buf *bp)
289{
290 xfs_dquot_buf_verify(bp);
291}
292
293void
294xfs_dquot_buf_write_verify(
295 struct xfs_buf *bp)
296{
297 xfs_dquot_buf_verify(bp);
298}
251 299
300const struct xfs_buf_ops xfs_dquot_buf_ops = {
301 .verify_read = xfs_dquot_buf_read_verify,
302 .verify_write = xfs_dquot_buf_write_verify,
303};
252 304
253/* 305/*
254 * Allocate a block and fill it with dquots. 306 * Allocate a block and fill it with dquots.
@@ -315,6 +367,7 @@ xfs_qm_dqalloc(
315 error = xfs_buf_geterror(bp); 367 error = xfs_buf_geterror(bp);
316 if (error) 368 if (error)
317 goto error1; 369 goto error1;
370 bp->b_ops = &xfs_dquot_buf_ops;
318 371
319 /* 372 /*
320 * Make a chunk of dquots out of this buffer and log 373 * Make a chunk of dquots out of this buffer and log
@@ -359,6 +412,51 @@ xfs_qm_dqalloc(
359 412
360 return (error); 413 return (error);
361} 414}
415STATIC int
416xfs_qm_dqrepair(
417 struct xfs_mount *mp,
418 struct xfs_trans *tp,
419 struct xfs_dquot *dqp,
420 xfs_dqid_t firstid,
421 struct xfs_buf **bpp)
422{
423 int error;
424 struct xfs_disk_dquot *ddq;
425 struct xfs_dqblk *d;
426 int i;
427
428 /*
429 * Read the buffer without verification so we get the corrupted
430 * buffer returned to us. Make sure we verify it on write, though.
431 */
432 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
433 mp->m_quotainfo->qi_dqchunklen,
434 0, bpp, NULL);
435
436 if (error) {
437 ASSERT(*bpp == NULL);
438 return XFS_ERROR(error);
439 }
440 (*bpp)->b_ops = &xfs_dquot_buf_ops;
441
442 ASSERT(xfs_buf_islocked(*bpp));
443 d = (struct xfs_dqblk *)(*bpp)->b_addr;
444
445 /* Do the actual repair of dquots in this buffer */
446 for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
447 ddq = &d[i].dd_diskdq;
448 error = xfs_qm_dqcheck(mp, ddq, firstid + i,
449 dqp->dq_flags & XFS_DQ_ALLTYPES,
450 XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
451 if (error) {
452 /* repair failed, we're screwed */
453 xfs_trans_brelse(tp, *bpp);
454 return XFS_ERROR(EIO);
455 }
456 }
457
458 return 0;
459}
362 460
363/* 461/*
364 * Maps a dquot to the buffer containing its on-disk version. 462 * Maps a dquot to the buffer containing its on-disk version.
@@ -378,7 +476,6 @@ xfs_qm_dqtobp(
378 xfs_buf_t *bp; 476 xfs_buf_t *bp;
379 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); 477 xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
380 xfs_mount_t *mp = dqp->q_mount; 478 xfs_mount_t *mp = dqp->q_mount;
381 xfs_disk_dquot_t *ddq;
382 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); 479 xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
383 xfs_trans_t *tp = (tpp ? *tpp : NULL); 480 xfs_trans_t *tp = (tpp ? *tpp : NULL);
384 481
@@ -439,33 +536,24 @@ xfs_qm_dqtobp(
439 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 536 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
440 dqp->q_blkno, 537 dqp->q_blkno,
441 mp->m_quotainfo->qi_dqchunklen, 538 mp->m_quotainfo->qi_dqchunklen,
442 0, &bp); 539 0, &bp, &xfs_dquot_buf_ops);
443 if (error || !bp)
444 return XFS_ERROR(error);
445 }
446
447 ASSERT(xfs_buf_islocked(bp));
448 540
449 /* 541 if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
450 * calculate the location of the dquot inside the buffer. 542 xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
451 */ 543 mp->m_quotainfo->qi_dqperchunk;
452 ddq = bp->b_addr + dqp->q_bufoffset; 544 ASSERT(bp == NULL);
545 error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp);
546 }
453 547
454 /* 548 if (error) {
455 * A simple sanity check in case we got a corrupted dquot... 549 ASSERT(bp == NULL);
456 */ 550 return XFS_ERROR(error);
457 error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
458 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
459 "dqtobp");
460 if (error) {
461 if (!(flags & XFS_QMOPT_DQREPAIR)) {
462 xfs_trans_brelse(tp, bp);
463 return XFS_ERROR(EIO);
464 } 551 }
465 } 552 }
466 553
554 ASSERT(xfs_buf_islocked(bp));
467 *O_bpp = bp; 555 *O_bpp = bp;
468 *O_ddpp = ddq; 556 *O_ddpp = bp->b_addr + dqp->q_bufoffset;
469 557
470 return (0); 558 return (0);
471} 559}
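
xfs_qm_dqrepair() above depends on one subtlety of the verifier scheme: a read verifier rejects a corrupt buffer before the caller ever sees it, so a repair path must read with NULL ops to get the raw contents back, then attach the ops by hand so the repaired buffer is still verified at write time. The core of that idiom, condensed from the function above into a sketch:

	static int
	example_read_unverified(
		struct xfs_mount	*mp,
		struct xfs_trans	*tp,
		xfs_daddr_t		blkno,
		int			len,
		struct xfs_buf		**bpp)
	{
		int			error;

		/* NULL ops: skip the read verifier, accept the corrupt copy */
		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, blkno,
					   len, 0, bpp, NULL);
		if (error)
			return XFS_ERROR(error);

		/* verify on writeback once the contents have been repaired */
		(*bpp)->b_ops = &xfs_dquot_buf_ops;
		return 0;
	}
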
@@ -920,7 +1008,7 @@ xfs_qm_dqflush(
920 * Get the buffer containing the on-disk dquot 1008 * Get the buffer containing the on-disk dquot
921 */ 1009 */
922 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, 1010 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
923 mp->m_quotainfo->qi_dqchunklen, 0, &bp); 1011 mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
924 if (error) 1012 if (error)
925 goto out_unlock; 1013 goto out_unlock;
926 1014
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 7d20af27346d..c694a8469c4a 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -161,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
161 return dqp; 161 return dqp;
162} 162}
163 163
164extern const struct xfs_buf_ops xfs_dquot_buf_ops;
165
164#endif /* __XFS_DQUOT_H__ */ 166#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 8c6d1d70278c..a83611849cee 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -29,6 +29,7 @@
29#include "xfs_inode.h" 29#include "xfs_inode.h"
30#include "xfs_inode_item.h" 30#include "xfs_inode_item.h"
31#include "xfs_trace.h" 31#include "xfs_trace.h"
32#include "xfs_icache.h"
32 33
33/* 34/*
34 * Note that we only accept fileids which are long enough rather than allow 35 * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index aa473fa640a2..67284edb84d7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -31,6 +31,8 @@
31#include "xfs_error.h" 31#include "xfs_error.h"
32#include "xfs_vnodeops.h" 32#include "xfs_vnodeops.h"
33#include "xfs_da_btree.h" 33#include "xfs_da_btree.h"
34#include "xfs_dir2_format.h"
35#include "xfs_dir2_priv.h"
34#include "xfs_ioctl.h" 36#include "xfs_ioctl.h"
35#include "xfs_trace.h" 37#include "xfs_trace.h"
36 38
@@ -84,7 +86,7 @@ xfs_rw_ilock_demote(
84 * valid before the operation, it will be read from disk before 86 * valid before the operation, it will be read from disk before
85 * being partially zeroed. 87 * being partially zeroed.
86 */ 88 */
87STATIC int 89int
88xfs_iozero( 90xfs_iozero(
89 struct xfs_inode *ip, /* inode */ 91 struct xfs_inode *ip, /* inode */
90 loff_t pos, /* offset in file */ 92 loff_t pos, /* offset in file */
@@ -255,15 +257,14 @@ xfs_file_aio_read(
255 xfs_buftarg_t *target = 257 xfs_buftarg_t *target =
256 XFS_IS_REALTIME_INODE(ip) ? 258 XFS_IS_REALTIME_INODE(ip) ?
257 mp->m_rtdev_targp : mp->m_ddev_targp; 259 mp->m_rtdev_targp : mp->m_ddev_targp;
258 if ((iocb->ki_pos & target->bt_smask) || 260 if ((pos & target->bt_smask) || (size & target->bt_smask)) {
259 (size & target->bt_smask)) { 261 if (pos == i_size_read(inode))
260 if (iocb->ki_pos == i_size_read(inode))
261 return 0; 262 return 0;
262 return -XFS_ERROR(EINVAL); 263 return -XFS_ERROR(EINVAL);
263 } 264 }
264 } 265 }
265 266
266 n = mp->m_super->s_maxbytes - iocb->ki_pos; 267 n = mp->m_super->s_maxbytes - pos;
267 if (n <= 0 || size == 0) 268 if (n <= 0 || size == 0)
268 return 0; 269 return 0;
269 270
@@ -289,20 +290,21 @@ xfs_file_aio_read(
289 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); 290 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
290 291
291 if (inode->i_mapping->nrpages) { 292 if (inode->i_mapping->nrpages) {
292 ret = -xfs_flushinval_pages(ip, 293 ret = -filemap_write_and_wait_range(
293 (iocb->ki_pos & PAGE_CACHE_MASK), 294 VFS_I(ip)->i_mapping,
294 -1, FI_REMAPF_LOCKED); 295 pos, -1);
295 if (ret) { 296 if (ret) {
296 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); 297 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
297 return ret; 298 return ret;
298 } 299 }
300 truncate_pagecache_range(VFS_I(ip), pos, -1);
299 } 301 }
300 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 302 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
301 } 303 }
302 304
303 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); 305 trace_xfs_file_read(ip, size, pos, ioflags);
304 306
305 ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); 307 ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
306 if (ret > 0) 308 if (ret > 0)
307 XFS_STATS_ADD(xs_read_bytes, ret); 309 XFS_STATS_ADD(xs_read_bytes, ret);
308 310
@@ -670,10 +672,11 @@ xfs_file_dio_aio_write(
670 goto out; 672 goto out;
671 673
672 if (mapping->nrpages) { 674 if (mapping->nrpages) {
673 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, 675 ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
674 FI_REMAPF_LOCKED); 676 pos, -1);
675 if (ret) 677 if (ret)
676 goto out; 678 goto out;
679 truncate_pagecache_range(VFS_I(ip), pos, -1);
677 } 680 }
678 681
679 /* 682 /*
@@ -728,16 +731,17 @@ xfs_file_buffered_aio_write(
728write_retry: 731write_retry:
729 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); 732 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
730 ret = generic_file_buffered_write(iocb, iovp, nr_segs, 733 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
731 pos, &iocb->ki_pos, count, ret); 734 pos, &iocb->ki_pos, count, 0);
735
732 /* 736 /*
733 * if we just got an ENOSPC, flush the inode now we aren't holding any 737 * If we just got an ENOSPC, try to write back all dirty inodes to
734 * page locks and retry *once* 738 * convert delalloc space to free up some of the excess reserved
739 * metadata space.
735 */ 740 */
736 if (ret == -ENOSPC && !enospc) { 741 if (ret == -ENOSPC && !enospc) {
737 enospc = 1; 742 enospc = 1;
738 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); 743 xfs_flush_inodes(ip->i_mount);
739 if (!ret) 744 goto write_retry;
740 goto write_retry;
741 } 745 }
742 746
743 current->backing_dev_info = NULL; 747 current->backing_dev_info = NULL;
@@ -889,7 +893,7 @@ xfs_dir_open(
889 */ 893 */
890 mode = xfs_ilock_map_shared(ip); 894 mode = xfs_ilock_map_shared(ip);
891 if (ip->i_d.di_nextents > 0) 895 if (ip->i_d.di_nextents > 0)
892 xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); 896 xfs_dir2_data_readahead(NULL, ip, 0, -1);
893 xfs_iunlock(ip, mode); 897 xfs_iunlock(ip, mode);
894 return 0; 898 return 0;
895} 899}
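
The xfs_file.c hunks drop the xfs_flushinval_pages() wrapper in favour of calling the generic page-cache helpers directly, and in passing change the range: the old helper page-aligned the start offset, while the new code flushes and truncates from pos all the way to EOF (-1). The replacement sequence as a sketch, using the XFS positive-errno convention the callers above follow:

	static int
	example_flush_and_invalidate(
		struct xfs_inode	*ip,
		loff_t			pos)
	{
		int			error;

		/* write back and wait on all dirty pages from pos to EOF */
		error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						      pos, -1);
		if (error)
			return error;

		/* toss the now-clean pages so direct I/O sees the device */
		truncate_pagecache_range(VFS_I(ip), pos, -1);
		return 0;
	}
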
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c13fed8c394a..6dda3f949b04 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks {
233#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ 233#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
234#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ 234#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
235#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ 235#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */
236#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ 236#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
237#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
237#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ 238#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
238 239
239 240
@@ -339,6 +340,35 @@ typedef struct xfs_error_injection {
339 340
340 341
341/* 342/*
343 * Speculative preallocation trimming.
344 */
345#define XFS_EOFBLOCKS_VERSION 1
346struct xfs_eofblocks {
347 __u32 eof_version;
348 __u32 eof_flags;
349 uid_t eof_uid;
350 gid_t eof_gid;
351 prid_t eof_prid;
352 __u32 pad32;
353 __u64 eof_min_file_size;
354 __u64 pad64[12];
355};
356
357/* eof_flags values */
358#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */
359#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */
360#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */
361#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */
362#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */
363#define XFS_EOF_FLAGS_VALID \
364 (XFS_EOF_FLAGS_SYNC | \
365 XFS_EOF_FLAGS_UID | \
366 XFS_EOF_FLAGS_GID | \
367 XFS_EOF_FLAGS_PRID | \
368 XFS_EOF_FLAGS_MINFILESIZE)
369
370
371/*
342 * The user-level Handle Request interface structure. 372 * The user-level Handle Request interface structure.
343 */ 373 */
344typedef struct xfs_fsop_handlereq { 374typedef struct xfs_fsop_handlereq {
@@ -456,6 +486,7 @@ typedef struct xfs_handle {
456/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ 486/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
457#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) 487#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
458#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) 488#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
489#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks)
459 490
460/* 491/*
461 * ioctl commands that replace IRIX syssgi()'s 492 * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
deleted file mode 100644
index 652b875a9d4c..000000000000
--- a/fs/xfs/xfs_fs_subr.c
+++ /dev/null
@@ -1,96 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_vnodeops.h"
20#include "xfs_bmap_btree.h"
21#include "xfs_inode.h"
22#include "xfs_trace.h"
23
24/*
25 * note: all filemap functions return negative error codes. These
26 * need to be inverted before returning to the xfs core functions.
27 */
28void
29xfs_tosspages(
30 xfs_inode_t *ip,
31 xfs_off_t first,
32 xfs_off_t last,
33 int fiopt)
34{
35 /* can't toss partial tail pages, so mask them out */
36 last &= ~(PAGE_SIZE - 1);
37 truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
38}
39
40int
41xfs_flushinval_pages(
42 xfs_inode_t *ip,
43 xfs_off_t first,
44 xfs_off_t last,
45 int fiopt)
46{
47 struct address_space *mapping = VFS_I(ip)->i_mapping;
48 int ret = 0;
49
50 trace_xfs_pagecache_inval(ip, first, last);
51
52 xfs_iflags_clear(ip, XFS_ITRUNCATED);
53 ret = filemap_write_and_wait_range(mapping, first,
54 last == -1 ? LLONG_MAX : last);
55 if (!ret)
56 truncate_inode_pages_range(mapping, first, last);
57 return -ret;
58}
59
60int
61xfs_flush_pages(
62 xfs_inode_t *ip,
63 xfs_off_t first,
64 xfs_off_t last,
65 uint64_t flags,
66 int fiopt)
67{
68 struct address_space *mapping = VFS_I(ip)->i_mapping;
69 int ret = 0;
70 int ret2;
71
72 xfs_iflags_clear(ip, XFS_ITRUNCATED);
73 ret = -filemap_fdatawrite_range(mapping, first,
74 last == -1 ? LLONG_MAX : last);
75 if (flags & XBF_ASYNC)
76 return ret;
77 ret2 = xfs_wait_on_pages(ip, first, last);
78 if (!ret)
79 ret = ret2;
80 return ret;
81}
82
83int
84xfs_wait_on_pages(
85 xfs_inode_t *ip,
86 xfs_off_t first,
87 xfs_off_t last)
88{
89 struct address_space *mapping = VFS_I(ip)->i_mapping;
90
91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? XFS_ISIZE(ip) - 1 : last);
94 }
95 return 0;
96}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 4beaede43277..94eaeedc5498 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -97,7 +97,9 @@ xfs_fs_geometry(
97 (xfs_sb_version_haslazysbcount(&mp->m_sb) ? 97 (xfs_sb_version_haslazysbcount(&mp->m_sb) ?
98 XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | 98 XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
99 (xfs_sb_version_hasattr2(&mp->m_sb) ? 99 (xfs_sb_version_hasattr2(&mp->m_sb) ?
100 XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); 100 XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
101 (xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
102 XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
101 geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? 103 geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
102 mp->m_sb.sb_logsectsize : BBSIZE; 104 mp->m_sb.sb_logsectsize : BBSIZE;
103 geo->rtsectsize = mp->m_sb.sb_blocksize; 105 geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -112,18 +114,40 @@ xfs_fs_geometry(
112 return 0; 114 return 0;
113} 115}
114 116
117static struct xfs_buf *
118xfs_growfs_get_hdr_buf(
119 struct xfs_mount *mp,
120 xfs_daddr_t blkno,
121 size_t numblks,
122 int flags,
123 const struct xfs_buf_ops *ops)
124{
125 struct xfs_buf *bp;
126
127 bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
128 if (!bp)
129 return NULL;
130
131 xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
132 bp->b_bn = blkno;
133 bp->b_maps[0].bm_bn = blkno;
134 bp->b_ops = ops;
135
136 return bp;
137}
138
115static int 139static int
116xfs_growfs_data_private( 140xfs_growfs_data_private(
117 xfs_mount_t *mp, /* mount point for filesystem */ 141 xfs_mount_t *mp, /* mount point for filesystem */
118 xfs_growfs_data_t *in) /* growfs data input struct */ 142 xfs_growfs_data_t *in) /* growfs data input struct */
119{ 143{
120 xfs_agf_t *agf; 144 xfs_agf_t *agf;
145 struct xfs_agfl *agfl;
121 xfs_agi_t *agi; 146 xfs_agi_t *agi;
122 xfs_agnumber_t agno; 147 xfs_agnumber_t agno;
123 xfs_extlen_t agsize; 148 xfs_extlen_t agsize;
124 xfs_extlen_t tmpsize; 149 xfs_extlen_t tmpsize;
125 xfs_alloc_rec_t *arec; 150 xfs_alloc_rec_t *arec;
126 struct xfs_btree_block *block;
127 xfs_buf_t *bp; 151 xfs_buf_t *bp;
128 int bucket; 152 int bucket;
129 int dpct; 153 int dpct;
@@ -146,9 +170,14 @@ xfs_growfs_data_private(
146 dpct = pct - mp->m_sb.sb_imax_pct; 170 dpct = pct - mp->m_sb.sb_imax_pct;
147 bp = xfs_buf_read_uncached(mp->m_ddev_targp, 171 bp = xfs_buf_read_uncached(mp->m_ddev_targp,
148 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), 172 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
149 XFS_FSS_TO_BB(mp, 1), 0); 173 XFS_FSS_TO_BB(mp, 1), 0, NULL);
150 if (!bp) 174 if (!bp)
151 return EIO; 175 return EIO;
176 if (bp->b_error) {
177 int error = bp->b_error;
178 xfs_buf_relse(bp);
179 return error;
180 }
152 xfs_buf_relse(bp); 181 xfs_buf_relse(bp);
153 182
154 new = nb; /* use new as a temporary here */ 183 new = nb; /* use new as a temporary here */
@@ -186,17 +215,18 @@ xfs_growfs_data_private(
186 nfree = 0; 215 nfree = 0;
187 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { 216 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
188 /* 217 /*
189 * AG freelist header block 218 * AG freespace header block
190 */ 219 */
191 bp = xfs_buf_get(mp->m_ddev_targp, 220 bp = xfs_growfs_get_hdr_buf(mp,
192 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 221 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
193 XFS_FSS_TO_BB(mp, 1), 0); 222 XFS_FSS_TO_BB(mp, 1), 0,
223 &xfs_agf_buf_ops);
194 if (!bp) { 224 if (!bp) {
195 error = ENOMEM; 225 error = ENOMEM;
196 goto error0; 226 goto error0;
197 } 227 }
228
198 agf = XFS_BUF_TO_AGF(bp); 229 agf = XFS_BUF_TO_AGF(bp);
199 memset(agf, 0, mp->m_sb.sb_sectsize);
200 agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); 230 agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
201 agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); 231 agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
202 agf->agf_seqno = cpu_to_be32(agno); 232 agf->agf_seqno = cpu_to_be32(agno);
@@ -223,17 +253,39 @@ xfs_growfs_data_private(
223 goto error0; 253 goto error0;
224 254
225 /* 255 /*
256 * AG freelist header block
257 */
258 bp = xfs_growfs_get_hdr_buf(mp,
259 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
260 XFS_FSS_TO_BB(mp, 1), 0,
261 &xfs_agfl_buf_ops);
262 if (!bp) {
263 error = ENOMEM;
264 goto error0;
265 }
266
267 agfl = XFS_BUF_TO_AGFL(bp);
268 for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
269 agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
270
271 error = xfs_bwrite(bp);
272 xfs_buf_relse(bp);
273 if (error)
274 goto error0;
275
276 /*
226 * AG inode header block 277 * AG inode header block
227 */ 278 */
228 bp = xfs_buf_get(mp->m_ddev_targp, 279 bp = xfs_growfs_get_hdr_buf(mp,
229 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 280 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
230 XFS_FSS_TO_BB(mp, 1), 0); 281 XFS_FSS_TO_BB(mp, 1), 0,
282 &xfs_agi_buf_ops);
231 if (!bp) { 283 if (!bp) {
232 error = ENOMEM; 284 error = ENOMEM;
233 goto error0; 285 goto error0;
234 } 286 }
287
235 agi = XFS_BUF_TO_AGI(bp); 288 agi = XFS_BUF_TO_AGI(bp);
236 memset(agi, 0, mp->m_sb.sb_sectsize);
237 agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); 289 agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
238 agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); 290 agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
239 agi->agi_seqno = cpu_to_be32(agno); 291 agi->agi_seqno = cpu_to_be32(agno);
@@ -254,24 +306,22 @@ xfs_growfs_data_private(
254 /* 306 /*
255 * BNO btree root block 307 * BNO btree root block
256 */ 308 */
257 bp = xfs_buf_get(mp->m_ddev_targp, 309 bp = xfs_growfs_get_hdr_buf(mp,
258 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), 310 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
259 BTOBB(mp->m_sb.sb_blocksize), 0); 311 BTOBB(mp->m_sb.sb_blocksize), 0,
312 &xfs_allocbt_buf_ops);
313
260 if (!bp) { 314 if (!bp) {
261 error = ENOMEM; 315 error = ENOMEM;
262 goto error0; 316 goto error0;
263 } 317 }
264 block = XFS_BUF_TO_BLOCK(bp); 318
265 memset(block, 0, mp->m_sb.sb_blocksize); 319 xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
266 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); 320 arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
267 block->bb_level = 0;
268 block->bb_numrecs = cpu_to_be16(1);
269 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
270 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
271 arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
272 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 321 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
273 arec->ar_blockcount = cpu_to_be32( 322 arec->ar_blockcount = cpu_to_be32(
274 agsize - be32_to_cpu(arec->ar_startblock)); 323 agsize - be32_to_cpu(arec->ar_startblock));
324
275 error = xfs_bwrite(bp); 325 error = xfs_bwrite(bp);
276 xfs_buf_relse(bp); 326 xfs_buf_relse(bp);
277 if (error) 327 if (error)
@@ -280,25 +330,22 @@ xfs_growfs_data_private(
280 /* 330 /*
281 * CNT btree root block 331 * CNT btree root block
282 */ 332 */
283 bp = xfs_buf_get(mp->m_ddev_targp, 333 bp = xfs_growfs_get_hdr_buf(mp,
284 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), 334 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
285 BTOBB(mp->m_sb.sb_blocksize), 0); 335 BTOBB(mp->m_sb.sb_blocksize), 0,
336 &xfs_allocbt_buf_ops);
286 if (!bp) { 337 if (!bp) {
287 error = ENOMEM; 338 error = ENOMEM;
288 goto error0; 339 goto error0;
289 } 340 }
290 block = XFS_BUF_TO_BLOCK(bp); 341
291 memset(block, 0, mp->m_sb.sb_blocksize); 342 xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
292 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); 343 arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
293 block->bb_level = 0;
294 block->bb_numrecs = cpu_to_be16(1);
295 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
296 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
297 arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
298 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 344 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
299 arec->ar_blockcount = cpu_to_be32( 345 arec->ar_blockcount = cpu_to_be32(
300 agsize - be32_to_cpu(arec->ar_startblock)); 346 agsize - be32_to_cpu(arec->ar_startblock));
301 nfree += be32_to_cpu(arec->ar_blockcount); 347 nfree += be32_to_cpu(arec->ar_blockcount);
348
302 error = xfs_bwrite(bp); 349 error = xfs_bwrite(bp);
303 xfs_buf_relse(bp); 350 xfs_buf_relse(bp);
304 if (error) 351 if (error)
@@ -307,20 +354,17 @@ xfs_growfs_data_private(
307 /* 354 /*
308 * INO btree root block 355 * INO btree root block
309 */ 356 */
310 bp = xfs_buf_get(mp->m_ddev_targp, 357 bp = xfs_growfs_get_hdr_buf(mp,
311 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), 358 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
312 BTOBB(mp->m_sb.sb_blocksize), 0); 359 BTOBB(mp->m_sb.sb_blocksize), 0,
360 &xfs_inobt_buf_ops);
313 if (!bp) { 361 if (!bp) {
314 error = ENOMEM; 362 error = ENOMEM;
315 goto error0; 363 goto error0;
316 } 364 }
317 block = XFS_BUF_TO_BLOCK(bp); 365
318 memset(block, 0, mp->m_sb.sb_blocksize); 366 xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
319 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); 367
320 block->bb_level = 0;
321 block->bb_numrecs = 0;
322 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
323 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
324 error = xfs_bwrite(bp); 368 error = xfs_bwrite(bp);
325 xfs_buf_relse(bp); 369 xfs_buf_relse(bp);
326 if (error) 370 if (error)
@@ -408,14 +452,16 @@ xfs_growfs_data_private(
408 if (agno < oagcount) { 452 if (agno < oagcount) {
409 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 453 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
410 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 454 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
411 XFS_FSS_TO_BB(mp, 1), 0, &bp); 455 XFS_FSS_TO_BB(mp, 1), 0, &bp,
456 &xfs_sb_buf_ops);
412 } else { 457 } else {
413 bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, 458 bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
414 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 459 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
415 XFS_FSS_TO_BB(mp, 1), 0); 460 XFS_FSS_TO_BB(mp, 1), 0);
416 if (bp) 461 if (bp) {
462 bp->b_ops = &xfs_sb_buf_ops;
417 xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); 463 xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
418 else 464 } else
419 error = ENOMEM; 465 error = ENOMEM;
420 } 466 }
421 467
@@ -426,6 +472,7 @@ xfs_growfs_data_private(
426 break; 472 break;
427 } 473 }
428 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); 474 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
475
429 /* 476 /*
430 * If we get an error writing out the alternate superblocks, 477 * If we get an error writing out the alternate superblocks,
431 * just issue a warning and continue. The real work is 478 * just issue a warning and continue. The real work is
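
The growfs hunks remove two kinds of duplication: buffer setup is hoisted into xfs_growfs_get_hdr_buf() (zeroed uncached buffer, block number, verifier ops in one call), and the hand-rolled btree header initialisation becomes xfs_btree_init_block(). For reference, the open-coded setup the patch deletes amounts to the following sketch over the short-form block header used here:

	static void
	example_init_sblock(
		struct xfs_btree_block	*block,
		__u32			magic,
		__u16			level,
		__u16			numrecs)
	{
		block->bb_magic = cpu_to_be32(magic);
		block->bb_level = cpu_to_be16(level);
		block->bb_numrecs = cpu_to_be16(numrecs);
		/* no siblings yet: a freshly grown AG has single-block trees */
		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
	}
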
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 76e81cff70b9..5399ef222dd7 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -21,7 +21,8 @@
21/* 21/*
22 * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, 22 * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
23 * other XFS code uses these values. Times are measured in centisecs (i.e. 23 * other XFS code uses these values. Times are measured in centisecs (i.e.
24 * 100ths of a second). 24 * 100ths of a second) with the exception of eofb_timer, which is measured in
25 * seconds.
25 */ 26 */
26xfs_param_t xfs_params = { 27xfs_param_t xfs_params = {
27 /* MIN DFLT MAX */ 28 /* MIN DFLT MAX */
@@ -40,4 +41,5 @@ xfs_param_t xfs_params = {
40 .rotorstep = { 1, 1, 255 }, 41 .rotorstep = { 1, 1, 255 },
41 .inherit_nodfrg = { 0, 1, 1 }, 42 .inherit_nodfrg = { 0, 1, 1 },
42 .fstrm_timer = { 1, 30*100, 3600*100}, 43 .fstrm_timer = { 1, 30*100, 3600*100},
44 .eofb_timer = { 1, 300, 3600*24},
43}; 45};
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c5c4ef4f2bdb..a815412eab80 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -200,7 +200,8 @@ xfs_ialloc_inode_init(
200 */ 200 */
201 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); 201 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 202 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
203 mp->m_bsize * blks_per_cluster, 0); 203 mp->m_bsize * blks_per_cluster,
204 XBF_UNMAPPED);
204 if (!fbuf) 205 if (!fbuf)
205 return ENOMEM; 206 return ENOMEM;
206 /* 207 /*
@@ -210,6 +211,7 @@ xfs_ialloc_inode_init(
210 * to log a whole cluster of inodes instead of all the 211 * to log a whole cluster of inodes instead of all the
211 * individual transactions causing a lot of log traffic. 212 * individual transactions causing a lot of log traffic.
212 */ 213 */
214 fbuf->b_ops = &xfs_inode_buf_ops;
213 xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); 215 xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
214 for (i = 0; i < ninodes; i++) { 216 for (i = 0; i < ninodes; i++) {
215 int ioffset = i << mp->m_sb.sb_inodelog; 217 int ioffset = i << mp->m_sb.sb_inodelog;
@@ -877,9 +879,9 @@ error0:
877 * This function is designed to be called twice if it has to do an allocation 879 * This function is designed to be called twice if it has to do an allocation
878 * to make more free inodes. On the first call, *IO_agbp should be set to NULL. 880 * to make more free inodes. On the first call, *IO_agbp should be set to NULL.
879 * If an inode is available without having to perform an allocation, an inode 881 * If an inode is available without having to perform an allocation, an inode
880 * number is returned. In this case, *IO_agbp would be NULL. If an allocation 882 * number is returned. In this case, *IO_agbp is set to NULL. If an allocation
881 * needes to be done, xfs_dialloc would return the current AGI buffer in 883 * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
882 * *IO_agbp. The caller should then commit the current transaction, allocate a 884 * The caller should then commit the current transaction, allocate a
883 * new transaction, and call xfs_dialloc() again, passing in the previous value 885 * new transaction, and call xfs_dialloc() again, passing in the previous value
884 * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI 886 * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI
885 * buffer is locked across the two calls, the second call is guaranteed to have 887 * buffer is locked across the two calls, the second call is guaranteed to have
@@ -1472,6 +1474,57 @@ xfs_check_agi_unlinked(
1472#define xfs_check_agi_unlinked(agi) 1474#define xfs_check_agi_unlinked(agi)
1473#endif 1475#endif
1474 1476
1477static void
1478xfs_agi_verify(
1479 struct xfs_buf *bp)
1480{
1481 struct xfs_mount *mp = bp->b_target->bt_mount;
1482 struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
1483 int agi_ok;
1484
1485 /*
1486 * Validate the magic number of the agi block.
1487 */
1488 agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
1489 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1490
1491 /*
1492 * during growfs operations, the perag is not fully initialised,
1493 * so we can't use it for any useful checking. growfs ensures we can't
1494 * use it by using uncached buffers that don't have the perag attached
1495 * so we can detect and avoid this problem.
1496 */
1497 if (bp->b_pag)
1498 agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) ==
1499 bp->b_pag->pag_agno;
1500
1501 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
1502 XFS_RANDOM_IALLOC_READ_AGI))) {
1503 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
1504 xfs_buf_ioerror(bp, EFSCORRUPTED);
1505 }
1506 xfs_check_agi_unlinked(agi);
1507}
1508
1509static void
1510xfs_agi_read_verify(
1511 struct xfs_buf *bp)
1512{
1513 xfs_agi_verify(bp);
1514}
1515
1516static void
1517xfs_agi_write_verify(
1518 struct xfs_buf *bp)
1519{
1520 xfs_agi_verify(bp);
1521}
1522
1523const struct xfs_buf_ops xfs_agi_buf_ops = {
1524 .verify_read = xfs_agi_read_verify,
1525 .verify_write = xfs_agi_write_verify,
1526};
1527
1475/* 1528/*
1476 * Read in the allocation group header (inode allocation section) 1529 * Read in the allocation group header (inode allocation section)
1477 */ 1530 */
@@ -1482,38 +1535,18 @@ xfs_read_agi(
1482 xfs_agnumber_t agno, /* allocation group number */ 1535 xfs_agnumber_t agno, /* allocation group number */
1483 struct xfs_buf **bpp) /* allocation group hdr buf */ 1536 struct xfs_buf **bpp) /* allocation group hdr buf */
1484{ 1537{
1485 struct xfs_agi *agi; /* allocation group header */
1486 int agi_ok; /* agi is consistent */
1487 int error; 1538 int error;
1488 1539
1489 ASSERT(agno != NULLAGNUMBER); 1540 ASSERT(agno != NULLAGNUMBER);
1490 1541
1491 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 1542 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
1492 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 1543 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
1493 XFS_FSS_TO_BB(mp, 1), 0, bpp); 1544 XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
1494 if (error) 1545 if (error)
1495 return error; 1546 return error;
1496 1547
1497 ASSERT(!xfs_buf_geterror(*bpp)); 1548 ASSERT(!xfs_buf_geterror(*bpp));
1498 agi = XFS_BUF_TO_AGI(*bpp);
1499
1500 /*
1501 * Validate the magic number of the agi block.
1502 */
1503 agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
1504 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
1505 be32_to_cpu(agi->agi_seqno) == agno;
1506 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
1507 XFS_RANDOM_IALLOC_READ_AGI))) {
1508 XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
1509 mp, agi);
1510 xfs_trans_brelse(tp, *bpp);
1511 return XFS_ERROR(EFSCORRUPTED);
1512 }
1513
1514 xfs_buf_set_ref(*bpp, XFS_AGI_REF); 1549 xfs_buf_set_ref(*bpp, XFS_AGI_REF);
1515
1516 xfs_check_agi_unlinked(agi);
1517 return 0; 1550 return 0;
1518} 1551}
1519 1552
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 1fd6ea4e9c91..c8da3df271e6 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
147/* 147/*
148 * Get the data from the pointed-to record. 148 * Get the data from the pointed-to record.
149 */ 149 */
150extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, 150int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
151 xfs_inobt_rec_incore_t *rec, int *stat); 151 xfs_inobt_rec_incore_t *rec, int *stat);
152 152
153extern const struct xfs_buf_ops xfs_agi_buf_ops;
154
153#endif /* __XFS_IALLOC_H__ */ 155#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 2b8b7a37aa18..bec344b36507 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -33,6 +33,7 @@
33#include "xfs_ialloc.h" 33#include "xfs_ialloc.h"
34#include "xfs_alloc.h" 34#include "xfs_alloc.h"
35#include "xfs_error.h" 35#include "xfs_error.h"
36#include "xfs_trace.h"
36 37
37 38
38STATIC int 39STATIC int
@@ -181,6 +182,59 @@ xfs_inobt_key_diff(
181 cur->bc_rec.i.ir_startino; 182 cur->bc_rec.i.ir_startino;
182} 183}
183 184
185void
186xfs_inobt_verify(
187 struct xfs_buf *bp)
188{
189 struct xfs_mount *mp = bp->b_target->bt_mount;
190 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
191 unsigned int level;
192 int sblock_ok; /* block passes checks */
193
194 /* magic number and level verification */
195 level = be16_to_cpu(block->bb_level);
196 sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) &&
197 level < mp->m_in_maxlevels;
198
199 /* numrecs verification */
200 sblock_ok = sblock_ok &&
201 be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0];
202
203 /* sibling pointer verification */
204 sblock_ok = sblock_ok &&
205 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
206 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
207 block->bb_u.s.bb_leftsib &&
208 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
209 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
210 block->bb_u.s.bb_rightsib;
211
212 if (!sblock_ok) {
213 trace_xfs_btree_corrupt(bp, _RET_IP_);
214 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
215 xfs_buf_ioerror(bp, EFSCORRUPTED);
216 }
217}
218
219static void
220xfs_inobt_read_verify(
221 struct xfs_buf *bp)
222{
223 xfs_inobt_verify(bp);
224}
225
226static void
227xfs_inobt_write_verify(
228 struct xfs_buf *bp)
229{
230 xfs_inobt_verify(bp);
231}
232
233const struct xfs_buf_ops xfs_inobt_buf_ops = {
234 .verify_read = xfs_inobt_read_verify,
235 .verify_write = xfs_inobt_write_verify,
236};
237
184#ifdef DEBUG 238#ifdef DEBUG
185STATIC int 239STATIC int
186xfs_inobt_keys_inorder( 240xfs_inobt_keys_inorder(
@@ -218,6 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
218 .init_rec_from_cur = xfs_inobt_init_rec_from_cur, 272 .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
219 .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, 273 .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
220 .key_diff = xfs_inobt_key_diff, 274 .key_diff = xfs_inobt_key_diff,
275 .buf_ops = &xfs_inobt_buf_ops,
221#ifdef DEBUG 276#ifdef DEBUG
222 .keys_inorder = xfs_inobt_keys_inorder, 277 .keys_inorder = xfs_inobt_keys_inorder,
223 .recs_inorder = xfs_inobt_recs_inorder, 278 .recs_inorder = xfs_inobt_recs_inorder,
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index f782ad0c4769..25c0239a8eab 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
109 struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); 109 struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
110extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); 110extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
111 111
112extern const struct xfs_buf_ops xfs_inobt_buf_ops;
113
112#endif /* __XFS_IALLOC_BTREE_H__ */ 114#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_icache.c
index 9500caf15acf..96e344e3e927 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_icache.c
@@ -19,6 +19,7 @@
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_log_priv.h"
22#include "xfs_inum.h" 23#include "xfs_inum.h"
23#include "xfs_trans.h" 24#include "xfs_trans.h"
24#include "xfs_trans_priv.h" 25#include "xfs_trans_priv.h"
@@ -35,11 +36,425 @@
35#include "xfs_quota.h" 36#include "xfs_quota.h"
36#include "xfs_trace.h" 37#include "xfs_trace.h"
37#include "xfs_fsops.h" 38#include "xfs_fsops.h"
39#include "xfs_icache.h"
38 40
39#include <linux/kthread.h> 41#include <linux/kthread.h>
40#include <linux/freezer.h> 42#include <linux/freezer.h>
41 43
42struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ 44STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
45 struct xfs_perag *pag, struct xfs_inode *ip);
46
47/*
48 * Allocate and initialise an xfs_inode.
49 */
50STATIC struct xfs_inode *
51xfs_inode_alloc(
52 struct xfs_mount *mp,
53 xfs_ino_t ino)
54{
55 struct xfs_inode *ip;
56
57 /*
58 * if this didn't occur in transactions, we could use
59 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
60 * code up to do this anyway.
61 */
62 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
63 if (!ip)
64 return NULL;
65 if (inode_init_always(mp->m_super, VFS_I(ip))) {
66 kmem_zone_free(xfs_inode_zone, ip);
67 return NULL;
68 }
69
70 ASSERT(atomic_read(&ip->i_pincount) == 0);
71 ASSERT(!spin_is_locked(&ip->i_flags_lock));
72 ASSERT(!xfs_isiflocked(ip));
73 ASSERT(ip->i_ino == 0);
74
75 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
76
77 /* initialise the xfs inode */
78 ip->i_ino = ino;
79 ip->i_mount = mp;
80 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
81 ip->i_afp = NULL;
82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
83 ip->i_flags = 0;
84 ip->i_delayed_blks = 0;
85 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
86
87 return ip;
88}
89
90STATIC void
91xfs_inode_free_callback(
92 struct rcu_head *head)
93{
94 struct inode *inode = container_of(head, struct inode, i_rcu);
95 struct xfs_inode *ip = XFS_I(inode);
96
97 kmem_zone_free(xfs_inode_zone, ip);
98}
99
100STATIC void
101xfs_inode_free(
102 struct xfs_inode *ip)
103{
104 switch (ip->i_d.di_mode & S_IFMT) {
105 case S_IFREG:
106 case S_IFDIR:
107 case S_IFLNK:
108 xfs_idestroy_fork(ip, XFS_DATA_FORK);
109 break;
110 }
111
112 if (ip->i_afp)
113 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
114
115 if (ip->i_itemp) {
116 ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
117 xfs_inode_item_destroy(ip);
118 ip->i_itemp = NULL;
119 }
120
121 /* asserts to verify all state is correct here */
122 ASSERT(atomic_read(&ip->i_pincount) == 0);
123 ASSERT(!spin_is_locked(&ip->i_flags_lock));
124 ASSERT(!xfs_isiflocked(ip));
125
126 /*
127 * Because we use RCU freeing we need to ensure the inode always
128 * appears to be reclaimed with an invalid inode number when in the
129 * free state. The ip->i_flags_lock provides the barrier against lookup
130 * races.
131 */
132 spin_lock(&ip->i_flags_lock);
133 ip->i_flags = XFS_IRECLAIM;
134 ip->i_ino = 0;
135 spin_unlock(&ip->i_flags_lock);
136
137 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
138}
139
140/*
141 * Check the validity of the inode we just found in the cache
142 */
143static int
144xfs_iget_cache_hit(
145 struct xfs_perag *pag,
146 struct xfs_inode *ip,
147 xfs_ino_t ino,
148 int flags,
149 int lock_flags) __releases(RCU)
150{
151 struct inode *inode = VFS_I(ip);
152 struct xfs_mount *mp = ip->i_mount;
153 int error;
154
155 /*
156 * check for re-use of an inode within an RCU grace period due to the
157 * radix tree nodes not being updated yet. We monitor for this by
158 * setting the inode number to zero before freeing the inode structure.
159 * If the inode has been reallocated and set up, then the inode number
160 * will not match, so check for that, too.
161 */
162 spin_lock(&ip->i_flags_lock);
163 if (ip->i_ino != ino) {
164 trace_xfs_iget_skip(ip);
165 XFS_STATS_INC(xs_ig_frecycle);
166 error = EAGAIN;
167 goto out_error;
168 }
169
170
171 /*
172 * If we are racing with another cache hit that is currently
173 * instantiating this inode or currently recycling it out of
174 * reclaimable state, wait for the initialisation to complete
175 * before continuing.
176 *
177 * XXX(hch): eventually we should do something equivalent to
178 * wait_on_inode to wait for these flags to be cleared
179 * instead of polling for it.
180 */
181 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
182 trace_xfs_iget_skip(ip);
183 XFS_STATS_INC(xs_ig_frecycle);
184 error = EAGAIN;
185 goto out_error;
186 }
187
188 /*
189 * If lookup is racing with unlink, return an error immediately.
190 */
191 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
192 error = ENOENT;
193 goto out_error;
194 }
195
196 /*
197 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
198 * Need to carefully get it back into usable state.
199 */
200 if (ip->i_flags & XFS_IRECLAIMABLE) {
201 trace_xfs_iget_reclaim(ip);
202
203 /*
204 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
205 * from stomping over us while we recycle the inode. We can't
206 * clear the radix tree reclaimable tag yet as it requires
207 * pag_ici_lock to be held exclusive.
208 */
209 ip->i_flags |= XFS_IRECLAIM;
210
211 spin_unlock(&ip->i_flags_lock);
212 rcu_read_unlock();
213
214 error = -inode_init_always(mp->m_super, inode);
215 if (error) {
216 /*
217 * Re-initializing the inode failed, and we are in deep
218 * trouble. Try to re-add it to the reclaim list.
219 */
220 rcu_read_lock();
221 spin_lock(&ip->i_flags_lock);
222
223 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
224 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
225 trace_xfs_iget_reclaim_fail(ip);
226 goto out_error;
227 }
228
229 spin_lock(&pag->pag_ici_lock);
230 spin_lock(&ip->i_flags_lock);
231
232 /*
233 * Clear the per-lifetime state in the inode as we are now
234 * effectively a new inode and need to return to the initial
235 * state before reuse occurs.
236 */
237 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
238 ip->i_flags |= XFS_INEW;
239 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
240 inode->i_state = I_NEW;
241
242 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
243 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
244
245 spin_unlock(&ip->i_flags_lock);
246 spin_unlock(&pag->pag_ici_lock);
247 } else {
248 /* If the VFS inode is being torn down, pause and try again. */
249 if (!igrab(inode)) {
250 trace_xfs_iget_skip(ip);
251 error = EAGAIN;
252 goto out_error;
253 }
254
255 /* We've got a live one. */
256 spin_unlock(&ip->i_flags_lock);
257 rcu_read_unlock();
258 trace_xfs_iget_hit(ip);
259 }
260
261 if (lock_flags != 0)
262 xfs_ilock(ip, lock_flags);
263
264 xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
265 XFS_STATS_INC(xs_ig_found);
266
267 return 0;
268
269out_error:
270 spin_unlock(&ip->i_flags_lock);
271 rcu_read_unlock();
272 return error;
273}
274
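The i_ino recheck at the top of xfs_iget_cache_hit() pairs with the zeroing done under the same spinlock in xfs_inode_free(). Stripped of the XFS specifics, the RCU lookup/free handshake looks roughly like this (an illustrative sketch with made-up names, not code from this patch):

	rcu_read_lock();
	obj = radix_tree_lookup(&root, index);
	if (obj) {
		spin_lock(&obj->lock);		/* the lock the free side also takes */
		if (obj->id != index) {
			/* freed, and possibly reused, within this grace period */
			spin_unlock(&obj->lock);
			rcu_read_unlock();
			goto try_again;		/* back off and retry the lookup */
		}
		/* ... object is valid while obj->lock is held ... */
		spin_unlock(&obj->lock);
	}
	rcu_read_unlock();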
275
276static int
277xfs_iget_cache_miss(
278 struct xfs_mount *mp,
279 struct xfs_perag *pag,
280 xfs_trans_t *tp,
281 xfs_ino_t ino,
282 struct xfs_inode **ipp,
283 int flags,
284 int lock_flags)
285{
286 struct xfs_inode *ip;
287 int error;
288 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
289 int iflags;
290
291 ip = xfs_inode_alloc(mp, ino);
292 if (!ip)
293 return ENOMEM;
294
295 error = xfs_iread(mp, tp, ip, flags);
296 if (error)
297 goto out_destroy;
298
299 trace_xfs_iget_miss(ip);
300
301 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
302 error = ENOENT;
303 goto out_destroy;
304 }
305
306 /*
307 * Preload the radix tree so we can insert safely under the
308 * write spinlock. Note that we cannot sleep inside the preload
309 * region. Since we can be called from transaction context, don't
310 * recurse into the file system.
311 */
312 if (radix_tree_preload(GFP_NOFS)) {
313 error = EAGAIN;
314 goto out_destroy;
315 }
316
317 /*
318 * Because the inode hasn't been added to the radix-tree yet it can't
319 * be found by another thread, so we can do the non-sleeping lock here.
320 */
321 if (lock_flags) {
322 if (!xfs_ilock_nowait(ip, lock_flags))
323 BUG();
324 }
325
326 /*
327 * These values must be set before inserting the inode into the radix
328 * tree as the moment it is inserted a concurrent lookup (allowed by the
329 * RCU locking mechanism) can find it and that lookup must see that this
330 * is an inode currently under construction (i.e. that XFS_INEW is set).
331 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
332 * memory barrier that ensures this detection works correctly at lookup
333 * time.
334 */
335 iflags = XFS_INEW;
336 if (flags & XFS_IGET_DONTCACHE)
337 iflags |= XFS_IDONTCACHE;
338 ip->i_udquot = ip->i_gdquot = NULL;
339 xfs_iflags_set(ip, iflags);
340
341 /* insert the new inode */
342 spin_lock(&pag->pag_ici_lock);
343 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
344 if (unlikely(error)) {
345 WARN_ON(error != -EEXIST);
346 XFS_STATS_INC(xs_ig_dup);
347 error = EAGAIN;
348 goto out_preload_end;
349 }
350 spin_unlock(&pag->pag_ici_lock);
351 radix_tree_preload_end();
352
353 *ipp = ip;
354 return 0;
355
356out_preload_end:
357 spin_unlock(&pag->pag_ici_lock);
358 radix_tree_preload_end();
359 if (lock_flags)
360 xfs_iunlock(ip, lock_flags);
361out_destroy:
362 __destroy_inode(VFS_I(ip));
363 xfs_inode_free(ip);
364 return error;
365}
366
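The preload dance in xfs_iget_cache_miss() is the stock way to insert into a radix tree under a spinlock without sleeping: reserve the tree nodes first, while sleeping is still allowed, then do the insertion atomically. In isolation (a sketch; the lock and root names are placeholders, and errors are positive in this code's style):

	int error;

	if (radix_tree_preload(GFP_NOFS))	/* may sleep; fails on ENOMEM */
		return EAGAIN;

	spin_lock(&tree_lock);			/* no sleeping from here on */
	error = radix_tree_insert(&tree_root, index, item);
	spin_unlock(&tree_lock);
	radix_tree_preload_end();		/* drops the per-CPU node reserve */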
367/*
368 * Look up an inode by number in the given file system.
369 * The inode is looked up in the cache held in each AG.
370 * If the inode is found in the cache, initialise the vfs inode
371 * if necessary.
372 *
373 * If it is not in core, read it in from the file system's device,
374 * add it to the cache and initialise the vfs inode.
375 *
376 * The inode is locked according to the value of the lock_flags parameter.
377 * This flag parameter indicates how and if the inode's IO lock and inode lock
378 * should be taken.
379 *
380 * mp -- the mount point structure for the current file system. It points
381 * to the inode hash table.
382 * tp -- a pointer to the current transaction if there is one. This is
383 * simply passed through to the xfs_iread() call.
384 * ino -- the number of the inode desired. This is the unique identifier
385 * within the file system for the inode being requested.
386 * lock_flags -- flags indicating how to lock the inode. See the comment
387 * for xfs_ilock() for a list of valid values.
388 */
389int
390xfs_iget(
391 xfs_mount_t *mp,
392 xfs_trans_t *tp,
393 xfs_ino_t ino,
394 uint flags,
395 uint lock_flags,
396 xfs_inode_t **ipp)
397{
398 xfs_inode_t *ip;
399 int error;
400 xfs_perag_t *pag;
401 xfs_agino_t agino;
402
403 /*
404 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
405 * doesn't get freed while it's being referenced during a
406 * radix tree traversal here. It assumes this function
407 * acquires only the ILOCK (and therefore it has no need to
408 * involve the IOLOCK in this synchronization).
409 */
410 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
411
412 /* reject inode numbers outside existing AGs */
413 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
414 return EINVAL;
415
416 /* get the perag structure and ensure that it's inode capable */
417 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
418 agino = XFS_INO_TO_AGINO(mp, ino);
419
420again:
421 error = 0;
422 rcu_read_lock();
423 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
424
425 if (ip) {
426 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
427 if (error)
428 goto out_error_or_again;
429 } else {
430 rcu_read_unlock();
431 XFS_STATS_INC(xs_ig_missed);
432
433 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
434 flags, lock_flags);
435 if (error)
436 goto out_error_or_again;
437 }
438 xfs_perag_put(pag);
439
440 *ipp = ip;
441
442 /*
443 * If we have a real type for an on-disk inode, we can set ops(&unlock)
444 * now. If it's a new inode being created, xfs_ialloc will handle it.
445 */
446 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
447 xfs_setup_inode(ip);
448 return 0;
449
450out_error_or_again:
451 if (error == EAGAIN) {
452 delay(1);
453 goto again;
454 }
455 xfs_perag_put(pag);
456 return error;
457}
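For reference, a typical lookup through this interface takes a reference and an optional lock, then releases both when done. Roughly (a hedged sketch; error handling trimmed, and note that this code returns positive errnos):

	struct xfs_inode *ip;
	int error;

	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip);
	if (error)
		return error;

	/* ... read-only access covered by the shared ilock ... */

	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	IRELE(ip);	/* drop the reference the lookup took */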
43 458
44/* 459/*
45 * The inode lookup is done in batches to keep the amount of lock traffic and 460 * The inode lookup is done in batches to keep the amount of lock traffic and
@@ -101,8 +516,11 @@ xfs_inode_ag_walk(
101 struct xfs_mount *mp, 516 struct xfs_mount *mp,
102 struct xfs_perag *pag, 517 struct xfs_perag *pag,
103 int (*execute)(struct xfs_inode *ip, 518 int (*execute)(struct xfs_inode *ip,
104 struct xfs_perag *pag, int flags), 519 struct xfs_perag *pag, int flags,
105 int flags) 520 void *args),
521 int flags,
522 void *args,
523 int tag)
106{ 524{
107 uint32_t first_index; 525 uint32_t first_index;
108 int last_error = 0; 526 int last_error = 0;
@@ -121,9 +539,17 @@ restart:
121 int i; 539 int i;
122 540
123 rcu_read_lock(); 541 rcu_read_lock();
124 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 542
543 if (tag == -1)
544 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
125 (void **)batch, first_index, 545 (void **)batch, first_index,
126 XFS_LOOKUP_BATCH); 546 XFS_LOOKUP_BATCH);
547 else
548 nr_found = radix_tree_gang_lookup_tag(
549 &pag->pag_ici_root,
550 (void **) batch, first_index,
551 XFS_LOOKUP_BATCH, tag);
552
127 if (!nr_found) { 553 if (!nr_found) {
128 rcu_read_unlock(); 554 rcu_read_unlock();
129 break; 555 break;
@@ -164,7 +590,7 @@ restart:
164 for (i = 0; i < nr_found; i++) { 590 for (i = 0; i < nr_found; i++) {
165 if (!batch[i]) 591 if (!batch[i])
166 continue; 592 continue;
167 error = execute(batch[i], pag, flags); 593 error = execute(batch[i], pag, flags, args);
168 IRELE(batch[i]); 594 IRELE(batch[i]);
169 if (error == EAGAIN) { 595 if (error == EAGAIN) {
170 skipped++; 596 skipped++;
@@ -189,12 +615,40 @@ restart:
189 return last_error; 615 return last_error;
190} 616}
191 617
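The new tag argument lets one walker back both iterators: -1 requests a plain gang lookup over every inode in the AG, while a real tag value narrows each batch to inodes carrying that radix tree tag. The tagged variant of the batch fetch, in isolation (sketch):

	void *batch[XFS_LOOKUP_BATCH];
	int nr_found;

	nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
					      batch, first_index,
					      XFS_LOOKUP_BATCH,
					      XFS_ICI_EOFBLOCKS_TAG);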
618/*
619 * Background scanning to trim post-EOF preallocated space. This is queued
620 * based on the 'background_prealloc_discard_period' tunable (5m by default).
621 */
622STATIC void
623xfs_queue_eofblocks(
624 struct xfs_mount *mp)
625{
626 rcu_read_lock();
627 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
628 queue_delayed_work(mp->m_eofblocks_workqueue,
629 &mp->m_eofblocks_work,
630 msecs_to_jiffies(xfs_eofb_secs * 1000));
631 rcu_read_unlock();
632}
633
634void
635xfs_eofblocks_worker(
636 struct work_struct *work)
637{
638 struct xfs_mount *mp = container_of(to_delayed_work(work),
639 struct xfs_mount, m_eofblocks_work);
640 xfs_icache_free_eofblocks(mp, NULL);
641 xfs_queue_eofblocks(mp);
642}
643
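xfs_queue_eofblocks() and xfs_eofblocks_worker() form a self-rearming delayed work item: each pass trims once, then requeues itself only while the perag tree still carries the EOFBLOCKS tag, so the work goes idle for free when nothing is tagged. The general pattern (illustrative names, not part of the patch):

	static void my_worker(struct work_struct *work)
	{
		struct my_ctx *ctx = container_of(to_delayed_work(work),
						  struct my_ctx, work);

		do_one_pass(ctx);
		if (work_still_pending(ctx))	/* e.g. a radix tree tag test */
			queue_delayed_work(ctx->wq, &ctx->work, ctx->delay);
	}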
192int 644int
193xfs_inode_ag_iterator( 645xfs_inode_ag_iterator(
194 struct xfs_mount *mp, 646 struct xfs_mount *mp,
195 int (*execute)(struct xfs_inode *ip, 647 int (*execute)(struct xfs_inode *ip,
196 struct xfs_perag *pag, int flags), 648 struct xfs_perag *pag, int flags,
197 int flags) 649 void *args),
650 int flags,
651 void *args)
198{ 652{
199 struct xfs_perag *pag; 653 struct xfs_perag *pag;
200 int error = 0; 654 int error = 0;
@@ -204,7 +658,7 @@ xfs_inode_ag_iterator(
204 ag = 0; 658 ag = 0;
205 while ((pag = xfs_perag_get(mp, ag))) { 659 while ((pag = xfs_perag_get(mp, ag))) {
206 ag = pag->pag_agno + 1; 660 ag = pag->pag_agno + 1;
207 error = xfs_inode_ag_walk(mp, pag, execute, flags); 661 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
208 xfs_perag_put(pag); 662 xfs_perag_put(pag);
209 if (error) { 663 if (error) {
210 last_error = error; 664 last_error = error;
@@ -215,224 +669,50 @@ xfs_inode_ag_iterator(
215 return XFS_ERROR(last_error); 669 return XFS_ERROR(last_error);
216} 670}
217 671
218STATIC int
219xfs_sync_inode_data(
220 struct xfs_inode *ip,
221 struct xfs_perag *pag,
222 int flags)
223{
224 struct inode *inode = VFS_I(ip);
225 struct address_space *mapping = inode->i_mapping;
226 int error = 0;
227
228 if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
229 return 0;
230
231 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
232 if (flags & SYNC_TRYLOCK)
233 return 0;
234 xfs_ilock(ip, XFS_IOLOCK_SHARED);
235 }
236
237 error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
238 0 : XBF_ASYNC, FI_NONE);
239 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
240 return error;
241}
242
243/*
244 * Write out pagecache data for the whole filesystem.
245 */
246STATIC int
247xfs_sync_data(
248 struct xfs_mount *mp,
249 int flags)
250{
251 int error;
252
253 ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
254
255 error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
256 if (error)
257 return XFS_ERROR(error);
258
259 xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
260 return 0;
261}
262
263STATIC int
264xfs_sync_fsdata(
265 struct xfs_mount *mp)
266{
267 struct xfs_buf *bp;
268 int error;
269
270 /*
271 * If the buffer is pinned then push on the log so we won't get stuck
272 * waiting in the write for someone, maybe ourselves, to flush the log.
273 *
274 * Even though we just pushed the log above, we did not have the
275 * superblock buffer locked at that point so it can become pinned in
276 * between there and here.
277 */
278 bp = xfs_getsb(mp, 0);
279 if (xfs_buf_ispinned(bp))
280 xfs_log_force(mp, 0);
281 error = xfs_bwrite(bp);
282 xfs_buf_relse(bp);
283 return error;
284}
285
286/*
287 * When remounting a filesystem read-only or freezing the filesystem, we have
288 * two phases to execute. This first phase is syncing the data before we
289 * quiesce the filesystem, and the second is flushing all the inodes out after
290 * we've waited for all the transactions created by the first phase to
291 * complete. The second phase ensures that the inodes are written to their
292 * location on disk rather than just existing in transactions in the log. This
293 * means after a quiesce there is no log replay required to write the inodes to
294 * disk (this is the main difference between a sync and a quiesce).
295 */
296/*
297 * First stage of freeze - no writers will make progress now we are here,
298 * so we flush delwri and delalloc buffers here, then wait for all I/O to
299 * complete. Data is frozen at that point. Metadata is not frozen,
300 * transactions can still occur here so don't bother emptying the AIL
301 * because it'll just get dirty again.
302 */
303int 672int
304xfs_quiesce_data( 673xfs_inode_ag_iterator_tag(
305 struct xfs_mount *mp) 674 struct xfs_mount *mp,
306{ 675 int (*execute)(struct xfs_inode *ip,
307 int error, error2 = 0; 676 struct xfs_perag *pag, int flags,
308 677 void *args),
309 /* force out the log */ 678 int flags,
310 xfs_log_force(mp, XFS_LOG_SYNC); 679 void *args,
311 680 int tag)
312 /* write superblock and hoover up shutdown errors */
313 error = xfs_sync_fsdata(mp);
314
315 /* mark the log as covered if needed */
316 if (xfs_log_need_covered(mp))
317 error2 = xfs_fs_log_dummy(mp);
318
319 return error ? error : error2;
320}
321
322/*
323 * Second stage of a quiesce. The data is already synced, now we have to take
324 * care of the metadata. New transactions are already blocked, so we need to
325 * wait for any remaining transactions to drain out before proceeding.
326 */
327void
328xfs_quiesce_attr(
329 struct xfs_mount *mp)
330{
331 int error = 0;
332
333 /* wait for all modifications to complete */
334 while (atomic_read(&mp->m_active_trans) > 0)
335 delay(100);
336
337 /* reclaim inodes to do any IO before the freeze completes */
338 xfs_reclaim_inodes(mp, 0);
339 xfs_reclaim_inodes(mp, SYNC_WAIT);
340
341 /* flush all pending changes from the AIL */
342 xfs_ail_push_all_sync(mp->m_ail);
343
344 /*
345 * Just warn here till VFS can correctly support
346 * read-only remount without racing.
347 */
348 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
349
350 /* Push the superblock and write an unmount record */
351 error = xfs_log_sbcount(mp);
352 if (error)
353 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
354 "Frozen image may not be consistent.");
355 xfs_log_unmount_write(mp);
356
357 /*
358 * At this point we might have modified the superblock again and thus
359 * added an item to the AIL, thus flush it again.
360 */
361 xfs_ail_push_all_sync(mp->m_ail);
362
363 /*
364 * The superblock buffer is uncached and xfsaild_push() will lock and
365 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
366 * here but a lock on the superblock buffer will block until iodone()
367 * has completed.
368 */
369 xfs_buf_lock(mp->m_sb_bp);
370 xfs_buf_unlock(mp->m_sb_bp);
371}
372
373static void
374xfs_syncd_queue_sync(
375 struct xfs_mount *mp)
376{
377 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
378 msecs_to_jiffies(xfs_syncd_centisecs * 10));
379}
380
381/*
382 * Every sync period we need to unpin all items, reclaim inodes and sync
383 * disk quotas. We might need to cover the log to indicate that the
384 * filesystem is idle and not frozen.
385 */
386STATIC void
387xfs_sync_worker(
388 struct work_struct *work)
389{ 681{
390 struct xfs_mount *mp = container_of(to_delayed_work(work), 682 struct xfs_perag *pag;
391 struct xfs_mount, m_sync_work); 683 int error = 0;
392 int error; 684 int last_error = 0;
393 685 xfs_agnumber_t ag;
394 /*
395 * We shouldn't write/force the log if we are in the mount/unmount
396 * process or on a read only filesystem. The workqueue still needs to be
397 * active in both cases, however, because it is used for inode reclaim
398 * during these times. Use the MS_ACTIVE flag to avoid doing anything
399 * during mount. Doing work during unmount is avoided by calling
400 * cancel_delayed_work_sync on this work queue before tearing down
401 * the ail and the log in xfs_log_unmount.
402 */
403 if (!(mp->m_super->s_flags & MS_ACTIVE) &&
404 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
405 /* dgc: errors ignored here */
406 if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
407 xfs_log_need_covered(mp))
408 error = xfs_fs_log_dummy(mp);
409 else
410 xfs_log_force(mp, 0);
411 686
412 /* start pushing all the metadata that is currently 687 ag = 0;
413 * dirty */ 688 while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
414 xfs_ail_push_all(mp->m_ail); 689 ag = pag->pag_agno + 1;
690 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
691 xfs_perag_put(pag);
692 if (error) {
693 last_error = error;
694 if (error == EFSCORRUPTED)
695 break;
696 }
415 } 697 }
416 698 return XFS_ERROR(last_error);
417 /* queue us up again */
418 xfs_syncd_queue_sync(mp);
419} 699}
420 700
421/* 701/*
422 * Queue a new inode reclaim pass if there are reclaimable inodes and there 702 * Queue a new inode reclaim pass if there are reclaimable inodes and there
423 * isn't a reclaim pass already in progress. By default it runs every 5s based 703 * isn't a reclaim pass already in progress. By default it runs every 5s based
424 * on the xfs syncd work default of 30s. Perhaps this should have its own 704 * on the xfs periodic sync default of 30s. Perhaps this should have its own
425 * tunable, but that can be done if this method proves to be ineffective or too 705 * tunable, but that can be done if this method proves to be ineffective or too
426 * aggressive. 706 * aggressive.
427 */ 707 */
428static void 708static void
429xfs_syncd_queue_reclaim( 709xfs_reclaim_work_queue(
430 struct xfs_mount *mp) 710 struct xfs_mount *mp)
431{ 711{
432 712
433 rcu_read_lock(); 713 rcu_read_lock();
434 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 714 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
435 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, 715 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
436 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 716 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
437 } 717 }
438 rcu_read_unlock(); 718 rcu_read_unlock();
@@ -445,7 +725,7 @@ xfs_syncd_queue_reclaim(
445 * goes low. It scans as quickly as possible avoiding locked inodes or those 725 * goes low. It scans as quickly as possible avoiding locked inodes or those
446 * already being flushed, and once done schedules a future pass. 726 * already being flushed, and once done schedules a future pass.
447 */ 727 */
448STATIC void 728void
449xfs_reclaim_worker( 729xfs_reclaim_worker(
450 struct work_struct *work) 730 struct work_struct *work)
451{ 731{
@@ -453,65 +733,10 @@ xfs_reclaim_worker(
453 struct xfs_mount, m_reclaim_work); 733 struct xfs_mount, m_reclaim_work);
454 734
455 xfs_reclaim_inodes(mp, SYNC_TRYLOCK); 735 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
456 xfs_syncd_queue_reclaim(mp); 736 xfs_reclaim_work_queue(mp);
457} 737}
458 738
459/* 739static void
460 * Flush delayed allocate data, attempting to free up reserved space
461 * from existing allocations. At this point a new allocation attempt
462 * has failed with ENOSPC and we are in the process of scratching our
463 * heads, looking about for more room.
464 *
465 * Queue a new data flush if there isn't one already in progress and
466 * wait for completion of the flush. This means that we only ever have one
467 * inode flush in progress no matter how many ENOSPC events are occurring and
468 * so will prevent the system from bogging down due to every concurrent
469 * ENOSPC event scanning all the active inodes in the system for writeback.
470 */
471void
472xfs_flush_inodes(
473 struct xfs_inode *ip)
474{
475 struct xfs_mount *mp = ip->i_mount;
476
477 queue_work(xfs_syncd_wq, &mp->m_flush_work);
478 flush_work(&mp->m_flush_work);
479}
480
481STATIC void
482xfs_flush_worker(
483 struct work_struct *work)
484{
485 struct xfs_mount *mp = container_of(work,
486 struct xfs_mount, m_flush_work);
487
488 xfs_sync_data(mp, SYNC_TRYLOCK);
489 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
490}
491
492int
493xfs_syncd_init(
494 struct xfs_mount *mp)
495{
496 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
497 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
498 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
499
500 xfs_syncd_queue_sync(mp);
501
502 return 0;
503}
504
505void
506xfs_syncd_stop(
507 struct xfs_mount *mp)
508{
509 cancel_delayed_work_sync(&mp->m_sync_work);
510 cancel_delayed_work_sync(&mp->m_reclaim_work);
511 cancel_work_sync(&mp->m_flush_work);
512}
513
514void
515__xfs_inode_set_reclaim_tag( 740__xfs_inode_set_reclaim_tag(
516 struct xfs_perag *pag, 741 struct xfs_perag *pag,
517 struct xfs_inode *ip) 742 struct xfs_inode *ip)
@@ -529,7 +754,7 @@ __xfs_inode_set_reclaim_tag(
529 spin_unlock(&ip->i_mount->m_perag_lock); 754 spin_unlock(&ip->i_mount->m_perag_lock);
530 755
531 /* schedule periodic background inode reclaim */ 756 /* schedule periodic background inode reclaim */
532 xfs_syncd_queue_reclaim(ip->i_mount); 757 xfs_reclaim_work_queue(ip->i_mount);
533 758
534 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 759 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
535 -1, _RET_IP_); 760 -1, _RET_IP_);
@@ -577,7 +802,7 @@ __xfs_inode_clear_reclaim(
577 } 802 }
578} 803}
579 804
580void 805STATIC void
581__xfs_inode_clear_reclaim_tag( 806__xfs_inode_clear_reclaim_tag(
582 xfs_mount_t *mp, 807 xfs_mount_t *mp,
583 xfs_perag_t *pag, 808 xfs_perag_t *pag,
@@ -787,9 +1012,9 @@ out:
787 /* 1012 /*
788 * We could return EAGAIN here to make reclaim rescan the inode tree in 1013 * We could return EAGAIN here to make reclaim rescan the inode tree in
789 * a short while. However, this just burns CPU time scanning the tree 1014 * a short while. However, this just burns CPU time scanning the tree
790 * waiting for IO to complete and xfssyncd never goes back to the idle 1015 * waiting for IO to complete and the reclaim work never goes back to
791 * state. Instead, return 0 to let the next scheduled background reclaim 1016 * the idle state. Instead, return 0 to let the next scheduled
792 * attempt to reclaim the inode again. 1017 * background reclaim attempt to reclaim the inode again.
793 */ 1018 */
794 return 0; 1019 return 0;
795} 1020}
@@ -800,7 +1025,7 @@ out:
800 * then a shutdown during a filesystem unmount reclaim walk would leak all the 1025 * then a shutdown during a filesystem unmount reclaim walk would leak all the
801 * unreclaimed inodes. 1026 * unreclaimed inodes.
802 */ 1027 */
803int 1028STATIC int
804xfs_reclaim_inodes_ag( 1029xfs_reclaim_inodes_ag(
805 struct xfs_mount *mp, 1030 struct xfs_mount *mp,
806 int flags, 1031 int flags,
@@ -945,7 +1170,7 @@ xfs_reclaim_inodes_nr(
945 int nr_to_scan) 1170 int nr_to_scan)
946{ 1171{
947 /* kick background reclaimer and push the AIL */ 1172 /* kick background reclaimer and push the AIL */
948 xfs_syncd_queue_reclaim(mp); 1173 xfs_reclaim_work_queue(mp);
949 xfs_ail_push_all(mp->m_ail); 1174 xfs_ail_push_all(mp->m_ail);
950 1175
951 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 1176 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
@@ -971,3 +1196,146 @@ xfs_reclaim_inodes_count(
971 return reclaimable; 1196 return reclaimable;
972} 1197}
973 1198
1199STATIC int
1200xfs_inode_match_id(
1201 struct xfs_inode *ip,
1202 struct xfs_eofblocks *eofb)
1203{
1204 if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
1205 ip->i_d.di_uid != eofb->eof_uid)
1206 return 0;
1207
1208 if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
1209 ip->i_d.di_gid != eofb->eof_gid)
1210 return 0;
1211
1212 if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
1213 xfs_get_projid(ip) != eofb->eof_prid)
1214 return 0;
1215
1216 return 1;
1217}
1218
1219STATIC int
1220xfs_inode_free_eofblocks(
1221 struct xfs_inode *ip,
1222 struct xfs_perag *pag,
1223 int flags,
1224 void *args)
1225{
1226 int ret;
1227 struct xfs_eofblocks *eofb = args;
1228
1229 if (!xfs_can_free_eofblocks(ip, false)) {
1230 /* inode could be preallocated or append-only */
1231 trace_xfs_inode_free_eofblocks_invalid(ip);
1232 xfs_inode_clear_eofblocks_tag(ip);
1233 return 0;
1234 }
1235
1236 /*
1237 * If the mapping is dirty the operation can block and wait for some
1238 * time. Unless we are waiting, skip it.
1239 */
1240 if (!(flags & SYNC_WAIT) &&
1241 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1242 return 0;
1243
1244 if (eofb) {
1245 if (!xfs_inode_match_id(ip, eofb))
1246 return 0;
1247
1248 /* skip the inode if the file size is too small */
1249 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1250 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1251 return 0;
1252 }
1253
1254 ret = xfs_free_eofblocks(ip->i_mount, ip, true);
1255
1256 /* don't revisit the inode if we're not waiting */
1257 if (ret == EAGAIN && !(flags & SYNC_WAIT))
1258 ret = 0;
1259
1260 return ret;
1261}
1262
1263int
1264xfs_icache_free_eofblocks(
1265 struct xfs_mount *mp,
1266 struct xfs_eofblocks *eofb)
1267{
1268 int flags = SYNC_TRYLOCK;
1269
1270 if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
1271 flags = SYNC_WAIT;
1272
1273 return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
1274 eofb, XFS_ICI_EOFBLOCKS_TAG);
1275}
1276
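A caller wanting a synchronous, filtered trim fills in struct xfs_eofblocks first; the eof_* fields below are the ones xfs_inode_match_id() and xfs_inode_free_eofblocks() consult above (a sketch of a hypothetical caller):

	struct xfs_eofblocks eofb = { 0 };
	int error;

	eofb.eof_flags = XFS_EOF_FLAGS_SYNC | XFS_EOF_FLAGS_UID |
			 XFS_EOF_FLAGS_MINFILESIZE;
	eofb.eof_uid = uid;			/* only this owner's inodes */
	eofb.eof_min_file_size = 1024 * 1024;	/* skip files under 1 MiB */

	error = xfs_icache_free_eofblocks(mp, &eofb);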
1277void
1278xfs_inode_set_eofblocks_tag(
1279 xfs_inode_t *ip)
1280{
1281 struct xfs_mount *mp = ip->i_mount;
1282 struct xfs_perag *pag;
1283 int tagged;
1284
1285 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1286 spin_lock(&pag->pag_ici_lock);
1287 trace_xfs_inode_set_eofblocks_tag(ip);
1288
1289 tagged = radix_tree_tagged(&pag->pag_ici_root,
1290 XFS_ICI_EOFBLOCKS_TAG);
1291 radix_tree_tag_set(&pag->pag_ici_root,
1292 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1293 XFS_ICI_EOFBLOCKS_TAG);
1294 if (!tagged) {
1295 /* propagate the eofblocks tag up into the perag radix tree */
1296 spin_lock(&ip->i_mount->m_perag_lock);
1297 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
1298 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1299 XFS_ICI_EOFBLOCKS_TAG);
1300 spin_unlock(&ip->i_mount->m_perag_lock);
1301
1302 /* kick off background trimming */
1303 xfs_queue_eofblocks(ip->i_mount);
1304
1305 trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
1306 -1, _RET_IP_);
1307 }
1308
1309 spin_unlock(&pag->pag_ici_lock);
1310 xfs_perag_put(pag);
1311}
1312
1313void
1314xfs_inode_clear_eofblocks_tag(
1315 xfs_inode_t *ip)
1316{
1317 struct xfs_mount *mp = ip->i_mount;
1318 struct xfs_perag *pag;
1319
1320 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
1321 spin_lock(&pag->pag_ici_lock);
1322 trace_xfs_inode_clear_eofblocks_tag(ip);
1323
1324 radix_tree_tag_clear(&pag->pag_ici_root,
1325 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1326 XFS_ICI_EOFBLOCKS_TAG);
1327 if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
1328 /* clear the eofblocks tag from the perag radix tree */
1329 spin_lock(&ip->i_mount->m_perag_lock);
1330 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
1331 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1332 XFS_ICI_EOFBLOCKS_TAG);
1333 spin_unlock(&ip->i_mount->m_perag_lock);
1334 trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
1335 -1, _RET_IP_);
1336 }
1337
1338 spin_unlock(&pag->pag_ici_lock);
1339 xfs_perag_put(pag);
1340}
1341
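Both helpers above maintain a two-level invariant: an AG's entry in m_perag_tree carries XFS_ICI_EOFBLOCKS_TAG exactly when at least one inode in that AG's pag_ici_root does, which is what lets xfs_perag_get_tag() skip untagged AGs cheaply. Reduced to the propagation step, with the locking elided (sketch):

	/* inner level: tag the inode within its AG */
	radix_tree_tag_set(&pag->pag_ici_root, agino, XFS_ICI_EOFBLOCKS_TAG);

	/* outer level: tag the AG itself the first time only */
	if (!was_already_tagged)
		radix_tree_tag_set(&mp->m_perag_tree, agno,
				   XFS_ICI_EOFBLOCKS_TAG);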
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_icache.h
index 941202e7ac6e..e0f138c70a2f 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_icache.h
@@ -24,28 +24,30 @@ struct xfs_perag;
24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
26 26
27extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ 27int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
28 uint flags, uint lock_flags, xfs_inode_t **ipp);
28 29
29int xfs_syncd_init(struct xfs_mount *mp); 30void xfs_reclaim_worker(struct work_struct *work);
30void xfs_syncd_stop(struct xfs_mount *mp);
31
32int xfs_quiesce_data(struct xfs_mount *mp);
33void xfs_quiesce_attr(struct xfs_mount *mp);
34
35void xfs_flush_inodes(struct xfs_inode *ip);
36 31
37int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); 32int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
38int xfs_reclaim_inodes_count(struct xfs_mount *mp); 33int xfs_reclaim_inodes_count(struct xfs_mount *mp);
39void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); 34void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
40 35
41void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 36void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
42void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); 37
43void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 38void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
44 struct xfs_inode *ip); 39void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
40int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
41void xfs_eofblocks_worker(struct work_struct *);
45 42
46int xfs_sync_inode_grab(struct xfs_inode *ip); 43int xfs_sync_inode_grab(struct xfs_inode *ip);
47int xfs_inode_ag_iterator(struct xfs_mount *mp, 44int xfs_inode_ag_iterator(struct xfs_mount *mp,
48 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), 45 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
49 int flags); 46 int flags, void *args),
47 int flags, void *args);
48int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
49 int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
50 int flags, void *args),
51 int flags, void *args, int tag);
50 52
51#endif 53#endif
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
deleted file mode 100644
index 784a803383ec..000000000000
--- a/fs/xfs/xfs_iget.c
+++ /dev/null
@@ -1,705 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_acl.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_bmap_btree.h"
29#include "xfs_alloc_btree.h"
30#include "xfs_ialloc_btree.h"
31#include "xfs_dinode.h"
32#include "xfs_inode.h"
33#include "xfs_btree.h"
34#include "xfs_ialloc.h"
35#include "xfs_quota.h"
36#include "xfs_utils.h"
37#include "xfs_trans_priv.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
40#include "xfs_trace.h"
41
42
43/*
44 * Allocate and initialise an xfs_inode.
45 */
46STATIC struct xfs_inode *
47xfs_inode_alloc(
48 struct xfs_mount *mp,
49 xfs_ino_t ino)
50{
51 struct xfs_inode *ip;
52
53 /*
54 * if this didn't occur in transactions, we could use
55 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
56 * code up to do this anyway.
57 */
58 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
59 if (!ip)
60 return NULL;
61 if (inode_init_always(mp->m_super, VFS_I(ip))) {
62 kmem_zone_free(xfs_inode_zone, ip);
63 return NULL;
64 }
65
66 ASSERT(atomic_read(&ip->i_pincount) == 0);
67 ASSERT(!spin_is_locked(&ip->i_flags_lock));
68 ASSERT(!xfs_isiflocked(ip));
69 ASSERT(ip->i_ino == 0);
70
71 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
72
73 /* initialise the xfs inode */
74 ip->i_ino = ino;
75 ip->i_mount = mp;
76 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
77 ip->i_afp = NULL;
78 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
79 ip->i_flags = 0;
80 ip->i_delayed_blks = 0;
81 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
82
83 return ip;
84}
85
86STATIC void
87xfs_inode_free_callback(
88 struct rcu_head *head)
89{
90 struct inode *inode = container_of(head, struct inode, i_rcu);
91 struct xfs_inode *ip = XFS_I(inode);
92
93 kmem_zone_free(xfs_inode_zone, ip);
94}
95
96void
97xfs_inode_free(
98 struct xfs_inode *ip)
99{
100 switch (ip->i_d.di_mode & S_IFMT) {
101 case S_IFREG:
102 case S_IFDIR:
103 case S_IFLNK:
104 xfs_idestroy_fork(ip, XFS_DATA_FORK);
105 break;
106 }
107
108 if (ip->i_afp)
109 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
110
111 if (ip->i_itemp) {
112 ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
113 xfs_inode_item_destroy(ip);
114 ip->i_itemp = NULL;
115 }
116
117 /* asserts to verify all state is correct here */
118 ASSERT(atomic_read(&ip->i_pincount) == 0);
119 ASSERT(!spin_is_locked(&ip->i_flags_lock));
120 ASSERT(!xfs_isiflocked(ip));
121
122 /*
123 * Because we use RCU freeing we need to ensure the inode always
124 * appears to be reclaimed with an invalid inode number when in the
125 * free state. The ip->i_flags_lock provides the barrier against lookup
126 * races.
127 */
128 spin_lock(&ip->i_flags_lock);
129 ip->i_flags = XFS_IRECLAIM;
130 ip->i_ino = 0;
131 spin_unlock(&ip->i_flags_lock);
132
133 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
134}
135
136/*
137 * Check the validity of the inode we just found in the cache
138 */
139static int
140xfs_iget_cache_hit(
141 struct xfs_perag *pag,
142 struct xfs_inode *ip,
143 xfs_ino_t ino,
144 int flags,
145 int lock_flags) __releases(RCU)
146{
147 struct inode *inode = VFS_I(ip);
148 struct xfs_mount *mp = ip->i_mount;
149 int error;
150
151 /*
152 * check for re-use of an inode within an RCU grace period due to the
153 * radix tree nodes not being updated yet. We monitor for this by
154 * setting the inode number to zero before freeing the inode structure.
155 * If the inode has been reallocated and set up, then the inode number
156 * will not match, so check for that, too.
157 */
158 spin_lock(&ip->i_flags_lock);
159 if (ip->i_ino != ino) {
160 trace_xfs_iget_skip(ip);
161 XFS_STATS_INC(xs_ig_frecycle);
162 error = EAGAIN;
163 goto out_error;
164 }
165
166
167 /*
168 * If we are racing with another cache hit that is currently
169 * instantiating this inode or currently recycling it out of
170 * reclaimable state, wait for the initialisation to complete
171 * before continuing.
172 *
173 * XXX(hch): eventually we should do something equivalent to
174 * wait_on_inode to wait for these flags to be cleared
175 * instead of polling for it.
176 */
177 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
178 trace_xfs_iget_skip(ip);
179 XFS_STATS_INC(xs_ig_frecycle);
180 error = EAGAIN;
181 goto out_error;
182 }
183
184 /*
185 * If lookup is racing with unlink, return an error immediately.
186 */
187 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
188 error = ENOENT;
189 goto out_error;
190 }
191
192 /*
193 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
194 * Need to carefully get it back into usable state.
195 */
196 if (ip->i_flags & XFS_IRECLAIMABLE) {
197 trace_xfs_iget_reclaim(ip);
198
199 /*
200 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
201 * from stomping over us while we recycle the inode. We can't
202 * clear the radix tree reclaimable tag yet as it requires
203 * pag_ici_lock to be held exclusive.
204 */
205 ip->i_flags |= XFS_IRECLAIM;
206
207 spin_unlock(&ip->i_flags_lock);
208 rcu_read_unlock();
209
210 error = -inode_init_always(mp->m_super, inode);
211 if (error) {
212 /*
213 * Re-initializing the inode failed, and we are in deep
214 * trouble. Try to re-add it to the reclaim list.
215 */
216 rcu_read_lock();
217 spin_lock(&ip->i_flags_lock);
218
219 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
220 ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
221 trace_xfs_iget_reclaim_fail(ip);
222 goto out_error;
223 }
224
225 spin_lock(&pag->pag_ici_lock);
226 spin_lock(&ip->i_flags_lock);
227
228 /*
229 * Clear the per-lifetime state in the inode as we are now
230 * effectively a new inode and need to return to the initial
231 * state before reuse occurs.
232 */
233 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
234 ip->i_flags |= XFS_INEW;
235 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
236 inode->i_state = I_NEW;
237
238 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
239 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
240
241 spin_unlock(&ip->i_flags_lock);
242 spin_unlock(&pag->pag_ici_lock);
243 } else {
244 /* If the VFS inode is being torn down, pause and try again. */
245 if (!igrab(inode)) {
246 trace_xfs_iget_skip(ip);
247 error = EAGAIN;
248 goto out_error;
249 }
250
251 /* We've got a live one. */
252 spin_unlock(&ip->i_flags_lock);
253 rcu_read_unlock();
254 trace_xfs_iget_hit(ip);
255 }
256
257 if (lock_flags != 0)
258 xfs_ilock(ip, lock_flags);
259
260 xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
261 XFS_STATS_INC(xs_ig_found);
262
263 return 0;
264
265out_error:
266 spin_unlock(&ip->i_flags_lock);
267 rcu_read_unlock();
268 return error;
269}
270
271
272static int
273xfs_iget_cache_miss(
274 struct xfs_mount *mp,
275 struct xfs_perag *pag,
276 xfs_trans_t *tp,
277 xfs_ino_t ino,
278 struct xfs_inode **ipp,
279 int flags,
280 int lock_flags)
281{
282 struct xfs_inode *ip;
283 int error;
284 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
285 int iflags;
286
287 ip = xfs_inode_alloc(mp, ino);
288 if (!ip)
289 return ENOMEM;
290
291 error = xfs_iread(mp, tp, ip, flags);
292 if (error)
293 goto out_destroy;
294
295 trace_xfs_iget_miss(ip);
296
297 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
298 error = ENOENT;
299 goto out_destroy;
300 }
301
302 /*
303 * Preload the radix tree so we can insert safely under the
304 * write spinlock. Note that we cannot sleep inside the preload
305 * region. Since we can be called from transaction context, don't
306 * recurse into the file system.
307 */
308 if (radix_tree_preload(GFP_NOFS)) {
309 error = EAGAIN;
310 goto out_destroy;
311 }
312
313 /*
314 * Because the inode hasn't been added to the radix-tree yet it can't
315 * be found by another thread, so we can do the non-sleeping lock here.
316 */
317 if (lock_flags) {
318 if (!xfs_ilock_nowait(ip, lock_flags))
319 BUG();
320 }
321
322 /*
323 * These values must be set before inserting the inode into the radix
324 * tree as the moment it is inserted a concurrent lookup (allowed by the
325 * RCU locking mechanism) can find it and that lookup must see that this
326 * is an inode currently under construction (i.e. that XFS_INEW is set).
327 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
328 * memory barrier that ensures this detection works correctly at lookup
329 * time.
330 */
331 iflags = XFS_INEW;
332 if (flags & XFS_IGET_DONTCACHE)
333 iflags |= XFS_IDONTCACHE;
334 ip->i_udquot = ip->i_gdquot = NULL;
335 xfs_iflags_set(ip, iflags);
336
337 /* insert the new inode */
338 spin_lock(&pag->pag_ici_lock);
339 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
340 if (unlikely(error)) {
341 WARN_ON(error != -EEXIST);
342 XFS_STATS_INC(xs_ig_dup);
343 error = EAGAIN;
344 goto out_preload_end;
345 }
346 spin_unlock(&pag->pag_ici_lock);
347 radix_tree_preload_end();
348
349 *ipp = ip;
350 return 0;
351
352out_preload_end:
353 spin_unlock(&pag->pag_ici_lock);
354 radix_tree_preload_end();
355 if (lock_flags)
356 xfs_iunlock(ip, lock_flags);
357out_destroy:
358 __destroy_inode(VFS_I(ip));
359 xfs_inode_free(ip);
360 return error;
361}
362
363/*
364 * Look up an inode by number in the given file system.
365 * The inode is looked up in the cache held in each AG.
366 * If the inode is found in the cache, initialise the vfs inode
367 * if necessary.
368 *
369 * If it is not in core, read it in from the file system's device,
370 * add it to the cache and initialise the vfs inode.
371 *
372 * The inode is locked according to the value of the lock_flags parameter.
373 * This flag parameter indicates how and if the inode's IO lock and inode lock
374 * should be taken.
375 *
376 * mp -- the mount point structure for the current file system. It points
377 * to the inode hash table.
378 * tp -- a pointer to the current transaction if there is one. This is
379 * simply passed through to the xfs_iread() call.
380 * ino -- the number of the inode desired. This is the unique identifier
381 * within the file system for the inode being requested.
382 * lock_flags -- flags indicating how to lock the inode. See the comment
383 * for xfs_ilock() for a list of valid values.
384 */
385int
386xfs_iget(
387 xfs_mount_t *mp,
388 xfs_trans_t *tp,
389 xfs_ino_t ino,
390 uint flags,
391 uint lock_flags,
392 xfs_inode_t **ipp)
393{
394 xfs_inode_t *ip;
395 int error;
396 xfs_perag_t *pag;
397 xfs_agino_t agino;
398
399 /*
400 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
401 * doesn't get freed while it's being referenced during a
402 * radix tree traversal here. It assumes this function
403 * acquires only the ILOCK (and therefore it has no need to
404 * involve the IOLOCK in this synchronization).
405 */
406 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
407
408 /* reject inode numbers outside existing AGs */
409 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
410 return EINVAL;
411
412 /* get the perag structure and ensure that it's inode capable */
413 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
414 agino = XFS_INO_TO_AGINO(mp, ino);
415
416again:
417 error = 0;
418 rcu_read_lock();
419 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
420
421 if (ip) {
422 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
423 if (error)
424 goto out_error_or_again;
425 } else {
426 rcu_read_unlock();
427 XFS_STATS_INC(xs_ig_missed);
428
429 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
430 flags, lock_flags);
431 if (error)
432 goto out_error_or_again;
433 }
434 xfs_perag_put(pag);
435
436 *ipp = ip;
437
438 /*
439 * If we have a real type for an on-disk inode, we can set ops(&unlock)
440 * now. If it's a new inode being created, xfs_ialloc will handle it.
441 */
442 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
443 xfs_setup_inode(ip);
444 return 0;
445
446out_error_or_again:
447 if (error == EAGAIN) {
448 delay(1);
449 goto again;
450 }
451 xfs_perag_put(pag);
452 return error;
453}
454
455/*
456 * This is a wrapper routine around the xfs_ilock() routine
457 * used to centralize some grungy code. It is used in places
458 * that wish to lock the inode solely for reading the extents.
459 * The reason these places can't just call xfs_ilock(SHARED)
460 * is that the inode lock also guards the bringing in of the
461 * extents from disk for a file in b-tree format. If the inode
462 * is in b-tree format, then we need to lock the inode exclusively
463 * until the extents are read in. Locking it exclusively all
464 * the time would limit our parallelism unnecessarily, though.
465 * What we do instead is check to see if the extents have been
466 * read in yet, and only lock the inode exclusively if they
467 * have not.
468 *
469 * The function returns a value which should be given to the
470 * corresponding xfs_iunlock_map_shared(). This value is
471 * the mode in which the lock was actually taken.
472 */
473uint
474xfs_ilock_map_shared(
475 xfs_inode_t *ip)
476{
477 uint lock_mode;
478
479 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
480 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
481 lock_mode = XFS_ILOCK_EXCL;
482 } else {
483 lock_mode = XFS_ILOCK_SHARED;
484 }
485
486 xfs_ilock(ip, lock_mode);
487
488 return lock_mode;
489}
490
491/*
492 * This is simply the unlock routine to go with xfs_ilock_map_shared().
493 * All it does is call xfs_iunlock() with the given lock_mode.
494 */
495void
496xfs_iunlock_map_shared(
497 xfs_inode_t *ip,
498 unsigned int lock_mode)
499{
500 xfs_iunlock(ip, lock_mode);
501}
502
503/*
504 * The xfs inode contains 2 locks: a multi-reader lock called the
505 * i_iolock and a multi-reader lock called the i_lock. This routine
506 * allows either or both of the locks to be obtained.
507 *
508 * The 2 locks should always be ordered so that the IO lock is
509 * obtained first in order to prevent deadlock.
510 *
511 * ip -- the inode being locked
512 * lock_flags -- this parameter indicates the inode's locks
513 * to be locked. It can be:
514 * XFS_IOLOCK_SHARED,
515 * XFS_IOLOCK_EXCL,
516 * XFS_ILOCK_SHARED,
517 * XFS_ILOCK_EXCL,
518 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
519 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
520 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
521 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
522 */
523void
524xfs_ilock(
525 xfs_inode_t *ip,
526 uint lock_flags)
527{
528 /*
529 * You can't set both SHARED and EXCL for the same lock,
530 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
531 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
532 */
533 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
534 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
535 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
536 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
537 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
538
539 if (lock_flags & XFS_IOLOCK_EXCL)
540 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
541 else if (lock_flags & XFS_IOLOCK_SHARED)
542 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
543
544 if (lock_flags & XFS_ILOCK_EXCL)
545 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
546 else if (lock_flags & XFS_ILOCK_SHARED)
547 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
548
549 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
550}
551
552/*
553 * This is just like xfs_ilock(), except that the caller
554 * is guaranteed not to sleep. It returns 1 if it gets
555 * the requested locks and 0 otherwise. If the IO lock is
556 * obtained but the inode lock cannot be, then the IO lock
557 * is dropped before returning.
558 *
559 * ip -- the inode being locked
560 * lock_flags -- this parameter indicates the inode's locks
561 * to be locked. See the comment for xfs_ilock() for a list
562 * of valid values.
563 */
564int
565xfs_ilock_nowait(
566 xfs_inode_t *ip,
567 uint lock_flags)
568{
569 /*
570 * You can't set both SHARED and EXCL for the same lock,
571 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
572 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
573 */
574 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
575 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
576 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
577 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
578 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
579
580 if (lock_flags & XFS_IOLOCK_EXCL) {
581 if (!mrtryupdate(&ip->i_iolock))
582 goto out;
583 } else if (lock_flags & XFS_IOLOCK_SHARED) {
584 if (!mrtryaccess(&ip->i_iolock))
585 goto out;
586 }
587 if (lock_flags & XFS_ILOCK_EXCL) {
588 if (!mrtryupdate(&ip->i_lock))
589 goto out_undo_iolock;
590 } else if (lock_flags & XFS_ILOCK_SHARED) {
591 if (!mrtryaccess(&ip->i_lock))
592 goto out_undo_iolock;
593 }
594 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
595 return 1;
596
597 out_undo_iolock:
598 if (lock_flags & XFS_IOLOCK_EXCL)
599 mrunlock_excl(&ip->i_iolock);
600 else if (lock_flags & XFS_IOLOCK_SHARED)
601 mrunlock_shared(&ip->i_iolock);
602 out:
603 return 0;
604}
605
606/*
607 * xfs_iunlock() is used to drop the inode locks acquired with
608 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
609 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
610 * that we know which locks to drop.
611 *
612 * ip -- the inode being unlocked
613 * lock_flags -- this parameter indicates the inode's locks
614 * to be unlocked. See the comment for xfs_ilock() for a list
615 * of valid values for this parameter.
616 *
617 */
618void
619xfs_iunlock(
620 xfs_inode_t *ip,
621 uint lock_flags)
622{
623 /*
624 * You can't set both SHARED and EXCL for the same lock,
625 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
626 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
627 */
628 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
629 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
630 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
631 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
632 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
633 ASSERT(lock_flags != 0);
634
635 if (lock_flags & XFS_IOLOCK_EXCL)
636 mrunlock_excl(&ip->i_iolock);
637 else if (lock_flags & XFS_IOLOCK_SHARED)
638 mrunlock_shared(&ip->i_iolock);
639
640 if (lock_flags & XFS_ILOCK_EXCL)
641 mrunlock_excl(&ip->i_lock);
642 else if (lock_flags & XFS_ILOCK_SHARED)
643 mrunlock_shared(&ip->i_lock);
644
645 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
646}
647
648/*
649 * Give up write locks. The i/o lock cannot be held nested
650 * if it is being demoted.
651 */
652void
653xfs_ilock_demote(
654 xfs_inode_t *ip,
655 uint lock_flags)
656{
657 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
658 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
659
660 if (lock_flags & XFS_ILOCK_EXCL)
661 mrdemote(&ip->i_lock);
662 if (lock_flags & XFS_IOLOCK_EXCL)
663 mrdemote(&ip->i_iolock);
664
665 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
666}
667
668#ifdef DEBUG
669int
670xfs_isilocked(
671 xfs_inode_t *ip,
672 uint lock_flags)
673{
674 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
675 if (!(lock_flags & XFS_ILOCK_SHARED))
676 return !!ip->i_lock.mr_writer;
677 return rwsem_is_locked(&ip->i_lock.mr_lock);
678 }
679
680 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
681 if (!(lock_flags & XFS_IOLOCK_SHARED))
682 return !!ip->i_iolock.mr_writer;
683 return rwsem_is_locked(&ip->i_iolock.mr_lock);
684 }
685
686 ASSERT(0);
687 return 0;
688}
689#endif
690
691void
692__xfs_iflock(
693 struct xfs_inode *ip)
694{
695 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
696 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
697
698 do {
699 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
700 if (xfs_isiflocked(ip))
701 io_schedule();
702 } while (!xfs_iflock_nowait(ip));
703
704 finish_wait(wq, &wait.wait);
705}
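__xfs_iflock() above is an open-coded exclusive wait on a single flag bit that doubles as a lock. The same shape, written against a plain bit in a word (a generic sketch, not the XFS code; word and MY_BIT are placeholders):

	wait_queue_head_t *wq = bit_waitqueue(&word, MY_BIT);
	DEFINE_WAIT_BIT(wait, &word, MY_BIT);

	do {
		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
		if (test_bit(MY_BIT, &word))	/* still held; go to sleep */
			io_schedule();
	} while (test_and_set_bit_lock(MY_BIT, &word));	/* loop until acquired */

	finish_wait(wq, &wait.wait);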
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1938b41ee9f5..66282dcb821b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -45,6 +45,7 @@
45#include "xfs_filestream.h" 45#include "xfs_filestream.h"
46#include "xfs_vnodeops.h" 46#include "xfs_vnodeops.h"
47#include "xfs_trace.h" 47#include "xfs_trace.h"
48#include "xfs_icache.h"
48 49
49kmem_zone_t *xfs_ifork_zone; 50kmem_zone_t *xfs_ifork_zone;
50kmem_zone_t *xfs_inode_zone; 51kmem_zone_t *xfs_inode_zone;
@@ -74,6 +75,256 @@ xfs_get_extsz_hint(
74 return 0; 75 return 0;
75} 76}
76 77
78/*
79 * This is a wrapper routine around the xfs_ilock() routine used to centralize
80 * some grungy code. It is used in places that wish to lock the inode solely
81 * for reading the extents. The reason these places can't just call
82 * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
83 * extents from disk for a file in b-tree format. If the inode is in b-tree
84 * format, then we need to lock the inode exclusively until the extents are read
85 * in. Locking it exclusively all the time would limit our parallelism
86 * unnecessarily, though. What we do instead is check to see if the extents
87 * have been read in yet, and only lock the inode exclusively if they have not.
88 *
89 * The function returns a value which should be given to the corresponding
90 * xfs_iunlock_map_shared(). This value is the mode in which the lock was
91 * actually taken.
92 */
93uint
94xfs_ilock_map_shared(
95 xfs_inode_t *ip)
96{
97 uint lock_mode;
98
99 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
100 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
101 lock_mode = XFS_ILOCK_EXCL;
102 } else {
103 lock_mode = XFS_ILOCK_SHARED;
104 }
105
106 xfs_ilock(ip, lock_mode);
107
108 return lock_mode;
109}
110
111/*
112 * This is simply the unlock routine to go with xfs_ilock_map_shared().
113 * All it does is call xfs_iunlock() with the given lock_mode.
114 */
115void
116xfs_iunlock_map_shared(
117 xfs_inode_t *ip,
118 unsigned int lock_mode)
119{
120 xfs_iunlock(ip, lock_mode);
121}
122
123/*
124 * The xfs inode contains 2 locks: a multi-reader lock called the
125 * i_iolock and a multi-reader lock called the i_lock. This routine
126 * allows either or both of the locks to be obtained.
127 *
128 * The 2 locks should always be ordered so that the IO lock is
129 * obtained first in order to prevent deadlock.
130 *
131 * ip -- the inode being locked
132 * lock_flags -- this parameter indicates the inode's locks
133 * to be locked. It can be:
134 * XFS_IOLOCK_SHARED,
135 * XFS_IOLOCK_EXCL,
136 * XFS_ILOCK_SHARED,
137 * XFS_ILOCK_EXCL,
138 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
139 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
140 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
141 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
142 */
143void
144xfs_ilock(
145 xfs_inode_t *ip,
146 uint lock_flags)
147{
148 trace_xfs_ilock(ip, lock_flags, _RET_IP_);
149
150 /*
151 * You can't set both SHARED and EXCL for the same lock,
152 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
153 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
154 */
155 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
156 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
157 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
158 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
159 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
160
161 if (lock_flags & XFS_IOLOCK_EXCL)
162 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
163 else if (lock_flags & XFS_IOLOCK_SHARED)
164 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
165
166 if (lock_flags & XFS_ILOCK_EXCL)
167 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
168 else if (lock_flags & XFS_ILOCK_SHARED)
169 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
170}
171
172/*
173 * This is just like xfs_ilock(), except that the caller
174 * is guaranteed not to sleep. It returns 1 if it gets
175 * the requested locks and 0 otherwise. If the IO lock is
176 * obtained but the inode lock cannot be, then the IO lock
177 * is dropped before returning.
178 *
179 * ip -- the inode being locked
180 * lock_flags -- this parameter indicates the inode's locks
181 * to be locked. See the comment for xfs_ilock() for a list
182 * of valid values.
183 */
184int
185xfs_ilock_nowait(
186 xfs_inode_t *ip,
187 uint lock_flags)
188{
189 trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
190
191 /*
192 * You can't set both SHARED and EXCL for the same lock,
193 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
194 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
195 */
196 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
197 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
198 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
199 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
200 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
201
202 if (lock_flags & XFS_IOLOCK_EXCL) {
203 if (!mrtryupdate(&ip->i_iolock))
204 goto out;
205 } else if (lock_flags & XFS_IOLOCK_SHARED) {
206 if (!mrtryaccess(&ip->i_iolock))
207 goto out;
208 }
209 if (lock_flags & XFS_ILOCK_EXCL) {
210 if (!mrtryupdate(&ip->i_lock))
211 goto out_undo_iolock;
212 } else if (lock_flags & XFS_ILOCK_SHARED) {
213 if (!mrtryaccess(&ip->i_lock))
214 goto out_undo_iolock;
215 }
216 return 1;
217
218 out_undo_iolock:
219 if (lock_flags & XFS_IOLOCK_EXCL)
220 mrunlock_excl(&ip->i_iolock);
221 else if (lock_flags & XFS_IOLOCK_SHARED)
222 mrunlock_shared(&ip->i_iolock);
223 out:
224 return 0;
225}
226
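(Editor's sketch, hedged: the usual trylock-with-fallback pattern for these helpers, shown as a fragment. `can_block` is a hypothetical caller-defined condition, not part of the patch.)

	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)) {
		if (!can_block)
			return EAGAIN;	/* caller must not sleep on the lock */
		xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
	}
	/* ... both locks held here ... */
	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);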
227/*
228 * xfs_iunlock() is used to drop the inode locks acquired with
229 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
230 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
231 * that we know which locks to drop.
232 *
233 * ip -- the inode being unlocked
234 * lock_flags -- this parameter indicates the inode's locks
235 * to be unlocked. See the comment for xfs_ilock() for a list
236 * of valid values for this parameter.
237 *
238 */
239void
240xfs_iunlock(
241 xfs_inode_t *ip,
242 uint lock_flags)
243{
244 /*
245 * You can't set both SHARED and EXCL for the same lock,
246 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
247 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
248 */
249 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
250 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
251 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
252 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
253 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
254 ASSERT(lock_flags != 0);
255
256 if (lock_flags & XFS_IOLOCK_EXCL)
257 mrunlock_excl(&ip->i_iolock);
258 else if (lock_flags & XFS_IOLOCK_SHARED)
259 mrunlock_shared(&ip->i_iolock);
260
261 if (lock_flags & XFS_ILOCK_EXCL)
262 mrunlock_excl(&ip->i_lock);
263 else if (lock_flags & XFS_ILOCK_SHARED)
264 mrunlock_shared(&ip->i_lock);
265
266 trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
267}
268
269/*
270 * Give up write locks. The i/o lock cannot be held nested
271 * if it is being demoted.
272 */
273void
274xfs_ilock_demote(
275 xfs_inode_t *ip,
276 uint lock_flags)
277{
278 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
279 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
280
281 if (lock_flags & XFS_ILOCK_EXCL)
282 mrdemote(&ip->i_lock);
283 if (lock_flags & XFS_IOLOCK_EXCL)
284 mrdemote(&ip->i_iolock);
285
286 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
287}
288
289#ifdef DEBUG
290int
291xfs_isilocked(
292 xfs_inode_t *ip,
293 uint lock_flags)
294{
295 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
296 if (!(lock_flags & XFS_ILOCK_SHARED))
297 return !!ip->i_lock.mr_writer;
298 return rwsem_is_locked(&ip->i_lock.mr_lock);
299 }
300
301 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
302 if (!(lock_flags & XFS_IOLOCK_SHARED))
303 return !!ip->i_iolock.mr_writer;
304 return rwsem_is_locked(&ip->i_iolock.mr_lock);
305 }
306
307 ASSERT(0);
308 return 0;
309}
310#endif
311
312void
313__xfs_iflock(
314 struct xfs_inode *ip)
315{
316 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
317 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
318
319 do {
320 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
321 if (xfs_isiflocked(ip))
322 io_schedule();
323 } while (!xfs_iflock_nowait(ip));
324
325 finish_wait(wq, &wait.wait);
326}
327
77#ifdef DEBUG 328#ifdef DEBUG
78/* 329/*
79 * Make sure that the extents in the given memory buffer 330 * Make sure that the extents in the given memory buffer
@@ -131,6 +382,65 @@ xfs_inobp_check(
131} 382}
132#endif 383#endif
133 384
385static void
386xfs_inode_buf_verify(
387 struct xfs_buf *bp)
388{
389 struct xfs_mount *mp = bp->b_target->bt_mount;
390 int i;
391 int ni;
392
393 /*
394 * Validate the magic number and version of every inode in the buffer
395 */
396 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
397 for (i = 0; i < ni; i++) {
398 int di_ok;
399 xfs_dinode_t *dip;
400
401 dip = (struct xfs_dinode *)xfs_buf_offset(bp,
402 (i << mp->m_sb.sb_inodelog));
403 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
404 XFS_DINODE_GOOD_VERSION(dip->di_version);
405 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
406 XFS_ERRTAG_ITOBP_INOTOBP,
407 XFS_RANDOM_ITOBP_INOTOBP))) {
408 xfs_buf_ioerror(bp, EFSCORRUPTED);
409 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
410 mp, dip);
411#ifdef DEBUG
412 xfs_emerg(mp,
413 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
414 (unsigned long long)bp->b_bn, i,
415 be16_to_cpu(dip->di_magic));
416 ASSERT(0);
417#endif
418 }
419 }
420 xfs_inobp_check(mp, bp);
421}
422
423
424static void
425xfs_inode_buf_read_verify(
426 struct xfs_buf *bp)
427{
428 xfs_inode_buf_verify(bp);
429}
430
431static void
432xfs_inode_buf_write_verify(
433 struct xfs_buf *bp)
434{
435 xfs_inode_buf_verify(bp);
436}
437
438const struct xfs_buf_ops xfs_inode_buf_ops = {
439 .verify_read = xfs_inode_buf_read_verify,
440 .verify_write = xfs_inode_buf_write_verify,
441};
442
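(Editor's sketch, hedged: the general shape of a verifier under this scheme. With this series the buffer cache runs ->verify_read on read IO completion and ->verify_write before write submission; failure is reported by stamping the buffer error, exactly as xfs_inode_buf_verify() does above. `example_magic_ok()` is a placeholder, not a real kernel function.)

	static void
	xfs_example_buf_read_verify(
		struct xfs_buf	*bp)
	{
		if (!example_magic_ok(bp))		/* hypothetical check */
			xfs_buf_ioerror(bp, EFSCORRUPTED);
	}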
443
134/* 444/*
135 * This routine is called to map an inode to the buffer containing the on-disk 445 * This routine is called to map an inode to the buffer containing the on-disk
136 * version of the inode. It returns a pointer to the buffer containing the 446 * version of the inode. It returns a pointer to the buffer containing the
@@ -145,71 +455,33 @@ xfs_imap_to_bp(
145 struct xfs_mount *mp, 455 struct xfs_mount *mp,
146 struct xfs_trans *tp, 456 struct xfs_trans *tp,
147 struct xfs_imap *imap, 457 struct xfs_imap *imap,
148 struct xfs_dinode **dipp, 458 struct xfs_dinode **dipp,
149 struct xfs_buf **bpp, 459 struct xfs_buf **bpp,
150 uint buf_flags, 460 uint buf_flags,
151 uint iget_flags) 461 uint iget_flags)
152{ 462{
153 struct xfs_buf *bp; 463 struct xfs_buf *bp;
154 int error; 464 int error;
155 int i;
156 int ni;
157 465
158 buf_flags |= XBF_UNMAPPED; 466 buf_flags |= XBF_UNMAPPED;
159 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 467 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
160 (int)imap->im_len, buf_flags, &bp); 468 (int)imap->im_len, buf_flags, &bp,
469 &xfs_inode_buf_ops);
161 if (error) { 470 if (error) {
162 if (error != EAGAIN) { 471 if (error == EAGAIN) {
163 xfs_warn(mp,
164 "%s: xfs_trans_read_buf() returned error %d.",
165 __func__, error);
166 } else {
167 ASSERT(buf_flags & XBF_TRYLOCK); 472 ASSERT(buf_flags & XBF_TRYLOCK);
473 return error;
168 } 474 }
169 return error;
170 }
171
172 /*
173 * Validate the magic number and version of every inode in the buffer
174 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
175 */
176#ifdef DEBUG
177 ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
178#else /* usual case */
179 ni = 1;
180#endif
181 475
182 for (i = 0; i < ni; i++) { 476 if (error == EFSCORRUPTED &&
183 int di_ok; 477 (iget_flags & XFS_IGET_UNTRUSTED))
184 xfs_dinode_t *dip; 478 return XFS_ERROR(EINVAL);
185 479
186 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 480 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
187 (i << mp->m_sb.sb_inodelog)); 481 __func__, error);
188 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && 482 return error;
189 XFS_DINODE_GOOD_VERSION(dip->di_version);
190 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
191 XFS_ERRTAG_ITOBP_INOTOBP,
192 XFS_RANDOM_ITOBP_INOTOBP))) {
193 if (iget_flags & XFS_IGET_UNTRUSTED) {
194 xfs_trans_brelse(tp, bp);
195 return XFS_ERROR(EINVAL);
196 }
197 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
198 mp, dip);
199#ifdef DEBUG
200 xfs_emerg(mp,
201 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
202 (unsigned long long)imap->im_blkno, i,
203 be16_to_cpu(dip->di_magic));
204 ASSERT(0);
205#endif
206 xfs_trans_brelse(tp, bp);
207 return XFS_ERROR(EFSCORRUPTED);
208 }
209 } 483 }
210 484
211 xfs_inobp_check(mp, bp);
212
213 *bpp = bp; 485 *bpp = bp;
214 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); 486 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
215 return 0; 487 return 0;
@@ -853,16 +1125,16 @@ xfs_iread_extents(
853 * set according to the contents of the given cred structure. 1125 * set according to the contents of the given cred structure.
854 * 1126 *
855 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() 1127 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
856 * has a free inode available, call xfs_iget() 1128 * has a free inode available, call xfs_iget() to obtain the in-core
857 * to obtain the in-core version of the allocated inode. Finally, 1129 * version of the allocated inode. Finally, fill in the inode and
858 * fill in the inode and log its initial contents. In this case, 1130 * log its initial contents. In this case, ialloc_context would be
859 * ialloc_context would be set to NULL and call_again set to false. 1131 * set to NULL.
860 * 1132 *
861 * If xfs_dialloc() does not have an available inode, 1133 * If xfs_dialloc() does not have an available inode, it will replenish
862 * it will replenish its supply by doing an allocation. Since we can 1134 * its supply by doing an allocation. Since we can only do one
863 * only do one allocation within a transaction without deadlocks, we 1135 * allocation within a transaction without deadlocks, we must commit
864 * must commit the current transaction before returning the inode itself. 1136 * the current transaction before returning the inode itself.
865 * In this case, therefore, we will set call_again to true and return. 1137 * In this case, therefore, we will set ialloc_context and return.
866 * The caller should then commit the current transaction, start a new 1138 * The caller should then commit the current transaction, start a new
867 * transaction, and call xfs_ialloc() again to actually get the inode. 1139 * transaction, and call xfs_ialloc() again to actually get the inode.
868 * 1140 *
@@ -1514,6 +1786,18 @@ xfs_ifree_cluster(
1514 1786
1515 if (!bp) 1787 if (!bp)
1516 return ENOMEM; 1788 return ENOMEM;
1789
1790 /*
1791 * This buffer may not have been correctly initialised as we
1792 * didn't read it from disk. That's not important because we are
1793 * only using it to mark the buffer as stale in the log, and to
1794 * attach stale cached inodes on it. That means it will never be
1795 * dispatched for IO. If it is, we want to know about it, and we
1796 * want it to fail. We can achieve this by adding a write
1797 * verifier to the buffer.
1798 */
1799 bp->b_ops = &xfs_inode_buf_ops;
1800
1517 /* 1801 /*
1518 * Walk the inodes already attached to the buffer and mark them 1802 * Walk the inodes already attached to the buffer and mark them
1519 * stale. These will all have the flush locks held, so an 1803 * stale. These will all have the flush locks held, so an
@@ -3661,3 +3945,40 @@ xfs_iext_irec_update_extoffs(
3661 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; 3945 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
3662 } 3946 }
3663} 3947}
3948
3949/*
3950 * Test whether it is appropriate to check an inode for, and free, post-EOF
3951 * blocks. The 'force' parameter determines whether we should also consider
3952 * regular files that are marked preallocated or append-only.
3953 */
3954bool
3955xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
3956{
3957 /* prealloc/delalloc exists only on regular files */
3958 if (!S_ISREG(ip->i_d.di_mode))
3959 return false;
3960
3961 /*
3962 * Zero sized files with no cached pages and no delalloc blocks will not
3963 * have speculative prealloc/delalloc blocks to remove.
3964 */
3965 if (VFS_I(ip)->i_size == 0 &&
3966 VN_CACHED(VFS_I(ip)) == 0 &&
3967 ip->i_delayed_blks == 0)
3968 return false;
3969
3970 /* If we haven't read in the extent list, then don't do it now. */
3971 if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
3972 return false;
3973
3974 /*
3975 * Do not free real preallocated or append-only files unless the file
3976 * has delalloc blocks and we are forced to remove them.
3977 */
3978 if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
3979 if (!force || ip->i_delayed_blks == 0)
3980 return false;
3981
3982 return true;
3983}
3984
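(Editor's sketch, hedged: the intended call pattern for the new predicate, in the style of the era's xfs_release()/xfs_inactive() callers; the exact xfs_free_eofblocks() signature is assumed from xfs_vnodeops.c of the period.)

	if (xfs_can_free_eofblocks(ip, false)) {
		error = xfs_free_eofblocks(mp, ip, true);
		if (error)
			return error;
	}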
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 94b32f906e79..22baf6ea4fac 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
496 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ 496 (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
497 ((pip)->i_d.di_mode & S_ISGID)) 497 ((pip)->i_d.di_mode & S_ISGID))
498 498
499
499/* 500/*
500 * xfs_iget.c prototypes. 501 * xfs_inode.c prototypes.
501 */ 502 */
502int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
503 uint, uint, xfs_inode_t **);
504void xfs_ilock(xfs_inode_t *, uint); 503void xfs_ilock(xfs_inode_t *, uint);
505int xfs_ilock_nowait(xfs_inode_t *, uint); 504int xfs_ilock_nowait(xfs_inode_t *, uint);
506void xfs_iunlock(xfs_inode_t *, uint); 505void xfs_iunlock(xfs_inode_t *, uint);
@@ -508,11 +507,6 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
508int xfs_isilocked(xfs_inode_t *, uint); 507int xfs_isilocked(xfs_inode_t *, uint);
509uint xfs_ilock_map_shared(xfs_inode_t *); 508uint xfs_ilock_map_shared(xfs_inode_t *);
510void xfs_iunlock_map_shared(xfs_inode_t *, uint); 509void xfs_iunlock_map_shared(xfs_inode_t *, uint);
511void xfs_inode_free(struct xfs_inode *ip);
512
513/*
514 * xfs_inode.c prototypes.
515 */
516int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, 510int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
517 xfs_nlink_t, xfs_dev_t, prid_t, int, 511 xfs_nlink_t, xfs_dev_t, prid_t, int,
518 struct xfs_buf **, xfs_inode_t **); 512 struct xfs_buf **, xfs_inode_t **);
@@ -591,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *);
591void xfs_iext_irec_compact_pages(xfs_ifork_t *); 585void xfs_iext_irec_compact_pages(xfs_ifork_t *);
592void xfs_iext_irec_compact_full(xfs_ifork_t *); 586void xfs_iext_irec_compact_full(xfs_ifork_t *);
593void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); 587void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
588bool xfs_can_free_eofblocks(struct xfs_inode *, bool);
594 589
595#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 590#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
596 591
@@ -603,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
603extern struct kmem_zone *xfs_ifork_zone; 598extern struct kmem_zone *xfs_ifork_zone;
604extern struct kmem_zone *xfs_inode_zone; 599extern struct kmem_zone *xfs_inode_zone;
605extern struct kmem_zone *xfs_ili_zone; 600extern struct kmem_zone *xfs_ili_zone;
601extern const struct xfs_buf_ops xfs_inode_buf_ops;
606 602
607#endif /* __XFS_INODE_H__ */ 603#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c1df3c623de2..c1c3ef88a260 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
42#include "xfs_inode_item.h" 42#include "xfs_inode_item.h"
43#include "xfs_export.h" 43#include "xfs_export.h"
44#include "xfs_trace.h" 44#include "xfs_trace.h"
45#include "xfs_icache.h"
45 46
46#include <linux/capability.h> 47#include <linux/capability.h>
47#include <linux/dcache.h> 48#include <linux/dcache.h>
@@ -1602,6 +1603,26 @@ xfs_file_ioctl(
1602 error = xfs_errortag_clearall(mp, 1); 1603 error = xfs_errortag_clearall(mp, 1);
1603 return -error; 1604 return -error;
1604 1605
1606 case XFS_IOC_FREE_EOFBLOCKS: {
1607 struct xfs_eofblocks eofb;
1608
1609 if (copy_from_user(&eofb, arg, sizeof(eofb)))
1610 return -XFS_ERROR(EFAULT);
1611
1612 if (eofb.eof_version != XFS_EOFBLOCKS_VERSION)
1613 return -XFS_ERROR(EINVAL);
1614
1615 if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
1616 return -XFS_ERROR(EINVAL);
1617
1618 if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) ||
1619 memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
1620 return -XFS_ERROR(EINVAL);
1621
1622 error = xfs_icache_free_eofblocks(mp, &eofb);
1623 return -error;
1624 }
1625
1605 default: 1626 default:
1606 return -ENOTTY; 1627 return -ENOTTY;
1607 } 1628 }
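(Editor's sketch, hedged: invoking the new ioctl from userspace. The struct fields mirror those validated by the handler above; XFS_IOC_FREE_EOFBLOCKS, XFS_EOFBLOCKS_VERSION and struct xfs_eofblocks are assumed to come from the era's xfs_fs.h, so treat this as illustrative, not authoritative.)

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs.h>	/* assumed to expose the ioctl and struct */

	static int free_eofblocks(const char *mntpt)
	{
		struct xfs_eofblocks eofb;
		int fd, ret;

		fd = open(mntpt, O_RDONLY);
		if (fd < 0)
			return -1;

		memset(&eofb, 0, sizeof(eofb));	/* pad32/pad64 must be zero */
		eofb.eof_version = XFS_EOFBLOCKS_VERSION;
		eofb.eof_flags = 0;		/* no filtering: scan all inodes */

		ret = ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb);
		close(fd);
		return ret;
	}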
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7f537663365b..364818eef40e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -41,6 +41,7 @@
41#include "xfs_utils.h" 41#include "xfs_utils.h"
42#include "xfs_iomap.h" 42#include "xfs_iomap.h"
43#include "xfs_trace.h" 43#include "xfs_trace.h"
44#include "xfs_icache.h"
44 45
45 46
46#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ 47#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -350,6 +351,15 @@ xfs_iomap_prealloc_size(
350 } 351 }
351 if (shift) 352 if (shift)
352 alloc_blocks >>= shift; 353 alloc_blocks >>= shift;
354
355 /*
356 * If we are still trying to allocate more space than is
357 * available, squash the prealloc hard. This can happen if we
358 * have a large file on a small filesystem and the above
359 * lowspace thresholds are smaller than MAXEXTLEN.
360 */
361 while (alloc_blocks >= freesp)
362 alloc_blocks >>= 4;
353 } 363 }
354 364
355 if (alloc_blocks < mp->m_writeio_blocks) 365 if (alloc_blocks < mp->m_writeio_blocks)
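(Editor's worked example of the squash loop above, with assumed numbers: freesp = 1000 free blocks and alloc_blocks = 65536. The loop runs 65536 >> 4 = 4096, then 4096 >> 4 = 256, and stops since 256 < 1000; each pass divides the speculative preallocation by 16 until it fits in the remaining free space.)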
@@ -373,7 +383,7 @@ xfs_iomap_write_delay(
373 xfs_extlen_t extsz; 383 xfs_extlen_t extsz;
374 int nimaps; 384 int nimaps;
375 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 385 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
376 int prealloc, flushed = 0; 386 int prealloc;
377 int error; 387 int error;
378 388
379 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 389 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -434,31 +444,29 @@ retry:
434 } 444 }
435 445
436 /* 446 /*
437 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For 447 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
438 * ENOSPC, * flush all other inodes with delalloc blocks to free up
439 * some of the excess reserved metadata space. For both cases, retry
440 * without EOF preallocation. 448 * without EOF preallocation.
441 */ 449 */
442 if (nimaps == 0) { 450 if (nimaps == 0) {
443 trace_xfs_delalloc_enospc(ip, offset, count); 451 trace_xfs_delalloc_enospc(ip, offset, count);
444 if (flushed) 452 if (prealloc) {
445 return XFS_ERROR(error ? error : ENOSPC); 453 prealloc = 0;
446 454 error = 0;
447 if (error == ENOSPC) { 455 goto retry;
448 xfs_iunlock(ip, XFS_ILOCK_EXCL);
449 xfs_flush_inodes(ip);
450 xfs_ilock(ip, XFS_ILOCK_EXCL);
451 } 456 }
452 457 return XFS_ERROR(error ? error : ENOSPC);
453 flushed = 1;
454 error = 0;
455 prealloc = 0;
456 goto retry;
457 } 458 }
458 459
459 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 460 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
460 return xfs_alert_fsblock_zero(ip, &imap[0]); 461 return xfs_alert_fsblock_zero(ip, &imap[0]);
461 462
463 /*
464 * Tag the inode as speculatively preallocated so we can reclaim this
465 * space on demand, if necessary.
466 */
467 if (prealloc)
468 xfs_inode_set_eofblocks_tag(ip);
469
462 *ret_imap = imap[0]; 470 *ret_imap = imap[0];
463 return 0; 471 return 0;
464} 472}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 4e00cf091d2c..d82efaa2ac73 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,6 +38,7 @@
38#include "xfs_vnodeops.h" 38#include "xfs_vnodeops.h"
39#include "xfs_inode_item.h" 39#include "xfs_inode_item.h"
40#include "xfs_trace.h" 40#include "xfs_trace.h"
41#include "xfs_icache.h"
41 42
42#include <linux/capability.h> 43#include <linux/capability.h>
43#include <linux/xattr.h> 44#include <linux/xattr.h>
@@ -779,8 +780,8 @@ xfs_setattr_size(
779 * care about here. 780 * care about here.
780 */ 781 */
781 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { 782 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
782 error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, 783 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
783 FI_NONE); 784 ip->i_d.di_size, newsize);
784 if (error) 785 if (error)
785 goto out_unlock; 786 goto out_unlock;
786 } 787 }
@@ -854,6 +855,9 @@ xfs_setattr_size(
854 * and do not wait the usual (long) time for writeout. 855 * and do not wait the usual (long) time for writeout.
855 */ 856 */
856 xfs_iflags_set(ip, XFS_ITRUNCATED); 857 xfs_iflags_set(ip, XFS_ITRUNCATED);
858
859 /* A truncate down always removes post-EOF blocks. */
860 xfs_inode_clear_eofblocks_tag(ip);
857 } 861 }
858 862
859 if (mask & ATTR_CTIME) { 863 if (mask & ATTR_CTIME) {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 01d10a66e302..2ea7d402188d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -34,6 +34,7 @@
34#include "xfs_error.h" 34#include "xfs_error.h"
35#include "xfs_btree.h" 35#include "xfs_btree.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_icache.h"
37 38
38STATIC int 39STATIC int
39xfs_internal_inum( 40xfs_internal_inum(
@@ -395,7 +396,8 @@ xfs_bulkstat(
395 if (xfs_inobt_maskn(chunkidx, nicluster) 396 if (xfs_inobt_maskn(chunkidx, nicluster)
396 & ~r.ir_free) 397 & ~r.ir_free)
397 xfs_btree_reada_bufs(mp, agno, 398 xfs_btree_reada_bufs(mp, agno,
398 agbno, nbcluster); 399 agbno, nbcluster,
400 &xfs_inode_buf_ops);
399 } 401 }
400 irbp->ir_startino = r.ir_startino; 402 irbp->ir_startino = r.ir_startino;
401 irbp->ir_freecount = r.ir_freecount; 403 irbp->ir_freecount = r.ir_freecount;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 828662f70d64..fe7e4df85a7b 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -44,6 +44,7 @@
44#include <linux/kernel.h> 44#include <linux/kernel.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/slab.h> 46#include <linux/slab.h>
47#include <linux/crc32c.h>
47#include <linux/module.h> 48#include <linux/module.h>
48#include <linux/mutex.h> 49#include <linux/mutex.h>
49#include <linux/file.h> 50#include <linux/file.h>
@@ -118,6 +119,7 @@
118#define xfs_rotorstep xfs_params.rotorstep.val 119#define xfs_rotorstep xfs_params.rotorstep.val
119#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val 120#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
120#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val 121#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
122#define xfs_eofb_secs xfs_params.eofb_timer.val
121 123
122#define current_cpu() (raw_smp_processor_id()) 124#define current_cpu() (raw_smp_processor_id())
123#define current_pid() (current->pid) 125#define current_pid() (current->pid)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4dad756962d0..46bd9d52ab51 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,8 @@
34#include "xfs_dinode.h" 34#include "xfs_dinode.h"
35#include "xfs_inode.h" 35#include "xfs_inode.h"
36#include "xfs_trace.h" 36#include "xfs_trace.h"
37#include "xfs_fsops.h"
38#include "xfs_cksum.h"
37 39
38kmem_zone_t *xfs_log_ticket_zone; 40kmem_zone_t *xfs_log_ticket_zone;
39 41
@@ -458,7 +460,8 @@ xfs_log_reserve(
458 tic->t_trans_type = t_type; 460 tic->t_trans_type = t_type;
459 *ticp = tic; 461 *ticp = tic;
460 462
461 xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); 463 xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
464 : tic->t_unit_res);
462 465
463 trace_xfs_log_reserve(log, tic); 466 trace_xfs_log_reserve(log, tic);
464 467
@@ -679,25 +682,29 @@ out:
679} 682}
680 683
681/* 684/*
682 * Finish the recovery of the file system. This is separate from 685 * Finish the recovery of the file system. This is separate from the
683 * the xfs_log_mount() call, because it depends on the code in 686 * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
684 * xfs_mountfs() to read in the root and real-time bitmap inodes 687 * in the root and real-time bitmap inodes between calling xfs_log_mount() and
685 * between calling xfs_log_mount() and here. 688 * here.
686 * 689 *
687 * mp - ubiquitous xfs mount point structure 690 * If we finish recovery successfully, start the background log work. If we are
691 * not doing recovery, then we have a RO filesystem and we don't need to start
692 * it.
688 */ 693 */
689int 694int
690xfs_log_mount_finish(xfs_mount_t *mp) 695xfs_log_mount_finish(xfs_mount_t *mp)
691{ 696{
692 int error; 697 int error = 0;
693 698
694 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 699 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
695 error = xlog_recover_finish(mp->m_log); 700 error = xlog_recover_finish(mp->m_log);
696 else { 701 if (!error)
697 error = 0; 702 xfs_log_work_queue(mp);
703 } else {
698 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 704 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
699 } 705 }
700 706
707
701 return error; 708 return error;
702} 709}
703 710
@@ -850,15 +857,49 @@ xfs_log_unmount_write(xfs_mount_t *mp)
850} /* xfs_log_unmount_write */ 857} /* xfs_log_unmount_write */
851 858
852/* 859/*
853 * Deallocate log structures for unmount/relocation. 860 * Empty the log for unmount/freeze.
861 *
862 * To do this, we first need to shut down the background log work so it is not
863 * trying to cover the log as we clean up. We then need to unpin all objects in
864 * the log so we can then flush them out. Once they have completed their IO and
865 * run the callbacks removing themselves from the AIL, we can write the unmount
866 * record.
867 */
868void
869xfs_log_quiesce(
870 struct xfs_mount *mp)
871{
872 cancel_delayed_work_sync(&mp->m_log->l_work);
873 xfs_log_force(mp, XFS_LOG_SYNC);
874
875 /*
876 * The superblock buffer is uncached and while xfs_ail_push_all_sync()
877 * will push it, xfs_wait_buftarg() will not wait for it. Further,
878 * xfs_buf_iowait() cannot be used because it was pushed with the
879 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
880 * the IO to complete.
881 */
882 xfs_ail_push_all_sync(mp->m_ail);
883 xfs_wait_buftarg(mp->m_ddev_targp);
884 xfs_buf_lock(mp->m_sb_bp);
885 xfs_buf_unlock(mp->m_sb_bp);
886
887 xfs_log_unmount_write(mp);
888}
889
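(Editor's note, hedged: why the lock/unlock pair above amounts to a wait. The superblock buffer's lock is held across its in-flight async write and only released from the IO completion path, so:)

	xfs_buf_lock(mp->m_sb_bp);	/* blocks until the async write completes */
	xfs_buf_unlock(mp->m_sb_bp);	/* the lock itself was never the point */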
890/*
891 * Shut down and release the AIL and Log.
854 * 892 *
855 * We need to stop the aild from running before we destroy 893 * During unmount, we need to ensure we flush all the dirty metadata objects
856 * and deallocate the log as the aild references the log. 894 * from the AIL so that the log is empty before we write the unmount record to
895 * the log. Once this is done, we can tear down the AIL and the log.
857 */ 896 */
858void 897void
859xfs_log_unmount(xfs_mount_t *mp) 898xfs_log_unmount(
899 struct xfs_mount *mp)
860{ 900{
861 cancel_delayed_work_sync(&mp->m_sync_work); 901 xfs_log_quiesce(mp);
902
862 xfs_trans_ail_destroy(mp); 903 xfs_trans_ail_destroy(mp);
863 xlog_dealloc_log(mp->m_log); 904 xlog_dealloc_log(mp->m_log);
864} 905}
@@ -1090,8 +1131,7 @@ xlog_iodone(xfs_buf_t *bp)
1090 * with it being freed after writing the unmount record to the 1131 * with it being freed after writing the unmount record to the
1091 * log. 1132 * log.
1092 */ 1133 */
1093 1134}
1094} /* xlog_iodone */
1095 1135
1096/* 1136/*
1097 * Return size of each in-core log record buffer. 1137 * Return size of each in-core log record buffer.
@@ -1161,6 +1201,40 @@ done:
1161} /* xlog_get_iclog_buffer_size */ 1201} /* xlog_get_iclog_buffer_size */
1162 1202
1163 1203
1204void
1205xfs_log_work_queue(
1206 struct xfs_mount *mp)
1207{
1208 queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
1209 msecs_to_jiffies(xfs_syncd_centisecs * 10));
1210}
1211
1212/*
1213 * Every sync period we need to unpin all items in the AIL and push them to
1214 * disk. If there is nothing dirty, then we might need to cover the log to
1215 * indicate that the filesystem is idle.
1216 */
1217void
1218xfs_log_worker(
1219 struct work_struct *work)
1220{
1221 struct xlog *log = container_of(to_delayed_work(work),
1222 struct xlog, l_work);
1223 struct xfs_mount *mp = log->l_mp;
1224
1225 /* dgc: errors ignored - not fatal and nowhere to report them */
1226 if (xfs_log_need_covered(mp))
1227 xfs_fs_log_dummy(mp);
1228 else
1229 xfs_log_force(mp, 0);
1230
1231 /* start pushing all the metadata that is currently dirty */
1232 xfs_ail_push_all(mp->m_ail);
1233
1234 /* queue us up again */
1235 xfs_log_work_queue(mp);
1236}
1237
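(Editor's worked example, hedged: xfs_syncd_centisecs defaults to 3000, i.e. 30 seconds, so the delay computed above is msecs_to_jiffies(3000 * 10) = msecs_to_jiffies(30000) and the worker re-arms itself roughly every 30 seconds; tuning the sysctl rescales the period at the next requeue.)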
1164/* 1238/*
1165 * This routine initializes some of the log structure for a given mount point. 1239 * This routine initializes some of the log structure for a given mount point.
1166 * Its primary purpose is to fill in enough, so recovery can occur. However, 1240 * Its primary purpose is to fill in enough, so recovery can occur. However,
@@ -1195,6 +1269,7 @@ xlog_alloc_log(
1195 log->l_logBBsize = num_bblks; 1269 log->l_logBBsize = num_bblks;
1196 log->l_covered_state = XLOG_STATE_COVER_IDLE; 1270 log->l_covered_state = XLOG_STATE_COVER_IDLE;
1197 log->l_flags |= XLOG_ACTIVE_RECOVERY; 1271 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1272 INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
1198 1273
1199 log->l_prev_block = -1; 1274 log->l_prev_block = -1;
1200 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ 1275 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -1417,6 +1492,84 @@ xlog_grant_push_ail(
1417} 1492}
1418 1493
1419/* 1494/*
1495 * Stamp cycle number in every block
1496 */
1497STATIC void
1498xlog_pack_data(
1499 struct xlog *log,
1500 struct xlog_in_core *iclog,
1501 int roundoff)
1502{
1503 int i, j, k;
1504 int size = iclog->ic_offset + roundoff;
1505 __be32 cycle_lsn;
1506 xfs_caddr_t dp;
1507
1508 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
1509
1510 dp = iclog->ic_datap;
1511 for (i = 0; i < BTOBB(size); i++) {
1512 if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
1513 break;
1514 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
1515 *(__be32 *)dp = cycle_lsn;
1516 dp += BBSIZE;
1517 }
1518
1519 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1520 xlog_in_core_2_t *xhdr = iclog->ic_data;
1521
1522 for ( ; i < BTOBB(size); i++) {
1523 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1524 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1525 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
1526 *(__be32 *)dp = cycle_lsn;
1527 dp += BBSIZE;
1528 }
1529
1530 for (i = 1; i < log->l_iclog_heads; i++)
1531 xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
1532 }
1533}
1534
1535/*
1536 * Calculate the checksum for a log buffer.
1537 *
1538 * This is a little more complicated than it should be because the various
1539 * headers and the actual data are non-contiguous.
1540 */
1541__le32
1542xlog_cksum(
1543 struct xlog *log,
1544 struct xlog_rec_header *rhead,
1545 char *dp,
1546 int size)
1547{
1548 __uint32_t crc;
1549
1550 /* first generate the crc for the record header ... */
1551 crc = xfs_start_cksum((char *)rhead,
1552 sizeof(struct xlog_rec_header),
1553 offsetof(struct xlog_rec_header, h_crc));
1554
1555 /* ... then for additional cycle data for v2 logs ... */
1556 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1557 union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
1558 int i;
1559
1560 for (i = 1; i < log->l_iclog_heads; i++) {
1561 crc = crc32c(crc, &xhdr[i].hic_xheader,
1562 sizeof(struct xlog_rec_ext_header));
1563 }
1564 }
1565
1566 /* ... and finally for the payload */
1567 crc = crc32c(crc, dp, size);
1568
1569 return xfs_end_cksum(crc);
1570}
1571
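(Editor's sketch, hedged: the two helpers used above, reconstructed from the era's fs/xfs/xfs_cksum.h — treat the exact bodies as an assumption. The key idea is that the CRC field itself is checksummed as zero, so the stored value can live inside the region it protects.)

	#define XFS_CRC_SEED	(~(__uint32_t)0)

	static inline __uint32_t
	xfs_start_cksum(char *buffer, size_t length, unsigned long crc_offset)
	{
		__uint32_t zero = 0;
		__uint32_t crc;

		/* CRC everything up to the checksum field ... */
		crc = crc32c(XFS_CRC_SEED, buffer, crc_offset);
		/* ... feed a zero in place of the field itself ... */
		crc = crc32c(crc, &zero, sizeof(zero));
		/* ... then the remainder of the buffer */
		return crc32c(crc, &buffer[crc_offset + sizeof(__be32)],
			      length - (crc_offset + sizeof(__be32)));
	}

	static inline __le32
	xfs_end_cksum(__uint32_t crc)
	{
		return ~cpu_to_le32(crc);	/* final inversion, on-disk endianness */
	}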
1572/*
1420 * The bdstrat callback function for log bufs. This gives us a central 1573 * The bdstrat callback function for log bufs. This gives us a central
1421 * place to trap bufs in case we get hit by a log I/O error and need to 1574 * place to trap bufs in case we get hit by a log I/O error and need to
1422 * shutdown. Actually, in practice, even when we didn't get a log error, 1575 * shutdown. Actually, in practice, even when we didn't get a log error,
@@ -1476,7 +1629,6 @@ xlog_sync(
1476 struct xlog *log, 1629 struct xlog *log,
1477 struct xlog_in_core *iclog) 1630 struct xlog_in_core *iclog)
1478{ 1631{
1479 xfs_caddr_t dptr; /* pointer to byte sized element */
1480 xfs_buf_t *bp; 1632 xfs_buf_t *bp;
1481 int i; 1633 int i;
1482 uint count; /* byte count of bwrite */ 1634 uint count; /* byte count of bwrite */
@@ -1485,6 +1637,7 @@ xlog_sync(
1485 int split = 0; /* split write into two regions */ 1637 int split = 0; /* split write into two regions */
1486 int error; 1638 int error;
1487 int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); 1639 int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
1640 int size;
1488 1641
1489 XFS_STATS_INC(xs_log_writes); 1642 XFS_STATS_INC(xs_log_writes);
1490 ASSERT(atomic_read(&iclog->ic_refcnt) == 0); 1643 ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
@@ -1515,13 +1668,10 @@ xlog_sync(
1515 xlog_pack_data(log, iclog, roundoff); 1668 xlog_pack_data(log, iclog, roundoff);
1516 1669
1517 /* real byte length */ 1670 /* real byte length */
1518 if (v2) { 1671 size = iclog->ic_offset;
1519 iclog->ic_header.h_len = 1672 if (v2)
1520 cpu_to_be32(iclog->ic_offset + roundoff); 1673 size += roundoff;
1521 } else { 1674 iclog->ic_header.h_len = cpu_to_be32(size);
1522 iclog->ic_header.h_len =
1523 cpu_to_be32(iclog->ic_offset);
1524 }
1525 1675
1526 bp = iclog->ic_bp; 1676 bp = iclog->ic_bp;
1527 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); 1677 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
@@ -1530,12 +1680,36 @@ xlog_sync(
1530 1680
1531 /* Do we need to split this write into 2 parts? */ 1681 /* Do we need to split this write into 2 parts? */
1532 if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { 1682 if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
1683 char *dptr;
1684
1533 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); 1685 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
1534 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); 1686 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
1535 iclog->ic_bwritecnt = 2; /* split into 2 writes */ 1687 iclog->ic_bwritecnt = 2;
1688
1689 /*
1690 * Bump the cycle numbers at the start of each block in the
1691 * part of the iclog that ends up in the buffer that gets
1692 * written to the start of the log.
1693 *
1694 * Watch out for the header magic number case, though.
1695 */
1696 dptr = (char *)&iclog->ic_header + count;
1697 for (i = 0; i < split; i += BBSIZE) {
1698 __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
1699 if (++cycle == XLOG_HEADER_MAGIC_NUM)
1700 cycle++;
1701 *(__be32 *)dptr = cpu_to_be32(cycle);
1702
1703 dptr += BBSIZE;
1704 }
1536 } else { 1705 } else {
1537 iclog->ic_bwritecnt = 1; 1706 iclog->ic_bwritecnt = 1;
1538 } 1707 }
1708
1709 /* calculate the checksum */
1710 iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
1711 iclog->ic_datap, size);
1712
1539 bp->b_io_length = BTOBB(count); 1713 bp->b_io_length = BTOBB(count);
1540 bp->b_fspriv = iclog; 1714 bp->b_fspriv = iclog;
1541 XFS_BUF_ZEROFLAGS(bp); 1715 XFS_BUF_ZEROFLAGS(bp);
@@ -1589,19 +1763,6 @@ xlog_sync(
1589 bp->b_flags |= XBF_SYNCIO; 1763 bp->b_flags |= XBF_SYNCIO;
1590 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1764 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1591 bp->b_flags |= XBF_FUA; 1765 bp->b_flags |= XBF_FUA;
1592 dptr = bp->b_addr;
1593 /*
1594 * Bump the cycle numbers at the start of each block
1595 * since this part of the buffer is at the start of
1596 * a new cycle. Watch out for the header magic number
1597 * case, though.
1598 */
1599 for (i = 0; i < split; i += BBSIZE) {
1600 be32_add_cpu((__be32 *)dptr, 1);
1601 if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
1602 be32_add_cpu((__be32 *)dptr, 1);
1603 dptr += BBSIZE;
1604 }
1605 1766
1606 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1767 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1607 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); 1768 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1618,7 +1779,6 @@ xlog_sync(
1618 return 0; 1779 return 0;
1619} /* xlog_sync */ 1780} /* xlog_sync */
1620 1781
1621
1622/* 1782/*
1623 * Deallocate a log structure 1783 * Deallocate a log structure
1624 */ 1784 */
@@ -3713,3 +3873,4 @@ xlog_iclogs_empty(
3713 } while (iclog != log->l_iclog); 3873 } while (iclog != log->l_iclog);
3714 return 1; 3874 return 1;
3715} 3875}
3876
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 748d312850e2..5caee96059df 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,9 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
181 xfs_lsn_t *commit_lsn, int flags); 181 xfs_lsn_t *commit_lsn, int flags);
182bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 182bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
183 183
184void xfs_log_work_queue(struct xfs_mount *mp);
185void xfs_log_worker(struct work_struct *work);
186void xfs_log_quiesce(struct xfs_mount *mp);
187
184#endif 188#endif
185#endif /* __XFS_LOG_H__ */ 189#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 18a801d76a42..16d8d12ea3b4 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i)
139/* 139/*
140 * Flags for log structure 140 * Flags for log structure
141 */ 141 */
142#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */
143#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ 142#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
144#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 143#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
145#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 144#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
@@ -291,7 +290,7 @@ typedef struct xlog_rec_header {
291 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ 290 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
292 __be64 h_lsn; /* lsn of this LR : 8 */ 291 __be64 h_lsn; /* lsn of this LR : 8 */
293 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ 292 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
294 __be32 h_chksum; /* may not be used; non-zero if used : 4 */ 293 __le32 h_crc; /* crc of log record : 4 */
295 __be32 h_prev_block; /* block number to previous LR : 4 */ 294 __be32 h_prev_block; /* block number to previous LR : 4 */
296 __be32 h_num_logops; /* number of log operations in this LR : 4 */ 295 __be32 h_num_logops; /* number of log operations in this LR : 4 */
297 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; 296 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
@@ -495,6 +494,7 @@ struct xlog {
495 struct xfs_buf *l_xbuf; /* extra buffer for log 494 struct xfs_buf *l_xbuf; /* extra buffer for log
496 * wrapping */ 495 * wrapping */
497 struct xfs_buftarg *l_targ; /* buftarg of log */ 496 struct xfs_buftarg *l_targ; /* buftarg of log */
497 struct delayed_work l_work; /* background flush work */
498 uint l_flags; 498 uint l_flags;
499 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 499 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
500 struct list_head *l_buf_cancel_table; 500 struct list_head *l_buf_cancel_table;
@@ -554,11 +554,9 @@ xlog_recover(
554extern int 554extern int
555xlog_recover_finish( 555xlog_recover_finish(
556 struct xlog *log); 556 struct xlog *log);
557extern void 557
558xlog_pack_data( 558extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
559 struct xlog *log, 559 char *dp, int size);
560 struct xlog_in_core *iclog,
561 int);
562 560
563extern kmem_zone_t *xfs_log_ticket_zone; 561extern kmem_zone_t *xfs_log_ticket_zone;
564struct xlog_ticket * 562struct xlog_ticket *
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index d308749fabf1..96fcbb85ff83 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -41,7 +41,9 @@
41#include "xfs_trans_priv.h" 41#include "xfs_trans_priv.h"
42#include "xfs_quota.h" 42#include "xfs_quota.h"
43#include "xfs_utils.h" 43#include "xfs_utils.h"
44#include "xfs_cksum.h"
44#include "xfs_trace.h" 45#include "xfs_trace.h"
46#include "xfs_icache.h"
45 47
46STATIC int 48STATIC int
47xlog_find_zeroed( 49xlog_find_zeroed(
@@ -2143,7 +2145,7 @@ xlog_recover_buffer_pass2(
2143 buf_flags |= XBF_UNMAPPED; 2145 buf_flags |= XBF_UNMAPPED;
2144 2146
2145 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 2147 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2146 buf_flags); 2148 buf_flags, NULL);
2147 if (!bp) 2149 if (!bp)
2148 return XFS_ERROR(ENOMEM); 2150 return XFS_ERROR(ENOMEM);
2149 error = bp->b_error; 2151 error = bp->b_error;
@@ -2236,7 +2238,8 @@ xlog_recover_inode_pass2(
2236 } 2238 }
2237 trace_xfs_log_recover_inode_recover(log, in_f); 2239 trace_xfs_log_recover_inode_recover(log, in_f);
2238 2240
2239 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); 2241 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2242 NULL);
2240 if (!bp) { 2243 if (!bp) {
2241 error = ENOMEM; 2244 error = ENOMEM;
2242 goto error; 2245 goto error;
@@ -2547,7 +2550,8 @@ xlog_recover_dquot_pass2(
2547 ASSERT(dq_f->qlf_len == 1); 2550 ASSERT(dq_f->qlf_len == 1);
2548 2551
2549 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, 2552 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
2550 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); 2553 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
2554 NULL);
2551 if (error) 2555 if (error)
2552 return error; 2556 return error;
2553 2557
@@ -3213,80 +3217,58 @@ xlog_recover_process_iunlinks(
3213 mp->m_dmevmask = mp_dmevmask; 3217 mp->m_dmevmask = mp_dmevmask;
3214} 3218}
3215 3219
3216
3217#ifdef DEBUG
3218STATIC void
3219xlog_pack_data_checksum(
3220 struct xlog *log,
3221 struct xlog_in_core *iclog,
3222 int size)
3223{
3224 int i;
3225 __be32 *up;
3226 uint chksum = 0;
3227
3228 up = (__be32 *)iclog->ic_datap;
3229 /* divide length by 4 to get # words */
3230 for (i = 0; i < (size >> 2); i++) {
3231 chksum ^= be32_to_cpu(*up);
3232 up++;
3233 }
3234 iclog->ic_header.h_chksum = cpu_to_be32(chksum);
3235}
3236#else
3237#define xlog_pack_data_checksum(log, iclog, size)
3238#endif
3239
3240/* 3220/*
3241 * Stamp cycle number in every block 3221 * Unpack the log buffer data and crc check it. If the check fails, issue a
3222 * warning if and only if the CRC in the header is non-zero. This makes the
3223 * check an advisory warning, and the zero CRC check will prevent failure
3224 * warnings from being emitted when upgrading the kernel from one that does not
3225 * add CRCs by default.
3226 *
3227 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
3228 * corruption failure.
3242 */ 3229 */
3243void 3230STATIC int
3244xlog_pack_data( 3231xlog_unpack_data_crc(
3245 struct xlog *log, 3232 struct xlog_rec_header *rhead,
3246 struct xlog_in_core *iclog, 3233 xfs_caddr_t dp,
3247 int roundoff) 3234 struct xlog *log)
3248{ 3235{
3249 int i, j, k; 3236 __le32 crc;
3250 int size = iclog->ic_offset + roundoff; 3237
3251 __be32 cycle_lsn; 3238 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
3252 xfs_caddr_t dp; 3239 if (crc != rhead->h_crc) {
3253 3240 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
3254 xlog_pack_data_checksum(log, iclog, size); 3241 xfs_alert(log->l_mp,
3255 3242 "log record CRC mismatch: found 0x%x, expected 0x%x.\n",
3256 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); 3243 le32_to_cpu(rhead->h_crc),
3257 3244 le32_to_cpu(crc));
3258 dp = iclog->ic_datap; 3245 xfs_hex_dump(dp, 32);
3259 for (i = 0; i < BTOBB(size) &&
3260 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3261 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3262 *(__be32 *)dp = cycle_lsn;
3263 dp += BBSIZE;
3264 }
3265
3266 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3267 xlog_in_core_2_t *xhdr = iclog->ic_data;
3268
3269 for ( ; i < BTOBB(size); i++) {
3270 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3271 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3272 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3273 *(__be32 *)dp = cycle_lsn;
3274 dp += BBSIZE;
3275 } 3246 }
3276 3247
3277 for (i = 1; i < log->l_iclog_heads; i++) { 3248 /*
3278 xhdr[i].hic_xheader.xh_cycle = cycle_lsn; 3249 * If we've detected a log record corruption, then we can't
3279 } 3250 * recover past this point. Abort recovery if we are enforcing
3251 * CRC protection by punting an error back up the stack.
3252 */
3253 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
3254 return EFSCORRUPTED;
3280 } 3255 }
3256
3257 return 0;
3281} 3258}
3282 3259
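(Editor's summary of the mismatch policy implemented above, for reference:)

	/*
	 * CRC mismatch, stored h_crc == 0, no CRC feature  -> silent pass
	 *   (log written by a kernel that never stamped CRCs)
	 * CRC mismatch, stored h_crc != 0, no CRC feature  -> warn, keep recovering
	 * CRC mismatch, CRC feature enabled                -> warn, EFSCORRUPTED
	 */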
3283STATIC void 3260STATIC int
3284xlog_unpack_data( 3261xlog_unpack_data(
3285 struct xlog_rec_header *rhead, 3262 struct xlog_rec_header *rhead,
3286 xfs_caddr_t dp, 3263 xfs_caddr_t dp,
3287 struct xlog *log) 3264 struct xlog *log)
3288{ 3265{
3289 int i, j, k; 3266 int i, j, k;
3267 int error;
3268
3269 error = xlog_unpack_data_crc(rhead, dp, log);
3270 if (error)
3271 return error;
3290 3272
3291 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3273 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3292 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3274 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3303,6 +3285,8 @@ xlog_unpack_data(
3303 dp += BBSIZE; 3285 dp += BBSIZE;
3304 } 3286 }
3305 } 3287 }
3288
3289 return 0;
3306} 3290}
3307 3291
3308STATIC int 3292STATIC int
@@ -3434,9 +3418,13 @@ xlog_do_recovery_pass(
3434 if (error) 3418 if (error)
3435 goto bread_err2; 3419 goto bread_err2;
3436 3420
3437 xlog_unpack_data(rhead, offset, log); 3421 error = xlog_unpack_data(rhead, offset, log);
3438 if ((error = xlog_recover_process_data(log, 3422 if (error)
3439 rhash, rhead, offset, pass))) 3423 goto bread_err2;
3424
3425 error = xlog_recover_process_data(log,
3426 rhash, rhead, offset, pass);
3427 if (error)
3440 goto bread_err2; 3428 goto bread_err2;
3441 blk_no += bblks + hblks; 3429 blk_no += bblks + hblks;
3442 } 3430 }
@@ -3546,9 +3534,14 @@ xlog_do_recovery_pass(
3546 if (error) 3534 if (error)
3547 goto bread_err2; 3535 goto bread_err2;
3548 } 3536 }
3549 xlog_unpack_data(rhead, offset, log); 3537
3550 if ((error = xlog_recover_process_data(log, rhash, 3538 error = xlog_unpack_data(rhead, offset, log);
3551 rhead, offset, pass))) 3539 if (error)
3540 goto bread_err2;
3541
3542 error = xlog_recover_process_data(log, rhash,
3543 rhead, offset, pass);
3544 if (error)
3552 goto bread_err2; 3545 goto bread_err2;
3553 blk_no += bblks; 3546 blk_no += bblks;
3554 } 3547 }
@@ -3573,9 +3566,13 @@ xlog_do_recovery_pass(
3573 if (error) 3566 if (error)
3574 goto bread_err2; 3567 goto bread_err2;
3575 3568
3576 xlog_unpack_data(rhead, offset, log); 3569 error = xlog_unpack_data(rhead, offset, log);
3577 if ((error = xlog_recover_process_data(log, rhash, 3570 if (error)
3578 rhead, offset, pass))) 3571 goto bread_err2;
3572
3573 error = xlog_recover_process_data(log, rhash,
3574 rhead, offset, pass);
3575 if (error)
3579 goto bread_err2; 3576 goto bread_err2;
3580 blk_no += bblks + hblks; 3577 blk_no += bblks + hblks;
3581 } 3578 }
@@ -3689,13 +3686,14 @@ xlog_do_recover(
3689 3686
3690 /* 3687 /*
3691 * Now that we've finished replaying all buffer and inode 3688 * Now that we've finished replaying all buffer and inode
3692 * updates, re-read in the superblock. 3689 * updates, re-read in the superblock and reverify it.
3693 */ 3690 */
3694 bp = xfs_getsb(log->l_mp, 0); 3691 bp = xfs_getsb(log->l_mp, 0);
3695 XFS_BUF_UNDONE(bp); 3692 XFS_BUF_UNDONE(bp);
3696 ASSERT(!(XFS_BUF_ISWRITE(bp))); 3693 ASSERT(!(XFS_BUF_ISWRITE(bp)));
3697 XFS_BUF_READ(bp); 3694 XFS_BUF_READ(bp);
3698 XFS_BUF_UNASYNC(bp); 3695 XFS_BUF_UNASYNC(bp);
3696 bp->b_ops = &xfs_sb_buf_ops;
3699 xfsbdstrat(log->l_mp, bp); 3697 xfsbdstrat(log->l_mp, bp);
3700 error = xfs_buf_iowait(bp); 3698 error = xfs_buf_iowait(bp);
3701 if (error) { 3699 if (error) {
@@ -3707,7 +3705,7 @@ xlog_do_recover(
3707 3705
3708 /* Convert superblock from on-disk format */ 3706 /* Convert superblock from on-disk format */
3709 sbp = &log->l_mp->m_sb; 3707 sbp = &log->l_mp->m_sb;
3710 xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); 3708 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3711 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 3709 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3712 ASSERT(xfs_sb_good_version(sbp)); 3710 ASSERT(xfs_sb_good_version(sbp));
3713 xfs_buf_relse(bp); 3711 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b2bd3a0e6376..7d6df7c00c36 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
42#include "xfs_fsops.h" 42#include "xfs_fsops.h"
43#include "xfs_utils.h" 43#include "xfs_utils.h"
44#include "xfs_trace.h" 44#include "xfs_trace.h"
45#include "xfs_icache.h"
45 46
46 47
47#ifdef HAVE_PERCPU_SB 48#ifdef HAVE_PERCPU_SB
@@ -303,9 +304,8 @@ STATIC int
303xfs_mount_validate_sb( 304xfs_mount_validate_sb(
304 xfs_mount_t *mp, 305 xfs_mount_t *mp,
305 xfs_sb_t *sbp, 306 xfs_sb_t *sbp,
306 int flags) 307 bool check_inprogress)
307{ 308{
308 int loud = !(flags & XFS_MFSI_QUIET);
309 309
310 /* 310 /*
311 * If the log device and data device have the 311 * If the log device and data device have the
@@ -315,21 +315,18 @@ xfs_mount_validate_sb(
315 * a volume filesystem in a non-volume manner. 315 * a volume filesystem in a non-volume manner.
316 */ 316 */
317 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 317 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
318 if (loud) 318 xfs_warn(mp, "bad magic number");
319 xfs_warn(mp, "bad magic number");
320 return XFS_ERROR(EWRONGFS); 319 return XFS_ERROR(EWRONGFS);
321 } 320 }
322 321
323 if (!xfs_sb_good_version(sbp)) { 322 if (!xfs_sb_good_version(sbp)) {
324 if (loud) 323 xfs_warn(mp, "bad version");
325 xfs_warn(mp, "bad version");
326 return XFS_ERROR(EWRONGFS); 324 return XFS_ERROR(EWRONGFS);
327 } 325 }
328 326
329 if (unlikely( 327 if (unlikely(
330 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 328 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
331 if (loud) 329 xfs_warn(mp,
332 xfs_warn(mp,
333 "filesystem is marked as having an external log; " 330 "filesystem is marked as having an external log; "
334 "specify logdev on the mount command line."); 331 "specify logdev on the mount command line.");
335 return XFS_ERROR(EINVAL); 332 return XFS_ERROR(EINVAL);
@@ -337,8 +334,7 @@ xfs_mount_validate_sb(
337 334
338 if (unlikely( 335 if (unlikely(
339 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { 336 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
340 if (loud) 337 xfs_warn(mp,
341 xfs_warn(mp,
342 "filesystem is marked as having an internal log; " 338 "filesystem is marked as having an internal log; "
343 "do not specify logdev on the mount command line."); 339 "do not specify logdev on the mount command line.");
344 return XFS_ERROR(EINVAL); 340 return XFS_ERROR(EINVAL);
@@ -372,8 +368,7 @@ xfs_mount_validate_sb(
372 sbp->sb_dblocks == 0 || 368 sbp->sb_dblocks == 0 ||
373 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || 369 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
374 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { 370 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
375 if (loud) 371 XFS_CORRUPTION_ERROR("SB sanity check failed",
376 XFS_CORRUPTION_ERROR("SB sanity check failed",
377 XFS_ERRLEVEL_LOW, mp, sbp); 372 XFS_ERRLEVEL_LOW, mp, sbp);
378 return XFS_ERROR(EFSCORRUPTED); 373 return XFS_ERROR(EFSCORRUPTED);
379 } 374 }
@@ -382,12 +377,10 @@ xfs_mount_validate_sb(
382 * Until this is fixed only page-sized or smaller data blocks work. 377 * Until this is fixed only page-sized or smaller data blocks work.
383 */ 378 */
384 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { 379 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
385 if (loud) { 380 xfs_warn(mp,
386 xfs_warn(mp,
387 "File system with blocksize %d bytes. " 381 "File system with blocksize %d bytes. "
388 "Only pagesize (%ld) or less will currently work.", 382 "Only pagesize (%ld) or less will currently work.",
389 sbp->sb_blocksize, PAGE_SIZE); 383 sbp->sb_blocksize, PAGE_SIZE);
390 }
391 return XFS_ERROR(ENOSYS); 384 return XFS_ERROR(ENOSYS);
392 } 385 }
393 386
@@ -401,23 +394,20 @@ xfs_mount_validate_sb(
401 case 2048: 394 case 2048:
402 break; 395 break;
403 default: 396 default:
404 if (loud) 397 xfs_warn(mp, "inode size of %d bytes not supported",
405 xfs_warn(mp, "inode size of %d bytes not supported",
406 sbp->sb_inodesize); 398 sbp->sb_inodesize);
407 return XFS_ERROR(ENOSYS); 399 return XFS_ERROR(ENOSYS);
408 } 400 }
409 401
410 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 402 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
411 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 403 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
412 if (loud) 404 xfs_warn(mp,
413 xfs_warn(mp,
414 "file system too large to be mounted on this system."); 405 "file system too large to be mounted on this system.");
415 return XFS_ERROR(EFBIG); 406 return XFS_ERROR(EFBIG);
416 } 407 }
417 408
418 if (unlikely(sbp->sb_inprogress)) { 409 if (check_inprogress && sbp->sb_inprogress) {
419 if (loud) 410 xfs_warn(mp, "Offline file system operation in progress!");
420 xfs_warn(mp, "file system busy");
421 return XFS_ERROR(EFSCORRUPTED); 411 return XFS_ERROR(EFSCORRUPTED);
422 } 412 }
423 413
@@ -425,9 +415,7 @@ xfs_mount_validate_sb(
425 * Version 1 directory format has never worked on Linux. 415 * Version 1 directory format has never worked on Linux.
426 */ 416 */
427 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { 417 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
428 if (loud) 418 xfs_warn(mp, "file system using version 1 directory format");
429 xfs_warn(mp,
430 "file system using version 1 directory format");
431 return XFS_ERROR(ENOSYS); 419 return XFS_ERROR(ENOSYS);
432 } 420 }
433 421
@@ -520,11 +508,9 @@ out_unwind:
520 508
521void 509void
522xfs_sb_from_disk( 510xfs_sb_from_disk(
523 struct xfs_mount *mp, 511 struct xfs_sb *to,
524 xfs_dsb_t *from) 512 xfs_dsb_t *from)
525{ 513{
526 struct xfs_sb *to = &mp->m_sb;
527
528 to->sb_magicnum = be32_to_cpu(from->sb_magicnum); 514 to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
529 to->sb_blocksize = be32_to_cpu(from->sb_blocksize); 515 to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
530 to->sb_dblocks = be64_to_cpu(from->sb_dblocks); 516 to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -626,6 +612,72 @@ xfs_sb_to_disk(
626 } 612 }
627} 613}
628 614
615static void
616xfs_sb_verify(
617 struct xfs_buf *bp)
618{
619 struct xfs_mount *mp = bp->b_target->bt_mount;
620 struct xfs_sb sb;
621 int error;
622
623 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
624
625 /*
626 * Only check the in progress field for the primary superblock as
627 * mkfs.xfs doesn't clear it from secondary superblocks.
628 */
629 error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
630 if (error)
631 xfs_buf_ioerror(bp, error);
632}
633
634static void
635xfs_sb_read_verify(
636 struct xfs_buf *bp)
637{
638 xfs_sb_verify(bp);
639}
640
641/*
642 * We may be probed for a filesystem match, so we may not want to emit
643 * messages when the superblock buffer is not actually an XFS superblock.
 644 * If we find an XFS superblock, then run a normal, noisy mount because we are
645 * really going to mount it and want to know about errors.
646 */
647static void
648xfs_sb_quiet_read_verify(
649 struct xfs_buf *bp)
650{
651 struct xfs_sb sb;
652
653 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
654
655 if (sb.sb_magicnum == XFS_SB_MAGIC) {
656 /* XFS filesystem, verify noisily! */
657 xfs_sb_read_verify(bp);
658 return;
659 }
660 /* quietly fail */
661 xfs_buf_ioerror(bp, EWRONGFS);
662}
663
664static void
665xfs_sb_write_verify(
666 struct xfs_buf *bp)
667{
668 xfs_sb_verify(bp);
669}
670
671const struct xfs_buf_ops xfs_sb_buf_ops = {
672 .verify_read = xfs_sb_read_verify,
673 .verify_write = xfs_sb_write_verify,
674};
675
676static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
677 .verify_read = xfs_sb_quiet_read_verify,
678 .verify_write = xfs_sb_write_verify,
679};
680
629/* 681/*
630 * xfs_readsb 682 * xfs_readsb
631 * 683 *
@@ -651,26 +703,27 @@ xfs_readsb(xfs_mount_t *mp, int flags)
651 703
652reread: 704reread:
653 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, 705 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
654 BTOBB(sector_size), 0); 706 BTOBB(sector_size), 0,
707 loud ? &xfs_sb_buf_ops
708 : &xfs_sb_quiet_buf_ops);
655 if (!bp) { 709 if (!bp) {
656 if (loud) 710 if (loud)
657 xfs_warn(mp, "SB buffer read failed"); 711 xfs_warn(mp, "SB buffer read failed");
658 return EIO; 712 return EIO;
659 } 713 }
660 714 if (bp->b_error) {
661 /* 715 error = bp->b_error;
662 * Initialize the mount structure from the superblock.
663 * But first do some basic consistency checking.
664 */
665 xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
666 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
667 if (error) {
668 if (loud) 716 if (loud)
669 xfs_warn(mp, "SB validate failed"); 717 xfs_warn(mp, "SB validate failed");
670 goto release_buf; 718 goto release_buf;
671 } 719 }
672 720
673 /* 721 /*
722 * Initialize the mount structure from the superblock.
723 */
724 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
725
726 /*
674 * We must be able to do sector-sized and sector-aligned IO. 727 * We must be able to do sector-sized and sector-aligned IO.
675 */ 728 */
676 if (sector_size > mp->m_sb.sb_sectsize) { 729 if (sector_size > mp->m_sb.sb_sectsize) {
@@ -1001,7 +1054,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1001 } 1054 }
1002 bp = xfs_buf_read_uncached(mp->m_ddev_targp, 1055 bp = xfs_buf_read_uncached(mp->m_ddev_targp,
1003 d - XFS_FSS_TO_BB(mp, 1), 1056 d - XFS_FSS_TO_BB(mp, 1),
1004 XFS_FSS_TO_BB(mp, 1), 0); 1057 XFS_FSS_TO_BB(mp, 1), 0, NULL);
1005 if (!bp) { 1058 if (!bp) {
1006 xfs_warn(mp, "last sector read failed"); 1059 xfs_warn(mp, "last sector read failed");
1007 return EIO; 1060 return EIO;
@@ -1016,7 +1069,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1016 } 1069 }
1017 bp = xfs_buf_read_uncached(mp->m_logdev_targp, 1070 bp = xfs_buf_read_uncached(mp->m_logdev_targp,
1018 d - XFS_FSB_TO_BB(mp, 1), 1071 d - XFS_FSB_TO_BB(mp, 1),
1019 XFS_FSB_TO_BB(mp, 1), 0); 1072 XFS_FSB_TO_BB(mp, 1), 0, NULL);
1020 if (!bp) { 1073 if (!bp) {
1021 xfs_warn(mp, "log device read failed"); 1074 xfs_warn(mp, "log device read failed");
1022 return EIO; 1075 return EIO;
@@ -1427,6 +1480,8 @@ xfs_unmountfs(
1427 __uint64_t resblks; 1480 __uint64_t resblks;
1428 int error; 1481 int error;
1429 1482
1483 cancel_delayed_work_sync(&mp->m_eofblocks_work);
1484
1430 xfs_qm_unmount_quotas(mp); 1485 xfs_qm_unmount_quotas(mp);
1431 xfs_rtunmount_inodes(mp); 1486 xfs_rtunmount_inodes(mp);
1432 IRELE(mp->m_rootip); 1487 IRELE(mp->m_rootip);
@@ -1450,21 +1505,16 @@ xfs_unmountfs(
1450 1505
1451 /* 1506 /*
1452 * And reclaim all inodes. At this point there should be no dirty 1507 * And reclaim all inodes. At this point there should be no dirty
1453 * inode, and none should be pinned or locked, but use synchronous 1508 * inodes and none should be pinned or locked, but use synchronous
1454 * reclaim just to be sure. 1509 * reclaim just to be sure. We can stop background inode reclaim
1510 * here as well if it is still running.
1455 */ 1511 */
1512 cancel_delayed_work_sync(&mp->m_reclaim_work);
1456 xfs_reclaim_inodes(mp, SYNC_WAIT); 1513 xfs_reclaim_inodes(mp, SYNC_WAIT);
1457 1514
1458 xfs_qm_unmount(mp); 1515 xfs_qm_unmount(mp);
1459 1516
1460 /* 1517 /*
1461 * Flush out the log synchronously so that we know for sure
1462 * that nothing is pinned. This is important because bflush()
1463 * will skip pinned buffers.
1464 */
1465 xfs_log_force(mp, XFS_LOG_SYNC);
1466
1467 /*
1468 * Unreserve any blocks we have so that when we unmount we don't account 1518 * Unreserve any blocks we have so that when we unmount we don't account
1469 * the reserved free space as used. This is really only necessary for 1519 * the reserved free space as used. This is really only necessary for
1470 * lazy superblock counting because it trusts the incore superblock 1520 * lazy superblock counting because it trusts the incore superblock
@@ -1489,23 +1539,6 @@ xfs_unmountfs(
1489 xfs_warn(mp, "Unable to update superblock counters. " 1539 xfs_warn(mp, "Unable to update superblock counters. "
1490 "Freespace may not be correct on next mount."); 1540 "Freespace may not be correct on next mount.");
1491 1541
1492 /*
1493 * At this point we might have modified the superblock again and thus
1494 * added an item to the AIL, thus flush it again.
1495 */
1496 xfs_ail_push_all_sync(mp->m_ail);
1497 xfs_wait_buftarg(mp->m_ddev_targp);
1498
1499 /*
1500 * The superblock buffer is uncached and xfsaild_push() will lock and
1501 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
1502 * here but a lock on the superblock buffer will block until iodone()
1503 * has completed.
1504 */
1505 xfs_buf_lock(mp->m_sb_bp);
1506 xfs_buf_unlock(mp->m_sb_bp);
1507
1508 xfs_log_unmount_write(mp);
1509 xfs_log_unmount(mp); 1542 xfs_log_unmount(mp);
1510 xfs_uuid_unmount(mp); 1543 xfs_uuid_unmount(mp);
1511 1544
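
The read path above now attaches an ops table to the buffer, and validation runs at I/O completion: the verifier records failure through xfs_buf_ioerror(), and callers such as xfs_readsb() only inspect bp->b_error afterwards. A minimal user-space sketch of that ops-table pattern follows; the struct and function names here are illustrative stand-ins, not the kernel API.

#include <errno.h>
#include <stdio.h>

#define SB_MAGIC 0x58465342u            /* "XFSB" on disk */

struct buf;

struct buf_ops {                        /* analogue of xfs_buf_ops */
        void (*verify_read)(struct buf *bp);
        void (*verify_write)(struct buf *bp);
};

struct buf {
        const struct buf_ops *ops;      /* attached when the read is issued */
        int error;                      /* analogue of bp->b_error */
        unsigned int magic;
};

static void sb_verify(struct buf *bp)
{
        if (bp->magic != SB_MAGIC)
                bp->error = EINVAL;     /* the kernel path sets EWRONGFS */
}

static const struct buf_ops sb_buf_ops = {
        .verify_read  = sb_verify,
        .verify_write = sb_verify,
};

static void io_complete(struct buf *bp)
{
        /* the I/O layer, not the caller, runs the verifier */
        if (bp->ops && bp->ops->verify_read)
                bp->ops->verify_read(bp);
}

int main(void)
{
        struct buf bp = { .ops = &sb_buf_ops, .magic = 0xdeadbeef };

        io_complete(&bp);
        if (bp.error)
                fprintf(stderr, "SB validate failed: error %d\n", bp.error);
        return 0;
}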
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index deee09e534dc..bab8314507e4 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations {
51 51
52#else /* __KERNEL__ */ 52#else /* __KERNEL__ */
53 53
54#include "xfs_sync.h"
55
56struct xlog; 54struct xlog;
57struct xfs_inode; 55struct xfs_inode;
58struct xfs_mru_cache; 56struct xfs_mru_cache;
@@ -197,9 +195,9 @@ typedef struct xfs_mount {
197 struct mutex m_icsb_mutex; /* balancer sync lock */ 195 struct mutex m_icsb_mutex; /* balancer sync lock */
198#endif 196#endif
199 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 197 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
200 struct delayed_work m_sync_work; /* background sync work */
201 struct delayed_work m_reclaim_work; /* background inode reclaim */ 198 struct delayed_work m_reclaim_work; /* background inode reclaim */
202 struct work_struct m_flush_work; /* background inode flush */ 199 struct delayed_work m_eofblocks_work; /* background eof blocks
200 trimming */
203 __int64_t m_update_flags; /* sb flags we need to update 201 __int64_t m_update_flags; /* sb flags we need to update
204 on the next remount,rw */ 202 on the next remount,rw */
205 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 203 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
@@ -209,6 +207,9 @@ typedef struct xfs_mount {
209 struct workqueue_struct *m_data_workqueue; 207 struct workqueue_struct *m_data_workqueue;
210 struct workqueue_struct *m_unwritten_workqueue; 208 struct workqueue_struct *m_unwritten_workqueue;
211 struct workqueue_struct *m_cil_workqueue; 209 struct workqueue_struct *m_cil_workqueue;
210 struct workqueue_struct *m_reclaim_workqueue;
211 struct workqueue_struct *m_log_workqueue;
212 struct workqueue_struct *m_eofblocks_workqueue;
212} xfs_mount_t; 213} xfs_mount_t;
213 214
214/* 215/*
@@ -387,7 +388,9 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
387extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 388extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
388extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, 389extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
389 xfs_agnumber_t *); 390 xfs_agnumber_t *);
390extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); 391extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
391extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); 392extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
392 393
394extern const struct xfs_buf_ops xfs_sb_buf_ops;
395
393#endif /* __XFS_MOUNT_H__ */ 396#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2e86fa0cfc0d..60eff4763156 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -40,6 +40,7 @@
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h"
43 44
44/* 45/*
45 * The global quota manager. There is only one of these for the entire 46 * The global quota manager. There is only one of these for the entire
@@ -891,7 +892,8 @@ xfs_qm_dqiter_bufs(
891 while (blkcnt--) { 892 while (blkcnt--) {
892 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 893 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
893 XFS_FSB_TO_DADDR(mp, bno), 894 XFS_FSB_TO_DADDR(mp, bno),
894 mp->m_quotainfo->qi_dqchunklen, 0, &bp); 895 mp->m_quotainfo->qi_dqchunklen, 0, &bp,
896 &xfs_dquot_buf_ops);
895 if (error) 897 if (error)
896 break; 898 break;
897 899
@@ -978,7 +980,8 @@ xfs_qm_dqiterate(
978 while (rablkcnt--) { 980 while (rablkcnt--) {
979 xfs_buf_readahead(mp->m_ddev_targp, 981 xfs_buf_readahead(mp->m_ddev_targp,
980 XFS_FSB_TO_DADDR(mp, rablkno), 982 XFS_FSB_TO_DADDR(mp, rablkno),
981 mp->m_quotainfo->qi_dqchunklen); 983 mp->m_quotainfo->qi_dqchunklen,
984 NULL);
982 rablkno++; 985 rablkno++;
983 } 986 }
984 } 987 }
@@ -1453,7 +1456,7 @@ xfs_qm_dqreclaim_one(
1453 int error; 1456 int error;
1454 1457
1455 if (!xfs_dqlock_nowait(dqp)) 1458 if (!xfs_dqlock_nowait(dqp))
1456 goto out_busy; 1459 goto out_move_tail;
1457 1460
1458 /* 1461 /*
1459 * This dquot has acquired a reference in the meantime remove it from 1462 * This dquot has acquired a reference in the meantime remove it from
@@ -1476,7 +1479,7 @@ xfs_qm_dqreclaim_one(
1476 * getting flushed to disk, we don't want to reclaim it. 1479 * getting flushed to disk, we don't want to reclaim it.
1477 */ 1480 */
1478 if (!xfs_dqflock_nowait(dqp)) 1481 if (!xfs_dqflock_nowait(dqp))
1479 goto out_busy; 1482 goto out_unlock_move_tail;
1480 1483
1481 if (XFS_DQ_IS_DIRTY(dqp)) { 1484 if (XFS_DQ_IS_DIRTY(dqp)) {
1482 struct xfs_buf *bp = NULL; 1485 struct xfs_buf *bp = NULL;
@@ -1487,7 +1490,7 @@ xfs_qm_dqreclaim_one(
1487 if (error) { 1490 if (error) {
1488 xfs_warn(mp, "%s: dquot %p flush failed", 1491 xfs_warn(mp, "%s: dquot %p flush failed",
1489 __func__, dqp); 1492 __func__, dqp);
1490 goto out_busy; 1493 goto out_unlock_move_tail;
1491 } 1494 }
1492 1495
1493 xfs_buf_delwri_queue(bp, buffer_list); 1496 xfs_buf_delwri_queue(bp, buffer_list);
@@ -1496,7 +1499,7 @@ xfs_qm_dqreclaim_one(
1496 * Give the dquot another try on the freelist, as the 1499 * Give the dquot another try on the freelist, as the
1497 * flushing will take some time. 1500 * flushing will take some time.
1498 */ 1501 */
1499 goto out_busy; 1502 goto out_unlock_move_tail;
1500 } 1503 }
1501 xfs_dqfunlock(dqp); 1504 xfs_dqfunlock(dqp);
1502 1505
@@ -1515,14 +1518,13 @@ xfs_qm_dqreclaim_one(
1515 XFS_STATS_INC(xs_qm_dqreclaims); 1518 XFS_STATS_INC(xs_qm_dqreclaims);
1516 return; 1519 return;
1517 1520
1518out_busy:
1519 xfs_dqunlock(dqp);
1520
1521 /* 1521 /*
1522 * Move the dquot to the tail of the list so that we don't spin on it. 1522 * Move the dquot to the tail of the list so that we don't spin on it.
1523 */ 1523 */
1524out_unlock_move_tail:
1525 xfs_dqunlock(dqp);
1526out_move_tail:
1524 list_move_tail(&dqp->q_lru, &qi->qi_lru_list); 1527 list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
1525
1526 trace_xfs_dqreclaim_busy(dqp); 1528 trace_xfs_dqreclaim_busy(dqp);
1527 XFS_STATS_INC(xs_qm_dqreclaim_misses); 1529 XFS_STATS_INC(xs_qm_dqreclaim_misses);
1528} 1530}
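
The reclaim rework above splits the single out_busy label in two: out_move_tail for failures before the dquot lock is taken, and out_unlock_move_tail for failures while it is held, so every exit path releases exactly the state it owns. A compilable miniature of that layered-unwind idiom, with a pthread mutex standing in for the dquot lock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dq_lock = PTHREAD_MUTEX_INITIALIZER;
static int requeued;

static void reclaim_one(int flush_would_block)
{
        if (pthread_mutex_trylock(&dq_lock) != 0)
                goto out_move_tail;             /* lock was never taken */

        if (flush_would_block)
                goto out_unlock_move_tail;      /* lock held: drop it first */

        /* ... the actual reclaim work would happen here ... */
        pthread_mutex_unlock(&dq_lock);
        return;

out_unlock_move_tail:
        pthread_mutex_unlock(&dq_lock);
out_move_tail:
        requeued++;                             /* move to tail, retry later */
}

int main(void)
{
        reclaim_one(1);
        printf("requeued %d time(s)\n", requeued);
        return 0;
}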
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 858a3b186110..8a59f8546552 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -40,6 +40,7 @@
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h"
43 44
44STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 45STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
45STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 46STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
@@ -783,11 +784,11 @@ xfs_qm_scall_getquota(
783 (XFS_IS_OQUOTA_ENFORCED(mp) && 784 (XFS_IS_OQUOTA_ENFORCED(mp) &&
784 (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && 785 (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
785 dst->d_id != 0) { 786 dst->d_id != 0) {
786 if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) && 787 if ((dst->d_bcount > dst->d_blk_softlimit) &&
787 (dst->d_blk_softlimit > 0)) { 788 (dst->d_blk_softlimit > 0)) {
788 ASSERT(dst->d_btimer != 0); 789 ASSERT(dst->d_btimer != 0);
789 } 790 }
790 if (((int) dst->d_icount > (int) dst->d_ino_softlimit) && 791 if ((dst->d_icount > dst->d_ino_softlimit) &&
791 (dst->d_ino_softlimit > 0)) { 792 (dst->d_ino_softlimit > 0)) {
792 ASSERT(dst->d_itimer != 0); 793 ASSERT(dst->d_itimer != 0);
793 } 794 }
@@ -845,7 +846,8 @@ STATIC int
845xfs_dqrele_inode( 846xfs_dqrele_inode(
846 struct xfs_inode *ip, 847 struct xfs_inode *ip,
847 struct xfs_perag *pag, 848 struct xfs_perag *pag,
848 int flags) 849 int flags,
850 void *args)
849{ 851{
850 /* skip quota inodes */ 852 /* skip quota inodes */
851 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || 853 if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
@@ -881,5 +883,5 @@ xfs_qm_dqrele_all_inodes(
881 uint flags) 883 uint flags)
882{ 884{
883 ASSERT(mp->m_quotainfo); 885 ASSERT(mp->m_quotainfo);
884 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); 886 xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
885} 887}
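
xfs_dqrele_inode() is adjusted to the extended iterator callback signature, which now carries an opaque void *args so a caller can thread per-walk state through the iteration without globals (unused here, hence the NULL). A small illustration of the pattern; all names in the sketch are made up:

#include <stdio.h>

struct inode { int id; };

typedef int (*ag_iter_fn)(struct inode *ip, int flags, void *args);

static int iterate_inodes(struct inode *inodes, int n, ag_iter_fn fn,
                          int flags, void *args)
{
        for (int i = 0; i < n; i++) {
                int error = fn(&inodes[i], flags, args);
                if (error)
                        return error;
        }
        return 0;
}

static int count_cb(struct inode *ip, int flags, void *args)
{
        (void)ip; (void)flags;
        (*(int *)args)++;               /* per-walk state lives in *args */
        return 0;
}

int main(void)
{
        struct inode inodes[3] = { {1}, {2}, {3} };
        int seen = 0;

        iterate_inodes(inodes, 3, count_cb, 0, &seen);
        printf("visited %d inodes\n", seen);
        return 0;
}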
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ca28a4ba4b54..98dc670d3ee0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -38,6 +38,7 @@
38#include "xfs_utils.h" 38#include "xfs_utils.h"
39#include "xfs_trace.h" 39#include "xfs_trace.h"
40#include "xfs_buf.h" 40#include "xfs_buf.h"
41#include "xfs_icache.h"
41 42
42 43
43/* 44/*
@@ -869,7 +870,7 @@ xfs_rtbuf_get(
869 ASSERT(map.br_startblock != NULLFSBLOCK); 870 ASSERT(map.br_startblock != NULLFSBLOCK);
870 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 871 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
871 XFS_FSB_TO_DADDR(mp, map.br_startblock), 872 XFS_FSB_TO_DADDR(mp, map.br_startblock),
872 mp->m_bsize, 0, &bp); 873 mp->m_bsize, 0, &bp, NULL);
873 if (error) 874 if (error)
874 return error; 875 return error;
875 ASSERT(!xfs_buf_geterror(bp)); 876 ASSERT(!xfs_buf_geterror(bp));
@@ -1872,9 +1873,14 @@ xfs_growfs_rt(
1872 */ 1873 */
1873 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 1874 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
1874 XFS_FSB_TO_BB(mp, nrblocks - 1), 1875 XFS_FSB_TO_BB(mp, nrblocks - 1),
1875 XFS_FSB_TO_BB(mp, 1), 0); 1876 XFS_FSB_TO_BB(mp, 1), 0, NULL);
1876 if (!bp) 1877 if (!bp)
1877 return EIO; 1878 return EIO;
1879 if (bp->b_error) {
1880 error = bp->b_error;
1881 xfs_buf_relse(bp);
1882 return error;
1883 }
1878 xfs_buf_relse(bp); 1884 xfs_buf_relse(bp);
1879 1885
1880 /* 1886 /*
@@ -2219,9 +2225,11 @@ xfs_rtmount_init(
2219 } 2225 }
2220 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 2226 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
2221 d - XFS_FSB_TO_BB(mp, 1), 2227 d - XFS_FSB_TO_BB(mp, 1),
2222 XFS_FSB_TO_BB(mp, 1), 0); 2228 XFS_FSB_TO_BB(mp, 1), 0, NULL);
2223 if (!bp) { 2229 if (!bp || bp->b_error) {
2224 xfs_warn(mp, "realtime device size check failed"); 2230 xfs_warn(mp, "realtime device size check failed");
2231 if (bp)
2232 xfs_buf_relse(bp);
2225 return EIO; 2233 return EIO;
2226 } 2234 }
2227 xfs_buf_relse(bp); 2235 xfs_buf_relse(bp);
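
With verifiers in the picture (or a NULL ops pointer, as in these uncached size checks), callers must handle two distinct failures: no buffer returned at all, and a buffer whose b_error was set during completion, which must still be released. A toy model of that calling convention, with stand-ins for xfs_buf_read_uncached() and xfs_buf_relse():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct buf { int b_error; };

static struct buf *read_uncached(int fail_io, int fail_verify)
{
        struct buf *bp;

        if (fail_io)
                return NULL;                    /* no buffer at all */
        bp = calloc(1, sizeof(*bp));
        if (bp && fail_verify)
                bp->b_error = EIO;              /* verifier flagged the data */
        return bp;
}

static void relse(struct buf *bp) { free(bp); }

static int size_check(int fail_io, int fail_verify)
{
        struct buf *bp = read_uncached(fail_io, fail_verify);

        /* handle both failure modes, releasing the buffer if one exists */
        if (!bp || bp->b_error) {
                fprintf(stderr, "device size check failed\n");
                if (bp)
                        relse(bp);
                return EIO;
        }
        relse(bp);
        return 0;
}

int main(void)
{
        return size_check(0, 1) ? 1 : 0;
}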
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index f429d9d5d325..a05b45175fb0 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -81,6 +81,7 @@ struct xfs_mount;
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ 83#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
84#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
84 85
85#define XFS_SB_VERSION2_OKREALFBITS \ 86#define XFS_SB_VERSION2_OKREALFBITS \
86 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 87 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
503 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); 504 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
504} 505}
505 506
507static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
508{
509 return (xfs_sb_version_hasmorebits(sbp) &&
510 (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT));
511}
512
506/* 513/*
507 * end of superblock version macros 514 * end of superblock version macros
508 */ 515 */
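
xfs_sb_version_hascrc() follows the established features2 pattern: the CRC bit only counts when the MOREBITS escape hatch is also set. A user-space analogue of the test, with the superblock fields simplified to the two that matter:

#include <stdio.h>

#define SB_VERSION2_CRCBIT 0x00000100u  /* mirrors the new feature flag */

struct sb { unsigned int features2; int has_morebits; };

static int sb_has_crc(const struct sb *sbp)
{
        /* feature present only if MOREBITS is set and the bit is set */
        return sbp->has_morebits &&
               (sbp->features2 & SB_VERSION2_CRCBIT);
}

int main(void)
{
        struct sb sbp = { .features2 = SB_VERSION2_CRCBIT, .has_morebits = 1 };

        printf("metadata CRCs %s\n", sb_has_crc(&sbp) ? "enabled" : "disabled");
        return 0;
}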
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 26a09bd7f975..ab8839b26272 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -49,7 +49,7 @@
49#include "xfs_extfree_item.h" 49#include "xfs_extfree_item.h"
50#include "xfs_mru_cache.h" 50#include "xfs_mru_cache.h"
51#include "xfs_inode_item.h" 51#include "xfs_inode_item.h"
52#include "xfs_sync.h" 52#include "xfs_icache.h"
53#include "xfs_trace.h" 53#include "xfs_trace.h"
54 54
55#include <linux/namei.h> 55#include <linux/namei.h>
@@ -863,8 +863,30 @@ xfs_init_mount_workqueues(
863 WQ_MEM_RECLAIM, 0, mp->m_fsname); 863 WQ_MEM_RECLAIM, 0, mp->m_fsname);
864 if (!mp->m_cil_workqueue) 864 if (!mp->m_cil_workqueue)
865 goto out_destroy_unwritten; 865 goto out_destroy_unwritten;
866
867 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
868 WQ_NON_REENTRANT, 0, mp->m_fsname);
869 if (!mp->m_reclaim_workqueue)
870 goto out_destroy_cil;
871
872 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
873 WQ_NON_REENTRANT, 0, mp->m_fsname);
874 if (!mp->m_log_workqueue)
875 goto out_destroy_reclaim;
876
877 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
878 WQ_NON_REENTRANT, 0, mp->m_fsname);
879 if (!mp->m_eofblocks_workqueue)
880 goto out_destroy_log;
881
866 return 0; 882 return 0;
867 883
884out_destroy_log:
885 destroy_workqueue(mp->m_log_workqueue);
886out_destroy_reclaim:
887 destroy_workqueue(mp->m_reclaim_workqueue);
888out_destroy_cil:
889 destroy_workqueue(mp->m_cil_workqueue);
868out_destroy_unwritten: 890out_destroy_unwritten:
869 destroy_workqueue(mp->m_unwritten_workqueue); 891 destroy_workqueue(mp->m_unwritten_workqueue);
870out_destroy_data_iodone_queue: 892out_destroy_data_iodone_queue:
@@ -877,11 +899,32 @@ STATIC void
877xfs_destroy_mount_workqueues( 899xfs_destroy_mount_workqueues(
878 struct xfs_mount *mp) 900 struct xfs_mount *mp)
879{ 901{
902 destroy_workqueue(mp->m_eofblocks_workqueue);
903 destroy_workqueue(mp->m_log_workqueue);
904 destroy_workqueue(mp->m_reclaim_workqueue);
880 destroy_workqueue(mp->m_cil_workqueue); 905 destroy_workqueue(mp->m_cil_workqueue);
881 destroy_workqueue(mp->m_data_workqueue); 906 destroy_workqueue(mp->m_data_workqueue);
882 destroy_workqueue(mp->m_unwritten_workqueue); 907 destroy_workqueue(mp->m_unwritten_workqueue);
883} 908}
884 909
910/*
911 * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
912 * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
913 * for IO to complete so that we effectively throttle multiple callers to the
914 * rate at which IO is completing.
915 */
916void
917xfs_flush_inodes(
918 struct xfs_mount *mp)
919{
920 struct super_block *sb = mp->m_super;
921
922 if (down_read_trylock(&sb->s_umount)) {
923 sync_inodes_sb(sb);
924 up_read(&sb->s_umount);
925 }
926}
927
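
The down_read_trylock() on s_umount is what makes xfs_flush_inodes() safe to call from transaction-reservation context: if an unmount already holds the lock for writing, the flush is simply skipped instead of deadlocking against the unmount path. The same guard in miniature, using a pthread rwlock as a stand-in:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t s_umount = PTHREAD_RWLOCK_INITIALIZER;

static void sync_inodes(void) { /* would write back dirty inodes */ }

static void flush_inodes(void)
{
        /* skip, rather than block, if unmount holds the write lock */
        if (pthread_rwlock_tryrdlock(&s_umount) == 0) {
                sync_inodes();
                pthread_rwlock_unlock(&s_umount);
        }
}

int main(void)
{
        flush_inodes();
        puts("flush attempted");
        return 0;
}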
885/* Catch misguided souls that try to use this interface on XFS */ 928/* Catch misguided souls that try to use this interface on XFS */
886STATIC struct inode * 929STATIC struct inode *
887xfs_fs_alloc_inode( 930xfs_fs_alloc_inode(
@@ -1006,9 +1049,8 @@ xfs_fs_put_super(
1006 struct xfs_mount *mp = XFS_M(sb); 1049 struct xfs_mount *mp = XFS_M(sb);
1007 1050
1008 xfs_filestream_unmount(mp); 1051 xfs_filestream_unmount(mp);
1009 cancel_delayed_work_sync(&mp->m_sync_work);
1010 xfs_unmountfs(mp); 1052 xfs_unmountfs(mp);
1011 xfs_syncd_stop(mp); 1053
1012 xfs_freesb(mp); 1054 xfs_freesb(mp);
1013 xfs_icsb_destroy_counters(mp); 1055 xfs_icsb_destroy_counters(mp);
1014 xfs_destroy_mount_workqueues(mp); 1056 xfs_destroy_mount_workqueues(mp);
@@ -1023,7 +1065,6 @@ xfs_fs_sync_fs(
1023 int wait) 1065 int wait)
1024{ 1066{
1025 struct xfs_mount *mp = XFS_M(sb); 1067 struct xfs_mount *mp = XFS_M(sb);
1026 int error;
1027 1068
1028 /* 1069 /*
1029 * Doing anything during the async pass would be counterproductive. 1070 * Doing anything during the async pass would be counterproductive.
@@ -1031,17 +1072,14 @@ xfs_fs_sync_fs(
1031 if (!wait) 1072 if (!wait)
1032 return 0; 1073 return 0;
1033 1074
1034 error = xfs_quiesce_data(mp); 1075 xfs_log_force(mp, XFS_LOG_SYNC);
1035 if (error)
1036 return -error;
1037
1038 if (laptop_mode) { 1076 if (laptop_mode) {
1039 /* 1077 /*
1040 * The disk must be active because we're syncing. 1078 * The disk must be active because we're syncing.
1041 * We schedule xfssyncd now (now that the disk is 1079 * We schedule log work now (now that the disk is
1042 * active) instead of later (when it might not be). 1080 * active) instead of later (when it might not be).
1043 */ 1081 */
1044 flush_delayed_work(&mp->m_sync_work); 1082 flush_delayed_work(&mp->m_log->l_work);
1045 } 1083 }
1046 1084
1047 return 0; 1085 return 0;
@@ -1118,6 +1156,48 @@ xfs_restore_resvblks(struct xfs_mount *mp)
1118 xfs_reserve_blocks(mp, &resblks, NULL); 1156 xfs_reserve_blocks(mp, &resblks, NULL);
1119} 1157}
1120 1158
1159/*
1160 * Trigger writeback of all the dirty metadata in the file system.
1161 *
 1162 * This ensures that the metadata is written to its location on disk rather
1163 * than just existing in transactions in the log. This means after a quiesce
1164 * there is no log replay required to write the inodes to disk - this is the
1165 * primary difference between a sync and a quiesce.
1166 *
1167 * Note: xfs_log_quiesce() stops background log work - the callers must ensure
1168 * it is started again when appropriate.
1169 */
1170void
1171xfs_quiesce_attr(
1172 struct xfs_mount *mp)
1173{
1174 int error = 0;
1175
1176 /* wait for all modifications to complete */
1177 while (atomic_read(&mp->m_active_trans) > 0)
1178 delay(100);
1179
1180 /* force the log to unpin objects from the now complete transactions */
1181 xfs_log_force(mp, XFS_LOG_SYNC);
1182
1183 /* reclaim inodes to do any IO before the freeze completes */
1184 xfs_reclaim_inodes(mp, 0);
1185 xfs_reclaim_inodes(mp, SYNC_WAIT);
1186
1187 /* Push the superblock and write an unmount record */
1188 error = xfs_log_sbcount(mp);
1189 if (error)
 1190 xfs_warn(mp, "xfs_quiesce_attr: failed to log sb changes. "
1191 "Frozen image may not be consistent.");
1192 /*
1193 * Just warn here till VFS can correctly support
1194 * read-only remount without racing.
1195 */
1196 WARN_ON(atomic_read(&mp->m_active_trans) != 0);
1197
1198 xfs_log_quiesce(mp);
1199}
1200
1121STATIC int 1201STATIC int
1122xfs_fs_remount( 1202xfs_fs_remount(
1123 struct super_block *sb, 1203 struct super_block *sb,
@@ -1198,20 +1278,18 @@ xfs_fs_remount(
1198 * value if it is non-zero, otherwise go with the default. 1278 * value if it is non-zero, otherwise go with the default.
1199 */ 1279 */
1200 xfs_restore_resvblks(mp); 1280 xfs_restore_resvblks(mp);
1281 xfs_log_work_queue(mp);
1201 } 1282 }
1202 1283
1203 /* rw -> ro */ 1284 /* rw -> ro */
1204 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1285 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1205 /* 1286 /*
1206 * After we have synced the data but before we sync the 1287 * Before we sync the metadata, we need to free up the reserve
1207 * metadata, we need to free up the reserve block pool so that 1288 * block pool so that the used block count in the superblock on
1208 * the used block count in the superblock on disk is correct at 1289 * disk is correct at the end of the remount. Stash the current
1209 * the end of the remount. Stash the current reserve pool size 1290 * reserve pool size so that if we get remounted rw, we can
1210 * so that if we get remounted rw, we can return it to the same 1291 * return it to the same size.
1211 * size.
1212 */ 1292 */
1213
1214 xfs_quiesce_data(mp);
1215 xfs_save_resvblks(mp); 1293 xfs_save_resvblks(mp);
1216 xfs_quiesce_attr(mp); 1294 xfs_quiesce_attr(mp);
1217 mp->m_flags |= XFS_MOUNT_RDONLY; 1295 mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -1243,6 +1321,7 @@ xfs_fs_unfreeze(
1243 struct xfs_mount *mp = XFS_M(sb); 1321 struct xfs_mount *mp = XFS_M(sb);
1244 1322
1245 xfs_restore_resvblks(mp); 1323 xfs_restore_resvblks(mp);
1324 xfs_log_work_queue(mp);
1246 return 0; 1325 return 0;
1247} 1326}
1248 1327
@@ -1321,6 +1400,8 @@ xfs_fs_fill_super(
1321 spin_lock_init(&mp->m_sb_lock); 1400 spin_lock_init(&mp->m_sb_lock);
1322 mutex_init(&mp->m_growlock); 1401 mutex_init(&mp->m_growlock);
1323 atomic_set(&mp->m_active_trans, 0); 1402 atomic_set(&mp->m_active_trans, 0);
1403 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
1404 INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
1324 1405
1325 mp->m_super = sb; 1406 mp->m_super = sb;
1326 sb->s_fs_info = mp; 1407 sb->s_fs_info = mp;
@@ -1371,10 +1452,6 @@ xfs_fs_fill_super(
1371 /* 1452 /*
1372 * we must configure the block size in the superblock before we run the 1453 * we must configure the block size in the superblock before we run the
1373 * full mount process as the mount process can lookup and cache inodes. 1454 * full mount process as the mount process can lookup and cache inodes.
1374 * For the same reason we must also initialise the syncd and register
1375 * the inode cache shrinker so that inodes can be reclaimed during
1376 * operations like a quotacheck that iterate all inodes in the
1377 * filesystem.
1378 */ 1455 */
1379 sb->s_magic = XFS_SB_MAGIC; 1456 sb->s_magic = XFS_SB_MAGIC;
1380 sb->s_blocksize = mp->m_sb.sb_blocksize; 1457 sb->s_blocksize = mp->m_sb.sb_blocksize;
@@ -1384,13 +1461,9 @@ xfs_fs_fill_super(
1384 sb->s_time_gran = 1; 1461 sb->s_time_gran = 1;
1385 set_posix_acl_flag(sb); 1462 set_posix_acl_flag(sb);
1386 1463
1387 error = xfs_syncd_init(mp);
1388 if (error)
1389 goto out_filestream_unmount;
1390
1391 error = xfs_mountfs(mp); 1464 error = xfs_mountfs(mp);
1392 if (error) 1465 if (error)
1393 goto out_syncd_stop; 1466 goto out_filestream_unmount;
1394 1467
1395 root = igrab(VFS_I(mp->m_rootip)); 1468 root = igrab(VFS_I(mp->m_rootip));
1396 if (!root) { 1469 if (!root) {
@@ -1408,8 +1481,7 @@ xfs_fs_fill_super(
1408 } 1481 }
1409 1482
1410 return 0; 1483 return 0;
1411 out_syncd_stop: 1484
1412 xfs_syncd_stop(mp);
1413 out_filestream_unmount: 1485 out_filestream_unmount:
1414 xfs_filestream_unmount(mp); 1486 xfs_filestream_unmount(mp);
1415 out_free_sb: 1487 out_free_sb:
@@ -1429,7 +1501,6 @@ out_destroy_workqueues:
1429 out_unmount: 1501 out_unmount:
1430 xfs_filestream_unmount(mp); 1502 xfs_filestream_unmount(mp);
1431 xfs_unmountfs(mp); 1503 xfs_unmountfs(mp);
1432 xfs_syncd_stop(mp);
1433 goto out_free_sb; 1504 goto out_free_sb;
1434} 1505}
1435 1506
@@ -1625,16 +1696,6 @@ STATIC int __init
1625xfs_init_workqueues(void) 1696xfs_init_workqueues(void)
1626{ 1697{
1627 /* 1698 /*
 1628 * We never want the same work item to run twice; reclaiming inodes
 1629 * or idling the log is not going to get any faster with multiple CPUs
 1630 * competing for resources. Use the default large max_active value
1631 * so that even lots of filesystems can perform these task in parallel.
1632 */
1633 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
1634 if (!xfs_syncd_wq)
1635 return -ENOMEM;
1636
1637 /*
1638 * The allocation workqueue can be used in memory reclaim situations 1699 * The allocation workqueue can be used in memory reclaim situations
1639 * (writepage path), and parallelism is only limited by the number of 1700 * (writepage path), and parallelism is only limited by the number of
1640 * AGs in all the filesystems mounted. Hence use the default large 1701 * AGs in all the filesystems mounted. Hence use the default large
@@ -1642,20 +1703,15 @@ xfs_init_workqueues(void)
1642 */ 1703 */
1643 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); 1704 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
1644 if (!xfs_alloc_wq) 1705 if (!xfs_alloc_wq)
1645 goto out_destroy_syncd; 1706 return -ENOMEM;
1646 1707
1647 return 0; 1708 return 0;
1648
1649out_destroy_syncd:
1650 destroy_workqueue(xfs_syncd_wq);
1651 return -ENOMEM;
1652} 1709}
1653 1710
1654STATIC void 1711STATIC void
1655xfs_destroy_workqueues(void) 1712xfs_destroy_workqueues(void)
1656{ 1713{
1657 destroy_workqueue(xfs_alloc_wq); 1714 destroy_workqueue(xfs_alloc_wq);
1658 destroy_workqueue(xfs_syncd_wq);
1659} 1715}
1660 1716
1661STATIC int __init 1717STATIC int __init
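
The workqueue setup above follows the usual kernel unwind idiom: each allocation failure jumps to a label that tears down only the queues created before it, and xfs_destroy_mount_workqueues() destroys them in strict reverse order of creation. A compilable miniature of the idiom, with malloc standing in for alloc_workqueue():

#include <stdio.h>
#include <stdlib.h>

struct wq { const char *name; };

static struct wq *alloc_wq(const char *name, int fail)
{
        struct wq *wq;

        if (fail)
                return NULL;
        wq = malloc(sizeof(*wq));
        if (wq)
                wq->name = name;
        return wq;
}

static void destroy_wq(struct wq *wq) { free(wq); }

struct mount { struct wq *reclaim_wq, *log_wq, *eofblocks_wq; };

static int init_workqueues(struct mount *mp, int fail_at)
{
        mp->reclaim_wq = alloc_wq("reclaim", fail_at == 1);
        if (!mp->reclaim_wq)
                return -1;
        mp->log_wq = alloc_wq("log", fail_at == 2);
        if (!mp->log_wq)
                goto out_destroy_reclaim;
        mp->eofblocks_wq = alloc_wq("eofblocks", fail_at == 3);
        if (!mp->eofblocks_wq)
                goto out_destroy_log;
        return 0;

        /* unwind labels mirror allocation order, executed in reverse */
out_destroy_log:
        destroy_wq(mp->log_wq);
out_destroy_reclaim:
        destroy_wq(mp->reclaim_wq);
        return -1;
}

int main(void)
{
        struct mount mp = { 0 };

        if (init_workqueues(&mp, 3))
                fprintf(stderr, "init failed, earlier queues unwound\n");
        return 0;
}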
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 9de4a920ba05..bbe3d15a7904 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -74,6 +74,7 @@ struct block_device;
74 74
75extern __uint64_t xfs_max_file_offset(unsigned int); 75extern __uint64_t xfs_max_file_offset(unsigned int);
76 76
77extern void xfs_flush_inodes(struct xfs_mount *mp);
77extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 78extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
78extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); 79extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
79extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); 80extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index ee2d2adaa438..2801b5ce6cdb 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -202,6 +202,15 @@ static ctl_table xfs_table[] = {
202 .extra1 = &xfs_params.fstrm_timer.min, 202 .extra1 = &xfs_params.fstrm_timer.min,
203 .extra2 = &xfs_params.fstrm_timer.max, 203 .extra2 = &xfs_params.fstrm_timer.max,
204 }, 204 },
205 {
206 .procname = "speculative_prealloc_lifetime",
207 .data = &xfs_params.eofb_timer.val,
208 .maxlen = sizeof(int),
209 .mode = 0644,
210 .proc_handler = proc_dointvec_minmax,
211 .extra1 = &xfs_params.eofb_timer.min,
212 .extra2 = &xfs_params.eofb_timer.max,
213 },
205 /* please keep this the last entry */ 214 /* please keep this the last entry */
206#ifdef CONFIG_PROC_FS 215#ifdef CONFIG_PROC_FS
207 { 216 {
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index b9937d450f8e..bd8e157c20ef 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -47,6 +47,7 @@ typedef struct xfs_param {
47 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ 47 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
48 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ 48 xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
49 xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ 49 xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
50 xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */
50} xfs_param_t; 51} xfs_param_t;
51 52
52/* 53/*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7d36ccf57f93..16a812977eab 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); 96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
97DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); 97DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
98DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); 98DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
99DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
100DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
99 101
100DECLARE_EVENT_CLASS(xfs_perag_class, 102DECLARE_EVENT_CLASS(xfs_perag_class,
101 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, 103 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
@@ -130,6 +132,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
130DEFINE_PERAG_REF_EVENT(xfs_perag_put); 132DEFINE_PERAG_REF_EVENT(xfs_perag_put);
131DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); 133DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
132DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); 134DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
135DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
136DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
133 137
134TRACE_EVENT(xfs_attr_list_node_descend, 138TRACE_EVENT(xfs_attr_list_node_descend,
135 TP_PROTO(struct xfs_attr_list_context *ctx, 139 TP_PROTO(struct xfs_attr_list_context *ctx,
@@ -337,6 +341,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse);
337DEFINE_BUF_EVENT(xfs_buf_item_iodone); 341DEFINE_BUF_EVENT(xfs_buf_item_iodone);
338DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); 342DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
339DEFINE_BUF_EVENT(xfs_buf_error_relse); 343DEFINE_BUF_EVENT(xfs_buf_error_relse);
344DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
340DEFINE_BUF_EVENT(xfs_trans_read_buf_io); 345DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
341DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); 346DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
342 347
@@ -585,6 +590,10 @@ DEFINE_INODE_EVENT(xfs_update_time);
585DEFINE_INODE_EVENT(xfs_dquot_dqalloc); 590DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
586DEFINE_INODE_EVENT(xfs_dquot_dqdetach); 591DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
587 592
593DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
594DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
595DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
596
588DECLARE_EVENT_CLASS(xfs_iref_class, 597DECLARE_EVENT_CLASS(xfs_iref_class,
589 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), 598 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
590 TP_ARGS(ip, caller_ip), 599 TP_ARGS(ip, caller_ip),
@@ -1496,8 +1505,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
1496DEFINE_DIR2_EVENT(xfs_dir2_node_removename); 1505DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
1497DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); 1506DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
1498 1507
1508DECLARE_EVENT_CLASS(xfs_attr_class,
1509 TP_PROTO(struct xfs_da_args *args),
1510 TP_ARGS(args),
1511 TP_STRUCT__entry(
1512 __field(dev_t, dev)
1513 __field(xfs_ino_t, ino)
1514 __dynamic_array(char, name, args->namelen)
1515 __field(int, namelen)
1516 __field(int, valuelen)
1517 __field(xfs_dahash_t, hashval)
1518 __field(int, op_flags)
1519 ),
1520 TP_fast_assign(
1521 __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
1522 __entry->ino = args->dp->i_ino;
1523 if (args->namelen)
1524 memcpy(__get_str(name), args->name, args->namelen);
1525 __entry->namelen = args->namelen;
1526 __entry->valuelen = args->valuelen;
1527 __entry->hashval = args->hashval;
1528 __entry->op_flags = args->op_flags;
1529 ),
1530 TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
1531 "hashval 0x%x op_flags %s",
1532 MAJOR(__entry->dev), MINOR(__entry->dev),
1533 __entry->ino,
1534 __entry->namelen,
1535 __entry->namelen ? __get_str(name) : NULL,
1536 __entry->namelen,
1537 __entry->valuelen,
1538 __entry->hashval,
1539 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
1540)
1541
1499#define DEFINE_ATTR_EVENT(name) \ 1542#define DEFINE_ATTR_EVENT(name) \
1500DEFINE_EVENT(xfs_da_class, name, \ 1543DEFINE_EVENT(xfs_attr_class, name, \
1501 TP_PROTO(struct xfs_da_args *args), \ 1544 TP_PROTO(struct xfs_da_args *args), \
1502 TP_ARGS(args)) 1545 TP_ARGS(args))
1503DEFINE_ATTR_EVENT(xfs_attr_sf_add); 1546DEFINE_ATTR_EVENT(xfs_attr_sf_add);
@@ -1511,10 +1554,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
1511DEFINE_ATTR_EVENT(xfs_attr_leaf_add); 1554DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
1512DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); 1555DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
1513DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); 1556DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
1557DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work);
1514DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); 1558DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
1515DEFINE_ATTR_EVENT(xfs_attr_leaf_create); 1559DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
1560DEFINE_ATTR_EVENT(xfs_attr_leaf_compact);
1561DEFINE_ATTR_EVENT(xfs_attr_leaf_get);
1516DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); 1562DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
1517DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); 1563DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
1564DEFINE_ATTR_EVENT(xfs_attr_leaf_remove);
1518DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); 1565DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
1519DEFINE_ATTR_EVENT(xfs_attr_leaf_split); 1566DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
1520DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); 1567DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
@@ -1526,12 +1573,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
1526DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); 1573DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
1527DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); 1574DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
1528DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); 1575DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
1576DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
1529 1577
1530DEFINE_ATTR_EVENT(xfs_attr_node_addname); 1578DEFINE_ATTR_EVENT(xfs_attr_node_addname);
1579DEFINE_ATTR_EVENT(xfs_attr_node_get);
1531DEFINE_ATTR_EVENT(xfs_attr_node_lookup); 1580DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
1532DEFINE_ATTR_EVENT(xfs_attr_node_replace); 1581DEFINE_ATTR_EVENT(xfs_attr_node_replace);
1533DEFINE_ATTR_EVENT(xfs_attr_node_removename); 1582DEFINE_ATTR_EVENT(xfs_attr_node_removename);
1534 1583
1584DEFINE_ATTR_EVENT(xfs_attr_fillstate);
1585DEFINE_ATTR_EVENT(xfs_attr_refillstate);
1586
1587DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
1588DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
1589DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove);
1590
1535#define DEFINE_DA_EVENT(name) \ 1591#define DEFINE_DA_EVENT(name) \
1536DEFINE_EVENT(xfs_da_class, name, \ 1592DEFINE_EVENT(xfs_da_class, name, \
1537 TP_PROTO(struct xfs_da_args *args), \ 1593 TP_PROTO(struct xfs_da_args *args), \
@@ -1550,9 +1606,12 @@ DEFINE_DA_EVENT(xfs_da_node_split);
1550DEFINE_DA_EVENT(xfs_da_node_remove); 1606DEFINE_DA_EVENT(xfs_da_node_remove);
1551DEFINE_DA_EVENT(xfs_da_node_rebalance); 1607DEFINE_DA_EVENT(xfs_da_node_rebalance);
1552DEFINE_DA_EVENT(xfs_da_node_unbalance); 1608DEFINE_DA_EVENT(xfs_da_node_unbalance);
1609DEFINE_DA_EVENT(xfs_da_node_toosmall);
1553DEFINE_DA_EVENT(xfs_da_swap_lastblock); 1610DEFINE_DA_EVENT(xfs_da_swap_lastblock);
1554DEFINE_DA_EVENT(xfs_da_grow_inode); 1611DEFINE_DA_EVENT(xfs_da_grow_inode);
1555DEFINE_DA_EVENT(xfs_da_shrink_inode); 1612DEFINE_DA_EVENT(xfs_da_shrink_inode);
1613DEFINE_DA_EVENT(xfs_da_fixhashpath);
1614DEFINE_DA_EVENT(xfs_da_path_shift);
1556 1615
1557DECLARE_EVENT_CLASS(xfs_dir2_space_class, 1616DECLARE_EVENT_CLASS(xfs_dir2_space_class,
1558 TP_PROTO(struct xfs_da_args *args, int idx), 1617 TP_PROTO(struct xfs_da_args *args, int idx),
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index db056544cbb5..c6c0601abd7a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -464,10 +464,7 @@ xfs_trans_get_buf(
464 int numblks, 464 int numblks,
465 uint flags) 465 uint flags)
466{ 466{
467 struct xfs_buf_map map = { 467 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
468 .bm_bn = blkno,
469 .bm_len = numblks,
470 };
471 return xfs_trans_get_buf_map(tp, target, &map, 1, flags); 468 return xfs_trans_get_buf_map(tp, target, &map, 1, flags);
472} 469}
473 470
@@ -476,7 +473,8 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp,
476 struct xfs_buftarg *target, 473 struct xfs_buftarg *target,
477 struct xfs_buf_map *map, int nmaps, 474 struct xfs_buf_map *map, int nmaps,
478 xfs_buf_flags_t flags, 475 xfs_buf_flags_t flags,
479 struct xfs_buf **bpp); 476 struct xfs_buf **bpp,
477 const struct xfs_buf_ops *ops);
480 478
481static inline int 479static inline int
482xfs_trans_read_buf( 480xfs_trans_read_buf(
@@ -486,13 +484,12 @@ xfs_trans_read_buf(
486 xfs_daddr_t blkno, 484 xfs_daddr_t blkno,
487 int numblks, 485 int numblks,
488 xfs_buf_flags_t flags, 486 xfs_buf_flags_t flags,
489 struct xfs_buf **bpp) 487 struct xfs_buf **bpp,
488 const struct xfs_buf_ops *ops)
490{ 489{
491 struct xfs_buf_map map = { 490 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
492 .bm_bn = blkno, 491 return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
493 .bm_len = numblks, 492 flags, bpp, ops);
494 };
495 return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp);
496} 493}
497 494
498struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); 495struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
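
DEFINE_SINGLE_BUF_MAP collapses the open-coded designated initializer at both inline call sites. Its definition is not part of this hunk; a plausible shape, assuming it simply declares and initializes a one-element map in a single statement, would be:

#include <stdint.h>
#include <stdio.h>

typedef int64_t daddr64_t;              /* stand-in for xfs_daddr_t */

struct buf_map {
        daddr64_t bm_bn;                /* starting block number */
        int       bm_len;               /* length in blocks */
};

/* assumed shape of the macro, not quoted from the kernel headers */
#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblks)      \
        struct buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblks) }

int main(void)
{
        DEFINE_SINGLE_BUF_MAP(map, 128, 8);

        printf("map: bn=%lld len=%d\n", (long long)map.bm_bn, map.bm_len);
        return 0;
}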
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 6311b99c267f..3edf5dbee001 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -93,7 +93,7 @@ _xfs_trans_bjoin(
93 xfs_buf_item_init(bp, tp->t_mountp); 93 xfs_buf_item_init(bp, tp->t_mountp);
94 bip = bp->b_fspriv; 94 bip = bp->b_fspriv;
95 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 95 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
96 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); 96 ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
97 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); 97 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
98 if (reset_recur) 98 if (reset_recur)
99 bip->bli_recur = 0; 99 bip->bli_recur = 0;
@@ -257,7 +257,8 @@ xfs_trans_read_buf_map(
257 struct xfs_buf_map *map, 257 struct xfs_buf_map *map,
258 int nmaps, 258 int nmaps,
259 xfs_buf_flags_t flags, 259 xfs_buf_flags_t flags,
260 struct xfs_buf **bpp) 260 struct xfs_buf **bpp,
261 const struct xfs_buf_ops *ops)
261{ 262{
262 xfs_buf_t *bp; 263 xfs_buf_t *bp;
263 xfs_buf_log_item_t *bip; 264 xfs_buf_log_item_t *bip;
@@ -265,7 +266,7 @@ xfs_trans_read_buf_map(
265 266
266 *bpp = NULL; 267 *bpp = NULL;
267 if (!tp) { 268 if (!tp) {
268 bp = xfs_buf_read_map(target, map, nmaps, flags); 269 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
269 if (!bp) 270 if (!bp)
270 return (flags & XBF_TRYLOCK) ? 271 return (flags & XBF_TRYLOCK) ?
271 EAGAIN : XFS_ERROR(ENOMEM); 272 EAGAIN : XFS_ERROR(ENOMEM);
@@ -312,7 +313,9 @@ xfs_trans_read_buf_map(
312 if (!(XFS_BUF_ISDONE(bp))) { 313 if (!(XFS_BUF_ISDONE(bp))) {
313 trace_xfs_trans_read_buf_io(bp, _RET_IP_); 314 trace_xfs_trans_read_buf_io(bp, _RET_IP_);
314 ASSERT(!XFS_BUF_ISASYNC(bp)); 315 ASSERT(!XFS_BUF_ISASYNC(bp));
316 ASSERT(bp->b_iodone == NULL);
315 XFS_BUF_READ(bp); 317 XFS_BUF_READ(bp);
318 bp->b_ops = ops;
316 xfsbdstrat(tp->t_mountp, bp); 319 xfsbdstrat(tp->t_mountp, bp);
317 error = xfs_buf_iowait(bp); 320 error = xfs_buf_iowait(bp);
318 if (error) { 321 if (error) {
@@ -349,7 +352,7 @@ xfs_trans_read_buf_map(
349 return 0; 352 return 0;
350 } 353 }
351 354
352 bp = xfs_buf_read_map(target, map, nmaps, flags); 355 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
353 if (bp == NULL) { 356 if (bp == NULL) {
354 *bpp = NULL; 357 *bpp = NULL;
355 return (flags & XBF_TRYLOCK) ? 358 return (flags & XBF_TRYLOCK) ?
@@ -429,7 +432,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
429 bip = bp->b_fspriv; 432 bip = bp->b_fspriv;
430 ASSERT(bip->bli_item.li_type == XFS_LI_BUF); 433 ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
431 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 434 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
432 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); 435 ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
433 ASSERT(atomic_read(&bip->bli_refcount) > 0); 436 ASSERT(atomic_read(&bip->bli_refcount) > 0);
434 437
435 trace_xfs_trans_brelse(bip); 438 trace_xfs_trans_brelse(bip);
@@ -516,7 +519,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
516 ASSERT(bp->b_transp == tp); 519 ASSERT(bp->b_transp == tp);
517 ASSERT(bip != NULL); 520 ASSERT(bip != NULL);
518 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 521 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
519 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); 522 ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
520 ASSERT(atomic_read(&bip->bli_refcount) > 0); 523 ASSERT(atomic_read(&bip->bli_refcount) > 0);
521 524
522 bip->bli_flags |= XFS_BLI_HOLD; 525 bip->bli_flags |= XFS_BLI_HOLD;
@@ -536,7 +539,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
536 ASSERT(bp->b_transp == tp); 539 ASSERT(bp->b_transp == tp);
537 ASSERT(bip != NULL); 540 ASSERT(bip != NULL);
538 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 541 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
539 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); 542 ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
540 ASSERT(atomic_read(&bip->bli_refcount) > 0); 543 ASSERT(atomic_read(&bip->bli_refcount) > 0);
541 ASSERT(bip->bli_flags & XFS_BLI_HOLD); 544 ASSERT(bip->bli_flags & XFS_BLI_HOLD);
542 545
@@ -595,7 +598,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
595 bip->bli_flags &= ~XFS_BLI_STALE; 598 bip->bli_flags &= ~XFS_BLI_STALE;
596 ASSERT(XFS_BUF_ISSTALE(bp)); 599 ASSERT(XFS_BUF_ISSTALE(bp));
597 XFS_BUF_UNSTALE(bp); 600 XFS_BUF_UNSTALE(bp);
598 bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; 601 bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
599 } 602 }
600 603
601 tp->t_flags |= XFS_TRANS_DIRTY; 604 tp->t_flags |= XFS_TRANS_DIRTY;
@@ -640,6 +643,7 @@ xfs_trans_binval(
640 xfs_buf_t *bp) 643 xfs_buf_t *bp)
641{ 644{
642 xfs_buf_log_item_t *bip = bp->b_fspriv; 645 xfs_buf_log_item_t *bip = bp->b_fspriv;
646 int i;
643 647
644 ASSERT(bp->b_transp == tp); 648 ASSERT(bp->b_transp == tp);
645 ASSERT(bip != NULL); 649 ASSERT(bip != NULL);
@@ -654,8 +658,8 @@ xfs_trans_binval(
654 */ 658 */
655 ASSERT(XFS_BUF_ISSTALE(bp)); 659 ASSERT(XFS_BUF_ISSTALE(bp));
656 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); 660 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
657 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); 661 ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
658 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); 662 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
659 ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); 663 ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
660 ASSERT(tp->t_flags & XFS_TRANS_DIRTY); 664 ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
661 return; 665 return;
@@ -665,10 +669,12 @@ xfs_trans_binval(
665 669
666 bip->bli_flags |= XFS_BLI_STALE; 670 bip->bli_flags |= XFS_BLI_STALE;
667 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); 671 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
668 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; 672 bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
669 bip->bli_format.blf_flags |= XFS_BLF_CANCEL; 673 bip->__bli_format.blf_flags |= XFS_BLF_CANCEL;
670 memset((char *)(bip->bli_format.blf_data_map), 0, 674 for (i = 0; i < bip->bli_format_count; i++) {
671 (bip->bli_format.blf_map_size * sizeof(uint))); 675 memset(bip->bli_formats[i].blf_data_map, 0,
676 (bip->bli_formats[i].blf_map_size * sizeof(uint)));
677 }
672 bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; 678 bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
673 tp->t_flags |= XFS_TRANS_DIRTY; 679 tp->t_flags |= XFS_TRANS_DIRTY;
674} 680}
@@ -772,5 +778,5 @@ xfs_trans_dquot_buf(
772 type == XFS_BLF_GDQUOT_BUF); 778 type == XFS_BLF_GDQUOT_BUF);
773 ASSERT(atomic_read(&bip->bli_refcount) > 0); 779 ASSERT(atomic_read(&bip->bli_refcount) > 0);
774 780
775 bip->bli_format.blf_flags |= type; 781 bip->__bli_format.blf_flags |= type;
776} 782}
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2a5c637344b4..d95f565a390e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -47,6 +47,7 @@
47#include "xfs_filestream.h" 47#include "xfs_filestream.h"
48#include "xfs_vnodeops.h" 48#include "xfs_vnodeops.h"
49#include "xfs_trace.h" 49#include "xfs_trace.h"
50#include "xfs_icache.h"
50 51
51/* 52/*
52 * The maximum pathlen is 1024 bytes. Since the minimum file system 53 * The maximum pathlen is 1024 bytes. Since the minimum file system
@@ -79,7 +80,7 @@ xfs_readlink_bmap(
79 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); 80 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
80 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); 81 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
81 82
82 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); 83 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL);
83 if (!bp) 84 if (!bp)
84 return XFS_ERROR(ENOMEM); 85 return XFS_ERROR(ENOMEM);
85 error = bp->b_error; 86 error = bp->b_error;
@@ -150,7 +151,7 @@ xfs_readlink(
150 * when the link count isn't zero and by xfs_dm_punch_hole() when 151 * when the link count isn't zero and by xfs_dm_punch_hole() when
151 * punching a hole to EOF. 152 * punching a hole to EOF.
152 */ 153 */
153STATIC int 154int
154xfs_free_eofblocks( 155xfs_free_eofblocks(
155 xfs_mount_t *mp, 156 xfs_mount_t *mp,
156 xfs_inode_t *ip, 157 xfs_inode_t *ip,
@@ -199,7 +200,7 @@ xfs_free_eofblocks(
199 if (need_iolock) { 200 if (need_iolock) {
200 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 201 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
201 xfs_trans_cancel(tp, 0); 202 xfs_trans_cancel(tp, 0);
202 return 0; 203 return EAGAIN;
203 } 204 }
204 } 205 }
205 206
@@ -237,6 +238,8 @@ xfs_free_eofblocks(
237 } else { 238 } else {
238 error = xfs_trans_commit(tp, 239 error = xfs_trans_commit(tp,
239 XFS_TRANS_RELEASE_LOG_RES); 240 XFS_TRANS_RELEASE_LOG_RES);
241 if (!error)
242 xfs_inode_clear_eofblocks_tag(ip);
240 } 243 }
241 244
242 xfs_iunlock(ip, XFS_ILOCK_EXCL); 245 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -425,19 +428,18 @@ xfs_release(
425 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 428 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
426 if (truncated) { 429 if (truncated) {
427 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); 430 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
428 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) 431 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
429 xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); 432 error = -filemap_flush(VFS_I(ip)->i_mapping);
433 if (error)
434 return error;
435 }
430 } 436 }
431 } 437 }
432 438
433 if (ip->i_d.di_nlink == 0) 439 if (ip->i_d.di_nlink == 0)
434 return 0; 440 return 0;
435 441
436 if ((S_ISREG(ip->i_d.di_mode) && 442 if (xfs_can_free_eofblocks(ip, false)) {
437 (VFS_I(ip)->i_size > 0 ||
438 (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
439 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
440 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
441 443
442 /* 444 /*
443 * If we can't get the iolock just skip truncating the blocks 445 * If we can't get the iolock just skip truncating the blocks
@@ -464,7 +466,7 @@ xfs_release(
464 return 0; 466 return 0;
465 467
466 error = xfs_free_eofblocks(mp, ip, true); 468 error = xfs_free_eofblocks(mp, ip, true);
467 if (error) 469 if (error && error != EAGAIN)
468 return error; 470 return error;
469 471
470 /* delalloc blocks after truncation means it really is dirty */ 472 /* delalloc blocks after truncation means it really is dirty */
@@ -513,13 +515,12 @@ xfs_inactive(
513 goto out; 515 goto out;
514 516
515 if (ip->i_d.di_nlink != 0) { 517 if (ip->i_d.di_nlink != 0) {
516 if ((S_ISREG(ip->i_d.di_mode) && 518 /*
517 (VFS_I(ip)->i_size > 0 || 519 * force is true because we are evicting an inode from the
518 (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && 520 * cache. Post-eof blocks must be freed, lest we end up with
519 (ip->i_df.if_flags & XFS_IFEXTENTS) && 521 * broken free space accounting.
520 (!(ip->i_d.di_flags & 522 */
521 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || 523 if (xfs_can_free_eofblocks(ip, true)) {
522 ip->i_delayed_blks != 0))) {
523 error = xfs_free_eofblocks(mp, ip, false); 524 error = xfs_free_eofblocks(mp, ip, false);
524 if (error) 525 if (error)
525 return VN_INACTIVE_CACHE; 526 return VN_INACTIVE_CACHE;
@@ -777,7 +778,7 @@ xfs_create(
777 XFS_TRANS_PERM_LOG_RES, log_count); 778 XFS_TRANS_PERM_LOG_RES, log_count);
778 if (error == ENOSPC) { 779 if (error == ENOSPC) {
779 /* flush outstanding delalloc blocks and retry */ 780 /* flush outstanding delalloc blocks and retry */
780 xfs_flush_inodes(dp); 781 xfs_flush_inodes(mp);
781 error = xfs_trans_reserve(tp, resblks, log_res, 0, 782 error = xfs_trans_reserve(tp, resblks, log_res, 0,
782 XFS_TRANS_PERM_LOG_RES, log_count); 783 XFS_TRANS_PERM_LOG_RES, log_count);
783 } 784 }
@@ -1957,12 +1958,11 @@ xfs_free_file_space(
1957 1958
1958 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 1959 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1959 ioffset = offset & ~(rounding - 1); 1960 ioffset = offset & ~(rounding - 1);
1960 1961 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1961 if (VN_CACHED(VFS_I(ip)) != 0) { 1962 ioffset, -1);
1962 error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); 1963 if (error)
1963 if (error) 1964 goto out_unlock_iolock;
1964 goto out_unlock_iolock; 1965 truncate_pagecache_range(VFS_I(ip), ioffset, -1);
1965 }
1966 1966
1967 /* 1967 /*
1968 * Need to zero the stuff we're not freeing, on disk. 1968 * Need to zero the stuff we're not freeing, on disk.
@@ -2095,6 +2095,73 @@ xfs_free_file_space(
2095 return error; 2095 return error;
2096} 2096}
2097 2097
2098
2099STATIC int
2100xfs_zero_file_space(
2101 struct xfs_inode *ip,
2102 xfs_off_t offset,
2103 xfs_off_t len,
2104 int attr_flags)
2105{
2106 struct xfs_mount *mp = ip->i_mount;
2107 uint granularity;
2108 xfs_off_t start_boundary;
2109 xfs_off_t end_boundary;
2110 int error;
2111
2112 granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
2113
2114 /*
2115 * Round the range of extents we are going to convert inwards. If the
 2116 * offset is already aligned, round_up() leaves it unchanged and
 2117 * there is nothing to zero before the first converted block.
2118 */
2119 start_boundary = round_up(offset, granularity);
2120 end_boundary = round_down(offset + len, granularity);
2121
2122 ASSERT(start_boundary >= offset);
2123 ASSERT(end_boundary <= offset + len);
2124
2125 if (!(attr_flags & XFS_ATTR_NOLOCK))
2126 xfs_ilock(ip, XFS_IOLOCK_EXCL);
2127
2128 if (start_boundary < end_boundary - 1) {
2129 /* punch out the page cache over the conversion range */
2130 truncate_pagecache_range(VFS_I(ip), start_boundary,
2131 end_boundary - 1);
2132 /* convert the blocks */
2133 error = xfs_alloc_file_space(ip, start_boundary,
2134 end_boundary - start_boundary - 1,
2135 XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
2136 attr_flags);
2137 if (error)
2138 goto out_unlock;
2139
2140 /* We've handled the interior of the range, now for the edges */
2141 if (start_boundary != offset)
2142 error = xfs_iozero(ip, offset, start_boundary - offset);
2143 if (error)
2144 goto out_unlock;
2145
2146 if (end_boundary != offset + len)
2147 error = xfs_iozero(ip, end_boundary,
2148 offset + len - end_boundary);
2149
2150 } else {
2151 /*
 2152 * The range is either smaller than one granularity unit, or it
 2153 * spans only partial pieces of two adjacent blocks.
2154 */
2155 error = xfs_iozero(ip, offset, len);
2156 }
2157
2158out_unlock:
2159 if (!(attr_flags & XFS_ATTR_NOLOCK))
2160 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
2161 return error;
2162
2163}
2164
2098/* 2165/*
2099 * xfs_change_file_space() 2166 * xfs_change_file_space()
2100 * This routine allocates or frees disk space for the given file. 2167 * This routine allocates or frees disk space for the given file.
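
The new xfs_zero_file_space() rounds the request inwards: the block-aligned interior is converted to unwritten extents, while the unaligned head and tail are zeroed directly via xfs_iozero(). Here is a small runnable sketch of just the boundary arithmetic, assuming a hypothetical 4096-byte granularity; round_up_pow2()/round_down_pow2() are simplified stand-ins for the kernel's round_up()/round_down() macros and require a power-of-two granularity.

#include <stdio.h>

#define GRANULARITY 4096ULL

/* round x up/down to the nearest multiple of a power-of-two g */
static unsigned long long round_up_pow2(unsigned long long x,
					unsigned long long g)
{
	return (x + g - 1) & ~(g - 1);
}

static unsigned long long round_down_pow2(unsigned long long x,
					  unsigned long long g)
{
	return x & ~(g - 1);
}

int main(void)
{
	unsigned long long offset = 1000, len = 10000;
	unsigned long long start = round_up_pow2(offset, GRANULARITY);
	unsigned long long end = round_down_pow2(offset + len, GRANULARITY);

	/* the interior [start, end) would be converted to unwritten
	 * extents; the sub-block edges are zeroed through the page cache */
	printf("interior:  [%llu, %llu)\n", start, end);
	printf("head edge: [%llu, %llu)\n", offset, start);
	printf("tail edge: [%llu, %llu)\n", end, offset + len);
	return 0;
}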
@@ -2120,10 +2187,8 @@ xfs_change_file_space(
 	xfs_fsize_t	fsize;
 	int		setprealloc;
 	xfs_off_t	startoffset;
-	xfs_off_t	llen;
 	xfs_trans_t	*tp;
 	struct iattr	iattr;
-	int		prealloc_type;
 
 	if (!S_ISREG(ip->i_d.di_mode))
 		return XFS_ERROR(EINVAL);
@@ -2141,12 +2206,30 @@ xfs_change_file_space(
2141 return XFS_ERROR(EINVAL); 2206 return XFS_ERROR(EINVAL);
2142 } 2207 }
2143 2208
2144 llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; 2209 /*
2210 * length of <= 0 for resv/unresv/zero is invalid. length for
2211 * alloc/free is ignored completely and we have no idea what userspace
2212 * might have set it to, so set it to zero to allow range
2213 * checks to pass.
2214 */
2215 switch (cmd) {
2216 case XFS_IOC_ZERO_RANGE:
2217 case XFS_IOC_RESVSP:
2218 case XFS_IOC_RESVSP64:
2219 case XFS_IOC_UNRESVSP:
2220 case XFS_IOC_UNRESVSP64:
2221 if (bf->l_len <= 0)
2222 return XFS_ERROR(EINVAL);
2223 break;
2224 default:
2225 bf->l_len = 0;
2226 break;
2227 }
2145 2228
2146 if (bf->l_start < 0 || 2229 if (bf->l_start < 0 ||
2147 bf->l_start > mp->m_super->s_maxbytes || 2230 bf->l_start > mp->m_super->s_maxbytes ||
2148 bf->l_start + llen < 0 || 2231 bf->l_start + bf->l_len < 0 ||
2149 bf->l_start + llen > mp->m_super->s_maxbytes) 2232 bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
2150 return XFS_ERROR(EINVAL); 2233 return XFS_ERROR(EINVAL);
2151 2234
2152 bf->l_whence = 0; 2235 bf->l_whence = 0;
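
The validation above rejects a non-positive length for the resv/unresv/zero commands, zeroes it for the others, then range-checks l_start and l_start + l_len against s_maxbytes, including a check for the sum wrapping negative. A standalone sketch of those checks follows; range_is_valid() is an illustrative helper rather than an XFS function, and the wrap test is written without performing the signed addition, which would be undefined in portable C (callers are assumed to have already ensured len >= 0, as the switch above does).

#include <stdbool.h>
#include <limits.h>

static bool range_is_valid(long long start, long long len,
			   long long maxbytes)
{
	if (start < 0 || start > maxbytes)
		return false;
	if (len > LLONG_MAX - start)	/* start + len would wrap */
		return false;
	if (start + len >= maxbytes)	/* end must stay below the limit */
		return false;
	return true;
}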
@@ -2154,29 +2237,20 @@ xfs_change_file_space(
 	startoffset = bf->l_start;
 	fsize = XFS_ISIZE(ip);
 
-	/*
-	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
-	 * file space.
-	 * These calls do NOT zero the data space allocated to the file,
-	 * nor do they change the file size.
-	 *
-	 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
-	 * space.
-	 * These calls cause the new file data to be zeroed and the file
-	 * size to be changed.
-	 */
 	setprealloc = clrprealloc = 0;
-	prealloc_type = XFS_BMAPI_PREALLOC;
-
 	switch (cmd) {
 	case XFS_IOC_ZERO_RANGE:
-		prealloc_type |= XFS_BMAPI_CONVERT;
-		xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
-		/* FALLTHRU */
+		error = xfs_zero_file_space(ip, startoffset, bf->l_len,
+					    attr_flags);
+		if (error)
+			return error;
+		setprealloc = 1;
+		break;
+
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
 		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-						prealloc_type, attr_flags);
+					     XFS_BMAPI_PREALLOC, attr_flags);
 		if (error)
 			return error;
 		setprealloc = 1;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 447e146b2ba6..5163022d9808 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -48,14 +48,9 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
 int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 		int flags, struct attrlist_cursor_kern *cursor);
-void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
-		xfs_off_t last, int fiopt);
-int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
-		xfs_off_t last, int fiopt);
-int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
-		xfs_off_t last, uint64_t flags, int fiopt);
-int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
 
+int xfs_iozero(struct xfs_inode *, loff_t, size_t);
 int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
 
 #endif	/* _XFS_VNODEOPS_H */